汉兰达语言完毕对基因组SNV进行诠释

 

广大时候,大家需求对取出的SNV进行诠释,那几个时候可能会在LX570上举办注解,经常注释文件都包括Chr、Start、End、Description,而我们的SNV文件一般是有所Position,由此大家得以先稳住Chr,再用Postion去稳定到Start和End之间,找到相对应的Description。为了加火速度,能够行使二分查找法。

   
很多时候,大家必要对取出的SNV进行诠释,这么些时候大概会在Escort上举办阐明,平日注释文件都蕴含Chr(染色体)、Start(起先位点)、End(结束位点)、Description(描述),而笔者辈的SNV文件一般是兼备Position(地点),因而大家能够先稳住Chr,再用Postion去稳定到Start和End之间,找到相呼应的Description。为了加迅速度,能够行使二分查找法。

使用协助能够在自笔者github看到 https://github.com/yiliu4234/BedAnnoGene

 1 #!/bin/Rscript
 2 
 3 library(data.table)
 4 
 5 arg <- commandArgs(T)
 6 if (length(arg) != 3) {
 7     message("[usage]: BedAnnoGene.R bedfile gtffile outputfile")
 8     message("    bedfile format: chr start end information(Arbitrary but can not be lacked)")
 9     message("    GTFfile: gtf file downloaded from GENCODE")
10     message("    outputfile: file to be writen out")
11     message("    needed package: data.table 1.10.4")
12     stop("Please check your arguments!")
13 }
14     
15 bedfile <- arg[1]
16 annofile <- arg[2]
17 outfile <- arg[3]
18 
19 #read file 
20 anno <- fread(annofile,sep="\t",header=F)
21 bed <- fread(bedfile,sep="\t",header=F)
22 setnames(anno,c("V1","V2","V3","V4","V5","V9"),c("Chr","Gene","Type","Start","End","Info"))
23 anno <- anno[Type=="gene",.(Chr,Start,End,Gene=sapply(strsplit(tstrsplit(Info,";")[3][[1]],"\""),function(x)x[2]))]
24 setkey(anno,Chr,Start,End)
25 setkey(bed,V1,V2,V3)
26 
27 #find overlaps by Chr
28 lst <- list()
29 for (ChrI in intersect(unique(bed$V1),unique(anno$Chr))){
30   anno_reg <- anno[Chr == ChrI,.(Start,End)]
31   bed_reg <- bed[V1 == ChrI,.(V2,V3)]
32   setkey(anno_reg,Start,End)
33   setkey(bed_reg,V2,V3)
34   overl <- foverlaps(bed_reg,anno_reg,which=TRUE,nomatch = 0)
35   if (nrow(overl) > 0){
36     lst[[ChrI]] <- data.table(Chr=ChrI,bed[V1 == ChrI,][overl[["xid"]],.(V2,V3,V4)],anno[Chr == ChrI][overl[["yid"]],.(Gene)])
37   }
38 }
39 merge_dt <- rbindlist(lst)
40 setnames(merge_dt,c("V2","V3","V4"),c("Start","End","Name"))
41 
42 #if one region has more than one gene
43 torm <- list()
44 for (i in 1:(nrow(merge_dt)-1)){if(merge_dt[i,"Name"]==merge_dt[i+1,"Name"]){set(merge_dt,i+1L,ncol(merge_dt),paste(merge_dt[i,"Gene"],merge_dt[i+1,"Gene"],sep=";"));torm <- c(torm,list(i))}}
45 torm <- unlist(torm)
46 merge_dt <- merge_dt[-torm,]
47 
48 fwrite(merge_dt,file=outfile)

在纳瓦拉中央银行使for循环效用低,因此也得以用data.table包的foverlap函数,立异代码如下,对bed文件进行申明,倘若要对snv实行评释,只必要将snv改成相应的start和end相等的bed文件即可。

选取支持能够在自家github看到   https://github.com/yiliu4234/BedAnnoGene

 1 for (value in dt$value){ 2 #df:data.frame, V1 and V2 should be Start and End   value: Postition  used to find region  return:df row number where position locates  ,if no region return -1 3     low=1 4     high=nrow 5     mid=high %/% 2 6     if (df[low,1] <= value & value <= df[low,2]) low 7     else if (df[high,1] <= value & value <= df[high,2]) high 8     else{ 9     while (value > df[mid,2] || value < df[mid,1]){10       if (value > df[mid,2]){11         low = mid+112       } else if (value < df[mid,1]) {13         high = mid - 114       } 15       if(high<low){16          mid=-1;break17       }18       mid=%/%219     }20       mid21 }22 }
 1 for (value in dt$value){
 2 #df:data.frame, V1 and V2 should be Start and End   value: Postition  used to find region  return:df row number where position locates  ,if no region return -1
 3     low=1
 4     high=nrow(df)
 5     mid=high %/% 2
 6     if (df[low,1] <= value & value <= df[low,2]) low
 7     else if (df[high,1] <= value & value <= df[high,2]) high
 8     else{
 9     while (value > df[mid,2] || value < df[mid,1]){
10       if (value > df[mid,2]){
11         low = mid+1
12       } else if (value < df[mid,1]) {
13         high = mid - 1
14       } 
15       if(high<low){
16          mid=-1;break
17       }
18       mid=(low+high)%/%2
19     }
20       mid
21 }
22 }
 1 #!/bin/Rscript 2  3 library(data.table) 4  5 arg <- commandArgs 6 if (length != 3) { 7     message("[usage]: BedAnnoGene.R bedfile gtffile outputfile") 8     message("    bedfile format: chr start end information(Arbitrary but can not be lacked)") 9     message("    GTFfile: gtf file downloaded from GENCODE")10     message("    outputfile: file to be writen out")11     message("    needed package: data.table 1.10.4")12     stop("Please check your arguments!")13 }14     15 bedfile <- arg[1]16 annofile <- arg[2]17 outfile <- arg[3]18 19 #read file 20 anno <- fread(annofile,sep="\t",header=F)21 bed <- fread(bedfile,sep="\t",header=F)22 setnames(anno,c("V1","V2","V3","V4","V5","V9"),c("Chr","Gene","Type","Start","End","Info"))23 anno <- anno[Type=="gene",.(Chr,Start,End,Gene=sapply(strsplit(tstrsplit(Info,";")[3][[1]],"\""),functionx[2]))]24 setkey(anno,Chr,Start,End)25 setkey(bed,V1,V2,V3)26 27 #find overlaps by Chr28 lst <- list()29 for (ChrI in intersect(unique,unique)){30   anno_reg <- anno[Chr == ChrI,.(Start,End)]31   bed_reg <- bed[V1 == ChrI,.]32   setkey(anno_reg,Start,End)33   setkey(bed_reg,V2,V3)34   overl <- foverlaps(bed_reg,anno_reg,which=TRUE,nomatch = 0)35   if (nrow > 0){36     lst[[ChrI]] <- data.table(Chr=ChrI,bed[V1 == ChrI,][overl[["xid"]],.],anno[Chr == ChrI][overl[["yid"]],.37   }38 }39 merge_dt <- rbindlist40 setnames(merge_dt,c("V2","V3","V4"),c("Start","End","Name"))41 42 #if one region has more than one gene43 torm <- list()44 for (i in 1:(nrow-1)){if(merge_dt[i,"Name"]==merge_dt[i+1,"Name"]){set(merge_dt,i+1L,ncol,paste(merge_dt[i,"Gene"],merge_dt[i+1,"Gene"],sep=";"));torm <- c(torm,list}}45 torm <- unlist46 merge_dt <- merge_dt[-torm,]47 48 fwrite(merge_dt,file=outfile)

在LX570中选拔for循环效用低,由此也可以用data.table包的foverlap函数,创新代码如下,对bed文件进行注脚,借使要对snv进行申明,只要求将snv改成相应的start和end相等的bed文件即可。