From f6e91d05329289d42d0334d51856f54bdeca6bc1 Mon Sep 17 00:00:00 2001 From: Trung Nghia Vu Date: Thu, 11 Oct 2018 11:06:06 +0200 Subject: [PATCH] FuSeq v1.0.0 --- NEWS | 22 ++- R/FuSeq.R | 42 ++++-- R/FuSeq_functions.R | 213 +++++++++++++++++++++++--- R/detectJunctionBreaks.R | 19 ++- R/integrateFusion.R | 11 +- R/params.txt | 2 +- R/postProcessMappedRead.R | 7 +- R/postProcessSplitRead.R | 9 +- R/processFEQ.R | 305 ++++++++++++++++++++------------------ R/processMappedRead.R | 25 ++-- R/processSplitRead.R | 4 +- README.md | 50 ++++--- 12 files changed, 487 insertions(+), 222 deletions(-) diff --git a/NEWS b/NEWS index e4a90eb..ef0dfb3 100644 --- a/NEWS +++ b/NEWS @@ -1,2 +1,22 @@ +Date 01 Oct 2018 +1) Correct the headers of split reads when exporting their sequences to file + +Date: 23 July 2018 +1) Error when running with Ubuntu 16 +- Error: +TxIndexer: relocation error: /home/nghiavtr/Tests/FuSeq_v0.1.1_linux_x86-64/linux/bin/../lib/librt.so.1: symbol __vdso_clock_gettime, version GLIBC_PRIVATE not defined in file libc.so.6 with link time reference +- Solution: remove the librt.so.1 +FuSeq_v0.1.1_linux_x86-64/linux/lib/librt.so.1 + +2) Allow inverted fusion genes by default: there are existing both fusion genes A-B and B-A +maxInvertedFusionCount=0 + +3) Optimize memory and space +processFEQ.R: remove myFusion as a redundant data +FuSeq_functions.R: fix some bugs of memory usage in function computeGeneDistance() + +4) Export more information in the output fusions.FuSeq + + Date: 21 May 2017 -- First submission \ No newline at end of file +- First submission diff --git a/R/FuSeq.R b/R/FuSeq.R index 1323d4b..eb00fe3 100644 --- a/R/FuSeq.R +++ b/R/FuSeq.R @@ -56,7 +56,7 @@ if (is.na(paramsFn)){ FuSeq.params$maxSharedCount=5e-2 FuSeq.params$minGeneDist=1e5 FuSeq.params$minJunctionDist=1e5 - FuSeq.params$maxInvertedFusionCount=0.01 + FuSeq.params$maxInvertedFusionCount=0 FuSeq.params$maxMRfusionFc=2 FuSeq.params$maxMRfusionNum=2 FuSeq.params$sgtMRcount=10 @@ -109,6 +109,13 @@ if (validatedCommand){ cat("\n keepRData=",FuSeq.params$keepRData) cat("\n exportFasta=",FuSeq.params$exportFasta) } + +fragDistFn=paste(inputFeqDir,"/fragmentDist.txt",sep="") +if (!file.exists(fragDistFn)) { + cat("There is no fragmentDist.txt file, your input sequencing data are probably too small. Stop!") + validatedCommand=FALSE +} + cat("\n-------------------------\n") if (validatedCommand){ @@ -132,7 +139,7 @@ if (validatedCommand){ myFusionOut=NULL; FuSeq.MR=processMappedRead(inPath,geneAnno=geneAnno, anntxdb=anntxdb, geeqMap=geeqMap,FuSeq.params=FuSeq.params) - FuSeq.SR=processSplitRead(inPath,geneAnno=geneAnno, anntxdb=anntxdb, geeqMap=geeqMap,FuSeq.params=FuSeq.params, txFastaFile=txFastaFile) + FuSeq.SR=processSplitRead(inPath,geneAnno=geneAnno, anntxdb=anntxdb, FuSeq.params=FuSeq.params, txFastaFile=txFastaFile) FuSeq.MR.postPro=postProcessMappedRead(inPath, anntxdb, FuSeq.SR, FuSeq.MR, FuSeq.params) FuSeq.SR.postPro=postProcessSplitRead(inPath, anntxdb, FuSeq.SR, FuSeq.MR, txFastaFile, FuSeq.params) @@ -145,15 +152,26 @@ if (validatedCommand){ myFusionFinal=FuSeq.integration$myFusionFinal if (nrow(myFusionFinal)==0){ - myFusionExport= "# No fusion genes existing" - write.table(myFusionExport, file=outputDir, col.names=FALSE, row.names = FALSE,quote = FALSE, sep="\t") + cat("\n No final fusion-gene candidates existing. It might be interesting to further investigate other SR and MR fusion candidates in FuSeq_process.RData.") + myFusionExport= "# No final fusion-gene candidates existing. It might be interesting to further investigate other SR and MR fusion candidates in FuSeq_process.RData." + write.table(myFusionExport, file=paste(outputDir,"/fusions.FuSeq",sep=""), col.names=FALSE, row.names = FALSE,quote = FALSE, sep="\t") + #keep all RData + if (FuSeq.params$keepRData){ + cat("\n Saving all data of FuSeq process...") + save(inPath,outputDir, myFusionFinal,myFusionExport,FuSeq.params, FuSeq.MR, FuSeq.SR, FuSeq.MR.postPro, FuSeq.SR.postPro,FuSeq.integration, file=paste(outputDir,"/FuSeq_process.RData",sep="")) + } } else{ - myFusionOut=myFusionFinal + myFusionOut=myFusionFinal myFusionOut=myFusionOut[order(myFusionOut$score, decreasing = TRUE),] - myFusionExport=myFusionOut[,c("gene5","chrom5p","strand5p","brpos5.start","brpos5.end","gene3","chrom3p","strand3p","brpos3.start","brpos3.end","fusionName","supportRead","score")] + myFusionExport=myFusionOut[,c("gene5","chrom5p","strand5p","brpos5.start","brpos5.end","gene3","chrom3p","strand3p","brpos3.start","brpos3.end","fusionName","SR","MR","supportRead","score")] - colnames(myFusionExport)=c("gene5","chrom5","strand5","brpos5.start","brpos5.end","gene3","chrom3","strand3","brpos3.start","brpos3.end","fusionName","supportRead","score") + colnames(myFusionExport)=c("gene5","chrom5","strand5","brpos5.start","brpos5.end","gene3","chrom3","strand3","brpos3.start","brpos3.end","fusionName","SR.passed","MR.passed","supportRead","score") + #get HGNC names of genes + myFusionExport$HGNC5=hgncName$hgnc_symbol[match(myFusionExport$gene5,hgncName$ensembl_gene_id)] + myFusionExport$HGNC3=hgncName$hgnc_symbol[match(myFusionExport$gene3,hgncName$ensembl_gene_id)] + #reorder + myFusionExport=myFusionExport[,c("gene5","chrom5","strand5","brpos5.start","brpos5.end","gene3","chrom3","strand3","brpos3.start","brpos3.end","fusionName","HGNC5","HGNC3","SR.passed","MR.passed","supportRead","score")] #Detect extra information here myFusionExport$info=rep("",nrow(myFusionExport)) @@ -176,12 +194,16 @@ if (validatedCommand){ if (FuSeq.params$exportFasta){ cat("\n Export supporing read sequences to files...") fastaOut=paste(outputDir,"/FuSeq_",sep="") - exportMappedFusionReads(inPath, readStrands=FuSeq.params$readStrands, fastaOut=fastaOut, junctInfo=FuSeq.MR$junctBr$junctInfo, fusionName=as.character(myFusionFinal$fusionName),fsizeLadder=FuSeq.MR$junctBr$fsizeLadder) - exportSplitFusionReads(inPath, readStrands=FuSeq.params$readStrands, fastaOut=fastaOut, splitReads=FuSeq.SR$splitReads, fusionName=as.character(myFusionFinal$fusionName)) + #exportMappedFusionReads(inPath, readStrands=FuSeq.params$readStrands, fastaOut=fastaOut, junctInfo=FuSeq.MR$junctBr$junctInfo, fusionName=as.character(myFusionFinal$fusionName),fsizeLadder=FuSeq.MR$junctBr$fsizeLadder) + #exportSplitFusionReads(inPath, readStrands=FuSeq.params$readStrands, fastaOut=fastaOut, splitReads=FuSeq.SR$splitReads, fusionName=as.character(myFusionFinal$fusionName)) + MRinfo=getMRinfo(fusionName=as.character(myFusionFinal$fusionName),inPath=inPath,feq=FuSeq.MR$feqInfo$feq, feqFgeMap=FuSeq.MR$feqInfo$feqFgeMap, anntxdb=anntxdb,readStrands=FuSeq.params$readStrands) + exportMappedFusionReads(inPath, readStrands=FuSeq.params$readStrands, fastaOut=fastaOut, junctInfo=MRinfo$junctInfo, fusionName=as.character(myFusionFinal$fusionName),fsizeLadder=FuSeq.MR$junctBr$fsizeLadder) + exportSplitFusionReads(inPath, readStrands=FuSeq.params$readStrands, fastaOut=fastaOut, splitReads=FuSeq.SR$fusionGene, fusionName=as.character(myFusionFinal$fusionName)) } } -cat("\n Done! \n") + cat("\n Done! \n") } + diff --git a/R/FuSeq_functions.R b/R/FuSeq_functions.R index 30b598d..a5e8652 100644 --- a/R/FuSeq_functions.R +++ b/R/FuSeq_functions.R @@ -1,3 +1,5 @@ +#Data: 07/09/2018 +#- Correct the headers of split reads when exporting their sequences to file #Date:19/05/2017 #- Fix and clean codes ################### @@ -272,26 +274,54 @@ computeReversedFusionCount <- function(myfusionTx){ } -computeGeneDistance <- function(myfusionTx,anntxdb,minGeneDist=1e5){ - res1=select(anntxdb, keys=as.character(myfusionTx$gene1), columns=c("TXSTART","TXEND"), keytype = "GENEID") - res2=select(anntxdb, keys=as.character(myfusionTx$gene2), columns=c("TXSTART","TXEND"), keytype = "GENEID") +computeGeneDistance <- function(myfusionTx,anntxdb){ + #NOTE: it is meaningless for the distances between genes from two different chromosomes - minRes1=tapply(res1$TXSTART,as.character(res1$GENEID),min) - maxRes1=tapply(res1$TXEND,as.character(res1$GENEID),max) - minRes1=minRes1[match(as.character(myfusionTx$gene1),names(minRes1))] - maxRes1=maxRes1[match(as.character(myfusionTx$gene1),names(maxRes1))] + allgenes.info=select(anntxdb, keys=unique(c(as.character(myfusionTx$gene1),as.character(myfusionTx$gene2))), columns=c("TXSTART","TXEND"), keytype = "GENEID") + #res1=select(anntxdb, keys=as.character(myfusionTx$gene1), columns=c("TXSTART","TXEND"), keytype = "GENEID") + #res2=select(anntxdb, keys=as.character(myfusionTx$gene2), columns=c("TXSTART","TXEND"), keytype = "GENEID") + # minRes1=tapply(res1$TXSTART,as.character(res1$GENEID),min) + # maxRes1=tapply(res1$TXEND,as.character(res1$GENEID),max) + # minRes1=minRes1[match(as.character(myfusionTx$gene1),names(minRes1))] + # maxRes1=maxRes1[match(as.character(myfusionTx$gene1),names(maxRes1))] + # + # minRes2=tapply(res2$TXSTART,as.character(res2$GENEID),min) + # maxRes2=tapply(res2$TXEND,as.character(res2$GENEID),max) + # minRes2=minRes2[match(as.character(myfusionTx$gene2),names(minRes2))] + # maxRes2=maxRes2[match(as.character(myfusionTx$gene2),names(maxRes2))] - minRes2=tapply(res2$TXSTART,as.character(res2$GENEID),min) - maxRes2=tapply(res2$TXEND,as.character(res2$GENEID),max) - minRes2=minRes2[match(as.character(myfusionTx$gene2),names(minRes2))] - maxRes2=maxRes2[match(as.character(myfusionTx$gene2),names(maxRes2))] - geneDist=unlist(apply(cbind(abs(minRes1-maxRes2),abs(minRes1-minRes2),abs(maxRes1-maxRes2),abs(maxRes1-minRes2)),1,min)) - if (minGeneDist < 0) return(geneDist); + res1=allgenes.info[match(unique(as.character(myfusionTx$gene1)),as.character(allgenes.info$GENEID)),c("GENEID","TXSTART","TXEND")] + res2=allgenes.info[match(unique(as.character(myfusionTx$gene2)),as.character(allgenes.info$GENEID)),c("GENEID","TXSTART","TXEND")] - myfusionTx$geneDist=geneDist - myfusionTx=myfusionTx[myfusionTx$geneDist>=minGeneDist,] - return(myfusionTx) + ## for gene 1 + #sort by the increasing of TXSTART and remove duplicates + startMin1=res1 + startMin1=startMin1[order(startMin1$TXSTART, decreasing=FALSE),] + startMin1=startMin1[!duplicated(startMin1$GENEID),] + minRes1=startMin1$TXSTART[match(as.character(myfusionTx$gene1),as.character(startMin1$GENEID))] + #sort by the decreasing of TXEND and remove duplicates + endMax1=res1 + endMax1=endMax1[order(endMax1$TXEND, decreasing=TRUE),] + endMax1=endMax1[!duplicated(endMax1$GENEID),] + maxRes1=endMax1$TXEND[match(as.character(myfusionTx$gene1),as.character(endMax1$GENEID))] + + + ## for gene 2 + #sort by the increasing of TXSTART and remove duplicates + startMin2=res2 + startMin2=startMin2[order(startMin2$TXSTART, decreasing=FALSE),] + startMin2=startMin2[!duplicated(startMin2$GENEID),] + minRes2=startMin2$TXSTART[match(as.character(myfusionTx$gene2),as.character(startMin2$GENEID))] + #sort by the decreasing of TXEND and remove duplicates + endMax2=res2 + endMax2=endMax2[order(endMax2$TXEND, decreasing=TRUE),] + endMax2=endMax2[!duplicated(endMax2$GENEID),] + maxRes2=endMax2$TXEND[match(as.character(myfusionTx$gene2),as.character(endMax2$GENEID))] + + + geneDist=unlist(apply(cbind(abs(minRes1-maxRes2),abs(minRes1-minRes2),abs(maxRes1-maxRes2),abs(maxRes1-minRes2)),1,min)) + return(geneDist); } @@ -343,7 +373,7 @@ convertReverseComplement<-function(DNAseq){ DNAarr[Cid]="G" #result DNAseqRc=paste(DNAarr,collapse = "") - return(DNAseqRc) + return(DNAseqRc) } @@ -567,7 +597,7 @@ matchFusionReads <- function(inPath, readStrands, fastaOut, junctInfo, splitRead entropy5=c(entropy5,-sum(consProp5*log(consProp5))) } - + res=list() res$estBr5=estBr5 res$estBr3=estBr3 @@ -598,7 +628,7 @@ matchFusionReads <- function(inPath, readStrands, fastaOut, junctInfo, splitRead fusionCandidate$consensusProp=consensusProp fusionCandidate$consensusErr=consensusErr fusionCandidate$consensusScore=fusionCandidate$consensusErr*fusionCandidate$consensusProp - + return(list(fusionCandidate=fusionCandidate,matchInfo=matchInfo)) } @@ -687,12 +717,13 @@ exportSplitFusionReads <- function(inPath, readStrands, fastaOut, splitReads, fu myheader=as.character(splitReads$header[myreadID]) rID.adj=match(myheader,fastaDat1) #export reads to file - for (j in rID.adj){ - readHead=paste(">FuSeq__SR__",splitReads$front_hitpos[j],"__",splitReads$back_hitpos[j],"__",fName," /1",sep="") + for (k in 1:length(rID.adj)){ + j=rID.adj[k] + readHead=paste(">FuSeq__SR__",splitReads$front_hitpos[myreadID[k]],"__",splitReads$back_hitpos[myreadID[k]],"__",fName," /1",sep="") writeLines(readHead,con1) writeLines(fastaDat1[j+1],con1) - readHead=paste(">FuSeq__SR__",splitReads$front_hitpos[j],"__",splitReads$back_hitpos[j],"__",fName," /2",sep="") + readHead=paste(">FuSeq__SR__",splitReads$front_hitpos[myreadID[k]],"__",splitReads$back_hitpos[myreadID[k]],"__",fName," /2",sep="") writeLines(readHead,con2) writeLines(fastaDat2[j+1],con2) } @@ -1017,7 +1048,7 @@ refineJunctionBreak <-function(junctBr, anntxdb, fragmentInfo, readStrands, frag } } - + } return(junctBr) @@ -1115,9 +1146,145 @@ checkEndExon <-function(myFusionFinal, junctBr=NULL, anntxdb, readStrands,shrink } +##### get information of mapped reads +getMRinfo <-function(fusionName,inPath,feq, feqFgeMap, anntxdb, readStrands="UN", shrinkLen=3){ + if (length(fusionName) == 0){ return(NULL) } + + gene5p=sapply(fusionName,function(x) unlist(strsplit(x,"-"))[1]) + gene3p=sapply(fusionName,function(x) unlist(strsplit(x,"-"))[2]) + + myFusionFinal=data.frame(fusionName=fusionName) + if (readStrands=="RF" || readStrands=="UN" || readStrands=="RR"){ + myFusionFinal$gene1=gene3p + myFusionFinal$gene2=gene5p + myFusionFinal$name12=paste(myFusionFinal$gene1, myFusionFinal$gene2,sep="-") + }else{ + myFusionFinal$gene1=gene5p + myFusionFinal$gene2=gene3p + myFusionFinal$name12=paste(myFusionFinal$gene1, myFusionFinal$gene2,sep="-") + } + myFusionFinal$fusionName=as.character(myFusionFinal$fusionName) + ##### input fusion reads + #load fusion reads + cat("\n Get mapped fusion reads") + frfiles=list.files(inPath,paste(readStrands,"_fusionMappedReadsChunk_*",sep="")) + fusionRead=NULL; + fsizes=NULL; + for (i in 1:length(frfiles)){ + tmpDat=read.csv(paste(inPath,"/",frfiles[i],sep=""), header =FALSE, sep="\t") + fusionRead=rbind(fusionRead,tmpDat) + fsizes=c(fsizes,nrow(tmpDat)) + } + fsizeLadder=cumsum(fsizes) + colnames(fusionRead)=c("read1","read2","read1Pos","read2Pos","seq1Pos","seq2Pos","seq1Len","seq2Len") + #dim(fusionRead) + cat("\n The number of mapped fusion reads: ",nrow(fusionRead)) + fre1=as.character(fusionRead$read1) + fre1=trimws(fre1) + fre2=as.character(fusionRead$read2) + fre2=trimws(fre2) + read1Pos=as.character(fusionRead$read1Pos) + read2Pos=as.character(fusionRead$read2Pos) + seq1Pos=as.character(fusionRead$seq1Pos) + seq2Pos=as.character(fusionRead$seq2Pos) + seq1Len=as.character(fusionRead$seq1Len) + seq2Len=as.character(fusionRead$seq2Len) + #load fragment info - this is not neccessary thi moment + fragmentInfo=read.csv(paste(inPath,"/fragmentInfo.txt",sep=""), header =TRUE, sep="\t") + readLen=fragmentInfo[1,1] + #load the feq file + feqRaw=read.csv(paste(inPath,"/feq_",readStrands,".txt",sep=""), header =TRUE, sep="\t") + # #Keep only reads and feqs relating to myFusionFinal + # allTx=select(anntxdb, keys=unique(c(as.character(myFusionFinal$gene5p),as.character(myFusionFinal$gene3p))), columns=c("TXNAME"), keytype = "GENEID") + # feqRaw=feqRaw[as.character(feqRaw$Transcript) %in% as.character(allTx$TXNAME),] + feqRead1=feqRaw[feqRaw$Read==1,] + feqRead2=feqRaw[feqRaw$Read==2,] + #get feq-fge map + # feqFgeMap=feqInfo$feqFgeMap + # feq=feqInfo$feq + cat("\n Preparing some information ...") + feqFtxMap1=tapply(as.character(feqRead1$Transcript),feqRead1$Feq,c) + feqRead1Name=unlist(lapply(feqFtxMap1,function(x) paste(x,collapse =" "))) + feqFtxMap2=tapply(as.character(feqRead2$Transcript),feqRead2$Feq,c) + feqRead2Name=unlist(lapply(feqFtxMap2,function(x) paste(x,collapse =" "))) + + read1Pos=lapply(read1Pos,function(x) as.integer(unlist(strsplit(x," ")))) + read2Pos=lapply(read2Pos,function(x) as.integer(unlist(strsplit(x," ")))) + seq1Pos=lapply(seq1Pos,function(x) as.integer(unlist(strsplit(x," ")))) + seq2Pos=lapply(seq2Pos,function(x) as.integer(unlist(strsplit(x," ")))) + seq1Len=lapply(seq1Len,function(x) as.integer(unlist(strsplit(x," ")))) + seq2Len=lapply(seq2Len,function(x) as.integer(unlist(strsplit(x," ")))) + #get some annotations + txToGene=select(anntxdb, keys=unique(as.character(feqRaw[,1])), columns=c("GENEID","TXCHROM"), keytype = "TXNAME") + cat("\n Detect positions of junction breaks") + brpos5.start=brpos5.end=brpos3.start=brpos3.end=NULL + genebrpos5.start=genebrpos5.end=genebrpos3.start=genebrpos3.end=NULL + fragmentMean=fragmentSd=fragmentNum=fragmentTest=NULL; + tx5LenMean=tx3LenMean=tx5LenSd=tx3LenSd=NULL; + tx5GapMean=tx3GapMean=tx5GapSd=tx3GapSd=NULL; + flen5=flen3=NULL; + splitRNum5=splitRNum3=exNum1=exNum2=NULL; + nondupCount=NULL; + + junctInfo=list(); + for (i in 1:nrow(myFusionFinal)){ + fgeName=as.character(myFusionFinal$name12)[i] + + gene1=as.character(myFusionFinal$gene1[i]) + gene2=as.character(myFusionFinal$gene2[i]) + myfeqID=feqFgeMap[[fgeName]] + readL.seqLen=readR.seqLen=readID=readL.chrPos=readR.chrPos=readL.Pos=readR.Pos=readL.GenePos=readR.GenePos=readL.tx=readR.tx=list(); + readL.chrReadPos=readR.chrReadPos=readR.ReadPos=readL.ReadPos=list(); + readL.ExonID=readR.ExonID=list(); + for (j in 1:length(myfeqID)){ + feqName=names(feq[myfeqID[j]]) + if (length(feqName)==0) next() + #keepID=which(fre1==feqRead1Name[myfeqID[j]] & fre2==feqRead2Name[myfeqID[j]]) + keepID1=which(fre1==feqRead1Name[myfeqID[j]]) + keepID=keepID1[which(fre2[keepID1]==feqRead2Name[myfeqID[j]])] + + readID[[feqName]]=keepID + + xl=unlist(feqFtxMap1[myfeqID[j]]) + xr=unlist(feqFtxMap2[myfeqID[j]]) + IDl=which(txToGene[match(xl,txToGene$TXNAME),]$GENEID==gene1) + IDr=which(txToGene[match(xr,txToGene$TXNAME),]$GENEID==gene2) + #select only one because the others will locate to the same posistion + posl=IDl[1] + posr=IDr[1] + readposl=sapply(read1Pos[keepID],function(x) x[posl]) + readposr=sapply(read2Pos[keepID],function(x) x[posr]) + txposl=sapply(seq1Pos[keepID],function(x) x[posl]) + txposr=sapply(seq2Pos[keepID],function(x) x[posr]) + txSeqlenl=sapply(seq1Len[keepID],function(x) x[posl]) + txSeqlenr=sapply(seq2Len[keepID],function(x) x[posr]) + #reduce length of match kmer shrinkLen=3 or prop=0.25^3=0.015625 + readposl=readposl+shrinkLen + readposr=readposr+shrinkLen + txposl=txposl+shrinkLen + txposr=txposr+shrinkLen + txSeqlenl=txSeqlenl-shrinkLen + txSeqlenr=txSeqlenr-shrinkLen + + #save the information + readL.Pos[[feqName]]=txposl + readR.Pos[[feqName]]=txposr + readL.ReadPos[[feqName]]=readposl + readR.ReadPos[[feqName]]=readposr + readL.seqLen[[feqName]]=txSeqlenl + readR.seqLen[[feqName]]=txSeqlenr + + } + res=list(readL.ReadPos=readL.ReadPos,readR.ReadPos=readR.ReadPos,readID=readID) + if (length(res$readID)>0) junctInfo[[myFusionFinal$fusionName[i]]]=res; + } + return(list(myFusionFinal=myFusionFinal,junctInfo=junctInfo,fsizeLadder=fsizeLadder)) + } + + diff --git a/R/detectJunctionBreaks.R b/R/detectJunctionBreaks.R index d6c8ea0..8340515 100644 --- a/R/detectJunctionBreaks.R +++ b/R/detectJunctionBreaks.R @@ -34,11 +34,20 @@ detectJunctionBreaks <-function(fgeList,inPath,feq, feqFgeMap, anntxdb, readStra readLen=fragmentInfo[1,1] #load the feq file feqRaw=read.csv(paste(inPath,"/feq_",readStrands,".txt",sep=""), header =TRUE, sep="\t") + + # #Keep only reads and feqs relating to myFusionFinal + # allTx=select(anntxdb, keys=unique(c(as.character(myFusionFinal$gene5p),as.character(myFusionFinal$gene3p))), columns=c("TXNAME"), keytype = "GENEID") + # feqRaw=feqRaw[as.character(feqRaw$Transcript) %in% as.character(allTx$TXNAME),] + feqRead1=feqRaw[feqRaw$Read==1,] feqRead2=feqRaw[feqRaw$Read==2,] #get feq-fge map # feqFgeMap=feqInfo$feqFgeMap # feq=feqInfo$feq + + + + cat("\n Preparing other information ...") feqFtxMap1=tapply(as.character(feqRead1$Transcript),feqRead1$Feq,c) feqRead1Name=unlist(lapply(feqFtxMap1,function(x) paste(x,collapse =" "))) @@ -350,7 +359,9 @@ detectJunctionBreaks <-function(fgeList,inPath,feq, feqFgeMap, anntxdb, readStra cat("\n Get reads supporting constituent genes") txeq=read.csv(paste(inPath,"/rawCount.txt",sep=""), header =TRUE, sep="\t") - txeq$geneID=geneAnno[match(as.character(txeq$Transcript),geneAnno[,1]),6] + #txeq$geneID=geneAnno[match(as.character(txeq$Transcript),geneAnno[,1]),6] # risky: do not pass geneAnno but still use here + txeq$geneID=txToGene$GENEID[match(as.character(txeq$Transcript),as.character(txToGene$TXNAME))] + geeq=txeq[,c(3,7,8)] geeq=geeq[!duplicated(geeq),] geCount=tapply(geeq$Count,geeq$geneID,sum) @@ -360,7 +371,11 @@ detectJunctionBreaks <-function(fgeList,inPath,feq, feqFgeMap, anntxdb, readStra myFusionFinal$gene2Count[is.na(myFusionFinal$gene2Count)]=0 #number of feq supporting a fge - feqNum=sapply(as.character(myFusionFinal$name12),function(mykey) length(feqFgeMap[[mykey]])) + #feqNum=sapply(as.character(myFusionFinal$name12),function(mykey) length(feqFgeMap[[mykey]])) + matchID=match(as.character(myFusionFinal$name12),names(feqFgeMap)) + myfeqID=feqFgeMap[matchID] + feqNum=sapply(myfeqID, length) + myFusionFinal$feqNum=feqNum return(list(myFusionFinal=myFusionFinal,junctInfo=junctInfo,fsizeLadder=fsizeLadder)) diff --git a/R/integrateFusion.R b/R/integrateFusion.R index 0df37d0..a8b97f4 100644 --- a/R/integrateFusion.R +++ b/R/integrateFusion.R @@ -120,8 +120,15 @@ integrateFusion <-function(myFusionFinal.MR, myFusionFinal.SR, FuSeq.params, fra } myFusionFinal$MR=myFusionFinal$SR=0 - myFusionFinal$SR[which(!is.na(match(as.character(myFusionFinal$fusionName),as.character(myFusionFinal.SR$fusionName))))]=1 - myFusionFinal$MR[which(!is.na(match(as.character(myFusionFinal$fusionName),as.character(myFusionFinal.MR$fusionName))))]=1 + #myFusionFinal$SR[which(!is.na(match(as.character(myFusionFinal$fusionName),as.character(myFusionFinal.SR$fusionName))))]=1 + #myFusionFinal$MR[which(!is.na(match(as.character(myFusionFinal$fusionName),as.character(myFusionFinal.MR$fusionName))))]=1 + matchID=match(as.character(myFusionFinal$fusionName),as.character(myFusionFinal.SR$fusionName)) + pick=which(!is.na(matchID)) + myFusionFinal$SR[pick]=myFusionFinal.SR$supportCount[matchID[pick]] + matchID=match(as.character(myFusionFinal$fusionName),as.character(myFusionFinal.MR$fusionName)) + pick=which(!is.na(matchID)) + myFusionFinal$MR[pick]=myFusionFinal.MR$supportCount[matchID[pick]] + return(list(myFusionFinal=myFusionFinal,myFusionFinal.SR=myFusionFinal.SR,myFusionFinal.MR=myFusionFinal.MR)) } diff --git a/R/params.txt b/R/params.txt index c2301df..5fe29cd 100644 --- a/R/params.txt +++ b/R/params.txt @@ -5,7 +5,7 @@ onlyProteinCodingGenes=TRUE maxSharedCount=5e-2 minGeneDist=1e5 minJunctionDist=1e5 -maxInvertedFusionCount=0.01 +maxInvertedFusionCount=0 minMR=2 minNonDupMR=2 maxMRfusionFc=2 diff --git a/R/postProcessMappedRead.R b/R/postProcessMappedRead.R index 7cf848c..a408a2c 100644 --- a/R/postProcessMappedRead.R +++ b/R/postProcessMappedRead.R @@ -62,6 +62,10 @@ postProcessMappedRead <-function(inPath, anntxdb, FuSeq.SR, FuSeq.MR, FuSeq.para myFusionFinal=myFusionFinal[which(is.na(match(myFusionFinal$fusionName,rmFusion))),] } + if (nrow(myFusionFinal)==0){ #no fusions + res=list(myFusionFinal=myFusionFinal, junctBr.refine=junctBr) + return(res) + } tx3LenTest=tx5LenTest=NULL; for (i in 1:nrow(myFusionFinal)){ @@ -75,7 +79,7 @@ postProcessMappedRead <-function(inPath, anntxdb, FuSeq.SR, FuSeq.MR, FuSeq.para myFusionFinal=myFusionFinal[myFusionFinal$tx5LenTest >= 0.10,] myFusionFinal=myFusionFinal[myFusionFinal$tx3LenTest >= 0.10,] - if (nrow(myFusionFinal)==0){ + if (nrow(myFusionFinal)==0){ #no fusions res=list(myFusionFinal=myFusionFinal, junctBr.refine=junctBr) return(res) } @@ -113,3 +117,4 @@ postProcessMappedRead <-function(inPath, anntxdb, FuSeq.SR, FuSeq.MR, FuSeq.para + diff --git a/R/postProcessSplitRead.R b/R/postProcessSplitRead.R index 7162c7b..6e1ebbe 100644 --- a/R/postProcessSplitRead.R +++ b/R/postProcessSplitRead.R @@ -47,7 +47,8 @@ myFusionTmp2$mappedCount=0 myFusionTmp2$mappedCount[which(!is.na(matchID))]=mappedFge$supportCount[na.omit(matchID)] #filter again by inverted direction fusion genes -myFusionTmp2=myFusionTmp2[myFusionTmp2$total21 <= myFusionTmp2$totalCount*0.01,] +if (FuSeq.params$maxInvertedFusionCount > 0) +myFusionTmp2=myFusionTmp2[myFusionTmp2$total21 <= myFusionTmp2$totalCount*FuSeq.params$maxInvertedFusionCount,] ##### process cases: geneA-geneB vs geneA-geneC where geneB and geneC are paralogs or overlapping @@ -595,9 +596,11 @@ if (length(keepID)>0) for (i in 1:length(keepID)){ myID=which(as.character(myFusion$name12)==as.character(myFusionFinal$name12[keepID[i]])) res=myFusion[myID,] + #if(sd(res$front_hitpos)<1) rmID=c(rmID,myID) + #if(sd(res$back_hitpos)<1) rmID=c(rmID,myID) - if(sd(res$front_hitpos)<1) rmID=c(rmID,myID) - if(sd(res$back_hitpos)<1) rmID=c(rmID,myID) + if(sd(res$front_hitpos)<1 & length(unique(res$front_hitpos))<3) rmID=c(rmID,myID) + if(sd(res$back_hitpos)<1 & length(unique(res$back_hitpos))<3) rmID=c(rmID,myID) } if(length(rmID)>0) myFusion=myFusion[-rmID,] diff --git a/R/processFEQ.R b/R/processFEQ.R index af8261c..d6eb0c3 100644 --- a/R/processFEQ.R +++ b/R/processFEQ.R @@ -1,149 +1,162 @@ ############################################################ #####process fusion equivalence classes to generate initial fusion gene candidates -processFEQ <-function(inPath,geneAnno,anntxdb,geeqMap,readStrands="UN",chromRef=as.character(c(1:22,"X","Y"))){ - cat("\n Read fusion equivalence classes") - ##### read fragment information - fragmentInfo=read.csv(paste(inPath,"/fragmentInfo.txt",sep=""), header =TRUE, sep="\t") - libsize=fragmentInfo[1,5] - #read fusion-equivalence classes - feqRaw=read.csv(paste(inPath,"/feq_",readStrands,".txt",sep=""), header =TRUE, sep="\t") - - #generate feq - res=feqRaw[,c(2,4)] - myDup=duplicated(res) - res=res[!myDup,] - feq=res[,1] - names(feq)=res[,2] - cat("\n The total number of fusion equivalence classes: ",length(feq)) - - if (length(feq) == 0) return(NULL) - - cat("\n Create the maps between feq and fge") - #get a map between tx and gene - txToGene=select(anntxdb, keys=as.character(feqRaw[,1]), columns=c("GENEID","TXCHROM"), keytype = "TXNAME") - feqRaw$GENEID=txToGene$GENEID - feqGene=feqRaw[,-c(1,2)] - #remove duplicates - myDup=duplicated(feqGene) - feqGene=feqGene[!myDup,] - feqGene1=feqGene[feqGene$Read==1,] - feqGene2=feqGene[feqGene$Read==2,] - #remove duplicates of txToGene - myDup=duplicated(txToGene) - txToGene=txToGene[!myDup,] - ##### do indexing feq and fge - feqGene1Num=table(feqGene1$Feq) - feqGene2Num=table(feqGene2$Feq) - - feqGene1$Read=feqGene2Num[feqGene1$Feq] - feqGene2$Read=feqGene1Num[feqGene2$Feq] - - gene1list=apply(feqGene1,1,function(x){ - rep(x[3],x[1]) - }) - gene1list=unlist(gene1list) - res=tapply(feqGene2$GENEID,feqGene2$Feq,c) - res=data.frame(y=feqGene1Num,x=res) - gene2list=apply(res,1,function(x){ - rep(x[3],x[2]) - }) - gene2list=unlist(gene2list) - - res=cbind(seq_along(feqGene1Num),feqGene1Num,feqGene2Num) - feqID=apply(res,1,function(x){ - rep(x[1],x[2]*x[3]) - }) - feqID=unlist(feqID) - - fgeneNames=paste(gene1list,gene2list,sep="-") - res=feqID - names(res)=fgeneNames - - #index from fge to feq - feqFgeMap=tapply(res,names(res),c) - feqFgeMap=lapply(feqFgeMap,function(x) as.integer(unlist(x))) - length(feqFgeMap) # should be equal to number of fge - #create fge - fge=data.frame(fgeID=c(1:length(feqFgeMap)), name12=names(feqFgeMap)) - - #index from feq to fge - matchID=match(names(res),as.character(fge$name12)) - names(res)=matchID - fgeFeqMap=tapply(names(res),res,c) - fgeFeqMap=lapply(fgeFeqMap,function(x) as.integer(unlist(x))) - length(fgeFeqMap) #shoud be equal to number of feq - - - ##### add features to fge - res=lapply(as.character(fge$name12), function(x) unlist(strsplit(x,"-"))) - res=do.call(rbind,res) - colnames(res)=c("gene1","gene2") - fge=cbind(fge,res) - fge$name21=paste(fge$gene2,fge$gene1,sep="-") - - ########### - cat("\n Get the number of supporting reads") - ### find raw counts of fusion genes - fusionGene=fge - #sum-up all counts of feq - supportCount=rep(0,nrow(fge)) - for (feqID in 1:length(feq)){ - feqCount=feq[feqID] - gid=unlist(fgeFeqMap[feqID]) - supportCount[gid]=supportCount[gid]+feqCount - } - fusionGene$supportCount=supportCount - - ### estimate counts for fusion genes - cat("\n Correct the number of supporting reads") - alp0=rep(sum(feq)/nrow(fusionGene),nrow(fusionGene)) - alpOut=rep(0,nrow(fusionGene)) - - res=estimateCountEM(alp0,alpOut,feq,fgeFeqMap,itNum=1,alpDiff.thres=0.01) - - fusionGene$correctedCount=res$correctedCount - fusionGene=fusionGene[order(fusionGene$correctedCount,decreasing=TRUE),] - - dim(fusionGene) - - ####### - cat("\n Extract other biological information...") - #add few biological information - res=select(anntxdb, keys=as.character(fusionGene$gene1), columns=c("GENEID","TXCHROM","TXSTRAND"), keytype = "GENEID") - colnames(res)=c("GENEID","chrom1","strand1") - fusionGene=cbind(fusionGene,res[,-1]) - res=select(anntxdb, keys=as.character(fusionGene$gene2), columns=c("GENEID","TXCHROM","TXSTRAND"), keytype = "GENEID") - colnames(res)=c("GENEID","chrom2","strand2") - fusionGene=cbind(fusionGene,res[,-1]) - - ##### - cat("\n Keep only candidates in the selected chromosomes") - #do filter by chromosomes - fusionGene=chromFilter(fusionGene,chromRef=chromRef) - - ##### generate features to detect fusion genes - cat("\n Extract extra features...") - myFusion=fusionGene - #Fusion-genes sharing the same set of fusion equivalence classes - matchID=match(as.character(myFusion$name12),names(feqFgeMap)) - feqDup=duplicated(feqFgeMap[matchID], fromLast=FALSE) + duplicated(feqFgeMap[matchID], fromLast=TRUE) - myFusion$feqDup=feqDup - - #compute cpm - myFusion$cpm=myFusion$supportCount*10^6/libsize - #distance between two genes >= minGeneDist - geneDist=computeGeneDistance(myFusion,anntxdb,minGeneDist=-1) - myFusion$geneDist=geneDist - - #get reversed fusion (3-5) count - name21Count=computeReversedFusionCount(myFusion) - myFusion$name21Count=name21Count - - #compute dupGene - res=computeDupGene(myFusion,dupGene.thres=-1) - myFusion=cbind(myFusion,res) - - res=list(fgeList=myFusion,fge=fge,feq=feq,fgeFeqMap=fgeFeqMap,feqFgeMap=feqFgeMap,feqRaw=feqRaw) - return(res) - +processFEQ <-function(inPath,anntxdb,readStrands="UN",chromRef=as.character(c(1:22,"X","Y"))){ + cat("\n Read fusion equivalence classes") + ##### read fragment information + fragmentInfo=read.csv(paste(inPath,"/fragmentInfo.txt",sep=""), header =TRUE, sep="\t") + libsize=fragmentInfo[1,5] + #read fusion-equivalence classes + feqRaw=read.csv(paste(inPath,"/feq_",readStrands,".txt",sep=""), header =TRUE, sep="\t") + + #generate feq + res=feqRaw[,c(2,4)] + myDup=duplicated(res) + res=res[!myDup,] + feq=res[,1] + names(feq)=res[,2] + cat("\n The total number of fusion equivalence classes: ",length(feq)) + + if (length(feq) == 0) return(NULL) + + cat("\n Create the maps between feq and fge") + #get a map between tx and gene + txToGene=select(anntxdb, keys=as.character(feqRaw[,1]), columns=c("GENEID","TXCHROM"), keytype = "TXNAME") + feqRaw$GENEID=txToGene$GENEID + feqGene=feqRaw[,-c(1,2)] + #remove duplicates + myDup=duplicated(feqGene) + feqGene=feqGene[!myDup,] + feqGene1=feqGene[feqGene$Read==1,] + feqGene2=feqGene[feqGene$Read==2,] + #remove duplicates of txToGene + myDup=duplicated(txToGene) + txToGene=txToGene[!myDup,] + ##### do indexing feq and fge + feqGene1Num=table(feqGene1$Feq) + feqGene2Num=table(feqGene2$Feq) + + feqGene1$Read=feqGene2Num[feqGene1$Feq] + feqGene2$Read=feqGene1Num[feqGene2$Feq] + + gene1list=apply(feqGene1,1,function(x){ + rep(x[3],x[1]) + }) + gene1list=unlist(gene1list) + res=tapply(feqGene2$GENEID,feqGene2$Feq,c) + res=data.frame(y=feqGene1Num,x=res) + gene2list=apply(res,1,function(x){ + rep(x[3],x[2]) + }) + gene2list=unlist(gene2list) + + res=cbind(seq_along(feqGene1Num),feqGene1Num,feqGene2Num) + feqID=apply(res,1,function(x){ + rep(x[1],x[2]*x[3]) + }) + feqID=unlist(feqID) + + fgeneNames=paste(gene1list,gene2list,sep="-") + res=feqID + names(res)=fgeneNames + + #index from fge to feq + feqFgeMap=tapply(res,names(res),c) + feqFgeMap=lapply(feqFgeMap,function(x) as.integer(unlist(x))) + #length(feqFgeMap) # should be equal to the number of fge + #create fge + fge=data.frame(fgeID=c(1:length(feqFgeMap)), name12=names(feqFgeMap)) + + #index from feq to fge + matchID=match(names(res),as.character(fge$name12)) + names(res)=matchID + fgeFeqMap=tapply(names(res),res,c) + fgeFeqMap=lapply(fgeFeqMap,function(x) as.integer(unlist(x))) + length(fgeFeqMap) #shoud be equal to the number of feq + + + ##### add features to fge + res=lapply(as.character(fge$name12), function(x) unlist(strsplit(x,"-"))) + res=do.call(rbind,res) + colnames(res)=c("gene1","gene2") + fge=cbind(fge,res) + fge$name21=paste(fge$gene2,fge$gene1,sep="-") + + ########### + cat("\n Get the number of supporting reads") + ### find raw counts of fusion genes + fusionGene=fge + #sum-up all counts of feq + supportCount=rep(0,nrow(fge)) + for (feqID in 1:length(feq)){ + feqCount=feq[feqID] + gid=unlist(fgeFeqMap[feqID]) + supportCount[gid]=supportCount[gid]+feqCount + } + fusionGene$supportCount=supportCount + + ### estimate counts for fusion genes + cat("\n Correct the number of supporting reads") + alp0=rep(sum(feq)/nrow(fusionGene),nrow(fusionGene)) + alpOut=rep(0,nrow(fusionGene)) + + res=estimateCountEM(alp0,alpOut,feq,fgeFeqMap,itNum=1,alpDiff.thres=0.01) + + fusionGene$correctedCount=res$correctedCount + fusionGene=fusionGene[order(fusionGene$correctedCount,decreasing=TRUE),] + + ####### + cat("\n Keep only candidates in the selected chromosomes") + + cat("\n Extract other biological information...") + #add few biological information + allgenes=genes(anntxdb,single.strand.genes.only=FALSE) + allgenes.info=select(anntxdb, keys=names(allgenes), columns=c("TXCHROM","TXSTRAND"), keytype = "GENEID") + ##Check: it is sure that for GRCh37.75, none of geneID from 1-22,X,Y overlapping with the rest + #allgenes.info1=allgenes.info[allgenes.info$TXCHROM %in% chromRef,] + #allgenes.info2=allgenes.info[!allgenes.info$TXCHROM %in% chromRef,] + #sum(!is.na(match(allgenes.info2$GENEID, allgenes.info1$GENEID))) + + #keep only candidates from selected chromosomes in chromRef + allgenes.info=allgenes.info[allgenes.info$TXCHROM %in% chromRef,] + + #res=select(anntxdb, keys=as.character(fusionGene$gene1), columns=c("GENEID","TXCHROM","TXSTRAND"), keytype = "GENEID") + #ptm <- proc.time() + res=allgenes.info[match(as.character(fusionGene$gene1),allgenes.info$GENEID),c("GENEID","TXCHROM","TXSTRAND")] + #proc.time() - ptm + #select(anntxdb, keys=as.character(fusionGene$gene1), columns=c("GENEID","TXCHROM","TXSTRAND"), keytype = "GENEID") + colnames(res)=c("GENEID","chrom1","strand1") + fusionGene=cbind(fusionGene,res[,-1]) + #res=select(anntxdb, keys=as.character(fusionGene$gene2), columns=c("GENEID","TXCHROM","TXSTRAND"), keytype = "GENEID") + res=allgenes.info[match(as.character(fusionGene$gene2),allgenes.info$GENEID),c("GENEID","TXCHROM","TXSTRAND")] + colnames(res)=c("GENEID","chrom2","strand2") + fusionGene=cbind(fusionGene,res[,-1]) + + #do filter by chromosomes + fusionGene=chromFilter(fusionGene,chromRef=chromRef) + + + ##### generate features to detect fusion genes + cat("\n Extract extra features...") + #Fusion-genes sharing the same set of fusion equivalence classes + matchID=match(as.character(fusionGene$name12),names(feqFgeMap)) + feqDup=duplicated(feqFgeMap[matchID], fromLast=FALSE) + duplicated(feqFgeMap[matchID], fromLast=TRUE) + fusionGene$feqDup=feqDup + + #compute cpm + fusionGene$cpm=fusionGene$supportCount*10^6/libsize + #distance between two genes >= minGeneDist + geneDist=computeGeneDistance(fusionGene,anntxdb) + fusionGene$geneDist=geneDist + + #get reversed fusion (3-5) count + name21Count=computeReversedFusionCount(fusionGene) + fusionGene$name21Count=name21Count + + #compute dupGene + res=computeDupGene(fusionGene,dupGene.thres=-1) + fusionGene=cbind(fusionGene,res) + + res=list(fgeList=fusionGene,fge=fge,feq=feq,fgeFeqMap=fgeFeqMap,feqFgeMap=feqFgeMap,feqRaw=feqRaw) + return(res) + } \ No newline at end of file diff --git a/R/processMappedRead.R b/R/processMappedRead.R index 49864cc..2b3ba5c 100644 --- a/R/processMappedRead.R +++ b/R/processMappedRead.R @@ -2,7 +2,7 @@ ##### process mapped reads ############################################################ -processMappedRead <-function(inPath,geneAnno, anntxdb, geeqMap, FuSeq.params,feqInfo=NULL){ +processMappedRead <-function(inPath,geneAnno,anntxdb,geeqMap=NULL,FuSeq.params,feqInfo=NULL,doBioFilter=TRUE){ cat("\n ------------------------------------------------------------------") cat("\n Processing mapped reads (MR) from dataset: ",inPath, " read strands:", FuSeq.params$readStrands) @@ -14,7 +14,7 @@ processMappedRead <-function(inPath,geneAnno, anntxdb, geeqMap, FuSeq.params,feq } if (is.null(feqInfo)){ - feqInfo=processFEQ(inPath,geneAnno,anntxdb,geeqMap,readStrands=FuSeq.params$readStrands,chromRef=FuSeq.params$chromRef) + feqInfo=processFEQ(inPath,anntxdb,readStrands=FuSeq.params$readStrands,chromRef=FuSeq.params$chromRef) if (FuSeq.params$keepRData){ cat("\n Saving MR fusion candidates ...") save(feqInfo,file=paste(FuSeq.params$outputDir,"/FuSeq_MR_feqInfo.RData",sep="")) @@ -76,7 +76,7 @@ processMappedRead <-function(inPath,geneAnno, anntxdb, geeqMap, FuSeq.params,feq ### add gene types, keep only protein_coding genes later - dim(geneAnno) + #dim(geneAnno) matchID=match(myFusionFinal$gene1,geneAnno[,6]) res=geneAnno[matchID,] colnames(res)=paste(colnames(res),"1",sep="") @@ -128,7 +128,10 @@ processMappedRead <-function(inPath,geneAnno, anntxdb, geeqMap, FuSeq.params,feq feqRaw1=feqInfo$feqRaw[feqInfo$feqRaw$Read==1,] feqRaw2=feqInfo$feqRaw[feqInfo$feqRaw$Read==2,] - myfeqID=lapply(as.character(myFusionFinal$name12), function(mykey) feqInfo$feqFgeMap[[mykey]]) + #myfeqID=lapply(as.character(myFusionFinal$name12), function(mykey) feqInfo$feqFgeMap[[mykey]]) + matchID=match(as.character(myFusionFinal$name12),names(feqInfo$feqFgeMap)) + myfeqID=feqInfo$feqFgeMap[matchID] + myfeqIDSize=sapply(myfeqID, length) myfeqIDName=unlist(apply(cbind(as.character(myFusionFinal$name12),myfeqIDSize),1, function(x) rep(x[1],x[2]))) myfeqID=unlist(myfeqID) @@ -169,12 +172,14 @@ processMappedRead <-function(inPath,geneAnno, anntxdb, geeqMap, FuSeq.params,feq cat("\n The number of remaining fge candidates: ",nrow(myFusionFinal)) #do biological filters - cat("\n Filter by biological features... ") - bioFilter.res=doBiologicalFilter(myFusionFinal, chromRef=FuSeq.params$chromRef, onlyProteinCodingGenes=FuSeq.params$onlyProteinCodingGenes, doFilter=TRUE) - myFusionFinal=bioFilter.res - - cat("\n The number of remaining fge candidates: ",nrow(myFusionFinal)) + if (doBioFilter){ + cat("\n Filter by biological features... ") + bioFilter.res=doBiologicalFilter(myFusionFinal, chromRef=FuSeq.params$chromRef, onlyProteinCodingGenes=FuSeq.params$onlyProteinCodingGenes, doFilter=TRUE) + myFusionFinal=bioFilter.res + cat("\n The number of remaining fge candidates: ",nrow(myFusionFinal)) + } #sequence similarity between two genes + if (!is.null(geeqMap)){ cat("\n Filter by sequence similarity... ") seqHmlog=NULL for (i in 1:nrow(myFusionFinal)){ @@ -186,7 +191,7 @@ processMappedRead <-function(inPath,geneAnno, anntxdb, geeqMap, FuSeq.params,feq #do filter myFusionFinal=myFusionFinal[myFusionFinal$seqHmlog==0,] cat("\n The number of remaining fge candidates: ",nrow(myFusionFinal)) - + } #detect junction breaks cat("\n Detect junction breaks... ") diff --git a/R/processSplitRead.R b/R/processSplitRead.R index 52ffa05..022a07e 100644 --- a/R/processSplitRead.R +++ b/R/processSplitRead.R @@ -2,7 +2,7 @@ ##### process split reads ############################################################ -processSplitRead <-function(inPath,geneAnno, anntxdb, geeqMap, txFastaFile, FuSeq.params){ +processSplitRead <-function(inPath,geneAnno, anntxdb, txFastaFile, FuSeq.params){ cat("\n ------------------------------------------------------------------") cat("\n Processing split reads (SR) from dataset: ",inPath, " read strands:", FuSeq.params$readStrands) if (FuSeq.params$keepRData){ @@ -84,7 +84,7 @@ processSplitRead <-function(inPath,geneAnno, anntxdb, geeqMap, txFastaFile, FuSe fusionGene$adjtx12Count=adjtx12Count[match(as.character(fusionGene$tx12),names(adjtx12Count))] #gene distance - geneDist=computeGeneDistance(fusionGene,anntxdb,minGeneDist=-1) + geneDist=computeGeneDistance(fusionGene,anntxdb) fusionGene$geneDist=geneDist if (FuSeq.params$keepRData){ diff --git a/README.md b/README.md index 2d30594..38cf7e4 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ ##################################### -# Documents for FuSeq version 0.1.1 +# Documents for FuSeq version 1.0.0 ##################################### ## 1. Introduction @@ -15,32 +15,36 @@ Annotation reference: FuSeq requires a fasta file of transcript sequences and a The latest version and information of FuSeq are updated at https://github.com/nghiavtr/FuSeq +The older versions can be found here: +- Version 0.1.1: https://github.com/nghiavtr/FuSeq/releases/tag/v0.1.1 +- Version 0.1.0: https://github.com/nghiavtr/FuSeq/releases/tag/v0.1.0 + ## 2. Download and installation If you use the binary verion of FuSeq: -- Download the lastest binary version from FuSeq website: [FuSeq_v0.1.1_linux_x86-64](https://github.com/nghiavtr/FuSeq/releases/download/v0.1.1/FuSeq_v0.1.1_linux_x86-64.tar.gz) +- Download the lastest binary version from FuSeq website: [FuSeq_v1.0.0_linux_x86-64](https://github.com/nghiavtr/FuSeq/releases/download/v1.0.0/FuSeq_v1.0.0_linux_x86-64.tar.gz) ```sh -wget https://github.com/nghiavtr/FuSeq/releases/download/v0.1.1/FuSeq_v0.1.1_linux_x86-64.tar.gz -O FuSeq_v0.1.1_linux_x86-64.tar.gz +wget https://github.com/nghiavtr/FuSeq/releases/download/v1.0.0/FuSeq_v1.0.0_linux_x86-64.tar.gz -O FuSeq_v1.0.0_linux_x86-64.tar.gz ``` - Uncompress to folder ```sh -tar -xzvf FuSeq_v0.1.1_linux_x86-64.tar.gz +tar -xzvf FuSeq_v1.0.0_linux_x86-64.tar.gz ``` - Move to the *FuSeq_home* directory and do configuration for FuSeq ```sh -cd FuSeq_v0.1.1_linux_x86-64 +cd FuSeq_v1.0.0_linux_x86-64 bash configure.sh ``` - Add paths of lib folder and bin folder to LD_LIBRARY_PATH and PATH ```sh -export LD_LIBRARY_PATH=/path/to/FuSeq_v0.1.1_linux_x86-64/linux/lib:$LD_LIBRARY_PATH -export PATH=/path/to/FuSeq_v0.1.1_linux_x86-64/linux/bin:$PATH +export LD_LIBRARY_PATH=/path/to/FuSeq_v1.0.0_linux_x86-64/linux/lib:$LD_LIBRARY_PATH +export PATH=/path/to/FuSeq_v1.0.0_linux_x86-64/linux/bin:$PATH ``` If you want to build FuSeq from sources: - Download FuSeq from [FuSeq website](https://github.com/nghiavtr/FuSeq) and move to *FuSeq_home* directory ```sh -wget https://github.com/nghiavtr/FuSeq/archive/v0.1.1.tar.gz -tar -xzvf v0.1.1.tar.gz -cd FuSeq-0.1.1 +wget https://github.com/nghiavtr/FuSeq/archive/v1.0.0.tar.gz +tar -xzvf v1.0.0.tar.gz +cd FuSeq-1.0.0 ``` - FuSeq requires information of flags from Sailfish including DFETCH_BOOST, DBOOST_ROOT, DTBB_INSTALL_DIR and DCMAKE_INSTALL_PREFIX. Please refer to the Sailfish website for more details of these flags. - Do installation by the following command: @@ -102,6 +106,8 @@ Fusion genes discovered by FuSeq are stored in a file named *fusions.fuseq* cont - fusionName: names of fusion genes - supportRead: the number of reads supporting the fusion genes - score: score of each fusion genes from FuSeq +- HGNC5 and HGNC3: the HGNC symbol of gene5 and gene3, respectively +- SR.passed and MR.passed: the number of reads passed the SR and MR pipeline, respectively - info: additional information of the fusion gene, for example constituent genes involve in mitochondrial translation, cytosolic ribosomal subunit and ribonucleoprotein. ##### - If exportFasta=TRUE is set in the *params_file*, FuSeq will export fusion reads supporting fusion genes in two fasta files named *_fusionReads_1.fa and *_fusionReads_2.fa for read1 and read2, respectively. @@ -114,7 +120,7 @@ There are several parameters input for the pipeline. The default parameter setti - maxSharedCount (5e-2): the maximum proportion of sharing counts of a fusion gene. - minGeneDist (1e5): the minimum distance between two constituent genes. - minJunctionDist (1e5): the minimum junction distance of fusion gene. -- maxInvertedFusionCount (0.01): the proportion of expression of inverted fusion. +- maxInvertedFusionCount (0): the proportion of expression of inverted fusion. If this value is zero (maxInvertedFusionCount=0 by default), we accept both fusion gene A-B and its inverted fusion gene B-A. If, for example, we set maxInvertedFusionCount=0.01, we accept the read count of B-A is at most 1% of the read count of A-B. - maxMRfusionFc (2), maxMRfusionNum (2) and sgtMRcount (10): the parameter settings for fitlering multiple-fusion genes in the mapped read (MR) pipeline. - minMR (2), minNonDupMR (2): the minimum supporting count (non-duplicated supporting count) of a fusion gene in the mapped read pipeline. - minSR (1): the minimum supporting count of a fusion gene in the split read (SR) pipeline. @@ -132,16 +138,16 @@ For simplicity, in this practice, the FuSeq software, the annotation, RNA-seq da ### 7.1. Download and install #### Download and configure FuSeq ```sh -wget https://github.com/nghiavtr/FuSeq/releases/download/v0.1.1/FuSeq_v0.1.1_linux_x86-64.tar.gz -O FuSeq_v0.1.1_linux_x86-64.tar.gz -tar -xzvf FuSeq_v0.1.1_linux_x86-64.tar.gz -cd FuSeq_v0.1.1_linux_x86-64 +wget https://github.com/nghiavtr/FuSeq/releases/download/v1.0.0/FuSeq_v1.0.0_linux_x86-64.tar.gz -O FuSeq_v1.0.0_linux_x86-64.tar.gz +tar -xzvf FuSeq_v1.0.0_linux_x86-64.tar.gz +cd FuSeq_v1.0.0_linux_x86-64 bash configure.sh cd .. ``` #### Set paths to FuSeq ```sh -export LD_LIBRARY_PATH=$PWD/FuSeq_v0.1.1_linux_x86-64/linux/lib:$LD_LIBRARY_PATH -export PATH=$PWD/FuSeq_v0.1.1_linux_x86-64/linux/bin:$PATH +export LD_LIBRARY_PATH=$PWD/FuSeq_v1.0.0_linux_x86-64/linux/lib:$LD_LIBRARY_PATH +export PATH=$PWD/FuSeq_v1.0.0_linux_x86-64/linux/bin:$PATH ``` ### 7.2. Download and prepare the reference files #### Download the fasta and gtf of transcripts @@ -153,14 +159,14 @@ gunzip Homo_sapiens.GRCh37.75.gtf.gz ``` #### Create sqlite ```sh -Rscript FuSeq_v0.1.1_linux_x86-64/R/createSqlite.R Homo_sapiens.GRCh37.75.gtf Homo_sapiens.GRCh37.75.sqlite +Rscript FuSeq_v1.0.0_linux_x86-64/R/createSqlite.R Homo_sapiens.GRCh37.75.gtf Homo_sapiens.GRCh37.75.sqlite ``` #### Download the extra transcript information and annotation from FuSeq ```sh wget https://github.com/nghiavtr/FuSeq/releases/download/v0.1.0/Homo_sapiens.GRCh37.75.txAnno.RData ``` ### 7.3. Parameter setting -The default of parameter setting is located at FuSeq_v0.1.1_linux_x86-64/R/params.txt that we will use for the pratical examples. +The default of parameter setting is located at FuSeq_v1.0.0_linux_x86-64/R/params.txt that we will use for the pratical examples. For more advanced-level users, we suggest running FuSeq with the setting of keepRData=TRUE to keep the processed data of FuSeq, then FuSeq will save all data into file FuSeq_process.RData. This file contains the results of both mapped read pipeline and split read pipeline, and extra relevant information of fusion gene candidates such as supporting exons, read mapping positions, sequence reads, etc. ### 7.4. An example for a short read sample @@ -182,9 +188,9 @@ FuSeq -i TxIndexer_idx_k21 -l IU -1 <(gunzip -c SRR064287_1.fastq.gz) -2 <(gunzi ``` #### Discover fusion genes ```sh -Rscript FuSeq_v0.1.1_linux_x86-64/R/FuSeq.R in=SRR064287_feqDir txfasta=Homo_sapiens.GRCh37.75.cdna.all.fa sqlite=Homo_sapiens.GRCh37.75.sqlite txanno=Homo_sapiens.GRCh37.75.txAnno.RData out=SRR064287_FuseqOut params=FuSeq_v0.1.1_linux_x86-64/R/params.txt +Rscript FuSeq_v1.0.0_linux_x86-64/R/FuSeq.R in=SRR064287_feqDir txfasta=Homo_sapiens.GRCh37.75.cdna.all.fa sqlite=Homo_sapiens.GRCh37.75.sqlite txanno=Homo_sapiens.GRCh37.75.txAnno.RData out=SRR064287_FuseqOut params=FuSeq_v1.0.0_linux_x86-64/R/params.txt ``` -The results is a list of fusion gene candidates stored in file fusions.FuSeq in the output folder (SRR064287_FuseqOut). +The results is a list of gene-fusion candidates stored in file fusions.FuSeq in the output folder (SRR064287_FuseqOut). ### 7.5. An example for a long read sample Now we apply FuSeq for a long read sample from a glioma dataset. @@ -209,7 +215,7 @@ FuSeq -i TxIndexer_idx_k31 -l IU -1 <(gunzip -c SRR934746_1.fastq.gz) -2 <(gunzi ``` #### Discover fusion genes ```sh -Rscript FuSeq_v0.1.1_linux_x86-64/R/FuSeq.R in=SRR934746_feqDir txfasta=Homo_sapiens.GRCh37.75.cdna.all.fa sqlite=Homo_sapiens.GRCh37.75.sqlite txanno=Homo_sapiens.GRCh37.75.txAnno.RData out=SRR934746_FuseqOut params=FuSeq_v0.1.1_linux_x86-64/R/params.txt +Rscript FuSeq_v1.0.0_linux_x86-64/R/FuSeq.R in=SRR934746_feqDir txfasta=Homo_sapiens.GRCh37.75.cdna.all.fa sqlite=Homo_sapiens.GRCh37.75.sqlite txanno=Homo_sapiens.GRCh37.75.txAnno.RData out=SRR934746_FuseqOut params=FuSeq_v1.0.0_linux_x86-64/R/params.txt ``` The results is a list of fusion gene candidates stored in file fusions.FuSeq in the output folder (SRR934746_FuseqOut). ## 8. License @@ -217,3 +223,5 @@ FuSeq uses GNU General Public License GPL-3 ## 9. References (update later) + +