Skip to content

Commit

Permalink
Remove the assumption of read length.
Browse files Browse the repository at this point in the history
  • Loading branch information
mourisl committed Nov 29, 2022
1 parent 69d65f5 commit 50da4f5
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 40 deletions.
54 changes: 26 additions & 28 deletions Annotator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "KmerCount.hpp"
#include "SeqSet.hpp"
#include "AlignAlgo.hpp"
#include "FileLineReader.hpp"

char usage[] = "./annotator [OPTIONS]:\n"
"Required:\n"
Expand Down Expand Up @@ -41,8 +42,6 @@ char numToNuc[26] = {'A', 'C', 'G', 'T'} ;

char buffer[10241] = "" ;
char bufferHeader[10241] = "" ;
char weightBuffer[100241] = "" ;
char seq[10241] = "" ;

static const char *short_options = "f:a:r:p:o:t:" ;
static struct option long_options[] = {
Expand Down Expand Up @@ -346,7 +345,7 @@ void *AnnotateReads_Thread( void *pArg )
{
if ( arg.needRC )
{
int strand = arg.refSet->HasHitInSet( arg.seqSet->GetSeqConsensus(i), buffer) ;
int strand = arg.refSet->HasHitInSet( arg.seqSet->GetSeqConsensus(i)) ;
if ( strand == -1 )
arg.seqSet->ReverseComplementInSeqSet( i ) ;
}
Expand Down Expand Up @@ -397,14 +396,14 @@ int main( int argc, char *argv[] )
SeqSet refSet( 7 ) ;
SeqSet seqSet( 17 ) ;
int c, option_index ;
FILE *fpAssembly = NULL ;
FileLineReader flrAssembly ;
FILE *fpReadAssignment = NULL ;
struct _overlap geneOverlap[4] ;
struct _overlap cdr[3] ; // the coordinate for cdr1,2,3
option_index = 0 ;
bool ignoreWeight = false ;
char outputPrefix[1024] = "trust" ;
FILE *fpReads = NULL ;
FileLineReader flrReads ;
int threadCnt = 1 ;
bool includePartial = true ;
bool isIMGT = true ;
Expand Down Expand Up @@ -432,11 +431,11 @@ int main( int argc, char *argv[] )
}
else if ( c == 'a' )
{
fpAssembly = fopen( optarg, "r" ) ;
flrAssembly.Open( optarg ) ;
}
else if ( c == 'r' )
{
fpReads = fopen( optarg, "r" ) ;
flrReads.Open(optarg) ;
}
else if ( c == 'o' )
{
Expand Down Expand Up @@ -517,7 +516,7 @@ int main( int argc, char *argv[] )
return EXIT_FAILURE ;
}

if ( fpAssembly == NULL )
if ( !flrAssembly.IsOpen() )
{
fprintf( stderr, "Need to use -a to specify the assembly file.\n" ) ;
return EXIT_FAILURE ;
Expand All @@ -526,32 +525,30 @@ int main( int argc, char *argv[] )
refSet.SetHitLenRequired( 17 ) ;
refSet.SetRadius( radius ) ;
PrintLog( "Start to annotate assemblies." ) ;
while ( fgets( buffer, sizeof( buffer ), fpAssembly ) != NULL )
while ( flrAssembly.ReadLine() != NULL )
{
strcpy(buffer, flrAssembly.GetLinePtr()) ;
if ( (format != 2 && buffer[0] != '>')
|| (format == 2 && buffer[0] != '@') )
{
printf( "%s", buffer ) ;
continue ;
}

fgets( seq, sizeof( seq ), fpAssembly ) ;

flrAssembly.ReadLine() ;
char *seq = strdup( flrAssembly.GetLinePtr() ) ;
int len = strlen( seq ) ;
if ( seq[len - 1] == '\n' )
{
seq[len - 1] = '\0' ;
--len ;
}

// Read in the four line of pos weight
SimpleVector<struct _posWeight> posWeight ;
double depthSum = 0 ;
if ( !ignoreWeight )
{
posWeight.ExpandTo( strlen( seq ) ) ;
posWeight.ExpandTo( len ) ;
for ( k = 0 ; k < 4 ; ++k )
{
fgets( weightBuffer, sizeof( weightBuffer ), fpAssembly ) ;
flrAssembly.ReadLine() ;
const char *weightBuffer = flrAssembly.GetLinePtr() ;

int num = 0 ;
i = 0 ;
Expand Down Expand Up @@ -581,14 +578,15 @@ int main( int argc, char *argv[] )
}
else
seqSet.InputNovelRead( buffer + 1, seq, 1, -1 ) ;
free(seq) ;

if (format == 2) // fastq
{
fgets(buffer, sizeof(buffer), fpAssembly) ;
fgets(buffer, sizeof(buffer), fpAssembly) ;
flrAssembly.ReadLine() ;
flrAssembly.ReadLine() ;
}
}
fclose( fpAssembly ) ;
flrAssembly.Close() ;

if ( hasBarcode )
seqSet.SetBarcodeFromSeqName( barcodeStrToInt ) ;
Expand All @@ -601,7 +599,7 @@ int main( int argc, char *argv[] )
{
if ( needRC )
{
int strand = refSet.HasHitInSet( seqSet.GetSeqConsensus(i), buffer ) ;
int strand = refSet.HasHitInSet( seqSet.GetSeqConsensus(i) ) ;
if ( strand == -1 )
seqSet.ReverseComplementInSeqSet( i ) ;
}
Expand Down Expand Up @@ -681,7 +679,7 @@ int main( int argc, char *argv[] )
}

// Output more CDR3 information
if ( fpReads != NULL )
if ( flrReads.IsOpen() )
{
struct _overlap assign ;
std::vector< std::vector<struct _CDR3info> > cdr3Infos ;
Expand All @@ -696,12 +694,11 @@ int main( int argc, char *argv[] )
int assembledReadCnt ;

//while ( fscanf( fpReads, "%s %d %d %d", buffer, &strand, &minCnt, &medCnt ) != EOF )
while ( fgets( bufferHeader, sizeof( bufferHeader ), fpReads ) != NULL )
while ( flrReads.ReadLine() != NULL )
{
strcpy(bufferHeader, flrReads.GetLinePtr()) ;
sscanf( bufferHeader, "%s %d %d %d", buffer, &strand, &minCnt, &medCnt ) ;
//fscanf( fpReads, "%s", seq ) ;
fgets( seq, sizeof(seq), fpReads ) ;
seq[strlen(seq) - 1] = '\0' ;

struct _assignRead nr ;
nr.barcode = -1 ;
Expand Down Expand Up @@ -733,7 +730,8 @@ int main( int argc, char *argv[] )
}

nr.id = strdup( buffer + 1 ) ; // skip the > sign
nr.read = strdup( seq ) ;
flrReads.ReadLine() ;
nr.read = strdup( flrReads.GetLinePtr() ) ;
nr.overlap.strand = strand ;
assembledReads.push_back( nr ) ;
}
Expand Down Expand Up @@ -1075,7 +1073,7 @@ int main( int argc, char *argv[] )
}

fclose( fpOutput ) ;
fclose( fpReads ) ;
flrReads.Close() ;
}
else if ( outputCDR3File ) // force output cdr3 file
{
Expand Down
14 changes: 7 additions & 7 deletions BamExtractor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ void *ProcessUnmappedReads_Thread( void *pArg )
{
// single-end
if ( !IsLowComplexity( arg.candidates[i].mate1 )
&& info.refSet->HasHitInSet( arg.candidates[i].mate1, info.seqBuffer ) != 0)
&& info.refSet->HasHitInSet( arg.candidates[i].mate1 ) != 0)
{
pick.PushBack( i ) ;
}
Expand All @@ -221,8 +221,8 @@ void *ProcessUnmappedReads_Thread( void *pArg )
{
// paired-end
if ( ( !IsLowComplexity( arg.candidates[i].mate1 ) && !IsLowComplexity( arg.candidates[i].mate2 ) )
&& ( info.refSet->HasHitInSet( arg.candidates[i].mate1, info.seqBuffer ) != 0
|| info.refSet->HasHitInSet( arg.candidates[i].mate2, info.seqBuffer ) != 0 ) )
&& ( info.refSet->HasHitInSet( arg.candidates[i].mate1 ) != 0
|| info.refSet->HasHitInSet( arg.candidates[i].mate2 ) != 0 ) )
{
pick.PushBack( i ) ;
}
Expand Down Expand Up @@ -666,8 +666,8 @@ int main( int argc, char *argv[] )
if ( threadCnt == 1 )
{
if ( ( !IsLowComplexity( buffer2 ) && !IsLowComplexity( buffer ) ) &&
( refSet.HasHitInSet( buffer2, seqBuffer ) != 0 ||
refSet.HasHitInSet( buffer, seqBuffer ) != 0 ) )
( refSet.HasHitInSet( buffer2 ) != 0 ||
refSet.HasHitInSet( buffer ) != 0 ) )
{
if ( !alignments.IsFirstMate() )
{
Expand Down Expand Up @@ -725,7 +725,7 @@ int main( int argc, char *argv[] )
// reads from alternative chromosomes, or the unmapped flag is not set appropriately
alignments.GetReadSeq( buffer ) ;
alignments.GetQual( bufferQual ) ;
if ( !IsLowComplexity( buffer ) && refSet.HasHitInSet( buffer, seqBuffer ) != 0)
if ( !IsLowComplexity( buffer ) && refSet.HasHitInSet( buffer ) != 0)
{
std::string name( alignments.GetReadId() ) ;
TrimName( name, mateIdLen ) ;
Expand Down Expand Up @@ -754,7 +754,7 @@ int main( int argc, char *argv[] )
continue ;

}
if ( !IsLowComplexity( buffer ) && refSet.HasHitInSet( buffer, seqBuffer ) != 0)
if ( !IsLowComplexity( buffer ) && refSet.HasHitInSet( buffer ) != 0)
{
//alignments.GetReadSeq( buffer ) ;
// No need to trim read id for single-end data.
Expand Down
2 changes: 1 addition & 1 deletion FastqExtractor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ bool IsLowComplexity( char *seq )

int IsGoodCandidate( char *read, char *buffer, SeqSet *refSet )
{
if ( !IsLowComplexity( read ) && refSet->HasHitInSet( read, buffer ) != 0)
if ( !IsLowComplexity( read ) && refSet->HasHitInSet( read ) != 0)
return 1 ;
return 0 ;
}
Expand Down
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
CXX = g++
CXXFLAGS= -O3 #-pg #-Wall #-O3
CXXFLAGS= -O3 -Wall -g #-pg -g #-Wall #-O3
LINKPATH= -I./samtools-0.1.19 -L./samtools-0.1.19
LINKFLAGS = -lpthread -lz
DEBUG=
Expand Down Expand Up @@ -32,7 +32,7 @@ annotator: Annotator.o
main.o: main.cpp AlignAlgo.hpp ReadFiles.hpp kseq.h SeqSet.hpp KmerIndex.hpp SimpleVector.hpp defs.h KmerCode.hpp KmerCount.hpp
BamExtractor.o: BamExtractor.cpp alignments.hpp defs.h SeqSet.hpp
FastqExtractor.o: FastqExtractor.cpp ReadFiles.hpp defs.h SeqSet.hpp BarcodeCorrector.hpp SimpleVector.hpp
Annotator.o: Annotator.cpp AlignAlgo.hpp ReadFiles.hpp kseq.h SeqSet.hpp KmerIndex.hpp SimpleVector.hpp defs.h KmerCode.hpp KmerCount.hpp
Annotator.o: Annotator.cpp AlignAlgo.hpp ReadFiles.hpp kseq.h SeqSet.hpp KmerIndex.hpp SimpleVector.hpp defs.h KmerCode.hpp KmerCount.hpp FileLineReader.hpp
#Alignment.o: Alignment.cpp Alignment.h SimpleVector.h defs.h StatsTests.h KmerTree.h ReadSet.h KmerIndex.h poa.h

clean:
Expand Down
5 changes: 3 additions & 2 deletions SeqSet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2618,15 +2618,16 @@ class SeqSet

// Test whether the read share a kmer hit on the seqs.
// 0-no hit. Other wise return the strand
int HasHitInSet( char *read, char *rcRead )
int HasHitInSet( char *read )
{
int i, j, k ;
int len = strlen( read ) ;
if ( len < kmerLength )
return 0 ;

char *rcRead = strdup(read) ;
SimpleVector<struct _hit> hits ;
GetHitsFromRead( read, rcRead, 0, -1, false, hits, NULL ) ;
free(rcRead) ;

int hitCnt = hits.Size() ;
if ( hitCnt == 0 )
Expand Down

0 comments on commit 50da4f5

Please sign in to comment.