Skip to content

Commit

Permalink
Fix a bug for extracing reference sequences when taking reverse-compl…
Browse files Browse the repository at this point in the history
…ement. Add ref names when extending. Ignore alignment if the overlap is low complex. Handle character N
  • Loading branch information
mourisl committed Jan 2, 2019
1 parent 7dbacfe commit 4efc622
Show file tree
Hide file tree
Showing 5 changed files with 133 additions and 8 deletions.
2 changes: 1 addition & 1 deletion BuildDatabaseFa.pl
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ sub OutputGene
if ( $strand eq "-" )
{
$tmp = reverse( $tmp ) ;
$tmp =~ tr/ACGT/TCGA/ ;
$tmp =~ tr/ACGT/TGCA/ ;
}
$output .= $tmp ;
}
Expand Down
3 changes: 2 additions & 1 deletion KmerCode.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,8 @@ class KmerCode
code = ( ( code << 2ull ) & mask ) |
( (uint64_t)( nucToNum[ c - 'A' ] & 3 ) ) ;

if ( nucToNum[c - 'A'] == -1 )
//if ( nucToNum[c - 'A'] == -1 )
if ( c == 'N' )
{
invalidPos = 0 ;
}
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ bam-extractor: BamExtractor.o
fi ;
$(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) $< $(LINKFLAGS) -lbam

main.o: main.cpp AlignAlgo.hpp ReadFiles.hpp kseq.h SeqSet.hpp KmerIndex.hpp SimpleVector.hpp defs.h StatsTests.h poa.hpp
main.o: main.cpp AlignAlgo.hpp ReadFiles.hpp kseq.h SeqSet.hpp KmerIndex.hpp SimpleVector.hpp defs.h StatsTests.h poa.hpp KmerCode.hpp
BamExtractor.o: BamExtractor.cpp alignments.hpp
#Alignment.o: Alignment.cpp Alignment.h SimpleVector.h defs.h StatsTests.h KmerTree.h ReadSet.h KmerIndex.h poa.h

Expand Down
121 changes: 117 additions & 4 deletions SeqSet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,26 @@ class SeqSet
}
}


bool IsOverlapLowComplex( char *r, struct _overlap &o )
{
int cnt[4] = {0, 0, 0, 0} ;
int i ;
for ( i = o.readStart ; i <= o.readEnd ; ++i )
{
if ( r[i] == 'N' )
continue ;
++cnt[ nucToNum[ r[i] - 'A' ] ] ;
}
int len = o.readEnd - o.readStart + 1 ;
int lowCnt = 0 ;
for ( i = 0 ; i < 4 ; ++i )
if ( cnt[i] <= 2 )
++lowCnt ;
if ( lowCnt >= 2 )
return true ;
return false ;
}
public:
SeqSet( int kl )
{
Expand Down Expand Up @@ -692,6 +712,9 @@ class SeqSet
overlaps[i].readEnd - overlaps[i].readStart + 1 ) ;
else
overlaps[i].similarity = 0 ;

if ( IsOverlapLowComplex( r, overlaps[i]) )
overlaps[i].similarity = 0 ;
} // for i
delete[] rcRead ;

Expand Down Expand Up @@ -930,10 +953,10 @@ class SeqSet
int newSeqIdx = extendedOverlaps[0].seqIdx ;
SimpleVector<struct _posWeight> &posWeight = seqs[newSeqIdx].posWeight ;
posWeight.ExpandTo( newConsensusLen ) ;
posWeight.SetZero( 0, newConsensusLen ) ;
posWeight.SetZero( seqs[newSeqIdx].consensusLen, newConsensusLen - seqs[ newSeqIdx ].consensusLen ) ;


for ( i = 0 ; i < eOverlapCnt ; ++i )
for ( i = 1 ; i < eOverlapCnt ; ++i )
{
// Though not the most efficient implementation, it seems very straightforward.
int seqIdx = extendedOverlaps[i].seqIdx ;
Expand Down Expand Up @@ -1002,20 +1025,22 @@ class SeqSet
*/

// Update the name.
// TODO: use array of names.
sum = 0 ;
for ( i = 0 ; i < eOverlapCnt ; ++i )
sum += strlen( seqs[ extendedOverlaps[i].seqIdx ].name ) ;
char* nameBuffer = new char[sum + eOverlapCnt + 1 ] ;

strcpy( nameBuffer, seqs[ newSeqIdx ].name ) ;
sum = strlen( nameBuffer ) ;
//printf( "%d\n", seqs.size() ) ;
for ( i = 1 ; i < eOverlapCnt ; ++i )
{
if ( strcmp( seqs[ extendedOverlaps[i].seqIdx ].name, seqs[ extendedOverlaps[i - 1].seqIdx ].name ) )
{
nameBuffer[ sum ] = '+' ;
nameBuffer[ sum + 1 ] = '\0' ;
strcat( nameBuffer + sum + 1, seqs[ extendedOverlaps[i].seqIdx ].name ) ;
strcpy( nameBuffer + sum + 1, seqs[ extendedOverlaps[i].seqIdx ].name ) ;
sum = sum + 1 + strlen( seqs[ extendedOverlaps[i].seqIdx ].name ) ;
}
}
Expand Down Expand Up @@ -1113,6 +1138,41 @@ class SeqSet
seq.posWeight.SetZero( start, len - extendedOverlaps[0].readEnd - 1 ) ;
}

// Adjust the name.
// Find the possible ref seq
int refIdx = -1 ;
for ( i = 0 ; i < overlapCnt ; ++i )
{
if ( !seqs[ overlaps[i].seqIdx ].isRef )
continue ;
// Use refIdx as the idx in the overlaps list.
if ( refIdx == -1 ||
( overlaps[i].readEnd - overlaps[i].readStart >
overlaps[refIdx].readEnd - overlaps[refIdx].readStart ) )
{
refIdx = i ;
}
}
if ( refIdx != -1 )
{
// Use refIdx as the idx in the seqs
refIdx = overlaps[ refIdx ].seqIdx ;
if ( strstr( seq.name, seqs[ refIdx ].name ) == NULL )
{
char *nameBuffer = new char[ strlen( seqs[ refIdx ].name) + strlen( seq.name ) + 2 ] ;
if ( extendedOverlaps[0].seqStart < seq.consensusLen / 2 )
{
sprintf( nameBuffer, "%s+%s", seqs[refIdx].name, seq.name ) ;
}
else
{
sprintf( nameBuffer, "%s+%s", seq.name, seqs[refIdx].name ) ;
}
free( seq.name ) ;
seq.name = strdup( nameBuffer ) ;
delete[] nameBuffer ;
}
}
// either one of the ends of read or seq should be 0.
readInConsensusOffset = 0 ;
if ( extendedOverlaps[0].seqStart > 0 )
Expand All @@ -1133,8 +1193,42 @@ class SeqSet
{
struct _seqWrapper &seq = seqs[seqIdx] ;
//printf( "%d %d. %d %d\n%s\n%s\n", seqIdx, seq.posWeight.Size(), readInConsensusOffset, len, seq.consensus, r) ;
SimpleVector<int> nPos ;
for ( i = 0 ; i < len ; ++i )
{
if ( r[i] == 'N' )
continue ;
++seq.posWeight[i + readInConsensusOffset].count[ nucToNum[ r[i] - 'A' ] ] ;

if ( seq.consensus[i + readInConsensusOffset ] == 'N' )
{
nPos.PushBack( i ) ;
}
}

int size = nPos.Size() ;
for ( i = 0 ; i < size ; )
{
for ( j = i + 1 ; j < size ; ++j )
if ( nPos[j] > nPos[j - 1] + kmerLength - 1 )
break ;

// [i,j) holds the N positions that are with kmer-length size.
int l ;
// Update the consensus
for ( l = i ; l < j ; ++l )
seq.consensus[ nPos[l] + readInConsensusOffset ] = r[ nPos[l] ] ;
// Update the index
KmerCode kmerCode( kmerLength ) ;
int start = nPos[i] - kmerLength + 1 + readInConsensusOffset ;
if ( start < 0 )
start = 0 ;
int end = nPos[j - 1] + kmerLength - 1 + readInConsensusOffset ;
if ( end >= seq.consensusLen )
end = seq.consensusLen - 1 ;
seqIndex.BuildIndexFromRead( kmerCode, seq.consensus + start, end - start + 1, seqIdx, start ) ;
i = j ;
}
}

free( rcRead ) ;
Expand Down Expand Up @@ -1206,18 +1300,37 @@ class SeqSet
return ret ;
}

void Output()
void ResetPosWeight()
{
int i ;
int seqCnt = seqs.size() ;
for ( i = 0 ; i < seqCnt ; ++i )
{
int size = seqs[i].posWeight.Size() ;
seqs[i].posWeight.SetZero( 0, size ) ;
}
}

void Output()
{
int i, j, k ;
int size = seqs.size() ;
for ( i = 0 ; i < size ; ++i )
{
if ( seqs[i].isRef || seqs[i].consensus == NULL )
continue ;

printf( ">%s\n%s\n", seqs[i].name, seqs[i].consensus ) ;

for ( k = 0 ; k < 4 ; ++k )
{
for ( j = 0 ; j < seqs[i].consensusLen ; ++j )
printf( "%d ", seqs[i].posWeight[j].count[k] ) ;
printf( "\n" ) ;
}
}
}

} ;


Expand Down
13 changes: 12 additions & 1 deletion main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,14 +80,25 @@ int main( int argc, char *argv[] )
return EXIT_FAILURE ;
}

/*while ( reads.Next() )
{
printf( "%s %s\n", reads.id, reads.seq ) ;
fflush( stdout ) ;
seqSet.AddRead( reads.seq ) ;
printf( "done\n" ) ;
}*/

// Go through the second round.
// TODO: user-defined number of rounds.
seqSet.ResetPosWeight() ;
reads.Rewind() ;
while ( reads.Next() )
{
printf( "%s %s\n", reads.id, reads.seq ) ;
fflush( stdout ) ;
seqSet.AddRead( reads.seq ) ;
printf( "done\n" ) ;
}

seqSet.Output() ;
return 0 ;
}

0 comments on commit 4efc622

Please sign in to comment.