Skip to content

Commit

Permalink
Annotate D gene. Fix a bug that may unnecessarily filter CDR3s.
Browse files Browse the repository at this point in the history
  • Loading branch information
mourisl committed Oct 18, 2019
1 parent 5bd6537 commit 1f2cfde
Show file tree
Hide file tree
Showing 4 changed files with 257 additions and 27 deletions.
97 changes: 97 additions & 0 deletions AlignAlgo.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
#define SCORE_GAPEXTEND (-1)
#define SCORE_INDEL (-4)

#define SCORE_MATCH_LOCAL 1
#define SCORE_MISMATCH_LOCAL (-2)

struct _posWeight
{
int count[4] ;
Expand Down Expand Up @@ -870,6 +873,99 @@ class AlignAlgo
//printf( "%d\n", score[lenp * bmax + lent] ) ;
return score[lenp * bmax + lent] ;
}

static int LocalAlignment( char *t, int lent, char *p, int lenp, int &tstart, int &pstart, char *align )
{
int i, j ;
int *m = new int[ ( lenp + 1 ) * ( lent + 1 ) ] ;
int bmax = ( lent + 1 ) ;
for ( i = 0 ; i <= lenp ; ++i )
m[i * bmax + 0] = 0 ;
for ( j = 0 ; j <= lent ; ++j )
m[0 * bmax + j] = 0 ;

tstart = 0 ;
pstart = 0 ;
for ( i = 1 ; i <= lenp ; ++i )
{
for ( j = 1 ; j <= lent ; ++j )
{
int score ;
score = m[ ( i - 1 ) * bmax + j - 1] + ( t[j - 1] == p[i - 1] ? SCORE_MATCH_LOCAL : SCORE_MISMATCH_LOCAL ) ;
score = MAX( score, m[ i * bmax + ( j - 1 ) ] + SCORE_INDEL ) ;
score = MAX( score, m[ ( i - 1 ) * bmax + j ] + SCORE_INDEL ) ;
if ( score < 0 )
score = 0 ;
m[i * bmax + j ] = score ;
}
}

int tagi = lenp, tagj = lent ;
int maxScore = 0 ;
for ( i = 0 ; i <= lenp ; ++i )
for ( j = 0 ; j <= lent ; ++j )
if ( m[i * bmax + j] >= maxScore )
{
maxScore = m[i * bmax + j] ;
tagi = i ;
tagj = j ;
}
if ( maxScore == 0 )
{
delete[] m ;
return -1 ;
}
int tag = 0 ;
while ( tagi > 0 || tagj > 0 )
{
int max = m[tagi * bmax + tagj] ;
int a = 0 ;
if ( max == 0 )
{
tstart = tagj ;
pstart = tagi ;
break ;
}
if ( tagj > 0 && m[tagi * bmax + tagj - 1] + SCORE_INDEL == max )
a = EDIT_DELETE ;
if ( tagi > 0 && m[ ( tagi - 1 ) * bmax + tagj] + SCORE_INDEL == max )
a = EDIT_INSERT ;
if ( tagj > 0 && tagi > 0 )
{
int diff = ( t[ tagj - 1] == p[tagi - 1] ? SCORE_MATCH_LOCAL : SCORE_MISMATCH_LOCAL ) ;
if ( m[ ( tagi - 1) * bmax + tagj - 1 ] + diff == max )
{
if ( diff == SCORE_MATCH_LOCAL )
a = EDIT_MATCH ;
else
a = EDIT_MISMATCH ;
}
}

align[ tag ] = a ;
++tag ;
if ( a == EDIT_DELETE )
--tagj ;
else if ( a == EDIT_INSERT )
--tagi ;
else
{
--tagi ;
--tagj ;
}
}

align[tag] = -1 ;
for ( i = 0, j = tag - 1 ; i < j ; ++i, --j )
{
char tmp = align[i] ;
align[i] = align[j] ;
align[j] = tmp ;
}
delete[] m ;
//printf( "%d\n", score[lenp * bmax + lent] ) ;
return maxScore ;
}

static void VisualizeAlignment( char *t, int lent, char *p, int lenp, char *align )
{
Expand Down Expand Up @@ -988,6 +1084,7 @@ class AlignAlgo
return overlapSize ;
}



// Find the partial suffix of a that match with prefix of b of length at least minLen.
// Return: the suffix of a. otherwise -1.
Expand Down
6 changes: 3 additions & 3 deletions Annotator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ char usage[] = "./annotator [OPTIONS]:\n"
"\t-o STRING: the prefix of the file containing CDR3 information (default: trust)\n"
//"\t--partial: including partial CDR3s in the report (default: false)\n"
"\t--geneAlignment: output the gene alignment (default: not set)\n"
"\t--noImpute: do not impute CDR3 sequence for TCR (default: not set (impute))"
"\t--noImpute: do not impute CDR3 sequence for TCR (default: not set (impute))\n"
"\t--notIMGT: the receptor genome sequence is not in IMGT format (default: not set(in IMGT format))\n";

char nucToNum[26] = { 0, -1, 1, -1, -1, -1, 2,
Expand Down Expand Up @@ -783,8 +783,8 @@ int main( int argc, char *argv[] )
// The gene ids
for ( k = 0 ; k < 4 ; ++k )
{
if ( k == 1 )
continue ;
//if ( k == 1 )
// continue ;
if ( annotations[i].geneOverlap[k].seqIdx == -1 )
fprintf( fpOutput, "*\t" ) ;
else
Expand Down
137 changes: 126 additions & 11 deletions SeqSet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4462,6 +4462,107 @@ class SeqSet
return -1 ;
}

// Annotate the D gene
int AnnotateReadDGene( char *read, struct _overlap geneOverlap[4], struct _overlap cdr[3],
std::vector<struct _overlap> *secondaryGeneOverlaps )
{
int i, j, k ;
std::vector< std::vector<struct _overlap> > overlaps ;
SimpleVector<int> dGeneSeqIdx ;

int seqCnt = seqs.size() ;
int anchorSeqIdx = -1 ;

if ( cdr[2].seqIdx == -1 || cdr[2].similarity == 0 )
return -1 ;

if ( geneOverlap[0].seqIdx != -1 )
anchorSeqIdx = geneOverlap[0].seqIdx ;
else if ( geneOverlap[2].seqIdx != -1 )
anchorSeqIdx = geneOverlap[2].seqIdx ;
else
return -1 ;

// Only consider IGH, TRB, TRD
if ( seqs[ anchorSeqIdx ].name[2] != 'H' && seqs[ anchorSeqIdx ].name[2] != 'B'
&& seqs[ anchorSeqIdx ].name[2] != 'D' )
return -1 ;

int maxDlen = 0 ;
for ( i = 0 ; i < seqCnt ; ++i )
{
if ( seqs[i].isRef && GetGeneType( seqs[i].name ) == 1
&& ( seqs[i].name[0] == seqs[anchorSeqIdx].name[0] )
&& ( seqs[i].name[2] == seqs[anchorSeqIdx].name[2] )
&& ( seqs[i].name[1] == seqs[anchorSeqIdx].name[1] ) )
{
if ( seqs[i].consensusLen > maxDlen )
maxDlen = seqs[i].consensusLen ;
dGeneSeqIdx.PushBack( i ) ;
}
}

seqCnt = dGeneSeqIdx.Size() ;
std::vector<struct _overlap> dOverlaps ;
int cdr3Len = cdr[2].readEnd - cdr[2].readStart + 1 ;
char *align = new char[ cdr3Len ] ;
for ( i = 0 ; i < seqCnt ; ++i )
{
int seqStart ;
int readStart ;
int seqIdx = dGeneSeqIdx[i] ;
int alignScore = AlignAlgo::LocalAlignment( seqs[ seqIdx ].consensus, seqs[ seqIdx ].consensusLen,
read + cdr[2].readStart, cdr3Len, seqStart, readStart, align ) ;

if ( alignScore >= 5 * SCORE_MATCH_LOCAL )
{
readStart += cdr[2].readStart ;
int readEnd = readStart - 1 ;
int seqEnd = seqStart - 1 ;
for ( j = 0 ; align[j] != -1 ; ++j )
{
if ( align[j] != EDIT_INSERT )
++seqEnd ;
if ( align[j] != EDIT_DELETE )
++readEnd ;
}

//printf( "%s\n", seqs[seqIdx].name ) ;
//AlignAlgo::VisualizeAlignment( seqs[ seqIdx ].consensus + seqStart, seqEnd - seqStart + 1,
// read + readStart, readEnd - readStart + 1, align ) ;

int matchCnt, mismatchCnt, indelCnt ;
GetAlignStats( align, false, matchCnt, mismatchCnt, indelCnt ) ;

struct _overlap no ;
no.seqIdx = seqIdx ;
no.seqStart = seqStart ; no.seqEnd = seqEnd ;
no.readStart = readStart ; no.readEnd = readEnd;
no.matchCnt = 2 * matchCnt ;
no.similarity = (double)no.matchCnt / ( seqEnd - seqStart + 1 + readEnd - readStart + 1 ) ;

dOverlaps.push_back( no ) ;
}
}
delete[] align ;

if ( dOverlaps.size() == 0 )
return -1 ;

int bestTag = 0 ;
int overlapCnt = dOverlaps.size() ;
//std::sort( dOverlaps.begin(), dOverlaps.end() ) ;
for ( i = 1 ; i < overlapCnt ; ++i )
{
if ( IsBetterGeneMatch( dOverlaps[i], dOverlaps[bestTag], 1.0) )
{
bestTag = i ;
}
}
geneOverlap[1] = dOverlaps[bestTag] ;
return dOverlaps[bestTag].seqIdx ;
}

// Figure out the gene composition for the read.
// Return successful or not.
int AnnotateRead( char *read, int detailLevel, struct _overlap geneOverlap[4], struct _overlap cdr[3],
Expand Down Expand Up @@ -4599,6 +4700,11 @@ class SeqSet
overlapCnt = overlaps.size() ;
for ( i = 0 ; i < overlapCnt ; ++i )
{
// Remove the hits on the D gene
if ( GetGeneType( seqs[ overlaps[i].seqIdx ].name ) == 1 )
continue ;

// Remove those overlaps that was secondary as well.
if ( !seqUsed[ overlaps[i].seqIdx ] && overlaps[i].similarity >= 0.8 )
{
seqUsed[ overlaps[i].seqIdx ] = true ;
Expand Down Expand Up @@ -5667,16 +5773,20 @@ class SeqSet
}
}
}
if ( locateE + 2 - locateS + 1 < 18 )
{
if ( geneOverlap[0].seqIdx == -1 && geneOverlap[2].seqIdx != -1 )
locateS = -1 ;
else if ( geneOverlap[0].seqIdx != -1 && geneOverlap[2].seqIdx == -1 )
locateE = -1 ;
}
else if ( locateE + 2 - locateS + 1 >= 180 && ( geneOverlap[0].seqIdx == -1 || geneOverlap[1].seqIdx == -1 ) )

if ( locateS != -1 && locateE != -1 )
{
locateS = locateE = -1 ;
if ( locateE + 2 - locateS + 1 < 18 )
{
if ( geneOverlap[0].seqIdx == -1 && geneOverlap[2].seqIdx != -1 )
locateS = -1 ;
else if ( geneOverlap[0].seqIdx != -1 && geneOverlap[2].seqIdx == -1 )
locateE = -1 ;
}
else if ( locateE + 2 - locateS + 1 >= 180 && ( geneOverlap[0].seqIdx == -1 || geneOverlap[2].seqIdx == -1 ) )
{
locateS = locateE = -1 ;
}
}

// If there a gap in the middle of CDR3, pick one side.
Expand Down Expand Up @@ -6471,6 +6581,11 @@ class SeqSet
cdr[2].similarity = cdr3Score / 100.0 ;

}

if ( detailLevel >= 2 && cdr[2].similarity > 0 )
{
AnnotateReadDGene( read, geneOverlap, cdr, secondaryGeneOverlaps ) ;
}
if ( vAlign != NULL )
delete[] vAlign ;

Expand Down Expand Up @@ -6586,8 +6701,8 @@ class SeqSet
// Output the V, J, C information
for ( i = 0 ; i < 4 ; ++i )
{
if ( i == 1 )
continue ;
//if ( i == 1 )
// continue ;
if ( geneOverlap[i].seqIdx != -1 )
{
int seqIdx = geneOverlap[i].seqIdx ;
Expand Down
Loading

0 comments on commit 1f2cfde

Please sign in to comment.