-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathalignreads.c
3190 lines (3032 loc) · 148 KB
/
alignreads.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/timeb.h>
#include <limits.h>
#include "version.h"
#include "alignreads.h"
#include "bwtindex.h"
#include "fileformats.h"
#include "samoutput.h"
#include "dynprog.h"
#include "tools.h"
#include "console.h"
#include "graphics.h"
#include "consensus.h"
#include "splicesites.h"
//#define DEBUG 1
//#define DISABLE_SS_ARRAY 1
#ifdef _MSC_VER
#pragma warning(disable:4996)
#endif
#define STATSINTERVAL 2.0
#define MAX(a,b) (((a)>(b))?(a):(b))
#define MIN(a,b) (((a)<(b))?(a):(b))
/**************/
/* STRUCTURES */
/**************/
typedef struct _SeedPos { // structure that stores the position and the corresponding seed number
unsigned int pos; // position of the seed in the reference genome
int readpos; // position of the seed in the read
int seed; // corresponding seed id
int size; // size of the seed
int ssId; // splice site id used to create this seed, if any
} SeedPos;
typedef struct _ReadHit { // structure that stores an occurrence of the read in the genome
unsigned int pos; // starting position of the read in the reference genome
int score; // score of the alignment
int numerrors; // number of errors of the alignment
int cigarsize; // size of the CIGAR string of the alignment
int *cigarcounts; // counts for the CIGAR string of the alignment
char *cigarcodes; // codes for the CIGAR string of the alignment
char strand; // strand of the read
int matedist; // distance from this mate to the other mate (in paired-ends mode)
} ReadHit;
typedef struct _Seed { // structure that stores information about the seeds of the read in the genome
int startPosInRead;
int size;
unsigned int numHits;
unsigned int bwtTopPointer;
} Seed;
typedef struct _Strand {
char *chars;
int numSeeds;
Seed *seeds;
unsigned int *topPointers;
unsigned int *bottomPointers;
} Strand;
typedef struct _Read {
char *name;
int size;
char type;
Strand fwdStrand;
Strand revStrand;
} Read;
int maxNumSeeds = 0; // maximum number of seeds that the array (Read->Strand->seeds) can currently store
/********************/
/* GLOBAL VARIABLES */
/********************/
FILE *outputFile; // file to write results
FILE *unalignedReadsFile; // file to write unaligned reads
FILE *errorsFile;
int pairedEndReadsMode; // indicates if we are in paired-end mode or not
int rnaSequencingMode; // indicates if we are in RNA/Transcriptome sequencing mode or not
int reportUnalignedReads;
int reportAllHits;
int singleRefReadsOnly;
int numReadsFiles; // number of reads files
int currentReadsFile; // index of the current reads file
FILE **readsFiles; // files with the reads to be aligned
char **readsFilenames; // names of the reads files
char *readsFileTypes; // types of the reads files
int *minDists; // mininum distances for read pairs for each reads files
int *maxDists; // maximum distances for read pairs for each reads files
int minIntronLength; // minimum distance between exons
int maxIntronLength; // maximum distance between exons
unsigned int numSplicingEvents;
unsigned int numSplicedReads;
unsigned int numSeedsAddedBySpliceSites; // number of created seeds on the the opposite side of an intron, because a splice site was detected
unsigned int sizeBWT; // reference genome index size
unsigned int sizeRefGenome; // reference genome size
char *refGenome; // reference genome string
char **refsNames; // reference genome(s) name(s)
int numRefs; // number of references
unsigned int *refsEndPositions; // ending positions of each ref in the global ref string
char *genomefilename; // name of the file containing the un-indexed reference genome
unsigned int numReads; // number of reads
unsigned int numAlignedReads; // number of aligned reads
unsigned int numUnalignedReads; // number of unaligned reads
unsigned int numAlignedHardReads; // number of hard reads aligned
unsigned int numHighRepetitionReads; // number of reads with too many hits
unsigned int maxReadHits; // maximum number of hits observed for a read
unsigned int numAlignedBothMates; // number of pairs where both reads were aligned
unsigned int numAlignedSingleMates; // number of pairs where only one read was aligned
unsigned int minObservedPairDistance; // minimum distance observed between two mate reads
unsigned int maxObservedPairDistance; // maximum distance observed between two mate reads
unsigned int avgObservedPairDistances; // aproximate average of all observed distances between the two mate reads
unsigned int weightObservedPairDistances; // weight of the aproximate average
int totalNumSeeds; // total number of seeds in all the reads
int totalNumSeedOccs; // total number of occurrences of all the found seeds
int totalNumSeedChains; // total number of best seed chains tested
unsigned long long int totalNumReadHits; // count of all hits for all reads
unsigned long long int totalNumDPErrors; // total number of DP errors in all the aligned reads
unsigned long long int totalNumReadsChars; // total number of chars in all the processed reads
unsigned long long int totalNumMappedReadsChars; // total number of chars in all the mapped reads
int maxReadSize; // size of the longest read in the reads file (always lower or equal to maxReadArraySize)
ReadHit *readsHits[2]; // stores all hits in the genome for both reads in the paired-end read set
int maxReadHitsArraySize; // maximum number of hits that the read hits array can store
int numReadsHits[2]; // number of hits for each one of the paired-end reads
int bestReadsHitIds[2]; // ids of the top scoring hits for each one of the paired-end reads
int bestReadsScores2nd[2]; // second best alignment score for each one of the paired-end reads
int minReadPairDistance; // minimum allowed distance between the read pair
int maxReadPairDistance; // maximum allowed distance between the read pair
int reqReadPairStrands; // required read pair strands (1=same or 2=opposite)
int minIdentityPct; // minimum allowed identity percentage for each read
int maxNumErrors; // maximum number of errors allowed in the banded dynamic programming among all reads
int maxSeedHits; // maximum allowed number of occurences of the seed in the reference genome
char *fwdRead; // read chars for the forward strand
char *revRead; // read chars for the reverse strand
struct timeb starttb, endtb; // variables for storing time values
long long int prevFilesData, totalFilesData, currentFileData; // variables for storing relative position inside reads file
char (*AnalyzeReads)(int,int); // function pointer for specific file format reader function
int (*LoadNextRead)(); // function pointer for specific file format reader function
/********************************/
/* FUNCTIONS FOR ALIGNMENT MODE */
/********************************/
// returns elapsed time since the given time value
double GetElapsedTime(struct timeb starttb){
ftime( &endtb );
return ((endtb.time) + (endtb.millitm)/1000.0) - ((starttb.time) + (starttb.millitm)/1000.0);
}
// Compare function for quicksort (sort seed positions)
int CompareSeedPositions(const void *a, const void *b){
/*
const SeedPos *aa = *(const SeedPos *const *) a;
const SeedPos *bb = *(const SeedPos *const *) b;
*/
SeedPos *aa = *(SeedPos **)a;
SeedPos *bb = *(SeedPos **)b;
//return ( (aa->pos) - (bb->pos) );
if( (aa->pos) > (bb->pos) ) return +1;
if( (aa->pos) < (bb->pos) ) return -1;
return ( (aa->seed) - (bb->seed) ); // if( (aa->pos) == (bb->pos) ) ; when the position is the same, sort by seed id
}
// Compare function for quicksort (sort read hit positions)
int CompareReadHitPositions(const void *a, const void *b){
return ( ((ReadHit *)a)->pos - ((ReadHit *)b)->pos );
}
// expand the read chars arrays for both forward and reverse strands to store a read with the new larger size
void IncreaseMaxReadArraySize(int newsize){
int i;
if(newsize<maxReadArraySize) return;
while(maxReadArraySize<newsize) maxReadArraySize += maxReadArraySize; // double the array's size each time
for( i = 0 ; i < numPairs ; i++ ){
fwdReads[i]=(char *)realloc(fwdReads[i],(maxReadArraySize+1)*sizeof(char)); // realloc arrays
revReads[i]=(char *)realloc(revReads[i],(maxReadArraySize+1)*sizeof(char));
}
fwdRead = fwdReads[currentPair]; // set pointers to current reads file
revRead = revReads[currentPair];
read=fwdRead; // we always load the read chars from the reads file into the forward strand
}
// expand the read hits arrays to store a the new number of hits
void IncreaseMaxReadHitsArraySize(int newsize, int cigararraysize){
int i,j,n;
n = maxReadHitsArraySize; // old size
maxReadHitsArraySize = newsize;
for(i=0;i<numPairs;i++){
readsHits[i]=(ReadHit *)realloc(readsHits[i],maxReadHitsArraySize*sizeof(ReadHit));
for(j=n;j<maxReadHitsArraySize;j++){ // only alloc CIGAR arrays on newly created read hit entries
readsHits[i][j].cigarcodes = (char *)malloc(cigararraysize*sizeof(char));
readsHits[i][j].cigarcounts = (int *)malloc(cigararraysize*sizeof(int));
}
}
}
// loads the reverse complementary of the read
void LoadReverseStrandRead(char *read, char *revread, int readsize){
char c;
int i, j;
j = 0;
for( i = (readsize-1) ; i >= 0 ; i-- ){
c = read[i];
switch( c ){
case 'A':
revread[j] = 'T';
break;
case 'C':
revread[j] = 'G';
break;
case 'G':
revread[j] = 'C';
break;
case 'T':
revread[j] = 'A';
break;
default:
break;
}
j++;
}
revread[readsize] = '\0';
}
// updates the Cigar string with the given operation
void UpdateCigarString(char opcode, int opsize){
#ifdef DEBUGDP
int i;
#endif
if( cigarStrCodes[cigarStrSize] == opcode ){ // if the previous operation was already the same
cigarStrCounts[cigarStrSize] += opsize;
} else { // if not, create new operation
cigarStrSize++; // holds the last position of the array, size is +1
cigarStrCodes[cigarStrSize] = opcode;
cigarStrCounts[cigarStrSize] = opsize;
}
#ifdef DEBUGDP
i = 0;
if( cigarStrCounts[0] == 0 ) i = 1;
for( ; i <= cigarStrSize ; i++ ) fprintf(debugfile,"%d%c",cigarStrCounts[i],cigarStrCodes[i]);
fputc('\n',debugfile);
#endif
}
// NOTE: currently unused because the CIGAR string is now updated directly inside the main DP function while backtracking
// updates the Cigar string with the latest DP alignment
void UpdateCigarStringWithDP(){
int i;
for( i = 0 ; i < dpTraceSize ; i++ ){ // scan all dp trace directions (the array is already in the correct order)
if( cigarStrCodes[cigarStrSize] == dpTrace[i] ){ // if it's the same operation as the previous one
cigarStrCounts[cigarStrSize]++;
} else { // if not, create new operation
cigarStrSize++; // holds the last position of the array, size is +1
cigarStrCodes[cigarStrSize] = dpTrace[i];
cigarStrCounts[cigarStrSize] = 1;
}
}
#ifdef DEBUGDP
i = 0;
if( cigarStrCounts[0] == 0 ) i = 1;
for( ; i <= cigarStrSize ; i++ ) fprintf(debugfile,"%d%c",cigarStrCounts[i],cigarStrCodes[i]);
fputc('\n',debugfile);
#endif
}
// TODO: move to "samoutput.c" file
// NOTE: single reference is labeled as "R" and multiple references as "R#"
void PrintSAMHeader(){
int i,n;
if(numRefs==1){ // only one ref
fprintf(outputFile,"@SQ\tSN:%s\tLN:%u\tSP:%s\tUR:%s\n","R",sizeRefGenome,refsNames[0],genomefilename);
} else { // multiple refs
for(i=0;i<numRefs;i++){
if(i==0) n=(refsEndPositions[i]+1);
else n=(refsEndPositions[i]-refsEndPositions[(i-1)]); // size of the ref
fprintf(outputFile,"@SQ\tSN:%s%d\tLN:%u\tSP:%s\tUR:%s\n","R",(i+1),n,refsNames[i],genomefilename);
}
}
fprintf(outputFile,"@PG\tID:0\tPN:TAPyR\tVN:%s\n",VERSION);
}
// Returns the reference where this read hit is mapped on
int GetReadRef(unsigned int rpos){
int start, end, middle;
//while( (i<numRefs) && ( readPos > refsEndPositions[i] ) ) i++;
start = -1;
end = ( numRefs - 1 ); // the correct ref number will be stored in here
while( (end-start) != 1 ){ // binary search for correct ref
middle = ( ( start + end ) / 2 );
if( rpos > refsEndPositions[middle] ) start = middle;
else end = middle;
}
return end; // current ref
}
// TODO: update the number of errors of the read if insertions are added
// When there are multiple reference sequences, finds the correct read reference and updates the read position and its CIGAR string accordingly
int FixMultiRefRead(unsigned int *readPos, int readSize){
unsigned int n, rpos, gpos;
int i, ref;
char c;
rpos = (*readPos);
ref = GetReadRef(rpos);
n = refsEndPositions[ref]; // last valid position in the current ref
if( (rpos+readSize-1) <= n ) return ref; // if the read does not extend to the next ref, it is ok
rpos = 0; // position inside the read
gpos = (*readPos); // position inside the genome
i = 0; // position in the CIGAR string
c = 'M'; // set variable just so compiler doesn't complain
while( ( (gpos-1) <= n ) && ( i <= cigarStrSize ) ){ // gpos points to the position next to the last filled one
c = cigarStrCodes[i];
if( c=='M' || c=='=' || c=='X' ){ // advances positions in both read and ref
rpos += cigarStrCounts[i];
gpos += cigarStrCounts[i];
} else if( c=='I' || c=='S' ) rpos += cigarStrCounts[i]; // advance pos only in read
else if( c=='D' || c=='N' ) gpos += cigarStrCounts[i]; // advance pos only in ref
i++;
}
if( (gpos-1) <= n ) return ref; // if the read extends to the next ref we need to fix the CIGAR string, otherwise it is ok
gpos--; // now both variables point to the last filled position
rpos--;
n = ( gpos - n ); // how many chars did we advance into the next ref
i--; // last CIGAR operation (that crossed to the next ref)
cigarStrCounts[i] -= n; // break operation before it enters the next ref
if( c=='M' || c=='=' || c=='X' ) rpos -= n; // if the operation also changed the read pos, change it back
if( rpos >= (unsigned int)(readSize/2) ){ // if the largest part of the read is in the current ref
n = ( readSize - rpos - 1 ); // number of chars inside the next ref until the end of the read
i++;
cigarStrCounts[i] = n; // consider the rest of the read chars as insertions
cigarStrCodes[i] = 'I';
cigarStrSize = i;
} else { // if the largest part of the read is in the next ref
cigarStrCounts[i] = n; // set the last operation (M or D) to account only for the chars inside the new ref
rpos++; // first read char pos inside the new ref
if( i>0 ) i--; // previous operation
else { // if this is the first operation, we need to create a new one before, so, shift all operations forward
i = (cigarStrSize + 1);
while( i>0 ){
cigarStrCounts[i] = cigarStrCounts[(i-1)];
cigarStrCodes[i] = cigarStrCodes[(i-1)];
i--;
}
cigarStrSize++; // one more operation
}
cigarStrCounts[i] = rpos; // consider all the read chars behind as insertions
cigarStrCodes[i] = 'I';
while( i>0 ){ // reset all the other operations behind
i--;
cigarStrCounts[i] = 0;
cigarStrCodes[i] = 'M';
}
(*readPos) = ( refsEndPositions[ref] + 1 ); // set read to start in the 1st pos of the next ref
ref++; // next ref
}
return ref; // return the ref number where the read starts
}
// TODO: in case of multi refs, update pos and ref directly in readHits structure
// TODO: update the matedist variable with the size of the other mate (and use matedist field in ReadHit structure)
// TODO: better calculation of mate pair: max(p1+r1,p2+r2) - min(p1,p2)
// TODO: if the mate maps to another reference, output the correct RNEXT field and fix the position in the PNEXT field
// TODO: move to "samoutput.c" file
// NOTE: when readPos is -1 , it means that the read was not mapped
// NOTE: cigarStrSize points to the last filled entry of the CIGAR arrays (so, real size is +1)
// FORMAT: QNAME FLAG RNAME POS MAPQ CIGAR MRNM MPOS ISIZE SEQ QUAL
void PrintReadInSAMFormat(char *read, char *readName, char strand, unsigned int readPos, int numErrors, int bestScore, int bestScore2nd, int readSize){
int i, flag, mapqual;
int matedist, absmatedist;
unsigned int pos, matepos;
#ifdef DEBUG
unsigned int pg, originalreadpos;
int pr, j, n;
char c;
static int debugstringssize = 0;
static char *cigarstring = NULL;
static char *refstring = NULL;
static char *readstring = NULL;
originalreadpos = readPos;
#endif
matepos = 0; // initialize mate variables here to avoid compiler warnings
matedist = 0;
flag = 0;
if( pairedEndReadsMode ){ // paired-end reads mode
flag += 1; // flag = 1 means it is a paired-end read
if( currentPair == 0 ) flag += 64; // flag = 64 means first read in a pair
else flag += 128; // flag = 128 means second read in a pair
i = ((currentPair + 1) & 1); // id of the mate read
if( numReadsHits[i] > 0 ){ // if the mate read was mapped
matepos = readsHits[i][bestReadsHitIds[i]].pos; // MPOS = mate position
if( numReadsHits[currentPair] > 0 ){ // if this read is mapped too
flag += 2; // flag = 2 means both reads are mapped
pos = readsHits[currentPair][bestReadsHitIds[currentPair]].pos; // pos of this mate
absmatedist = (int) ((matepos>pos)?(matepos-pos):(pos-matepos)); // abs( matepos - pos );
if( (maxReadPairDistance>0) && ((absmatedist<minReadPairDistance) || (absmatedist>maxReadPairDistance)) ) // if a pair distance is defined but this distance is not valid
absmatedist = (int)(sizeRefGenome - absmatedist); // set the extra case when the reads "wrap around" from the end to the beginning of the (circular) genome
if( matepos > pos ) matedist = absmatedist; // fix mate distance signal according to SAM format
else matedist = (-absmatedist);
/*
matedist = ( matepos - readsHits[currentPair][bestReadsHitIds[currentPair]].pos ); // TLEN = template length (other mate's pos minus this mate's pos ; distance covering the full length of both mates)
if(matedist>0) matedist += readSize; // the other mate is to the right, add the other mate's size
else matedist -= readSize; // the other mate is to the left, add this mate's size
*/
if(currentPair==0){ // update count and stats, but only one time per pair
if( (unsigned int)absmatedist < minObservedPairDistance ) minObservedPairDistance = absmatedist;
if( (unsigned int)absmatedist > maxObservedPairDistance ) maxObservedPairDistance = absmatedist;
if( ((unsigned int)absmatedist > (avgObservedPairDistances/2) ) && ((unsigned int)absmatedist < (3*avgObservedPairDistances/2) ) ){ // distance within the current valid interval range (+/-50% the average)
avgObservedPairDistances = ( (weightObservedPairDistances*avgObservedPairDistances) + absmatedist ) / (weightObservedPairDistances+1); // update current average
weightObservedPairDistances++; // increase weight
} else { // distance outside the current valid range
weightObservedPairDistances--; // decrease weight
if(weightObservedPairDistances==0){ // if the current average lost all its weight, set a new average
avgObservedPairDistances = absmatedist;
weightObservedPairDistances = 1;
}
}
numAlignedBothMates++;
}
} else {
numAlignedSingleMates++; // the mate was mapped, but this one was not
}
if( readsHits[i][bestReadsHitIds[i]].strand == '-' ) flag += 32; // flag = 32 means the other read mate is mapped in the reverse strand
} else flag += 8; // flag = 8 means the other read mate is unmapped
}
if( readPos == (unsigned int) -1 ){ // if no positions were found for the read
flag += 4; // flag = 4 means unmapped read
fprintf(outputFile,"%s\t%d\t*\t0\t0\t*\t*\t0\t0\t*\t*\n",readName,flag);
return;
}
if( strand == '-' ) flag += 16; // flag = 16 means reverse strand
fprintf(outputFile,"%s\t%d\t",readName,flag);
pos = readPos; // position in first or only ref
if( numRefs == 1 ){
//fprintf(outputFile,"R\t"); // single ref labeled only as "R"
fprintf(outputFile,"%s\t",refsNames[0]);
} else { // update read pos in case of multiple refs
i = FixMultiRefRead((unsigned int *)(&readPos),readSize); // get the reference number
//fprintf(outputFile,"R%d\t",(i+1));
fprintf(outputFile,"%s\t",refsNames[i]);
if( i != 0 ) pos = ( readPos - refsEndPositions[(i-1)] - 1 ); // change position to match the new reference
}
fprintf(outputFile,"%u\t",(pos+1)); // pos+1 because in SAM, genome starts at pos 1, not 0
if( bestScore == 0 ) mapqual = 255; // unavailable mapping quality
else {
mapqual = 250 * ( bestScore - bestScore2nd ) / bestScore; // = 250 * c1 * c2 * ( S1 - S2 ) / S1
if(mapqual<0) mapqual=0; // prevent negative values
}
fprintf(outputFile,"%d\t",mapqual); // mapping quality
i = 0;
while( cigarStrCounts[i] == 0 ) i++; // skip first 0 size match if it exists
for( ; i <= cigarStrSize ; i++ ) fprintf(outputFile,"%d%c",cigarStrCounts[i],cigarStrCodes[i]); // CIGAR string
if( !pairedEndReadsMode ) fprintf(outputFile,"\t*\t0\t0\t"); // normal read
else fprintf(outputFile,"\t=\t%d\t%d\t",matepos,matedist); // RNEXT, PNEXT, TLEN
fprintf(outputFile,"%s\t*",read); // read chars and base qualities
#ifdef DEBUG
fprintf(outputFile,"\tNM:i:%d",numErrors);
#else
numErrors = numErrors; // just so compiler does not complain about unused variable
#endif
fprintf(outputFile,"\n");
#ifdef DEBUG
n = 0;
for( i = 0 ; i <= cigarStrSize ; i++ ) n += cigarStrCounts[i]; // get size of CIGAR strings
if( cigarstring==NULL || refstring==NULL || readstring==NULL ){ // alloc space for the strings the first time
debugstringssize = n;
cigarstring = (char *)malloc((debugstringssize+1)*sizeof(char));
refstring = (char *)malloc((debugstringssize+1)*sizeof(char));
readstring = (char *)malloc((debugstringssize+1)*sizeof(char));
} else if( n > debugstringssize ){ // realloc the space if needed
debugstringssize = n;
cigarstring = (char *)realloc(cigarstring,(debugstringssize+1)*sizeof(char));
refstring = (char *)realloc(refstring,(debugstringssize+1)*sizeof(char));
readstring = (char *)realloc(readstring,(debugstringssize+1)*sizeof(char));
}
pg = readPos; // position in genome
pr = 0; // position in read
j = 0; // position in debug strings
for( i = 0 ; i <= cigarStrSize ; i++ ){
n = cigarStrCounts[i];
if( n == 0 ) continue;
c = cigarStrCodes[i];
if( c == 'M' || c == '=' || c== 'X' ) while( n > 0 ){ // match or mismatch
cigarstring[j] = c;
refstring[j] = refGenome[pg++];
readstring[j] = read[pr++];
j++;
n--;
}
else if( c == 'N' ) while( n > 0 ){ // intron
cigarstring[j] = c;
refstring[j] = refGenome[pg++];
readstring[j] = ' '; // empty space
j++;
n--;
}
else if( c == 'I' ) while( n > 0 ){ // insertion into reference
cigarstring[j] = c;
refstring[j] = '-';
readstring[j] = read[pr++];
j++;
n--;
}
else if( c == 'D' ) while( n > 0 ){ // deletion from reference
cigarstring[j] = c;
refstring[j] = refGenome[pg++];
readstring[j] = '-';
j++;
n--;
}
else if( c == 'S' ) while( n > 0 ){ // soft clipping of read
cigarstring[j] = c;
refstring[j] = refGenome[pg++];
readstring[j] = (char)(read[pr++]+32); // convert to lowercase
j++;
n--;
}
}
cigarstring[j] = '\0';
refstring[j] = '\0';
readstring[j] = '\0';
fprintf(debugfile,"%s\n%s\n%s\n",cigarstring,refstring,readstring);
if((numRefs==1) || (originalreadpos==readPos)){ // if there are multiple references and the read falls between two of them, the number of errors is not updated correctly yet
n = 0; // number of errors retrieved from the CIGAR string alignment
j--;
for( ; j >= 0 ; j-- ){
c = cigarstring[j];
if( c == 'S' || c == 'N' ) continue; // these two operations do not count as errors
if( refstring[j] != readstring[j] ) n++;
}
for( i = 0 ; i <= cigarStrSize ; i++ ){
j = cigarStrCounts[i];
if( (j <= 0) || ((j > readSize) && cigarStrCodes[i]!='N') ){ // the intron operation can have a length longer than the read size
if( i==0 && j==0 ) continue; // there can be a valid 0 sized match at the first position
break;
}
}
if( (n!=numErrors) || (pr!=readSize) || (i!=(cigarStrSize+1)) ){
if(errorsFile==NULL) errorsFile=fopen("errors.txt","w");
fprintf(errorsFile,">%s",readName);
if(n!=numErrors) fprintf(errorsFile," (%d/%d)",n,numErrors); // check if the found number of errors matches the one calculated before
if(pr!=readSize) fprintf(errorsFile," [%d/%d]",pr,readSize); // check if the output read size matches the original read size
if(i!=(cigarStrSize+1)) fprintf(errorsFile," (%d@%d)",j,i); // check if there is any incorrect cigar count value
fprintf(errorsFile,"\n%s\n",read);
}
}
#endif
}
// TODO: does it need a faster algorithm to find (all the combinations of the) pairs at valid distances ?
// NOTE: if at least one of the min/max distances is set, any single read or any read pair but with an invalid distance will not be reported
// outputs both reads of the paired-end read pair to the SAM file
void PrintReadPair(){
int i, j, k, n, m;
int absDist, bestScoreSum;
char c;
ReadHit *hit;
if( maxReadPairDistance>0 ){ // if the limits are set, check which pairs are at a valid distance
if( numReadsHits[0]==0 || numReadsHits[1]==0 ) return; // if at least one of the reads does not have hits, do not report any of them
/*
if(numReadsHits[0]>1) qsort(readsHits[0],numReadsHits[0],sizeof(ReadHit),CompareReadHitPositions); // if it has multiple hits, sort them by position
if(numReadsHits[1]>1) qsort(readsHits[1],numReadsHits[1],sizeof(ReadHit),CompareReadHitPositions);
*/
m = numReadsHits[0]; // save number of hits
n = numReadsHits[1];
numReadsHits[0] = 0; // invalidate all previous hits
numReadsHits[1] = 0;
bestReadsHitIds[0] = -1;
bestReadsHitIds[1] = -1;
bestScoreSum = -1; // best combined score of both reads hits
for( i = 0 ; i < m ; i++ ){
for( j = 0 ; j < n ; j++ ){
//absDist = abs( readsHits[1][j].pos - readsHits[0][i].pos );
absDist = (int)( (readsHits[1][j].pos > readsHits[0][i].pos)?(readsHits[1][j].pos - readsHits[0][i].pos):(readsHits[0][i].pos - readsHits[1][j].pos) );
if( (absDist < minReadPairDistance) || (absDist > maxReadPairDistance) ){ // less than the min dist or more than the max dist is not acceptable
absDist = (sizeRefGenome - absDist); // check the extra case when the reads "wrap around" from the end to the beginning of the (circular) genome
if( (absDist < minReadPairDistance) || (absDist > maxReadPairDistance) ) continue;
}
if( reqReadPairStrands>0 ){ // if we have requirements about the mates strands, check them too
if(reqReadPairStrands==1 && (readsHits[1][j].strand)!=(readsHits[0][i].strand) ) continue; // same strand
if(reqReadPairStrands==2 && (readsHits[1][j].strand)==(readsHits[0][i].strand) ) continue; // opposite strands
}
readsHits[0][i].matedist = absDist;
readsHits[1][j].matedist = absDist;
numReadsHits[0]++;
numReadsHits[1]++;
k = ( readsHits[0][i].score + readsHits[1][j].score ); // score of this hit pair
if( k > bestScoreSum ){ // only set the best hit if it has a better score
bestScoreSum = k;
bestReadsHitIds[0] = i;
bestReadsHitIds[1] = j;
}
if( reportAllHits ){ // if we want all hits at valid distances, report them now
hit = &(readsHits[0][i]); // hit for 1st mate
cigarStrCodes = hit->cigarcodes;
cigarStrCounts = hit->cigarcounts;
cigarStrSize = hit->cigarsize;
currentPair = 0;
c = hit->strand;
PrintReadInSAMFormat(((c=='+')?(fwdReads[0]):(revReads[0])),readNames[0],c,hit->pos,hit->numerrors,hit->score,bestReadsScores2nd[0],readsSizes[0]);
hit = &(readsHits[1][j]); // hit for 2nd mate
cigarStrCodes = hit->cigarcodes;
cigarStrCounts = hit->cigarcounts;
cigarStrSize = hit->cigarsize;
currentPair = 1;
c = hit->strand;
PrintReadInSAMFormat(((c=='+')?(fwdReads[1]):(revReads[1])),readNames[1],c,hit->pos,hit->numerrors,hit->score,bestReadsScores2nd[1],readsSizes[1]);
}
} // loop for 2nd mate
} // loop for 1st mate
if( numReadsHits[0] == 0 ) return; // if no pairs were found at a valid distance, do not report any hits
if( reportAllHits ) return; // if we wanted all hits, they were all reported above already
} else if( reportAllHits ){ // if no limits have been set but we want all the hits
for( i = 0 ; i < numPairs ; i++ ){ // report all hits for both reads
currentPair = i;
n = numReadsHits[i];
for( j = 0 ; j < n ; j++ ){ // all hits of this read
hit = &(readsHits[i][j]);
cigarStrCodes = hit->cigarcodes;
cigarStrCounts = hit->cigarcounts;
cigarStrSize = hit->cigarsize;
c = hit->strand;
PrintReadInSAMFormat(((c=='+')?(fwdReads[i]):(revReads[i])),readNames[i],c,hit->pos,hit->numerrors,hit->score,bestReadsScores2nd[i],readsSizes[i]);
}
}
currentPair = 1;
return;
}
for( i = 0 ; i < numPairs ; i++ ){ // report the best hit only
if( numReadsHits[i] == 0 ) continue; // cannot report the best hit if none exists
hit = &(readsHits[i][bestReadsHitIds[i]]);
cigarStrCodes = hit->cigarcodes;
cigarStrCounts = hit->cigarcounts;
cigarStrSize = hit->cigarsize;
currentPair = i; // set this because the PrintReadInSAMFormat function will use it
c = hit->strand;
PrintReadInSAMFormat(((c=='+')?(fwdReads[i]):(revReads[i])),readNames[i],c,hit->pos,hit->numerrors,hit->score,bestReadsScores2nd[i],readsSizes[i]);
}
currentPair = 1; // set to 1, so the next read will be loaded from the correct reads file
}
#if defined DEBUG || defined DEBUGDP
void PrintSeeds(Read *read, char strand, int detailedmode){
char *readChars;
int numSeeds, seedId, seedPos, i;
Seed *seedsArray, *seed;
if(strand=='+'){
strand='+';
readChars=(read->fwdStrand.chars);
numSeeds=(read->fwdStrand.numSeeds);
seedsArray=(read->fwdStrand.seeds);
} else {
strand='-';
readChars=(read->revStrand.chars);
numSeeds=(read->revStrand.numSeeds);
seedsArray=(read->revStrand.seeds);
}
if(numSeeds==0) return;
if(detailedmode){ // details mode
fprintf(debugfile,"%c\n%s\n",strand,readChars);
for(i=0;i<numSeeds;i++){
seed=&(seedsArray[i]);
fprintf(debugfile,"%2d [%-3d-%3d] %3d %u \t",(i+1),(seed->startPosInRead),((seed->startPosInRead)+(seed->size)-1),(seed->size),(seed->numHits));
if(i!=(numSeeds-1)) seedPos=((seedsArray[i+1].startPosInRead)+(seedsArray[i+1].size));
else seedPos=0;
while(seedPos<(seed->startPosInRead)) fprintf(debugfile,"%c",(char)(readChars[seedPos++]+32));
fprintf(debugfile,"%.*s\n",(seed->size),(char *)(readChars+(seed->startPosInRead)));
}
if((seedsArray[(numSeeds-1)].startPosInRead)>1) fprintf(debugfile,"...\n");
} else { // overview mode
seedPos=0;
for(seedId=(numSeeds-1);seedId>=0;seedId--){
seed=&(seedsArray[seedId]);
while(seedPos<(seed->startPosInRead)){
fputc(' ',debugfile);
seedPos++;
}
for(i=0;i<(seed->size);i++){
if(i==0) fputc('<',debugfile);
else if(i==((seed->size)-1)) fputc('|',debugfile);
else fputc('=',debugfile);
seedPos++;
}
}
fputc('\n',debugfile);
}
}
#endif
void GetSeeds(Read *read){
unsigned int topPtr, bottomPtr;
unsigned int *fwdTopPointers, *fwdBottomPointers;
unsigned int *revTopPointers, *revBottomPointers;
int seedSize, numFwdSeeds, numRevSeeds;
int readSize, fwdPos, revPos;
unsigned int numOccs;
char *readChars, c;
Seed *seed;
readSize=(read->size);
fwdTopPointers=(read->fwdStrand.topPointers);
fwdBottomPointers=(read->fwdStrand.bottomPointers);
revTopPointers=(read->revStrand.topPointers);
revBottomPointers=(read->revStrand.bottomPointers);
numFwdSeeds=0;
numRevSeeds=0;
fwdPos=(readSize-1); // current position in fwd read
revPos=(readSize-1); // current position in rev read
while(1){ // get one seed at a time for both fwd and rev strands
readChars=(read->fwdStrand.chars); // process fwd chars
topPtr=0; // initialize pointers
bottomPtr=sizeBWT;
seedSize=0;
for(;fwdPos>=0;fwdPos--){ // get longest seed starting at current position of the read (from right to left)
c=readChars[fwdPos]; // process all read chars until no occurrences of the substring exist
numOccs=FMI_FollowLetter(c,&topPtr,&bottomPtr);
if(numOccs==0){ // if there are no matches by this letter, stop seed here
fwdTopPointers[fwdPos]=0; // set pointers of failed position to zero
fwdBottomPointers[fwdPos]=0;
break;
}
fwdTopPointers[fwdPos]=topPtr; // save index pointers for this position
fwdBottomPointers[fwdPos]=bottomPtr;
seedSize++;
}
if(seedSize!=0){ // save information about the newly created seed
if(numFwdSeeds==maxNumSeeds){ // realloc seed arrays if needed
maxNumSeeds=(numFwdSeeds+1);
(read->fwdStrand.seeds)=(Seed *)realloc((read->fwdStrand.seeds),maxNumSeeds*sizeof(Seed));
(read->revStrand.seeds)=(Seed *)realloc((read->revStrand.seeds),maxNumSeeds*sizeof(Seed));
}
seed=&(read->fwdStrand.seeds[numFwdSeeds]);
(seed->size)=seedSize; // save seed size
fwdPos++; // pos of the last valid char
(seed->startPosInRead)=fwdPos; // save pos where this seed ends
(seed->bwtTopPointer)=fwdTopPointers[fwdPos];
(seed->numHits)=(fwdBottomPointers[fwdPos]-fwdTopPointers[fwdPos]+1);
numFwdSeeds++; // one more seed created
fwdPos-=2; // skip the erroneous position and start next seed on next position (to the left)
}
readChars=(read->revStrand.chars); // process rev chars
topPtr=0; // initialize pointers
bottomPtr=sizeBWT;
seedSize=0;
for(;revPos>=0;revPos--){ // get longest seed starting at current position of the read (from right to left)
c=readChars[revPos]; // process all read chars until no occurrences of the substring exist
numOccs=FMI_FollowLetter(c,&topPtr,&bottomPtr);
if(numOccs==0){ // if there are no matches by this letter, stop seed here
revTopPointers[revPos]=0; // set pointers of failed position to zero
revBottomPointers[revPos]=0;
break;
}
revTopPointers[revPos]=topPtr; // save index pointers for this position
revBottomPointers[revPos]=bottomPtr;
seedSize++;
}
if(seedSize!=0){ // save information about the newly created seed
if(numRevSeeds==maxNumSeeds){ // realloc seed arrays if needed
maxNumSeeds=(numRevSeeds+1);
(read->fwdStrand.seeds)=(Seed *)realloc((read->fwdStrand.seeds),maxNumSeeds*sizeof(Seed));
(read->revStrand.seeds)=(Seed *)realloc((read->revStrand.seeds),maxNumSeeds*sizeof(Seed));
}
seed=&(read->revStrand.seeds[numRevSeeds]);
(seed->size)=seedSize; // save seed size
revPos++; // pos of the last valid char
(seed->startPosInRead)=revPos; // save pos where this seed ends
(seed->bwtTopPointer)=revTopPointers[revPos];
(seed->numHits)=(revBottomPointers[revPos]-revTopPointers[revPos]+1);
numRevSeeds++; // one more seed created
revPos-=2; // skip the erroneous position and start next seed on next position (to the left)
}
if(fwdPos<0 && revPos<0){ // stop if we have reached the end of the read in both strands
(read->fwdStrand.numSeeds)=numFwdSeeds;
(read->revStrand.numSeeds)=numRevSeeds;
break;
}
if(numFwdSeeds!=numRevSeeds){ // stop if there will be a difference of more than 1 seed between both strands
if(numFwdSeeds<numRevSeeds){
(read->fwdStrand.numSeeds)=numFwdSeeds;
(read->revStrand.numSeeds)=0; // discard strand seeds if they didn't get to the beggining of the read
} else {
(read->fwdStrand.numSeeds)=0;
(read->revStrand.numSeeds)=numRevSeeds;
}
break;
}
}
#ifdef DEBUG
PrintSeeds(read,'+',1);
PrintSeeds(read,'-',1);
#endif
}
void GetImprovedSeeds(Read *read){
unsigned int topPtr, bottomPtr;
unsigned int *topPointers, *bottomPointers;
int seedExtraSize, numSeeds, seedId;
int seedPos, seedOldLeftPos, seedNewLeftPos, seedOldRightPos, seedNewRightPos, rightmostPos;
unsigned int numOccs;
char *readChars, strandcount, c;
#ifdef DEBUG
char strandchar;
#endif
Seed *seed;
Strand *strand;
for(strandcount=0;strandcount<2;strandcount++){
if(strandcount==0){ // forward strand
strand=&(read->fwdStrand);
#ifdef DEBUG
strandchar='+';
#endif
} else { // reverse strand
strand=&(read->revStrand);
#ifdef DEBUG
strandchar='-';
#endif
}
numSeeds=(strand->numSeeds);
if(numSeeds==0) continue;
#ifdef DEBUG
// print old seeds
fprintf(debugfile,"%c\n%s\n",strandchar,(strand->chars));
PrintSeeds(read,strandchar,0);
#endif
readChars=(strand->chars);
topPointers=(strand->topPointers);
bottomPointers=(strand->bottomPointers);
seedExtraSize=0;
for(seedId=(numSeeds-1);seedId>=0;seedId--){ // process all seeds from the leftmost one to the (before the) rightmost one
seed=&(strand->seeds[seedId]);
seedOldLeftPos=(seed->startPosInRead); // position where the original seed ends (on the left)
seedNewLeftPos=seedOldLeftPos; // new left pos of the seed after (possibly) being changed by the prev seed (at the left)
if(seedExtraSize!=0){ // if the prev seed (to the left) was extended, it will change this seed too
seedExtraSize--; // the mismatched char (that was jumped over) between the 2 seeds does not count
seedNewLeftPos+=seedExtraSize; // update leftmost seed pos
(seed->startPosInRead)=seedNewLeftPos; // the starting (left) pos of the seed was advanced to the right
(seed->size)-=seedExtraSize; // and its size was shortened
(seed->bwtTopPointer)=topPointers[seedNewLeftPos]; // set the correct pointers in case we never get to set them below
(seed->numHits)=(bottomPointers[seedNewLeftPos]-topPointers[seedNewLeftPos]+1);
}
if(seedId==0) break; // the 1st (rightmost) seed cannot be further extended to the right
seedNewRightPos=(seedNewLeftPos+(seed->size)); // start one position to the right of the current rightmost end of the seed
seedOldRightPos=(seedNewRightPos-1); // last valid position on the left of the original seed
rightmostPos=((strand->seeds[seedId-1].startPosInRead)+(strand->seeds[seedId-1].size)-1); // rightmost pos of the following (to the right) seed
seedExtraSize=0; // extended seed size (only the extra length)
for(;seedNewRightPos<rightmostPos;seedNewRightPos++){ // try starting the seed one char at a time to the right, but do not let the seed start beyond the right end of the next seed
topPtr=0; // initialize pointers
bottomPtr=sizeBWT;
for(seedPos=seedNewRightPos;seedPos>=seedOldLeftPos;seedPos--){ // set new starting position of the seed and check how far to the left the seed can reach
c=readChars[seedPos];
numOccs=FMI_FollowLetter(c,&topPtr,&bottomPtr);
if(numOccs==0){ // the seed ended
seedPos++; // the last valid char was the one before (to the right)
break;
}
if(seedPos<=seedOldRightPos){ // only compare and pointers if the pos is over the original seed pointers
if((topPtr==topPointers[seedPos]) && (bottomPtr==bottomPointers[seedPos])){ // if the pointers are the same as previous found pointers
seedPos=seedOldLeftPos; // the seed will end at a pos lower or equal at the original end pos, so we can continue extending it
break;
}
topPointers[seedPos]=topPtr; // save pointers
bottomPointers[seedPos]=bottomPtr;
}
}
if(seedPos>seedOldLeftPos) break; // continue to extend the seed while the end is equal to (or -1) the original end
seedExtraSize++; // seed was successfuly extended one char to the right
(seed->bwtTopPointer)=topPointers[seedNewLeftPos]; // save pointers now because in the last (incomplete) run, they can have a wrong value
(seed->numHits)=(bottomPointers[seedNewLeftPos]-topPointers[seedNewLeftPos]+1);
} // next, start the seed one more char to the right
(seed->size)+=seedExtraSize; // save the new size of this seed
}
#ifdef DEBUG
PrintSeeds(read,strandchar,0);
PrintSeeds(read,strandchar,1);
#endif
}
}
void CheckDepthSeeds(Read *read){
unsigned int topPtr, bottomPtr;
unsigned int *fwdTopPointers, *fwdBottomPointers;
unsigned int *revTopPointers, *revBottomPointers;
int *fwdSeedsEndPos, *revSeedsEndPos;
//int numFwdSeeds, numRevSeeds;
int readSize, i, j;
unsigned int n;
char *readChars, c;
/*
#ifdef DEBUG
fprintf(debugfile,">%s\n%d\n",(read->name),(read->size));
#endif
*/
readSize=(read->size);
fwdTopPointers=(unsigned int *)malloc(readSize*sizeof(unsigned int));
revTopPointers=(unsigned int *)malloc(readSize*sizeof(unsigned int));
fwdBottomPointers=(unsigned int *)malloc(readSize*sizeof(unsigned int));
revBottomPointers=(unsigned int *)malloc(readSize*sizeof(unsigned int));
fwdSeedsEndPos=(int *)malloc(readSize*sizeof(int));
revSeedsEndPos=(int *)malloc(readSize*sizeof(int));
for(i=0;i<readSize;i++){ // reset index pointers
fwdTopPointers[i]=0;
revTopPointers[i]=0;
fwdBottomPointers[i]=0;
revBottomPointers[i]=0;
fwdSeedsEndPos[i]=-1;
revSeedsEndPos[i]=-1;
}
readChars=(read->fwdStrand.chars); // process fwd chars
for(i=(readSize-1);i>=0;i--){ // get longest seed starting on each position of the read (from right to left)
topPtr=0;
bottomPtr=sizeBWT;
for(j=i;j>=0;j--){ // process all read chars from position (i) to position 0
c=readChars[j];
n=FMI_FollowLetter(c,&topPtr,&bottomPtr);
if(n==0) break;// if there are no matches by this letter, stop seed here
if( topPtr==fwdTopPointers[j] && bottomPtr==fwdBottomPointers[j] ){ // if a prev start pos (i>j) already set this same index interval at this pos (j), the end pos will be the same
j=fwdSeedsEndPos[(i+1)]; // set the same end pos
j--; // decrease pos because it will be increased later
break;
}
fwdTopPointers[j]=topPtr; // save index pointers for this position
fwdBottomPointers[j]=bottomPtr;
}
j++; // last pos with matches
fwdSeedsEndPos[i]=j; // save pos where this seed ended (last valid char)
fwdTopPointers[i]=fwdTopPointers[j]; // save last valid index pointers for the seed starting at this pos
fwdBottomPointers[i]=fwdBottomPointers[j];
}
#ifdef DEBUG
fprintf(debugfile,"+\n%s\n",(read->fwdStrand.chars));
for(i=(readSize-1);i>=0;i--){
for(j=0;j<fwdSeedsEndPos[i];j++) fputc(' ',debugfile);
if(j!=i){
fputc('<',debugfile);
j++;
}
for(;j<i;j++) fputc('=',debugfile);
fputc('|',debugfile);
j++;
for(;j<readSize;j++) fputc(' ',debugfile);
fprintf(debugfile,"[%d-%d] %d %u\n",fwdSeedsEndPos[i],i,(i-fwdSeedsEndPos[i]+1),(fwdBottomPointers[i]-fwdTopPointers[i]+1));
}
#endif
readChars=(read->revStrand.chars); // process rev chars
for(i=(readSize-1);i>=0;i--){ // get longest seed starting on each position of the read (from right to left)
topPtr=0;
bottomPtr=sizeBWT;
for(j=i;j>=0;j--){ // process all read chars from position (i) to position 0
c=readChars[j];
n=FMI_FollowLetter(c,&topPtr,&bottomPtr);
if(n==0) break;// if there are no matches by this letter, stop seed here
if( topPtr==revTopPointers[j] && bottomPtr==revBottomPointers[j] ){ // if a prev start pos (i) already set this same index interval at this pos (j), the end pos will be the same
j=revSeedsEndPos[(i+1)]; // set the same end pos
j--; // decrease pos because it will be increased later
break;
}
revTopPointers[j]=topPtr; // save index pointers for this position
revBottomPointers[j]=bottomPtr;
}
j++; // last pos with matches
revSeedsEndPos[i]=j; // save pos where this seed ended (last valid char)
revTopPointers[i]=revTopPointers[j]; // save last valid index pointers for the seed starting at this pos
revBottomPointers[i]=revBottomPointers[j];
}
#ifdef DEBUG
fprintf(debugfile,"-\n%s\n",(read->revStrand.chars));
for(i=(readSize-1);i>=0;i--){
for(j=0;j<revSeedsEndPos[i];j++) fputc(' ',debugfile);
if(j!=i){