diff --git a/somatic_filter/readme b/somatic_filter/readme deleted file mode 100644 index 4c06df5f..00000000 --- a/somatic_filter/readme +++ /dev/null @@ -1,13 +0,0 @@ - -This is the initial implementation of somatic indel filtering for Pindel indel calls based on ICGC DREAM challenge round 3 test result. Please advise. contact kaiye@xjtu.edu.cn - -step 1: extract indel summary lines using -grep ChrID pindel_output_D > D.head -grep ChrID pindel_output_SI > SI.head - -step 2: run the provided perl script with an updated configuration file -perl somatic_indelfilter.pl somatic.indel.filter.config - -Somatic indel calls will be stored as a VCF file with the name specified in the configuration file. - -We assume normal-tumor paired genome runs by pindel, and that the normal sample comes first in the pindel output file. diff --git a/somatic_filter/readme.md b/somatic_filter/readme.md new file mode 100644 index 00000000..40ff529a --- /dev/null +++ b/somatic_filter/readme.md @@ -0,0 +1,29 @@ + +This is the initial implementation of somatic indel filtering for Pindel indel calls based on ICGC DREAM challenge round 3 test result. Please advise. contact + +1. Extract indel summary lines using `grep ChrID [pindel output file] > [output file]`, for example: + + grep ChrID pindel_output_D > D.head + grep ChrID pindel_output_SI > SI.head + + You may optionally use `cat` to combine the files. + +1. Generate the configuration file for the run. The file requires the following entries, with values and entries separated by an equal sign. An example is provided here as somatic.indel.filter.config. + + | Field name | Definition | + | ---------- | ---------- | + | `input` | The indel summary file generated in the previous step. | + | `vaf` | The minimum variant allele frequency at the _tumor_ sample to pass the filter. | + | `cov` | The minimum coverage at _both_ samples to pass the filter. | + | `hom` | The maximum homopolymer length at breakpoint to pass the filter. | + | `pindel2vcf` | The location of pindel's `pindel2vcf` binary. | + | `reference` | The reference genome. Passed to the `-r` parameter in `pindel2vcf`. | + | `referencename` | The name of the reference genome. Passed to the `-R` parameter in `pindel2vcf`. | + | `referencedate` | The date of the reference genome. Passed to the `-d` parameter in `pindel2vcf`. | + | `output` | The name of the output VCF file. Passed to the `-v` parameter in `pindel2vcf`. | +1. Execute `somatic_indelfilter.pl` with the configuration file + perl somatic_indelfilter.pl somatic.indel.filter.config + +Somatic indel calls will be stored as a VCF file with the name specified in the `output` field in the configuration file. + +We assume normal-tumor paired genome runs by `pindel`, and that the normal sample comes first in the pindel output file. \ No newline at end of file diff --git a/somatic_filter/somatic.indel.filter.config b/somatic_filter/somatic.indel.filter.config index ff86d55b..30e3475c 100644 --- a/somatic_filter/somatic.indel.filter.config +++ b/somatic_filter/somatic.indel.filter.config @@ -1,9 +1,9 @@ -indel.filter.input = all.head -indel.filter.vaf = 0.1 -indel.filter.cov = 20 -indel.filter.hom = 6 -indel.filter.pindel2vcf = ../pindel2vcf -indel.filter.reference = /gscuser/kye/reference/hs37d5.fa -indel.filter.referencename = hs37d5.fa -indel.filter.referencedate = 20150720 -indel.filter.output = indel.filter.output +input = all.head +vaf = 0.1 +cov = 20 +hom = 6 +pindel2vcf = ../pindel2vcf +reference = /gscuser/kye/reference/hs37d5.fa +referencename = hs37d5.fa +referencedate = 20150720 +output = indel.filter.output diff --git a/somatic_filter/somatic_indelfilter.pl b/somatic_filter/somatic_indelfilter.pl index 0752d214..a4e625d2 100644 --- a/somatic_filter/somatic_indelfilter.pl +++ b/somatic_filter/somatic_indelfilter.pl @@ -34,11 +34,15 @@ my ( undef, $nocomplex_output ) = tempfile(); my $nocomplex_output_fh = IO::File->new( $nocomplex_output, ">" ) or die "Temporary file could not be created. $!"; map { chomp; my @t = split; - if ($t[32] + $t[34] + $t[36] >= $paras{'cov'} && $t[33] + $t[34] + $t[36] >= $paras{'cov'} && $t[39] + $t[41] + $t[43] >= $paras{'cov'} && $t[40] + $t[41] + $t[43] >= $paras{'cov'}) { - if ( ($t[34] + $t[36]) == 0 && ($t[41] + $t[43])/($t[39] + $t[41] + $t[43]) >= $paras{'vaf'} && (($t[41] + $t[43])/($t[40] + $t[41] + $t[43]) >= $paras{'vaf'} ) ) { + if (($t[32] + $t[34] + $t[36] >= $paras{'cov'}) && # Sample 1: Upstream coverage + left breakpoint coverage + ($t[33] + $t[34] + $t[36] >= $paras{'cov'}) && # Sample 1: Upstream coverage + right breakpoint coverage + ($t[39] + $t[41] + $t[43] >= $paras{'cov'}) && # Sample 2: Upstream coverage + left breakpoint coverage + ($t[40] + $t[41] + $t[43] >= $paras{'cov'})) { # Sample 2: Upstream coverage + right breakpoint coverage + if ((($t[34] + $t[36]) == 0) && # Normal has zero variant frequency + ($t[41] + $t[43])/($t[39] + $t[41] + $t[43]) >= $paras{'vaf'} && (($t[41] + $t[43])/($t[40] + $t[41] + $t[43]) >= $paras{'vaf'} )) { #Tumor has adequete variant frequency #print; print "\n"; # allow complex - if ( $t[1] eq "I" || ($t[1] eq "D" && $t[4] == 0) || ($t[1] eq "D" && $t[4] > 0) ) { + if ( $t[1] eq "I" || ($t[1] eq "D" && $t[4] == 0) || ($t[1] eq "D" && $t[4] > 0) ) { # Variation is an insertion or deletion $nocomplex_output_fh->print($_."\n"); } } @@ -63,13 +67,13 @@ my $pindel2vcf_output_fh = IO::File->new( $pindel2vcf_output ) or die "Temporary file could not be opened. $!"; while ( <$pindel2vcf_output_fh> ) { print; - if ( /^#/ ) { $filter_output_fh->print($_); next }; - my @a= split /\t/; - my @b = split/\;/, $a[7]; + if ( /^#/ ) { $filter_output_fh->print($_); next }; #Preserve headers + my @a= split /\t/; # Split the VCF line + my @b = split/\;/, $a[7]; #Split the INFO field for ( my $i=0; $iprint($_); last; } + if ( $c[1] <= $paras{'hom'} ) { $filter_output_fh->print($_); last; } #Check for whether HOMLEN is shorter or equal to the hom parameter in input. } } } <$pindel2vcf_output_fh>;