From ff50e8f38341027fb0c04567123bbcb7854d4dcf Mon Sep 17 00:00:00 2001 From: Li Date: Mon, 20 Jan 2020 19:05:14 -0500 Subject: [PATCH] Support barcode count in simplerep --- run-trust4 | 4 +++- trust-simplerep.pl | 31 +++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/run-trust4 b/run-trust4 index 145e9e8..90cc26b 100755 --- a/run-trust4 +++ b/run-trust4 @@ -61,6 +61,7 @@ my $vdjcRef = "" ; my $bamExtractorArgs = "" ; my $mainArgs = "" ; my $annotatorArgs = "" ; +my $simpleRepArgs = "" ; my $threadCnt = 1 ; my $stage = 0 ; my $noExtraction = 0 ; @@ -129,6 +130,7 @@ for ( $i = 0 ; $i < @ARGV ; ++$i ) { $hasBarcode = 1 ; $bamExtractorArgs .= " --barcode ".$ARGV[$i + 1] ; + $simpleRepArgs .= " --barcodeCnt --noPartial" ; ++$i ; } elsif ( $ARGV[$i] eq "--stage" ) @@ -263,7 +265,7 @@ if ( $stage <= 2 ) # Generate the report table if ( $stage <= 3 ) { - system_call( "perl $WD/trust-simplerep.pl ${prefix}_cdr3.out > ${prefix}_report.tsv" ) ; + system_call( "perl $WD/trust-simplerep.pl ${prefix}_cdr3.out $simpleRepArgs > ${prefix}_report.tsv" ) ; } print STDERR "[".localtime()."] TRUST4 finishes.\n" ; diff --git a/trust-simplerep.pl b/trust-simplerep.pl index d605df9..74ebd78 100644 --- a/trust-simplerep.pl +++ b/trust-simplerep.pl @@ -6,7 +6,9 @@ die "Usage: ./trust-simplerep.pl xxx_cdr3.out [OPTIONS] > trust_report.out\n". "OPTIONS:\n". "\t--decimalCnt: the count column uses decimal instead of truncated integer. (default: not used)\n". + "\t--barcodeCnt: the count column is the number of barcode instead of read support. (default: not used)\n". "\t--junction trust_annot.fa: output junction information for the CDR3 (default: not used)\n". + "\t--noPartial: do not including partial CDR3 in report. (default: include partial)\n". "\t--filterTcrError FLOAT: filter TCR CDR3s less than the fraction of representative CDR3 in the consensus. (default: 0.05)\n" if ( @ARGV == 0 ) ; @@ -176,6 +178,8 @@ sub InferConstantGene my $reportJunctionInfo = 0 ; my $tcrErrorFilter = 0.05 ; my $roundDownCount = 1 ; +my $useBarcodeCnt = 0 ; +my $reportPartial = 1 ; for ( $i = 1 ; $i < @ARGV ; ++$i ) { if ( $ARGV[$i] eq "--junction" ) @@ -193,6 +197,14 @@ sub InferConstantGene { $roundDownCount = 0 ; } + elsif ( $ARGV[$i] eq "--barcodeCnt" ) + { + $useBarcodeCnt = 1 ; + } + elsif ( $ARGV[$i] eq "--noPartial" ) + { + $reportPartial = 0 ; + } else { die "Unknown option ", $ARGV[$i], "\n" ; @@ -312,6 +324,7 @@ sub InferConstantGene open FP1, $ARGV[0] ; my @totalCnt = (0, 0, 0) ; my %cdr3AssemblyId ; # Record which assembly is representative for the cdr3 +my %cdr3Barcode ; # record whether a "chain_cdr3_barcode" has showed up before or not while ( ) { chomp ; @@ -326,6 +339,8 @@ sub InferConstantGene my $key = join( "\t", ( $vgene, $dgene, $jgene, $cgene, $cols[8] ) ) ; my $type = GetChainType( $vgene, $jgene, $cgene ) ; + next if ( $reportPartial == 0 && $cols[9] == 0 ) ; + if ($type == 2) # TCR { if ($cols[10] < $assemblyMostReads{$assemblyId} * $tcrErrorFilter ) @@ -334,6 +349,22 @@ sub InferConstantGene } } + if ( $useBarcodeCnt ) + { + my @cols2 = split/_/, $assemblyId ; + my $barcode = join( "_", @cols2[0..scalar(@cols2)-2] ) ; + my $tmp = $type."_".$cols[8]."_".$barcode ; + if ( defined $cdr3Barcode{$tmp} ) + { + next ; + } + else + { + $cdr3Barcode{$tmp} = 1 ; + $cols[10] = 1 ; + } + } + if ( defined $cdr3{ $key } ) { my $val = \@{ $cdr3{ $key } } ;