diff --git a/trust-simplerep.pl b/trust-simplerep.pl index c5823c3..c20b83a 100644 --- a/trust-simplerep.pl +++ b/trust-simplerep.pl @@ -96,11 +96,13 @@ sub GetChainType my $i ; my $annotFile = "" ; +my $reportJunctionInfo = 0 ; for ( $i = 1 ; $i < @ARGV ; ++$i ) { - if ( $ARGV[$i] == "--junction" ) + if ( $ARGV[$i] eq "--junction" ) { $annotFile = $ARGV[$i + 1] ; + $reportJunctionInfo = 1 ; ++$i ; } else @@ -109,26 +111,127 @@ sub GetChainType } } -my %junctionInfo ; # V-end, V-del, VD(J)-Ins, D-Ldel, D-start, D-end, D-Rdel, DJ-Ins, J-del, J-start +my %junctionInfo ; # key: assembly id. Values: V-end, V-del, VD(J)-Ins, D-Ldel, D-start, D-end, D-Rdel, DJ-Ins, J-del, J-start +if ( $reportJunctionInfo == 1 ) +{ + open FP1, $annotFile ; + while ( ) + { + next if ( !/^>/ ) ; + my @cols = split /\s/, $_ ; + my @vCoord ; + my @dCoord ; + my @jCoord ; + my @cCoord ; + my @cdr3Coord ; + + if ( $cols[3] =~ /\(([0-9]+?)\):\(([0-9]+?)-([0-9]+?)\):\(([0-9]+?)-([0-9]+?)\)/ ) + { + @vCoord = ($1, $2, $3, $4, $5) ; + } + else + { + #die "Wrong format $header\n" ; + next ; + } + if ( $cols[4] =~ /\(([0-9]+?)\):\(([0-9]+?)-([0-9]+?)\):\(([0-9]+?)-([0-9]+?)\)/ ) + { + @dCoord = ($1, $2, $3, $4, $5) ; + } + else + { + @dCoord = (-1, -1, -1, -1, -1) ; + } + if ( $cols[5] =~ /\(([0-9]+?)\):\(([0-9]+?)-([0-9]+?)\):\(([0-9]+?)-([0-9]+?)\)/ ) + { + @jCoord = ($1, $2, $3, $4, $5) ; + } + else + { + #die "Wrong format $header\n" ; + next ; + } + if ( $cols[6] =~ /\(([0-9]+?)\):\(([0-9]+?)-([0-9]+?)\):\(([0-9]+?)-([0-9]+?)\)/ ) + { + @cCoord = ($1, $2, $3, $4, $5) ; + } + else + { + #die "Wrong format $header\n" ; + next ; + } + next if ( $vCoord[2] >= $jCoord[1] ) ; + next if ( $cols[9] =~ /:0\.00/ ) ; + if ( $cols[9] =~ /CDR3\(([0-9]+?)-([0-9]+?)\)/ ) + { + @cdr3Coord = ($1, $2) ; + } + else + { + next ; + } + + next if ( $vCoord[2] < $cdr3Coord[0] || $jCoord[1] > $cdr3Coord[1] ) ; + + my $chainType = substr( $cols[3], 0, 3 ) ; + my @info ; + # v end, vdelete + push @info, $vCoord[2] - $cdr3Coord[0], $vCoord[0] - $vCoord[4] - 1 ; + + # v insert, d-left-delete, d-start, d-end, d-right-delete, j insert + if ( $chainType eq "IGH" || $chainType eq "TRB" || $chainType eq "TRD" ) + { + if ( $dCoord[0] == -1 || $dCoord[1] <= $vCoord[2] || $dCoord[2] >= $jCoord[1]) + { + push @info, "*", "*", "*", "*", "*", "*" ; + } + else + { + push @info, $dCoord[1] - $vCoord[2] - 1, $dCoord[3], + $dCoord[1] - $cdr3Coord[0], $dCoord[2] - $cdr3Coord[0], + $dCoord[0] - $dCoord[4] - 1, + $jCoord[1] - $dCoord[2] - 1 ; + } + } + else + { + # already make sure J gene is after V gene in this case. + push @info, $jCoord[1] - $vCoord[2] - 1, "*", "*", "*", "*", "*" ; + } + + # j-left-delete, j-start + push @info, $jCoord[3], $jCoord[1] - $cdr3Coord[0] ; + + #print(substr($cols[0], 1), ":", join(",", @info), "\n") ; + @{$junctionInfo{ substr( $cols[0], 1 )}} = @info ; + } + close FP1 ; +} # read in the input open FP1, $ARGV[0] ; my @totalCnt = (0, 0, 0) ; +my %cdr3AssemblyId ; # Record which assembly is representative for the cdr3 while ( ) { chomp ; my @cols = split ; my $key = join( "\t", ( $cols[2], $cols[3], $cols[4], $cols[5], $cols[8] ) ) ; + my $assemblyId = $cols[0] ; if ( defined $cdr3{ $key } ) { my $val = \@{ $cdr3{ $key } } ; - $val->[0] = $cols[9] if ( $cols[9] > $val->[0] ) ; + if ( $cols[9] > $val->[0] ) + { + $val->[0] = $cols[9]; + $val->[2] = $assemblyId ; + } $val->[1] += $cols[10] ; } else { - @{ $cdr3{ $key } } = ( $cols[9], $cols[10] ) ; + @{ $cdr3{ $key } } = ( $cols[9], $cols[10], $assemblyId ) ; } my $type = GetChainType( $cols[2], $cols[4], $cols[5] ) ; $totalCnt[ $type ] += $cols[10] if ( $type != -1 ) ; @@ -136,7 +239,13 @@ sub GetChainType close FP1 ; # Output what we collected. -print( "#count\tfrequency\tCDR3nt\tCDR3aa\tV\tD\tJ\tC\n" ) ; +print( "#count\tfrequency\tCDR3nt\tCDR3aa\tV\tD\tJ\tC" ) ; +if ( $reportJunctionInfo == 1 ) +{ + print( "\tjunction" ) +} +print( "\n" ) ; + foreach my $key ( sort { $cdr3{$b}[1] <=> $cdr3{$a}[1] } keys %cdr3 ) { my @val = @{ $cdr3{ $key } } ; @@ -172,5 +281,17 @@ sub GetChainType my $freq = 0 ; my $type = GetChainType( $info[0], $info[2], $info[3] ) ; $freq = $val[1] / $totalCnt[ $type ] if ( $type != -1 ) ; - printf( "%.2f\t%e\t%s\t%s\t%s\t%s\t%s\t%s\n", $val[1], $freq, $info[4], $aa, $info[0], $info[1], $info[2], $info[3] ) ; + printf( "%.2f\t%e\t%s\t%s\t%s\t%s\t%s\t%s", $val[1], $freq, $info[4], $aa, $info[0], $info[1], $info[2], $info[3] ) ; + if ( $reportJunctionInfo == 1 ) + { + if ( defined $junctionInfo{$val[2]} ) + { + print( "\t", join( ",", @{$junctionInfo{$val[2]}}) ) ; + } + else + { + print( "\t*" ) ; + } + } + print( "\n" ) ; }