Merge remote-tracking branch 'origin/hotfix'

griffithlab · Dec 9, 2019 · 1c73528 · 1c73528
2 parents 5ea5160 + f892f15
commit 1c73528
Show file tree

Hide file tree

Showing 11 changed files with 65 additions and 28 deletions.
diff --git a/docs/conf.py b/docs/conf.py
@@ -69,7 +69,7 @@
 # The short X.Y version.
 version = '1.5'
 # The full version, including alpha/beta/rc tags.
-release = '1.5.3'
+release = '1.5.4'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/docs/index.rst b/docs/index.rst
@@ -52,21 +52,17 @@ New in release |release|
 
 This is a hotfix release. It fixes the following issues:
 
-- pVACbind would previously throw an error if a peptide sequence in the input
-  fasta was shorter than one of the chosen epitope lengths. This issue has
-  been fixed by first parsing the input fasta and creating individual fasta
-  files for each epitope length that enforce a minimum length of the peptide
-  sequences matching the respective epitope length.
-- Previous versions of pVACtools resolved an issue where IEDB would output a
-  warning line if one of the epitope sequences only contained A, C, G, or T
-  amino acids, since those sequences could also be nuclotide sequences.
-  However, this issue was only fixed in pVACseq, not pVACbind, or pVACvector.
-  This release fixes this issue for all tools.
-- The wrappers for NetChop or NetMHCstabpan split the set of input epitopes
-  into chunks of 100 before processing. Due to a bug in the file splitting
-  logic, one epitope for each chunk over 100 would be errenously dropped. This
-  effectively would result in less epitopes being returned in the filtered
-  report than if running the pipelines without NetChop or NetMHCstabpan.
+- The ``pvacseq generate_protein_fasta`` command would previously error out
+  when running with a selected ``peptide_sequence_length`` that would result
+  in peptides < 7 amino acids long. This error would occur when calculating
+  manufacturability metrics. This release now only calculates these metrics
+  for peptides >=7 amino acids long.
+- We updated the calculation for the flanking sequence length when generating
+  peptide sequences to result in peptides that are closer in length to the
+  requested ``peptide_sequence_length``.
+- This release fixes an edge case where a frameshift mutation impacted the
+  first amino acid of a transcript. This case would previously throw a fatal
+  error but will now be processed as expected.
 
 New in version |version|
 ------------------------

diff --git a/lib/calculate_manufacturability.py b/lib/calculate_manufacturability.py
@@ -49,27 +49,30 @@ def append_manufacturability_metrics(self, line, peptide):
     def execute(self):
         if self.file_type == 'fasta':
             with open(self.output_file, 'w') as output_fh:
-                writer = csv.DictWriter(output_fh, delimiter = "\t", fieldnames=['id', 'peptide_sequence'] + self.manufacturability_headers(), extrasaction='ignore')
+                writer = csv.DictWriter(output_fh, delimiter = "\t", fieldnames=['id', 'peptide_sequence'] + self.manufacturability_headers(), extrasaction='ignore', restval='NA')
                 writer.writeheader()
                 for record in SeqIO.parse(self.input_file, "fasta"):
                     seq_num = record.id
-                    peptide = str(record.seq)
+                    sequence = str(record.seq)
                     line = {
                         'id': seq_num,
-                        'peptide_sequence': peptide
+                        'peptide_sequence': sequence
                     }
-                    peptide = PvacpeptideVaccinePeptide(peptide)
-                    line = self.append_manufacturability_metrics(line, peptide)
+                    if len(sequence) >= 7:
+                        peptide = PvacpeptideVaccinePeptide(sequence)
+                        line = self.append_manufacturability_metrics(line, peptide)
                     writer.writerow(line)
         else:
             with open(self.input_file) as input_fh, open(self.output_file, 'w') as output_fh:
                 reader = csv.DictReader(input_fh, delimiter = "\t")
-                writer = csv.DictWriter(output_fh, delimiter = "\t", fieldnames=reader.fieldnames + self.manufacturability_headers(), extrasaction='ignore')
+                writer = csv.DictWriter(output_fh, delimiter = "\t", fieldnames=reader.fieldnames + self.manufacturability_headers(), extrasaction='ignore', restval='NA')
                 writer.writeheader()
                 for line in reader:
                     if self.file_type == 'pVACbind':
-                        peptide = PvacpeptideVaccinePeptide(line['Epitope Seq'])
+                        sequence = line['Epitope Seq']
                     else:
-                        peptide = PvacpeptideVaccinePeptide(line['MT Epitope Seq'])
-                    line = self.append_manufacturability_metrics(line, peptide)
+                        sequence = line['MT Epitope Seq']
+                    if len(sequence) >= 7:
+                        peptide = PvacpeptideVaccinePeptide(sequence)
+                        line = self.append_manufacturability_metrics(line, peptide)
                     writer.writerow(line)
diff --git a/lib/fasta_generator.py b/lib/fasta_generator.py
@@ -58,7 +58,7 @@ def determine_peptide_sequence_length(self, full_wildtype_sequence_length, pepti
     def determine_flanking_sequence_length(self, full_wildtype_sequence_length, peptide_sequence_length, line):
         actual_peptide_sequence_length = self.determine_peptide_sequence_length(full_wildtype_sequence_length, peptide_sequence_length, line)
         if actual_peptide_sequence_length%2 == 0:
-            return int((actual_peptide_sequence_length-2) / 2)
+            return int(actual_peptide_sequence_length / 2)
         else:
             return int((actual_peptide_sequence_length-1) / 2)
 

diff --git a/lib/output_parser.py b/lib/output_parser.py
@@ -130,7 +130,7 @@ def match_wildtype_and_mutant_entry_for_frameshift(self, result, mt_position, wt
                 result['wt_epitope_seq'] = 'NA'
                 result['wt_scores']      = dict.fromkeys(result['mt_scores'].keys(), 'NA')
             mutation_position = self.find_mutation_position(wt_epitope_seq, mt_epitope_seq)
-            if mutation_position == 1 and int(previous_result['mutation_position']) <= 1:
+            if mutation_position == 1 and previous_result is not None and int(previous_result['mutation_position']) <= 1:
                 #The true mutation position is to the left of the current MT epitope
                 mutation_position = 0
             result['mutation_position'] = mutation_position

diff --git a/setup.py b/setup.py
@@ -61,7 +61,7 @@
 
 setup(
     name="pvactools",
-    version="1.5.3",
+    version="1.5.4",
     packages=[
         "tools",
         "tools.pvacbind",

diff --git a/...test_data/output_parser/input_frameshift_variant_position_1.MHCnuggetsI.HLA-A*02:01.8.tsv b/...test_data/output_parser/input_frameshift_variant_position_1.MHCnuggetsI.HLA-A*02:01.8.tsv
@@ -0,0 +1,8 @@
+peptide	ic50	seq_num	start	allele
+PRPKRLST	32090.567698455194	2	2	HLA-A*02:01
+RPRPKRLS	32488.54503574128	2	1	HLA-A*02:01
+AAAPEAPV	10669.99440612248	1	1	HLA-A*02:01
+RPKRLSTR	31020.901282749608	2	3	HLA-A*02:01
+APEAPVYA	17866.570741869527	1	3	HLA-A*02:01
+PKRLSTRT	31935.534444637513	2	4	HLA-A*02:01
+AAPEAPVY	30318.795326400505	1	2	HLA-A*02:01
diff --git a/tests/test_data/output_parser/input_frameshift_variant_position_1.key b/tests/test_data/output_parser/input_frameshift_variant_position_1.key
@@ -0,0 +1,4 @@
+1:
+- WT.1.JUND.ENST00000600972.FS.1GC/G
+2:
+- MT.1.JUND.ENST00000600972.FS.1GC/G
diff --git a/tests/test_data/output_parser/input_frameshift_variant_position_1.tsv b/tests/test_data/output_parser/input_frameshift_variant_position_1.tsv
@@ -0,0 +1,2 @@
+chromosome_name	start	stop	reference	variant	gene_name	transcript_name	transcript_support_level	amino_acid_change	codon_change	ensembl_gene_id	hgvsc	hgvsp	wildtype_amino_acid_sequence	downstream_amino_acid_sequence	fusion_amino_acid_sequence	variant_type	protein_position	transcript_expression	gene_expression	normal_depth	normal_vaf	tdna_depth	tdna_vaf	trna_depth	trna_vaf	index	protein_length_change
+19	18280928	18280929	GC	G	JUND	ENST00000600972	2	A/X	Gcg/cg	ENSG00000130522	ENST00000600972.1:c.1del	ENSP00000475153.2:p.Ala1ArgfsTer?	AAAPEAPVYANLSSYAGGAGGAGGAATVAFAAEPVPFPPPPPPGALGPPRLAALKDEPQTVPDVPSFGESPPLSPIDMDTQERIKAERKRLRNRIAASKCRKRKLERISRLEEKVKTLKSQNTELASTASLLREQVAQLKQKVLSHVNSGCQLLPQHQREEQSVRF	RPRPKRLSTRT		FS	1	NA	NA	NA	NA	4	1.0	NA	NA	1.JUND.ENST00000600972.FS.1GC/G	-154
diff --git a/tests/test_data/output_parser/output_frameshift_variant_position_1.iedb.parsed.tsv b/tests/test_data/output_parser/output_frameshift_variant_position_1.iedb.parsed.tsv
@@ -0,0 +1,5 @@
+Chromosome	Start	Stop	Reference	Variant	Transcript	Transcript Support Level	Ensembl Gene ID	Variant Type	Mutation	Protein Position	Gene Name	HGVSc	HGVSp	HLA Allele	Peptide Length	Sub-peptide Position	Mutation Position	MT Epitope Seq	WT Epitope Seq	Best MT Score Method	Best MT Score	Corresponding WT Score	Corresponding Fold Change	Tumor DNA Depth	Tumor DNA VAF	Tumor RNA Depth	Tumor RNA VAF	Normal Depth	Normal VAF	Gene Expression	Transcript Expression	Median MT Score	Median WT Score	Median Fold Change	MHCnuggetsI WT Score	MHCnuggetsI MT Score
+19	18280928	18280929	GC	G	ENST00000600972	2	ENSG00000130522	FS	A/X	1	JUND	ENST00000600972.1:c.1del	ENSP00000475153.2:p.Ala1ArgfsTer?	HLA-A*02:01	8	4	0	PKRLSTRT	NA	MHCnuggetsI	31935.534	NA	NA	4	1.0	NA	NA	NA	NA	NA	NA	31935.534	NA	NA	NA	31935.534444637513
+19	18280928	18280929	GC	G	ENST00000600972	2	ENSG00000130522	FS	A/X	1	JUND	ENST00000600972.1:c.1del	ENSP00000475153.2:p.Ala1ArgfsTer?	HLA-A*02:01	8	1	1	RPRPKRLS	NA	MHCnuggetsI	32488.545	NA	NA	4	1.0	NA	NA	NA	NA	NA	NA	32488.545	NA	NA	NA	32488.54503574128
+19	18280928	18280929	GC	G	ENST00000600972	2	ENSG00000130522	FS	A/X	1	JUND	ENST00000600972.1:c.1del	ENSP00000475153.2:p.Ala1ArgfsTer?	HLA-A*02:01	8	3	0	RPKRLSTR	NA	MHCnuggetsI	31020.901	NA	NA	4	1.0	NA	NA	NA	NA	NA	NA	31020.901	NA	NA	NA	31020.901282749608
+19	18280928	18280929	GC	G	ENST00000600972	2	ENSG00000130522	FS	A/X	1	JUND	ENST00000600972.1:c.1del	ENSP00000475153.2:p.Ala1ArgfsTer?	HLA-A*02:01	8	2	0	PRPKRLST	NA	MHCnuggetsI	32090.568	NA	NA	4	1.0	NA	NA	NA	NA	NA	NA	32090.568	NA	NA	NA	32090.567698455194
diff --git a/tests/test_output_parser.py b/tests/test_output_parser.py
@@ -155,6 +155,25 @@ def test_input_frameshift_variant_feature_truncation2_gets_parsed_correctly(self
         expected_output_file  = os.path.join(self.test_data_dir, "output_frameshift_variant_feature_truncation2.iedb.parsed.tsv")
         self.assertTrue(compare(parse_output_output_file.name, expected_output_file))
 
+    def test_input_frameshift_variant_position_1_gets_parsed_correctly(self):
+        parse_output_input_iedb_file = [os.path.join(self.test_data_dir, "input_frameshift_variant_position_1.MHCnuggetsI.HLA-A*02:01.8.tsv")]
+        parse_output_input_tsv_file = os.path.join(self.test_data_dir, "input_frameshift_variant_position_1.tsv")
+        parse_output_key_file = os.path.join(self.test_data_dir, "input_frameshift_variant_position_1.key")
+        parse_output_output_file = tempfile.NamedTemporaryFile()
+
+        parse_output_params = {
+            'input_iedb_files'       : parse_output_input_iedb_file,
+            'input_tsv_file'         : parse_output_input_tsv_file,
+            'key_file'               : parse_output_key_file,
+            'output_file'            : parse_output_output_file.name,
+            'sample_name'            : None,
+        }
+        parser = DefaultOutputParser(**parse_output_params)
+
+        self.assertFalse(parser.execute())
+        expected_output_file  = os.path.join(self.test_data_dir, "output_frameshift_variant_position_1.iedb.parsed.tsv")
+        self.assertTrue(compare(parse_output_output_file.name, expected_output_file))
+
     def test_input_inframe_deletion_aa_deletion_gets_parsed_correctly(self):
         parse_output_input_iedb_file = [os.path.join(self.test_data_dir, "input_inframe_deletion_aa_deletion.ann.HLA-A*29:02.9.tsv")]
         parse_output_input_tsv_file = os.path.join(self.test_data_dir, "input_inframe_deletion_aa_deletion.tsv")
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		chromosome_name start stop reference variant gene_name transcript_name transcript_support_level amino_acid_change codon_change ensembl_gene_id hgvsc hgvsp wildtype_amino_acid_sequence downstream_amino_acid_sequence fusion_amino_acid_sequence variant_type protein_position transcript_expression gene_expression normal_depth normal_vaf tdna_depth tdna_vaf trna_depth trna_vaf index protein_length_change
		19 18280928 18280929 GC G JUND ENST00000600972 2 A/X Gcg/cg ENSG00000130522 ENST00000600972.1:c.1del ENSP00000475153.2:p.Ala1ArgfsTer? AAAPEAPVYANLSSYAGGAGGAGGAATVAFAAEPVPFPPPPPPGALGPPRLAALKDEPQTVPDVPSFGESPPLSPIDMDTQERIKAERKRLRNRIAASKCRKRKLERISRLEEKVKTLKSQNTELASTASLLREQVAQLKQKVLSHVNSGCQLLPQHQREEQSVRF RPRPKRLSTRT FS 1 NA NA NA NA 4 1.0 NA NA 1.JUND.ENST00000600972.FS.1GC/G -154