Merge pull request #1079 from griffithlab/issue_1074

Handle NaN values in the Mutation Position column
griffithlab · Mar 13, 2024 · ae1f135 · ae1f135
2 parents a3d3e5d + 5f64dc2
commit ae1f135
Show file tree

Hide file tree

Showing 6 changed files with 5,398 additions and 5 deletions.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -42,6 +42,7 @@ jobs:
         run: |
           pip install polars==0.16.18
           pip install pypandoc==1.7.2
+          pip install "tensorflow<2.16"
           pip install git+https://github.com/griffithlab/bigmhc.git#egg=bigmhc
           pip install git+https://github.com/griffithlab/deepimmuno.git#egg=deepimmuno
           pip install -e .

diff --git a/pvactools/lib/aggregate_all_epitopes.py b/pvactools/lib/aggregate_all_epitopes.py
@@ -317,8 +317,6 @@ def read_input_file(self, used_columns, dtypes):
     def get_sub_df(self, all_epitopes_df, key):
         key_str = "{}-{}-{}-{}-{}".format(key[0], key[1], key[2], key[3], key[4])
         df = (all_epitopes_df[lambda x: (x['Chromosome'] == key[0]) & (x['Start'] == key[1]) & (x['Stop'] == key[2]) & (x['Reference'] == key[3]) & (x['Variant'] == key[4])]).copy()
-        df['Variant Type'] = df['Variant Type'].cat.add_categories('NA')
-        df['Mutation Position'] = df['Mutation Position'].cat.add_categories('NA')
         df['annotation'] = df[['Transcript', 'Gene Name', 'Mutation', 'Protein Position']].agg('-'.join, axis=1)
         df['key'] = key_str
         return (df, key_str)
@@ -367,14 +365,16 @@ def is_anchor_residue_pass(self, mutation):
         anchors = self.get_anchor_positions(mutation['HLA Allele'], len(mutation['MT Epitope Seq']))
         # parse out mutation position from str
         position = mutation["Mutation Position"]
-        if '-' in position:
+        if pd.isna(position):
+            return anchor_residue_pass
+        elif '-' in position:
             d_ind = position.index('-')
             if all(pos in anchors for pos in range(int(position[0:d_ind]), int(position[d_ind+1:])+1)):
                 if pd.isna(mutation["{} WT IC50 Score".format(self.wt_top_score_metric)]):
                     anchor_residue_pass = False
                 elif mutation["{} WT IC50 Score".format(self.wt_top_score_metric)] < binding_threshold:
                     anchor_residue_pass = False
-        elif position != "NA":
+        else:
             if int(float(position)) in anchors:
                 if pd.isna(mutation["{} WT IC50 Score".format(self.wt_top_score_metric)]):
                     anchor_residue_pass = False
@@ -571,7 +571,7 @@ def get_good_binders_metrics(self, good_binders, prediction_algorithms, el_algor
                         individual_el_calls[peptide_type] = el_calls
                         individual_el_percentile_calls[peptide_type] = el_percentile_calls
                     results[peptide]['hla_types'] = sorted(self.hla_types)
-                    results[peptide]['mutation_position'] = str(good_binders_peptide_annotation.iloc[0]['Mutation Position'])
+                    results[peptide]['mutation_position'] = "NA" if pd.isna(good_binders_peptide_annotation.iloc[0]['Mutation Position']) else str(good_binders_peptide_annotation.iloc[0]['Mutation Position'])
                     results[peptide]['problematic_positions'] = str(good_binders_peptide_annotation.iloc[0]['Problematic Positions']) if 'Problematic Positions' in good_binders_peptide_annotation.iloc[0] else 'None'
                     if len(anchor_fails) > 0:
                         results[peptide]['anchor_fails'] = ', '.join(anchor_fails)

diff --git a/tests/test_aggregate_all_epitopes.py b/tests/test_aggregate_all_epitopes.py
@@ -71,6 +71,32 @@ def test_aggregate_all_epitopes_HCC1395_pvacseq_runs_and_produces_expected_outpu
             self.assertTrue(os.path.isfile(pvacview_file))
             os.remove(pvacview_file)
 
+    def test_aggregate_all_epitopes_pvacseq_na_mutation_position_runs_and_produces_expected_output(self):
+        self.assertTrue(py_compile.compile(self.executable))
+        output_file = tempfile.NamedTemporaryFile(suffix='.tsv')
+        self.assertFalse(PvacseqAggregateAllEpitopes(os.path.join(self.test_data_dir, 'Test.all_epitopes.na_mutation_position.tsv'), output_file.name).execute())
+        self.assertTrue(cmp(
+            output_file.name,
+            os.path.join(self.test_data_dir, "output.na_mutation_position.tsv"),
+        ))
+
+        metrics_file = output_file.name.replace('.tsv', '.metrics.json')
+        self.assertTrue(cmp(
+            metrics_file,
+            os.path.join(self.test_data_dir, "output.na_mutation_position.metrics.json"),
+        ))
+        os.remove(metrics_file)
+
+        for i in ["ui.R", "app.R", "server.R", "styling.R", "anchor_and_helper_functions.R"]:
+            pvacview_file = os.path.join(os.path.dirname(output_file.name), i)
+            self.assertTrue(os.path.isfile(pvacview_file))
+            os.remove(pvacview_file)
+
+        for i in ["anchor.jpg", "pVACview_logo.png", "pVACview_logo_mini.png"]:
+            pvacview_file = os.path.join(os.path.dirname(output_file.name), "www", i)
+            self.assertTrue(os.path.isfile(pvacview_file))
+            os.remove(pvacview_file)
+
     def test_aggregate_all_epitopes_pvacfuse_runs_and_produces_expected_output(self):
         self.assertTrue(py_compile.compile(self.executable))
         output_file = tempfile.NamedTemporaryFile(suffix='.tsv')