genepattern · jluebeck · Feb 15, 2024 · Feb 15, 2024 · Feb 15, 2024 · Feb 15, 2024
diff --git a/src/AmpliconSuiteAggregatorFunctions.py b/src/AmpliconSuiteAggregatorFunctions.py
@@ -4,7 +4,6 @@
 # AmpliconClassifier results and aggregates the results.
 ###
 import sys
-import tarfile
 import os
 import re
 import pandas as pd
@@ -79,8 +78,19 @@ def unzip_file(fp, dest_root):
                 zip_ref.extractall(destination)
             zip_ref.close()
 
+        elif fp.endswith(".tar"):
+            zip_name = os.path.basename(fp).replace(".tar", "")
+            destination = f'{dest_root}/{zip_name}'
+            with tarfile.open(fp, 'r') as output_zip:
+                output_zip.extractall(destination)
+            output_zip.close()
+
+        else:
+            print("File " + fp + " is not a zip or tar file. It may be ignored!")
+
     except Exception as e:
         print(e)
+        sys.exit(1)
 
 
 def clean_dirs(dlist):
@@ -112,6 +122,7 @@ def __init__(self, filelist, root, output_name, run_classifier, ref, py3_path, n
             self.run_amp_classifier()
         self.samp_AA_dct, self.samp_ckit_dct = defaultdict(str), defaultdict(str)
         self.samp_mdata_dct, self.run_mdata_dct = defaultdict(str), defaultdict(str)
+        self.samp_cnv_calls_dct = defaultdict(str)
         self.locate_dirs_and_metadata_jsons()
         # print(self.samp_ckit_dct)
         # print(self.samp_AA_dct)
@@ -210,6 +221,10 @@ def locate_dirs_and_metadata_jsons(self):
                         implied_sname = rchop(f, "_sample_metadata.json")
                         self.samp_mdata_dct[implied_sname] = fp + "/" + f
 
+                    elif f.endswith("_CNV_CALLS.bed"):
+                        implied_sname = rchop(f, "_CNV_CALLS.bed")
+                        self.samp_cnv_calls_dct[implied_sname] = fp + "/" + f
+
     def run_amp_classifier(self):
         """
         Goes into the OUTPUT_PATH to look for and delete amplicon classifier results. 
@@ -257,16 +272,22 @@ def run_amp_classifier(self):
             AC_SRC = os.environ['AC_SRC']
         except KeyError:
             sys.stderr.write("AC_SRC variable not found! AmpliconClassifier is not properly installed.\n")
+            self.cleanup()
             sys.exit(1)
 
         print(f"AC_SRC is set to {AC_SRC}")
-        os.system(f"{AC_SRC}/make_input.sh {OUTPUT_PATH} {OUTPUT_PATH}/{self.output_name}" )
+        input_ec = os.system(f"{AC_SRC}/make_input.sh {OUTPUT_PATH} {OUTPUT_PATH}/{self.output_name}")
+        if input_ec != 0:
+            print("Failed to make input for AC!")
+            self.cleanup()
+            sys.exit(1)
 
         ## if reference isn't downloaded already, then download appropriate reference genome
         try:
             local_data_repo = os.environ['AA_DATA_REPO']
         except KeyError:
             sys.stderr.write("AA_DATA_REPO variable not found! The AA data repo directory is not properly configured.\n")
+            self.cleanup()
             sys.exit(1)
 
         if not os.path.exists(os.path.join(local_data_repo, self.ref)):
@@ -304,11 +325,13 @@ def aggregate_tables(self):
         sample_num = 1
         # aggregate results
         print("Aggregating results into tables")
+        found_res_table = False
         for res_dir in [OUTPUT_PATH, OTHER_FILES]:
             for root, dirs, files in os.walk(res_dir, topdown = False):
                 for name in files:
                     if name.endswith("_result_table.tsv") and not name.startswith("._"):
                         result_table_fp = os.path.join(root, name)
+                        found_res_table = True
                         try:
                             df = pd.read_csv(result_table_fp, delimiter = '\t')
                             aggregate = pd.concat([aggregate, df], ignore_index = True)
@@ -325,6 +348,12 @@ def aggregate_tables(self):
                             sample_to_ac_dir[f'sample_{sample_num}'] = os.path.dirname(result_table_fp)
                             sample_num += 1
 
+        if not found_res_table:
+            print("Error: No results tables found! Aggregation will be empty or invalid. Please make sure to first run make_results_table.py from AmpliconClassifier.")
+            self.cleanup()
+            sys.exit(1)
+
+
         ## output the table
         with open('./results/run.json', 'w') as run_file:
             json.dump({'runs': runs}, run_file)
@@ -393,10 +422,13 @@ def json_modifications(self):
                 # updating string of lists to lists
                 sample_name = sample_dct["Sample name"]
                 ref_genomes.add(sample_dct["Reference version"])
+                if sample_dct["Reference version"] is None:
+                    sys.stderr.write("WARNING: " + sample_name + " had no reference genome build indicated!\n")
                 if len(ref_genomes) > 1:
                     sys.stderr.write(str(ref_genomes) + "\n")
                     sys.stderr.write("ERROR! Multiple reference genomes detected in project.\n AmpliconRepository only "
                                      "supports single-reference projects currently. Exiting.\n")
+                    self.cleanup()
                     sys.exit(1)
 
                 potential_str_lsts = [
@@ -445,6 +477,9 @@ def json_modifications(self):
                         feat_basename = os.path.basename(sample_dct[feature])
                         feat_file = f'{self.sample_to_ac_location_dct[sample]}/files/{feat_basename}'
                         if feature == "CNV BED file" and any([feat_file.endswith(x) for x in ["AA_CNV_SEEDS.bed", "CNV_CALLS_pre_filtered.bed", "Not provided", "Not Provided"]]):
+                            if self.samp_cnv_calls_dct[sample_name]:
+                                feat_file = self.samp_cnv_calls_dct[sample_name]
+
                             cnvkit_dir = self.samp_ckit_dct[sample_dct['Sample name']]
                             if cnvkit_dir:
                                 for f in os.listdir(cnvkit_dir):