Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add updates from AmpliconSuite branch #6

Open
wants to merge 9 commits into
base: develop
Choose a base branch
from
39 changes: 37 additions & 2 deletions src/AmpliconSuiteAggregatorFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
# AmpliconClassifier results and aggregates the results.
###
import sys
import tarfile
import os
import re
import pandas as pd
Expand Down Expand Up @@ -79,8 +78,19 @@ def unzip_file(fp, dest_root):
zip_ref.extractall(destination)
zip_ref.close()

elif fp.endswith(".tar"):
zip_name = os.path.basename(fp).replace(".tar", "")
destination = f'{dest_root}/{zip_name}'
with tarfile.open(fp, 'r') as output_zip:
output_zip.extractall(destination)
output_zip.close()

else:
print("File " + fp + " is not a zip or tar file. It may be ignored!")

except Exception as e:
print(e)
sys.exit(1)


def clean_dirs(dlist):
Expand Down Expand Up @@ -112,6 +122,7 @@ def __init__(self, filelist, root, output_name, run_classifier, ref, py3_path, n
self.run_amp_classifier()
self.samp_AA_dct, self.samp_ckit_dct = defaultdict(str), defaultdict(str)
self.samp_mdata_dct, self.run_mdata_dct = defaultdict(str), defaultdict(str)
self.samp_cnv_calls_dct = defaultdict(str)
self.locate_dirs_and_metadata_jsons()
# print(self.samp_ckit_dct)
# print(self.samp_AA_dct)
Expand Down Expand Up @@ -210,6 +221,10 @@ def locate_dirs_and_metadata_jsons(self):
implied_sname = rchop(f, "_sample_metadata.json")
self.samp_mdata_dct[implied_sname] = fp + "/" + f

elif f.endswith("_CNV_CALLS.bed"):
implied_sname = rchop(f, "_CNV_CALLS.bed")
self.samp_cnv_calls_dct[implied_sname] = fp + "/" + f

def run_amp_classifier(self):
"""
Goes into the OUTPUT_PATH to look for and delete amplicon classifier results.
Expand Down Expand Up @@ -257,16 +272,22 @@ def run_amp_classifier(self):
AC_SRC = os.environ['AC_SRC']
except KeyError:
sys.stderr.write("AC_SRC variable not found! AmpliconClassifier is not properly installed.\n")
self.cleanup()
sys.exit(1)

print(f"AC_SRC is set to {AC_SRC}")
os.system(f"{AC_SRC}/make_input.sh {OUTPUT_PATH} {OUTPUT_PATH}/{self.output_name}" )
input_ec = os.system(f"{AC_SRC}/make_input.sh {OUTPUT_PATH} {OUTPUT_PATH}/{self.output_name}")
if input_ec != 0:
print("Failed to make input for AC!")
self.cleanup()
sys.exit(1)

## if reference isn't downloaded already, then download appropriate reference genome
try:
local_data_repo = os.environ['AA_DATA_REPO']
except KeyError:
sys.stderr.write("AA_DATA_REPO variable not found! The AA data repo directory is not properly configured.\n")
self.cleanup()
sys.exit(1)

if not os.path.exists(os.path.join(local_data_repo, self.ref)):
Expand Down Expand Up @@ -304,11 +325,13 @@ def aggregate_tables(self):
sample_num = 1
# aggregate results
print("Aggregating results into tables")
found_res_table = False
for res_dir in [OUTPUT_PATH, OTHER_FILES]:
for root, dirs, files in os.walk(res_dir, topdown = False):
for name in files:
if name.endswith("_result_table.tsv") and not name.startswith("._"):
result_table_fp = os.path.join(root, name)
found_res_table = True
try:
df = pd.read_csv(result_table_fp, delimiter = '\t')
aggregate = pd.concat([aggregate, df], ignore_index = True)
Expand All @@ -325,6 +348,12 @@ def aggregate_tables(self):
sample_to_ac_dir[f'sample_{sample_num}'] = os.path.dirname(result_table_fp)
sample_num += 1

if not found_res_table:
print("Error: No results tables found! Aggregation will be empty or invalid. Please make sure to first run make_results_table.py from AmpliconClassifier.")
self.cleanup()
sys.exit(1)


## output the table
with open('./results/run.json', 'w') as run_file:
json.dump({'runs': runs}, run_file)
Expand Down Expand Up @@ -393,10 +422,13 @@ def json_modifications(self):
# updating string of lists to lists
sample_name = sample_dct["Sample name"]
ref_genomes.add(sample_dct["Reference version"])
if sample_dct["Reference version"] is None:
sys.stderr.write("WARNING: " + sample_name + " had no reference genome build indicated!\n")
if len(ref_genomes) > 1:
sys.stderr.write(str(ref_genomes) + "\n")
sys.stderr.write("ERROR! Multiple reference genomes detected in project.\n AmpliconRepository only "
"supports single-reference projects currently. Exiting.\n")
self.cleanup()
sys.exit(1)

potential_str_lsts = [
Expand Down Expand Up @@ -445,6 +477,9 @@ def json_modifications(self):
feat_basename = os.path.basename(sample_dct[feature])
feat_file = f'{self.sample_to_ac_location_dct[sample]}/files/{feat_basename}'
if feature == "CNV BED file" and any([feat_file.endswith(x) for x in ["AA_CNV_SEEDS.bed", "CNV_CALLS_pre_filtered.bed", "Not provided", "Not Provided"]]):
if self.samp_cnv_calls_dct[sample_name]:
feat_file = self.samp_cnv_calls_dct[sample_name]

cnvkit_dir = self.samp_ckit_dct[sample_dct['Sample name']]
if cnvkit_dir:
for f in os.listdir(cnvkit_dir):
Expand Down