Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add number of snps to per reference table data #30

Merged
merged 1 commit into from
Aug 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,8 @@ workflow {
postprocessing_wf.out.ngx_json,
postprocessing_wf.out.misassembly_reference_json,
postprocessing_wf.out.misassembly_plot_json,
assembly_wf.out.all_versions)
assembly_wf.out.all_versions,
postprocessing_wf.out.snp_report_json)
}

workflow.onComplete {
Expand Down
8 changes: 6 additions & 2 deletions modules/postprocessing/postprocessing.nf
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ process SNP_ASSESSMENT {
output:
path('*.tsv'), emit: tsv
path('*_snps.csv'), emit: csv
path('*_snps.json'), emit: json

script:
template "snp_assessment.py"
Expand All @@ -176,10 +177,12 @@ process PLOT_SNP_REFERENCE {

input:
path snp_coords_dataframes
path snps_jsons

output:
path('*.html') optional true
path('*.json'), emit: json
path('snps_in_reference.json'), emit: json
path('snps_report_per_ref.json'), emit: reference_snps_json

script:
template "plot_snp.py"
Expand Down Expand Up @@ -278,7 +281,7 @@ workflow postprocessing_wf {
PLOT_GAP_BOXPLOT(GAP_ASSESSMENT.out.json | collect)
PLOT_GAP_REFERENCE(GAP_ASSESSMENT.out.csv | collect)
SNP_ASSESSMENT(paf, triple_reference)
PLOT_SNP_REFERENCE(SNP_ASSESSMENT.out.csv | collect)
PLOT_SNP_REFERENCE(SNP_ASSESSMENT.out.csv | collect, SNP_ASSESSMENT.out.json | collect)
MISASSEMBLY(misassembly_paf)
PROCESS_MISASSEMBLY(MISASSEMBLY.out.trace_pkl | collect, MISASSEMBLY.out.contig_length_pkl | collect, MISASSEMBLY.out.misassembly_json | collect, MISASSEMBLY.out.misassembled_reference_json | collect)
PLOT_MISASSEMBLY(MISASSEMBLY.out.csv | collect)
Expand All @@ -290,6 +293,7 @@ workflow postprocessing_wf {
phred_json = PROCESS_SHRIMP_PLOT.out.json
gap_reference_json = PLOT_GAP_REFERENCE.out.json
snp_reference_json = PLOT_SNP_REFERENCE.out.json
snp_report_json = PLOT_SNP_REFERENCE.out.reference_snps_json
gap_boxplot_json = PLOT_GAP_BOXPLOT.out.json
misassembly_json = PROCESS_MISASSEMBLY.out.json
misassembly_report_json = PROCESS_MISASSEMBLY.out.report_json
Expand Down
4 changes: 3 additions & 1 deletion modules/report/report.nf
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ process COMPILE_REPORT {
file plot_misassembly_per_ref
file about_md
file containers_config
file snp_report_json

output:
path('pipeline_report*.json')
Expand Down Expand Up @@ -84,9 +85,10 @@ workflow report_wf {
misassembly_reference
plot_misassembly
version_file
snp_report_json

main:
COMPILE_VERSIONS(version_file)
COMPILE_REPORT(reads_info, stats_global, pipeline_stats, js, lmas_png, reference, plot_contig_distribution, stats_mapping, plot_completness, plot_lx, plot_phred, plot_gap_reference, plot_snp_reference, plot_gap_boxplot, misassembly_info, misassembly_report, plot_nax, plot_ngx, COMPILE_VERSIONS.out, misassembly_reference, plot_misassembly, IN_MD, containers_config)
COMPILE_REPORT(reads_info, stats_global, pipeline_stats, js, lmas_png, reference, plot_contig_distribution, stats_mapping, plot_completness, plot_lx, plot_phred, plot_gap_reference, plot_snp_reference, plot_gap_boxplot, misassembly_info, misassembly_report, plot_nax, plot_ngx, COMPILE_VERSIONS.out, misassembly_reference, plot_misassembly, IN_MD, containers_config, snp_report_json)

}
29 changes: 27 additions & 2 deletions templates/compile_reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
ABOUT_MD = "$about_md"
CONTAINERS = "$containers_config"
PLOT_MISASSEMBLY_PER_REFERENCE = "$plot_misassembly_per_ref"
SNP_REPORT_REFERENCE = "$snp_report_json"

logger.debug("Running {} with parameters:".format(
os.path.basename(__file__)))
Expand Down Expand Up @@ -69,6 +70,7 @@
logger.debug("ABOUT_MD: {}".format(ABOUT_MD))
logger.debug("CONTAINERS: {}".format(CONTAINERS))
logger.debug("CONTAINERS: {}".format(PLOT_MISASSEMBLY_PER_REFERENCE))
logger.debug("SNP_REPORT_REFERENCE: {}".format(SNP_REPORT_REFERENCE))


html_template = """
Expand Down Expand Up @@ -327,7 +329,7 @@ def process_sample_reads(reads_jsons):
def main(main_js, pipeline_stats, assembly_stats_report, contig_size_plots, mapping_stats_report, completness_plot,
lmas_logo, reference_file, lx_json, shrimp_json, gap_reference_json, gap_histogram, plot_misassembly, misassembly_report,
min_contig_size, nax_json, ngx_json, reads_json, snp_reference_json, versions_json, misassembly_per_ref, about_md,
containers_config, plot_misassembly_per_reference_json):
containers_config, plot_misassembly_per_reference_json, snp_per_ref):

metadata = {
"nfMetadata": {
Expand Down Expand Up @@ -433,6 +435,29 @@ def main(main_js, pipeline_stats, assembly_stats_report, contig_size_plots, mapp
except KeyError:
item['misassembled_contigs'] = 0
item['misassembly_events'] = 0

# add snp stats
with open(snp_per_ref) as snp_fh:
snp_stats = json.load(snp_fh)
for sample_id in main_data_tables_js.keys():
for reference in main_data_tables_js[sample_id]["ReferenceTables"].keys():
for item_row in main_data_tables_js[sample_id]["ReferenceTables"][reference]:
for item in item_row:
assembler = item['assembler']
try:
references = list(
snp_stats[sample_id][assembler][0].keys())
if reference in references:
index = references.index(reference)
ref_name = references[index]
try:
item['snps'] = snp_stats[sample_id][assembler][0][ref_name]["snps"]
except KeyError:
item['snps'] = 0
else:
item['snps'] = 0
except KeyError:
item['snps'] = 0

for sample_id in main_data_tables_js.keys():
main_data_plots_js[sample_id] = {}
Expand Down Expand Up @@ -628,5 +653,5 @@ def main(main_js, pipeline_stats, assembly_stats_report, contig_size_plots, mapp
main(MAIN_JS, PIPELINE_STATS, ASSEMBLY_STATS_REPORT, CONTIG_SIZE_DISTRIBUTION, MAPPING_STATS_REPORT,
COMPLETNESS_JSON, LMAS_LOGO, REFERENCE_FILE, LX_JSON, SHRIMP_JSON, GAP_REFERENCE_JSON, GAP_HISTOGRAM,
MISASSEMBLY_PLOT, MISASSEMBLY_REPORT, MIN_CONTIG_SIZE, NAX_JSON, NGX_JSON, READS_NUMBER, SNP_REFERENCE_JSON,
VERSIONS_JSON, MISASSEMBLY_PER_REF, ABOUT_MD, CONTAINERS, PLOT_MISASSEMBLY_PER_REFERENCE)
VERSIONS_JSON, MISASSEMBLY_PER_REF, ABOUT_MD, CONTAINERS, PLOT_MISASSEMBLY_PER_REFERENCE, SNP_REPORT_REFERENCE)

36 changes: 32 additions & 4 deletions templates/plot_snp.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,37 @@

if __file__.endswith(".command.sh"):
DATAFRAME_LIST = '$snp_coords_dataframes'.split()
JSON_LIST = '$snps_jsons'.split()
logger.debug("Running {} with parameters:".format(
os.path.basename(__file__)))
logger.debug("DATAFRAME_LIST: {}".format(DATAFRAME_LIST))


def main(dataframes):
logger.debug("JSON_LIST: {}".format(JSON_LIST))

def snps_per_ref(report_per_reference):
"""
"""
master_report_data_per_reference = {}
for report_reference in report_per_reference:
with open(report_reference) as json_fh:
data_json = json.load(json_fh)
print(data_json)
if data_json["sample"] not in master_report_data_per_reference.keys():
master_report_data_per_reference[data_json["sample"]] = {
data_json["assembler"]: [data_json["reference"]]}
elif data_json["assembler"] not in master_report_data_per_reference[data_json["sample"]].keys():
master_report_data_per_reference[data_json["sample"]][data_json["assembler"]] = [
data_json["reference"]]
else:
master_report_data_per_reference[data_json["sample"]][data_json["assembler"]].append(
data_json["reference"])

logger.debug("Global report data: {}". format(master_report_data_per_reference))

with open("snps_report_per_ref.json", "w") as json_report:
json_report.write(json.dumps(
master_report_data_per_reference, separators=(",", ":")))

def main(dataframes, jsons):

li = []
for filename in dataframes:
Expand Down Expand Up @@ -154,7 +179,10 @@ def main(dataframes):

with open('snps_in_reference.json', 'w') as json_report:
json_report.write(json.dumps(report_dict, separators=(",", ":")))

# SNPS STATS PER REF
snps_per_ref(jsons)


if __name__ == '__main__':
main(DATAFRAME_LIST)
main(DATAFRAME_LIST, JSON_LIST)
15 changes: 14 additions & 1 deletion templates/snp_assessment.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from itertools import groupby
from numpy.lib.function_base import append
import pandas as pd
import json
try:
import utils
except ImportError:
Expand Down Expand Up @@ -123,6 +124,9 @@ def main(sample_id, assembler, assembly, mapping, reference):

df = pd.DataFrame(columns=COLUMNS)

reference_report = {"sample": sample_id,
"assembler": assembler, "reference": {}}

# iterator for reference files (sequence length is needed)
references = (x[1] for x in groupby(open(reference, "r"), lambda line: line[0] == ">"))

Expand All @@ -139,7 +143,16 @@ def main(sample_id, assembler, assembly, mapping, reference):
#print(coord, substitution)
df = df.append({'Sample': sample_id, 'Assembler': assembler, 'Reference': reference_name,
'Reference Length': len(seq)/3, 'SNP Location': coord, 'Substitution Type': substitution}, ignore_index=True)


if reference_name not in reference_report['reference'].keys():
reference_report['reference'][reference_name] = {"snps": 1}
else:
reference_report['reference'][reference_name]["snps"] += 1

# Write files for report
with open("{}_{}_snps.json".format(sample_id, assembler), "w") as json_report:
json_report.write(json.dumps(reference_report, separators=(",", ":")))

df.to_csv(sample_id + '_' + assembler + '_snps.csv')


Expand Down