
Commit

Add working run command
farchaab authored Aug 2, 2024
2 parents 0ed9c4b + 1e7f6be commit a0dc44d
Showing 48 changed files with 2,249 additions and 932 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,2 +1,3 @@
zamp.egg-info/
zamp/__pycache__
zamp/__pycache__
zamp/ressources/
1 change: 1 addition & 0 deletions setup.py
@@ -58,6 +58,7 @@ def get_data_files():
"Click>=8.1.3",
"attrmap>=0.0.7",
"snaketool-utils>=0.0.5",
"metasnek>=0.0.8",
"biopython>=1.83",
],
entry_points={"console_scripts": ["zamp=zamp.__main__:main"]},
2 changes: 1 addition & 1 deletion zamp/Insilico_quality_control.Snakefile
@@ -25,5 +25,5 @@ rule check_amplicon_quality:
input:
expand(
"QualityControl/{denoiser}/compare_quality_table.tsv",
denoiser=config["denoiser"],
denoiser=DENOISER,
),
12 changes: 5 additions & 7 deletions zamp/Insilico_taxa_assign.Snakefile
@@ -5,9 +5,7 @@ import os
## When using singularity
if "--use-singularity" in sys.argv:
### Bind the directory of the database to the singularity containers.
workflow.deployment_settings.apptainer_args += (
f' -B {config["tax_DB_path"]}:{config["tax_DB_path"]}'
)
workflow.deployment_settings.apptainer_args += f" -B {os.path.abspath(config.args.database)}:{os.path.abspath(config.args.database)}"
#### Load a dictionary of singularity containers that will be called from each rule

singularity_envs = yaml.safe_load(
@@ -35,12 +33,12 @@ rule insilico_validation:
input:
expand(
"InSilico/3_classified/{classifier}_{tax_DB}/dna-sequences_tax_assignments.txt",
classifier=config["classifier"],
tax_DB=config["tax_DB_name"],
classifier=CLASSIFIER,
tax_DB=DBNAME,
),
"InSilico/2_denoised/count_table.tsv",
expand(
"InSilico/3_classified/{classifier}_{tax_DB}/InSilico_compare_tax.tsv",
classifier=config["classifier"],
tax_DB=config["tax_DB_name"],
classifier=CLASSIFIER,
tax_DB=DBNAME,
),
146 changes: 127 additions & 19 deletions zamp/Snakefile
@@ -2,16 +2,126 @@ import re
import yaml
import os
from snakemake.utils import min_version
from attrmap import AttrMap
import attrmap.utils as au
import pandas as pd
import glob
from metasnek import fastq_finder
import warnings

min_version("8.10.6")

warnings.filterwarnings(
"ignore",
category=UserWarning,
)


# Concatenate Snakemake's own log file with the master log file
def copy_log_file():
files = glob.glob(os.path.join(".snakemake", "log", "*.snakemake.log"))
if not files:
return None
current_log = max(files, key=os.path.getmtime)
shell("cat " + current_log + " >> " + LOG)


onsuccess:
copy_log_file()


onerror:
copy_log_file()


# config file
configfile: os.path.join(workflow.basedir, "config", "config.yaml")


config = AttrMap(config)
config = au.convert_state(config, read_only=True)
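
AttrMap exposes the nested config keys as attributes and convert_state freezes them read-only, which is why the rest of this Snakefile reads settings as config.args.<name>. A tiny illustration with made-up values (the real ones come from config/config.yaml and the CLI):

from attrmap import AttrMap
import attrmap.utils as au

cfg = AttrMap({"args": {"output": "zamp_out", "denoiser": "DADA2"}})  # illustrative values only
cfg = au.convert_state(cfg, read_only=True)
print(cfg.args.output)  # zamp_out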


include: os.path.join("rules", "0_preprocessing", "directories.smk")
include: os.path.join("rules", "0_preprocessing", "functions.smk")


OUTPUT = config.args.output
LOG = os.path.join(OUTPUT, "zamp.log")


# Read run args
## Inputs args
INPUT = config.args.input
METADATA = config.args.metadata
if os.path.isdir(INPUT) and METADATA:

reads_df = (
pd.DataFrame.from_dict(fastq_finder.parse_samples_to_dictionary(INPUT))
.T.reset_index()
.rename(columns={"index": "fastq"})
)
meta_df = pd.read_csv(METADATA, sep="\t")
SAMPLES = reads_df.merge(meta_df, on="fastq")
SAMPLES.set_index("sample", inplace=True)
elif os.path.isfile(INPUT) and not METADATA:
SAMPLES = pd.read_csv(INPUT, sep="\t", index_col="sample")


# Add read pairing
SAMPLES["paired"] = [True if R2 else False for R2 in SAMPLES.R2]
PAIRING = {
sample: ("paired" if SAMPLES.loc[sample, "paired"] else "single")
for sample in SAMPLES.index
}
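
For illustration, a hypothetical two-sample table and the pairing map the block above would derive from it (sample names, paths and run labels are made up; the columns match those this Snakefile expects):

import pandas as pd

SAMPLES = pd.DataFrame(
    {
        "sample": ["s1", "s2"],
        "R1": ["fastq/s1_R1.fastq.gz", "fastq/s2.fastq.gz"],
        "R2": ["fastq/s1_R2.fastq.gz", None],
        "run": ["run1", "run1"],
    }
).set_index("sample")

SAMPLES["paired"] = [True if R2 else False for R2 in SAMPLES.R2]
PAIRING = {
    sample: ("paired" if SAMPLES.loc[sample, "paired"] else "single")
    for sample in SAMPLES.index
}
# PAIRING == {"s1": "paired", "s2": "single"}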
## Denoiser
DENOISER = config.args.denoiser
## Primers args
TRIM = config.args.trim
AMPLICON = config.args.amplicon
OVERLAP = config.args.min_overlap
FW_PRIMER = config.args.fw_primer
RV_PRIMER = config.args.rv_primer

## Merged sequences args
MINLEN = config.args.minlen
MAXLEN = config.args.maxlen

## Denoising args
FW_TRIM = config.args.fw_trim
RV_TRIM = config.args.rv_trim
FW_ERRORS = config.args.fw_errors
RV_ERRORS = config.args.rv_errors

## Database args
DBPATH = os.path.dirname(os.path.abspath(config.args.database))
DBNAME = os.listdir(DBPATH)
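
DBPATH is therefore the parent directory of the database path given on the command line, and DBNAME its directory listing. With a hypothetical path:

import os

database = "/refs/silva138"  # hypothetical value of config.args.database
DBPATH = os.path.dirname(os.path.abspath(database))  # "/refs"
DBNAME = os.listdir(DBPATH)  # e.g. ["silva138"] if it is the only entry there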

## Classifier
CLASSIFIER = config.args.classifier


## Post-processing args
RAREFACTION = config.args.rarefaction.split(",")
MIN_PREV = config.args.min_prev
MIN_COUNT = config.args.min_count
NORM = config.args.normalization.split(",")
REPL_EMPTY = config.args.replace_empty
KEEP_RANK = config.args.keep_rank
KEEP_TAXA = config.args.keep_taxa.split(",")
EXCL_RANK = config.args.exclude_rank.split(",")
EXCL_TAXA = config.args.exclude_taxa.split(",")

## Special out args
MELTED = config.args.melted
PHYSEQ_RANK = config.args.physeq_rank.split(",")
TRANSPOSED = config.args.transposed
QIIME_VIZ = config.args.qiime_viz

## When using singularity
if "--use-singularity" in sys.argv:
### Bind the directory of the database to the singularity containers.
workflow.deployment_settings.apptainer_args += (
f' -B {config["tax_DB_path"]}:{config["tax_DB_path"]}'
)
### Mount Database path to singularity containers.
workflow.deployment_settings.apptainer_args += f" -B {os.path.abspath(config.args.database)}:{os.path.abspath(config.args.database)}"
#### Load a dictionary of singularity containers that will be called from each rule
singularity_envs = yaml.safe_load(
open(os.path.join(workflow.basedir, "envs/singularity/sing_envs.yml"), "r")
@@ -20,22 +130,20 @@ singularity_envs = yaml.safe_load(

## Include the pipeline rules
### Sets of script to handle logs, input files and list of output:
include: "rules/0_preprocessing/scripts/logging.py"
include: "rules/0_preprocessing/scripts/make_input_lists.py"
include: "rules/0_preprocessing/scripts/make_output_lists.py"
include: os.path.join("rules", "0_preprocessing", "scripts", "make_output_lists.py")
### Snakemake rules to do the job:
include: "rules/0_preprocessing/get_inputs.rules"
include: "rules/0_preprocessing/QC_raw_reads.rules"
include: "rules/1_2_DADA2_ASVs/1_cutadapt_trim.rules"
include: "rules/1_2_DADA2_ASVs/2_DADA2_denoising.rules"
include: "rules/1_2_vsearch_OTUs/1_PANDAseq_trim_filter_merge.rules"
include: "rules/1_2_vsearch_OTUs/2_vsearch_denoising.rules"
include: "rules/3_tax_assignment/tax_assign.rules"
include: "rules/4_post_processing/physeq_processing.rules"
include: "rules/4_post_processing/TreeShrink.rules"
include: "rules/5_visualization/General_plotting.rules"
include: "rules/5_visualization/QIIME2_import.rules"
include: "rules/PICRUSt2/picrust.rules"
include: os.path.join("rules", "0_preprocessing", "get_inputs.rules")
include: os.path.join("rules", "0_preprocessing", "QC_raw_reads.rules")
include: os.path.join("rules", "1_2_DADA2_ASVs", "1_cutadapt_trim.rules")
include: os.path.join("rules", "1_2_DADA2_ASVs", "2_DADA2_denoising.rules")
include: os.path.join("rules", "1_2_vsearch_OTUs", "1_PANDAseq_trim_filter_merge.rules")
include: os.path.join("rules", "1_2_vsearch_OTUs", "2_vsearch_denoising.rules")
include: os.path.join("rules", "3_tax_assignment", "tax_assign.rules")
include: os.path.join("rules", "4_post_processing", "physeq_processing.rules")
include: os.path.join("rules", "4_post_processing", "TreeShrink.rules")
include: os.path.join("rules", "5_visualization", "General_plotting.rules")
include: os.path.join("rules", "5_visualization", "QIIME2_import.rules")
include: os.path.join("rules", "PICRUSt2", "picrust.rules")


report: "workflow.rst"
2 changes: 2 additions & 0 deletions zamp/__main__.py
@@ -10,6 +10,7 @@
print_citation,
common_options,
db_options,
run_options,
run_snakemake,
)

@@ -52,6 +53,7 @@ def db(**kwargs):
help_option_names=["-h", "--help"], ignore_unknown_options=True
),
)
@run_options
@common_options
def run(**kwargs):
"""
103 changes: 70 additions & 33 deletions zamp/rules/0_preprocessing/QC_raw_reads.rules
@@ -1,57 +1,82 @@
## Generate MultiQC reports


rule assess_quality_raw_reads_with_fastqc:
rule fastQC_raw_reads:
conda:
"../../envs/FastQC.yml"
os.path.join(dir.envs, "FastQC.yml")
container:
singularity_envs["fastqc"]
input:
"raw_reads/{sample}_{suffix}.fastq.gz",
os.path.join(dir.out.fastq, "{sample}_{suffix}.fastq.gz"),
output:
temp("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip"),
temp(
os.path.join(
dir.out.qc,
"FastQC",
"raw_reads",
"{run}",
"{sample}_{suffix}_fastqc.zip",
)
),
log:
logging_folder + "QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.txt",
os.path.join(
dir.logs,
"QC",
"FastQC",
"raw_reads",
"{run}",
"{sample}_{suffix}_fastqc.txt",
),
threads: 1
shell:
"""
fastqc -t {threads} {input} -o $(dirname {output[0]}) --nogroup 2> {log[0]}
"""


def sample_list_run_QC(RUN):
def sample_list_run_QC(run):
file_list = []
samples = list(all_samples.index.values[all_samples[config["run_column"]] == RUN])
for s in samples:
suffix_s = reads_ext[s]
samples = list(SAMPLES[SAMPLES["run"] == run].index)
for sample in samples:
if SAMPLES.loc[sample, "paired"]:
suffix = ["R1", "R2"]
else:
suffix = ["single"]
combined_values = expand(
"QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip",
RUN=RUN,
sample=s,
suffix=suffix_s,
os.path.join(
"{qc}", "FastQC", "raw_reads", "{run}", "{sample}_{suffix}_fastqc.zip"
),
qc=dir.out.qc,
run=run,
sample=sample,
suffix=suffix,
)
file_list = file_list + combined_values

return file_list
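
With the hypothetical two-sample table sketched earlier and dir.out.qc assumed to be "results/QC", sample_list_run_QC("run1") would expand to targets of this shape:

results/QC/FastQC/raw_reads/run1/s1_R1_fastqc.zip
results/QC/FastQC/raw_reads/run1/s1_R2_fastqc.zip
results/QC/FastQC/raw_reads/run1/s2_single_fastqc.zip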


rule create_per_run_multiqc_report:
rule multiQC_per_run:
conda:
"../../envs/MultiQC.yml"
os.path.join(dir.envs, "MultiQC.yml")
container:
singularity_envs["multiqc"]
input:
lambda wildcards: sample_list_run_QC(wildcards.RUN),
lambda wildcards: sample_list_run_QC(wildcards.run),
output:
report(
"QC/{RUN}_multiqc_raw_reads_report.html",
os.path.join(dir.out.qc, "{run}_multiqc_raw_reads_report.html"),
caption="report/MultiQC.rst",
category="MultiQC",
subcategory="{RUN}",
subcategory="{run}",
),
os.path.join(
dir.out.qc,
"{run}_multiqc_raw_reads_report_data",
"multiqc_general_stats.txt",
),
"QC/{RUN}_multiqc_raw_reads_report_data/multiqc_general_stats.txt",
log:
logging_folder + "QC/{RUN}_multiqc_raw_reads_report.txt",
os.path.join(dir.logs, "QC", "{run}_multiqc_raw_reads_report.txt"),
priority: 100
shell:
"""
@@ -62,38 +87,50 @@ def sample_list_overall_QC():
def sample_list_overall_QC():
file_list = []

for i in set(all_samples[config["run_column"]]):
samples = list(all_samples.index.values[all_samples[config["run_column"]] == i])
for s in samples:
suffix_s = reads_ext[s]
for run in set(SAMPLES["run"]):
samples = list(SAMPLES[SAMPLES["run"] == run].index)
for sample in samples:
if SAMPLES.loc[sample, "paired"]:
suffix = ["R1", "R2"]
else:
suffix = ["single"]
combined_values = expand(
"QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip",
RUN=i,
sample=s,
suffix=suffix_s,
os.path.join(
"{qc}",
"FastQC",
"raw_reads",
"{run}",
"{sample}_{suffix}_fastqc.zip",
),
qc=dir.out.qc,
run=run,
sample=sample,
suffix=suffix,
)
file_list = file_list + combined_values

return file_list


rule create_overall_raw_reads_multiqc_report:
rule multiQC_all:
conda:
"../../envs/MultiQC.yml"
os.path.join(dir.envs, "MultiQC.yml")
container:
singularity_envs["multiqc"]
input:
sample_list_overall_QC(),
output:
report(
"QC/multiqc_raw_reads_report.html",
os.path.join(dir.out.qc, "multiqc_raw_reads_report.html"),
caption="report/MultiQC.rst",
category="MultiQC",
subcategory="All run",
),
"QC/multiqc_raw_reads_report_data/multiqc_general_stats.txt",
os.path.join(
dir.out.qc, "multiqc_raw_reads_report_data/multiqc_general_stats.txt"
),
log:
logging_folder + "QC/multiqc_raw_reads_report.txt",
os.path.join(dir.logs, "QC", "multiqc_raw_reads_report.txt"),
priority: 100
shell:
"""