chore: Update design file reference in config files

Adding check of reference design file
kircherlab · Jul 19, 2024 · 60806e9 · 60806e9
1 parent 6dcad7d
commit 60806e9
Show file tree

Hide file tree

Showing 15 changed files with 229 additions and 59 deletions.
diff --git a/config/example_assignment_bwa.yaml b/config/example_assignment_bwa.yaml
@@ -8,11 +8,11 @@ assignments:
     alignment_tool:
       tool: bwa # bwa or exact
       configs:
-        min_mapping_quality: 1 # integer >=0 Please use 1 when you have oligos that differ by 1 base in your reference/design file
+        min_mapping_quality: 1 # integer >=0 Please use 1 when you have oligos that differ by 1 base in your reference/design_file
         sequence_length: # sequence length of design excluding adapters.
           min: 166
           max: 175
-        alignment_start: # start of an alignment in the reference. Here using 15 bp adapters. Can be different when using adapter free approaches
+        alignment_start: # start of an alignment in the reference/design_file. Here using 15 bp adapters. Can be different when using adapter free approaches
           min: 1 # integer
           max: 3 # integer
     FW:
@@ -21,6 +21,6 @@ assignments:
       - resources/Assignment_BasiC/R2.fastq.gz
     REV:
       - resources/Assignment_BasiC/R3.fastq.gz
-    reference: resources/design.fa
+    design_file: resources/design.fa
     configs:
       default: {} # name of an example filtering config
diff --git a/config/example_assignment_exact_lazy.yaml b/config/example_assignment_exact_lazy.yaml
@@ -9,14 +9,14 @@ assignments:
       tool: exact # bwa or exact
       configs:
         sequence_length: 170 # sequence length of design excluding adapters.
-        alignment_start: 1 # start of the alignment in the reference
+        alignment_start: 1 # start of the alignment in the reference/design_file
     FW:
       - resources/Assignment_BasiC/R1.fastq.gz
     BC:
       - resources/Assignment_BasiC/R2.fastq.gz
     REV:
       - resources/Assignment_BasiC/R3.fastq.gz
-    reference: resources/design.fa
+    design_file: resources/design.fa
     configs:
       lazy: # name of an example filtering config
         min_support: 2 # default 3

diff --git a/config/example_assignment_exact_linker.yaml b/config/example_assignment_exact_linker.yaml
@@ -11,11 +11,11 @@ assignments:
       tool: exact # bwa or exact
       configs:
         sequence_length: 170 # sequence length of design excluding adapters.
-        alignment_start: 1 # start of the alignment in the reference
+        alignment_start: 1 # start of the alignment in the reference/design_file
     FW:
       - resources/Assignment_BasiC/R1.fastq.gz
     REV:
       - resources/Assignment_BasiC/R3.fastq.gz
-    reference: resources/design.fa
+    design_file: resources/design.fa
     configs:
       default: {} # name of an example filtering config
diff --git a/config/example_config.yaml b/config/example_config.yaml
@@ -9,12 +9,12 @@ assignments:
       tool: exact # bwa or exact
       configs:
         sequence_length: 170 # sequence length of design excluding adapters.
-        alignment_start: 1 # start of the alignment in the reference
-        # min_mapping_quality: 1  # integer >=0 Please use 1 when you have oligos that differ by 1 base in your reference/design file
+        alignment_start: 1 # start of the alignment in the reference/design_file
+        # min_mapping_quality: 1  # integer >=0 Please use 1 when you have oligos that differ by 1 base in your reference/design_file
         # sequence_length:  # sequence length of design excluding adapters.
         #   min: 195
         #   max: 205
-        # alignment_start:  # start of an alignment in the reference. Here using 15 bp adapters. Can be different when using adapter free approaches
+        # alignment_start:  # start of an alignment in the reference/design_file. Here using 15 bp adapters. Can be different when using adapter free approaches
         #   min: 15  # integer
         #   max: 17  # integer
     FW:
@@ -23,7 +23,7 @@ assignments:
       - resources/Assignment_BasiC/R2.fastq.gz
     REV:
       - resources/Assignment_BasiC/R3.fastq.gz
-    reference: resources/design.fa
+    design_file: resources/design.fa
     configs:
       default: {} # name of an example filtering config
 experiments:

diff --git a/docs/config.rst b/docs/config.rst
@@ -52,13 +52,13 @@ Each assignment you want to process you have to giv him a name like :code:`examp
         Configurations of the alignment tool selected.
 
         :sequence_length (bwa):
-            Defines the :code:`min` and :code:`max` of a :code:`sequence_length` specify . :code:`sequence_length` is basically the length of a sequence alignment to an oligo in the reference file. Because there can be insertion and deletions we recommend to vary it a bit around the exact length (e.g. +-5). In theory this option enables designs with multiple sequence lengths.
+            Defines the :code:`min` and :code:`max` of a :code:`sequence_length` specify . :code:`sequence_length` is basically the length of a sequence alignment to an oligo in the design file. Because there can be insertion and deletions we recommend to vary it a bit around the exact length (e.g. +-5). In theory this option enables designs with multiple sequence lengths.
         :alignment_start (bwa):
             Defines the :code:`min` and :code:`max` of the start of the alignment in an oligo. When using adapters you have to set basically the length of the adapter. Otherwise 1 will be the choice for most cases. We also recommend to vary this value a bit because the start might not be exact after the adapter. E.g. by +-1.
         :min_mapping_quality (bwa):
             (Optinal) Defines the minimum mapping quality (MAPQ) of the alinment to an oligo. When using oligos with only 1bp difference it is recommended to set it to 0. Otherwise the default value of 1 is recommended. 
         :sequence_length (exact):
-            Defines the :code:`sequence_length` which is the length of a sequence alignment to an oligo in the reference file. Only one length design is supported.
+            Defines the :code:`sequence_length` which is the length of a sequence alignment to an oligo in the design file. Only one length design is supported.
         :alignment_start (exact):
             Defines the start of the alignment in an oligo. When using adapters you have to set basically the length of the adapter. Otherwise 1 will be the choice for most cases.
 
@@ -86,15 +86,15 @@ Each assignment you want to process you have to giv him a name like :code:`examp
     :min_dovetailed_overlap:
         (Optional) Minimum dovetailed overlap. Default is set to 10.
 
-:reference:
+:design_file:
     Design file (full or relative path) in fasta format. The design file should contain the oligos in fasta format. The header should contain the oligo name and should be unique. The sequence should be the sequence of the oligo and must also be unique. When having multiple oligo names with the same sequence please merge them into one fasta entry. The oligo name later used to link barcode to oligo. The sequence is used to map the reads to the oligos. Adapters can be in the seuqence and therefore :code:`alignment_start` has to be adjusted.
 :configs:
     After mapping the reads to the design file and extracting the barcodes per oligo the configuration (using different names) can be used to generate multiple filtering and configuration settings of the final maq oligo to barcode. Each configuration is a dictionary with the following keys:
 
     :min_support:
-        Minimum number of same BC that map to teh same oligo. Larger value gives more evidence to be correct. But can remove lot's of BCs (depedning on the complexity, sequencing depth and quality of sequencing). Recommended option is :code:`3`.
+        Minimum number of same BC that map to the same oligo. Larger value gives more evidence to be correct. But can remove lot's of BCs (depedning on the complexity, sequencing depth and quality of sequencing). Recommended option is :code:`3`.
     :fraction:
-        Minumum fraction of same BC that map to teh same oligo. E.g. :code:`0.7` means that at least 70% of the BC map to the same oligo. Larger value gives more evidence to be correct. But can remove lot's of BCs (depedning on the complexity, sequencing depth and quality of sequencing). Recommended option is :code:`0.7`.
+        Minumum fraction of same BC that map to the same oligo. E.g. :code:`0.7` means that at least 70% of the BC map to the same oligo. Larger value gives more evidence to be correct. But can remove lot's of BCs (depedning on the complexity, sequencing depth and quality of sequencing). Recommended option is :code:`0.7`.
     :unknown_other:
             (Optional) Shows not mapped BCs in the final output map. Not recommended to use as mapping file fore the experiment workflow. But can be usefull for debugging. Default is :code:`false`. 
     :ambigous:
@@ -123,8 +123,6 @@ The experiment workflow is configured in the :code:`experiments` section. Each e
     Path to the experiment file. The full or relative path to the file should be used. The experiment file is a comma separated file and is decribed in the `Experiment file`_ section.
 :demultiplex:
     (Optional) If set to :code:`true` the reads are demultiplexed. This means that the reads are split into different files for each barcode. This is usefull for further analysis. Default is :code:`false`.
-:design_file:
-    Design file (full or relative path) in fasta format. The design file should contain the oligos in fasta format. The header should contain the oligo name and should be unique. The sequence should be the sequence of the oligo and must also be unique. When having multiple oligo names with the same sequence please merge them into one fasta entry. Should be the same as :code:`reference` in the `Assignment workflow`_.
 :label_file:
     (Optional) Path to the label file. The full or relative path to the file should be used. The label file is a tab separated file and contais the oligo name and the label of it. The oligo name should be the same as in the design file. The label is used to group the oligos in the final output, e.g. for plotting. 
 

diff --git a/profiles/default/config.yaml b/profiles/default/config.yaml
@@ -15,6 +15,9 @@ set-threads:
   assignment_collect: 30
   assignment_collectBCs: 20
 set-resources:
+  assignment_check_design:
+    runtime: 240
+    slurm_partition: medium
   assignment_hybridFWRead_get_reads_by_length:
     runtime: 1140
     mem: 2G

diff --git a/resources/assoc_basic/config.yml b/resources/assoc_basic/config.yml
@@ -20,6 +20,6 @@ assignments:
       - data/SRR10800986_2.fastq.gz
     REV:
       - data/SRR10800986_3.fastq.gz
-    reference: data/design.fa
+    design_file: data/design.fa
     configs:
       default: {}
diff --git a/resources/combined_basic/config.yml b/resources/combined_basic/config.yml
@@ -20,7 +20,7 @@ assignments:
       - data/SRR10800986_2.fastq.gz
     REV:
       - data/SRR10800986_3.fastq.gz
-    reference: data/design.fa
+    design_file: data/design.fa
     configs:
       default: {}
 experiments:

diff --git a/workflow/envs/python3.yaml b/workflow/envs/python3.yaml
@@ -13,3 +13,4 @@ dependencies:
   - polars
   - python
   - pysam
+  - pyfastx
diff --git a/workflow/rules/assignment.smk b/workflow/rules/assignment.smk
@@ -11,13 +11,62 @@ include: "assignment/hybridFWRead.smk"
 include: "assignment/statistic.smk"
 
 
+rule assignment_check_design:
+    """
+    Check if the design file is correct and no duplicated sequences are present (FW and reverse).
+    Also check if no duplicated headers and no illegal characters in header.
+    """
+    conda:
+        "../envs/python3.yaml"
+    input:
+        design=lambda wc: config["assignments"][wc.assignment]["design_file"],
+        script=getScript("assignment/check_design_file.py"),
+    output:
+        temp("results/assignment/{assignment}/reference/reference.fa.fxi"),
+        touch("results/assignment/{assignment}/design_check.done"),
+        ref="results/assignment/{assignment}/reference/reference.fa",
+    params:
+        start=lambda wc: (
+            config["assignments"][wc.assignment]["alignment_tool"]["configs"][
+                "alignment_start"
+            ]
+            if config["assignments"][wc.assignment]["alignment_tool"]["tool"]
+            == "exact"
+            else config["assignments"][wc.assignment]["alignment_tool"]["configs"][
+                "alignment_start"
+            ]["max"]
+        ),
+        length=lambda wc: (
+            config["assignments"][wc.assignment]["alignment_tool"]["configs"][
+                "sequence_length"
+            ]
+            if config["assignments"][wc.assignment]["alignment_tool"]["tool"]
+            == "exact"
+            else config["assignments"][wc.assignment]["alignment_tool"]["configs"][
+                "sequence_length"
+            ]["min"]
+        ),
+    log:
+        log=temp("results/logs/assignment/check_design.{assignment}.log"),
+        err="results/assignment/{assignment}/design_check.err",
+    shell:
+        """
+        trap "cat {log.err}" ERR
+        cp {input.design} {output.ref}
+        python {input.script} --input {input.design} --start {params.start} --length {params.length} > {log.log} 2> {log.err};
+        """
+
+
 rule assignment_fastq_split:
     """
     Split the fastq files into n files for parallelisation. 
     n is given by split_read in the configuration file.
+
+    Runs only if the design file is correct.
     """
     input:
-        lambda wc: getAssignmentRead(wc.assignment, wc.read),
+        fastq=lambda wc: getAssignmentRead(wc.assignment, wc.read),
+        check="results/assignment/{assignment}/design_check.done",
     output:
         temp(
             expand(
@@ -43,7 +92,7 @@ rule assignment_fastq_split:
         ),
     shell:
         """
-        fastqsplitter -i <(zcat {input}) -t 1 {params.files} &> {log}
+        fastqsplitter -i <(zcat {input.fastq}) -t 1 {params.files} &> {log}
         """
 
 
@@ -63,9 +112,11 @@ rule assignment_attach_idx:
             "results/assignment/{assignment}/fastq/splits/{read}.split{split}.BCattached.fastq.gz"
         ),
     params:
-        BC_rev_comp=lambda wc: "--reverse-complement"
-        if config["assignments"][wc.assignment]["BC_rev_comp"]
-        else "",
+        BC_rev_comp=lambda wc: (
+            "--reverse-complement"
+            if config["assignments"][wc.assignment]["BC_rev_comp"]
+            else ""
+        ),
     log:
         temp("results/logs/assignment/attach_idx.{assignment}.{split}.{read}.log"),
     shell:
@@ -125,14 +176,17 @@ rule assignment_collectBCs:
     Get the barcodes.
     """
     input:
-        lambda wc: expand(
-            "results/assignment/{{assignment}}/BCs/barcodes_exact.{split}.tsv",
+        lambda wc: (
+            expand(
+                "results/assignment/{{assignment}}/BCs/barcodes_exact.{split}.tsv",
+                    split=range(0, getSplitNumber()),
+                )
+                if config["assignments"][wc.assignment]["alignment_tool"]["tool"]
+            == "exact"
+            else expand(
+                "results/assignment/{{assignment}}/BCs/barcodes_incl_other.{split}.tsv",
                 split=range(0, getSplitNumber()),
             )
-            if config["assignments"][wc.assignment]["alignment_tool"]["tool"] == "exact"
-        else expand(
-            "results/assignment/{{assignment}}/BCs/barcodes_incl_other.{split}.tsv",
-            split=range(0, getSplitNumber()),
         ),
     output:
         "results/assignment/{assignment}/barcodes_incl_other.sorted.tsv.gz",
@@ -170,16 +224,20 @@ rule assignment_filter:
         fraction=lambda wc: config["assignments"][wc.assignment]["configs"][
             wc.assignment_config
         ]["fraction"],
-        unknown_other=lambda wc: "-o"
-        if config["assignments"][wc.assignment]["configs"][wc.assignment_config][
-            "unknown_other"
-        ]
-        else "",
-        ambiguous=lambda wc: "-a"
-        if config["assignments"][wc.assignment]["configs"][wc.assignment_config][
-            "ambiguous"
-        ]
-        else "",
+        unknown_other=lambda wc: (
+            "-o"
+            if config["assignments"][wc.assignment]["configs"][wc.assignment_config][
+                "unknown_other"
+            ]
+            else ""
+        ),
+        ambiguous=lambda wc: (
+            "-a"
+            if config["assignments"][wc.assignment]["configs"][wc.assignment_config][
+                "ambiguous"
+            ]
+            else ""
+        ),
         bc_length=lambda wc: config["assignments"][wc.assignment]["bc_length"],
     shell:
         """

diff --git a/workflow/rules/assignment/hybridFWRead.smk b/workflow/rules/assignment/hybridFWRead.smk
@@ -11,7 +11,8 @@ rule assignment_hybridFWRead_get_reads_by_length:
     conda:
         "../../envs/default.yaml"
     input:
-        lambda wc: config["assignments"][wc.assignment]["FW"],
+        fastq=lambda wc: config["assignments"][wc.assignment]["FW"],
+        check="results/assignment/{assignment}/design_check.done",
     output:
         FW_tmp=temp("results/assignment/{assignment}/fastq/FW.byLength.fastq"),
         BC_tmp=temp("results/assignment/{assignment}/fastq/BC.byLength.fastq"),
@@ -26,7 +27,7 @@ rule assignment_hybridFWRead_get_reads_by_length:
         + 1,
     shell:
         """
-        zcat {input} | \
+        zcat {input.fastq} | \
         awk '{{if (NR%4==2 || NR%4==0){{
                 print substr($0,1,20) > "{output.BC_tmp}"; print substr($0,{params.insert_start}) > "{output.FW_tmp}"
             }} else {{

diff --git a/workflow/rules/assignment/mapping_bwa.smk b/workflow/rules/assignment/mapping_bwa.smk
@@ -5,9 +5,9 @@ rule assignment_bwa_ref:
     conda:
         "../../envs/bwa_samtools_picard_htslib.yaml"
     input:
-        lambda wc: config["assignments"][wc.assignment]["reference"],
-    output:
         ref="results/assignment/{assignment}/reference/reference.fa",
+        check="results/assignment/{assignment}/design_check.done",
+    output:
         bwa=expand(
             "results/assignment/{{assignment}}/reference/reference.fa.{ext}",
             ext=["fai"] + assignment_bwa_dicts,
@@ -17,10 +17,9 @@ rule assignment_bwa_ref:
         temp("results/logs/assignment/bwa_ref.{assignment}.log"),
     shell:
         """
-        cat {input} | awk '{{gsub(/[\\]\\[]/,"_")}}$0' > {output.ref};
-        bwa index -a bwtsw {output.ref} &> {log};
-        samtools faidx {output.ref} &>> {log};
-        picard CreateSequenceDictionary REFERENCE={output.ref} OUTPUT={output.d} &>> {log}
+        bwa index -a bwtsw {input.ref} &> {log};
+        samtools faidx {input.ref} &>> {log};
+        picard CreateSequenceDictionary REFERENCE={input.ref} OUTPUT={output.d} &>> {log}
         """
 
 

diff --git a/workflow/rules/assignment/mapping_exact.smk b/workflow/rules/assignment/mapping_exact.smk
@@ -5,19 +5,20 @@ rule assignment_mapping_exact_reference:
     conda:
         "../../envs/default.yaml"
     input:
-        lambda wc: config["assignments"][wc.assignment]["reference"],
+        check="results/assignment/{assignment}/design_check.done",
+        ref="results/assignment/{assignment}/reference/reference.fa",
     output:
         "results/assignment/{assignment}/reference/reference_exact.fa",
     log:
         temp("results/logs/assignment/mapping_exact_reference.{assignment}.log"),
     shell:
         """
         paste <(
-            cat {input} | awk '{{if ($1 ~ /^>/) {{ gsub(/[\\]\\[]/,"_"); print substr($1,2)}}}}';
-            cat {input} | awk '{{if ($1 ~ /^>/) {{ gsub(/[\\]\\[]/,"_"); print substr($1,2)}}}}';
+            cat {input.ref} | awk '{{if ($1 ~ /^>/) {{ print substr($1,2)}}}}';
+            cat {input.ref} | awk '{{if ($1 ~ /^>/) {{ print substr($1,2)}}}}';
         ) <(
-            cat {input} | awk '{{if ($1 ~ /^[^>]/) {{ seq=seq$1}}; if ($1 ~ /^>/ && NR!=1) {{print seq; seq=""}}}} END {{print seq}}';
-            cat {input} | awk '{{if ($1 ~ /^[^>]/) {{ seq=seq$1}}; if ($1 ~ /^>/ && NR!=1) {{print seq; seq=""}}}} END {{print seq}}' | tr ACGTacgt TGCAtgca | rev;
+            cat {input.ref} | awk '{{if ($1 ~ /^[^>]/) {{ seq=seq$1}}; if ($1 ~ /^>/ && NR!=1) {{print seq; seq=""}}}} END {{print seq}}';
+            cat {input.ref} | awk '{{if ($1 ~ /^[^>]/) {{ seq=seq$1}}; if ($1 ~ /^>/ && NR!=1) {{print seq; seq=""}}}} END {{print seq}}' | tr ACGTacgt TGCAtgca | rev;
         ) > {output}
         """
-Original file line number
+Diff line change
@@ Expand Up / @@ -13,3 +13,4 @@ dependencies: @@
       - polars
       - python
       - pysam
+      - pyfastx