Skip to content

Commit

Permalink
chore: Update design file reference in config files
Browse files Browse the repository at this point in the history
Adding check of reference design file
  • Loading branch information
visze committed Jul 19, 2024
1 parent 6dcad7d commit 60806e9
Show file tree
Hide file tree
Showing 15 changed files with 229 additions and 59 deletions.
6 changes: 3 additions & 3 deletions config/example_assignment_bwa.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@ assignments:
alignment_tool:
tool: bwa # bwa or exact
configs:
min_mapping_quality: 1 # integer >=0 Please use 1 when you have oligos that differ by 1 base in your reference/design file
min_mapping_quality: 1 # integer >=0 Please use 1 when you have oligos that differ by 1 base in your reference/design_file
sequence_length: # sequence length of design excluding adapters.
min: 166
max: 175
alignment_start: # start of an alignment in the reference. Here using 15 bp adapters. Can be different when using adapter free approaches
alignment_start: # start of an alignment in the reference/design_file. Here using 15 bp adapters. Can be different when using adapter free approaches
min: 1 # integer
max: 3 # integer
FW:
Expand All @@ -21,6 +21,6 @@ assignments:
- resources/Assignment_BasiC/R2.fastq.gz
REV:
- resources/Assignment_BasiC/R3.fastq.gz
reference: resources/design.fa
design_file: resources/design.fa
configs:
default: {} # name of an example filtering config
4 changes: 2 additions & 2 deletions config/example_assignment_exact_lazy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@ assignments:
tool: exact # bwa or exact
configs:
sequence_length: 170 # sequence length of design excluding adapters.
alignment_start: 1 # start of the alignment in the reference
alignment_start: 1 # start of the alignment in the reference/design_file
FW:
- resources/Assignment_BasiC/R1.fastq.gz
BC:
- resources/Assignment_BasiC/R2.fastq.gz
REV:
- resources/Assignment_BasiC/R3.fastq.gz
reference: resources/design.fa
design_file: resources/design.fa
configs:
lazy: # name of an example filtering config
min_support: 2 # default 3
Expand Down
4 changes: 2 additions & 2 deletions config/example_assignment_exact_linker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@ assignments:
tool: exact # bwa or exact
configs:
sequence_length: 170 # sequence length of design excluding adapters.
alignment_start: 1 # start of the alignment in the reference
alignment_start: 1 # start of the alignment in the reference/design_file
FW:
- resources/Assignment_BasiC/R1.fastq.gz
REV:
- resources/Assignment_BasiC/R3.fastq.gz
reference: resources/design.fa
design_file: resources/design.fa
configs:
default: {} # name of an example filtering config
8 changes: 4 additions & 4 deletions config/example_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@ assignments:
tool: exact # bwa or exact
configs:
sequence_length: 170 # sequence length of design excluding adapters.
alignment_start: 1 # start of the alignment in the reference
# min_mapping_quality: 1 # integer >=0 Please use 1 when you have oligos that differ by 1 base in your reference/design file
alignment_start: 1 # start of the alignment in the reference/design_file
# min_mapping_quality: 1 # integer >=0 Please use 1 when you have oligos that differ by 1 base in your reference/design_file
# sequence_length: # sequence length of design excluding adapters.
# min: 195
# max: 205
# alignment_start: # start of an alignment in the reference. Here using 15 bp adapters. Can be different when using adapter free approaches
# alignment_start: # start of an alignment in the reference/design_file. Here using 15 bp adapters. Can be different when using adapter free approaches
# min: 15 # integer
# max: 17 # integer
FW:
Expand All @@ -23,7 +23,7 @@ assignments:
- resources/Assignment_BasiC/R2.fastq.gz
REV:
- resources/Assignment_BasiC/R3.fastq.gz
reference: resources/design.fa
design_file: resources/design.fa
configs:
default: {} # name of an example filtering config
experiments:
Expand Down
12 changes: 5 additions & 7 deletions docs/config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,13 @@ Each assignment you want to process you have to giv him a name like :code:`examp
Configurations of the alignment tool selected.

:sequence_length (bwa):
Defines the :code:`min` and :code:`max` of a :code:`sequence_length` specify . :code:`sequence_length` is basically the length of a sequence alignment to an oligo in the reference file. Because there can be insertion and deletions we recommend to vary it a bit around the exact length (e.g. +-5). In theory this option enables designs with multiple sequence lengths.
Defines the :code:`min` and :code:`max` of a :code:`sequence_length` specify . :code:`sequence_length` is basically the length of a sequence alignment to an oligo in the design file. Because there can be insertion and deletions we recommend to vary it a bit around the exact length (e.g. +-5). In theory this option enables designs with multiple sequence lengths.
:alignment_start (bwa):
Defines the :code:`min` and :code:`max` of the start of the alignment in an oligo. When using adapters you have to set basically the length of the adapter. Otherwise 1 will be the choice for most cases. We also recommend to vary this value a bit because the start might not be exact after the adapter. E.g. by +-1.
:min_mapping_quality (bwa):
(Optinal) Defines the minimum mapping quality (MAPQ) of the alinment to an oligo. When using oligos with only 1bp difference it is recommended to set it to 0. Otherwise the default value of 1 is recommended.
:sequence_length (exact):
Defines the :code:`sequence_length` which is the length of a sequence alignment to an oligo in the reference file. Only one length design is supported.
Defines the :code:`sequence_length` which is the length of a sequence alignment to an oligo in the design file. Only one length design is supported.
:alignment_start (exact):
Defines the start of the alignment in an oligo. When using adapters you have to set basically the length of the adapter. Otherwise 1 will be the choice for most cases.

Expand Down Expand Up @@ -86,15 +86,15 @@ Each assignment you want to process you have to giv him a name like :code:`examp
:min_dovetailed_overlap:
(Optional) Minimum dovetailed overlap. Default is set to 10.

:reference:
:design_file:
Design file (full or relative path) in fasta format. The design file should contain the oligos in fasta format. The header should contain the oligo name and should be unique. The sequence should be the sequence of the oligo and must also be unique. When having multiple oligo names with the same sequence please merge them into one fasta entry. The oligo name later used to link barcode to oligo. The sequence is used to map the reads to the oligos. Adapters can be in the seuqence and therefore :code:`alignment_start` has to be adjusted.
:configs:
After mapping the reads to the design file and extracting the barcodes per oligo the configuration (using different names) can be used to generate multiple filtering and configuration settings of the final maq oligo to barcode. Each configuration is a dictionary with the following keys:

:min_support:
Minimum number of same BC that map to teh same oligo. Larger value gives more evidence to be correct. But can remove lot's of BCs (depedning on the complexity, sequencing depth and quality of sequencing). Recommended option is :code:`3`.
Minimum number of same BC that map to the same oligo. Larger value gives more evidence to be correct. But can remove lot's of BCs (depedning on the complexity, sequencing depth and quality of sequencing). Recommended option is :code:`3`.
:fraction:
Minumum fraction of same BC that map to teh same oligo. E.g. :code:`0.7` means that at least 70% of the BC map to the same oligo. Larger value gives more evidence to be correct. But can remove lot's of BCs (depedning on the complexity, sequencing depth and quality of sequencing). Recommended option is :code:`0.7`.
Minumum fraction of same BC that map to the same oligo. E.g. :code:`0.7` means that at least 70% of the BC map to the same oligo. Larger value gives more evidence to be correct. But can remove lot's of BCs (depedning on the complexity, sequencing depth and quality of sequencing). Recommended option is :code:`0.7`.
:unknown_other:
(Optional) Shows not mapped BCs in the final output map. Not recommended to use as mapping file fore the experiment workflow. But can be usefull for debugging. Default is :code:`false`.
:ambigous:
Expand Down Expand Up @@ -123,8 +123,6 @@ The experiment workflow is configured in the :code:`experiments` section. Each e
Path to the experiment file. The full or relative path to the file should be used. The experiment file is a comma separated file and is decribed in the `Experiment file`_ section.
:demultiplex:
(Optional) If set to :code:`true` the reads are demultiplexed. This means that the reads are split into different files for each barcode. This is usefull for further analysis. Default is :code:`false`.
:design_file:
Design file (full or relative path) in fasta format. The design file should contain the oligos in fasta format. The header should contain the oligo name and should be unique. The sequence should be the sequence of the oligo and must also be unique. When having multiple oligo names with the same sequence please merge them into one fasta entry. Should be the same as :code:`reference` in the `Assignment workflow`_.
:label_file:
(Optional) Path to the label file. The full or relative path to the file should be used. The label file is a tab separated file and contais the oligo name and the label of it. The oligo name should be the same as in the design file. The label is used to group the oligos in the final output, e.g. for plotting.

Expand Down
3 changes: 3 additions & 0 deletions profiles/default/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ set-threads:
assignment_collect: 30
assignment_collectBCs: 20
set-resources:
assignment_check_design:
runtime: 240
slurm_partition: medium
assignment_hybridFWRead_get_reads_by_length:
runtime: 1140
mem: 2G
Expand Down
2 changes: 1 addition & 1 deletion resources/assoc_basic/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,6 @@ assignments:
- data/SRR10800986_2.fastq.gz
REV:
- data/SRR10800986_3.fastq.gz
reference: data/design.fa
design_file: data/design.fa
configs:
default: {}
2 changes: 1 addition & 1 deletion resources/combined_basic/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ assignments:
- data/SRR10800986_2.fastq.gz
REV:
- data/SRR10800986_3.fastq.gz
reference: data/design.fa
design_file: data/design.fa
configs:
default: {}
experiments:
Expand Down
1 change: 1 addition & 0 deletions workflow/envs/python3.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ dependencies:
- polars
- python
- pysam
- pyfastx
100 changes: 79 additions & 21 deletions workflow/rules/assignment.smk
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,62 @@ include: "assignment/hybridFWRead.smk"
include: "assignment/statistic.smk"


rule assignment_check_design:
"""
Check if the design file is correct and no duplicated sequences are present (FW and reverse).
Also check if no duplicated headers and no illegal characters in header.
"""
conda:
"../envs/python3.yaml"
input:
design=lambda wc: config["assignments"][wc.assignment]["design_file"],
script=getScript("assignment/check_design_file.py"),
output:
temp("results/assignment/{assignment}/reference/reference.fa.fxi"),
touch("results/assignment/{assignment}/design_check.done"),
ref="results/assignment/{assignment}/reference/reference.fa",
params:
start=lambda wc: (
config["assignments"][wc.assignment]["alignment_tool"]["configs"][
"alignment_start"
]
if config["assignments"][wc.assignment]["alignment_tool"]["tool"]
== "exact"
else config["assignments"][wc.assignment]["alignment_tool"]["configs"][
"alignment_start"
]["max"]
),
length=lambda wc: (
config["assignments"][wc.assignment]["alignment_tool"]["configs"][
"sequence_length"
]
if config["assignments"][wc.assignment]["alignment_tool"]["tool"]
== "exact"
else config["assignments"][wc.assignment]["alignment_tool"]["configs"][
"sequence_length"
]["min"]
),
log:
log=temp("results/logs/assignment/check_design.{assignment}.log"),
err="results/assignment/{assignment}/design_check.err",
shell:
"""
trap "cat {log.err}" ERR
cp {input.design} {output.ref}
python {input.script} --input {input.design} --start {params.start} --length {params.length} > {log.log} 2> {log.err};
"""


rule assignment_fastq_split:
"""
Split the fastq files into n files for parallelisation.
n is given by split_read in the configuration file.
Runs only if the design file is correct.
"""
input:
lambda wc: getAssignmentRead(wc.assignment, wc.read),
fastq=lambda wc: getAssignmentRead(wc.assignment, wc.read),
check="results/assignment/{assignment}/design_check.done",
output:
temp(
expand(
Expand All @@ -43,7 +92,7 @@ rule assignment_fastq_split:
),
shell:
"""
fastqsplitter -i <(zcat {input}) -t 1 {params.files} &> {log}
fastqsplitter -i <(zcat {input.fastq}) -t 1 {params.files} &> {log}
"""


Expand All @@ -63,9 +112,11 @@ rule assignment_attach_idx:
"results/assignment/{assignment}/fastq/splits/{read}.split{split}.BCattached.fastq.gz"
),
params:
BC_rev_comp=lambda wc: "--reverse-complement"
if config["assignments"][wc.assignment]["BC_rev_comp"]
else "",
BC_rev_comp=lambda wc: (
"--reverse-complement"
if config["assignments"][wc.assignment]["BC_rev_comp"]
else ""
),
log:
temp("results/logs/assignment/attach_idx.{assignment}.{split}.{read}.log"),
shell:
Expand Down Expand Up @@ -125,14 +176,17 @@ rule assignment_collectBCs:
Get the barcodes.
"""
input:
lambda wc: expand(
"results/assignment/{{assignment}}/BCs/barcodes_exact.{split}.tsv",
lambda wc: (
expand(
"results/assignment/{{assignment}}/BCs/barcodes_exact.{split}.tsv",
split=range(0, getSplitNumber()),
)
if config["assignments"][wc.assignment]["alignment_tool"]["tool"]
== "exact"
else expand(
"results/assignment/{{assignment}}/BCs/barcodes_incl_other.{split}.tsv",
split=range(0, getSplitNumber()),
)
if config["assignments"][wc.assignment]["alignment_tool"]["tool"] == "exact"
else expand(
"results/assignment/{{assignment}}/BCs/barcodes_incl_other.{split}.tsv",
split=range(0, getSplitNumber()),
),
output:
"results/assignment/{assignment}/barcodes_incl_other.sorted.tsv.gz",
Expand Down Expand Up @@ -170,16 +224,20 @@ rule assignment_filter:
fraction=lambda wc: config["assignments"][wc.assignment]["configs"][
wc.assignment_config
]["fraction"],
unknown_other=lambda wc: "-o"
if config["assignments"][wc.assignment]["configs"][wc.assignment_config][
"unknown_other"
]
else "",
ambiguous=lambda wc: "-a"
if config["assignments"][wc.assignment]["configs"][wc.assignment_config][
"ambiguous"
]
else "",
unknown_other=lambda wc: (
"-o"
if config["assignments"][wc.assignment]["configs"][wc.assignment_config][
"unknown_other"
]
else ""
),
ambiguous=lambda wc: (
"-a"
if config["assignments"][wc.assignment]["configs"][wc.assignment_config][
"ambiguous"
]
else ""
),
bc_length=lambda wc: config["assignments"][wc.assignment]["bc_length"],
shell:
"""
Expand Down
5 changes: 3 additions & 2 deletions workflow/rules/assignment/hybridFWRead.smk
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ rule assignment_hybridFWRead_get_reads_by_length:
conda:
"../../envs/default.yaml"
input:
lambda wc: config["assignments"][wc.assignment]["FW"],
fastq=lambda wc: config["assignments"][wc.assignment]["FW"],
check="results/assignment/{assignment}/design_check.done",
output:
FW_tmp=temp("results/assignment/{assignment}/fastq/FW.byLength.fastq"),
BC_tmp=temp("results/assignment/{assignment}/fastq/BC.byLength.fastq"),
Expand All @@ -26,7 +27,7 @@ rule assignment_hybridFWRead_get_reads_by_length:
+ 1,
shell:
"""
zcat {input} | \
zcat {input.fastq} | \
awk '{{if (NR%4==2 || NR%4==0){{
print substr($0,1,20) > "{output.BC_tmp}"; print substr($0,{params.insert_start}) > "{output.FW_tmp}"
}} else {{
Expand Down
11 changes: 5 additions & 6 deletions workflow/rules/assignment/mapping_bwa.smk
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ rule assignment_bwa_ref:
conda:
"../../envs/bwa_samtools_picard_htslib.yaml"
input:
lambda wc: config["assignments"][wc.assignment]["reference"],
output:
ref="results/assignment/{assignment}/reference/reference.fa",
check="results/assignment/{assignment}/design_check.done",
output:
bwa=expand(
"results/assignment/{{assignment}}/reference/reference.fa.{ext}",
ext=["fai"] + assignment_bwa_dicts,
Expand All @@ -17,10 +17,9 @@ rule assignment_bwa_ref:
temp("results/logs/assignment/bwa_ref.{assignment}.log"),
shell:
"""
cat {input} | awk '{{gsub(/[\\]\\[]/,"_")}}$0' > {output.ref};
bwa index -a bwtsw {output.ref} &> {log};
samtools faidx {output.ref} &>> {log};
picard CreateSequenceDictionary REFERENCE={output.ref} OUTPUT={output.d} &>> {log}
bwa index -a bwtsw {input.ref} &> {log};
samtools faidx {input.ref} &>> {log};
picard CreateSequenceDictionary REFERENCE={input.ref} OUTPUT={output.d} &>> {log}
"""


Expand Down
11 changes: 6 additions & 5 deletions workflow/rules/assignment/mapping_exact.smk
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,20 @@ rule assignment_mapping_exact_reference:
conda:
"../../envs/default.yaml"
input:
lambda wc: config["assignments"][wc.assignment]["reference"],
check="results/assignment/{assignment}/design_check.done",
ref="results/assignment/{assignment}/reference/reference.fa",
output:
"results/assignment/{assignment}/reference/reference_exact.fa",
log:
temp("results/logs/assignment/mapping_exact_reference.{assignment}.log"),
shell:
"""
paste <(
cat {input} | awk '{{if ($1 ~ /^>/) {{ gsub(/[\\]\\[]/,"_"); print substr($1,2)}}}}';
cat {input} | awk '{{if ($1 ~ /^>/) {{ gsub(/[\\]\\[]/,"_"); print substr($1,2)}}}}';
cat {input.ref} | awk '{{if ($1 ~ /^>/) {{ print substr($1,2)}}}}';
cat {input.ref} | awk '{{if ($1 ~ /^>/) {{ print substr($1,2)}}}}';
) <(
cat {input} | awk '{{if ($1 ~ /^[^>]/) {{ seq=seq$1}}; if ($1 ~ /^>/ && NR!=1) {{print seq; seq=""}}}} END {{print seq}}';
cat {input} | awk '{{if ($1 ~ /^[^>]/) {{ seq=seq$1}}; if ($1 ~ /^>/ && NR!=1) {{print seq; seq=""}}}} END {{print seq}}' | tr ACGTacgt TGCAtgca | rev;
cat {input.ref} | awk '{{if ($1 ~ /^[^>]/) {{ seq=seq$1}}; if ($1 ~ /^>/ && NR!=1) {{print seq; seq=""}}}} END {{print seq}}';
cat {input.ref} | awk '{{if ($1 ~ /^[^>]/) {{ seq=seq$1}}; if ($1 ~ /^>/ && NR!=1) {{print seq; seq=""}}}} END {{print seq}}' | tr ACGTacgt TGCAtgca | rev;
) > {output}
"""

Expand Down
Loading

0 comments on commit 60806e9

Please sign in to comment.