From 4fd8426814ef5462ad337a0f5f7f9a680776343a Mon Sep 17 00:00:00 2001 From: Mohamed Abuelanin Date: Thu, 22 Jul 2021 22:42:53 +0200 Subject: [PATCH 01/12] speed up adapter trimming These are just suggestions to speed up the trimming step as it seems to be very slow. --- genome_grist/conf/Snakefile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/genome_grist/conf/Snakefile b/genome_grist/conf/Snakefile index f1e78808..8b9f84a5 100755 --- a/genome_grist/conf/Snakefile +++ b/genome_grist/conf/Snakefile @@ -331,9 +331,9 @@ rule trim_adapters_wc: shell: """ fastp --in1 {input.r1} --in2 {input.r2} \ --detect_adapter_for_pe --qualified_quality_phred 4 \ - --length_required 25 --correction --thread {threads} \ + --length_required 25 --thread {threads} \ --json {output.json} --html {output.html} \ - --low_complexity_filter --stdout | gzip -9 > {output.interleaved} + --stdout | gzip --fast > {output.interleaved} """ # adapter trimming for the singleton reads @@ -353,8 +353,8 @@ rule trim_unpaired_adapters_wc: shell: """ fastp --in1 {input.unp} --out1 {output.unp} \ --detect_adapter_for_se --qualified_quality_phred 4 \ - --low_complexity_filter --thread {threads} \ - --length_required 25 --correction \ + --thread {threads} \ + --length_required 25 \ --json {output.json} --html {output.html} """ @@ -472,7 +472,7 @@ rule mpileup_wc: gunzip -c {input.query} > $genomefile bcftools mpileup -Ou -f $genomefile {input.bam} | bcftools call -mv -Ob -o {output.bcf} rm $genomefile - bcftools view {output.bcf} | bgzip > {output.vcf} + bcftools view {output.bcf} | b -9 > {output.vcf} bcftools index {output.vcf} """ From 94e8631d83c98a05a3a674210745bf7509007b3f Mon Sep 17 00:00:00 2001 From: Mohamed Abuelanin Date: Fri, 23 Jul 2021 16:23:19 +0200 Subject: [PATCH 02/12] Fix a typo --- genome_grist/conf/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genome_grist/conf/Snakefile b/genome_grist/conf/Snakefile index 8b9f84a5..42c795b2 
100755 --- a/genome_grist/conf/Snakefile +++ b/genome_grist/conf/Snakefile @@ -472,7 +472,7 @@ rule mpileup_wc: gunzip -c {input.query} > $genomefile bcftools mpileup -Ou -f $genomefile {input.bam} | bcftools call -mv -Ob -o {output.bcf} rm $genomefile - bcftools view {output.bcf} | b -9 > {output.vcf} + bcftools view {output.bcf} | bgzip -1 > {output.vcf} bcftools index {output.vcf} """ From d1fa68ac1113b07eb46be81b2b4751511f5be992 Mon Sep 17 00:00:00 2001 From: Mohamed Abuelanin Date: Fri, 23 Jul 2021 16:29:59 +0200 Subject: [PATCH 03/12] final fix - revert back --- genome_grist/conf/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genome_grist/conf/Snakefile b/genome_grist/conf/Snakefile index 42c795b2..6d32c724 100755 --- a/genome_grist/conf/Snakefile +++ b/genome_grist/conf/Snakefile @@ -472,7 +472,7 @@ rule mpileup_wc: gunzip -c {input.query} > $genomefile bcftools mpileup -Ou -f $genomefile {input.bam} | bcftools call -mv -Ob -o {output.bcf} rm $genomefile - bcftools view {output.bcf} | bgzip -1 > {output.vcf} + bcftools view {output.bcf} | bgzip > {output.vcf} bcftools index {output.vcf} """ From a2218c16df6d8d109ff008b06d9223026bdf6041 Mon Sep 17 00:00:00 2001 From: mr-eyes Date: Wed, 19 Jan 2022 13:08:55 +0200 Subject: [PATCH 04/12] add fastp config --- genome_grist/conf/Snakefile | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/genome_grist/conf/Snakefile b/genome_grist/conf/Snakefile index c268b817..224ff111 100755 --- a/genome_grist/conf/Snakefile +++ b/genome_grist/conf/Snakefile @@ -90,6 +90,18 @@ def make_param_str(ksizes, scaled): ### +# handle `fastp` params. 
+def handle_fastp(): + param_string = "" + config_correction = config.get("fastp_correction", '') + config_low_complexity = config.get("fastp_low_complexity", '') + correction_flag = "--correction" if correction in ["ON", '1'] else '' + low_complexity_flag = "--low_complexity_filter" if config_low_complexity in ["ON", '1'] else '' + params_string = f"{correction_flag} {low_complexity_flag}" + return param_string + +### + # utility function def load_csv(filename): with open(filename, "r", newline="") as fp: @@ -493,6 +505,8 @@ rule trim_adapters_wc: interleaved = protected(outdir + '/trim/{sample}.trim.fq.gz'), json=outdir + "/trim/{sample}.trim.json", html=outdir + "/trim/{sample}.trim.html", + params: + extra_params = handle_fastp() conda: 'env/trim.yml' threads: 4 resources: @@ -502,9 +516,15 @@ rule trim_adapters_wc: shell: """ fastp --in1 {input.r1} --in2 {input.r2} \ --detect_adapter_for_pe --qualified_quality_phred 4 \ +<<<<<<< Updated upstream --length_required 25 --thread {threads} \ --json {output.json} --html {output.html} \ --stdout | gzip --fast > {output.interleaved} +======= + --length_required 25 {params.extra_params} --thread {threads} \ + --json {output.json} --html {output.html} \ + --stdout | gzip -9 > {output.interleaved} +>>>>>>> Stashed changes """ # adapter trimming for the singleton reads From 31f2318bfad3590c8523b2d849324e9948c3dda7 Mon Sep 17 00:00:00 2001 From: mr-eyes Date: Wed, 19 Jan 2022 13:51:44 +0200 Subject: [PATCH 05/12] handle_fastp() --- genome_grist/conf/Snakefile | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/genome_grist/conf/Snakefile b/genome_grist/conf/Snakefile index 224ff111..4135d1b7 100755 --- a/genome_grist/conf/Snakefile +++ b/genome_grist/conf/Snakefile @@ -516,15 +516,9 @@ rule trim_adapters_wc: shell: """ fastp --in1 {input.r1} --in2 {input.r2} \ --detect_adapter_for_pe --qualified_quality_phred 4 \ -<<<<<<< Updated upstream - --length_required 25 --thread {threads} \ - --json 
{output.json} --html {output.html} \ - --stdout | gzip --fast > {output.interleaved} -======= --length_required 25 {params.extra_params} --thread {threads} \ --json {output.json} --html {output.html} \ --stdout | gzip -9 > {output.interleaved} ->>>>>>> Stashed changes """ # adapter trimming for the singleton reads @@ -535,6 +529,8 @@ rule trim_unpaired_adapters_wc: unp = protected(outdir + '/trim/{sample}_unpaired.trim.fq.gz'), json = protected(outdir + '/trim/{sample}_unpaired.trim.json'), html = protected(outdir + '/trim/{sample}_unpaired.trim.html'), + params: + extra_params = handle_fastp() threads: 4 resources: mem_mb=5000, @@ -543,7 +539,7 @@ rule trim_unpaired_adapters_wc: conda: 'env/trim.yml' shell: """ fastp --in1 {input.unp} --out1 {output.unp} \ - --detect_adapter_for_se --qualified_quality_phred 4 \ + --detect_adapter_for_se {params.extra_params} --qualified_quality_phred 4 \ --thread {threads} \ --length_required 25 \ --json {output.json} --html {output.html} From 1c915cb590e3c7961ca9433f96f50355eb2dd66a Mon Sep 17 00:00:00 2001 From: mr-eyes Date: Wed, 19 Jan 2022 13:56:43 +0200 Subject: [PATCH 06/12] update docs --- doc/configuring.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/doc/configuring.md b/doc/configuring.md index 41ee2d4e..03b9789f 100644 --- a/doc/configuring.md +++ b/doc/configuring.md @@ -363,6 +363,16 @@ genbank_cache: ./genbank_cache # the default is set for the all-Genbank database. 
# DEFAULT: 100e9 prefetch_memory: 100e9 + + +### OTHER PARAMETERS + +## Error trimming flags +# fastp_correction: set to ON or 1 for base correction for PE data +fastp_correction: OFF +# fastp_low_complexity: set to ON or 1 for applying low complexity filter +fastp_low_complexity: OFF + ``` ## More advanced genome-grist usage From 5ae93495a712dfe9626c719087e2b334b3e7d6b5 Mon Sep 17 00:00:00 2001 From: mr-eyes Date: Wed, 19 Jan 2022 14:30:47 +0200 Subject: [PATCH 07/12] minor fix --- genome_grist/conf/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genome_grist/conf/Snakefile b/genome_grist/conf/Snakefile index 4135d1b7..9c2915b5 100755 --- a/genome_grist/conf/Snakefile +++ b/genome_grist/conf/Snakefile @@ -95,7 +95,7 @@ def handle_fastp(): param_string = "" config_correction = config.get("fastp_correction", '') config_low_complexity = config.get("fastp_low_complexity", '') - correction_flag = "--correction" if correction in ["ON", '1'] else '' + correction_flag = "--correction" if config_correction in ["ON", '1'] else '' low_complexity_flag = "--low_complexity_filter" if config_low_complexity in ["ON", '1'] else '' params_string = f"{correction_flag} {low_complexity_flag}" return param_string From bc29589bdd926df887d5c9172f9ac7e94e580f22 Mon Sep 17 00:00:00 2001 From: mr-eyes Date: Wed, 19 Jan 2022 15:36:59 +0200 Subject: [PATCH 08/12] boolean yaml on/off --- genome_grist/conf/Snakefile | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/genome_grist/conf/Snakefile b/genome_grist/conf/Snakefile index 9c2915b5..36051285 100755 --- a/genome_grist/conf/Snakefile +++ b/genome_grist/conf/Snakefile @@ -93,10 +93,11 @@ def make_param_str(ksizes, scaled): # handle `fastp` params. 
def handle_fastp(): param_string = "" - config_correction = config.get("fastp_correction", '') - config_low_complexity = config.get("fastp_low_complexity", '') - correction_flag = "--correction" if config_correction in ["ON", '1'] else '' - low_complexity_flag = "--low_complexity_filter" if config_low_complexity in ["ON", '1'] else '' + # OFF/ON is processed as boolean by default. + config_correction = config.get("fastp_correction", 'OFF') + config_low_complexity = config.get("fastp_low_complexity", 'OFF') + correction_flag = "--correction" if config_correction else '' + low_complexity_flag = "--low_complexity_filter" if config_low_complexity else '' params_string = f"{correction_flag} {low_complexity_flag}" return param_string From 6f94c9763719bb199cb0bd73b3214d92e7dda0c7 Mon Sep 17 00:00:00 2001 From: mr-eyes Date: Wed, 19 Jan 2022 15:55:40 +0200 Subject: [PATCH 09/12] correct params handling --- genome_grist/conf/Snakefile | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/genome_grist/conf/Snakefile b/genome_grist/conf/Snakefile index 36051285..48ee0fb3 100755 --- a/genome_grist/conf/Snakefile +++ b/genome_grist/conf/Snakefile @@ -92,14 +92,12 @@ def make_param_str(ksizes, scaled): # handle `fastp` params. def handle_fastp(): - param_string = "" - # OFF/ON is processed as boolean by default. 
config_correction = config.get("fastp_correction", 'OFF') config_low_complexity = config.get("fastp_low_complexity", 'OFF') - correction_flag = "--correction" if config_correction else '' - low_complexity_flag = "--low_complexity_filter" if config_low_complexity else '' + correction_flag = "--correction" if config_correction in [True, '1'] else '' + low_complexity_flag = "--low_complexity_filter" if config_low_complexity in [True, '1'] else '' params_string = f"{correction_flag} {low_complexity_flag}" - return param_string + return params_string ### From 94a0891ffbce6ec8ea9760ff80b98eca9330e5ef Mon Sep 17 00:00:00 2001 From: mr-eyes Date: Wed, 19 Jan 2022 22:45:29 +0200 Subject: [PATCH 10/12] abundtrim_cutoff as config --- doc/configuring.md | 3 +++ genome_grist/conf/Snakefile | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/configuring.md b/doc/configuring.md index 03b9789f..7091325a 100644 --- a/doc/configuring.md +++ b/doc/configuring.md @@ -373,6 +373,9 @@ fastp_correction: OFF # fastp_low_complexity: set to ON or 1 for applying low complexity filter fastp_low_complexity: OFF +## k-mers abundance trimming +# remove k-mers below this abundance +abundtrim_cutoff: 3 ``` ## More advanced genome-grist usage diff --git a/genome_grist/conf/Snakefile b/genome_grist/conf/Snakefile index 48ee0fb3..d6c7358d 100755 --- a/genome_grist/conf/Snakefile +++ b/genome_grist/conf/Snakefile @@ -39,7 +39,7 @@ if not base_tempdir: sys.exit(-1) ABUNDTRIM_MEMORY = float(config.get('metagenome_trim_memory', '1e9')) - +ABUNDTRIM_CUTOFF = config.get("abundtrim_cutoff", '3') GENBANK_CACHE = config.get('genbank_cache', './genbank_cache/') GENBANK_CACHE = os.path.normpath(GENBANK_CACHE) @@ -555,8 +555,9 @@ rule kmer_trim_reads_wc: mem_mb = int(ABUNDTRIM_MEMORY / 1e6), params: mem = ABUNDTRIM_MEMORY, + cutoff = ABUNDTRIM_CUTOFF, shell: """ - trim-low-abund.py -C 3 -Z 18 -M {params.mem} -V \ + trim-low-abund.py -C {params.cutoff} -Z 18 -M {params.mem} -V \ 
{input.interleaved} -o {output} --gzip """ From 34b99c1c7c04a82398f03050c612e155bd39c59f Mon Sep 17 00:00:00 2001 From: Mohamed Abuelanin Date: Sat, 22 Jan 2022 10:43:33 +0200 Subject: [PATCH 11/12] Update doc/configuring.md Co-authored-by: C. Titus Brown --- doc/configuring.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/configuring.md b/doc/configuring.md index 7091325a..12df7b2d 100644 --- a/doc/configuring.md +++ b/doc/configuring.md @@ -368,7 +368,7 @@ prefetch_memory: 100e9 ### OTHER PARAMETERS ## Error trimming flags -# fastp_correction: set to ON or 1 for base correction for PE data +# fastp_correction: set to ON or 1 for base correction when overlapping paired-end (PE) reads are present fastp_correction: OFF # fastp_low_complexity: set to ON or 1 for applying low complexity filter fastp_low_complexity: OFF From 8f7180f59e07128c8f9c947e9fca1ca9ba2a24d8 Mon Sep 17 00:00:00 2001 From: Mohamed Abuelanin Date: Sat, 22 Jan 2022 10:44:38 +0200 Subject: [PATCH 12/12] Update doc/configuring.md Co-authored-by: C. Titus Brown --- doc/configuring.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/configuring.md b/doc/configuring.md index 12df7b2d..510138a6 100644 --- a/doc/configuring.md +++ b/doc/configuring.md @@ -374,7 +374,7 @@ fastp_correction: OFF fastp_low_complexity: OFF ## k-mers abundance trimming -# remove k-mers below this abundance +# remove k-mers below this abundance from high-abundance reads; does not affect low-abundance reads abundtrim_cutoff: 3 ```