Skip to content

Commit

Permalink
Add Prokka to identify ncRNAs for QC purposes; add prokka summary to …
Browse files Browse the repository at this point in the history
…bin summary process; refactor bin summary code
  • Loading branch information
Jim Downie committed Dec 19, 2024
1 parent 9950440 commit 62bf7a4
Show file tree
Hide file tree
Showing 16 changed files with 199 additions and 310 deletions.
68 changes: 45 additions & 23 deletions bin/bin_summary.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
#!/usr/bin/env Rscript

## Collates the various bin-level summary TSV files exported at
## various points in the longreadmag pipeline run
##
## Author: Jim Downie, 2024

library(optparse)
library(tidyverse)

Expand All @@ -10,7 +15,7 @@ parser <- add_option(
type = "character",
action = "store",
default = NULL,
help = "Comma-separated list of TSV files output by seqkit stats",
help = "Comma-separated list of TSV files output by seqkit stats.",
metavar="filename"
)

Expand All @@ -20,7 +25,7 @@ parser <- add_option(
type = "character",
action = "store",
default = NULL,
help = "Comma-separated list of TSV files output by checkm2 predict",
help = "Comma-separated list of TSV files output by checkm2 predict.",
metavar="filename"
)

Expand All @@ -30,7 +35,17 @@ parser <- add_option(
type = "character",
action = "store",
default = NULL,
help = "Comma-separated list of TSV files output by GTDB-Tk",
help = "Comma-separated list of TSV files output by GTDB-Tk.",
metavar="filename"
)

## Option for the per-bin Prokka tRNA/rRNA summary tables.
## The long option name ("rrnas") is what split_and_read() looks up in the
## parsed arguments and appends to "read_" to find its reader, so it has to
## stay in sync with read_rrnas() and the "rrnas" entry of input_types.
parser <- add_option(
object = parser,
opt_str = c("-r", "--rrnas"),
type = "character",
action = "store",
default = NULL,
help = "Comma-separated list of TSV files output by GAWK_PROKKA_SUMMARY.",
metavar="filename"
)

Expand Down Expand Up @@ -66,6 +81,10 @@ parser <- add_option(

input <- parse_args(parser)

## Functions to read in summary information about each bin
## Each should be named in the format "read_X", and X
## should be the full name of one of the arguments defined in
## the optparse section
read_stats <- function(file) {
df <- read_tsv(file) |>
mutate(
Expand All @@ -78,7 +97,7 @@ read_stats <- function(file) {
return(df)
}

read_checkm <- function(file) {
read_checkm2 <- function(file) {
df <- read_tsv(file) |>
select(bin = Name,
completeness = Completeness,
Expand All @@ -91,7 +110,7 @@ read_checkm <- function(file) {

read_taxonomy <- function(file) {
df <- read_tsv(file)
if(ncol(df) > 3) {
if(ncol(df) == 3) {
df <- select(df,
bin = `Genome ID`,
gtdb_classification = `GTDB classification`,
Expand All @@ -110,28 +129,31 @@ read_taxonomy <- function(file) {
return(df)
}

data <- list()
if(rlang::has_name(input, "stats")) {
stats_files <- unlist(str_split(input$stats, ","))
stats_df <- map(stats_files, read_stats) |> list_rbind()
data <- c(data, list(stats_df))
} else {
stop("Error: no stats file provided!")
}

if(rlang::has_name(input, "checkm2")) {
checkm_files <- unlist(str_split(input$checkm2, ","))
checkm_df <- map(checkm_files, read_checkm) |> list_rbind()
data <- c(data, list(checkm_df))
## Read a single rRNA/tRNA summary TSV.
## No columns are renamed or dropped: the table is returned exactly as
## read_tsv() parsed it.
read_rrnas <- function(file) {
  read_tsv(file)
}

if(rlang::has_name(input, "taxonomy")) {
tax_files <- unlist(str_split(input$taxonomy, ","))
tax_df <- map(tax_files, read_taxonomy) |> list_rbind()
data <- c(data, list(tax_df))
## Load every file supplied for one input type into a single data frame.
##
## `input` is the parsed argument list and `input_type` is the name of one
## of its entries. The entry is expected to hold a comma-separated string of
## file names; each file is read with the function called
## "read_<input_type>" and the resulting tables are row-bound together.
## Returns NULL when the argument was not supplied.
split_and_read <- function(input, input_type) {
  joined_paths <- pluck(input, input_type)
  if (is.null(joined_paths)) {
    return(NULL)
  }

  reader_name <- paste0("read_", input_type)
  paths <- unlist(str_split(joined_paths, ","))
  tables <- map(paths, \(path) do.call(reader_name, list(file = path)))
  list_rbind(tables)
}

summary <- reduce(data, \(x, y) left_join(x, y, by = "bin"))
## Read every input type that was supplied and drop the ones that were not.
## The remaining tables are left-joined on `bin` in the order given by
## `input_types`, so the first table that was supplied determines which
## bins appear in the output.
input_types <- c("stats", "checkm2", "taxonomy", "rrnas")
tables <- map(input_types, \(x) split_and_read(input, x)) |>
  discard(is.null)

## reduce() cannot fold an empty list without an initial value, so fail
## with an explicit message when no input files were given at all.
if (length(tables) == 0) {
  stop(
    "No input files provided: supply at least one of ",
    paste0("--", input_types, collapse = ", "),
    call. = FALSE
  )
}

summary <- reduce(tables, \(x, y) left_join(x, y, by = "bin"))

write_tsv(summary, glue::glue("{input$prefix}.bin_summary.tsv"))

Expand Down
34 changes: 34 additions & 0 deletions bin/prokka_summary.awk
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
## Summarise the tRNA and rRNA content that Prokka reports for genome bins.
##
## Input: one or more tab-separated files. Column 2 is compared against the
## feature types "tRNA" and "rRNA", and column 7 is used as the product name.
## Output: a header line, then one tab-separated row per input file holding
## the bin name, the number of tRNA rows, the number of distinct tRNA
## products, and 0/1 flags for the presence of 23S, 16S and 5S rRNA.
##
## Requires gawk (uses BEGINFILE / ENDFILE).
##
## Author: Jim Downie, 2024

## Derive the bin name from a file path: drop everything up to the last "/"
## and then the final ".extension".
function bin_from_path(path) {
    sub(".*/", "", path)
    sub(/\.[^\.]+$/, "", path)
    return path
}

BEGIN {
    FS = "\t"
    OFS = FS
    print "bin", "total_trnas", "unique_trnas", "rrna_23s", "rrna_16s", "rrna_5s"
}

## Reset all per-file state before the first record of each file
BEGINFILE {
    bin = bin_from_path(FILENAME)
    total_trnas = unique_trnas = 0
    pos_23s = pos_16s = pos_5s = 0
}

## Count every tRNA row; count a product only the first time it is seen
$2 == "tRNA" {
    total_trnas++
    if (!($7 in seen_trnas)) {
        seen_trnas[$7] = 1
        unique_trnas++
    }
}

## Presence flags: only an exact match on the product string sets a flag
$2 == "rRNA" && $7 == "23S ribosomal RNA" { pos_23s = 1 }
$2 == "rRNA" && $7 == "16S ribosomal RNA" { pos_16s = 1 }
$2 == "rRNA" && $7 == "5S ribosomal RNA"  { pos_5s = 1 }

## Emit the row for this file and forget its tRNA products
ENDFILE {
    print bin, total_trnas, unique_trnas, pos_23s, pos_16s, pos_5s
    delete seen_trnas
}
38 changes: 33 additions & 5 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,12 @@ process {
tag = { "${meta.id}_${meta.assembler}_${meta.binner}" }
}

// Output naming for GAWK_MAGSCOT_PROCESS_CONTIG2BIN: files are prefixed with
// sample id, assembler and binner, and written with a "tsv" suffix.
// NOTE(review): the withName block that immediately follows this one uses the
// same selector. Confirm whether this block was meant to target a different
// process, or whether the two blocks should be merged into one.
withName: 'GAWK_MAGSCOT_PROCESS_CONTIG2BIN' {
ext.prefix = { "${meta.id}_${meta.assembler}_${meta.binner}" }
ext.suffix = { "tsv" }
tag = { "${meta.id}_${meta.assembler}_${meta.binner}" }
}

withName: 'GAWK_MAGSCOT_PROCESS_CONTIG2BIN' {
ext.args = { "-v FS='\\t'" }
ext.args2 = { "'{OFS = FS} {print \$2,\$1,\"${meta.binner}\"}'" }
Expand All @@ -126,6 +132,17 @@ process {
tag = { "${meta.id}_${meta.assembler}" }
}

// Settings for GAWK_PROKKA_SUMMARY: output is named
// <id>_<assembler>_<binner>.prokka_summary with a "tsv" suffix and published
// under <outdir>/qc/prokka.
withName: 'GAWK_PROKKA_SUMMARY' {
ext.prefix = { "${meta.id}_${meta.assembler}_${meta.binner}.prokka_summary" }
tag = { "${meta.id}_${meta.assembler}_${meta.binner}" }
ext.suffix = { "tsv" }
publishDir = [
path: { "${params.outdir}/qc/prokka" },
mode: params.publish_dir_mode,
// Returning null from saveAs keeps versions.yml out of the published output
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: 'GAWK_RENAME_BINS' {
ext.prefix = { "${meta.id}_${meta.assembler}_${meta.binner}_contig2bin" }
ext.args = { "" }
Expand Down Expand Up @@ -205,6 +222,17 @@ process {
]
}

// Settings for PROKKA: prefix and tag are both the bin id (meta.binid), and
// results are published under <outdir>/qc/prokka.
withName: 'PROKKA' {
// Extra command-line flags for the tool; see the Prokka documentation for
// what each flag does.
ext.args = { "--force --metagenome --rfam --noanno --fast --rawproduct" }
ext.prefix = { "${meta.binid}" }
tag = { "${meta.binid}" }
publishDir = [
path: { "${params.outdir}/qc/prokka" },
mode: params.publish_dir_mode,
// Returning null from saveAs keeps versions.yml out of the published output
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: 'PYRODIGAL' {
ext.args = { "-p meta" }
ext.prefix = { "${meta.id}_${meta.assembler}" }
Expand All @@ -218,7 +246,7 @@ process {

withName: 'SEQKIT_STATS' {
ext.args = { "-b -a" }
ext.prefix = { "${meta.id}_${meta.assembler}_${meta.binner}_stats" }
ext.prefix = { "${meta.id}_${meta.assembler}_${meta.binner}.stats" }
tag = { "${meta.id}_${meta.assembler}_${meta.binner}" }
publishDir = [
path: { "${params.outdir}/qc/stats/" },
Expand All @@ -239,11 +267,11 @@ process {
}

withName: 'TAXONKIT_NAME2TAXID' {
ext.args = { "-i 4" }
ext.prefix = { "${meta.id}_${meta.assembler}" }
tag = { "${meta.id}_${meta.assembler}" }
ext.args = { "-i 4" }
ext.prefix = { "${meta.id}_${meta.assembler}_${meta.binner}.gtdb_to_ncbi" }
tag = { "${meta.id}_${meta.assembler}_${meta.binner}" }
publishDir = [
path: { "${params.outdir}/mapping/hic" },
path: { "${params.outdir}/taxonomy/" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
Expand Down
5 changes: 0 additions & 5 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,6 @@
"https://github.com/nf-core/modules.git": {
"modules": {
"nf-core": {
"barrnap": {
"branch": "master",
"git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
"installed_by": ["modules"]
},
"bwamem2/index": {
"branch": "master",
"git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
Expand Down
8 changes: 7 additions & 1 deletion modules/local/bin_summary/main.nf
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
process BIN_SUMMARY {
tag "${meta.id}"
label "process_low"

conda "${moduleDir}/environment.yml"
Expand All @@ -7,7 +8,10 @@ process BIN_SUMMARY {
'community.wave.seqera.io/library/r-base_r-tidyverse_r-optparse:fb0e94661e2bf4e0' }"

input:
tuple val(meta), path(stats), path(checkm2), path(taxonomy)
tuple val(meta), path(stats)
tuple val(meta), path(checkm2)
tuple val(meta), path(taxonomy)
tuple val(meta), path(prokka)

output:
tuple val(meta), path("*.bin_summary.tsv"), emit: summary
Expand All @@ -19,8 +23,10 @@ process BIN_SUMMARY {
def stats_input = stats ? "--stats ${stats.join(",")}" : ""
def checkm_input = checkm2 ? "--checkm ${checkm2.join(",")}" : ""
def tax_input = taxonomy ? "--taxonomy ${taxonomy.join(",")}" : ""
def prokka_input = prokka ? "--prokka ${prokka.join(",")}" : ""
"""
bin_summary.R \\
-o ${prefix} \\
${stats_input} \\
${checkm_input} \\
${tax_input} \\
Expand Down
5 changes: 0 additions & 5 deletions modules/nf-core/barrnap/environment.yml

This file was deleted.

56 changes: 0 additions & 56 deletions modules/nf-core/barrnap/main.nf

This file was deleted.

49 changes: 0 additions & 49 deletions modules/nf-core/barrnap/meta.yml

This file was deleted.

Loading

0 comments on commit 62bf7a4

Please sign in to comment.