Commit
Merge pull request #395 from prototaxites/euk_classify
Add domain-level classification for bins
prototaxites authored Jun 9, 2023
2 parents 5852d01 + 7172909 commit 7c6a62a
Showing 24 changed files with 992 additions and 403 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### `Added`

- [#395](https://github.com/nf-core/mag/pull/395) - Add support for fast domain-level classification of bins using Tiara, allowing bins to be separated into eukaryote-specific and prokaryote-specific downstream processes.
- [#422](https://github.com/nf-core/mag/pull/422) - Adds support for normalization of read depth with BBNorm (added by @erikrikarddaniel and @fabianegli)
- [#439](https://github.com/nf-core/mag/pull/439) - Adds ability to enter the pipeline at the binning stage by providing a CSV of pre-computed assemblies (by @prototaxites)

4 changes: 4 additions & 0 deletions CITATIONS.md
@@ -124,6 +124,10 @@

> Nurk, S., Meleshko, D., Korobeynikov, A., & Pevzner, P. A. (2017). metaSPAdes: a new versatile metagenomic assembler. Genome research, 27(5), 824-834. doi: 10.1101/gr.213959.116.
- [Tiara](https://doi.org/10.1093/bioinformatics/btab672)

> Karlicki, M., Antonowicz, S., Karnkowska, A., 2022. Tiara: deep learning-based classification system for eukaryotic sequences. Bioinformatics 38, 344–350. doi: 10.1093/bioinformatics/btab672
## Data

- [Full-size test data](https://doi.org/10.1038/s41587-019-0191-2)
144 changes: 144 additions & 0 deletions bin/domain_classification.R
@@ -0,0 +1,144 @@
#!/usr/bin/env Rscript

# Written by Jim Downie and released under the MIT license.
# See git repository (https://github.com/nf-core/mag) for full license text.
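#
## Example invocation (illustrative file names, not from the pipeline itself):
##   Rscript domain_classification.R --classification_file tiara_out.txt \
##     --contig_to_bin contig2bin.tsv --assembler MEGAHIT --output_prefix sample1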

library(optparse)
library(tidyverse)

parser <- OptionParser()
parser <- add_option(parser, c("-t", "--classification_file"),
action = "store",
type = "character",
metavar = "character",
help = "The TSV file of per-contig classifications produced by Tiara (out.txt).")
parser <- add_option(parser, c("-s", "--contig_to_bin"),
action = "store",
type = "character",
metavar = "character",
help = "A TSV file with two columns, contig and bin, listing the bin membership of each contig.")
parser <- add_option(parser, c("-j", "--join_prokaryotes"),
action = "store_true",
type = "logical",
default = TRUE,
metavar = "logical",
help = "Use a general prokaryote classification instead of separating Archaea and Bacteria.")
parser <- add_option(parser, c("-a", "--assembler"),
action = "store",
type = "character",
metavar = "character",
help = "Assembler used to assemble the contigs. 'MEGAHIT' or 'SPAdes' only.")
parser <- add_option(parser, c("-o", "--output_prefix"),
action = "store",
type = "character",
metavar = "character",
help = "Prefix for the output classification table name.")
args <- parse_args(parser)

## optparse has no 'required' option, so exit manually if mandatory arguments are missing
if(is.null(args$classification_file)) {
stop("Tiara classification file not provided.")
}
if(is.null(args$contig_to_bin)) {
stop("Contig to bin file not provided.")
}
if(is.null(args$assembler)) {
stop("Assembler not provided.")
}
if(!(args$assembler %in% c("MEGAHIT", "SPAdes"))) {
stop("Invalid assembler provided.")
}

find_classification <- function(probabilities, join_prokaryotes = TRUE) {
if(join_prokaryotes) {
classifications <- c("prokarya", "eukarya", "organelle", "unknown")
} else {
classifications <- c("archaea", "bacteria", "eukarya", "organelle", "unknown")
}
return(classifications[which.max(probabilities)])
}
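
## Illustrative example: with join_prokaryotes = TRUE,
## find_classification(c(0.7, 0.1, 0.1, 0.1)) returns "prokarya",
## since the first (prokarya) entry is the largest.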

classify_bins <- function(tiara, contig2bin, join_prokaryotes, assembler){
## DASTOOL_FASTATOCONTIG2BIN drops everything after the first space in contig names, and
## MEGAHIT contig names contain spaces. Trim the contig IDs in the Tiara classifications to match so the tables merge cleanly.
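## e.g. an illustrative MEGAHIT header "k141_5678 flag=1 multi=2.0000 len=377" becomes "k141_5678"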
if(assembler == "MEGAHIT"){
tiara$sequence_id <- str_split(tiara$sequence_id, " ", simplify = TRUE)[,1]
}
if(join_prokaryotes) {
n_classifications <- 4
} else {
n_classifications <- 5
}

## left_join plus filter drops contigs that were not assigned to any bin
tiara <- tiara |>
left_join(contig2bin) |>
filter(!is.na(BinID)) |>
select(sequence_id,
BinID,
Archaea = arc,
Bacteria = bac,
Eukarya = euk,
Organelle = org,
Unknown = unk1)

if(join_prokaryotes) {
tiara <- tiara |>
mutate(Prokarya = Archaea + Bacteria) |>
select(sequence_id, BinID, Prokarya, Eukarya, Organelle, Unknown)
}

## Identify the columns to softmax
prob_columns <- 2:(2 + n_classifications - 1)

## Calculate softmax probabilities from the per-bin sums of each category's contig probabilities
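## For category i of a bin, p_i = exp(s_i) / sum_j exp(s_j), where s_i is the sum of the
## bin's per-contig probabilities for category i. Illustrative example:
## s = (2.1, 0.4, 0.3, 0.2) gives p ≈ (0.67, 0.12, 0.11, 0.10).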
softmax_probabilities <- tiara |>
group_by(BinID) |>
summarise(across(all_of(prob_columns), sum), .groups = "drop") |>
rowwise() |>
mutate(denominator = sum(exp(c_across(all_of(prob_columns))))) |>
mutate(across(all_of(prob_columns), \(x) exp(x)/denominator),
classification = find_classification(c_across(all_of(prob_columns)),
join_prokaryotes = join_prokaryotes)) |>
select(-denominator)

## A bin may have no classified contigs if all contigs are below the minimum
## Tiara length threshold
unclassified_bins <- contig2bin$BinID[!(contig2bin$BinID %in% softmax_probabilities$BinID)]

## Assign these as unclassified
if(length(unclassified_bins) > 0) {
for(bin in unclassified_bins) {
if(join_prokaryotes) {
row <- tibble(
BinID = bin, Prokarya = NA, Eukarya = NA, Organelle = NA, Unknown = NA, classification = "unknown"
)
} else {
row <- tibble(
BinID = bin, Bacteria = NA, Archaea = NA, Eukarya = NA, Organelle = NA, Unknown = NA, classification = "unknown"
)
}
softmax_probabilities <- bind_rows(softmax_probabilities, row)
}
}

return(softmax_probabilities)
}

classifications <- read_tsv(args$classification_file, na = c("NA", "n/a"))
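## contig_to_bin is read as contig<TAB>bin with no header row, e.g. (illustrative):
## k141_5678	MEGAHIT-MaxBin2-sample1.001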
contig_to_bin <- read_tsv(args$contig_to_bin, col_names = c("sequence_id", "BinID"))

results <- classify_bins(tiara = classifications,
contig2bin = contig_to_bin,
join_prokaryotes = args$join_prokaryotes,
assembler = args$assembler)

## Keep just the classifications so downstream steps can iterate over them more easily
results_basic <- select(results, BinID, classification)

## write outputs
write_tsv(results, paste0(args$output_prefix, ".binclassification.tsv"))
write_tsv(results_basic, "bin2classification.tsv", col_names = FALSE)

## write out package versions
packageVersion("tidyverse") |> as.character() |> writeLines("tidyverse_version.txt")
12 changes: 6 additions & 6 deletions bin/split_fasta.py
@@ -45,10 +45,10 @@
)
# contigs to retain and pool
elif length >= min_length_to_retain_contig:
pooled.append(SeqRecord(Seq(sequence, generic_dna), id=name))
pooled.append(SeqRecord(Seq(sequence, generic_dna), id=name, description=""))
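# description="" stops Biopython writing "<unknown description>" after the id in the
# FASTA header (applies to all SeqRecord calls changed below)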
# remaining sequences
else:
remaining.append(SeqRecord(Seq(sequence, generic_dna), id=name))
remaining.append(SeqRecord(Seq(sequence, generic_dna), id=name, description=""))
else:
with open(input_file) as f:
fasta_sequences = SeqIO.parse(f, "fasta")
@@ -64,10 +64,10 @@
)
# contigs to retain and pool
elif length >= min_length_to_retain_contig:
pooled.append(SeqRecord(Seq(sequence, generic_dna), id=name))
pooled.append(SeqRecord(Seq(sequence, generic_dna), id=name, description=""))
# remaining sequences
else:
remaining.append(SeqRecord(Seq(sequence, generic_dna), id=name))
remaining.append(SeqRecord(Seq(sequence, generic_dna), id=name, description=""))

# Sort sequences above threshold by length
df_above_threshold.sort_values(by=["length"], ascending=False, inplace=True)
@@ -77,10 +77,10 @@
for index, row in df_above_threshold.iterrows():
if index + 1 <= max_sequences:
print("write " + out_base + "." + str(index + 1) + ".fa")
out = SeqRecord(Seq(row["seq"], generic_dna), id=row["id"])
out = SeqRecord(Seq(row["seq"], generic_dna), id=row["id"], description="")
SeqIO.write(out, out_base + "." + str(index + 1) + ".fa", "fasta")
else:
pooled.append(SeqRecord(Seq(row["seq"], generic_dna), id=row["id"]))
pooled.append(SeqRecord(Seq(row["seq"], generic_dna), id=row["id"], description=""))

print("write " + out_base + ".pooled.fa")
SeqIO.write(pooled, out_base + ".pooled.fa", "fasta")
46 changes: 36 additions & 10 deletions conf/modules.config
@@ -295,23 +295,14 @@ process {
]
}

withName: 'MAG_DEPTHS_PLOT|MAG_DEPTHS_SUMMARY|MAG_DEPTHS_PLOT_REFINED' {
withName: 'MAG_DEPTHS_PLOT|MAG_DEPTHS_SUMMARY' {
publishDir = [
path: { "${params.outdir}/GenomeBinning/depths/bins" },
mode: params.publish_dir_mode,
pattern: "*.{png,tsv}"
]
}

withName: 'MAG_DEPTHS_SUMMARY_REFINED' {
ext.prefix = "bin_refined_depths_summary"
publishDir = [
path: { "${params.outdir}/GenomeBinning/depths/bins" },
mode: params.publish_dir_mode,
pattern: "*.{tsv}"
]
}

withName: 'BIN_SUMMARY' {
publishDir = [
path: { "${params.outdir}/GenomeBinning" },
@@ -656,6 +647,10 @@ process {
ext.prefix = { "${meta.assembler}-MaxBin2-${meta.id}" }
}

withName: DASTOOL_FASTATOCONTIG2BIN_TIARA {
ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.id}" }
}

withName: DASTOOL_DASTOOL {
publishDir = [
[
@@ -684,6 +679,37 @@ process {
]
}

withName: TIARA_TIARA {
publishDir = [
[
path: { "${params.outdir}/Taxonomy/Tiara" },
mode: params.publish_dir_mode,
pattern: { "${meta.assembler}-${meta.id}.tiara.{txt}" }
],
[
path: { "${params.outdir}/Taxonomy/Tiara/log" },
mode: params.publish_dir_mode,
pattern: { "log_${meta.assembler}-${meta.id}.tiara.{txt}" }
]
]
ext.args = { "--min_len ${params.tiara_min_length} --probabilities" }
ext.prefix = { "${meta.assembler}-${meta.id}.tiara" }
}

withName: TIARA_CLASSIFY {
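// ext.args here are forwarded to bin/domain_classification.R (see that script's optparse options)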
ext.args = { "--join_prokaryotes --assembler ${meta.assembler}" }
ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.bin}-${meta.id}" }
}

withName: TIARA_SUMMARY {
publishDir = [
path: { "${params.outdir}/Taxonomy/" },
mode: params.publish_dir_mode,
pattern: "tiara_summary.tsv"
]
ext.prefix = "tiara_summary"
}

withName: CUSTOM_DUMPSOFTWAREVERSIONS {
publishDir = [
path: { "${params.outdir}/pipeline_info" },
7 changes: 4 additions & 3 deletions conf/test_adapterremoval.config
@@ -11,16 +11,16 @@
*/

params {
config_profile_name = 'Test profile for running with AdapterRemoval'
config_profile_description = 'Minimal test dataset to check pipeline function with AdapterRemoval data'
config_profile_name = 'Test profile for running with AdapterRemoval and domain classification'
config_profile_description = 'Minimal test dataset to check pipeline function with AdapterRemoval data and domain classification.'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'

// Input data
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.csv'
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.euk.csv'
centrifuge_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_cf.tar.gz"
kraken2_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_kraken.tgz"
skip_krona = true
@@ -30,4 +30,5 @@ params {
gtdb = false
clip_tool = 'adapterremoval'
skip_concoct = true
bin_domain_classification = true
}
34 changes: 0 additions & 34 deletions conf/test_no_clipping.config

This file was deleted.

Binary file modified docs/images/mag_workflow.png
