Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix/Create mock data subworkflow #206

Merged
merged 11 commits into from
Jan 5, 2022
2 changes: 1 addition & 1 deletion .github/workflows/linting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install nf-core
pip install nf-core==2.0.1

- name: Run nf-core lint
env:
Expand Down
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,11 @@ test_environment: tests/requirements.txt
image: Dockerfile
docker build . -t jason-c-kwan/autometa:`git branch --show-current`

## Build necessary docker images for nextflow modules from Dockerfiles in docker/modules
modules-images: docker/modules/get_genomes_for_mock.Dockerfile docker/modules/mock_data_reporter.Dockerfile
docker build docker/modules/ -t jason-c-kwan/autometa-nf-modules-get_genomes_for_mock -f docker/modules/get_genomes_for_mock.Dockerfile
docker build docker/modules/ -t jason-c-kwan/autometa-nf-modules-mock_data_reporter -f docker/modules/mock_data_reporter.Dockerfile

## Build documentation for autometa.readthedocs.io
docs:
make clean html -C docs
Expand Down
18 changes: 18 additions & 0 deletions bin/clean_mock_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash

# Combine downloaded genomes into a single fasta file
zcat **/*.fna.gz > combined_nucleotide.fna

# Use EMBOSS's 'splitter' to create "contigs" of specified size
splitter -size 5000 combined_nucleotide.fna -outseq metagenome.fna

gzip metagenome.fna
gzip combined_nucleotide.fna

# get assembly accessions/locus
zgrep ">" **/*.fna.gz | sed 's/\s.*$//' | sed -E 's/[^\/]*.>//' > assembly_to_locus.txt #TODO: delimit better
# get assembly accessions
zgrep ">" **/*.fna.gz | cut -d_ -f1,2 | uniq > assemblies.txt

# Add fake spades coverage to deflines
zcat "metagenome.fna.gz" | awk '/^>/ {$0=$1} 1' | sed 's/>.*/&_length_1_cov_1/' | gzip > fake_spades.fna.gz
9 changes: 9 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ params {
publish_by_meta = ['id']
publish_dir = "diamond_blastp_results"
}
'get_genomes_for_mock' {
args = "https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt"
args2 = 'GCF_000734955.1|GCF_900448115.1|GCF_015751765.1'
publish_files = false
}
'hmmsearch_options' {
args = "-Z 150 --cpu 1 --seed 42"
args2 = ""
Expand All @@ -60,6 +65,10 @@ params {
publish_by_meta = ['id']
publish_dir = "kmers_normalized"
}
'mock_data_report'{
publish_by_meta = ['id']
publish_dir = "mock_data_reports"
}
'prodigal_options' {
publish_by_meta = ['id']
args = "-p meta -m"
Expand Down
23 changes: 23 additions & 0 deletions docker/modules/get_genomes_for_mock.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
FROM continuumio/miniconda3
LABEL maintainer="jason.kwan@wisc.edu"

RUN apt-get update --allow-releaseinfo-change \
&& apt-get install -y procps curl rsync \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

RUN conda install -c bioconda emboss \
&& conda clean --all -y

# Check entrypoints are available
RUN echo "Checking get_genomes_for_mock.nf module entrypoints" \
&& splitter -help > /dev/null \
&& curl -h > /dev/null \
&& gzip --help > /dev/null \
&& rsync --help > /dev/null \
&& xargs --help > /dev/null \
&& cut --help > /dev/null \
&& sed --help > /dev/null \
&& uniq --help > /dev/null \
&& zgrep --help > /dev/null \
&& zcat --help > /dev/null
17 changes: 17 additions & 0 deletions docker/modules/mock_data_reporter.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
FROM continuumio/miniconda3
LABEL maintainer="jason.kwan@wisc.edu"

RUN apt-get update --allow-releaseinfo-change \
&& apt-get install -y procps \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

RUN conda install -c r r-ggplot2 r-stringi \
&& conda install -c conda-forge r-rmarkdown r-data.table r-ggplot2 r-plotly r-crosstalk r-dt pandoc r-r.utils r-ggbeeswarm r-patchwork \
&& conda install -c bioconda r-magrittr \
&& conda clean --all -y

# Check entrypoints are available
RUN echo "Checking mock_data_reporter.nf module entrypoints" \
&& Rscript --help > /dev/null \
&& Rscript -e "packages <- c('markdown','data.table', 'ggplot2', 'plotly', 'crosstalk', 'magrittr', 'DT', 'stringi', 'ggbeeswarm', 'patchwork');for (i in packages) {library(i, character.only = T)}"
201 changes: 201 additions & 0 deletions lib/mock_data_report.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
---
title: "Autometa Mock Data Report"
output: html_document
params:
bins_path: "binning.main.tsv"
assembly_to_locus_path: "assembly_to_locus.txt"
assembly_report_path: "assembly_report.txt"
genus: TRUE
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE, warning=FALSE, error=TRUE)
```


```{r message=FALSE, warning=FALSE}

packages <- c("data.table", "ggplot2", "plotly", "crosstalk", "magrittr", "DT", "ggbeeswarm", "patchwork")

for (i in packages) {
if (!requireNamespace(i)) {
install.packages(i)
}
library(i, character.only = T)
}
```


```{r read-and-clean-bins}
bins <- fread(params$bins_path)

bins$locus = vapply(bins$contig,
function(x){
paste(strsplit(x, "_")[[1]][1:2],collapse="_")
},
FUN.VALUE = "")
```

```{r read-and-clean-assembly-to-locus}
assembly_to_locus <- fread(
params$assembly_to_locus_path,
sep = "/",
header = F,
col.names = c("assembly", "locus"))

assembly_to_locus$assembly = vapply(assembly_to_locus$assembly,
function(x) {
paste(strsplit(x, "_")[[1]][1:2],
collapse = "_")
},
FUN.VALUE = "")
```


```{r read-and-clean-assembly-report, message=FALSE, warning=FALSE}
assembly_report <- fread(params$assembly_report_path)
assembly_report$genus <- gsub("\\s.*", "", assembly_report$organism_name)
```


```{r merge-all}
bins <- merge(bins, assembly_to_locus, by = "locus", all.x = T, all.y = F)
bins <- merge(bins, assembly_report, by.x = "assembly", by.y ="# assembly_accession", all.x = T, all.y = F)
```


## Summary

Summary information about the binning data

```{r summary}

summary_table <- bins[,
list(seqs_in_assembly = length(unique(locus)),
fake_contigs = length(unique(contig)),
genus = unique(genus),
taxid = unique(taxid),
name = unique(organism_name)),
assembly]
summary_table <- summary_table[order(genus), ]

DT::datatable(summary_table, options = list(
columnDefs = list(list(className = 'dt-center', targets = "_all"))))

```



```{r plotly-crosstalk-key}
bins_copy <- bins
bins <- highlight_key(bins, ~contig)
```


```{r plotly-xy}
if (params$genus) {
plotly_xy <- plot_ly(bins,
x = ~x_1,
y = ~x_2,
color = ~genus,
hoverinfo ="text",
hovertext = paste("Assembly:", bins_copy$assembly,
"<br> locus:", bins_copy$locus,
"<br> contig:", bins_copy$contig)) %>%
add_markers()

} else {
plotly_xy <- plot_ly(bins,
x = ~x_1,
y = ~x_2,
color = ~assembly,
hoverinfo ="text",
hovertext = paste("Assembly:", bins_copy$assembly,
"<br> locus:", bins_copy$locus,
"<br> contig:", bins_copy$contig)) %>%
add_markers()
}


```


```{r plotly-xyz}
if (params$genus) {
plotly_xyz <- plot_ly(bins,
x = ~x_1,
y = ~x_2,
z = ~gc_content,
color = ~genus,
hoverinfo ="text",
hovertext = paste("Assembly:", bins_copy$assembly,
"<br> locus:", bins_copy$locus,
"<br> contig:", bins_copy$contig))%>%
add_markers()
} else {

plotly_xyz <- plot_ly(bins,
x = ~x_1,
y = ~x_2,
z = ~gc_content,
color = ~assembly,
hoverinfo ="text",
hovertext = paste("Assembly:", bins_copy$assembly,
"<br> locus:", bins_copy$locus,
"<br> contig:", bins_copy$contig))%>%
add_markers()
}

```

## Plots

The plots below are linked ("Brushing"/Selecting points in the 2D plot will highlight points in the 3D plot).


```{r message=FALSE, warning=FALSE, fig.height=12, fig.width=12}

fig <- subplot(plotly_xy, plotly_xyz,nrows = 2)

fig %>% layout(
xaxis = list(domain=list(x=c(0,0.5),y=c(0,0.5))),

scene = list(domain=list(x=c(0,1),y=c(0,0.5))),

xaxis2 = list(domain=list(x=c(0,.5),y=c(0,0.5))),

showlegend=TRUE, showlegend2=FALSE) %>%
highlight(on = "plotly_selected", dynamic = TRUE)#, selectize = TRUE)

```

```{r}
if (params$genus) {
p1 <- ggplot(bins, aes(x = genus, y = x_1)) +
ggbeeswarm::geom_quasirandom(aes(color= genus), alpha=0.3) +
theme(legend.position="none", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

p2 <- ggplot(bins, aes(x = genus, y = x_2)) +
ggbeeswarm::geom_quasirandom(aes(color= genus), alpha=0.3) +
theme(legend.position="none", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))


p3 <- ggplot(bins, aes(x = genus, y = gc_content)) +
ggbeeswarm::geom_quasirandom(aes(color= genus), alpha=0.3) +
theme(legend.position="none", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
} else {
p1 <- ggplot(bins, aes(x = assembly, y = x_1)) +
ggbeeswarm::geom_quasirandom(aes(color= assembly), alpha=0.3) +
theme(legend.position="none", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

p2 <- ggplot(bins, aes(x = assembly, y = x_2)) +
ggbeeswarm::geom_quasirandom(aes(color= assembly), alpha=0.3) +
theme(legend.position="none", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))


p3 <- ggplot(bins, aes(x = assembly, y = gc_content)) +
ggbeeswarm::geom_quasirandom(aes(color= assembly), alpha=0.3) +
theme(legend.position="none", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
}
p1+p2+p3+ plot_layout(guides = "collect")
```
38 changes: 38 additions & 0 deletions modules/local/get_genomes_for_mock.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// Import generic module functions
include { initOptions; saveFiles; getSoftwareName } from './functions'

params.options = [:]
options = initOptions(params.options)

process GET_GENOMES_FOR_MOCK {
def genome_count = options.args2.tokenize('|').size()
tag "fetching ${genome_count} genomes"
conda (params.enable_conda ? "bioconda::emboss=6.6.0" : null)
container "jason-c-kwan/autometa-nf-modules-get_genomes_for_mock"

output:
path "metagenome.fna.gz", emit: metagenome
path "combined_nucleotide.fna.gz", emit: combined_nucleotide
path "fake_spades.fna.gz", emit: fake_spades_coverage
path "assembly_to_locus.txt", emit: assembly_to_locus
path "assemblies.txt", emit: assemblies
path "assembly_report.txt", emit: assembly_report
"""
curl -s ${options.args} > assembly_report.txt

grep -E "${options.args2}" assembly_report.txt |\\
awk -F '\\t' '{print \$20}' |\\
sed 's,https://,rsync://,' |\\
xargs -n 1 -I {} \
rsync -am \
--exclude='*_rna_from_genomic.fna.gz' \
--exclude='*_cds_from_genomic.fna.gz' \
--include="*_genomic.fna.gz" \
--include="*_protein.faa.gz" \
--include='*/' \
--exclude='*' {} .

# "clean_mock_data.sh" is here: ~/Autometa/bin/clean_mock_data.sh
clean_mock_data.sh
"""
}
Loading