From 87b0ed33df81924116840c9f88413fd411ff6e88 Mon Sep 17 00:00:00 2001 From: Sam Minot Date: Mon, 31 Oct 2022 12:23:04 -0700 Subject: [PATCH] Update aggregate organisms (#52) * Update phippery * User k-mers to filter overlapping peptides * summarize_by_organism requires output_wide_csv * Summarizing by organism requires zscores * Keep the sequence --- ._wb/tool/phip-flow/config.json | 14 ++-------- README.md | 2 +- bin/merge-counts-stats.py | 10 +++++-- docker/Dockerfile | 46 ------------------------------- docker/README.md | 11 -------- nextflow.config | 18 +++++------- templates/aggregate_organisms.py | 47 ++++++++++++++------------------ workflows/output.nf | 2 +- workflows/statistics.nf | 2 +- 9 files changed, 39 insertions(+), 113 deletions(-) delete mode 100644 docker/Dockerfile delete mode 100644 docker/README.md diff --git a/._wb/tool/phip-flow/config.json b/._wb/tool/phip-flow/config.json index 2a1dfec..efc7877 100644 --- a/._wb/tool/phip-flow/config.json +++ b/._wb/tool/phip-flow/config.json @@ -90,22 +90,12 @@ "peptide_org_col": { "help": "Column in the peptide table indicating the organism for each peptide", "wb_type": "string", - "default": "Strain" - }, - "peptide_prot_col": { - "help": "Column in the peptide table indicating the protein for each peptide", - "wb_type": "string", - "default": "Protein" - }, - "peptide_pos_col": { - "help": "Column in the peptide table indicating the position within the protein for each peptide", - "wb_type": "string", - "default": "Prot_Start" + "default": "organism" }, "peptide_seq_col": { "help": "Column in the peptide table containing the peptide sequence (used to match against public epitopes)", "wb_type": "string", - "default": "Prot" + "default": "peptide" }, "max_overlap": { "help": "Maximum allowed overlap between detected peptides", diff --git a/README.md b/README.md index 509ff64..c4cfdb0 100644 --- a/README.md +++ b/README.md @@ -22,5 +22,5 @@ Launch the pipeline execution with the following command: nextflow run matsengrp/phip-flow -profile docker Note: the [Dockerfile](docker/Dockerfile) contains all the required dependencies. -Add the `-profile docker` to enable the containerised execution to the +Add the `-profile docker` to enable the containerized execution to the example command line shown below. diff --git a/bin/merge-counts-stats.py b/bin/merge-counts-stats.py index d96212e..d28a1f6 100755 --- a/bin/merge-counts-stats.py +++ b/bin/merge-counts-stats.py @@ -1,10 +1,7 @@ #!/usr/bin/env python -""" -""" import pandas as pd import numpy as np -import phippery from phippery.utils import * import argparse import glob @@ -92,6 +89,13 @@ def num(s): right_index=True ) + # Add more summary metrics per sample + sample_table = sample_table.assign( + percent_mapped=sample_table["reads_mapped"] / sample_table["raw_total_sequences"] * 100., + percent_peptides_detected=(merged_counts > 0).mean() * 100., + percent_peptides_between_10_and_100=merged_counts.applymap(lambda v: v >= 10 & v <= 100).mean() * 100., + ) + ds = stitch_dataset( counts=merged_counts, peptide_table=peptide_table, diff --git a/docker/Dockerfile b/docker/Dockerfile deleted file mode 100644 index 650bb41..0000000 --- a/docker/Dockerfile +++ /dev/null @@ -1,46 +0,0 @@ -FROM quay.io/hdc-workflows/ubuntu:20.04 - -# bust cache -ADD http://date.jsontest.com /etc/builddate - -LABEL maintainer "Jared Galloway " \ - version "1.4" \ - description "Common PhIP-Seq Workflows" - -# install needed tools -RUN apt-get update --fix-missing -qq && \ - DEBIAN_FRONTEND=noninteractive \ - apt-get install -y -q \ - git \ - curl \ - locales \ - libncurses5-dev \ - libncursesw5-dev \ - build-essential \ - pkg-config \ - zlib1g-dev \ - python3 \ - python3-pip \ - python3-venv \ - zip \ - wget - -ENV VIRTUAL_ENV=/opt/venv -RUN python3 -m venv $VIRTUAL_ENV -ENV PATH="$VIRTUAL_ENV/bin:$PATH" - -# install phippery -RUN pip install phippery==1.1.0 - -# install pre-build binary Bowtie1.3 -RUN curl -fksSL https://sourceforge.net/projects/bowtie-bio/files/bowtie/1.3.1/bowtie-1.3.1-linux-x86_64.zip \ - --output bowtie-1.3.1-linux-x86_64.zip \ - && unzip bowtie-1.3.1-linux-x86_64.zip \ - && (cd /usr/bin/ && ln -s /bowtie-1.3.1-linux-x86_64/* ./) - - -# install SAMtools -RUN curl -fksSL https://github.com/samtools/samtools/releases/download/1.3.1/samtools-1.3.1.tar.bz2 | tar xj && \ - cd samtools-1.3.1 && \ - make all all-htslib && make install install-htslib - diff --git a/docker/README.md b/docker/README.md deleted file mode 100644 index 64cfe57..0000000 --- a/docker/README.md +++ /dev/null @@ -1,11 +0,0 @@ -= Docker container - -The pipline needs the following tools: - -- http://www.htslib.org/[samtools] - -A Docker container with all tools except the Genome Analysis Toolkit can be built from the `Dockerfile` present in this folder. - -A prebuilt Docker container is available at the Docker hub as `cbcrg/ngs2017ws-nf`. - -For the Genome Analysis Toolkit we use `matsengrp/phippery`. diff --git a/nextflow.config b/nextflow.config index 713beaa..fec89b3 100644 --- a/nextflow.config +++ b/nextflow.config @@ -73,17 +73,11 @@ params{ summarize_by_organism = false // Column in the peptide table indicating the organism for each peptide - peptide_org_col = "Strain" - - // Column in the peptide table indicating the protein for each peptide - peptide_prot_col = "Protein" - - // Column in the peptide table indicating the position within the protein for each peptide - peptide_pos_col = "Prot_Start" + peptide_org_col = "organism" // Column in the peptide table containing the peptide sequence - // (used to match against public epitopes) - peptide_seq_col = "Prot" + // (used to match against public epitopes, and to filter overlapping peptides) + peptide_seq_col = "seq" // Maximum allowed overlap between detected peptides max_overlap = 7 @@ -102,6 +96,10 @@ params{ } +// Set the container which can be used for all processes +process { + container = 'quay.io/hdc-workflows/phippery:1.1.4' +} profiles { @@ -112,12 +110,10 @@ profiles { // Run locally assuming docker is installed with the latest image docker { docker.enabled = true - process.container = 'quay.io/jgallowa/phip-flow:latest' } // Run batch submission assuming docker is installed with the latest image cluster { - process.container = 'quay.io/jgallowa/phip-flow:latest' singularity { enabled = true diff --git a/templates/aggregate_organisms.py b/templates/aggregate_organisms.py index 9d267c6..0dbe4fa 100755 --- a/templates/aggregate_organisms.py +++ b/templates/aggregate_organisms.py @@ -21,7 +21,10 @@ # Public: marked as TRUE if the epitope was included in the input list of public epitopes # 3. To combine the virus-level data for each sample, only keep the highest-scoring -# set of epitopes which do not overlap with any other epitope by more than 7aa +# set of epitopes which do not overlap with any other epitope by more than 7aa. +# To identify overlaps, use an exact alignment approach using k-mers. Note that this +# will count two peptides as overlapping if they share any 7aa sequence, without +# performing global alignment. # 4. Finally, save a table with the following information per virus, per sample: # Number of all epitope hits @@ -143,10 +146,6 @@ def read_peptide_mapping(self) -> pd.DataFrame: mapping = { # The user must specify the column used to group peptides by organism "!{params.peptide_org_col}": "organism", - # By protein name - "!{params.peptide_prot_col}": "protein", - # By the starting position of the peptide within the protein - "!{params.peptide_pos_col}": "position", # And by the protein sequence (which corresponds to the public epitope sequences) "!{params.peptide_seq_col}": "seq" } @@ -176,11 +175,9 @@ def read_peptide_mapping(self) -> pd.DataFrame: self.logger.info(f"Public Epitopes: {df['public'].sum():,} / {df.shape[0]:,}") - # Drop the sequence, but keep the length of the peptide + # Add the length of the peptide df = df.assign( peptide_length=lambda d: d["seq"].apply(len) - ).drop( - columns=["seq"] ) return df @@ -279,24 +276,20 @@ def group_organisms(self) -> pd.DataFrame: def group_sample_organisms(self, df:pd.DataFrame, sample:str, organism:str) -> pd.DataFrame: """Analyze the data for a single sample, single organism.""" - # Add the protein, position, and length information for each peptide + # Add the sequence information for each peptide df = df.assign( - protein=df["peptide"].apply( - self.peptide_mapping["protein"].get, - ), - position=df["peptide"].apply( - self.peptide_mapping["position"].get, - ), - peptide_length=df["peptide"].apply( - self.peptide_mapping["peptide_length"].get, + seq=df["peptide"].apply( + self.peptide_mapping["seq"].get + ).apply( + lambda s: s.rstrip("*") ) ) # Sort by EBS (descending) df = df.sort_values(by="EBS", ascending=False) - # Keep track of which positions have been covered - covered_positions = defaultdict(set) + # Keep track of the peptide kmers which have been observed so far + kmers_seen = set() # Make a list of the indices which will be dropped to_drop = list() @@ -304,14 +297,14 @@ def group_sample_organisms(self, df:pd.DataFrame, sample:str, organism:str) -> p # Go down the list, starting with the tightest binders for i, r in df.iterrows(): - # Get the positions covered by this peptide - row_pos = set(range(r["position"], r["position"] + r["peptide_length"])) - - # Get the number of overlapping positions - n_overlap = len(covered_positions[r["protein"]] & row_pos) + # Get the kmers by this peptide + row_kmers = set([ + r["seq"][n:(n + self.max_overlap)] + for n in range(len(r["seq"]) - self.max_overlap) + ]) - # If the maximum overlap threshold is exceeded - if n_overlap >= self.max_overlap: + # If any of those kmers have been seen before + if len(row_kmers & kmers_seen) > 0: # Drop the row to_drop.append(i) @@ -320,7 +313,7 @@ def group_sample_organisms(self, df:pd.DataFrame, sample:str, organism:str) -> p else: # Add the covered positions - covered_positions[r["protein"]] |= row_pos + kmers_seen |= row_kmers df = df.drop(index=to_drop) diff --git a/workflows/output.nf b/workflows/output.nf index 7a3385b..c58b65e 100644 --- a/workflows/output.nf +++ b/workflows/output.nf @@ -18,7 +18,7 @@ process dump_wide_csv { publishDir "$params.results/wide_data/", mode: 'copy', overwrite: true input: path phip_data output: path "*.csv" - when: params.output_wide_csv + when: params.output_wide_csv || params.summarize_by_organism shell: """ phippery to-wide-csv -o $params.dataset_prefix $phip_data diff --git a/workflows/statistics.nf b/workflows/statistics.nf index e455d70..48a7c86 100644 --- a/workflows/statistics.nf +++ b/workflows/statistics.nf @@ -85,7 +85,7 @@ process fit_predict_neg_binom { process fit_predict_zscore { input: path phip_data output: path "fit_predict_zscore.phip" - when: params.run_zscore_fit_predict + when: params.run_zscore_fit_predict || params.summarize_by_organism shell: """ fit-predict-zscore.py \