Update aggregate organisms (#52)
* Update phippery

* Use k-mers to filter overlapping peptides

* summarize_by_organism requires output_wide_csv

* Summarizing by organism requires zscores

* Keep the sequence
sminot authored Oct 31, 2022
1 parent 9a940da commit 87b0ed3
Showing 9 changed files with 39 additions and 113 deletions.
14 changes: 2 additions & 12 deletions ._wb/tool/phip-flow/config.json
@@ -90,22 +90,12 @@
"peptide_org_col": {
"help": "Column in the peptide table indicating the organism for each peptide",
"wb_type": "string",
"default": "Strain"
},
"peptide_prot_col": {
"help": "Column in the peptide table indicating the protein for each peptide",
"wb_type": "string",
"default": "Protein"
},
"peptide_pos_col": {
"help": "Column in the peptide table indicating the position within the protein for each peptide",
"wb_type": "string",
"default": "Prot_Start"
"default": "organism"
},
"peptide_seq_col": {
"help": "Column in the peptide table containing the peptide sequence (used to match against public epitopes)",
"wb_type": "string",
"default": "Prot"
"default": "peptide"
},
"max_overlap": {
"help": "Maximum allowed overlap between detected peptides",
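The two options kept in this block name columns in the user-supplied peptide table: `peptide_org_col` groups peptides by organism, and `peptide_seq_col` supplies the amino-acid sequence used both for public-epitope matching and for the new overlap filter. A minimal sketch of such a table, with made-up organisms, sequences, and column names chosen only for illustration:

```python
import pandas as pd

# Hypothetical peptide table; the column names below are illustrative and would
# be the values passed via peptide_org_col / peptide_seq_col.
peptide_table = pd.DataFrame(
    {
        "organism": ["Virus A", "Virus A", "Virus B"],
        "peptide": ["MKVLLAIFGAQ", "LAIFGAQVSRD", "GGGTTTPPPQQ"],
    }
)
print(peptide_table)
```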
2 changes: 1 addition & 1 deletion README.md
@@ -22,5 +22,5 @@ Launch the pipeline execution with the following command:
nextflow run matsengrp/phip-flow -profile docker

Note: the [Dockerfile](docker/Dockerfile) contains all the required dependencies.
Add the `-profile docker` to enable the containerised execution to the
Add the `-profile docker` to enable the containerized execution to the
example command line shown below.
10 changes: 7 additions & 3 deletions bin/merge-counts-stats.py
@@ -1,10 +1,7 @@
#!/usr/bin/env python
"""
"""

import pandas as pd
import numpy as np
import phippery
from phippery.utils import *
import argparse
import glob
@@ -92,6 +89,13 @@ def num(s):
right_index=True
)

# Add more summary metrics per sample
sample_table = sample_table.assign(
percent_mapped=sample_table["reads_mapped"] / sample_table["raw_total_sequences"] * 100.,
percent_peptides_detected=(merged_counts > 0).mean() * 100.,
percent_peptides_between_10_and_100=merged_counts.applymap(lambda v: 10 <= v <= 100).mean() * 100.,
)

ds = stitch_dataset(
counts=merged_counts,
peptide_table=peptide_table,
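A self-contained sketch of the two peptide-level metrics added above, using a made-up counts matrix (sample and peptide names are invented); `percent_mapped` additionally needs the `reads_mapped` and `raw_total_sequences` columns of the sample table, so it is omitted here. Note the chained comparison: writing `v >= 10 & v <= 100` would bind `&` before the comparisons and give the wrong answer.

```python
import pandas as pd

# Toy counts matrix: rows are peptides, columns are samples (hypothetical values).
merged_counts = pd.DataFrame(
    {"sample_1": [0, 5, 50, 200], "sample_2": [12, 0, 0, 80]},
    index=["pep_1", "pep_2", "pep_3", "pep_4"],
)

# Fraction of peptides with any counts, per sample
percent_peptides_detected = (merged_counts > 0).mean() * 100.0

# Fraction of peptides with counts in the 10-100 range, per sample
percent_peptides_between_10_and_100 = (
    merged_counts.applymap(lambda v: 10 <= v <= 100).mean() * 100.0
)

print(percent_peptides_detected)
print(percent_peptides_between_10_and_100)
```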
46 changes: 0 additions & 46 deletions docker/Dockerfile

This file was deleted.

11 changes: 0 additions & 11 deletions docker/README.md

This file was deleted.

18 changes: 7 additions & 11 deletions nextflow.config
@@ -73,17 +73,11 @@ params{
summarize_by_organism = false

// Column in the peptide table indicating the organism for each peptide
peptide_org_col = "Strain"

// Column in the peptide table indicating the protein for each peptide
peptide_prot_col = "Protein"

// Column in the peptide table indicating the position within the protein for each peptide
peptide_pos_col = "Prot_Start"
peptide_org_col = "organism"

// Column in the peptide table containing the peptide sequence
// (used to match against public epitopes)
peptide_seq_col = "Prot"
// (used to match against public epitopes, and to filter overlapping peptides)
peptide_seq_col = "seq"

// Maximum allowed overlap between detected peptides
max_overlap = 7
@@ -102,6 +96,10 @@

}

// Set the container which can be used for all processes
process {
container = 'quay.io/hdc-workflows/phippery:1.1.4'
}

profiles {

@@ -112,12 +110,10 @@
// Run locally assuming docker is installed with the latest image
docker {
docker.enabled = true
process.container = 'quay.io/jgallowa/phip-flow:latest'
}

// Run batch submission assuming docker is installed with the latest image
cluster {
process.container = 'quay.io/jgallowa/phip-flow:latest'

singularity {
enabled = true
47 changes: 20 additions & 27 deletions templates/aggregate_organisms.py
@@ -21,7 +21,10 @@
# Public: marked as TRUE if the epitope was included in the input list of public epitopes

# 3. To combine the virus-level data for each sample, only keep the highest-scoring
# set of epitopes which do not overlap with any other epitope by more than 7aa
# set of epitopes which do not overlap with any other epitope by more than 7aa.
# To identify overlaps, use an exact alignment approach using k-mers. Note that this
# will count two peptides as overlapping if they share any 7aa sequence, without
# performing global alignment.

# 4. Finally, save a table with the following information per virus, per sample:
# Number of all epitope hits
Expand Down Expand Up @@ -143,10 +146,6 @@ def read_peptide_mapping(self) -> pd.DataFrame:
mapping = {
# The user must specify the column used to group peptides by organism
"!{params.peptide_org_col}": "organism",
# By protein name
"!{params.peptide_prot_col}": "protein",
# By the starting position of the peptide within the protein
"!{params.peptide_pos_col}": "position",
# And by the protein sequence (which corresponds to the public epitope sequences)
"!{params.peptide_seq_col}": "seq"
}
@@ -176,11 +175,9 @@ def read_peptide_mapping(self) -> pd.DataFrame:

self.logger.info(f"Public Epitopes: {df['public'].sum():,} / {df.shape[0]:,}")

# Drop the sequence, but keep the length of the peptide
# Add the length of the peptide
df = df.assign(
peptide_length=lambda d: d["seq"].apply(len)
).drop(
columns=["seq"]
)

return df
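A rough sketch of what `read_peptide_mapping` amounts to after this change, under the assumption that the public epitopes are available as a plain set of sequences and that the public flag is an exact sequence match (the real implementation may differ): rename the user's columns to `organism` and `seq`, flag public epitopes, and add `peptide_length` while keeping `seq` for the k-mer filter.

```python
import pandas as pd

def build_peptide_mapping(
    peptide_table: pd.DataFrame,
    org_col: str,
    seq_col: str,
    public_epitopes: set,
) -> pd.DataFrame:
    """Sketch of the renaming and annotation steps; inputs are assumptions."""

    # Rename the user-specified columns to the internal names
    df = peptide_table.rename(columns={org_col: "organism", seq_col: "seq"})

    # Flag public epitopes (here by exact sequence match), and keep the
    # sequence so that downstream steps can derive k-mers from it
    return df.assign(
        public=df["seq"].isin(public_epitopes),
        peptide_length=df["seq"].apply(len),
    )

# Example with the pre-commit default column names shown in this diff
mapping = build_peptide_mapping(
    pd.DataFrame({"Strain": ["Virus A"], "Prot": ["MKVLLAIFGAQ"]}),
    org_col="Strain",
    seq_col="Prot",
    public_epitopes={"MKVLLAIFGAQ"},
)
```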
@@ -279,39 +276,35 @@ def group_organisms(self) -> pd.DataFrame:
def group_sample_organisms(self, df:pd.DataFrame, sample:str, organism:str) -> pd.DataFrame:
"""Analyze the data for a single sample, single organism."""

# Add the protein, position, and length information for each peptide
# Add the sequence information for each peptide
df = df.assign(
protein=df["peptide"].apply(
self.peptide_mapping["protein"].get,
),
position=df["peptide"].apply(
self.peptide_mapping["position"].get,
),
peptide_length=df["peptide"].apply(
self.peptide_mapping["peptide_length"].get,
seq=df["peptide"].apply(
self.peptide_mapping["seq"].get
).apply(
lambda s: s.rstrip("*")
)
)

# Sort by EBS (descending)
df = df.sort_values(by="EBS", ascending=False)

# Keep track of which positions have been covered
covered_positions = defaultdict(set)
# Keep track of the peptide kmers which have been observed so far
kmers_seen = set()

# Make a list of the indices which will be dropped
to_drop = list()

# Go down the list, starting with the tightest binders
for i, r in df.iterrows():

# Get the positions covered by this peptide
row_pos = set(range(r["position"], r["position"] + r["peptide_length"]))

# Get the number of overlapping positions
n_overlap = len(covered_positions[r["protein"]] & row_pos)
# Get the k-mers covered by this peptide
row_kmers = set([
r["seq"][n:(n + self.max_overlap)]
for n in range(len(r["seq"]) - self.max_overlap + 1)
])

# If the maximum overlap threshold is exceeded
if n_overlap >= self.max_overlap:
# If any of those kmers have been seen before
if len(row_kmers & kmers_seen) > 0:

# Drop the row
to_drop.append(i)
Expand All @@ -320,7 +313,7 @@ def group_sample_organisms(self, df:pd.DataFrame, sample:str, organism:str) -> p
else:

# Add this peptide's k-mers to the set already seen
covered_positions[r["protein"]] |= row_pos
kmers_seen |= row_kmers

df = df.drop(index=to_drop)

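The heart of this commit is the switch from position-based overlap tracking to k-mer matching in `group_sample_organisms`. Below is a standalone sketch of that greedy filter, using the `seq` and `EBS` columns shown in the diff and a `max_overlap` of 7 (the default in `nextflow.config`); the example peptides and scores are made up.

```python
import pandas as pd

def filter_overlapping_peptides(df: pd.DataFrame, max_overlap: int = 7) -> pd.DataFrame:
    """Greedy k-mer filter sketched from the diff above.

    Expects a `seq` column and an `EBS` score column; keeps the highest-scoring
    peptides and drops any later peptide sharing a k-mer of length `max_overlap`
    with a peptide that was already kept.
    """

    # Start with the tightest binders
    df = df.sort_values(by="EBS", ascending=False)

    kmers_seen = set()
    to_drop = []

    for i, r in df.iterrows():

        seq = r["seq"].rstrip("*")

        # All k-mers of length `max_overlap` in this peptide
        row_kmers = {
            seq[n:n + max_overlap]
            for n in range(len(seq) - max_overlap + 1)
        }

        # Sharing any k-mer with a kept peptide counts as an overlap
        if row_kmers & kmers_seen:
            to_drop.append(i)
        else:
            kmers_seen |= row_kmers

    return df.drop(index=to_drop)


# Example usage with made-up peptides and scores
example = pd.DataFrame(
    {
        "seq": ["MKVLLAIFGAQ", "LAIFGAQVSRD", "GGGTTTPPPQQ"],
        "EBS": [9.1, 4.2, 3.3],
    }
)
print(filter_overlapping_peptides(example, max_overlap=7))
```

Because the filter works directly on shared subsequences, it no longer needs protein coordinates, which is why the `peptide_prot_col` and `peptide_pos_col` options could be dropped elsewhere in this commit.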
2 changes: 1 addition & 1 deletion workflows/output.nf
@@ -18,7 +18,7 @@ process dump_wide_csv {
publishDir "$params.results/wide_data/", mode: 'copy', overwrite: true
input: path phip_data
output: path "*.csv"
when: params.output_wide_csv
when: params.output_wide_csv || params.summarize_by_organism
shell:
"""
phippery to-wide-csv -o $params.dataset_prefix $phip_data
2 changes: 1 addition & 1 deletion workflows/statistics.nf
@@ -85,7 +85,7 @@ process fit_predict_neg_binom {
process fit_predict_zscore {
input: path phip_data
output: path "fit_predict_zscore.phip"
when: params.run_zscore_fit_predict
when: params.run_zscore_fit_predict || params.summarize_by_organism
shell:
"""
fit-predict-zscore.py \
