Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update aggregate organisms #52

Merged
merged 16 commits into from
Oct 31, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 2 additions & 12 deletions ._wb/tool/phip-flow/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -90,22 +90,12 @@
"peptide_org_col": {
"help": "Column in the peptide table indicating the organism for each peptide",
"wb_type": "string",
"default": "Strain"
},
"peptide_prot_col": {
"help": "Column in the peptide table indicating the protein for each peptide",
"wb_type": "string",
"default": "Protein"
},
"peptide_pos_col": {
"help": "Column in the peptide table indicating the position within the protein for each peptide",
"wb_type": "string",
"default": "Prot_Start"
"default": "organism"
},
"peptide_seq_col": {
"help": "Column in the peptide table containing the peptide sequence (used to match against public epitopes)",
"wb_type": "string",
"default": "Prot"
"default": "peptide"
},
"max_overlap": {
"help": "Maximum allowed overlap between detected peptides",
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,5 @@ Launch the pipeline execution with the following command:
nextflow run matsengrp/phip-flow -profile docker

Note: the [Dockerfile](docker/Dockerfile) contains all the required dependencies.
Add the `-profile docker` to enable the containerised execution to the
Add the `-profile docker` to enable the containerized execution to the
example command line shown below.
10 changes: 7 additions & 3 deletions bin/merge-counts-stats.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
#!/usr/bin/env python
"""
"""

import pandas as pd
import numpy as np
import phippery
from phippery.utils import *
import argparse
import glob
Expand Down Expand Up @@ -92,6 +89,13 @@ def num(s):
right_index=True
)

# Add more summary metrics per sample
sample_table = sample_table.assign(
percent_mapped=sample_table["reads_mapped"] / sample_table["raw_total_sequences"] * 100.,
percent_peptides_detected=(merged_counts > 0).mean() * 100.,
percent_peptides_between_10_and_100=merged_counts.applymap(lambda v: v >= 10 & v <= 100).mean() * 100.,
)

ds = stitch_dataset(
counts=merged_counts,
peptide_table=peptide_table,
Expand Down
46 changes: 0 additions & 46 deletions docker/Dockerfile

This file was deleted.

11 changes: 0 additions & 11 deletions docker/README.md

This file was deleted.

18 changes: 7 additions & 11 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -73,17 +73,11 @@ params{
summarize_by_organism = false

// Column in the peptide table indicating the organism for each peptide
peptide_org_col = "Strain"

// Column in the peptide table indicating the protein for each peptide
peptide_prot_col = "Protein"

// Column in the peptide table indicating the position within the protein for each peptide
peptide_pos_col = "Prot_Start"
peptide_org_col = "organism"

// Column in the peptide table containing the peptide sequence
// (used to match against public epitopes)
peptide_seq_col = "Prot"
// (used to match against public epitopes, and to filter overlapping peptides)
peptide_seq_col = "seq"

// Maximum allowed overlap between detected peptides
max_overlap = 7
Expand All @@ -102,6 +96,10 @@ params{

}

// Set the container which can be used for all processes
process {
container = 'quay.io/hdc-workflows/phippery:1.1.4'
}

profiles {

Expand All @@ -112,12 +110,10 @@ profiles {
// Run locally assuming docker is installed with the latest image
docker {
docker.enabled = true
process.container = 'quay.io/jgallowa/phip-flow:latest'
}

// Run batch submission assuming docker is installed with the latest image
cluster {
process.container = 'quay.io/jgallowa/phip-flow:latest'

singularity {
enabled = true
Expand Down
47 changes: 20 additions & 27 deletions templates/aggregate_organisms.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@
# Public: marked as TRUE if the epitope was included in the input list of public epitopes

# 3. To combine the virus-level data for each sample, only keep the highest-scoring
# set of epitopes which do not overlap with any other epitope by more than 7aa
# set of epitopes which do not overlap with any other epitope by more than 7aa.
# To identify overlaps, use an exact alignment approach using k-mers. Note that this
# will count two peptides as overlapping if they share any 7aa sequence, without
# performing global alignment.

# 4. Finally, save a table with the following information per virus, per sample:
# Number of all epitope hits
Expand Down Expand Up @@ -143,10 +146,6 @@ def read_peptide_mapping(self) -> pd.DataFrame:
mapping = {
# The user must specify the column used to group peptides by organism
"!{params.peptide_org_col}": "organism",
# By protein name
"!{params.peptide_prot_col}": "protein",
# By the starting position of the peptide within the protein
"!{params.peptide_pos_col}": "position",
# And by the protein sequence (which corresponds to the public epitope sequences)
"!{params.peptide_seq_col}": "seq"
}
Expand Down Expand Up @@ -176,11 +175,9 @@ def read_peptide_mapping(self) -> pd.DataFrame:

self.logger.info(f"Public Epitopes: {df['public'].sum():,} / {df.shape[0]:,}")

# Drop the sequence, but keep the length of the peptide
# Add the length of the peptide
df = df.assign(
peptide_length=lambda d: d["seq"].apply(len)
).drop(
columns=["seq"]
)

return df
Expand Down Expand Up @@ -279,39 +276,35 @@ def group_organisms(self) -> pd.DataFrame:
def group_sample_organisms(self, df:pd.DataFrame, sample:str, organism:str) -> pd.DataFrame:
"""Analyze the data for a single sample, single organism."""

# Add the protein, position, and length information for each peptide
# Add the sequence information for each peptide
df = df.assign(
protein=df["peptide"].apply(
self.peptide_mapping["protein"].get,
),
position=df["peptide"].apply(
self.peptide_mapping["position"].get,
),
peptide_length=df["peptide"].apply(
self.peptide_mapping["peptide_length"].get,
seq=df["peptide"].apply(
self.peptide_mapping["seq"].get
).apply(
lambda s: s.rstrip("*")
)
)

# Sort by EBS (descending)
df = df.sort_values(by="EBS", ascending=False)

# Keep track of which positions have been covered
covered_positions = defaultdict(set)
# Keep track of the peptide kmers which have been observed so far
kmers_seen = set()

# Make a list of the indices which will be dropped
to_drop = list()

# Go down the list, starting with the tightest binders
for i, r in df.iterrows():

# Get the positions covered by this peptide
row_pos = set(range(r["position"], r["position"] + r["peptide_length"]))

# Get the number of overlapping positions
n_overlap = len(covered_positions[r["protein"]] & row_pos)
# Get the kmers by this peptide
row_kmers = set([
r["seq"][n:(n + self.max_overlap)]
for n in range(len(r["seq"]) - self.max_overlap)
])

# If the maximum overlap threshold is exceeded
if n_overlap >= self.max_overlap:
# If any of those kmers have been seen before
if len(row_kmers & kmers_seen) > 0:

# Drop the row
to_drop.append(i)
Expand All @@ -320,7 +313,7 @@ def group_sample_organisms(self, df:pd.DataFrame, sample:str, organism:str) -> p
else:

# Add the covered positions
covered_positions[r["protein"]] |= row_pos
kmers_seen |= row_kmers

df = df.drop(index=to_drop)

Expand Down
2 changes: 1 addition & 1 deletion workflows/output.nf
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ process dump_wide_csv {
publishDir "$params.results/wide_data/", mode: 'copy', overwrite: true
input: path phip_data
output: path "*.csv"
when: params.output_wide_csv
when: params.output_wide_csv || params.summarize_by_organism
shell:
"""
phippery to-wide-csv -o $params.dataset_prefix $phip_data
Expand Down
2 changes: 1 addition & 1 deletion workflows/statistics.nf
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ process fit_predict_neg_binom {
process fit_predict_zscore {
input: path phip_data
output: path "fit_predict_zscore.phip"
when: params.run_zscore_fit_predict
when: params.run_zscore_fit_predict || params.summarize_by_organism
shell:
"""
fit-predict-zscore.py \
Expand Down