matsengrp · sminot · Oct 31, 2022 · Oct 20, 2022 · Oct 27, 2022 · Oct 28, 2022
diff --git a/._wb/tool/phip-flow/config.json b/._wb/tool/phip-flow/config.json
@@ -90,22 +90,12 @@
         "peptide_org_col": {
             "help": "Column in the peptide table indicating the organism for each peptide",
             "wb_type": "string",
-            "default": "Strain"
-        },
-        "peptide_prot_col": {
-            "help": "Column in the peptide table indicating the protein for each peptide",
-            "wb_type": "string",
-            "default": "Protein"
-        },
-        "peptide_pos_col": {
-            "help": "Column in the peptide table indicating the position within the protein for each peptide",
-            "wb_type": "string",
-            "default": "Prot_Start"
+            "default": "organism"
         },
         "peptide_seq_col": {
             "help": "Column in the peptide table containing the peptide sequence (used to match against public epitopes)",
             "wb_type": "string",
-            "default": "Prot"
+            "default": "peptide"
         },
         "max_overlap": {
             "help": "Maximum allowed overlap between detected peptides",

diff --git a/README.md b/README.md
@@ -22,5 +22,5 @@ Launch the pipeline execution with the following command:
     nextflow run matsengrp/phip-flow -profile docker
 
 Note: the [Dockerfile](docker/Dockerfile) contains all the required dependencies. 
-Add the `-profile docker` to enable the containerised execution to the 
+Add the `-profile docker` to enable the containerized execution to the 
 example command line shown below. 
diff --git a/bin/merge-counts-stats.py b/bin/merge-counts-stats.py
@@ -1,10 +1,7 @@
 #!/usr/bin/env python
-"""
-"""
 
 import pandas as pd
 import numpy as np
-import phippery
 from phippery.utils import *
 import argparse
 import glob
@@ -92,6 +89,13 @@ def num(s):
                 right_index=True
         )
 
+    # Add more summary metrics per sample
+    sample_table = sample_table.assign(
+        percent_mapped=sample_table["reads_mapped"] / sample_table["raw_total_sequences"] * 100.,
+        percent_peptides_detected=(merged_counts > 0).mean() * 100.,
+        percent_peptides_between_10_and_100=merged_counts.applymap(lambda v: v >= 10 & v <= 100).mean() * 100.,
+    )
+
     ds = stitch_dataset(
         counts=merged_counts, 
         peptide_table=peptide_table, 

diff --git a/docker/Dockerfile b/docker/Dockerfile
diff --git a/docker/README.md b/docker/README.md
diff --git a/nextflow.config b/nextflow.config
@@ -73,17 +73,11 @@ params{
     summarize_by_organism = false
 
     // Column in the peptide table indicating the organism for each peptide
-    peptide_org_col = "Strain"
-
-    // Column in the peptide table indicating the protein for each peptide
-    peptide_prot_col = "Protein"
-
-    // Column in the peptide table indicating the position within the protein for each peptide
-    peptide_pos_col = "Prot_Start"
+    peptide_org_col = "organism"
 
     // Column in the peptide table containing the peptide sequence
-    // (used to match against public epitopes)
-    peptide_seq_col = "Prot"
+    // (used to match against public epitopes, and to filter overlapping peptides)
+    peptide_seq_col = "seq"
 
     // Maximum allowed overlap between detected peptides
     max_overlap = 7
@@ -102,6 +96,10 @@ params{
 
 }
 
+// Set the container which can be used for all processes
+process {
+    container = 'quay.io/hdc-workflows/phippery:1.1.4'
+}
 
 profiles {
 
@@ -112,12 +110,10 @@ profiles {
     // Run locally assuming docker is installed with the latest image
     docker {
         docker.enabled = true
-        process.container = 'quay.io/jgallowa/phip-flow:latest'
     }
 
     // Run batch submission assuming docker is installed with the latest image
     cluster {
-        process.container = 'quay.io/jgallowa/phip-flow:latest'
 
         singularity {
             enabled = true

diff --git a/templates/aggregate_organisms.py b/templates/aggregate_organisms.py
@@ -21,7 +21,10 @@
 #   Public: marked as TRUE if the epitope was included in the input list of public epitopes
 
 # 3. To combine the virus-level data for each sample, only keep the highest-scoring
-# set of epitopes which do not overlap with any other epitope by more than 7aa
+# set of epitopes which do not overlap with any other epitope by more than 7aa.
+# To identify overlaps, use an exact alignment approach using k-mers. Note that this
+# will count two peptides as overlapping if they share any 7aa sequence, without
+# performing global alignment.
 
 # 4. Finally, save a table with the following information per virus, per sample:
 #   Number of all epitope hits
@@ -143,10 +146,6 @@ def read_peptide_mapping(self) -> pd.DataFrame:
         mapping = {
             # The user must specify the column used to group peptides by organism
             "!{params.peptide_org_col}": "organism",
-            # By protein name
-            "!{params.peptide_prot_col}": "protein",
-            # By the starting position of the peptide within the protein
-            "!{params.peptide_pos_col}": "position",
             # And by the protein sequence (which corresponds to the public epitope sequences)
             "!{params.peptide_seq_col}": "seq"
         }
@@ -176,11 +175,9 @@ def read_peptide_mapping(self) -> pd.DataFrame:
 
         self.logger.info(f"Public Epitopes: {df['public'].sum():,} / {df.shape[0]:,}")
 
-        # Drop the sequence, but keep the length of the peptide
+        # Add the length of the peptide
         df = df.assign(
             peptide_length=lambda d: d["seq"].apply(len)
-        ).drop(
-            columns=["seq"]
         )
 
         return df
@@ -279,39 +276,35 @@ def group_organisms(self) -> pd.DataFrame:
     def group_sample_organisms(self, df:pd.DataFrame, sample:str, organism:str) -> pd.DataFrame:
         """Analyze the data for a single sample, single organism."""
 
-        # Add the protein, position, and length information for each peptide
+        # Add the sequence information for each peptide
         df = df.assign(
-            protein=df["peptide"].apply(
-                self.peptide_mapping["protein"].get,
-            ),
-            position=df["peptide"].apply(
-                self.peptide_mapping["position"].get,
-            ),
-            peptide_length=df["peptide"].apply(
-                self.peptide_mapping["peptide_length"].get,
+            seq=df["peptide"].apply(
+                self.peptide_mapping["seq"].get
+            ).apply(
+                lambda s: s.rstrip("*")
             )
         )
 
         # Sort by EBS (descending)
         df = df.sort_values(by="EBS", ascending=False)
 
-        # Keep track of which positions have been covered
-        covered_positions = defaultdict(set)
+        # Keep track of the peptide kmers which have been observed so far
+        kmers_seen = set()
 
         # Make a list of the indices which will be dropped
         to_drop = list()
 
         # Go down the list, starting with the tightest binders
         for i, r in df.iterrows():
 
-            # Get the positions covered by this peptide
-            row_pos = set(range(r["position"], r["position"] + r["peptide_length"]))
-
-            # Get the number of overlapping positions
-            n_overlap = len(covered_positions[r["protein"]] & row_pos)
+            # Get the kmers by this peptide
+            row_kmers = set([
+                r["seq"][n:(n + self.max_overlap)]
+                for n in range(len(r["seq"]) - self.max_overlap)
+            ])
 
-            # If the maximum overlap threshold is exceeded
-            if n_overlap >= self.max_overlap:
+            # If any of those kmers have been seen before
+            if len(row_kmers & kmers_seen) > 0:
 
                 # Drop the row
                 to_drop.append(i)
@@ -320,7 +313,7 @@ def group_sample_organisms(self, df:pd.DataFrame, sample:str, organism:str) -> p
             else:
 
                 # Add the covered positions
-                covered_positions[r["protein"]] |= row_pos
+                kmers_seen |= row_kmers
 
         df = df.drop(index=to_drop)
 

diff --git a/workflows/output.nf b/workflows/output.nf
@@ -18,7 +18,7 @@ process dump_wide_csv {
     publishDir "$params.results/wide_data/", mode: 'copy', overwrite: true
     input: path phip_data
     output: path "*.csv"
-    when: params.output_wide_csv
+    when: params.output_wide_csv || params.summarize_by_organism
     shell:
     """
     phippery to-wide-csv -o $params.dataset_prefix $phip_data

diff --git a/workflows/statistics.nf b/workflows/statistics.nf
@@ -85,7 +85,7 @@ process fit_predict_neg_binom {
 process fit_predict_zscore {
     input: path phip_data
     output: path "fit_predict_zscore.phip"
-    when: params.run_zscore_fit_predict
+    when: params.run_zscore_fit_predict || params.summarize_by_organism
     shell:
     """
     fit-predict-zscore.py \