Merge pull request #14 from pinellolab/dev

Add editing preference profiling
pinellolab · Nov 15, 2023 · 4696b9e · 4696b9e
2 parents e1c209d + 36c9e21
commit 4696b9e
Show file tree

Hide file tree

Showing 8 changed files with 204 additions and 87 deletions.
diff --git a/README.md b/README.md
@@ -13,12 +13,14 @@ This is an analysis toolkit for the pooled CRISPR reporter or sensor data. The r
 ## Overview
 `crispr-bean` supports end-to-end analysis of pooled sorting screens, with or without reporter.  
 
-<img src="imgs/dag_bean.png" alt="dag_bean.svg" width="700"/>  
+<img src="imgs/dag_bean_v2.png" alt="dag_bean_v2.svg" width="700"/>  
 
-1. [`bean-count-sample`](#bean-count-samples-count-reporter-screen-data): Base-editing-aware **mapping** of guide, optionally with reporter from `.fastq` files.  
-2. [`bean-qc`](#bean-qc-qc-of-reporter-screen-data): Quality control report and filtering out / masking of aberrant sample and guides  
-3. [`bean-filter`](#bean-filter-filtering-and-optionally-translating-alleles): Filter reporter alleles; essential for `tiling` mode that allows for all alleles generated from gRNA.
-4. [`bean-run`](#bean-run-quantify-variant-effects): Quantify targeted variants' effect sizes from screen data.  
+1. [`bean-count-sample`](#bean-count-samples-count-reporter-screen-data): Base-editing-aware **mapping** of guide, optionally with reporter from `.fastq` files.
+  *   [`bean-count-create`](#bean-create-screen-create-reporterscreen-object-from-flat-files) creates minimal ReporterScreen object from flat gRNA count file. Note that this way, allele counts are not included and many functionalities involving allele and edit counts are not supported.
+2. [`bean-profile`](#bean-profile-profile-editing-patterns): Profile editing preferences of your editor.  
+3. [`bean-qc`](#bean-qc-qc-of-reporter-screen-data): Quality control report and filtering out / masking of aberrant sample and guides  
+4. [`bean-filter`](#bean-filter-filtering-and-optionally-translating-alleles): Filter reporter alleles; essential for `tiling` mode that allows for all alleles generated from gRNA.
+5. [`bean-run`](#bean-run-quantify-variant-effects): Quantify targeted variants' effect sizes from screen data.  
 
 ### Data structure
 BEAN stores mapped gRNA and allele counts in `ReporterScreen` object which is compatible with [AnnData](https://anndata.readthedocs.io/en/latest/index.html). See [Data Structure](#data-structure) section for more information.
@@ -141,7 +143,35 @@ Optional columns are not required but can be provided for compatibility with `be
 * `--rerun` (default: `False`): Recount each sample. If `False`, existing count for each sample is taken.
 
 
+## `bean-create-screen`: Create ReporterScreen object from flat files
+```bash
+bean-create-screen gRNA_library.csv sample_list.csv gRNA_counts_table.csv
+```
+### Input
+  * [gRNA_library.csv](#1-gRNA_librarycsv)
+  * [sample_list.csv](#2-sample_listcsv)
+  * gRNA_counts_table.csv: Table with gRNA ID in the first column and sample IDs as the column names (first row)
+
+### Full Parameters
+  * `-e`, `--edits` (default: `None`): Path to edit counts .csv table, with index at first column and column names at the first row.
+  * `-o`, `--output-prefix` (default: `None`): Output file prefix (output will be saved as `output_prefix.h5ad`). If not provided, `gRNA_counts_table_csv` file prefix is used.
+
+<br/><br/>
 
+## `bean-profile`: Profile editing patterns
+```bash
+bean-profile my_sorting_screen.h5ad -o output_prefix `# Prefix for editing profile report` 
+```
+### Parameters
+  * `-o`, `--output-prefix` (default: `None`): Output prefix of editing pattern report (prefix.html, prefix.ipynb). If not provided, base name of `bdata_path` is used.
+  * `--replicate-col` (default: `"rep"`): Column name in `bdata.samples` that describes replicate ID.
+  * `--condition-col` (default: `"bin"`): Column name in `bdata.samples` that describes experimental condition. (sorting bin, time, etc.)
+  * `--pam-col` (default: `None`): Column name describing PAM of each gRNA in `bdata.guides`.
+  * `--control-condition` (default: `"bulk"`): Control condition where editing preference would be profiled at. Pre-filters data where `bdata.samples[condition_col] == control_condition`.
+  * `-w`, `--window-length` (default: `6`): Window length of editing window of maximal editing efficiency to be identified. This window is used to quantify context specificity within the window.
+
+### Output
+Above command produces `prefix_editing_preference.[html,ipynb]` as editing preferences ([see example](notebooks/profile_editing_preference.ipynb)).  
 
 <br/><br/>
 

diff --git a/bean/plotting/editing_patterns.py b/bean/plotting/editing_patterns.py
@@ -22,7 +22,7 @@ def _add_absent_edits(
 ):
     """If edit is not observed in editable position, add into the edit rate table."""
     editable_positions = np.where(
-        (np.array(list(bdata.guides.loc[guide, "Reporter"])) == edited_base)
+        (np.array(list(bdata.guides.loc[guide, "reporter"])) == edited_base)
     )[0]
     if guide not in edit_tbl.guide.tolist():
         observed_rel_pos = []
@@ -62,68 +62,59 @@ def _add_absent_edits(
 def get_edit_rates(
     bdata,
     edit_count_key="edit_counts",
-    add_absent=True,
     adjust_spacer_pos: bool = True,
     reporter_column: str = "reporter",
 ):
     """
     Obtain position- and context-wise editing rate (context: base preceding the target base position).
     Args
-    bdata (bean.ReporterScreen): input ReporterScreen object to be used for analyzing posiiton-wide editing rate.
+    bdata (bean.reporterScreen): input ReporterScreen object to be used for analyzing posiiton-wide editing rate.
     edit_count_key: key in bdata.uns with edit count DataFrame to be used for analyzing posiiton-wide editing rate.
     """
     edit_rates = bdata.get_normalized_allele_counts(bdata.uns[edit_count_key])
     edit_rates_agg = edit_rates[["guide", "edit"]]
     edit_rates_agg = edit_rates_agg.loc[
         edit_rates_agg.guide.map(lambda s: "CONTROL" not in s)
     ].reset_index(drop=True)
-    try:
-        edit_rates_agg["rep_median"] = edit_rates.iloc[:, 2:].median(axis=1)
-        edit_rates_agg["rep_mean"] = edit_rates.iloc[:, 2:].mean(axis=1)
-        edit_rates_agg["rel_pos"] = edit_rates_agg.edit.map(lambda e: e.rel_pos).astype(
-            int
-        )
-        if add_absent:
-            for guide in tqdm(
-                edit_rates_agg.guide.unique(),
-                desc="Calibrating edits in editable positions...",
-            ):
-                edit_rates_agg = _add_absent_edits(
-                    bdata,
-                    guide,
-                    edit_rates_agg,
-                    edited_base=bdata.base_edited_from,
-                    target_alt=bdata.base_edited_to,
-                )
-        if adjust_spacer_pos:
-            if "guide_len" not in bdata.guides.columns:
-                bdata.guides["guide_len"] = bdata.guides.sequence.map(len)
-            start_pad = (
-                32
-                - 6
-                - bdata.guides.guide_len[edit_rates_agg.guide].reset_index(drop=True)
-            )
-            edit_rates_agg["spacer_pos"] = (edit_rates_agg.rel_pos - start_pad).astype(
-                int
-            )
-            edit_rates_agg = edit_rates_agg[
-                (edit_rates_agg.spacer_pos >= 0) & (edit_rates_agg.spacer_pos < 20)
-            ].reset_index(drop=True)
-            edit_rates_agg.loc[:, "spacer_pos"] = edit_rates_agg["spacer_pos"] + 1
-        else:
-            edit_rates_agg["spacer_pos"] = edit_rates_agg.rel_pos + 1
-        edit_rates_agg["base_change"] = edit_rates_agg.edit.map(
-            lambda e: e.get_base_change()
+    edit_rates_agg["rep_median"] = edit_rates.iloc[:, 2:].median(axis=1)
+    edit_rates_agg["rep_mean"] = edit_rates.iloc[:, 2:].mean(axis=1)
+    edit_rates_agg["rel_pos"] = edit_rates_agg.edit.map(lambda e: e.rel_pos).astype(int)
+
+    for guide in tqdm(
+        edit_rates_agg.guide.unique(),
+        desc="Calibrating edits in editable positions...",
+    ):
+        edit_rates_agg = _add_absent_edits(
+            bdata,
+            guide,
+            edit_rates_agg,
+            edited_base=bdata.base_edited_from,
+            target_alt=bdata.base_edited_to,
         )
-        edit_rates_agg.rel_pos = edit_rates_agg.rel_pos.astype(int)
-        edit_rates_agg["context"] = edit_rates_agg.apply(
-            lambda row: bdata.guides.loc[row.guide, reporter_column][
-                row.rel_pos - 1 : row.rel_pos + 1
-            ],
-            axis=1,
+
+    if adjust_spacer_pos:
+        if "guide_len" not in bdata.guides.columns:
+            bdata.guides["guide_len"] = bdata.guides.sequence.map(len)
+        start_pad = (
+            32 - 6 - bdata.guides.guide_len[edit_rates_agg.guide].reset_index(drop=True)
         )
-    except Exception as exp:
-        print(exp)
+        edit_rates_agg["spacer_pos"] = (edit_rates_agg.rel_pos - start_pad).astype(int)
+        edit_rates_agg = edit_rates_agg[
+            (edit_rates_agg.spacer_pos >= 0) & (edit_rates_agg.spacer_pos < 20)
+        ].reset_index(drop=True)
+        edit_rates_agg.loc[:, "spacer_pos"] = edit_rates_agg["spacer_pos"] + 1
+    else:
+        edit_rates_agg["spacer_pos"] = edit_rates_agg.rel_pos + 1
+    edit_rates_agg["base_change"] = edit_rates_agg.edit.map(
+        lambda e: e.get_base_change()
+    )
+    edit_rates_agg.rel_pos = edit_rates_agg.rel_pos.astype(int)
+    edit_rates_agg["context"] = edit_rates_agg.apply(
+        lambda row: bdata.guides.loc[row.guide, reporter_column][
+            row.rel_pos - 1 : row.rel_pos + 1
+        ],
+        axis=1,
+    )
     return edit_rates_agg
 
 
@@ -190,10 +181,10 @@ def _get_norm_rates_df(
     edit_rates_df=None,
     edit_count_key="edit_counts",
     base_changes: Sequence[str] = None,
-    show_list=None,
 ):
     change_by_pos = pd.pivot(
-        edit_rates_df.groupby(["base_change", "spacer_pos"])
+        edit_rates_df[["base_change", "spacer_pos", "rep_mean"]]
+        .groupby(["base_change", "spacer_pos"])
         .sum()["rep_mean"]
         .reset_index(),
         index="spacer_pos",
@@ -203,7 +194,7 @@ def _get_norm_rates_df(
 
     norm_matrix = pd.DataFrame(index=change_by_pos.index, columns=BASES)
     for pos in norm_matrix.index:
-        pos_base = bdata.guides.Reporter.map(
+        pos_base = bdata.guides.reporter.map(
             lambda s: s[pos] if pos < len(s) else " "
         ).values
         for b in BASES:
@@ -216,10 +207,8 @@ def _get_norm_rates_df(
             :, change_by_pos.columns.map(lambda s: s.split(">")[0])
         ].values
     )
-    if show_list is None:
-        show_list = ["A>C", "A>T", "A>G", "C>T", "C>G"]
     # norm_rate_reduced = _combine_complementary_base_changes(norm_rate).astype(float)
-    return norm_rate.astype(float)[show_list]  # _reduced
+    return norm_rate.astype(float)[base_changes]  # _reduced
 
 
 def _get_possible_changes_from_target_base(target_basechange: str):
@@ -257,6 +246,8 @@ def plot_by_pos_behive(
     if nonref_base_changes is None:
         if target_basechange == "A>G":
             nonref_base_changes = ["C>T", "C>G"]
+        elif target_basechange == "C>T":
+            nonref_base_changes = ["G>A", "G>C"]
         else:
             print("No non-ref base changes specified. not drawing them")
             nonref_base_changes = []
@@ -266,10 +257,10 @@ def plot_by_pos_behive(
         bdata,
         norm_rates_df,
         edit_count_key,
-        base_changes=[
+        base_changes=ref_other_changes
+        + [
             target_basechange,
         ]
-        + ref_other_changes
         + nonref_base_changes,
     )
     fig, ax = plt.subplots(figsize=(3, 7))
@@ -278,6 +269,7 @@ def plot_by_pos_behive(
     if normalize:
         df_to_draw = df_to_draw / vmax * 100
         vmax = 100
+
     target_data = df_to_draw.copy()
     target_data.loc[:, target_data.columns != target_basechange] = np.nan
     sns.heatmap(
@@ -291,6 +283,7 @@ def plot_by_pos_behive(
         fmt=".0f" if normalize else ".2g",
         annot_kws={"fontsize": 8},
     )
+
     ref_data = df_to_draw.copy()
     ref_data.loc[
         :,
@@ -322,20 +315,19 @@ def plot_by_pos_behive(
         annot_kws={"fontsize": 8},
     )
     ax.set_ylabel("Protospacer position")
-    return ax
+    return ax, df_to_draw
 
 
-def get_position_by_pam_rates(bdata, edit_rates_df: pd.DataFrame):
-    edit_rates_df["pam"] = bdata.guides.loc[
-        edit_rates_df.guide, "5-nt PAM"
-    ].reset_index(drop=True)
+def get_position_by_pam_rates(bdata, edit_rates_df: pd.DataFrame, pam_col="5-nt PAM"):
+    edit_rates_df["pam"] = bdata.guides.loc[edit_rates_df.guide, pam_col].reset_index(
+        drop=True
+    )
     edit_rates_df["pam23"] = edit_rates_df.pam.map(lambda s: s[1:3])
-    edit_rates_df["pam2"] = edit_rates_df.pam.map(lambda s: s[1])
-    edit_rates_df["pam3"] = edit_rates_df.pam.map(lambda s: s[2])
-    edit_rates_df["pam12"] = edit_rates_df.pam.map(lambda s: s[:2])
-    edit_rates_df["pam34"] = edit_rates_df.pam.map(lambda s: s[2:4])
     return pd.pivot(
-        edit_rates_df.loc[(edit_rates_df.base_change == bdata.target_base_change)]
+        edit_rates_df.loc[
+            (edit_rates_df.base_change == bdata.target_base_change),
+            ["rep_mean", "pam23", "spacer_pos"],
+        ]
         .groupby(["pam23", "spacer_pos"])
         .mean()
         .reset_index(),
@@ -348,11 +340,12 @@ def get_position_by_pam_rates(bdata, edit_rates_df: pd.DataFrame):
 def plot_by_pos_pam(
     bdata,
     edit_rates_df,
+    pam_col="5-nt PAM",
     ax=None,
     figsize=(6, 4),
 ):
     """Plot position by PAM"""
-    pos_by_pam = get_position_by_pam_rates(bdata, edit_rates_df)
+    pos_by_pam = get_position_by_pam_rates(bdata, edit_rates_df, pam_col)
     if ax is None:
         fig, ax = plt.subplots(figsize=figsize)
     sns.heatmap(pos_by_pam, ax=ax, cmap="Blues")
@@ -432,6 +425,7 @@ def get_pam_preference(
 def plot_pam_preference(
     edit_rates_df,
     bdata=None,
+    pam_col="5-nt PAM",
     edit_start_pos: int = 3,
     edit_end_pos: int = 8,
     ax=None,
@@ -440,7 +434,7 @@ def plot_pam_preference(
     """ """
     if "pam2" not in edit_rates_df.columns or "pam3" not in edit_rates_df.columns:
         edit_rates_df["pam"] = bdata.guides.loc[
-            edit_rates_df.guide, "5-nt PAM"
+            edit_rates_df.guide, pam_col
         ].reset_index(drop=True)
         edit_rates_df["pam2"] = edit_rates_df.pam.map(lambda s: s[1])
         edit_rates_df["pam3"] = edit_rates_df.pam.map(lambda s: s[2])

diff --git a/bean/plotting/utils.py b/bean/plotting/utils.py
@@ -0,0 +1,69 @@
+import argparse
+
+
+def parse_args():
+    print("  \n~~~BEAN Profile~~~")
+    print("-Profile editing patterns of your editor-")
+    print(
+        r"""
+    _ _                     __ _ _     
+  /  \ '\     _ __ _ _ ___ / _(_) |___ 
+  |   \  \   | '_ \ '_/ _ \  _| | / -_)
+   \   \  |  | .__/_| \___/_| |_|_\___|
+    `.__|/   |_|                       
+    """
+    )
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "bdata_path", help="Path to the ReporterScreen object to run QC on", type=str
+    )
+    parser.add_argument(
+        "-o",
+        "--output-prefix",
+        help="Output prefix of editing pattern report (prefix.html, prefix.ipynb). If not provided, base name of `bdata_path` is used.",
+        type=str,
+    )
+    parser.add_argument(
+        "--replicate-col",
+        help="Column name in `bdata.samples` that describes replicate ID.",
+        type=str,
+        default="rep",
+    )
+    parser.add_argument(
+        "--condition-col",
+        help="Column name in `bdata.samples` that describes experimental condition. (sorting bin, time, etc.)",
+        type=str,
+        default="bin",
+    )
+    parser.add_argument(
+        "--pam-col",
+        help="Column name describing PAM of each gRNA in `bdata.guides`.",
+        type=str,
+        default=None,
+    )
+    parser.add_argument(
+        "--control-condition",
+        help="Control condition where editing preference would be profiled at. Pre-filters data where `bdata.samples[condition_col] == control_condition`.",
+        type=str,
+        default="bulk",
+    )
+    parser.add_argument(
+        "-w",
+        "--window-length",
+        help="Window length of editing window of maximal editing efficiency to be identified. This window is used to quantify context specificity within the window.",
+        type=int,
+        default=6,
+    )
+
+    args = parser.parse_args()
+    if args.output_prefix is None:
+        args.output_prefix = f"{args.bdata_path.rsplit('.h5ad', 1)[0]}"
+    return args
+
+
+def check_args(args):
+    if args.window_length < 1:
+        raise ValueError(f"window_length {args.window_length} is too small.")
+    if args.window_length > 20:
+        raise ValueError(f"window_length {args.window_length} is too large.")
+    return args