Merge pull request #30 from pinellolab/dev

Update to v1.2.4
pinellolab · May 7, 2024 · 4b45711 · 4b45711
2 parents 3bbe276 + 204fb66
commit 4b45711
Show file tree

Hide file tree

Showing 13 changed files with 484 additions and 58 deletions.
diff --git a/bean/annotate/translate_allele.py b/bean/annotate/translate_allele.py
@@ -137,7 +137,7 @@ def _parse_range(chr_range: str) -> Tuple[str, int, int]:
     return (chrom, start, end)
 
 
-def _parse_description(desc_str: str):
+def _parse_description(desc_str: str) -> Tuple[Tuple[str, int, int], str]:
     """Parse description line of fasta file to get range."""
     sstr = desc_str.split(" ")
     strand = None
@@ -153,21 +153,38 @@ def _parse_description(desc_str: str):
     return genome_range, strand
 
 
-def get_cds_seq_pos_from_fasta(fasta_file_name: str) -> Tuple[List[str], List[int]]:
+def get_cds_seq_pos_from_fasta(
+    fasta_file_name: str,
+) -> Tuple[str, List[str], List[int], str]:
     """Obtain tuple of lists, first with nucleotide and second with genomic position of the nucleotide."""
     exons = list(SeqIO.parse(fasta_file_name, "fasta"))
     translated_seq = []
     genomic_pos = []
+    (exon_chrom, exon_start, exon_end), strand = _parse_description(
+        exons[0].description
+    )
+    if strand == "-":
+        exons = exons[::-1]
     for exon in exons:
         (exon_chrom, exon_start, exon_end), strand = _parse_description(
             exon.description
         )
+        exon_seq = []
+        exon_pos = []
         for i, nt in enumerate(str(exon.seq)):
             if nt.islower():
                 continue
-            translated_seq.append(nt)
-            genomic_pos.append(exon_start + i)
-    return (exon_chrom, translated_seq, genomic_pos, strand)
+            exon_seq.append(nt)
+            if strand == "+":
+                exon_pos.append(exon_start + i)
+            else:
+                exon_pos.append(exon_end - i)
+        if strand == "-":
+            exon_seq = revcomp(exon_seq)
+            exon_pos = exon_pos[::-1]
+        translated_seq.extend(exon_seq)
+        genomic_pos.extend(exon_pos)
+    return exon_chrom, translated_seq, genomic_pos, {"+": 1, "-": -1}[strand]
 
 
 def _translate_single_codon(
@@ -207,12 +224,10 @@ def __init__(self):
         self.genomic_pos: Sequence = []
 
     @classmethod
-    def from_fasta(cls, fasta_file_name, gene_name, suppressMessage=True):
+    def from_fasta(cls, fasta_file_name, gene_name=None, suppressMessage=True):
         cds = cls()
-        # if fasta_file_name is not None:
-        #     if not suppressMessage:
-        #         raise ValueError("No fasta file provided as reference: using LDLR")
-        #     fasta_file_name = os.path.dirname(be.__file__) + "/annotate/ldlr_exons.fa"
+        if gene_name is None:
+            gene_name = os.path.basename(fasta_file_name).upper().split(".FA")[0]
         if gene_name not in cls.gene_info_dict:
             chrom, translated_seq, genomic_pos, strand = get_cds_seq_pos_from_fasta(
                 fasta_file_name
@@ -252,18 +267,10 @@ def from_gene_name(cls, gene_name, ref_version: str = "GRCh38"):
         cds.translated_seq = deepcopy(cls.gene_info_dict[gene_name]["translated_seq"])
         cds.genomic_pos = cls.gene_info_dict[gene_name]["genomic_pos"]
         cds.nt = cds.gene_info_dict[gene_name]["translated_seq"]  # in sense strand
-        # print(cds.gene_name + ":" + "".join(cds.nt))
-        # if cds.strand == -1:
-        #     cds.nt = revcomp(cds.nt)
         cds.pos = np.array(cds.genomic_pos)
         cds.edited_nt = deepcopy(cds.nt)
         return cds
 
-    # def translate(self):
-    #     if self.strand == -1:
-    #         self.edited_nt = revcomp(self.edited_nt)
-    #     self.aa = _translate(self.edited_nt, codon_map)
-
     def _get_relative_nt_pos(self, absolute_pos):
         """0-based relative position"""
         nt_relative_pos = np.where(self.pos == absolute_pos)[0]
@@ -665,16 +672,16 @@ def annotate_edit(
         )
     edit_info["coding"] = ""
     edit_info.loc[edit_info.pos.map(lambda s: s.startswith("A")), "coding"] = "coding"
-    edit_info.loc[
-        edit_info.pos.map(lambda s: not s.startswith("A")), "coding"
-    ] = "noncoding"
+    edit_info.loc[edit_info.pos.map(lambda s: not s.startswith("A")), "coding"] = (
+        "noncoding"
+    )
     if control_tag is not None:
-        edit_info.loc[
-            edit_info.pos.map(lambda s: control_tag in s), "group"
-        ] = "negctrl"
-        edit_info.loc[
-            edit_info.pos.map(lambda s: control_tag in s), "coding"
-        ] = "negctrl"
+        edit_info.loc[edit_info.pos.map(lambda s: control_tag in s), "group"] = (
+            "negctrl"
+        )
+        edit_info.loc[edit_info.pos.map(lambda s: control_tag in s), "coding"] = (
+            "negctrl"
+        )
     edit_info.loc[
         (edit_info.coding == "noncoding") & (edit_info.group != "negctrl"), "int_pos"
     ] = edit_info.loc[
@@ -691,9 +698,9 @@ def annotate_edit(
         (edit_info.alt == edit_info.ref) & (edit_info.coding == "coding"), "group"
     ] = "syn"
     if splice_sites is not None:
-        edit_info.loc[
-            edit_info.pos.isin(splice_sites.astype(str)), "group"
-        ] = "splicing"
+        edit_info.loc[edit_info.pos.isin(splice_sites.astype(str)), "group"] = (
+            "splicing"
+        )
     edit_info.loc[edit_info.int_pos < -100, "group"] = "negctrl"
     edit_info.loc[edit_info.int_pos < -100, "coding"] = "negctrl"
     return edit_info
diff --git a/bean/cli/qc.py b/bean/cli/qc.py
@@ -41,15 +41,17 @@ def main(args):
             posctrl_col=args.posctrl_col,
             posctrl_val=args.posctrl_val,
             lfc_thres=args.lfc_thres,
-            replicate_label=args.replicate_label,
-            condition_label=args.condition_label,
+            replicate_label=args.replicate_col,
+            condition_label=args.condition_col,
             comp_cond1=args.lfc_cond1,
             comp_cond2=args.lfc_cond2,
             ctrl_cond=args.control_condition,
             exp_id=args.out_report_prefix,
             recalculate_edits=(not args.dont_recalculate_edits),
             base_edit_data=args.base_edit_data,
             remove_bad_replicates=args.remove_bad_replicates,
+            reporter_length=args.reporter_length,
+            reporter_right_flank_length=args.reporter_right_flank_length,
         ),
         kernel_name="bean_python3",
     )

diff --git a/bean/framework/ReporterScreen.py b/bean/framework/ReporterScreen.py
@@ -356,6 +356,8 @@ def get_edit_mat_from_uns(
         rel_pos_is_reporter=False,
         target_pos_col="target_pos",
         edit_count_key="edit_counts",
+        reporter_length: int = 32,
+        reporter_right_flank_length: int = 6,
     ):
         """
         Get the edit matrix from `.uns[edit_count_key]` to store the result in `.layers["edits"]`
@@ -374,6 +376,10 @@ def get_edit_mat_from_uns(
             target_base_edit = self.target_base_changes
         if match_target_position is None:
             match_target_position = not self.tiling
+        if "reporter_length" in self.uns:
+            reporter_length = self.uns["reporter_length"]
+        if "reproter_right_flank_length" in self.uns:
+            reporter_right_flank_length = self.uns["reporter_right_flank_length"]
         if edit_count_key not in self.uns or len(self.uns[edit_count_key]) == 0:
             raise ValueError(
                 "Edit count isn't calculated. "
@@ -400,7 +406,9 @@ def get_edit_mat_from_uns(
             drop=True
         )
         edits["guide_start_pos"] = (
-            32 - 6 - guide_len[edits.guide_idx].reset_index(drop=True)
+            reporter_length
+            - reporter_right_flank_length
+            - guide_len[edits.guide_idx].reset_index(drop=True)
         )
         if not match_target_position:
             edits["rel_pos"] = edits.edit.map(lambda e: e.rel_pos)

diff --git a/bean/notebooks/sample_quality_report.ipynb b/bean/notebooks/sample_quality_report.ipynb
@@ -60,7 +60,9 @@
     "recalculate_edits = True\n",
     "tiling = None\n",
     "base_edit_data = True\n",
-    "remove_bad_replicates = False"
+    "remove_bad_replicates = False\n",
+    "reporter_length = 32\n",
+    "reporter_right_flank_length = 6"
    ]
   },
   {
@@ -84,6 +86,8 @@
     "    tiling = bdata.uns['tiling']\n",
     "else:\n",
     "    raise ValueError(\"Ambiguous assignment if the screen is a tiling screen. Provide `--tiling=True` or `tiling=False`.\")\n",
+    "bdata.uns[\"reporter_length\"] = reporter_length\n",
+    "bdata.uns[\"reporter_right_flank_length\"] = reporter_right_flank_length\n",
     "if posctrl_col:\n",
     "    if posctrl_col not in bdata.guides.columns:\n",
     "        raise ValueError(f\"--posctrl-col argument '{posctrl_col}' is not present in the input ReporterScreen.guides.columns {bdata.guides.columns}. If you do not want to use positive control gRNA annotation for LFC calculation, feed --posctrl-col='' instead.\")\n",
@@ -128,6 +132,18 @@
     "bdata.samples"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for qc_col in [\"gini_X\", \"median_corr_X\", f\"median_lfc_corr.{comp_cond1}_{comp_cond2}\",\"mean_editing_rate\", \"mask\"]:\n",
+    "    if qc_col in bdata.samples:\n",
+    "        del bdata.samples[qc_col]\n",
+    "n_cols_samples = len(bdata.samples.columns)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -384,25 +400,28 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "bdata.samples[\"mask\"] = 1\n",
+    "mdata = bdata.samples.copy()\n",
     "# Data has positive control\n",
-    "bdata.samples.loc[\n",
+    "for col in mdata.columns.tolist():\n",
+    "    mdata[col]=1.0\n",
+    "\n",
+    "mdata.loc[\n",
     "    bdata.samples.median_corr_X.isnull() | (bdata.samples.median_corr_X < count_correlation_thres),\n",
-    "    \"mask\",\n",
-    "] = 0\n",
+    "    \"median_corr_X\",\n",
+    "] = 0.0\n",
     "if \"mean_editing_rate\" in bdata.samples.columns.tolist():\n",
-    "    bdata.samples.loc[bdata.samples.mean_editing_rate < edit_rate_thres, \"mask\"] = 0\n",
+    "    mdata.loc[bdata.samples.mean_editing_rate < edit_rate_thres, \"mean_editing_rate\"] = 0\n",
     "\n",
-    "bdata.samples.loc[\n",
+    "mdata.loc[\n",
     "    bdata.samples[f\"median_lfc_corr.{comp_cond1}_{comp_cond2}\"] < lfc_thres,\n",
-    "    \"mask\",\n",
-    "] = 0\n",
+    "    f\"median_lfc_corr.{comp_cond1}_{comp_cond2}\",\n",
+    "] = 0.0\n",
     "if posctrl_col:\n",
     "    print(\"filter with posctrl LFC\")\n",
-    "    bdata.samples.loc[\n",
+    "    mdata.loc[\n",
     "        bdata.samples[f\"median_lfc_corr.{comp_cond1}_{comp_cond2}\"].isnull(),\n",
-    "        \"mask\",\n",
-    "    ] = 0"
+    "        f\"median_lfc_corr.{comp_cond1}_{comp_cond2}\",\n",
+    "    ] = 0.0\n"
    ]
   },
   {
@@ -411,7 +430,19 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "bdata.samples.style.background_gradient(cmap=\"coolwarm_r\")"
+    "from matplotlib import colors\n",
+    "def b_g(s, cmap='coolwarm_r', low=0, high=1):\n",
+    "    a = mdata.loc[:,s.name].copy()\n",
+    "    if s.name not in mdata.columns.tolist()[n_cols_samples:]:\n",
+    "        a[:] = 1.0\n",
+    "    # rng = a.max() - a.min()\n",
+    "    # norm = colors.Normalize(a.min() - (rng * low),\n",
+    "    #                     a.max() + (rng * high))\n",
+    "    # normed = norm(a.values)\n",
+    "    c = [colors.rgb2hex(x) for x in plt.cm.get_cmap(cmap)(a.values)]\n",
+    "    return ['background-color: %s' % color for color in c]\n",
+    "print(\"Failing QC is shown as red:\")\n",
+    "bdata.samples.style.apply(b_g)"
    ]
   },
   {
@@ -421,6 +452,10 @@
    "outputs": [],
    "source": [
     "# leave replicate with more than 1 sorting bin data\n",
+    "print(mdata)\n",
+    "print(n_cols_samples)\n",
+    "\n",
+    "bdata.samples[\"mask\"] = mdata.iloc[:,n_cols_samples:].astype(int).all(axis=1).astype(int).tolist()\n",
     "if remove_bad_replicates:\n",
     "    rep_n_samples = bdata.samples.groupby(replicate_label)[\"mask\"].sum()\n",
     "    print(rep_n_samples)\n",

diff --git a/bean/preprocessing/data_class.py b/bean/preprocessing/data_class.py
@@ -232,6 +232,8 @@ def get_size_factor(self, X: np.array):
         assert geom_mean_x.shape == (n_guides,)
         norm_count = X / geom_mean_x[:, None]
         size_factor = np.median(norm_count, axis=0)
+        if any(size_factor == 0):
+            size_factor = np.mean(norm_count, axis=0)
         assert size_factor.shape == (n_samples,)
         return size_factor
 

diff --git a/bean/preprocessing/get_alpha0.py b/bean/preprocessing/get_alpha0.py
@@ -100,11 +100,11 @@ def get_fitted_alpha0(
     a0 = ((n - 1) / (r - 1 + 1 / (1 - p)) - 1).mean(axis=0)
 
     x, y = get_valid_vals(n.log(), a0.log())
-    if len(y) < 10:
+    if len(y) < 5:
         if popt is None:
             popt = (-1.510, 0.7861)
         print(
-            f"Cannot fit log(a0) ~ log(q): data too sparse! Using pre-fitted values [b0, b1]={popt}"
+            f"Cannot fit log(a0) ~ log(q): data too sparse ({len(y)} valid values)! Using pre-fitted values [b0, b1]={popt}"
         )
     else:
         popt, pcov = curve_fit(linear, x, y)

diff --git a/bean/preprocessing/utils.py b/bean/preprocessing/utils.py
@@ -26,17 +26,31 @@ def prepare_bdata(bdata: be.ReporterScreen, args, warn, prefix: str):
     bdata = bdata.copy()
     bdata.samples["replicate"] = bdata.samples[args.replicate_col].astype("category")
     bdata.guides = bdata.guides.loc[:, ~bdata.guides.columns.duplicated()].copy()
+
+    # filter out 0-count gRNAs & samples
+    if args.selection == "sorting" or args.exclude_control_condition_for_inference:
+        bdata_test = bdata[
+            :, bdata.samples[args.condition_col] != args.control_condition
+        ]
+    else:
+        bdata_test = bdata
+    if any(bdata_test.X.sum(axis=1) == 0):
+        warn(
+            f"Filtering out {sum(bdata_test.X.sum(axis=1) == 0)} gRNAs without any counts over all samples."
+        )
+        bdata = bdata[bdata_test.X.sum(axis=1) > 0, :]
+    if any(bdata[:, bdata.samples.mask == 1].X.sum(axis=0) == 0):
+        raise ValueError(
+            f"Sample {bdata.samples.index[(bdata.samples.mask == 1) & (bdata[:,bdata.samples.mask == 1].X.sum(axis=0) == 0)]} has 0 counts. Make sure you mask that sample."
+        )
+
     if args.library_design == "variant":
         if bdata.guides[args.target_col].isnull().any():
             raise ValueError(
                 f"Some target column (bdata.guides[{args.target_col}]) value is null. Check your input file."
             )
         bdata = bdata[bdata.guides[args.target_col].argsort(), :]
-        if any(bdata.X.sum(axis=1) > 0):
-            warn(
-                f"Filtering out {sum(bdata.X.sum(axis=1) > 0)} gRNAs without any counts over all samples."
-            )
-            bdata = bdata[bdata.X.sum(axis=1) > 0, :]
+
         n_no_support_targets, bdata = filter_no_info_target(
             bdata,
             condit_col=args.condition_col,

diff --git a/bean/qc/parser.py b/bean/qc/parser.py
@@ -84,7 +84,7 @@ def parse_args(parser=None):
         help="Specify that the guide library is tiling library without 'n guides per target' design",
     )
     input_parser.add_argument(
-        "--replicate-label",
+        "--replicate-col",
         help="Label of column in `bdata.samples` that describes replicate ID.",
         type=str,
         default="replicate",
@@ -96,7 +96,7 @@ def parse_args(parser=None):
         default=None,
     )
     input_parser.add_argument(
-        "--condition-label",
+        "--condition-col",
         help="Label of column in `bdata.samples` that describes experimental condition. (sorting bin, time, etc.)",
         type=str,
         default="condition",
@@ -117,11 +117,13 @@ def parse_args(parser=None):
         "--edit-start-pos",
         help="Edit start position to quantify editing rate on, 0-based inclusive.",
         default=2,
+        type=int,
     )
     input_parser.add_argument(
         "--edit-end-pos",
         help="Edit end position to quantify editing rate on, 0-based exclusive.",
         default=7,
+        type=int,
     )
 
     input_parser.add_argument(
@@ -149,5 +151,16 @@ def parse_args(parser=None):
         type=str,
         default="bulk",
     )
-
+    parser.add_argument(
+        "--reporter-length",
+        type=int,
+        default=32,
+        help="Length of reporter sequence in the construct.",
+    )
+    parser.add_argument(
+        "--reporter-right-flank-length",
+        type=int,
+        default=6,
+        help="Length of the right-flanking nucleotides of protospacer in the reporter.",
+    )
     return parser