Skip to content

Commit

Permalink
Merge pull request #30 from pinellolab/dev
Browse files Browse the repository at this point in the history
Update to v1.2.4
  • Loading branch information
jykr authored May 7, 2024
2 parents 3bbe276 + 204fb66 commit 4b45711
Show file tree
Hide file tree
Showing 13 changed files with 484 additions and 58 deletions.
67 changes: 37 additions & 30 deletions bean/annotate/translate_allele.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def _parse_range(chr_range: str) -> Tuple[str, int, int]:
return (chrom, start, end)


def _parse_description(desc_str: str):
def _parse_description(desc_str: str) -> Tuple[Tuple[str, int, int], str]:
"""Parse description line of fasta file to get range."""
sstr = desc_str.split(" ")
strand = None
Expand All @@ -153,21 +153,38 @@ def _parse_description(desc_str: str):
return genome_range, strand


def get_cds_seq_pos_from_fasta(fasta_file_name: str) -> Tuple[List[str], List[int]]:
def get_cds_seq_pos_from_fasta(
fasta_file_name: str,
) -> Tuple[str, List[str], List[int], str]:
"""Obtain tuple of lists, first with nucleotide and second with genomic position of the nucleotide."""
exons = list(SeqIO.parse(fasta_file_name, "fasta"))
translated_seq = []
genomic_pos = []
(exon_chrom, exon_start, exon_end), strand = _parse_description(
exons[0].description
)
if strand == "-":
exons = exons[::-1]
for exon in exons:
(exon_chrom, exon_start, exon_end), strand = _parse_description(
exon.description
)
exon_seq = []
exon_pos = []
for i, nt in enumerate(str(exon.seq)):
if nt.islower():
continue
translated_seq.append(nt)
genomic_pos.append(exon_start + i)
return (exon_chrom, translated_seq, genomic_pos, strand)
exon_seq.append(nt)
if strand == "+":
exon_pos.append(exon_start + i)
else:
exon_pos.append(exon_end - i)
if strand == "-":
exon_seq = revcomp(exon_seq)
exon_pos = exon_pos[::-1]
translated_seq.extend(exon_seq)
genomic_pos.extend(exon_pos)
return exon_chrom, translated_seq, genomic_pos, {"+": 1, "-": -1}[strand]


def _translate_single_codon(
Expand Down Expand Up @@ -207,12 +224,10 @@ def __init__(self):
self.genomic_pos: Sequence = []

@classmethod
def from_fasta(cls, fasta_file_name, gene_name, suppressMessage=True):
def from_fasta(cls, fasta_file_name, gene_name=None, suppressMessage=True):
cds = cls()
# if fasta_file_name is not None:
# if not suppressMessage:
# raise ValueError("No fasta file provided as reference: using LDLR")
# fasta_file_name = os.path.dirname(be.__file__) + "/annotate/ldlr_exons.fa"
if gene_name is None:
gene_name = os.path.basename(fasta_file_name).upper().split(".FA")[0]
if gene_name not in cls.gene_info_dict:
chrom, translated_seq, genomic_pos, strand = get_cds_seq_pos_from_fasta(
fasta_file_name
Expand Down Expand Up @@ -252,18 +267,10 @@ def from_gene_name(cls, gene_name, ref_version: str = "GRCh38"):
cds.translated_seq = deepcopy(cls.gene_info_dict[gene_name]["translated_seq"])
cds.genomic_pos = cls.gene_info_dict[gene_name]["genomic_pos"]
cds.nt = cds.gene_info_dict[gene_name]["translated_seq"] # in sense strand
# print(cds.gene_name + ":" + "".join(cds.nt))
# if cds.strand == -1:
# cds.nt = revcomp(cds.nt)
cds.pos = np.array(cds.genomic_pos)
cds.edited_nt = deepcopy(cds.nt)
return cds

# def translate(self):
# if self.strand == -1:
# self.edited_nt = revcomp(self.edited_nt)
# self.aa = _translate(self.edited_nt, codon_map)

def _get_relative_nt_pos(self, absolute_pos):
"""0-based relative position"""
nt_relative_pos = np.where(self.pos == absolute_pos)[0]
Expand Down Expand Up @@ -665,16 +672,16 @@ def annotate_edit(
)
edit_info["coding"] = ""
edit_info.loc[edit_info.pos.map(lambda s: s.startswith("A")), "coding"] = "coding"
edit_info.loc[
edit_info.pos.map(lambda s: not s.startswith("A")), "coding"
] = "noncoding"
edit_info.loc[edit_info.pos.map(lambda s: not s.startswith("A")), "coding"] = (
"noncoding"
)
if control_tag is not None:
edit_info.loc[
edit_info.pos.map(lambda s: control_tag in s), "group"
] = "negctrl"
edit_info.loc[
edit_info.pos.map(lambda s: control_tag in s), "coding"
] = "negctrl"
edit_info.loc[edit_info.pos.map(lambda s: control_tag in s), "group"] = (
"negctrl"
)
edit_info.loc[edit_info.pos.map(lambda s: control_tag in s), "coding"] = (
"negctrl"
)
edit_info.loc[
(edit_info.coding == "noncoding") & (edit_info.group != "negctrl"), "int_pos"
] = edit_info.loc[
Expand All @@ -691,9 +698,9 @@ def annotate_edit(
(edit_info.alt == edit_info.ref) & (edit_info.coding == "coding"), "group"
] = "syn"
if splice_sites is not None:
edit_info.loc[
edit_info.pos.isin(splice_sites.astype(str)), "group"
] = "splicing"
edit_info.loc[edit_info.pos.isin(splice_sites.astype(str)), "group"] = (
"splicing"
)
edit_info.loc[edit_info.int_pos < -100, "group"] = "negctrl"
edit_info.loc[edit_info.int_pos < -100, "coding"] = "negctrl"
return edit_info
6 changes: 4 additions & 2 deletions bean/cli/qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,17 @@ def main(args):
posctrl_col=args.posctrl_col,
posctrl_val=args.posctrl_val,
lfc_thres=args.lfc_thres,
replicate_label=args.replicate_label,
condition_label=args.condition_label,
replicate_label=args.replicate_col,
condition_label=args.condition_col,
comp_cond1=args.lfc_cond1,
comp_cond2=args.lfc_cond2,
ctrl_cond=args.control_condition,
exp_id=args.out_report_prefix,
recalculate_edits=(not args.dont_recalculate_edits),
base_edit_data=args.base_edit_data,
remove_bad_replicates=args.remove_bad_replicates,
reporter_length=args.reporter_length,
reporter_right_flank_length=args.reporter_right_flank_length,
),
kernel_name="bean_python3",
)
Expand Down
10 changes: 9 additions & 1 deletion bean/framework/ReporterScreen.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,8 @@ def get_edit_mat_from_uns(
rel_pos_is_reporter=False,
target_pos_col="target_pos",
edit_count_key="edit_counts",
reporter_length: int = 32,
reporter_right_flank_length: int = 6,
):
"""
Get the edit matrix from `.uns[edit_count_key]` to store the result in `.layers["edits"]`
Expand All @@ -374,6 +376,10 @@ def get_edit_mat_from_uns(
target_base_edit = self.target_base_changes
if match_target_position is None:
match_target_position = not self.tiling
if "reporter_length" in self.uns:
reporter_length = self.uns["reporter_length"]
if "reproter_right_flank_length" in self.uns:
reporter_right_flank_length = self.uns["reporter_right_flank_length"]
if edit_count_key not in self.uns or len(self.uns[edit_count_key]) == 0:
raise ValueError(
"Edit count isn't calculated. "
Expand All @@ -400,7 +406,9 @@ def get_edit_mat_from_uns(
drop=True
)
edits["guide_start_pos"] = (
32 - 6 - guide_len[edits.guide_idx].reset_index(drop=True)
reporter_length
- reporter_right_flank_length
- guide_len[edits.guide_idx].reset_index(drop=True)
)
if not match_target_position:
edits["rel_pos"] = edits.edit.map(lambda e: e.rel_pos)
Expand Down
61 changes: 48 additions & 13 deletions bean/notebooks/sample_quality_report.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,9 @@
"recalculate_edits = True\n",
"tiling = None\n",
"base_edit_data = True\n",
"remove_bad_replicates = False"
"remove_bad_replicates = False\n",
"reporter_length = 32\n",
"reporter_right_flank_length = 6"
]
},
{
Expand All @@ -84,6 +86,8 @@
" tiling = bdata.uns['tiling']\n",
"else:\n",
" raise ValueError(\"Ambiguous assignment if the screen is a tiling screen. Provide `--tiling=True` or `tiling=False`.\")\n",
"bdata.uns[\"reporter_length\"] = reporter_length\n",
"bdata.uns[\"reporter_right_flank_length\"] = reporter_right_flank_length\n",
"if posctrl_col:\n",
" if posctrl_col not in bdata.guides.columns:\n",
" raise ValueError(f\"--posctrl-col argument '{posctrl_col}' is not present in the input ReporterScreen.guides.columns {bdata.guides.columns}. If you do not want to use positive control gRNA annotation for LFC calculation, feed --posctrl-col='' instead.\")\n",
Expand Down Expand Up @@ -128,6 +132,18 @@
"bdata.samples"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for qc_col in [\"gini_X\", \"median_corr_X\", f\"median_lfc_corr.{comp_cond1}_{comp_cond2}\",\"mean_editing_rate\", \"mask\"]:\n",
" if qc_col in bdata.samples:\n",
" del bdata.samples[qc_col]\n",
"n_cols_samples = len(bdata.samples.columns)"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -384,25 +400,28 @@
"metadata": {},
"outputs": [],
"source": [
"bdata.samples[\"mask\"] = 1\n",
"mdata = bdata.samples.copy()\n",
"# Data has positive control\n",
"bdata.samples.loc[\n",
"for col in mdata.columns.tolist():\n",
" mdata[col]=1.0\n",
"\n",
"mdata.loc[\n",
" bdata.samples.median_corr_X.isnull() | (bdata.samples.median_corr_X < count_correlation_thres),\n",
" \"mask\",\n",
"] = 0\n",
" \"median_corr_X\",\n",
"] = 0.0\n",
"if \"mean_editing_rate\" in bdata.samples.columns.tolist():\n",
" bdata.samples.loc[bdata.samples.mean_editing_rate < edit_rate_thres, \"mask\"] = 0\n",
" mdata.loc[bdata.samples.mean_editing_rate < edit_rate_thres, \"mean_editing_rate\"] = 0\n",
"\n",
"bdata.samples.loc[\n",
"mdata.loc[\n",
" bdata.samples[f\"median_lfc_corr.{comp_cond1}_{comp_cond2}\"] < lfc_thres,\n",
" \"mask\",\n",
"] = 0\n",
" f\"median_lfc_corr.{comp_cond1}_{comp_cond2}\",\n",
"] = 0.0\n",
"if posctrl_col:\n",
" print(\"filter with posctrl LFC\")\n",
" bdata.samples.loc[\n",
" mdata.loc[\n",
" bdata.samples[f\"median_lfc_corr.{comp_cond1}_{comp_cond2}\"].isnull(),\n",
" \"mask\",\n",
" ] = 0"
" f\"median_lfc_corr.{comp_cond1}_{comp_cond2}\",\n",
" ] = 0.0\n"
]
},
{
Expand All @@ -411,7 +430,19 @@
"metadata": {},
"outputs": [],
"source": [
"bdata.samples.style.background_gradient(cmap=\"coolwarm_r\")"
"from matplotlib import colors\n",
"def b_g(s, cmap='coolwarm_r', low=0, high=1):\n",
" a = mdata.loc[:,s.name].copy()\n",
" if s.name not in mdata.columns.tolist()[n_cols_samples:]:\n",
" a[:] = 1.0\n",
" # rng = a.max() - a.min()\n",
" # norm = colors.Normalize(a.min() - (rng * low),\n",
" # a.max() + (rng * high))\n",
" # normed = norm(a.values)\n",
" c = [colors.rgb2hex(x) for x in plt.cm.get_cmap(cmap)(a.values)]\n",
" return ['background-color: %s' % color for color in c]\n",
"print(\"Failing QC is shown as red:\")\n",
"bdata.samples.style.apply(b_g)"
]
},
{
Expand All @@ -421,6 +452,10 @@
"outputs": [],
"source": [
"# leave replicate with more than 1 sorting bin data\n",
"print(mdata)\n",
"print(n_cols_samples)\n",
"\n",
"bdata.samples[\"mask\"] = mdata.iloc[:,n_cols_samples:].astype(int).all(axis=1).astype(int).tolist()\n",
"if remove_bad_replicates:\n",
" rep_n_samples = bdata.samples.groupby(replicate_label)[\"mask\"].sum()\n",
" print(rep_n_samples)\n",
Expand Down
2 changes: 2 additions & 0 deletions bean/preprocessing/data_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,8 @@ def get_size_factor(self, X: np.array):
assert geom_mean_x.shape == (n_guides,)
norm_count = X / geom_mean_x[:, None]
size_factor = np.median(norm_count, axis=0)
if any(size_factor == 0):
size_factor = np.mean(norm_count, axis=0)
assert size_factor.shape == (n_samples,)
return size_factor

Expand Down
4 changes: 2 additions & 2 deletions bean/preprocessing/get_alpha0.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,11 +100,11 @@ def get_fitted_alpha0(
a0 = ((n - 1) / (r - 1 + 1 / (1 - p)) - 1).mean(axis=0)

x, y = get_valid_vals(n.log(), a0.log())
if len(y) < 10:
if len(y) < 5:
if popt is None:
popt = (-1.510, 0.7861)
print(
f"Cannot fit log(a0) ~ log(q): data too sparse! Using pre-fitted values [b0, b1]={popt}"
f"Cannot fit log(a0) ~ log(q): data too sparse ({len(y)} valid values)! Using pre-fitted values [b0, b1]={popt}"
)
else:
popt, pcov = curve_fit(linear, x, y)
Expand Down
24 changes: 19 additions & 5 deletions bean/preprocessing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,31 @@ def prepare_bdata(bdata: be.ReporterScreen, args, warn, prefix: str):
bdata = bdata.copy()
bdata.samples["replicate"] = bdata.samples[args.replicate_col].astype("category")
bdata.guides = bdata.guides.loc[:, ~bdata.guides.columns.duplicated()].copy()

# filter out 0-count gRNAs & samples
if args.selection == "sorting" or args.exclude_control_condition_for_inference:
bdata_test = bdata[
:, bdata.samples[args.condition_col] != args.control_condition
]
else:
bdata_test = bdata
if any(bdata_test.X.sum(axis=1) == 0):
warn(
f"Filtering out {sum(bdata_test.X.sum(axis=1) == 0)} gRNAs without any counts over all samples."
)
bdata = bdata[bdata_test.X.sum(axis=1) > 0, :]
if any(bdata[:, bdata.samples.mask == 1].X.sum(axis=0) == 0):
raise ValueError(
f"Sample {bdata.samples.index[(bdata.samples.mask == 1) & (bdata[:,bdata.samples.mask == 1].X.sum(axis=0) == 0)]} has 0 counts. Make sure you mask that sample."
)

if args.library_design == "variant":
if bdata.guides[args.target_col].isnull().any():
raise ValueError(
f"Some target column (bdata.guides[{args.target_col}]) value is null. Check your input file."
)
bdata = bdata[bdata.guides[args.target_col].argsort(), :]
if any(bdata.X.sum(axis=1) > 0):
warn(
f"Filtering out {sum(bdata.X.sum(axis=1) > 0)} gRNAs without any counts over all samples."
)
bdata = bdata[bdata.X.sum(axis=1) > 0, :]

n_no_support_targets, bdata = filter_no_info_target(
bdata,
condit_col=args.condition_col,
Expand Down
19 changes: 16 additions & 3 deletions bean/qc/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def parse_args(parser=None):
help="Specify that the guide library is tiling library without 'n guides per target' design",
)
input_parser.add_argument(
"--replicate-label",
"--replicate-col",
help="Label of column in `bdata.samples` that describes replicate ID.",
type=str,
default="replicate",
Expand All @@ -96,7 +96,7 @@ def parse_args(parser=None):
default=None,
)
input_parser.add_argument(
"--condition-label",
"--condition-col",
help="Label of column in `bdata.samples` that describes experimental condition. (sorting bin, time, etc.)",
type=str,
default="condition",
Expand All @@ -117,11 +117,13 @@ def parse_args(parser=None):
"--edit-start-pos",
help="Edit start position to quantify editing rate on, 0-based inclusive.",
default=2,
type=int,
)
input_parser.add_argument(
"--edit-end-pos",
help="Edit end position to quantify editing rate on, 0-based exclusive.",
default=7,
type=int,
)

input_parser.add_argument(
Expand Down Expand Up @@ -149,5 +151,16 @@ def parse_args(parser=None):
type=str,
default="bulk",
)

parser.add_argument(
"--reporter-length",
type=int,
default=32,
help="Length of reporter sequence in the construct.",
)
parser.add_argument(
"--reporter-right-flank-length",
type=int,
default=6,
help="Length of the right-flanking nucleotides of protospacer in the reporter.",
)
return parser
Loading

0 comments on commit 4b45711

Please sign in to comment.