Skip to content

Commit

Permalink
Merge pull request #17 from pinellolab/dev
Browse files Browse the repository at this point in the history
Debug translation & run
  • Loading branch information
jykr authored Dec 5, 2023
2 parents 6c64277 + 3a1d2c1 commit 8bd29ca
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 18 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ bean-profile my_sorting_screen.h5ad -o output_prefix `# Prefix for editing profi
### Output
Above command produces `prefix_editing_preference.[html,ipynb]` as editing preferences ([see example](notebooks/profile_editing_preference.ipynb)).

<img src="imgs/profile_output.png" alt="Allele translation" width="700" style="background-color:white;"/>
<img src="imgs/profile_output.png" alt="Allele translation" width="700" />

### Parameters
* `-o`, `--output-prefix` (default: `None`): Output prefix of editing pattern report (prefix.html, prefix.ipynb). If not provided, base name of `bdata_path` is used.
Expand All @@ -188,7 +188,7 @@ bean-qc \

`bean-qc` supports following quality control and masks samples with low quality. Specifically:

<img src="imgs/qc_output.png" alt="Allele translation" width="900" style="background-color:white;"/>
<img src="imgs/qc_output.png" alt="Allele translation" width="900"/>

* Plots guide coverage and the uniformity of coverage
* Guide count correlation between samples
Expand Down Expand Up @@ -402,5 +402,5 @@ Python package `bean` supports multiple data wrangling functionalities for `Repo
* Full pipeline takes 90.1s in GitHub Action for toy dataset of 2 replicates and 30 guides.
## Citation
If you have used BEAN, please cite:
If you have used BEAN for your analysis, please cite:
Ryu, J. et al. Joint genotypic and phenotypic outcome modeling improves base editing variant effect quantification. medRxiv (2023) doi:10.1101/2023.09.08.23295253
11 changes: 6 additions & 5 deletions bean/annotate/translate_allele.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,11 +225,11 @@ def from_fasta(cls, fasta_file_name, gene_name, suppressMessage=True):
return cds

@classmethod
def from_gene_name(cls, gene_name):
def from_gene_name(cls, gene_name, ref_version: str = "GRCh38"):
cds = cls()
if gene_name not in cls.gene_info_dict:
chrom, translated_seq, genomic_pos, strand = get_cds_seq_pos_from_gene_name(
gene_name
gene_name, ref_version
)
cls.gene_info_dict[gene_name] = {
"chrom": chrom,
Expand Down Expand Up @@ -329,8 +329,8 @@ def get_mismatch_df():
).drop_duplicates()


def get_cds_dict(gene_names):
return {gname: CDS.from_gene_name(gname) for gname in gene_names}
def get_cds_dict(gene_names, ref_version: str = "GRCh38"):
return {gname: CDS.from_gene_name(gname, ref_version) for gname in gene_names}


def export_gene_info_to_json(gene_dict, write_path=".tmp_gene_info.csv"):
Expand Down Expand Up @@ -418,14 +418,15 @@ def get_allele_aa_change_single_gene(
fasta_file: str = None,
fasta_file_id: str = None,
include_synonymous: bool = True,
ref_version: str = "GRCh38",
):
"""
Obtain amino acid changes
"""
if gene_name is None and fasta_file is not None:
cds = CDS.from_fasta(fasta_file, fasta_file_id)
elif gene_name is not None and fasta_file is None:
cds = CDS.from_gene_name(gene_name)
cds = CDS.from_gene_name(gene_name, ref_version)
else:
raise ValueError("Only one of `gene_name` or `fasta_file` should be provided.")
return cds.get_aa_change(allele, include_synonymous)
Expand Down
21 changes: 15 additions & 6 deletions bean/annotate/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def get_mane_transcript_id(gene_name: str):
return mane_transcript_id, id_version


def get_exons_from_transcript_id(transcript_id: str, id_version: int):
def get_exons_from_transcript_id(transcript_id: str, id_version: int, ref_version: str = "GRCh38"):
"""
Retrieves the exons and the start position of the coding sequence (CDS) for a given transcript ID and version.
Expand All @@ -112,9 +112,18 @@ def get_exons_from_transcript_id(transcript_id: str, id_version: int):
response = requests.get(api_url, headers={"Content-Type": "application/json"})
transcript_json = response.json()
if transcript_json["count"] != 1:
raise ValueError(
f"Non-unique entry for transcript ID and version:\n{transcript_json}"
)
if transcript_json["count"] > 1:
api_url = f"http://tark.ensembl.org/api/transcript/?stable_id={transcript_id}&stable_id_version={id_version}&assembly_name={ref_version}&expand=exons"
response = requests.get(api_url, headers={"Content-Type": "application/json"})
transcript_json = response.json()
if transcript_json["count"] != 1:
raise ValueError(
f"Non-unique entry for transcript ID , version {id_version} and assembly {ref_version}:\n{transcript_json}"
)
else:
raise ValueError(
f"No entry found for transcript ID {transcript_id}, version {id_version} and assembly {ref_version}"
)
transcript_record = transcript_json["results"][0]
exons_list = transcript_record["exons"]
strand = transcript_record["loc_strand"] # +1/-1
Expand Down Expand Up @@ -186,11 +195,11 @@ def get_exon_pos_seq(exon_id, id_version, cds_start, cds_end, ref_version="GRCh3
return chrom, list(sequence), list(range(start_pos, end_pos + 1))


def get_cds_seq_pos_from_gene_name(gene_name: str):
def get_cds_seq_pos_from_gene_name(gene_name: str, ref_version: str = "GRCh38"):
transcript_id, id_version = get_mane_transcript_id(gene_name)
print(f"MANE transcript ID {transcript_id} for {gene_name} will be used.")
exons_list, cds_start, cds_end, strand = get_exons_from_transcript_id(
transcript_id, id_version
transcript_id, id_version, ref_version
)
if strand == -1:
exons_list = exons_list[::-1]
Expand Down
19 changes: 19 additions & 0 deletions bean/model/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,25 @@ def check_args(args, bdata):
raise ValueError(
f"{args.bdata_path} does not have specified sample mask column {args.sample_mask_col} in .samples"
)
if args.condition_col not in bdata.samples.columns:
raise ValueError(
f"Condition column set by `--condition-col` {args.condition_col} not in ReporterScreen.samples.columns:{bdata.samples.columns}. Check your input."
)
if args.control_condition_label not in bdata.samples[args.condition_col].tolist():
raise ValueError(
f"No sample has control label (set by `--control-condition-label`) {args.control_condition_label} in ReporterScreen.samples[{args.condition_col}]: {bdata.samples[args.condition_col]}. Check your input."
)
if args.replicate_col not in bdata.samples.columns:
raise ValueError(
f"Condition column set by `--replicate-col` {args.replicate_col} not in ReporterScreen.samples.columns:{bdata.samples.columns}. Check your input."
)
if (
args.control_guide_tag is not None
and not bdata.guides.index.map(lambda s: args.control_guide_tag in s).any()
):
raise ValueError(
f"Negative control guide label {args.control_guide_tag} provided by `--control-guide-tag` doesn't appear in any of the guide names. Check your input."
)
if args.alpha_if_overdispersion_fitting_fails is not None:
try:
b0, b1 = args.alpha_if_overdispersion_fitting_fails.split(",")
Expand Down
12 changes: 8 additions & 4 deletions bean/preprocessing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,12 +214,16 @@ def _assign_rep_ids_and_sort(
screen: be.ReporterScreen, rep_col: str, condition_column: str = None
) -> be.ReporterScreen:
"""Assign replicate IDs to samples and sort them accordingly."""
if rep_col not in screen.samples.columns:
raise ValueError(
f"{rep_col} not in columns of ReporterScreen.samples with following columns: {screen.samples.columns}. Check your input or adjust `--replicate-col` of `bean-run` command as the existing column name specifying experimental replicates."
)
for i, rep in enumerate(sorted(screen.samples[rep_col].unique())):
screen.samples.loc[screen.samples[rep_col] == rep, f"{rep_col}_id"] = i
if condition_column is None:
sort_key = f"{rep_col}_id"
else:
sort_key = [f"{rep_col}_id", f"{condition_column}_id"]
if condition_column is None:
sort_key = f"{rep_col}_id"
else:
sort_key = [f"{rep_col}_id", f"{condition_column}_id"]
screen = screen[
:,
screen.samples.sort_values(sort_key).index,
Expand Down

0 comments on commit 8bd29ca

Please sign in to comment.