Merge pull request #17 from pinellolab/dev

Debug translation & run
pinellolab · Dec 5, 2023 · 8bd29ca · 8bd29ca
2 parents 6c64277 + 3a1d2c1
commit 8bd29ca
Show file tree

Hide file tree

Showing 5 changed files with 51 additions and 18 deletions.
diff --git a/README.md b/README.md
@@ -165,7 +165,7 @@ bean-profile my_sorting_screen.h5ad -o output_prefix `# Prefix for editing profi
 ### Output
 Above command produces `prefix_editing_preference.[html,ipynb]` as editing preferences ([see example](notebooks/profile_editing_preference.ipynb)).  
 
-<img src="imgs/profile_output.png" alt="Allele translation" width="700" style="background-color:white;"/>  
+<img src="imgs/profile_output.png" alt="Allele translation" width="700" />  
 
 ### Parameters
   * `-o`, `--output-prefix` (default: `None`): Output prefix of editing pattern report (prefix.html, prefix.ipynb). If not provided, base name of `bdata_path` is used.
@@ -188,7 +188,7 @@ bean-qc \
 
 `bean-qc` supports following quality control and masks samples with low quality. Specifically:  
 
-<img src="imgs/qc_output.png" alt="Allele translation" width="900" style="background-color:white;"/>  
+<img src="imgs/qc_output.png" alt="Allele translation" width="900"/>  
 
 * Plots guide coverage and the uniformity of coverage
 * Guide count correlation between samples
@@ -402,5 +402,5 @@ Python package `bean` supports multiple data wrangling functionalities for `Repo
 * Full pipeline takes 90.1s in GitHub Action for toy dataset of 2 replicates and 30 guides.
 
 ## Citation
-If you have used BEAN, please cite:  
+If you have used BEAN for your analysis, please cite:  
 Ryu, J. et al. Joint genotypic and phenotypic outcome modeling improves base editing variant effect quantification. medRxiv (2023) doi:10.1101/2023.09.08.23295253
diff --git a/bean/annotate/translate_allele.py b/bean/annotate/translate_allele.py
@@ -225,11 +225,11 @@ def from_fasta(cls, fasta_file_name, gene_name, suppressMessage=True):
         return cds
 
     @classmethod
-    def from_gene_name(cls, gene_name):
+    def from_gene_name(cls, gene_name, ref_version: str = "GRCh38"):
         cds = cls()
         if gene_name not in cls.gene_info_dict:
             chrom, translated_seq, genomic_pos, strand = get_cds_seq_pos_from_gene_name(
-                gene_name
+                gene_name, ref_version
             )
             cls.gene_info_dict[gene_name] = {
                 "chrom": chrom,
@@ -329,8 +329,8 @@ def get_mismatch_df():
     ).drop_duplicates()
 
 
-def get_cds_dict(gene_names):
-    return {gname: CDS.from_gene_name(gname) for gname in gene_names}
+def get_cds_dict(gene_names, ref_version: str = "GRCh38"):
+    return {gname: CDS.from_gene_name(gname, ref_version) for gname in gene_names}
 
 
 def export_gene_info_to_json(gene_dict, write_path=".tmp_gene_info.csv"):
@@ -418,14 +418,15 @@ def get_allele_aa_change_single_gene(
     fasta_file: str = None,
     fasta_file_id: str = None,
     include_synonymous: bool = True,
+    ref_version: str = "GRCh38",
 ):
     """
     Obtain amino acid changes
     """
     if gene_name is None and fasta_file is not None:
         cds = CDS.from_fasta(fasta_file, fasta_file_id)
     elif gene_name is not None and fasta_file is None:
-        cds = CDS.from_gene_name(gene_name)
+        cds = CDS.from_gene_name(gene_name, ref_version)
     else:
         raise ValueError("Only one of `gene_name` or `fasta_file` should be provided.")
     return cds.get_aa_change(allele, include_synonymous)

diff --git a/bean/annotate/utils.py b/bean/annotate/utils.py
@@ -97,7 +97,7 @@ def get_mane_transcript_id(gene_name: str):
     return mane_transcript_id, id_version
 
 
-def get_exons_from_transcript_id(transcript_id: str, id_version: int):
+def get_exons_from_transcript_id(transcript_id: str, id_version: int, ref_version: str = "GRCh38"):
     """
     Retrieves the exons and the start position of the coding sequence (CDS) for a given transcript ID and version.
 
@@ -112,9 +112,18 @@ def get_exons_from_transcript_id(transcript_id: str, id_version: int):
     response = requests.get(api_url, headers={"Content-Type": "application/json"})
     transcript_json = response.json()
     if transcript_json["count"] != 1:
-        raise ValueError(
-            f"Non-unique entry for transcript ID and version:\n{transcript_json}"
-        )
+        if transcript_json["count"] > 1:
+            api_url = f"http://tark.ensembl.org/api/transcript/?stable_id={transcript_id}&stable_id_version={id_version}&assembly_name={ref_version}&expand=exons"
+            response = requests.get(api_url, headers={"Content-Type": "application/json"})
+            transcript_json = response.json()
+            if transcript_json["count"] != 1:
+                raise ValueError(
+                    f"Non-unique entry for transcript ID , version {id_version} and assembly {ref_version}:\n{transcript_json}"
+                )
+        else:
+            raise ValueError(
+                f"No entry found for transcript ID {transcript_id}, version {id_version} and assembly {ref_version}"
+            )
     transcript_record = transcript_json["results"][0]
     exons_list = transcript_record["exons"]
     strand = transcript_record["loc_strand"]  # +1/-1
@@ -186,11 +195,11 @@ def get_exon_pos_seq(exon_id, id_version, cds_start, cds_end, ref_version="GRCh3
     return chrom, list(sequence), list(range(start_pos, end_pos + 1))
 
 
-def get_cds_seq_pos_from_gene_name(gene_name: str):
+def get_cds_seq_pos_from_gene_name(gene_name: str, ref_version: str = "GRCh38"):
     transcript_id, id_version = get_mane_transcript_id(gene_name)
     print(f"MANE transcript ID {transcript_id} for {gene_name} will be used.")
     exons_list, cds_start, cds_end, strand = get_exons_from_transcript_id(
-        transcript_id, id_version
+        transcript_id, id_version, ref_version
     )
     if strand == -1:
         exons_list = exons_list[::-1]

diff --git a/bean/model/run.py b/bean/model/run.py
@@ -309,6 +309,25 @@ def check_args(args, bdata):
             raise ValueError(
                 f"{args.bdata_path} does not have specified sample mask column {args.sample_mask_col} in .samples"
             )
+    if args.condition_col not in bdata.samples.columns:
+        raise ValueError(
+            f"Condition column set by `--condition-col` {args.condition_col} not in ReporterScreen.samples.columns:{bdata.samples.columns}. Check your input."
+        )
+    if args.control_condition_label not in bdata.samples[args.condition_col].tolist():
+        raise ValueError(
+            f"No sample has control label (set by `--control-condition-label`) {args.control_condition_label} in ReporterScreen.samples[{args.condition_col}]: {bdata.samples[args.condition_col]}. Check your input."
+        )
+    if args.replicate_col not in bdata.samples.columns:
+        raise ValueError(
+            f"Condition column set by `--replicate-col` {args.replicate_col} not in ReporterScreen.samples.columns:{bdata.samples.columns}. Check your input."
+        )
+    if (
+        args.control_guide_tag is not None
+        and not bdata.guides.index.map(lambda s: args.control_guide_tag in s).any()
+    ):
+        raise ValueError(
+            f"Negative control guide label {args.control_guide_tag} provided by `--control-guide-tag` doesn't appear in any of the guide names. Check your input."
+        )
     if args.alpha_if_overdispersion_fitting_fails is not None:
         try:
             b0, b1 = args.alpha_if_overdispersion_fitting_fails.split(",")

diff --git a/bean/preprocessing/utils.py b/bean/preprocessing/utils.py
@@ -214,12 +214,16 @@ def _assign_rep_ids_and_sort(
     screen: be.ReporterScreen, rep_col: str, condition_column: str = None
 ) -> be.ReporterScreen:
     """Assign replicate IDs to samples and sort them accordingly."""
+    if rep_col not in screen.samples.columns:
+        raise ValueError(
+            f"{rep_col} not in columns of ReporterScreen.samples with following columns: {screen.samples.columns}. Check your input or adjust `--replicate-col` of `bean-run` command as the existing column name specifying experimental replicates."
+        )
     for i, rep in enumerate(sorted(screen.samples[rep_col].unique())):
         screen.samples.loc[screen.samples[rep_col] == rep, f"{rep_col}_id"] = i
-        if condition_column is None:
-            sort_key = f"{rep_col}_id"
-        else:
-            sort_key = [f"{rep_col}_id", f"{condition_column}_id"]
+    if condition_column is None:
+        sort_key = f"{rep_col}_id"
+    else:
+        sort_key = [f"{rep_col}_id", f"{condition_column}_id"]
     screen = screen[
         :,
         screen.samples.sort_values(sort_key).index,