diff --git a/bean/annotate/translate_allele.py b/bean/annotate/translate_allele.py index 6310717..42953aa 100755 --- a/bean/annotate/translate_allele.py +++ b/bean/annotate/translate_allele.py @@ -137,7 +137,7 @@ def _parse_range(chr_range: str) -> Tuple[str, int, int]: return (chrom, start, end) -def _parse_description(desc_str: str): +def _parse_description(desc_str: str) -> Tuple[Tuple[str, int, int], str]: """Parse description line of fasta file to get range.""" sstr = desc_str.split(" ") strand = None @@ -153,21 +153,38 @@ def _parse_description(desc_str: str): return genome_range, strand -def get_cds_seq_pos_from_fasta(fasta_file_name: str) -> Tuple[List[str], List[int]]: +def get_cds_seq_pos_from_fasta( + fasta_file_name: str, +) -> Tuple[str, List[str], List[int], str]: """Obtain tuple of lists, first with nucleotide and second with genomic position of the nucleotide.""" exons = list(SeqIO.parse(fasta_file_name, "fasta")) translated_seq = [] genomic_pos = [] + (exon_chrom, exon_start, exon_end), strand = _parse_description( + exons[0].description + ) + if strand == "-": + exons = exons[::-1] for exon in exons: (exon_chrom, exon_start, exon_end), strand = _parse_description( exon.description ) + exon_seq = [] + exon_pos = [] for i, nt in enumerate(str(exon.seq)): if nt.islower(): continue - translated_seq.append(nt) - genomic_pos.append(exon_start + i) - return (exon_chrom, translated_seq, genomic_pos, strand) + exon_seq.append(nt) + if strand == "+": + exon_pos.append(exon_start + i) + else: + exon_pos.append(exon_end - i) + if strand == "-": + exon_seq = revcomp(exon_seq) + exon_pos = exon_pos[::-1] + translated_seq.extend(exon_seq) + genomic_pos.extend(exon_pos) + return exon_chrom, translated_seq, genomic_pos, {"+": 1, "-": -1}[strand] def _translate_single_codon( @@ -207,12 +224,10 @@ def __init__(self): self.genomic_pos: Sequence = [] @classmethod - def from_fasta(cls, fasta_file_name, gene_name, suppressMessage=True): + def from_fasta(cls, fasta_file_name, gene_name=None, suppressMessage=True): cds = cls() - # if fasta_file_name is not None: - # if not suppressMessage: - # raise ValueError("No fasta file provided as reference: using LDLR") - # fasta_file_name = os.path.dirname(be.__file__) + "/annotate/ldlr_exons.fa" + if gene_name is None: + gene_name = os.path.basename(fasta_file_name).upper().split(".FA")[0] if gene_name not in cls.gene_info_dict: chrom, translated_seq, genomic_pos, strand = get_cds_seq_pos_from_fasta( fasta_file_name @@ -252,18 +267,10 @@ def from_gene_name(cls, gene_name, ref_version: str = "GRCh38"): cds.translated_seq = deepcopy(cls.gene_info_dict[gene_name]["translated_seq"]) cds.genomic_pos = cls.gene_info_dict[gene_name]["genomic_pos"] cds.nt = cds.gene_info_dict[gene_name]["translated_seq"] # in sense strand - # print(cds.gene_name + ":" + "".join(cds.nt)) - # if cds.strand == -1: - # cds.nt = revcomp(cds.nt) cds.pos = np.array(cds.genomic_pos) cds.edited_nt = deepcopy(cds.nt) return cds - # def translate(self): - # if self.strand == -1: - # self.edited_nt = revcomp(self.edited_nt) - # self.aa = _translate(self.edited_nt, codon_map) - def _get_relative_nt_pos(self, absolute_pos): """0-based relative position""" nt_relative_pos = np.where(self.pos == absolute_pos)[0] @@ -665,16 +672,16 @@ def annotate_edit( ) edit_info["coding"] = "" edit_info.loc[edit_info.pos.map(lambda s: s.startswith("A")), "coding"] = "coding" - edit_info.loc[ - edit_info.pos.map(lambda s: not s.startswith("A")), "coding" - ] = "noncoding" + edit_info.loc[edit_info.pos.map(lambda s: not s.startswith("A")), "coding"] = ( + "noncoding" + ) if control_tag is not None: - edit_info.loc[ - edit_info.pos.map(lambda s: control_tag in s), "group" - ] = "negctrl" - edit_info.loc[ - edit_info.pos.map(lambda s: control_tag in s), "coding" - ] = "negctrl" + edit_info.loc[edit_info.pos.map(lambda s: control_tag in s), "group"] = ( + "negctrl" + ) + edit_info.loc[edit_info.pos.map(lambda s: control_tag in s), "coding"] = ( + "negctrl" + ) edit_info.loc[ (edit_info.coding == "noncoding") & (edit_info.group != "negctrl"), "int_pos" ] = edit_info.loc[ @@ -691,9 +698,9 @@ def annotate_edit( (edit_info.alt == edit_info.ref) & (edit_info.coding == "coding"), "group" ] = "syn" if splice_sites is not None: - edit_info.loc[ - edit_info.pos.isin(splice_sites.astype(str)), "group" - ] = "splicing" + edit_info.loc[edit_info.pos.isin(splice_sites.astype(str)), "group"] = ( + "splicing" + ) edit_info.loc[edit_info.int_pos < -100, "group"] = "negctrl" edit_info.loc[edit_info.int_pos < -100, "coding"] = "negctrl" return edit_info diff --git a/docs/exon_fa_format.md b/docs/exon_fa_format.md index 2498b39..4902a41 100755 --- a/docs/exon_fa_format.md +++ b/docs/exon_fa_format.md @@ -1,5 +1,5 @@ # Input .fa file format for `bean-filter` -You can provide custom FASTA file with exon sequence entries. Currently only supports positive strand genes. +You can provide custom FASTA file with exon sequence entries. * Exon FASTA files can be downloaded from UCSC Genomic sequences / Table Browser: [see the instruction video](https://www.youtube.com/watch?v=T4E0Ez5Vjz8) * You can manually format as: diff --git a/setup.py b/setup.py index a747c45..800208f 100755 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ setup( name="crispr-bean", - version="1.2.2", + version="1.2.3", python_requires=">=3.8.0", author="Jayoung Ryu", author_email="jayoung_ryu@g.harvard.edu", diff --git a/tests/data/abca1.fa b/tests/data/abca1.fa new file mode 100644 index 0000000..dbf6829 --- /dev/null +++ b/tests/data/abca1.fa @@ -0,0 +1,280 @@ +>hg38_knownGene_ENST00000374736.8_0 range=chr9:104927935-104928155 5'pad=0 3'pad=0 strand=- repeatMasking=none +gtaattgcgagcgagagtgagtggggccgggacccgcagagccgagccga +cccttctctcccgggctgcggcagggcagggcggggagctccgcgcacca +acagagccggttctcagggcgctttgctccttgttttttccccggttctg +ttttctccccttctccggaaggcttgtcaaggggtaggagaaagagacgc +aaacacaaaagtggaaaacag +>hg38_knownGene_ENST00000374736.8_1 range=chr9:104903614-104903771 5'pad=0 3'pad=0 strand=- repeatMasking=none +ttaatgaccagccacggcgtccctgctgtgagctctggccgctgccttcc +agggctcccgagccacacgctgggggtgctggctgagggaacATGGCTTG +TTGGCCTCAGCTGAGGTTGCTGCTGTGGAAGAACCTCACTTTCAGAAGAA +GACAAACA +>hg38_knownGene_ENST00000374736.8_2 range=chr9:104889102-104889195 5'pad=0 3'pad=0 strand=- repeatMasking=none +TGTCAGCTGCTGCTGGAAGTGGCCTGGCCTCTATTTATCTTCCTGATCCT +GATCTCTGTTCGGCTGAGCTACCCACCCTATGAACAACATGAAT +>hg38_knownGene_ENST00000374736.8_3 range=chr9:104884427-104884568 5'pad=0 3'pad=0 strand=- repeatMasking=none +GCCATTTTCCAAATAAAGCCATGCCCTCTGCAGGAACACTTCCTTGGGTT +CAGGGGATTATCTGTAATGCCAACAACCCCTGTTTCCGTTACCCGACTCC +TGGGGAGGCTCCCGGAGTTGTTGGAAACTTTAACAAATCCAT +>hg38_knownGene_ENST00000374736.8_4 range=chr9:104883039-104883157 5'pad=0 3'pad=0 strand=- repeatMasking=none +TGTGGCTCGCCTGTTCTCAGATGCTCGGAGGCTTCTTTTATACAGCCAGA +AAGACACCAGCATGAAGGACATGCGCAAAGTTCTGAGAACATTACAGCAG +ATCAAGAAATCCAGCTCAA +>hg38_knownGene_ENST00000374736.8_5 range=chr9:104861679-104861800 5'pad=0 3'pad=0 strand=- repeatMasking=none +ACTTGAAGCTTCAAGATTTCCTGGTGGACAATGAAACCTTCTCTGGGTTC +CTGTATCACAACCTCTCTCTCCCAAAGTCTACTGTGGACAAGATGCTGAG +GGCTGATGTCATTCTCCACAAG +>hg38_knownGene_ENST00000374736.8_6 range=chr9:104858522-104858698 5'pad=0 3'pad=0 strand=- repeatMasking=none +GTATTTTTGCAAGGCTACCAGTTACATTTGACAAGTCTGTGCAATGGATC +AAAATCAGAAGAGATGATTCAACTTGGTGACCAAGAAGTTTCTGAGCTTT +GTGGCCTACCAAGGGAGAAACTGGCTGCAGCAGAGCGAGTACTTCGTTCC +AACATGGACATCCTGAAGCCAATCCTG +>hg38_knownGene_ENST00000374736.8_7 range=chr9:104845477-104845569 5'pad=0 3'pad=0 strand=- repeatMasking=none +AGAACACTAAACTCTACATCTCCCTTCCCGAGCAAGGAGCTGGCTGAAGC +CACAAAAACATTGCTGCATAGTCTTGGGACTCTGGCCCAGGAG +>hg38_knownGene_ENST00000374736.8_8 range=chr9:104840279-104840519 5'pad=0 3'pad=0 strand=- repeatMasking=none +CTGTTCAGCATGAGAAGCTGGAGTGACATGCGACAGGAGGTGATGTTTCT +GACCAATGTGAACAGCTCCAGCTCCTCCACCCAAATCTACCAGGCTGTGT +CTCGTATTGTCTGCGGGCATCCCGAGGGAGGGGGGCTGAAGATCAAGTCT +CTCAACTGGTATGAGGACAACAACTACAAAGCCCTCTTTGGAGGCAATGG +CACTGAGGAAGATGCTGAAACCTTCTATGACAACTCTACAA +>hg38_knownGene_ENST00000374736.8_9 range=chr9:104837428-104837567 5'pad=0 3'pad=0 strand=- repeatMasking=none +CTCCTTACTGCAATGATTTGATGAAGAATTTGGAGTCTAGTCCTCTTTCC +CGCATTATCTGGAAAGCTCTGAAGCCGCTGCTCGTTGGGAAGATCCTGTA +TACACCTGACACTCCAGCCACAAGGCAGGTCATGGCTGAG +>hg38_knownGene_ENST00000374736.8_10 range=chr9:104836980-104837096 5'pad=0 3'pad=0 strand=- repeatMasking=none +GTGAACAAGACCTTCCAGGAACTGGCTGTGTTCCATGATCTGGAAGGCAT +GTGGGAGGAACTCAGCCCCAAGATCTGGACCTTCATGGAGAACAGCCAAG +AAATGGACCTTGTCCGG +>hg38_knownGene_ENST00000374736.8_11 range=chr9:104832574-104832771 5'pad=0 3'pad=0 strand=- repeatMasking=none +ATGCTGTTGGACAGCAGGGACAATGACCACTTTTGGGAACAGCAGTTGGA +TGGCTTAGATTGGACAGCCCAAGACATCGTGGCGTTTTTGGCCAAGCACC +CAGAGGATGTCCAGTCCAGTAATGGTTCTGTGTACACCTGGAGAGAAGCT +TTCAACGAGACTAACCAGGCAATCCGGACCATATCTCGCTTCATGGAG +>hg38_knownGene_ENST00000374736.8_12 range=chr9:104831622-104831827 5'pad=0 3'pad=0 strand=- repeatMasking=none +TGTGTCAACCTGAACAAGCTAGAACCCATAGCAACAGAAGTCTGGCTCAT +CAACAAGTCCATGGAGCTGCTGGATGAGAGGAAGTTCTGGGCTGGTATTG +TGTTCACTGGAATTACTCCAGGCAGCATTGAGCTGCCCCATCATGTCAAG +TACAAGATCCGAATGGACATTGACAATGTGGAGAGGACAAATAAAATCAA +GGATGG +>hg38_knownGene_ENST00000374736.8_13 range=chr9:104830925-104831101 5'pad=0 3'pad=0 strand=- repeatMasking=none +GTACTGGGACCCTGGTCCTCGAGCTGACCCCTTTGAGGACATGCGGTACG +TCTGGGGGGGCTTCGCCTACTTGCAGGATGTGGTGGAGCAGGCAATCATC +AGGGTGCTGACGGGCACCGAGAAGAAAACTGGTGTCTATATGCAACAGAT +GCCCTATCCCTGTTACGTTGATGACAT +>hg38_knownGene_ENST00000374736.8_14 range=chr9:104828916-104829138 5'pad=0 3'pad=0 strand=- repeatMasking=none +CTTTCTGCGGGTGATGAGCCGGTCAATGCCCCTCTTCATGACGCTGGCCT +GGATTTACTCAGTGGCTGTGATCATCAAGGGCATCGTGTATGAGAAGGAG +GCACGGCTGAAAGAGACCATGCGGATCATGGGCCTGGACAACAGCATCCT +CTGGTTTAGCTGGTTCATTAGTAGCCTCATTCCTCTTCTTGTGAGCGCTG +GCCTGCTAGTGGTCATCCTGAAG +>hg38_knownGene_ENST00000374736.8_15 range=chr9:104826948-104827169 5'pad=0 3'pad=0 strand=- repeatMasking=none +TTAGGAAACCTGCTGCCCTACAGTGATCCCAGCGTGGTGTTTGTCTTCCT +GTCCGTGTTTGCTGTGGTGACAATCCTGCAGTGCTTCCTGATTAGCACAC +TCTTCTCCAGAGCCAACCTGGCAGCAGCCTGTGGGGGCATCATCTACTTC +ACGCTGTACCTGCCCTACGTCCTGTGTGTGGCATGGCAGGACTACGTGGG +CTTCACACTCAAGATCTTCGCT +>hg38_knownGene_ENST00000374736.8_16 range=chr9:104825683-104825887 5'pad=0 3'pad=0 strand=- repeatMasking=none +AGCCTGCTGTCTCCTGTGGCTTTTGGGTTTGGCTGTGAGTACTTTGCCCT +TTTTGAGGAGCAGGGCATTGGAGTGCAGTGGGACAACCTGTTTGAGAGTC +CTGTGGAGGAAGATGGCTTCAATCTCACCACTTCGGTCTCCATGATGCTG +TTTGACACCTTCCTCTATGGGGTGATGACCTGGTACATTGAGGCTGTCTT +TCCAG +>hg38_knownGene_ENST00000374736.8_17 range=chr9:104824465-104824578 5'pad=0 3'pad=0 strand=- repeatMasking=none +GCCAGTACGGAATTCCCAGGCCCTGGTATTTTCCTTGCACCAAGTCCTAC +TGGTTTGGCGAGGAAAGTGATGAGAAGAGCCACCCTGGTTCCAACCAGAA +GAGAATATCAGAAA +>hg38_knownGene_ENST00000374736.8_18 range=chr9:104822496-104822667 5'pad=0 3'pad=0 strand=- repeatMasking=none +TCTGCATGGAGGAGGAACCCACCCACTTGAAGCTGGGCGTGTCCATTCAG +AACCTGGTAAAAGTCTACCGAGATGGGATGAAGGTGGCTGTCGATGGCCT +GGCACTGAATTTTTATGAGGGCCAGATCACCTCCTTCCTGGGCCACAATG +GAGCGGGGAAGACGACCACCAT +>hg38_knownGene_ENST00000374736.8_19 range=chr9:104821375-104821506 5'pad=0 3'pad=0 strand=- repeatMasking=none +GTCAATCCTGACCGGGTTGTTCCCCCCGACCTCGGGCACCGCCTACATCC +TGGGAAAAGACATTCGCTCTGAGATGAGCACCATCCGGCAGAACCTGGGG +GTCTGTCCCCAGCATAACGTGCTGTTTGACAT +>hg38_knownGene_ENST00000374736.8_20 range=chr9:104819927-104820069 5'pad=0 3'pad=0 strand=- repeatMasking=none +GCTGACTGTCGAAGAACACATCTGGTTCTATGCCCGCTTGAAAGGGCTCT +CTGAGAAGCACGTGAAGGCGGAGATGGAGCAGATGGCCCTGGATGTTGGT +TTGCCATCAAGCAAGCTGAAAAGCAAAACAAGCCAGCTGTCAG +>hg38_knownGene_ENST00000374736.8_21 range=chr9:104819586-104819723 5'pad=0 3'pad=0 strand=- repeatMasking=none +GTGGAATGCAGAGAAAGCTATCTGTGGCCTTGGCCTTTGTCGGGGGATCT +AAGGTTGTCATTCTGGATGAACCCACAGCTGGTGTGGACCCTTACTCCCG +CAGGGGAATATGGGAGCTGCTGCTGAAATACCGACAAG +>hg38_knownGene_ENST00000374736.8_22 range=chr9:104818663-104818883 5'pad=0 3'pad=0 strand=- repeatMasking=none +GCCGCACCATTATTCTCTCTACACACCACATGGATGAAGCGGACGTCCTG +GGGGACAGGATTGCCATCATCTCCCATGGGAAGCTGTGCTGTGTGGGCTC +CTCCCTGTTTCTGAAGAACCAGCTGGGAACAGGCTACTACCTGACCTTGG +TCAAGAAAGATGTGGAATCCTCCCTCAGTTCCTGCAGAAACAGTAGTAGC +ACTGTGTCATACCTGAAAAAG +>hg38_knownGene_ENST00000374736.8_23 range=chr9:104817332-104817404 5'pad=0 3'pad=0 strand=- repeatMasking=none +GAGGACAGTGTTTCTCAGAGCAGTTCTGATGCTGGCCTGGGCAGCGACCA +TGAGAGTGACACGCTGACCATCG +>hg38_knownGene_ENST00000374736.8_24 range=chr9:104816143-104816345 5'pad=0 3'pad=0 strand=- repeatMasking=none +ATGTCTCTGCTATCTCCAACCTCATCAGGAAGCATGTGTCTGAAGCCCGG +CTGGTGGAAGACATAGGGCATGAGCTGACCTATGTGCTGCCATATGAAGC +TGCTAAGGAGGGAGCCTTTGTGGAACTCTTTCATGAGATTGATGACCGGC +TCTCAGACCTGGGCATTTCTAGTTATGGCATCTCAGAGACGACCCTGGAA +GAA +>hg38_knownGene_ENST00000374736.8_25 range=chr9:104814427-104814475 5'pad=0 3'pad=0 strand=- repeatMasking=none +ATATTCCTCAAGGTGGCCGAAGAGAGTGGGGTGGATGCTGAGACCTCAG +>hg38_knownGene_ENST00000374736.8_26 range=chr9:104814118-104814231 5'pad=0 3'pad=0 strand=- repeatMasking=none +ATGGTACCTTGCCAGCAAGACGAAACAGGCGGGCCTTCGGGGACAAGCAG +AGCTGTCTTCGCCCGTTCACTGAAGATGATGCTGCTGATCCAAATGATTC +TGACATAGACCCAG +>hg38_knownGene_ENST00000374736.8_27 range=chr9:104812574-104812722 5'pad=0 3'pad=0 strand=- repeatMasking=none +AATCCAGAGAGACAGACTTGCTCAGTGGGATGGATGGCAAAGGGTCCTAC +CAGGTGAAAGGCTGGAAACTTACACAGCAACAGTTTGTGGCCCTTTTGTG +GAAGAGACTGCTAATTGCCAGACGGAGTCGGAAAGGATTTTTTGCTCAG +>hg38_knownGene_ENST00000374736.8_28 range=chr9:104810800-104810924 5'pad=0 3'pad=0 strand=- repeatMasking=none +ATTGTCTTGCCAGCTGTGTTTGTCTGCATTGCCCTTGTGTTCAGCCTGAT +CGTGCCACCCTTTGGCAAGTACCCCAGCCTGGAACTTCAGCCCTGGATGT +ACAACGAACAGTACACATTTGTCAG +>hg38_knownGene_ENST00000374736.8_29 range=chr9:104809466-104809564 5'pad=0 3'pad=0 strand=- repeatMasking=none +CAATGATGCTCCTGAGGACACGGGAACCCTGGAACTCTTAAACGCCCTCA +CCAAAGACCCTGGCTTCGGGACCCGCTGTATGGAAGGAAACCCAATCCC +>hg38_knownGene_ENST00000374736.8_30 range=chr9:104806241-104806430 5'pad=0 3'pad=0 strand=- repeatMasking=none +AGACACGCCCTGCCAGGCAGGGGAGGAAGAGTGGACCACTGCCCCAGTTC +CCCAGACCATCATGGACCTCTTCCAGAATGGGAACTGGACAATGCAGAAC +CCTTCACCTGCATGCCAGTGTAGCAGCGACAAAATCAAGAAGATGCTGCC +TGTGTGTCCCCCAGGGGCAGGGGGGCTGCCTCCTCCACAA +>hg38_knownGene_ENST00000374736.8_31 range=chr9:104804626-104804720 5'pad=0 3'pad=0 strand=- repeatMasking=none +AGAAAACAAAACACTGCAGATATCCTTCAGGACCTGACAGGAAGAAACAT +TTCGGATTATCTGGTGAAGACGTATGTGCAGATCATAGCCAAAAG +>hg38_knownGene_ENST00000374736.8_32 range=chr9:104803284-104803316 5'pad=0 3'pad=0 strand=- repeatMasking=none +CTTAAAGAACAAGATCTGGGTGAATGAGTTTAG +>hg38_knownGene_ENST00000374736.8_33 range=chr9:104802054-104802159 5'pad=0 3'pad=0 strand=- repeatMasking=none +GTATGGCGGCTTTTCCCTGGGTGTCAGTAATACTCAAGCACTTCCTCCGA +GTCAAGAAGTTAATGATGCCATCAAACAAATGAAGAAACACCTAAAGCTG +GCCAAG +>hg38_knownGene_ENST00000374736.8_34 range=chr9:104800510-104800584 5'pad=0 3'pad=0 strand=- repeatMasking=none +GACAGTTCTGCAGATCGATTTCTCAACAGCTTGGGAAGATTTATGACAGG +ACTGGACACCAAAAATAATGTCAAG +>hg38_knownGene_ENST00000374736.8_35 range=chr9:104799819-104799988 5'pad=0 3'pad=0 strand=- repeatMasking=none +GTGTGGTTCAATAACAAGGGCTGGCATGCAATCAGCTCTTTCCTGAATGT +CATCAACAATGCCATTCTCCGGGCCAACCTGCAAAAGGGAGAGAACCCTA +GCCATTATGGAATTACTGCTTTCAATCATCCCCTGAATCTCACCAAGCAG +CAGCTCTCAGAGGTGGCTCT +>hg38_knownGene_ENST00000374736.8_36 range=chr9:104798421-104798598 5'pad=0 3'pad=0 strand=- repeatMasking=none +GATGACCACATCAGTGGATGTCCTTGTGTCCATCTGTGTCATCTTTGCAA +TGTCCTTCGTCCCAGCCAGCTTTGTCGTATTCCTGATCCAGGAGCGGGTC +AGCAAAGCAAAACACCTGCAGTTCATCAGTGGAGTGAAGCCTGTCATCTA +CTGGCTCTCTAATTTTGTCTGGGATATG +>hg38_knownGene_ENST00000374736.8_37 range=chr9:104796309-104796424 5'pad=0 3'pad=0 strand=- repeatMasking=none +TGCAATTACGTTGTCCCTGCCACACTGGTCATTATCATCTTCATCTGCTT +CCAGCAGAAGTCCTATGTGTCCTCCACCAATCTGCCTGTGCTAGCCCTTC +TACTTTTGCTGTATGG +>hg38_knownGene_ENST00000374736.8_38 range=chr9:104796053-104796197 5'pad=0 3'pad=0 strand=- repeatMasking=none +GTGGTCAATCACACCTCTCATGTACCCAGCCTCCTTTGTGTTCAAGATCC +CCAGCACAGCCTATGTGGTGCTCACCAGCGTGAACCTCTTCATTGGCATT +AATGGCAGCGTGGCCACCTTTGTGCTGGAGCTGTTCACCGACAAT +>hg38_knownGene_ENST00000374736.8_39 range=chr9:104794387-104794510 5'pad=0 3'pad=0 strand=- repeatMasking=none +AAGCTGAATAATATCAATGATATCCTGAAGTCCGTGTTCTTGATCTTCCC +ACATTTTTGCCTGGGACGAGGGCTCATCGACATGGTGAAAAACCAGGCAA +TGGCTGATGCCCTGGAAAGGTTTG +>hg38_knownGene_ENST00000374736.8_40 range=chr9:104793171-104793300 5'pad=0 3'pad=0 strand=- repeatMasking=none +GGGAGAATCGCTTTGTGTCACCATTATCTTGGGACTTGGTGGGACGAAAC +CTCTTCGCCATGGCCGTGGAAGGGGTGGTGTTCTTCCTCATTACTGTTCT +GATCCAGTACAGATTCTTCATCAGGCCCAG +>hg38_knownGene_ENST00000374736.8_41 range=chr9:104792786-104792906 5'pad=0 3'pad=0 strand=- repeatMasking=none +ACCTGTAAATGCAAAGCTATCTCCTCTGAATGATGAAGATGAAGATGTGA +GGCGGGAAAGACAGAGAATTCTTGATGGTGGAGGCCAGAATGACATCTTA +GAAATCAAGGAGTTGACGAAG +>hg38_knownGene_ENST00000374736.8_42 range=chr9:104791936-104791998 5'pad=0 3'pad=0 strand=- repeatMasking=none +ATATATAGAAGGAAGCGGAAGCCTGCTGTTGACAGGATTTGCGTGGGCAT +TCCTCCTGGTGAG +>hg38_knownGene_ENST00000374736.8_43 range=chr9:104790922-104791028 5'pad=0 3'pad=0 strand=- repeatMasking=none +TGCTTTGGGCTCCTGGGAGTTAATGGGGCTGGAAAATCATCAACTTTCAA +GATGTTAACAGGAGATACCACTGTTACCAGAGGAGATGCTTTCCTTAACA +AAAATAG +>hg38_knownGene_ENST00000374736.8_44 range=chr9:104788426-104788567 5'pad=0 3'pad=0 strand=- repeatMasking=none +TATCTTATCAAACATCCATGAAGTACATCAGAACATGGGCTACTGCCCTC +AGTTTGATGCCATCACAGAGCTGTTGACTGGGAGAGAACACGTGGAGTTC +TTTGCCCTTTTGAGAGGAGTCCCAGAGAAAGAAGTTGGCAAG +>hg38_knownGene_ENST00000374736.8_45 range=chr9:104787920-104788054 5'pad=0 3'pad=0 strand=- repeatMasking=none +GTTGGTGAGTGGGCGATTCGGAAACTGGGCCTCGTGAAGTATGGAGAAAA +ATATGCTGGTAACTATAGTGGAGGCAACAAACGCAAGCTCTCTACAGCCA +TGGCTTTGATCGGCGGGCCTCCTGTGGTGTTTCTG +>hg38_knownGene_ENST00000374736.8_46 range=chr9:104786873-104786976 5'pad=0 3'pad=0 strand=- repeatMasking=none +GATGAACCCACCACAGGCATGGATCCCAAAGCCCGGCGGTTCTTGTGGAA +TTGTGCCCTAAGTGTTGTCAAGGAGGGGAGATCAGTAGTGCTTACATCTC +ATAG +>hg38_knownGene_ENST00000374736.8_47 range=chr9:104786298-104786390 5'pad=0 3'pad=0 strand=- repeatMasking=none +TATGGAAGAATGTGAAGCTCTTTGCACTAGGATGGCAATCATGGTCAATG +GAAGGTTCAGGTGCCTTGGCAGTGTCCAGCATCTAAAAAATAG +>hg38_knownGene_ENST00000374736.8_48 range=chr9:104785396-104785639 5'pad=0 3'pad=0 strand=- repeatMasking=none +GTTTGGAGATGGTTATACAATAGTTGTACGAATAGCAGGGTCCAACCCGG +ACCTGAAGCCTGTCCAGGATTTCTTTGGACTTGCATTTCCTGGAAGTGTT +CTAAAAGAGAAACACCGGAACATGCTACAATACCAGCTTCCATCTTCATT +ATCTTCTCTGGCCAGGATATTCAGCATCCTCTCCCAGAGCAAAAAGCGAC +TCCACATAGAAGACTACTCTGTTTCTCAGACAACACTTGACCAA +>hg38_knownGene_ENST00000374736.8_49 range=chr9:104781006-104784455 5'pad=0 3'pad=0 strand=- repeatMasking=none +GTATTTGTGAACTTTGCCAAGGACCAAAGTGATGATGACCACTTAAAAGA +CCTCTCATTACACAAAAACCAGACAGTAGTGGACGTTGCAGTTCTCACAT +CTTTTCTACAGGATGAGAAAGTGAAAGAAAGCTATGTATGAagaatcctg +ttcatacggggtggctgaaagtaaagaggaactagactttcctttgcacc +atgtgaagtgttgtggagaaaagagccagaagttgatgtgggaagaagta +aactggatactgtactgatactattcaatgcaatgcaattcaatgcaatg +aaaacaaaattccattacaggggcagtgcctttgtagcctatgtcttgta +tggctctcaagtgaaagacttgaatttagttttttacctatacctatgtg +aaactctattatggaacccaatggacatatgggtttgaactcacactttt +ttttttttttttgttcctgtgtattctcattggggttgcaacaataattc +atcaagtaatcatggccagcgattattgatcaaaatcaaaaggtaatgca +catcctcattcactaagccatgccatgcccaggagactggtttcccggtg +acacatccattgctggcaatgagtgtgccagagttattagtgccaagttt +ttcagaaagtttgaagcaccatggtgtgtcatgctcacttttgtgaaagc +tgctctgctcagagtctatcaacattgaatatcagttgacagaatggtgc +catgcgtggctaacatcctgctttgattccctctgataagctgttctggt +ggcagtaacatgcaacaaaaatgtgggtgtctccaggcacgggaaacttg +gttccattgttatattgtcctatgcttcgagccatgggtctacagggtca +tccttatgagactcttaaatatacttagatcctggtaagaggcaaagaat +caacagccaaactgctggggctgcaagctgctgaagccagggcatgggat +taaagagattgtgcgttcaaacctagggaagcctgtgcccatttgtcctg +actgtctgctaacatggtacactgcatctcaagatgtttatctgacacaa +gtgtattatttctggctttttgaattaatctagaaaatgaaaagatggag +ttgtattttgacaaaaatgtttgtactttttaatgttatttggaatttta +agttctatcagtgacttctgaatccttagaatggcctctttgtagaaccc +tgtggtatagaggagtatggccactgccccactatttttattttcttatg +taagtttgcatatcagtcatgactagtgcctagaaagcaatgtgatggtc +aggatctcatgacattatatttgagtttctttcagatcatttaggatact +cttaatctcacttcatcaatcaaatattttttgagtgtatgctgtagctg +aaagagtatgtacgtacgtataagactagagagatattaagtctcagtac +acttcctgtgccatgttattcagctcactggtttacaaatataggttgtc +ttgtggttgtaggagcccactgtaacaatactgggcagcctttttttttt +tttttttaattgcaacaatgcaaaagccaagaaagtataagggtcacaag +tctaaacaatgaattcttcaacagggaaaacagctagcttgaaaacttgc +tgaaaaacacaacttgtgtttatggcatttagtaccttcaaataattggc +tttgcagatattggataccccattaaatctgacagtctcaaatttttcat +ctcttcaatcactagtcaagaaaaatataaaaacaacaaatacttccata +tggagcatttttcagagttttctaacccagtcttatttttctagtcagta +aacatttgtaaaaatactgtttcactaatacttactgttaactgtcttga +gagaaaagaaaaatatgagagaactattgtttggggaagttcaagtgatc +tttcaatatcattactaacttcttccactttttccagaatttgaatatta +acgctaaaggtgtaagacttcagatttcaaattaatctttctatattttt +taaatttacagaatattatataacccactgctgaaaaagaaaaaaatgat +tgttttagaagttaaagtcaatattgattttaaatataagtaatgaaggc +atatttccaataactagtgatatggcatcgttgcattttacagtatcttc +aaaaatacagaatttatagaataatttctcctcatttaatatttttcaaa +atcaaagttatggtttcctcattttactaaaatcgtattctaattcttca +ttatagtaaatctatgagcaactccttacttcggttcctctgatttcaag +gccatattttaaaaaatcaaaaggcactgtgaactattttgaagaaaaca +caacattttaatacagattgaaaggacctcttctgaagctagaaacaatc +tatagttatacatcttcattaatactgtgttaccttttaaaatagtaatt +ttttacattttcctgtgtaaacctaattgtggtagaaatttttaccaact +ctatactcaatcaagcaaaatttctgtatattccctgtggaatgtaccta +tgtgagtttcagaaattctcaaaatacgtgttcaaaaatttctgcttttg +catctttgggacacctcagaaaacttattaacaactgtgaatatgagaaa +tacagaagaaaataataagccctctatacataaatgcccagcacaattca +ttgttaaaaaacaaccaaacctcacactactgtatttcattatctgtact +gaaagcaaatgctttgtgactattaaatgttgcacatcattcattcactg +tatagtaatcattgactaaagccatttgtctgtgttttcttcttgtggtt +gtatatatcaggtaaaatattttccaaagagccatgtgtcatgtaatact +gaaccactttgatattgagacattaatttgtacccttgttattatctact +agtaataatgtaatactgtagaaatattgctctaattcttttcaaaattg +ttgcatcccccttagaatgtttctatttccataaggatttaggtatgcta +ttatcccttcttataccctaagatgaagctgtttttgtgctctttgttca +tcattggccctcattccaagcactttacgctgtctgtaatgggatctatt +tttgcactggaatatctgagaattgcaaaactagacaaaagtttcacaac +agatttctaagttaaatcattttcattaaaaggaaaaaagaaaaaaaatt +ttgtatgtcaataactttatatgaagtattaaaatgcatatttctatgtt +gtaatataatgagtcacaaaataaagctgtgacagttctgttggtctaca \ No newline at end of file diff --git a/tests/test_filter.py b/tests/test_filter.py index a57ce81..6d3d6d2 100755 --- a/tests/test_filter.py +++ b/tests/test_filter.py @@ -1,6 +1,10 @@ import pytest import subprocess from bean.annotate.translate_allele import CDS +from bean.annotate.translate_allele import ( + get_cds_seq_pos_from_gene_name, + get_cds_seq_pos_from_fasta, +) @pytest.mark.order(311) @@ -76,3 +80,26 @@ def test_translate_aa(): abca1 = CDS.from_gene_name("ABCA1") allele_str = "chr9:104903664:0:-:C>T" assert str(abca1.get_aa_change(allele_str)) == "ABCA1:6:Q>*|", allele_str + + +def test_pos_fasta(): + chrom, translated_seq, genomic_pos, strand = get_cds_seq_pos_from_fasta( + "tests/data/abca1.fa" + ) + chrom2, translated_seq2, genomic_pos2, strand2 = get_cds_seq_pos_from_gene_name( + "ABCA1" + ) + assert chrom == chrom2 + assert translated_seq == translated_seq2 + assert genomic_pos == genomic_pos2, genomic_pos[:10] + assert strand == strand2 + + +def test_translate_fasta(): + abca1 = CDS.from_fasta("tests/data/abca1.fa", "ABCA1") + allele_str = "chr9:104903664:0:+:G>A" + assert str(abca1.get_aa_change(allele_str)) == "ABCA1:6:Q>*|", allele_str + + abca1 = CDS.from_fasta("tests/data/abca1.fa", "ABCA1") + allele_str = "chr9:104903664:0:-:C>T" + assert str(abca1.get_aa_change(allele_str)) == "ABCA1:6:Q>*|", allele_str