diff --git a/docs/ingest/gene-ensembl-release-110.ipynb b/docs/ingest/gene-ensembl-release-110.ipynb
new file mode 100644
index 0000000..ce16857
--- /dev/null
+++ b/docs/ingest/gene-ensembl-release-110.ipynb
@@ -0,0 +1,1648 @@
+{
+ "cells": [
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# `Gene`: ensembl, release-110"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "- https://www.ensembl.org/info/data/mysql.html\n",
+ "- https://www.ensembl.org/info/docs/api/core/core_schema.html"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Install mysqlclient: https://pypi.org/project/mysqlclient/"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "✅ New records found in the public sources.yaml, updated /Users/sunnysun/.lamin/bionty/versions/sources_local.yaml!\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "from sqlalchemy import create_engine\n",
+ "import mysql.connector as sql # needed\n",
+ "import bionty as bt\n",
+ "\n",
+ "version = \"release-110\"\n",
+ "species = bt.Species(version=version).lookup()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_url(species: bt.Species):\n",
+ "    \"\"\"Return the SQLAlchemy URL of the species' core database on ensembldb.ensembl.org.\n",
+ "\n",
+ "    The URL selects the ``mysqldb`` driver and logs in as user ``anonymous``\n",
+ "    with an empty password; the database name is read from ``species.core_db``.\n",
+ "    \"\"\"\n",
+ "    connection_url = (\n",
+ "        f\"mysql+mysqldb://anonymous:@ensembldb.ensembl.org/{species.core_db}\"\n",
+ "    )\n",
+ "    return connection_url"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "query_core = \"\"\"\n",
+ "SELECT gene.stable_id, xref.display_label, gene.biotype, gene.description, external_synonym.synonym\n",
+ "FROM gene\n",
+ "LEFT JOIN xref ON gene.display_xref_id = xref.xref_id\n",
+ "LEFT JOIN external_synonym ON gene.display_xref_id = external_synonym.xref_id\n",
+ "\"\"\"\n",
+ "\n",
+ "query_external = \"\"\"\n",
+ "SELECT gene.stable_id, object_xref.xref_id, xref.dbprimary_acc, external_db.db_name\n",
+ "FROM gene\n",
+ "LEFT JOIN object_xref ON gene.gene_id = object_xref.ensembl_id\n",
+ "LEFT JOIN xref ON object_xref.xref_id = xref.xref_id\n",
+ "LEFT JOIN external_db ON xref.external_db_id = external_db.external_db_id\n",
+ "WHERE object_xref.ensembl_object_type = 'Gene' AND external_db.db_name IN ('EntrezGene')\n",
+ "\"\"\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def generate_genes_df(species: bt.Species, version=\"release-110\"):\n",
+ "    \"\"\"Build the Ensembl gene table for one species and save it as parquet.\n",
+ "\n",
+ "    Reads gene annotations (`query_core`) and EntrezGene cross-references\n",
+ "    (`query_external`) from the species' core database, joins the synonyms of\n",
+ "    each gene into one \"|\"-separated string, merges in `ncbi_gene_id` and\n",
+ "    writes `df_{species.name}__ensembl__{version}__Gene.parquet` to the\n",
+ "    working directory. Intermediate frames are printed and displayed.\n",
+ "\n",
+ "    Args:\n",
+ "        species: Record providing `core_db` (read by `get_url`) and `name`.\n",
+ "        version: Release label; only used to build the output filename.\n",
+ "\n",
+ "    Returns:\n",
+ "        None. A gene mapped to several NCBI ids occupies several rows of the\n",
+ "        saved table.\n",
+ "    \"\"\"\n",
+ "    engine = create_engine(url=get_url(species=species))\n",
+ "    try:\n",
+ "        results_core = pd.read_sql(query_core, con=engine)\n",
+ "        results_external = pd.read_sql(query_external, con=engine)\n",
+ "    finally:\n",
+ "        # Release the pooled connections even when a query fails.\n",
+ "        engine.dispose()\n",
+ "\n",
+ "    # Basic gene annotations: one row per (gene, synonym) pair.\n",
+ "    print(f\"result_core.shape: {results_core.shape}\")\n",
+ "    print(\"result_core.head():\\n\")\n",
+ "    display(results_core.head())\n",
+ "\n",
+ "    def join_synonyms(values):\n",
+ "        # Sorted so the joined string is identical on every run; iterating a\n",
+ "        # set of strings has no stable order between interpreter sessions.\n",
+ "        return \"|\".join(sorted({v for v in values if pd.notna(v)}))\n",
+ "\n",
+ "    # Aggregate metadata per stable_id.\n",
+ "    results_core_group = results_core.groupby(\"stable_id\").agg(\n",
+ "        {\n",
+ "            \"display_label\": \"first\",\n",
+ "            \"biotype\": \"first\",\n",
+ "            \"description\": \"first\",\n",
+ "            \"synonym\": join_synonyms,\n",
+ "        }\n",
+ "    )\n",
+ "    print(\"results_core_group.head():\\n\")\n",
+ "    display(results_core_group.head())\n",
+ "\n",
+ "    # External ids: keep only \"ENS\"-prefixed stable ids, but only for species\n",
+ "    # that use that prefix. Applied unconditionally, the filter removes every\n",
+ "    # row for species whose stable ids look different (e.g. yeast \"YBR024W\").\n",
+ "    # NOTE(review): the prefix filter presumably drops non-gene records for\n",
+ "    # human/mouse - confirm the intent.\n",
+ "    if results_core[\"stable_id\"].str.startswith(\"ENS\", na=False).any():\n",
+ "        results_external = results_external[\n",
+ "            results_external[\"stable_id\"].str.startswith(\"ENS\", na=False)\n",
+ "        ]\n",
+ "    print(f\"results_external.shape: {results_external.shape}\")\n",
+ "    print(\"results_external.head():\\n\")\n",
+ "    display(results_external.head())\n",
+ "\n",
+ "    # ncbi_gene_id: one row per distinct (stable_id, EntrezGene accession).\n",
+ "    entrez = (\n",
+ "        results_external[results_external[\"db_name\"] == \"EntrezGene\"]\n",
+ "        .drop_duplicates([\"stable_id\", \"dbprimary_acc\"])\n",
+ "        .drop(columns=[\"xref_id\", \"db_name\"])\n",
+ "        .rename(columns={\"dbprimary_acc\": \"ncbi_gene_id\"})\n",
+ "        .set_index(\"stable_id\")\n",
+ "    )\n",
+ "    # Genes that map to more than one NCBI id (reported, not removed).\n",
+ "    dup = entrez[entrez.index.duplicated(keep=False)]\n",
+ "    print(f\"duplicated ensembl_gene_ids with ncbi_gene_ids: {dup.shape[0]}\\n\")\n",
+ "    display(dup.head())\n",
+ "\n",
+ "    # Merge with ncbi_gene_id; the outer join keeps genes without an NCBI id.\n",
+ "    df = (\n",
+ "        results_core_group.merge(\n",
+ "            entrez, left_index=True, right_index=True, how=\"outer\"\n",
+ "        )\n",
+ "        .reset_index()\n",
+ "        .rename(\n",
+ "            columns={\n",
+ "                \"stable_id\": \"ensembl_gene_id\",\n",
+ "                \"display_label\": \"symbol\",\n",
+ "                \"synonym\": \"synonyms\",\n",
+ "            }\n",
+ "        )\n",
+ "    )\n",
+ "    df = df[\n",
+ "        [\n",
+ "            \"ensembl_gene_id\",\n",
+ "            \"symbol\",\n",
+ "            \"ncbi_gene_id\",\n",
+ "            \"biotype\",\n",
+ "            \"description\",\n",
+ "            \"synonyms\",\n",
+ "        ]\n",
+ "    ]\n",
+ "    df = df[~df[\"ensembl_gene_id\"].isnull()]\n",
+ "    df = df.sort_values(\"ensembl_gene_id\").reset_index(drop=True)\n",
+ "    print(f\"Final df.shape: {df.shape}\")\n",
+ "    print(\"df.head():\\n\")\n",
+ "    display(df.head())\n",
+ "\n",
+ "    # save to parquet\n",
+ "    filename = f\"df_{species.name}__ensembl__{version}__Gene.parquet\"\n",
+ "    df.to_parquet(filename)\n",
+ "    print(f\"Saved as {filename}\")"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Human"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "result_core.shape: (113336, 5)\n",
+ "result_core.head():\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " stable_id | \n",
+ " display_label | \n",
+ " biotype | \n",
+ " description | \n",
+ " synonym | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " ENSG00000210049 | \n",
+ " MT-TF | \n",
+ " Mt_tRNA | \n",
+ " mitochondrially encoded tRNA-Phe (UUU/C) [Sour... | \n",
+ " MTTF | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " ENSG00000210049 | \n",
+ " MT-TF | \n",
+ " Mt_tRNA | \n",
+ " mitochondrially encoded tRNA-Phe (UUU/C) [Sour... | \n",
+ " TRNF | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " ENSG00000211459 | \n",
+ " MT-RNR1 | \n",
+ " Mt_rRNA | \n",
+ " mitochondrially encoded 12S rRNA [Source:HGNC ... | \n",
+ " 12S | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " ENSG00000211459 | \n",
+ " MT-RNR1 | \n",
+ " Mt_rRNA | \n",
+ " mitochondrially encoded 12S rRNA [Source:HGNC ... | \n",
+ " MOTS-C | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " ENSG00000211459 | \n",
+ " MT-RNR1 | \n",
+ " Mt_rRNA | \n",
+ " mitochondrially encoded 12S rRNA [Source:HGNC ... | \n",
+ " MTRNR1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " stable_id display_label biotype \\\n",
+ "0 ENSG00000210049 MT-TF Mt_tRNA \n",
+ "1 ENSG00000210049 MT-TF Mt_tRNA \n",
+ "2 ENSG00000211459 MT-RNR1 Mt_rRNA \n",
+ "3 ENSG00000211459 MT-RNR1 Mt_rRNA \n",
+ "4 ENSG00000211459 MT-RNR1 Mt_rRNA \n",
+ "\n",
+ " description synonym \n",
+ "0 mitochondrially encoded tRNA-Phe (UUU/C) [Sour... MTTF \n",
+ "1 mitochondrially encoded tRNA-Phe (UUU/C) [Sour... TRNF \n",
+ "2 mitochondrially encoded 12S rRNA [Source:HGNC ... 12S \n",
+ "3 mitochondrially encoded 12S rRNA [Source:HGNC ... MOTS-C \n",
+ "4 mitochondrially encoded 12S rRNA [Source:HGNC ... MTRNR1 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "results_core_group.head():\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " display_label | \n",
+ " biotype | \n",
+ " description | \n",
+ " synonym | \n",
+ "
\n",
+ " \n",
+ " stable_id | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ENSG00000000003 | \n",
+ " TSPAN6 | \n",
+ " protein_coding | \n",
+ " tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858] | \n",
+ " TM4SF6|T245|TSPAN-6 | \n",
+ "
\n",
+ " \n",
+ " ENSG00000000005 | \n",
+ " TNMD | \n",
+ " protein_coding | \n",
+ " tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757] | \n",
+ " TEM|MYODULIN|CHM1L|TENDIN|BRICD4 | \n",
+ "
\n",
+ " \n",
+ " ENSG00000000419 | \n",
+ " DPM1 | \n",
+ " protein_coding | \n",
+ " dolichyl-phosphate mannosyltransferase subunit... | \n",
+ " CDGIE|MPDS | \n",
+ "
\n",
+ " \n",
+ " ENSG00000000457 | \n",
+ " SCYL3 | \n",
+ " protein_coding | \n",
+ " SCY1 like pseudokinase 3 [Source:HGNC Symbol;A... | \n",
+ " PACE-1|PACE1 | \n",
+ "
\n",
+ " \n",
+ " ENSG00000000460 | \n",
+ " C1orf112 | \n",
+ " protein_coding | \n",
+ " chromosome 1 open reading frame 112 [Source:HG... | \n",
+ " FLJ10706|APOLO1|FLIP | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " display_label biotype \\\n",
+ "stable_id \n",
+ "ENSG00000000003 TSPAN6 protein_coding \n",
+ "ENSG00000000005 TNMD protein_coding \n",
+ "ENSG00000000419 DPM1 protein_coding \n",
+ "ENSG00000000457 SCYL3 protein_coding \n",
+ "ENSG00000000460 C1orf112 protein_coding \n",
+ "\n",
+ " description \\\n",
+ "stable_id \n",
+ "ENSG00000000003 tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858] \n",
+ "ENSG00000000005 tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757] \n",
+ "ENSG00000000419 dolichyl-phosphate mannosyltransferase subunit... \n",
+ "ENSG00000000457 SCY1 like pseudokinase 3 [Source:HGNC Symbol;A... \n",
+ "ENSG00000000460 chromosome 1 open reading frame 112 [Source:HG... \n",
+ "\n",
+ " synonym \n",
+ "stable_id \n",
+ "ENSG00000000003 TM4SF6|T245|TSPAN-6 \n",
+ "ENSG00000000005 TEM|MYODULIN|CHM1L|TENDIN|BRICD4 \n",
+ "ENSG00000000419 CDGIE|MPDS \n",
+ "ENSG00000000457 PACE-1|PACE1 \n",
+ "ENSG00000000460 FLJ10706|APOLO1|FLIP "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "results_external.shape: (36004, 4)\n",
+ "results_external.head():\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " stable_id | \n",
+ " xref_id | \n",
+ " dbprimary_acc | \n",
+ " db_name | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " ENSG00000198888 | \n",
+ " 554032 | \n",
+ " 4535 | \n",
+ " EntrezGene | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " ENSG00000198763 | \n",
+ " 554045 | \n",
+ " 4536 | \n",
+ " EntrezGene | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " ENSG00000198804 | \n",
+ " 553814 | \n",
+ " 4512 | \n",
+ " EntrezGene | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " ENSG00000210151 | \n",
+ " 1138145 | \n",
+ " 113219467 | \n",
+ " EntrezGene | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " ENSG00000198712 | \n",
+ " 553829 | \n",
+ " 4513 | \n",
+ " EntrezGene | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " stable_id xref_id dbprimary_acc db_name\n",
+ "0 ENSG00000198888 554032 4535 EntrezGene\n",
+ "1 ENSG00000198763 554045 4536 EntrezGene\n",
+ "2 ENSG00000198804 553814 4512 EntrezGene\n",
+ "3 ENSG00000210151 1138145 113219467 EntrezGene\n",
+ "4 ENSG00000198712 553829 4513 EntrezGene"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "duplicated ensembl_gene_ids with ncbi_gene_ids: 6158\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ncbi_gene_id | \n",
+ "
\n",
+ " \n",
+ " stable_id | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ENSG00000278294 | \n",
+ " 124907156 | \n",
+ "
\n",
+ " \n",
+ " ENSG00000278294 | \n",
+ " 124907485 | \n",
+ "
\n",
+ " \n",
+ " ENSG00000278294 | \n",
+ " 124908250 | \n",
+ "
\n",
+ " \n",
+ " ENSG00000274917 | \n",
+ " 100008587 | \n",
+ "
\n",
+ " \n",
+ " ENSG00000274917 | \n",
+ " 124907114 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ncbi_gene_id\n",
+ "stable_id \n",
+ "ENSG00000278294 124907156\n",
+ "ENSG00000278294 124907485\n",
+ "ENSG00000278294 124908250\n",
+ "ENSG00000274917 100008587\n",
+ "ENSG00000274917 124907114"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Final df.shape: (77043, 6)\n",
+ "df.head():\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ensembl_gene_id | \n",
+ " symbol | \n",
+ " ncbi_gene_id | \n",
+ " biotype | \n",
+ " description | \n",
+ " synonyms | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " ENSG00000000003 | \n",
+ " TSPAN6 | \n",
+ " 7105 | \n",
+ " protein_coding | \n",
+ " tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858] | \n",
+ " TM4SF6|T245|TSPAN-6 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " ENSG00000000005 | \n",
+ " TNMD | \n",
+ " 64102 | \n",
+ " protein_coding | \n",
+ " tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757] | \n",
+ " TEM|MYODULIN|CHM1L|TENDIN|BRICD4 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " ENSG00000000419 | \n",
+ " DPM1 | \n",
+ " 8813 | \n",
+ " protein_coding | \n",
+ " dolichyl-phosphate mannosyltransferase subunit... | \n",
+ " CDGIE|MPDS | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " ENSG00000000457 | \n",
+ " SCYL3 | \n",
+ " 57147 | \n",
+ " protein_coding | \n",
+ " SCY1 like pseudokinase 3 [Source:HGNC Symbol;A... | \n",
+ " PACE-1|PACE1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " ENSG00000000460 | \n",
+ " C1orf112 | \n",
+ " 55732 | \n",
+ " protein_coding | \n",
+ " chromosome 1 open reading frame 112 [Source:HG... | \n",
+ " FLJ10706|APOLO1|FLIP | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ensembl_gene_id symbol ncbi_gene_id biotype \\\n",
+ "0 ENSG00000000003 TSPAN6 7105 protein_coding \n",
+ "1 ENSG00000000005 TNMD 64102 protein_coding \n",
+ "2 ENSG00000000419 DPM1 8813 protein_coding \n",
+ "3 ENSG00000000457 SCYL3 57147 protein_coding \n",
+ "4 ENSG00000000460 C1orf112 55732 protein_coding \n",
+ "\n",
+ " description \\\n",
+ "0 tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858] \n",
+ "1 tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757] \n",
+ "2 dolichyl-phosphate mannosyltransferase subunit... \n",
+ "3 SCY1 like pseudokinase 3 [Source:HGNC Symbol;A... \n",
+ "4 chromosome 1 open reading frame 112 [Source:HG... \n",
+ "\n",
+ " synonyms \n",
+ "0 TM4SF6|T245|TSPAN-6 \n",
+ "1 TEM|MYODULIN|CHM1L|TENDIN|BRICD4 \n",
+ "2 CDGIE|MPDS \n",
+ "3 PACE-1|PACE1 \n",
+ "4 FLJ10706|APOLO1|FLIP "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Saved as df_human__ensembl__release-110__Gene.parquet\n"
+ ]
+ }
+ ],
+ "source": [
+ "generate_genes_df(species=species.human)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Mouse"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "result_core.shape: (84751, 5)\n",
+ "result_core.head():\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " stable_id | \n",
+ " display_label | \n",
+ " biotype | \n",
+ " description | \n",
+ " synonym | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " ENSMUSG00000064336 | \n",
+ " mt-Tf | \n",
+ " Mt_tRNA | \n",
+ " mitochondrially encoded tRNA phenylalanine [So... | \n",
+ " tRNA | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " ENSMUSG00000064336 | \n",
+ " mt-Tf | \n",
+ " Mt_tRNA | \n",
+ " mitochondrially encoded tRNA phenylalanine [So... | \n",
+ " tRNA-Phe | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " ENSMUSG00000064336 | \n",
+ " mt-Tf | \n",
+ " Mt_tRNA | \n",
+ " mitochondrially encoded tRNA phenylalanine [So... | \n",
+ " TrnF tRNA | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " ENSMUSG00000064337 | \n",
+ " mt-Rnr1 | \n",
+ " Mt_rRNA | \n",
+ " mitochondrially encoded 12S rRNA [Source:MGI S... | \n",
+ " 12S ribosomal RNA | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " ENSMUSG00000064337 | \n",
+ " mt-Rnr1 | \n",
+ " Mt_rRNA | \n",
+ " mitochondrially encoded 12S rRNA [Source:MGI S... | \n",
+ " 12S rRNA | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " stable_id display_label biotype \\\n",
+ "0 ENSMUSG00000064336 mt-Tf Mt_tRNA \n",
+ "1 ENSMUSG00000064336 mt-Tf Mt_tRNA \n",
+ "2 ENSMUSG00000064336 mt-Tf Mt_tRNA \n",
+ "3 ENSMUSG00000064337 mt-Rnr1 Mt_rRNA \n",
+ "4 ENSMUSG00000064337 mt-Rnr1 Mt_rRNA \n",
+ "\n",
+ " description synonym \n",
+ "0 mitochondrially encoded tRNA phenylalanine [So... tRNA \n",
+ "1 mitochondrially encoded tRNA phenylalanine [So... tRNA-Phe \n",
+ "2 mitochondrially encoded tRNA phenylalanine [So... TrnF tRNA \n",
+ "3 mitochondrially encoded 12S rRNA [Source:MGI S... 12S ribosomal RNA \n",
+ "4 mitochondrially encoded 12S rRNA [Source:MGI S... 12S rRNA "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "results_core_group.head():\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " display_label | \n",
+ " biotype | \n",
+ " description | \n",
+ " synonym | \n",
+ "
\n",
+ " \n",
+ " stable_id | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ENSMUSG00000000001 | \n",
+ " Gnai3 | \n",
+ " protein_coding | \n",
+ " guanine nucleotide binding protein (G protein)... | \n",
+ " Galphai3 | \n",
+ "
\n",
+ " \n",
+ " ENSMUSG00000000003 | \n",
+ " Pbsn | \n",
+ " protein_coding | \n",
+ " probasin [Source:MGI Symbol;Acc:MGI:1860484] | \n",
+ " PB | \n",
+ "
\n",
+ " \n",
+ " ENSMUSG00000000028 | \n",
+ " Cdc45 | \n",
+ " protein_coding | \n",
+ " cell division cycle 45 [Source:MGI Symbol;Acc:... | \n",
+ " Cdc45l | \n",
+ "
\n",
+ " \n",
+ " ENSMUSG00000000031 | \n",
+ " H19 | \n",
+ " lncRNA | \n",
+ " H19, imprinted maternally expressed transcript... | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " ENSMUSG00000000037 | \n",
+ " Scml2 | \n",
+ " protein_coding | \n",
+ " Scm polycomb group protein like 2 [Source:MGI ... | \n",
+ " 4932420G07Rik | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " display_label biotype \\\n",
+ "stable_id \n",
+ "ENSMUSG00000000001 Gnai3 protein_coding \n",
+ "ENSMUSG00000000003 Pbsn protein_coding \n",
+ "ENSMUSG00000000028 Cdc45 protein_coding \n",
+ "ENSMUSG00000000031 H19 lncRNA \n",
+ "ENSMUSG00000000037 Scml2 protein_coding \n",
+ "\n",
+ " description \\\n",
+ "stable_id \n",
+ "ENSMUSG00000000001 guanine nucleotide binding protein (G protein)... \n",
+ "ENSMUSG00000000003 probasin [Source:MGI Symbol;Acc:MGI:1860484] \n",
+ "ENSMUSG00000000028 cell division cycle 45 [Source:MGI Symbol;Acc:... \n",
+ "ENSMUSG00000000031 H19, imprinted maternally expressed transcript... \n",
+ "ENSMUSG00000000037 Scm polycomb group protein like 2 [Source:MGI ... \n",
+ "\n",
+ " synonym \n",
+ "stable_id \n",
+ "ENSMUSG00000000001 Galphai3 \n",
+ "ENSMUSG00000000003 PB \n",
+ "ENSMUSG00000000028 Cdc45l \n",
+ "ENSMUSG00000000031 \n",
+ "ENSMUSG00000000037 4932420G07Rik "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "results_external.shape: (27747, 4)\n",
+ "results_external.head():\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " stable_id | \n",
+ " xref_id | \n",
+ " dbprimary_acc | \n",
+ " db_name | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " ENSMUSG00000064341 | \n",
+ " 344016 | \n",
+ " 17716 | \n",
+ " EntrezGene | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " ENSMUSG00000064345 | \n",
+ " 344027 | \n",
+ " 17717 | \n",
+ " EntrezGene | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " ENSMUSG00000064351 | \n",
+ " 343950 | \n",
+ " 17708 | \n",
+ " EntrezGene | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " ENSMUSG00000064354 | \n",
+ " 343957 | \n",
+ " 17709 | \n",
+ " EntrezGene | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " ENSMUSG00000064356 | \n",
+ " 343940 | \n",
+ " 17706 | \n",
+ " EntrezGene | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " stable_id xref_id dbprimary_acc db_name\n",
+ "0 ENSMUSG00000064341 344016 17716 EntrezGene\n",
+ "1 ENSMUSG00000064345 344027 17717 EntrezGene\n",
+ "2 ENSMUSG00000064351 343950 17708 EntrezGene\n",
+ "3 ENSMUSG00000064354 343957 17709 EntrezGene\n",
+ "4 ENSMUSG00000064356 343940 17706 EntrezGene"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "duplicated ensembl_gene_ids with ncbi_gene_ids: 554\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ncbi_gene_id | \n",
+ "
\n",
+ " \n",
+ " stable_id | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ENSMUSG00000094383 | \n",
+ " 108168683 | \n",
+ "
\n",
+ " \n",
+ " ENSMUSG00000094383 | \n",
+ " 108168684 | \n",
+ "
\n",
+ " \n",
+ " ENSMUSG00000094383 | \n",
+ " 108169098 | \n",
+ "
\n",
+ " \n",
+ " ENSMUSG00000094383 | \n",
+ " 108169101 | \n",
+ "
\n",
+ " \n",
+ " ENSMUSG00000095634 | \n",
+ " 100039810 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ncbi_gene_id\n",
+ "stable_id \n",
+ "ENSMUSG00000094383 108168683\n",
+ "ENSMUSG00000094383 108168684\n",
+ "ENSMUSG00000094383 108169098\n",
+ "ENSMUSG00000094383 108169101\n",
+ "ENSMUSG00000095634 100039810"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Final df.shape: (57283, 6)\n",
+ "df.head():\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ensembl_gene_id | \n",
+ " symbol | \n",
+ " ncbi_gene_id | \n",
+ " biotype | \n",
+ " description | \n",
+ " synonyms | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " ENSMUSG00000000001 | \n",
+ " Gnai3 | \n",
+ " 14679 | \n",
+ " protein_coding | \n",
+ " guanine nucleotide binding protein (G protein)... | \n",
+ " Galphai3 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " ENSMUSG00000000003 | \n",
+ " Pbsn | \n",
+ " 54192 | \n",
+ " protein_coding | \n",
+ " probasin [Source:MGI Symbol;Acc:MGI:1860484] | \n",
+ " PB | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " ENSMUSG00000000028 | \n",
+ " Cdc45 | \n",
+ " 12544 | \n",
+ " protein_coding | \n",
+ " cell division cycle 45 [Source:MGI Symbol;Acc:... | \n",
+ " Cdc45l | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " ENSMUSG00000000031 | \n",
+ " H19 | \n",
+ " 14955 | \n",
+ " lncRNA | \n",
+ " H19, imprinted maternally expressed transcript... | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " ENSMUSG00000000037 | \n",
+ " Scml2 | \n",
+ " 107815 | \n",
+ " protein_coding | \n",
+ " Scm polycomb group protein like 2 [Source:MGI ... | \n",
+ " 4932420G07Rik | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ensembl_gene_id symbol ncbi_gene_id biotype \\\n",
+ "0 ENSMUSG00000000001 Gnai3 14679 protein_coding \n",
+ "1 ENSMUSG00000000003 Pbsn 54192 protein_coding \n",
+ "2 ENSMUSG00000000028 Cdc45 12544 protein_coding \n",
+ "3 ENSMUSG00000000031 H19 14955 lncRNA \n",
+ "4 ENSMUSG00000000037 Scml2 107815 protein_coding \n",
+ "\n",
+ " description synonyms \n",
+ "0 guanine nucleotide binding protein (G protein)... Galphai3 \n",
+ "1 probasin [Source:MGI Symbol;Acc:MGI:1860484] PB \n",
+ "2 cell division cycle 45 [Source:MGI Symbol;Acc:... Cdc45l \n",
+ "3 H19, imprinted maternally expressed transcript... \n",
+ "4 Scm polycomb group protein like 2 [Source:MGI ... 4932420G07Rik "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Saved as df_mouse__ensembl__release-110__Gene.parquet\n"
+ ]
+ }
+ ],
+ "source": [
+ "generate_genes_df(species=species.mouse)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Saccharomyces cerevisiae"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "result_core.shape: (8552, 5)\n",
+ "result_core.head():\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " stable_id | \n",
+ " display_label | \n",
+ " biotype | \n",
+ " description | \n",
+ " synonym | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " YBR024W | \n",
+ " SCO2 | \n",
+ " protein_coding | \n",
+ " Protein anchored to mitochondrial inner membra... | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " YDL245C | \n",
+ " HXT15 | \n",
+ " protein_coding | \n",
+ " Putative transmembrane polyol transporter; sup... | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " YBR232C | \n",
+ " None | \n",
+ " protein_coding | \n",
+ " Dubious open reading frame; unlikely to encode... | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " YDR320W-B | \n",
+ " None | \n",
+ " protein_coding | \n",
+ " Dubious open reading frame; unlikely to encode... | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " YBR021W | \n",
+ " FUR4 | \n",
+ " protein_coding | \n",
+ " Plasma membrane localized uracil permease; exp... | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " stable_id display_label biotype \\\n",
+ "0 YBR024W SCO2 protein_coding \n",
+ "1 YDL245C HXT15 protein_coding \n",
+ "2 YBR232C None protein_coding \n",
+ "3 YDR320W-B None protein_coding \n",
+ "4 YBR021W FUR4 protein_coding \n",
+ "\n",
+ " description synonym \n",
+ "0 Protein anchored to mitochondrial inner membra... None \n",
+ "1 Putative transmembrane polyol transporter; sup... None \n",
+ "2 Dubious open reading frame; unlikely to encode... None \n",
+ "3 Dubious open reading frame; unlikely to encode... None \n",
+ "4 Plasma membrane localized uracil permease; exp... None "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "results_core_group.head():\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " display_label | \n",
+ " biotype | \n",
+ " description | \n",
+ " synonym | \n",
+ "
\n",
+ " \n",
+ " stable_id | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " ETS1-1 | \n",
+ " None | \n",
+ " rRNA | \n",
+ " Non-coding region located immediately upstream... | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " ETS1-2 | \n",
+ " None | \n",
+ " rRNA | \n",
+ " Non-coding region located immediately upstream... | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " ETS2-1 | \n",
+ " None | \n",
+ " rRNA | \n",
+ " Non-coding region located adjacent to and down... | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " ETS2-2 | \n",
+ " None | \n",
+ " rRNA | \n",
+ " Non-coding region located adjacent to RDN25; t... | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " HRA1 | \n",
+ " None | \n",
+ " ncRNA | \n",
+ " Non-protein-coding RNA; substrate of RNase P, ... | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " display_label biotype \\\n",
+ "stable_id \n",
+ "ETS1-1 None rRNA \n",
+ "ETS1-2 None rRNA \n",
+ "ETS2-1 None rRNA \n",
+ "ETS2-2 None rRNA \n",
+ "HRA1 None ncRNA \n",
+ "\n",
+ " description synonym \n",
+ "stable_id \n",
+ "ETS1-1 Non-coding region located immediately upstream... \n",
+ "ETS1-2 Non-coding region located immediately upstream... \n",
+ "ETS2-1 Non-coding region located adjacent to and down... \n",
+ "ETS2-2 Non-coding region located adjacent to RDN25; t... \n",
+ "HRA1 Non-protein-coding RNA; substrate of RNase P, ... "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "results_external.shape: (0, 4)\n",
+ "results_external.head():\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " stable_id | \n",
+ " xref_id | \n",
+ " dbprimary_acc | \n",
+ " db_name | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Empty DataFrame\n",
+ "Columns: [stable_id, xref_id, dbprimary_acc, db_name]\n",
+ "Index: []"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "duplicated ensembl_gene_ids with ncbi_gene_ids: 0\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ncbi_gene_id | \n",
+ "
\n",
+ " \n",
+ " stable_id | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Empty DataFrame\n",
+ "Columns: [ncbi_gene_id]\n",
+ "Index: []"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Final df.shape: (7127, 6)\n",
+ "df.head():\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ensembl_gene_id | \n",
+ " symbol | \n",
+ " ncbi_gene_id | \n",
+ " biotype | \n",
+ " description | \n",
+ " synonyms | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " ETS1-1 | \n",
+ " None | \n",
+ " NaN | \n",
+ " rRNA | \n",
+ " Non-coding region located immediately upstream... | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " ETS1-2 | \n",
+ " None | \n",
+ " NaN | \n",
+ " rRNA | \n",
+ " Non-coding region located immediately upstream... | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " ETS2-1 | \n",
+ " None | \n",
+ " NaN | \n",
+ " rRNA | \n",
+ " Non-coding region located adjacent to and down... | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " ETS2-2 | \n",
+ " None | \n",
+ " NaN | \n",
+ " rRNA | \n",
+ " Non-coding region located adjacent to RDN25; t... | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " HRA1 | \n",
+ " None | \n",
+ " NaN | \n",
+ " ncRNA | \n",
+ " Non-protein-coding RNA; substrate of RNase P, ... | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ensembl_gene_id symbol ncbi_gene_id biotype \\\n",
+ "0 ETS1-1 None NaN rRNA \n",
+ "1 ETS1-2 None NaN rRNA \n",
+ "2 ETS2-1 None NaN rRNA \n",
+ "3 ETS2-2 None NaN rRNA \n",
+ "4 HRA1 None NaN ncRNA \n",
+ "\n",
+ " description synonyms \n",
+ "0 Non-coding region located immediately upstream... \n",
+ "1 Non-coding region located immediately upstream... \n",
+ "2 Non-coding region located adjacent to and down... \n",
+ "3 Non-coding region located adjacent to RDN25; t... \n",
+ "4 Non-protein-coding RNA; substrate of RNase P, ... "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Saved as df_saccharomyces cerevisiae__ensembl__release-110__Gene.parquet\n"
+ ]
+ }
+ ],
+ "source": [
+ "generate_genes_df(species=species.saccharomyces_cerevisiae)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3.9.13 ('py39')",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.16"
+ },
+ "orig_nbformat": 4,
+ "vscode": {
+ "interpreter": {
+ "hash": "ae1fefc8646a06dd2e75004cd934adda7c5727b046986a772e3b44b0ffba9754"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/ingest/index.md b/docs/ingest/index.md
index a8a6648..7f037b2 100644
--- a/docs/ingest/index.md
+++ b/docs/ingest/index.md
@@ -4,6 +4,7 @@
:maxdepth: 1
cell_line-clo-2022-03
+gene-ensembl-release-110
gene-ensembl-release-109
cell-marker-2.0
protein-uniprot-2023-03