def get_url(species: bt.Species):
    """Return the SQLAlchemy connection URL for a species' Ensembl core database.

    The URL targets the public server ``ensembldb.ensembl.org`` through the
    ``mysql+mysqldb`` dialect, logging in as ``anonymous`` with an empty
    password. The database name is read from ``species.core_db``.
    """
    connection_url = (
        f"mysql+mysqldb://anonymous:@ensembldb.ensembl.org/{species.core_db}"
    )
    return connection_url
def _join_unique_synonyms(synonyms) -> str:
    """Collapse one gene's synonyms into a single ``|``-separated string.

    Missing values (``None``/``NaN``) are skipped and duplicates are removed.
    The names are sorted so the result is the same on every run; iterating a
    bare ``set`` of strings yields an order that can change between
    interpreter sessions.
    """
    return "|".join(sorted({name for name in synonyms if pd.notna(name)}))


def generate_genes_df(species: bt.Species, version="release-110"):
    """Build the gene table of one species from Ensembl and save it as parquet.

    Runs ``query_core`` (symbol, biotype, description, synonyms) and
    ``query_external`` (EntrezGene cross-references) against the database
    addressed by ``get_url(species)``, collapses the core rows to one row per
    ``stable_id``, attaches the NCBI gene ids and writes the result to
    ``df_{species.name}__ensembl__{version}__Gene.parquet`` (a relative path).
    Intermediate frames are shown with ``display`` for inspection.

    Args:
        species: Species record providing ``core_db`` (via ``get_url``) and
            ``name`` (used in the output filename).
        version: Release label; only used to build the output filename.

    Returns:
        None. The table is written to disk, not returned.
    """
    engine = create_engine(url=get_url(species=species))
    try:
        # Fetch both result sets up front so the connection pool can be
        # released even if the processing below fails.
        results_core = pd.read_sql(query_core, con=engine)
        results_external = pd.read_sql(query_external, con=engine)
    finally:
        engine.dispose()

    # Basic gene annotations: one row per (gene, synonym) pair.
    print(f"result_core.shape: {results_core.shape}")
    print("result_core.head():\n")
    display(results_core.head())

    # Aggregate metadata per Ensembl stable_id.
    results_core_group = results_core.groupby("stable_id").agg(
        {
            "display_label": "first",
            "biotype": "first",
            "description": "first",
            "synonym": _join_unique_synonyms,
        }
    )
    print("results_core_group.head():\n")
    display(results_core_group.head())

    # External ids.
    # NOTE(review): this prefix filter removes every cross-reference of a
    # species whose stable ids do not start with "ENS" (the yeast ids in this
    # notebook look like "YBR024W") - confirm that this is intended.
    results_external = results_external[
        results_external.stable_id.str.startswith("ENS")
    ]
    print(f"results_external.shape: {results_external.shape}")
    print("results_external.head():\n")
    display(results_external.head())

    # ncbi_gene_id: one row per distinct (stable_id, EntrezGene accession).
    entrez = (
        results_external[results_external["db_name"] == "EntrezGene"]
        .drop_duplicates(["stable_id", "dbprimary_acc"])
        .drop(columns=["xref_id", "db_name"])
        .rename(columns={"dbprimary_acc": "ncbi_gene_id"})
        .set_index("stable_id")
    )
    dup = entrez[entrez.index.duplicated(keep=False)]
    print(f"duplicated ensembl_gene_ids with ncbi_gene_ids: {dup.shape[0]}\n")
    display(dup.head())

    # Merge with ncbi_gene_id. A gene linked to several NCBI ids ends up with
    # one row per id in the final table.
    df = (
        results_core_group.merge(
            entrez, left_index=True, right_index=True, how="outer"
        )
        .reset_index()
        .rename(
            columns={
                "stable_id": "ensembl_gene_id",
                "display_label": "symbol",
                "synonym": "synonyms",
            }
        )
    )
    df = df[
        [
            "ensembl_gene_id",
            "symbol",
            "ncbi_gene_id",
            "biotype",
            "description",
            "synonyms",
        ]
    ]
    df = df[df["ensembl_gene_id"].notna()]
    df = df.sort_values("ensembl_gene_id").reset_index(drop=True)
    print(f"Final df.shape: {df.shape}")
    print("df.head():\n")
    display(df.head())

    # Save to parquet.
    # NOTE(review): species.name is used verbatim, so a name containing a
    # space produces a filename with a space - confirm consumers expect that.
    filename = f"df_{species.name}__ensembl__{version}__Gene.parquet"
    df.to_parquet(filename)
    print(f"Saved as {filename}")
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stable_iddisplay_labelbiotypedescriptionsynonym
0ENSG00000210049MT-TFMt_tRNAmitochondrially encoded tRNA-Phe (UUU/C) [Sour...MTTF
1ENSG00000210049MT-TFMt_tRNAmitochondrially encoded tRNA-Phe (UUU/C) [Sour...TRNF
2ENSG00000211459MT-RNR1Mt_rRNAmitochondrially encoded 12S rRNA [Source:HGNC ...12S
3ENSG00000211459MT-RNR1Mt_rRNAmitochondrially encoded 12S rRNA [Source:HGNC ...MOTS-C
4ENSG00000211459MT-RNR1Mt_rRNAmitochondrially encoded 12S rRNA [Source:HGNC ...MTRNR1
\n", + "
" + ], + "text/plain": [ + " stable_id display_label biotype \\\n", + "0 ENSG00000210049 MT-TF Mt_tRNA \n", + "1 ENSG00000210049 MT-TF Mt_tRNA \n", + "2 ENSG00000211459 MT-RNR1 Mt_rRNA \n", + "3 ENSG00000211459 MT-RNR1 Mt_rRNA \n", + "4 ENSG00000211459 MT-RNR1 Mt_rRNA \n", + "\n", + " description synonym \n", + "0 mitochondrially encoded tRNA-Phe (UUU/C) [Sour... MTTF \n", + "1 mitochondrially encoded tRNA-Phe (UUU/C) [Sour... TRNF \n", + "2 mitochondrially encoded 12S rRNA [Source:HGNC ... 12S \n", + "3 mitochondrially encoded 12S rRNA [Source:HGNC ... MOTS-C \n", + "4 mitochondrially encoded 12S rRNA [Source:HGNC ... MTRNR1 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "results_core_group.head():\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
display_labelbiotypedescriptionsynonym
stable_id
ENSG00000000003TSPAN6protein_codingtetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858]TM4SF6|T245|TSPAN-6
ENSG00000000005TNMDprotein_codingtenomodulin [Source:HGNC Symbol;Acc:HGNC:17757]TEM|MYODULIN|CHM1L|TENDIN|BRICD4
ENSG00000000419DPM1protein_codingdolichyl-phosphate mannosyltransferase subunit...CDGIE|MPDS
ENSG00000000457SCYL3protein_codingSCY1 like pseudokinase 3 [Source:HGNC Symbol;A...PACE-1|PACE1
ENSG00000000460C1orf112protein_codingchromosome 1 open reading frame 112 [Source:HG...FLJ10706|APOLO1|FLIP
\n", + "
" + ], + "text/plain": [ + " display_label biotype \\\n", + "stable_id \n", + "ENSG00000000003 TSPAN6 protein_coding \n", + "ENSG00000000005 TNMD protein_coding \n", + "ENSG00000000419 DPM1 protein_coding \n", + "ENSG00000000457 SCYL3 protein_coding \n", + "ENSG00000000460 C1orf112 protein_coding \n", + "\n", + " description \\\n", + "stable_id \n", + "ENSG00000000003 tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858] \n", + "ENSG00000000005 tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757] \n", + "ENSG00000000419 dolichyl-phosphate mannosyltransferase subunit... \n", + "ENSG00000000457 SCY1 like pseudokinase 3 [Source:HGNC Symbol;A... \n", + "ENSG00000000460 chromosome 1 open reading frame 112 [Source:HG... \n", + "\n", + " synonym \n", + "stable_id \n", + "ENSG00000000003 TM4SF6|T245|TSPAN-6 \n", + "ENSG00000000005 TEM|MYODULIN|CHM1L|TENDIN|BRICD4 \n", + "ENSG00000000419 CDGIE|MPDS \n", + "ENSG00000000457 PACE-1|PACE1 \n", + "ENSG00000000460 FLJ10706|APOLO1|FLIP " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "results_external.shape: (36004, 4)\n", + "results_external.head():\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stable_idxref_iddbprimary_accdb_name
0ENSG000001988885540324535EntrezGene
1ENSG000001987635540454536EntrezGene
2ENSG000001988045538144512EntrezGene
3ENSG000002101511138145113219467EntrezGene
4ENSG000001987125538294513EntrezGene
\n", + "
" + ], + "text/plain": [ + " stable_id xref_id dbprimary_acc db_name\n", + "0 ENSG00000198888 554032 4535 EntrezGene\n", + "1 ENSG00000198763 554045 4536 EntrezGene\n", + "2 ENSG00000198804 553814 4512 EntrezGene\n", + "3 ENSG00000210151 1138145 113219467 EntrezGene\n", + "4 ENSG00000198712 553829 4513 EntrezGene" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "duplicated ensembl_gene_ids with ncbi_gene_ids: 6158\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ncbi_gene_id
stable_id
ENSG00000278294124907156
ENSG00000278294124907485
ENSG00000278294124908250
ENSG00000274917100008587
ENSG00000274917124907114
\n", + "
" + ], + "text/plain": [ + " ncbi_gene_id\n", + "stable_id \n", + "ENSG00000278294 124907156\n", + "ENSG00000278294 124907485\n", + "ENSG00000278294 124908250\n", + "ENSG00000274917 100008587\n", + "ENSG00000274917 124907114" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Final df.shape: (77043, 6)\n", + "df.head():\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ensembl_gene_idsymbolncbi_gene_idbiotypedescriptionsynonyms
0ENSG00000000003TSPAN67105protein_codingtetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858]TM4SF6|T245|TSPAN-6
1ENSG00000000005TNMD64102protein_codingtenomodulin [Source:HGNC Symbol;Acc:HGNC:17757]TEM|MYODULIN|CHM1L|TENDIN|BRICD4
2ENSG00000000419DPM18813protein_codingdolichyl-phosphate mannosyltransferase subunit...CDGIE|MPDS
3ENSG00000000457SCYL357147protein_codingSCY1 like pseudokinase 3 [Source:HGNC Symbol;A...PACE-1|PACE1
4ENSG00000000460C1orf11255732protein_codingchromosome 1 open reading frame 112 [Source:HG...FLJ10706|APOLO1|FLIP
\n", + "
" + ], + "text/plain": [ + " ensembl_gene_id symbol ncbi_gene_id biotype \\\n", + "0 ENSG00000000003 TSPAN6 7105 protein_coding \n", + "1 ENSG00000000005 TNMD 64102 protein_coding \n", + "2 ENSG00000000419 DPM1 8813 protein_coding \n", + "3 ENSG00000000457 SCYL3 57147 protein_coding \n", + "4 ENSG00000000460 C1orf112 55732 protein_coding \n", + "\n", + " description \\\n", + "0 tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858] \n", + "1 tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757] \n", + "2 dolichyl-phosphate mannosyltransferase subunit... \n", + "3 SCY1 like pseudokinase 3 [Source:HGNC Symbol;A... \n", + "4 chromosome 1 open reading frame 112 [Source:HG... \n", + "\n", + " synonyms \n", + "0 TM4SF6|T245|TSPAN-6 \n", + "1 TEM|MYODULIN|CHM1L|TENDIN|BRICD4 \n", + "2 CDGIE|MPDS \n", + "3 PACE-1|PACE1 \n", + "4 FLJ10706|APOLO1|FLIP " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saved as df_human__ensembl__release-110__Gene.parquet\n" + ] + } + ], + "source": [ + "generate_genes_df(species=species.human)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Mouse" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "result_core.shape: (84751, 5)\n", + "result_core.head():\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stable_iddisplay_labelbiotypedescriptionsynonym
0ENSMUSG00000064336mt-TfMt_tRNAmitochondrially encoded tRNA phenylalanine [So...tRNA
1ENSMUSG00000064336mt-TfMt_tRNAmitochondrially encoded tRNA phenylalanine [So...tRNA-Phe
2ENSMUSG00000064336mt-TfMt_tRNAmitochondrially encoded tRNA phenylalanine [So...TrnF tRNA
3ENSMUSG00000064337mt-Rnr1Mt_rRNAmitochondrially encoded 12S rRNA [Source:MGI S...12S ribosomal RNA
4ENSMUSG00000064337mt-Rnr1Mt_rRNAmitochondrially encoded 12S rRNA [Source:MGI S...12S rRNA
\n", + "
" + ], + "text/plain": [ + " stable_id display_label biotype \\\n", + "0 ENSMUSG00000064336 mt-Tf Mt_tRNA \n", + "1 ENSMUSG00000064336 mt-Tf Mt_tRNA \n", + "2 ENSMUSG00000064336 mt-Tf Mt_tRNA \n", + "3 ENSMUSG00000064337 mt-Rnr1 Mt_rRNA \n", + "4 ENSMUSG00000064337 mt-Rnr1 Mt_rRNA \n", + "\n", + " description synonym \n", + "0 mitochondrially encoded tRNA phenylalanine [So... tRNA \n", + "1 mitochondrially encoded tRNA phenylalanine [So... tRNA-Phe \n", + "2 mitochondrially encoded tRNA phenylalanine [So... TrnF tRNA \n", + "3 mitochondrially encoded 12S rRNA [Source:MGI S... 12S ribosomal RNA \n", + "4 mitochondrially encoded 12S rRNA [Source:MGI S... 12S rRNA " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "results_core_group.head():\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
display_labelbiotypedescriptionsynonym
stable_id
ENSMUSG00000000001Gnai3protein_codingguanine nucleotide binding protein (G protein)...Galphai3
ENSMUSG00000000003Pbsnprotein_codingprobasin [Source:MGI Symbol;Acc:MGI:1860484]PB
ENSMUSG00000000028Cdc45protein_codingcell division cycle 45 [Source:MGI Symbol;Acc:...Cdc45l
ENSMUSG00000000031H19lncRNAH19, imprinted maternally expressed transcript...
ENSMUSG00000000037Scml2protein_codingScm polycomb group protein like 2 [Source:MGI ...4932420G07Rik
\n", + "
" + ], + "text/plain": [ + " display_label biotype \\\n", + "stable_id \n", + "ENSMUSG00000000001 Gnai3 protein_coding \n", + "ENSMUSG00000000003 Pbsn protein_coding \n", + "ENSMUSG00000000028 Cdc45 protein_coding \n", + "ENSMUSG00000000031 H19 lncRNA \n", + "ENSMUSG00000000037 Scml2 protein_coding \n", + "\n", + " description \\\n", + "stable_id \n", + "ENSMUSG00000000001 guanine nucleotide binding protein (G protein)... \n", + "ENSMUSG00000000003 probasin [Source:MGI Symbol;Acc:MGI:1860484] \n", + "ENSMUSG00000000028 cell division cycle 45 [Source:MGI Symbol;Acc:... \n", + "ENSMUSG00000000031 H19, imprinted maternally expressed transcript... \n", + "ENSMUSG00000000037 Scm polycomb group protein like 2 [Source:MGI ... \n", + "\n", + " synonym \n", + "stable_id \n", + "ENSMUSG00000000001 Galphai3 \n", + "ENSMUSG00000000003 PB \n", + "ENSMUSG00000000028 Cdc45l \n", + "ENSMUSG00000000031 \n", + "ENSMUSG00000000037 4932420G07Rik " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "results_external.shape: (27747, 4)\n", + "results_external.head():\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stable_idxref_iddbprimary_accdb_name
0ENSMUSG0000006434134401617716EntrezGene
1ENSMUSG0000006434534402717717EntrezGene
2ENSMUSG0000006435134395017708EntrezGene
3ENSMUSG0000006435434395717709EntrezGene
4ENSMUSG0000006435634394017706EntrezGene
\n", + "
" + ], + "text/plain": [ + " stable_id xref_id dbprimary_acc db_name\n", + "0 ENSMUSG00000064341 344016 17716 EntrezGene\n", + "1 ENSMUSG00000064345 344027 17717 EntrezGene\n", + "2 ENSMUSG00000064351 343950 17708 EntrezGene\n", + "3 ENSMUSG00000064354 343957 17709 EntrezGene\n", + "4 ENSMUSG00000064356 343940 17706 EntrezGene" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "duplicated ensembl_gene_ids with ncbi_gene_ids: 554\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ncbi_gene_id
stable_id
ENSMUSG00000094383108168683
ENSMUSG00000094383108168684
ENSMUSG00000094383108169098
ENSMUSG00000094383108169101
ENSMUSG00000095634100039810
\n", + "
" + ], + "text/plain": [ + " ncbi_gene_id\n", + "stable_id \n", + "ENSMUSG00000094383 108168683\n", + "ENSMUSG00000094383 108168684\n", + "ENSMUSG00000094383 108169098\n", + "ENSMUSG00000094383 108169101\n", + "ENSMUSG00000095634 100039810" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Final df.shape: (57283, 6)\n", + "df.head():\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ensembl_gene_idsymbolncbi_gene_idbiotypedescriptionsynonyms
0ENSMUSG00000000001Gnai314679protein_codingguanine nucleotide binding protein (G protein)...Galphai3
1ENSMUSG00000000003Pbsn54192protein_codingprobasin [Source:MGI Symbol;Acc:MGI:1860484]PB
2ENSMUSG00000000028Cdc4512544protein_codingcell division cycle 45 [Source:MGI Symbol;Acc:...Cdc45l
3ENSMUSG00000000031H1914955lncRNAH19, imprinted maternally expressed transcript...
4ENSMUSG00000000037Scml2107815protein_codingScm polycomb group protein like 2 [Source:MGI ...4932420G07Rik
\n", + "
" + ], + "text/plain": [ + " ensembl_gene_id symbol ncbi_gene_id biotype \\\n", + "0 ENSMUSG00000000001 Gnai3 14679 protein_coding \n", + "1 ENSMUSG00000000003 Pbsn 54192 protein_coding \n", + "2 ENSMUSG00000000028 Cdc45 12544 protein_coding \n", + "3 ENSMUSG00000000031 H19 14955 lncRNA \n", + "4 ENSMUSG00000000037 Scml2 107815 protein_coding \n", + "\n", + " description synonyms \n", + "0 guanine nucleotide binding protein (G protein)... Galphai3 \n", + "1 probasin [Source:MGI Symbol;Acc:MGI:1860484] PB \n", + "2 cell division cycle 45 [Source:MGI Symbol;Acc:... Cdc45l \n", + "3 H19, imprinted maternally expressed transcript... \n", + "4 Scm polycomb group protein like 2 [Source:MGI ... 4932420G07Rik " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saved as df_mouse__ensembl__release-110__Gene.parquet\n" + ] + } + ], + "source": [ + "generate_genes_df(species=species.mouse)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## saccharomyces_cerevisiae" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "result_core.shape: (8552, 5)\n", + "result_core.head():\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stable_iddisplay_labelbiotypedescriptionsynonym
0YBR024WSCO2protein_codingProtein anchored to mitochondrial inner membra...None
1YDL245CHXT15protein_codingPutative transmembrane polyol transporter; sup...None
2YBR232CNoneprotein_codingDubious open reading frame; unlikely to encode...None
3YDR320W-BNoneprotein_codingDubious open reading frame; unlikely to encode...None
4YBR021WFUR4protein_codingPlasma membrane localized uracil permease; exp...None
\n", + "
" + ], + "text/plain": [ + " stable_id display_label biotype \\\n", + "0 YBR024W SCO2 protein_coding \n", + "1 YDL245C HXT15 protein_coding \n", + "2 YBR232C None protein_coding \n", + "3 YDR320W-B None protein_coding \n", + "4 YBR021W FUR4 protein_coding \n", + "\n", + " description synonym \n", + "0 Protein anchored to mitochondrial inner membra... None \n", + "1 Putative transmembrane polyol transporter; sup... None \n", + "2 Dubious open reading frame; unlikely to encode... None \n", + "3 Dubious open reading frame; unlikely to encode... None \n", + "4 Plasma membrane localized uracil permease; exp... None " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "results_core_group.head():\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
display_labelbiotypedescriptionsynonym
stable_id
ETS1-1NonerRNANon-coding region located immediately upstream...
ETS1-2NonerRNANon-coding region located immediately upstream...
ETS2-1NonerRNANon-coding region located adjacent to and down...
ETS2-2NonerRNANon-coding region located adjacent to RDN25; t...
HRA1NonencRNANon-protein-coding RNA; substrate of RNase P, ...
\n", + "
" + ], + "text/plain": [ + " display_label biotype \\\n", + "stable_id \n", + "ETS1-1 None rRNA \n", + "ETS1-2 None rRNA \n", + "ETS2-1 None rRNA \n", + "ETS2-2 None rRNA \n", + "HRA1 None ncRNA \n", + "\n", + " description synonym \n", + "stable_id \n", + "ETS1-1 Non-coding region located immediately upstream... \n", + "ETS1-2 Non-coding region located immediately upstream... \n", + "ETS2-1 Non-coding region located adjacent to and down... \n", + "ETS2-2 Non-coding region located adjacent to RDN25; t... \n", + "HRA1 Non-protein-coding RNA; substrate of RNase P, ... " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "results_external.shape: (0, 4)\n", + "results_external.head():\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stable_idxref_iddbprimary_accdb_name
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [stable_id, xref_id, dbprimary_acc, db_name]\n", + "Index: []" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "duplicated ensembl_gene_ids with ncbi_gene_ids: 0\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ncbi_gene_id
stable_id
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [ncbi_gene_id]\n", + "Index: []" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Final df.shape: (7127, 6)\n", + "df.head():\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ensembl_gene_idsymbolncbi_gene_idbiotypedescriptionsynonyms
0ETS1-1NoneNaNrRNANon-coding region located immediately upstream...
1ETS1-2NoneNaNrRNANon-coding region located immediately upstream...
2ETS2-1NoneNaNrRNANon-coding region located adjacent to and down...
3ETS2-2NoneNaNrRNANon-coding region located adjacent to RDN25; t...
4HRA1NoneNaNncRNANon-protein-coding RNA; substrate of RNase P, ...
\n", + "
" + ], + "text/plain": [ + " ensembl_gene_id symbol ncbi_gene_id biotype \\\n", + "0 ETS1-1 None NaN rRNA \n", + "1 ETS1-2 None NaN rRNA \n", + "2 ETS2-1 None NaN rRNA \n", + "3 ETS2-2 None NaN rRNA \n", + "4 HRA1 None NaN ncRNA \n", + "\n", + " description synonyms \n", + "0 Non-coding region located immediately upstream... \n", + "1 Non-coding region located immediately upstream... \n", + "2 Non-coding region located adjacent to and down... \n", + "3 Non-coding region located adjacent to RDN25; t... \n", + "4 Non-protein-coding RNA; substrate of RNase P, ... " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saved as df_saccharomyces cerevisiae__ensembl__release-110__Gene.parquet\n" + ] + } + ], + "source": [ + "generate_genes_df(species=species.saccharomyces_cerevisiae)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.13 ('py39')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "ae1fefc8646a06dd2e75004cd934adda7c5727b046986a772e3b44b0ffba9754" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/ingest/index.md b/docs/ingest/index.md index a8a6648..7f037b2 100644 --- a/docs/ingest/index.md +++ b/docs/ingest/index.md @@ -4,6 +4,7 @@ :maxdepth: 1 cell_line-clo-2022-03 +gene-ensembl-release-110 gene-ensembl-release-109 cell-marker-2.0 protein-uniprot-2023-03