From 07e3b81ad48cd4734a368ee58b8396f29211836c Mon Sep 17 00:00:00 2001 From: Shawn Whitfield <80531166+stwhitfield@users.noreply.github.com> Date: Mon, 16 Oct 2023 14:07:40 -0400 Subject: [PATCH 01/15] add notebook folder and Get_ChEMBL_Approved_Drugs --- notebooks/Get_ChEMBL_Approved_Drugs.ipynb | 435 ++++++++++++++++++++++ 1 file changed, 435 insertions(+) create mode 100644 notebooks/Get_ChEMBL_Approved_Drugs.ipynb diff --git a/notebooks/Get_ChEMBL_Approved_Drugs.ipynb b/notebooks/Get_ChEMBL_Approved_Drugs.ipynb new file mode 100644 index 00000000..bd7851f8 --- /dev/null +++ b/notebooks/Get_ChEMBL_Approved_Drugs.ipynb @@ -0,0 +1,435 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "28c39f67", + "metadata": {}, + "source": [ + "Retrieve all the approved drugs from ChEMBL as well as the date of approval and the SMILES.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "2dd3e4ab-5921-4422-b6c6-adf2b6801254", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import datamol as dm\n", + "\n", + "from chembl_webresource_client.new_client import new_client as client\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "cc2c16ee", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4192" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# First, we retrieve the ChEMBL IDs for all the approved drugs (max_phase=4)\n", + "mol_ids = client.molecule.filter(max_phase=4).only([\"molecule_chembl_id\"])\n", + "mol_ids = pd.DataFrame(mol_ids)\n", + "\n", + "len(mol_ids)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2744b624", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f7b1a87a59644f278e7c90344e56adc8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/4192 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Database is locked in thread 140349682980544; retrying (1/3)\n", + "Database is locked in thread 140342728836800; retrying (1/3)\n", + "Database is locked in thread 140352400905920; retrying (1/3)\n", + "Database is locked in thread 140357819938496; retrying (1/3)\n", + "Database is locked in thread 140356704265920; retrying (1/3)\n", + "Database is locked in thread 140341143402176; retrying (1/3)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + " | first_approval | \n", + "molecule_chembl_id | \n", + "molecule_type | \n", + "pref_name | \n", + "smiles | \n", + "
---|---|---|---|---|---|
0 | \n", + "1976.0 | \n", + "CHEMBL2 | \n", + "Small molecule | \n", + "PRAZOSIN | \n", + "COc1cc2nc(N3CCN(C(=O)c4ccco4)CC3)nc(N)c2cc1OC | \n", + "
1 | \n", + "1984.0 | \n", + "CHEMBL3 | \n", + "Small molecule | \n", + "NICOTINE | \n", + "CN1CCC[C@H]1c1cccnc1 | \n", + "
2 | \n", + "1990.0 | \n", + "CHEMBL4 | \n", + "Small molecule | \n", + "OFLOXACIN | \n", + "CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23 | \n", + "
3 | \n", + "1964.0 | \n", + "CHEMBL5 | \n", + "Small molecule | \n", + "NALIDIXIC ACID | \n", + "CCn1cc(C(=O)O)c(=O)c2ccc(C)nc21 | \n", + "
4 | \n", + "1965.0 | \n", + "CHEMBL6 | \n", + "Small molecule | \n", + "INDOMETHACIN | \n", + "COc1ccc2c(c1)c(CC(=O)O)c(C)n2C(=O)c1ccc(Cl)cc1 | \n", + "
\n", + " | first_approval | \n", + "molecule_chembl_id | \n", + "molecule_type | \n", + "pref_name | \n", + "smiles | \n", + "
---|---|---|---|---|---|
0 | \n", + "1976.0 | \n", + "CHEMBL2 | \n", + "Small molecule | \n", + "PRAZOSIN | \n", + "COc1cc2nc(N3CCN(C(=O)c4ccco4)CC3)nc(N)c2cc1OC | \n", + "
1 | \n", + "1984.0 | \n", + "CHEMBL3 | \n", + "Small molecule | \n", + "NICOTINE | \n", + "CN1CCC[C@H]1c1cccnc1 | \n", + "
2 | \n", + "1990.0 | \n", + "CHEMBL4 | \n", + "Small molecule | \n", + "OFLOXACIN | \n", + "CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23 | \n", + "
3 | \n", + "1964.0 | \n", + "CHEMBL5 | \n", + "Small molecule | \n", + "NALIDIXIC ACID | \n", + "CCn1cc(C(=O)O)c(=O)c2ccc(C)nc21 | \n", + "
4 | \n", + "1965.0 | \n", + "CHEMBL6 | \n", + "Small molecule | \n", + "INDOMETHACIN | \n", + "COc1ccc2c(c1)c(CC(=O)O)c(C)n2C(=O)c1ccc(Cl)cc1 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
2623 | \n", + "2015.0 | \n", + "CHEMBL5095048 | \n", + "Small molecule | \n", + "AMPHETAMINE ASPARTATE/DEXTROAMPHETAMINE SULFATE | \n", + "CC(N)Cc1ccccc1.C[C@H](N)Cc1ccccc1.C[C@H](N)Cc1... | \n", + "
2624 | \n", + "2022.0 | \n", + "CHEMBL5095049 | \n", + "Small molecule | \n", + "PACRITINIB CITRATE | \n", + "C1=C/COCc2cc(ccc2OCCN2CCCC2)Nc2nccc(n2)-c2cccc... | \n", + "
2625 | \n", + "2021.0 | \n", + "CHEMBL5095050 | \n", + "Small molecule | \n", + "FINGOLIMOD LAURYL SULFATE | \n", + "CCCCCCCCCCCCOS(=O)(=O)O.CCCCCCCCc1ccc(CCC(N)(C... | \n", + "
2626 | \n", + "2022.0 | \n", + "CHEMBL5095051 | \n", + "Small molecule | \n", + "VENLAFAXINE BESYLATE | \n", + "COc1ccc(C(CN(C)C)C2(O)CCCCC2)cc1.O=S(=O)(O)c1c... | \n", + "
2627 | \n", + "2015.0 | \n", + "CHEMBL5095505 | \n", + "Small molecule | \n", + "AMPHETAMINE/DEXTROAMPHETAMINE | \n", + "CC(N)Cc1ccccc1.C[C@H](N)Cc1ccccc1.C[C@H](N)Cc1... | \n", + "
2628 rows × 5 columns
\n", + "\n", - " | first_approval | \n", - "molecule_chembl_id | \n", - "molecule_type | \n", - "pref_name | \n", - "smiles | \n", - "
---|---|---|---|---|---|
0 | \n", - "1976.0 | \n", - "CHEMBL2 | \n", - "Small molecule | \n", - "PRAZOSIN | \n", - "COc1cc2nc(N3CCN(C(=O)c4ccco4)CC3)nc(N)c2cc1OC | \n", - "
1 | \n", - "1984.0 | \n", - "CHEMBL3 | \n", - "Small molecule | \n", - "NICOTINE | \n", - "CN1CCC[C@H]1c1cccnc1 | \n", - "
2 | \n", - "1990.0 | \n", - "CHEMBL4 | \n", - "Small molecule | \n", - "OFLOXACIN | \n", - "CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23 | \n", - "
3 | \n", - "1964.0 | \n", - "CHEMBL5 | \n", - "Small molecule | \n", - "NALIDIXIC ACID | \n", - "CCn1cc(C(=O)O)c(=O)c2ccc(C)nc21 | \n", - "
4 | \n", - "1965.0 | \n", - "CHEMBL6 | \n", - "Small molecule | \n", - "INDOMETHACIN | \n", - "COc1ccc2c(c1)c(CC(=O)O)c(C)n2C(=O)c1ccc(Cl)cc1 | \n", - "
\n", - " | first_approval | \n", - "molecule_chembl_id | \n", - "molecule_type | \n", - "pref_name | \n", - "smiles | \n", - "
---|---|---|---|---|---|
0 | \n", - "1976.0 | \n", - "CHEMBL2 | \n", - "Small molecule | \n", - "PRAZOSIN | \n", - "COc1cc2nc(N3CCN(C(=O)c4ccco4)CC3)nc(N)c2cc1OC | \n", - "
1 | \n", - "1984.0 | \n", - "CHEMBL3 | \n", - "Small molecule | \n", - "NICOTINE | \n", - "CN1CCC[C@H]1c1cccnc1 | \n", - "
2 | \n", - "1990.0 | \n", - "CHEMBL4 | \n", - "Small molecule | \n", - "OFLOXACIN | \n", - "CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23 | \n", - "
3 | \n", - "1964.0 | \n", - "CHEMBL5 | \n", - "Small molecule | \n", - "NALIDIXIC ACID | \n", - "CCn1cc(C(=O)O)c(=O)c2ccc(C)nc21 | \n", - "
4 | \n", - "1965.0 | \n", - "CHEMBL6 | \n", - "Small molecule | \n", - "INDOMETHACIN | \n", - "COc1ccc2c(c1)c(CC(=O)O)c(C)n2C(=O)c1ccc(Cl)cc1 | \n", - "
... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "
2623 | \n", - "2015.0 | \n", - "CHEMBL5095048 | \n", - "Small molecule | \n", - "AMPHETAMINE ASPARTATE/DEXTROAMPHETAMINE SULFATE | \n", - "CC(N)Cc1ccccc1.C[C@H](N)Cc1ccccc1.C[C@H](N)Cc1... | \n", - "
2624 | \n", - "2022.0 | \n", - "CHEMBL5095049 | \n", - "Small molecule | \n", - "PACRITINIB CITRATE | \n", - "C1=C/COCc2cc(ccc2OCCN2CCCC2)Nc2nccc(n2)-c2cccc... | \n", - "
2625 | \n", - "2021.0 | \n", - "CHEMBL5095050 | \n", - "Small molecule | \n", - "FINGOLIMOD LAURYL SULFATE | \n", - "CCCCCCCCCCCCOS(=O)(=O)O.CCCCCCCCc1ccc(CCC(N)(C... | \n", - "
2626 | \n", - "2022.0 | \n", - "CHEMBL5095051 | \n", - "Small molecule | \n", - "VENLAFAXINE BESYLATE | \n", - "COc1ccc(C(CN(C)C)C2(O)CCCCC2)cc1.O=S(=O)(O)c1c... | \n", - "
2627 | \n", - "2015.0 | \n", - "CHEMBL5095505 | \n", - "Small molecule | \n", - "AMPHETAMINE/DEXTROAMPHETAMINE | \n", - "CC(N)Cc1ccccc1.C[C@H](N)Cc1ccccc1.C[C@H](N)Cc1... | \n", - "
2628 rows × 5 columns
\n", - "6w01^Go$j;62#A5C!Eq!Em!r3EM2q``7SodVTv6gFiV3VArB;#tonVcM
z;#HRX%M}(_p%0QZL|Dfn#4JtoLcP;)s?!iJn44O`gSKmx8_hM4PpAMKNXTp{8Pp2{
zQYs;%X!sG$*#ufr=thS2l531G2>VLqL<8O59mTa6#Y{H|EVwQT3Cz9UUC4)0Ai`g0
z%N_#=-*|!?R`(Yi(tkgN;Q`&zF8p0@@~uRWd1J4|R=XO_)?gWpR)|&!r-95L$>A#?
z-$}$RtZ;}!RoLvjT*W4aOmf2}C$vF{VZH8CprQs3nlcmyg+}}s PEsdu*7LsrdQop)g$oh$6bjbS
zV(5H5+WH!hSKeWSPUYA9hPuOf)FC)#w4P4u*Et_k4p;)O* $O>lX{Ze_VLOZ^)0jAvbe|+^QTh wOZSP@}kZQtwB>iDi3
zHMqJBT{xKOS`0Z}1no!AEljAFCXEeA(cWdk3GoIQaDA
z!Dsvm&*=&; W`oT{v53HE6cJ(CA~cl+e2a|qd7M`
znTN$*d?^pE
QIT@W}`=^?GWF1B?iR7btT|L>x6gtBR*Yk8;j_s9tWuAvTY%i2?(N`GA
z)vhPas_cBQ5|;5}?fGOAR-ivYK2@r|Pc}JE7m>SM-%kDW!}J^UabDc9+;vkwIst=6
zy8r5|Z2E^M3qSGMg>G0eMEbBft36in4SA!JGf;){!W==lB7kX*emy?PNmmux?~b{u
zIM!dTDT7@GbeO-oilM_JhWUTSwUW6U-8Fcyg}7x3yA^K{CU&ALrrm4(28P?k)3RXz
z3~tSq4VSob4rp6tHLO