diff --git a/.github/workflows/deploy-mkdocs-on-latest.yml b/.github/workflows/deploy-mkdocs-on-latest.yml new file mode 100644 index 0000000..32cc3cf --- /dev/null +++ b/.github/workflows/deploy-mkdocs-on-latest.yml @@ -0,0 +1,14 @@ +name: Deploy MkDocs on latest commit + +on: + push: + branches: + - main + - master + +jobs: + deploy-mkdocs: + uses: deargen/workflows/.github/workflows/deploy-mkdocs.yml@master + with: + deploy-type: latest + requirements-file: deps/lock/x86_64-manylinux_2_28/requirements_docs.txt diff --git a/README.md b/README.md index 130b440..9a340cb 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ # bio-data-to-db: make Uniprot PostgreSQL database + [![image](https://img.shields.io/pypi/v/bio-data-to-db.svg)](https://pypi.python.org/pypi/bio-data-to-db) [![PyPI - Downloads](https://img.shields.io/pypi/dm/bio-data-to-db)](https://pypi.python.org/pypi/bio-data-to-db) [![image](https://img.shields.io/pypi/l/bio-data-to-db.svg)](https://pypi.python.org/pypi/bio-data-to-db) @@ -19,6 +20,8 @@ Written in Rust, thus equipped with extremely fast parsers. Packaged for python, So far, there is only one function implemented: **convert uniprot data to postgresql**. This package focuses more on parsing the data and inserting it into the database, rather than curating the data. +[📚 Documentation](https://deargen.github.io/bio-data-to-db/) + ## 🛠️ Installation ```bash @@ -29,6 +32,8 @@ pip install bio-data-to-db You can use the command line interface or the python API. +### Uniprot + ```bash # It will create a db 'uniprot' and a table named 'public.uniprot_info' in the database. # If you want another name, you can optionally pass it as the last argument. 
@@ -61,6 +66,48 @@ create_accession_to_pk_id_table("postgresql://user:password@localhost:5432/unipr keywords_tsv_to_postgresql("~/Downloads/keywords_all_2024_06_26.tsv", "postgresql://user:password@localhost:5432/uniprot") ``` +### BindingDB + +```bash +# Decode HTML entities and strip the strings in the `assay` table (column: description and assay_name). +# Currently, only assay table is supported. +bio-data-to-db bindingdb fix-table assay 'mysql://username:password@localhost/bind' +``` + +```python +from bio_data_to_db.bindingdb.fix_tables import fix_assay_table + +fix_assay_table("mysql://username:password@localhost/bind") +``` + +### PostgreSQL Helpers, SMILES, Polars utils and more + +Some useful functions to work with PostgreSQL. + +```python +from bio_data_to_db.utils.postgresql import ( + create_db_if_not_exists, + create_schema_if_not_exists, + set_column_as_primary_key, + make_columns_unique, + make_large_columns_unique, + split_column_str_to_list, + polars_write_database, +) + +from bio_data_to_db.utils.smiles import ( + canonical_smiles_wo_salt, + polars_canonical_smiles_wo_salt, +) + +from bio_data_to_db.utils.polars import ( + w_pbar, +) +``` + +You can find the usage in the [📚 documentation](https://deargen.github.io/bio-data-to-db/). + + ## 👨‍💻️ Maintenance Notes ### Install from source @@ -72,10 +119,14 @@ bash scripts/install.sh uv pip install -r deps/requirements_dev.in ``` -### Compile requirements (generate lockfiles) +### Generate lockfiles Use GitHub Actions: `apply-pip-compile.yml`. Manually launch the workflow and it will make a commit with the updated lockfiles. +### Publish a new version to PyPI + +Use GitHub Actions: `deploy.yml`. Manually launch the workflow and it will compile on all architectures and publish the new version to PyPI. + ### About sqlx Sqlx offline mode should be configured so you can compile the code without a database present. 
diff --git a/deps/lock/aarch64-apple-darwin/.requirements.in.sha256 b/deps/lock/aarch64-apple-darwin/.requirements.in.sha256 index f9355d5..2be7d1e 100644 --- a/deps/lock/aarch64-apple-darwin/.requirements.in.sha256 +++ b/deps/lock/aarch64-apple-darwin/.requirements.in.sha256 @@ -1 +1 @@ -816025c3ff73af3261b082ee7e0c71954aa6b20922e17344cfb2f29636733488 requirements.in +2f65dd8deb2842edfead23a6aafb4f4f0b9e9e98982e39216069787d16327901 requirements.in diff --git a/deps/lock/aarch64-apple-darwin/.requirements_docs.in.sha256 b/deps/lock/aarch64-apple-darwin/.requirements_docs.in.sha256 new file mode 100644 index 0000000..6fd7f42 --- /dev/null +++ b/deps/lock/aarch64-apple-darwin/.requirements_docs.in.sha256 @@ -0,0 +1 @@ +f0f530946f38443ec95d76ac402dc3e3045fe8f7c26220e46b575aa56649503d requirements_docs.in diff --git a/deps/lock/aarch64-apple-darwin/requirements.txt b/deps/lock/aarch64-apple-darwin/requirements.txt index def352e..bacfba6 100644 --- a/deps/lock/aarch64-apple-darwin/requirements.txt +++ b/deps/lock/aarch64-apple-darwin/requirements.txt @@ -2,16 +2,23 @@ # uv pip compile requirements.in -o /home/runner/work/bio-data-to-db/bio-data-to-db/deps/lock/aarch64-apple-darwin/requirements.txt --python-platform aarch64-apple-darwin --python-version 3.10 click==8.1.7 # via typer +connectorx==0.3.3 + # via -r requirements.in markdown-it-py==3.0.0 # via rich mdurl==0.1.2 # via markdown-it-py -numpy==2.0.0 +mysqlclient==2.2.4 + # via -r requirements.in +numpy==1.26.4 # via # pandas # pyarrow + # rdkit pandas==2.2.2 # via -r requirements.in +pillow==10.4.0 + # via rdkit polars==1.2.0 # via -r requirements.in psycopg==3.2.1 @@ -26,6 +33,8 @@ python-dateutil==2.9.0.post0 # via pandas pytz==2024.1 # via pandas +rdkit==2024.3.3 + # via -r requirements.in rich==13.7.1 # via typer shellingham==1.5.4 @@ -34,6 +43,8 @@ six==1.16.0 # via python-dateutil sqlalchemy==2.0.31 # via -r requirements.in +tqdm==4.66.4 + # via -r requirements.in typer==0.12.3 # via -r requirements.in 
typing-extensions==4.12.2 diff --git a/deps/lock/aarch64-apple-darwin/requirements_dev.txt b/deps/lock/aarch64-apple-darwin/requirements_dev.txt index b904385..58547d9 100644 --- a/deps/lock/aarch64-apple-darwin/requirements_dev.txt +++ b/deps/lock/aarch64-apple-darwin/requirements_dev.txt @@ -6,6 +6,8 @@ charset-normalizer==3.3.2 # via requests click==8.1.7 # via typer +connectorx==0.3.3 + # via -r requirements.in exceptiongroup==1.2.2 # via pytest filelock==3.15.4 @@ -24,13 +26,16 @@ maturin==1.7.0 # via -r requirements_dev.in mdurl==0.1.2 # via markdown-it-py +mysqlclient==2.2.4 + # via -r requirements.in networkx==3.3 # via -r requirements_dev.in -numpy==2.0.0 +numpy==1.26.4 # via # -r requirements_dev.in # pandas # pyarrow + # rdkit # scipy # trimesh packaging==24.1 @@ -39,6 +44,8 @@ packaging==24.1 # pytest pandas==2.2.2 # via -r requirements.in +pillow==10.4.0 + # via rdkit pluggy==1.5.0 # via pytest polars==1.2.0 @@ -59,6 +66,8 @@ pytz==2024.1 # via pandas pyyaml==6.0.1 # via huggingface-hub +rdkit==2024.3.3 + # via -r requirements.in requests==2.32.3 # via huggingface-hub rich==13.7.1 @@ -82,7 +91,9 @@ tomli==2.0.1 # maturin # pytest tqdm==4.66.4 - # via huggingface-hub + # via + # -r requirements.in + # huggingface-hub trimesh==4.4.3 # via -r requirements_dev.in typer==0.12.3 diff --git a/deps/lock/aarch64-apple-darwin/requirements_docs.txt b/deps/lock/aarch64-apple-darwin/requirements_docs.txt new file mode 100644 index 0000000..4b8a53c --- /dev/null +++ b/deps/lock/aarch64-apple-darwin/requirements_docs.txt @@ -0,0 +1,156 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements_docs.in -o /home/runner/work/bio-data-to-db/bio-data-to-db/deps/lock/aarch64-apple-darwin/requirements_docs.txt --python-platform aarch64-apple-darwin --python-version 3.10 +babel==2.15.0 + # via mkdocs-material +backports-strenum==1.3.1 + # via griffe +cairocffi==1.7.1 + # via cairosvg +cairosvg==2.7.1 + # via mkdocs-material 
+certifi==2024.7.4 + # via requests +cffi==1.16.0 + # via cairocffi +charset-normalizer==3.3.2 + # via requests +click==8.1.7 + # via + # mkdocs + # mkdocstrings +colorama==0.4.6 + # via + # griffe + # mkdocs-material +cssselect2==0.7.0 + # via cairosvg +defusedxml==0.7.1 + # via cairosvg +ghp-import==2.1.0 + # via mkdocs +griffe==0.48.0 + # via mkdocstrings-python +idna==3.7 + # via requests +importlib-metadata==8.2.0 + # via mike +importlib-resources==6.4.0 + # via mike +jinja2==3.1.4 + # via + # mike + # mkdocs + # mkdocs-material + # mkdocstrings +markdown==3.6 + # via + # mkdocs + # mkdocs-autorefs + # mkdocs-material + # mkdocstrings + # pymdown-extensions +markupsafe==2.1.5 + # via + # jinja2 + # mkdocs + # mkdocs-autorefs + # mkdocstrings +mergedeep==1.3.4 + # via + # mkdocs + # mkdocs-get-deps +mike==2.1.2 + # via -r requirements_docs.in +mkdocs==1.6.0 + # via + # -r requirements_docs.in + # mike + # mkdocs-autorefs + # mkdocs-coverage + # mkdocs-gen-files + # mkdocs-literate-nav + # mkdocs-material + # mkdocstrings +mkdocs-autorefs==1.0.1 + # via + # -r requirements_docs.in + # mkdocstrings +mkdocs-coverage==1.1.0 + # via -r requirements_docs.in +mkdocs-gen-files==0.5.0 + # via -r requirements_docs.in +mkdocs-get-deps==0.2.0 + # via mkdocs +mkdocs-literate-nav==0.6.1 + # via -r requirements_docs.in +mkdocs-material==9.5.30 + # via -r requirements_docs.in +mkdocs-material-extensions==1.3.1 + # via + # -r requirements_docs.in + # mkdocs-material +mkdocstrings==0.25.2 + # via + # -r requirements_docs.in + # mkdocstrings-python +mkdocstrings-python==1.10.7 + # via -r requirements_docs.in +packaging==24.1 + # via mkdocs +paginate==0.5.6 + # via mkdocs-material +pathspec==0.12.1 + # via mkdocs +pillow==10.4.0 + # via + # cairosvg + # mkdocs-material +platformdirs==4.2.2 + # via + # mkdocs-get-deps + # mkdocstrings +pycparser==2.22 + # via cffi +pygments==2.18.0 + # via mkdocs-material +pymdown-extensions==10.9 + # via + # mkdocs-material + # mkdocstrings 
+pyparsing==3.1.2 + # via mike +python-dateutil==2.9.0.post0 + # via ghp-import +pyyaml==6.0.1 + # via + # mike + # mkdocs + # mkdocs-get-deps + # pymdown-extensions + # pyyaml-env-tag +pyyaml-env-tag==0.1 + # via + # mike + # mkdocs +regex==2024.7.24 + # via mkdocs-material +requests==2.32.3 + # via mkdocs-material +six==1.16.0 + # via python-dateutil +tinycss2==1.3.0 + # via + # cairosvg + # cssselect2 +urllib3==2.2.2 + # via requests +verspec==0.1.0 + # via mike +watchdog==4.0.1 + # via mkdocs +webencodings==0.5.1 + # via + # cssselect2 + # tinycss2 +zipp==3.19.2 + # via importlib-metadata diff --git a/deps/lock/x86_64-apple-darwin/.requirements.in.sha256 b/deps/lock/x86_64-apple-darwin/.requirements.in.sha256 index f9355d5..2be7d1e 100644 --- a/deps/lock/x86_64-apple-darwin/.requirements.in.sha256 +++ b/deps/lock/x86_64-apple-darwin/.requirements.in.sha256 @@ -1 +1 @@ -816025c3ff73af3261b082ee7e0c71954aa6b20922e17344cfb2f29636733488 requirements.in +2f65dd8deb2842edfead23a6aafb4f4f0b9e9e98982e39216069787d16327901 requirements.in diff --git a/deps/lock/x86_64-apple-darwin/.requirements_docs.in.sha256 b/deps/lock/x86_64-apple-darwin/.requirements_docs.in.sha256 new file mode 100644 index 0000000..6fd7f42 --- /dev/null +++ b/deps/lock/x86_64-apple-darwin/.requirements_docs.in.sha256 @@ -0,0 +1 @@ +f0f530946f38443ec95d76ac402dc3e3045fe8f7c26220e46b575aa56649503d requirements_docs.in diff --git a/deps/lock/x86_64-apple-darwin/requirements.txt b/deps/lock/x86_64-apple-darwin/requirements.txt index 9ec9072..fcd3c42 100644 --- a/deps/lock/x86_64-apple-darwin/requirements.txt +++ b/deps/lock/x86_64-apple-darwin/requirements.txt @@ -2,18 +2,25 @@ # uv pip compile requirements.in -o /home/runner/work/bio-data-to-db/bio-data-to-db/deps/lock/x86_64-apple-darwin/requirements.txt --python-platform x86_64-apple-darwin --python-version 3.10 click==8.1.7 # via typer +connectorx==0.3.3 + # via -r requirements.in greenlet==3.0.3 # via sqlalchemy markdown-it-py==3.0.0 # via rich 
mdurl==0.1.2 # via markdown-it-py -numpy==2.0.0 +mysqlclient==2.2.4 + # via -r requirements.in +numpy==1.26.4 # via # pandas # pyarrow + # rdkit pandas==2.2.2 # via -r requirements.in +pillow==10.4.0 + # via rdkit polars==1.2.0 # via -r requirements.in psycopg==3.2.1 @@ -28,6 +35,8 @@ python-dateutil==2.9.0.post0 # via pandas pytz==2024.1 # via pandas +rdkit==2024.3.3 + # via -r requirements.in rich==13.7.1 # via typer shellingham==1.5.4 @@ -36,6 +45,8 @@ six==1.16.0 # via python-dateutil sqlalchemy==2.0.31 # via -r requirements.in +tqdm==4.66.4 + # via -r requirements.in typer==0.12.3 # via -r requirements.in typing-extensions==4.12.2 diff --git a/deps/lock/x86_64-apple-darwin/requirements_dev.txt b/deps/lock/x86_64-apple-darwin/requirements_dev.txt index 6979b43..afe2537 100644 --- a/deps/lock/x86_64-apple-darwin/requirements_dev.txt +++ b/deps/lock/x86_64-apple-darwin/requirements_dev.txt @@ -6,6 +6,8 @@ charset-normalizer==3.3.2 # via requests click==8.1.7 # via typer +connectorx==0.3.3 + # via -r requirements.in exceptiongroup==1.2.2 # via pytest filelock==3.15.4 @@ -26,13 +28,16 @@ maturin==1.7.0 # via -r requirements_dev.in mdurl==0.1.2 # via markdown-it-py +mysqlclient==2.2.4 + # via -r requirements.in networkx==3.3 # via -r requirements_dev.in -numpy==2.0.0 +numpy==1.26.4 # via # -r requirements_dev.in # pandas # pyarrow + # rdkit # scipy # trimesh packaging==24.1 @@ -41,6 +46,8 @@ packaging==24.1 # pytest pandas==2.2.2 # via -r requirements.in +pillow==10.4.0 + # via rdkit pluggy==1.5.0 # via pytest polars==1.2.0 @@ -61,6 +68,8 @@ pytz==2024.1 # via pandas pyyaml==6.0.1 # via huggingface-hub +rdkit==2024.3.3 + # via -r requirements.in requests==2.32.3 # via huggingface-hub rich==13.7.1 @@ -84,7 +93,9 @@ tomli==2.0.1 # maturin # pytest tqdm==4.66.4 - # via huggingface-hub + # via + # -r requirements.in + # huggingface-hub trimesh==4.4.3 # via -r requirements_dev.in typer==0.12.3 diff --git a/deps/lock/x86_64-apple-darwin/requirements_docs.txt 
b/deps/lock/x86_64-apple-darwin/requirements_docs.txt new file mode 100644 index 0000000..0fd83dd --- /dev/null +++ b/deps/lock/x86_64-apple-darwin/requirements_docs.txt @@ -0,0 +1,156 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements_docs.in -o /home/runner/work/bio-data-to-db/bio-data-to-db/deps/lock/x86_64-apple-darwin/requirements_docs.txt --python-platform x86_64-apple-darwin --python-version 3.10 +babel==2.15.0 + # via mkdocs-material +backports-strenum==1.3.1 + # via griffe +cairocffi==1.7.1 + # via cairosvg +cairosvg==2.7.1 + # via mkdocs-material +certifi==2024.7.4 + # via requests +cffi==1.16.0 + # via cairocffi +charset-normalizer==3.3.2 + # via requests +click==8.1.7 + # via + # mkdocs + # mkdocstrings +colorama==0.4.6 + # via + # griffe + # mkdocs-material +cssselect2==0.7.0 + # via cairosvg +defusedxml==0.7.1 + # via cairosvg +ghp-import==2.1.0 + # via mkdocs +griffe==0.48.0 + # via mkdocstrings-python +idna==3.7 + # via requests +importlib-metadata==8.2.0 + # via mike +importlib-resources==6.4.0 + # via mike +jinja2==3.1.4 + # via + # mike + # mkdocs + # mkdocs-material + # mkdocstrings +markdown==3.6 + # via + # mkdocs + # mkdocs-autorefs + # mkdocs-material + # mkdocstrings + # pymdown-extensions +markupsafe==2.1.5 + # via + # jinja2 + # mkdocs + # mkdocs-autorefs + # mkdocstrings +mergedeep==1.3.4 + # via + # mkdocs + # mkdocs-get-deps +mike==2.1.2 + # via -r requirements_docs.in +mkdocs==1.6.0 + # via + # -r requirements_docs.in + # mike + # mkdocs-autorefs + # mkdocs-coverage + # mkdocs-gen-files + # mkdocs-literate-nav + # mkdocs-material + # mkdocstrings +mkdocs-autorefs==1.0.1 + # via + # -r requirements_docs.in + # mkdocstrings +mkdocs-coverage==1.1.0 + # via -r requirements_docs.in +mkdocs-gen-files==0.5.0 + # via -r requirements_docs.in +mkdocs-get-deps==0.2.0 + # via mkdocs +mkdocs-literate-nav==0.6.1 + # via -r requirements_docs.in +mkdocs-material==9.5.30 + # via -r requirements_docs.in 
+mkdocs-material-extensions==1.3.1 + # via + # -r requirements_docs.in + # mkdocs-material +mkdocstrings==0.25.2 + # via + # -r requirements_docs.in + # mkdocstrings-python +mkdocstrings-python==1.10.7 + # via -r requirements_docs.in +packaging==24.1 + # via mkdocs +paginate==0.5.6 + # via mkdocs-material +pathspec==0.12.1 + # via mkdocs +pillow==10.4.0 + # via + # cairosvg + # mkdocs-material +platformdirs==4.2.2 + # via + # mkdocs-get-deps + # mkdocstrings +pycparser==2.22 + # via cffi +pygments==2.18.0 + # via mkdocs-material +pymdown-extensions==10.9 + # via + # mkdocs-material + # mkdocstrings +pyparsing==3.1.2 + # via mike +python-dateutil==2.9.0.post0 + # via ghp-import +pyyaml==6.0.1 + # via + # mike + # mkdocs + # mkdocs-get-deps + # pymdown-extensions + # pyyaml-env-tag +pyyaml-env-tag==0.1 + # via + # mike + # mkdocs +regex==2024.7.24 + # via mkdocs-material +requests==2.32.3 + # via mkdocs-material +six==1.16.0 + # via python-dateutil +tinycss2==1.3.0 + # via + # cairosvg + # cssselect2 +urllib3==2.2.2 + # via requests +verspec==0.1.0 + # via mike +watchdog==4.0.1 + # via mkdocs +webencodings==0.5.1 + # via + # cssselect2 + # tinycss2 +zipp==3.19.2 + # via importlib-metadata diff --git a/deps/lock/x86_64-manylinux_2_28/.requirements.in.sha256 b/deps/lock/x86_64-manylinux_2_28/.requirements.in.sha256 index f9355d5..2be7d1e 100644 --- a/deps/lock/x86_64-manylinux_2_28/.requirements.in.sha256 +++ b/deps/lock/x86_64-manylinux_2_28/.requirements.in.sha256 @@ -1 +1 @@ -816025c3ff73af3261b082ee7e0c71954aa6b20922e17344cfb2f29636733488 requirements.in +2f65dd8deb2842edfead23a6aafb4f4f0b9e9e98982e39216069787d16327901 requirements.in diff --git a/deps/lock/x86_64-manylinux_2_28/.requirements_docs.in.sha256 b/deps/lock/x86_64-manylinux_2_28/.requirements_docs.in.sha256 new file mode 100644 index 0000000..6fd7f42 --- /dev/null +++ b/deps/lock/x86_64-manylinux_2_28/.requirements_docs.in.sha256 @@ -0,0 +1 @@ 
+f0f530946f38443ec95d76ac402dc3e3045fe8f7c26220e46b575aa56649503d requirements_docs.in diff --git a/deps/lock/x86_64-manylinux_2_28/requirements.txt b/deps/lock/x86_64-manylinux_2_28/requirements.txt index 1eb8c87..65ba813 100644 --- a/deps/lock/x86_64-manylinux_2_28/requirements.txt +++ b/deps/lock/x86_64-manylinux_2_28/requirements.txt @@ -2,18 +2,25 @@ # uv pip compile requirements.in -o /home/runner/work/bio-data-to-db/bio-data-to-db/deps/lock/x86_64-manylinux_2_28/requirements.txt --python-platform x86_64-manylinux_2_28 --python-version 3.10 click==8.1.7 # via typer +connectorx==0.3.3 + # via -r requirements.in greenlet==3.0.3 # via sqlalchemy markdown-it-py==3.0.0 # via rich mdurl==0.1.2 # via markdown-it-py -numpy==2.0.0 +mysqlclient==2.2.4 + # via -r requirements.in +numpy==1.26.4 # via # pandas # pyarrow + # rdkit pandas==2.2.2 # via -r requirements.in +pillow==10.4.0 + # via rdkit polars==1.2.0 # via -r requirements.in psycopg==3.2.1 @@ -28,6 +35,8 @@ python-dateutil==2.9.0.post0 # via pandas pytz==2024.1 # via pandas +rdkit==2024.3.3 + # via -r requirements.in rich==13.7.1 # via typer shellingham==1.5.4 @@ -36,6 +45,8 @@ six==1.16.0 # via python-dateutil sqlalchemy==2.0.31 # via -r requirements.in +tqdm==4.66.4 + # via -r requirements.in typer==0.12.3 # via -r requirements.in typing-extensions==4.12.2 diff --git a/deps/lock/x86_64-manylinux_2_28/requirements_dev.txt b/deps/lock/x86_64-manylinux_2_28/requirements_dev.txt index f4473b2..aafabce 100644 --- a/deps/lock/x86_64-manylinux_2_28/requirements_dev.txt +++ b/deps/lock/x86_64-manylinux_2_28/requirements_dev.txt @@ -6,6 +6,8 @@ charset-normalizer==3.3.2 # via requests click==8.1.7 # via typer +connectorx==0.3.3 + # via -r requirements.in exceptiongroup==1.2.2 # via pytest filelock==3.15.4 @@ -26,13 +28,16 @@ maturin==1.7.0 # via -r requirements_dev.in mdurl==0.1.2 # via markdown-it-py +mysqlclient==2.2.4 + # via -r requirements.in networkx==3.3 # via -r requirements_dev.in -numpy==2.0.0 +numpy==1.26.4 
# via # -r requirements_dev.in # pandas # pyarrow + # rdkit # scipy # trimesh packaging==24.1 @@ -41,6 +46,8 @@ packaging==24.1 # pytest pandas==2.2.2 # via -r requirements.in +pillow==10.4.0 + # via rdkit pluggy==1.5.0 # via pytest polars==1.2.0 @@ -61,6 +68,8 @@ pytz==2024.1 # via pandas pyyaml==6.0.1 # via huggingface-hub +rdkit==2024.3.3 + # via -r requirements.in requests==2.32.3 # via huggingface-hub rich==13.7.1 @@ -84,7 +93,9 @@ tomli==2.0.1 # maturin # pytest tqdm==4.66.4 - # via huggingface-hub + # via + # -r requirements.in + # huggingface-hub trimesh==4.4.3 # via -r requirements_dev.in typer==0.12.3 diff --git a/deps/lock/x86_64-manylinux_2_28/requirements_docs.txt b/deps/lock/x86_64-manylinux_2_28/requirements_docs.txt new file mode 100644 index 0000000..4bc7758 --- /dev/null +++ b/deps/lock/x86_64-manylinux_2_28/requirements_docs.txt @@ -0,0 +1,156 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements_docs.in -o /home/runner/work/bio-data-to-db/bio-data-to-db/deps/lock/x86_64-manylinux_2_28/requirements_docs.txt --python-platform x86_64-manylinux_2_28 --python-version 3.10 +babel==2.15.0 + # via mkdocs-material +backports-strenum==1.3.1 + # via griffe +cairocffi==1.7.1 + # via cairosvg +cairosvg==2.7.1 + # via mkdocs-material +certifi==2024.7.4 + # via requests +cffi==1.16.0 + # via cairocffi +charset-normalizer==3.3.2 + # via requests +click==8.1.7 + # via + # mkdocs + # mkdocstrings +colorama==0.4.6 + # via + # griffe + # mkdocs-material +cssselect2==0.7.0 + # via cairosvg +defusedxml==0.7.1 + # via cairosvg +ghp-import==2.1.0 + # via mkdocs +griffe==0.48.0 + # via mkdocstrings-python +idna==3.7 + # via requests +importlib-metadata==8.2.0 + # via mike +importlib-resources==6.4.0 + # via mike +jinja2==3.1.4 + # via + # mike + # mkdocs + # mkdocs-material + # mkdocstrings +markdown==3.6 + # via + # mkdocs + # mkdocs-autorefs + # mkdocs-material + # mkdocstrings + # pymdown-extensions +markupsafe==2.1.5 + # 
via + # jinja2 + # mkdocs + # mkdocs-autorefs + # mkdocstrings +mergedeep==1.3.4 + # via + # mkdocs + # mkdocs-get-deps +mike==2.1.2 + # via -r requirements_docs.in +mkdocs==1.6.0 + # via + # -r requirements_docs.in + # mike + # mkdocs-autorefs + # mkdocs-coverage + # mkdocs-gen-files + # mkdocs-literate-nav + # mkdocs-material + # mkdocstrings +mkdocs-autorefs==1.0.1 + # via + # -r requirements_docs.in + # mkdocstrings +mkdocs-coverage==1.1.0 + # via -r requirements_docs.in +mkdocs-gen-files==0.5.0 + # via -r requirements_docs.in +mkdocs-get-deps==0.2.0 + # via mkdocs +mkdocs-literate-nav==0.6.1 + # via -r requirements_docs.in +mkdocs-material==9.5.30 + # via -r requirements_docs.in +mkdocs-material-extensions==1.3.1 + # via + # -r requirements_docs.in + # mkdocs-material +mkdocstrings==0.25.2 + # via + # -r requirements_docs.in + # mkdocstrings-python +mkdocstrings-python==1.10.7 + # via -r requirements_docs.in +packaging==24.1 + # via mkdocs +paginate==0.5.6 + # via mkdocs-material +pathspec==0.12.1 + # via mkdocs +pillow==10.4.0 + # via + # cairosvg + # mkdocs-material +platformdirs==4.2.2 + # via + # mkdocs-get-deps + # mkdocstrings +pycparser==2.22 + # via cffi +pygments==2.18.0 + # via mkdocs-material +pymdown-extensions==10.9 + # via + # mkdocs-material + # mkdocstrings +pyparsing==3.1.2 + # via mike +python-dateutil==2.9.0.post0 + # via ghp-import +pyyaml==6.0.1 + # via + # mike + # mkdocs + # mkdocs-get-deps + # pymdown-extensions + # pyyaml-env-tag +pyyaml-env-tag==0.1 + # via + # mike + # mkdocs +regex==2024.7.24 + # via mkdocs-material +requests==2.32.3 + # via mkdocs-material +six==1.16.0 + # via python-dateutil +tinycss2==1.3.0 + # via + # cairosvg + # cssselect2 +urllib3==2.2.2 + # via requests +verspec==0.1.0 + # via mike +watchdog==4.0.1 + # via mkdocs +webencodings==0.5.1 + # via + # cssselect2 + # tinycss2 +zipp==3.19.2 + # via importlib-metadata diff --git a/deps/lock/x86_64-pc-windows-msvc/.requirements.in.sha256 
b/deps/lock/x86_64-pc-windows-msvc/.requirements.in.sha256 index f9355d5..2be7d1e 100644 --- a/deps/lock/x86_64-pc-windows-msvc/.requirements.in.sha256 +++ b/deps/lock/x86_64-pc-windows-msvc/.requirements.in.sha256 @@ -1 +1 @@ -816025c3ff73af3261b082ee7e0c71954aa6b20922e17344cfb2f29636733488 requirements.in +2f65dd8deb2842edfead23a6aafb4f4f0b9e9e98982e39216069787d16327901 requirements.in diff --git a/deps/lock/x86_64-pc-windows-msvc/.requirements_docs.in.sha256 b/deps/lock/x86_64-pc-windows-msvc/.requirements_docs.in.sha256 new file mode 100644 index 0000000..6fd7f42 --- /dev/null +++ b/deps/lock/x86_64-pc-windows-msvc/.requirements_docs.in.sha256 @@ -0,0 +1 @@ +f0f530946f38443ec95d76ac402dc3e3045fe8f7c26220e46b575aa56649503d requirements_docs.in diff --git a/deps/lock/x86_64-pc-windows-msvc/requirements.txt b/deps/lock/x86_64-pc-windows-msvc/requirements.txt index ac6b99f..8789f66 100644 --- a/deps/lock/x86_64-pc-windows-msvc/requirements.txt +++ b/deps/lock/x86_64-pc-windows-msvc/requirements.txt @@ -3,19 +3,28 @@ click==8.1.7 # via typer colorama==0.4.6 - # via click + # via + # click + # tqdm +connectorx==0.3.3 + # via -r requirements.in greenlet==3.0.3 # via sqlalchemy markdown-it-py==3.0.0 # via rich mdurl==0.1.2 # via markdown-it-py -numpy==2.0.0 +mysqlclient==2.2.4 + # via -r requirements.in +numpy==1.26.4 # via # pandas # pyarrow + # rdkit pandas==2.2.2 # via -r requirements.in +pillow==10.4.0 + # via rdkit polars==1.2.0 # via -r requirements.in psycopg==3.2.1 @@ -30,6 +39,8 @@ python-dateutil==2.9.0.post0 # via pandas pytz==2024.1 # via pandas +rdkit==2024.3.3 + # via -r requirements.in rich==13.7.1 # via typer shellingham==1.5.4 @@ -38,6 +49,8 @@ six==1.16.0 # via python-dateutil sqlalchemy==2.0.31 # via -r requirements.in +tqdm==4.66.4 + # via -r requirements.in typer==0.12.3 # via -r requirements.in typing-extensions==4.12.2 diff --git a/deps/lock/x86_64-pc-windows-msvc/requirements_dev.txt b/deps/lock/x86_64-pc-windows-msvc/requirements_dev.txt index 
e4a4b05..3afa870 100644 --- a/deps/lock/x86_64-pc-windows-msvc/requirements_dev.txt +++ b/deps/lock/x86_64-pc-windows-msvc/requirements_dev.txt @@ -11,6 +11,8 @@ colorama==0.4.6 # click # pytest # tqdm +connectorx==0.3.3 + # via -r requirements.in exceptiongroup==1.2.2 # via pytest filelock==3.15.4 @@ -31,13 +33,16 @@ maturin==1.7.0 # via -r requirements_dev.in mdurl==0.1.2 # via markdown-it-py +mysqlclient==2.2.4 + # via -r requirements.in networkx==3.3 # via -r requirements_dev.in -numpy==2.0.0 +numpy==1.26.4 # via # -r requirements_dev.in # pandas # pyarrow + # rdkit # scipy # trimesh packaging==24.1 @@ -46,6 +51,8 @@ packaging==24.1 # pytest pandas==2.2.2 # via -r requirements.in +pillow==10.4.0 + # via rdkit pluggy==1.5.0 # via pytest polars==1.2.0 @@ -66,6 +73,8 @@ pytz==2024.1 # via pandas pyyaml==6.0.1 # via huggingface-hub +rdkit==2024.3.3 + # via -r requirements.in requests==2.32.3 # via huggingface-hub rich==13.7.1 @@ -89,7 +98,9 @@ tomli==2.0.1 # maturin # pytest tqdm==4.66.4 - # via huggingface-hub + # via + # -r requirements.in + # huggingface-hub trimesh==4.4.3 # via -r requirements_dev.in typer==0.12.3 diff --git a/deps/lock/x86_64-pc-windows-msvc/requirements_docs.txt b/deps/lock/x86_64-pc-windows-msvc/requirements_docs.txt new file mode 100644 index 0000000..646989c --- /dev/null +++ b/deps/lock/x86_64-pc-windows-msvc/requirements_docs.txt @@ -0,0 +1,158 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements_docs.in -o /home/runner/work/bio-data-to-db/bio-data-to-db/deps/lock/x86_64-pc-windows-msvc/requirements_docs.txt --python-platform x86_64-pc-windows-msvc --python-version 3.10 +babel==2.15.0 + # via mkdocs-material +backports-strenum==1.3.1 + # via griffe +cairocffi==1.7.1 + # via cairosvg +cairosvg==2.7.1 + # via mkdocs-material +certifi==2024.7.4 + # via requests +cffi==1.16.0 + # via cairocffi +charset-normalizer==3.3.2 + # via requests +click==8.1.7 + # via + # mkdocs + # mkdocstrings 
+colorama==0.4.6 + # via + # click + # griffe + # mkdocs + # mkdocs-material +cssselect2==0.7.0 + # via cairosvg +defusedxml==0.7.1 + # via cairosvg +ghp-import==2.1.0 + # via mkdocs +griffe==0.48.0 + # via mkdocstrings-python +idna==3.7 + # via requests +importlib-metadata==8.2.0 + # via mike +importlib-resources==6.4.0 + # via mike +jinja2==3.1.4 + # via + # mike + # mkdocs + # mkdocs-material + # mkdocstrings +markdown==3.6 + # via + # mkdocs + # mkdocs-autorefs + # mkdocs-material + # mkdocstrings + # pymdown-extensions +markupsafe==2.1.5 + # via + # jinja2 + # mkdocs + # mkdocs-autorefs + # mkdocstrings +mergedeep==1.3.4 + # via + # mkdocs + # mkdocs-get-deps +mike==2.1.2 + # via -r requirements_docs.in +mkdocs==1.6.0 + # via + # -r requirements_docs.in + # mike + # mkdocs-autorefs + # mkdocs-coverage + # mkdocs-gen-files + # mkdocs-literate-nav + # mkdocs-material + # mkdocstrings +mkdocs-autorefs==1.0.1 + # via + # -r requirements_docs.in + # mkdocstrings +mkdocs-coverage==1.1.0 + # via -r requirements_docs.in +mkdocs-gen-files==0.5.0 + # via -r requirements_docs.in +mkdocs-get-deps==0.2.0 + # via mkdocs +mkdocs-literate-nav==0.6.1 + # via -r requirements_docs.in +mkdocs-material==9.5.30 + # via -r requirements_docs.in +mkdocs-material-extensions==1.3.1 + # via + # -r requirements_docs.in + # mkdocs-material +mkdocstrings==0.25.2 + # via + # -r requirements_docs.in + # mkdocstrings-python +mkdocstrings-python==1.10.7 + # via -r requirements_docs.in +packaging==24.1 + # via mkdocs +paginate==0.5.6 + # via mkdocs-material +pathspec==0.12.1 + # via mkdocs +pillow==10.4.0 + # via + # cairosvg + # mkdocs-material +platformdirs==4.2.2 + # via + # mkdocs-get-deps + # mkdocstrings +pycparser==2.22 + # via cffi +pygments==2.18.0 + # via mkdocs-material +pymdown-extensions==10.9 + # via + # mkdocs-material + # mkdocstrings +pyparsing==3.1.2 + # via mike +python-dateutil==2.9.0.post0 + # via ghp-import +pyyaml==6.0.1 + # via + # mike + # mkdocs + # mkdocs-get-deps + # 
pymdown-extensions + # pyyaml-env-tag +pyyaml-env-tag==0.1 + # via + # mike + # mkdocs +regex==2024.7.24 + # via mkdocs-material +requests==2.32.3 + # via mkdocs-material +six==1.16.0 + # via python-dateutil +tinycss2==1.3.0 + # via + # cairosvg + # cssselect2 +urllib3==2.2.2 + # via requests +verspec==0.1.0 + # via mike +watchdog==4.0.1 + # via mkdocs +webencodings==0.5.1 + # via + # cssselect2 + # tinycss2 +zipp==3.19.2 + # via importlib-metadata diff --git a/deps/requirements.in b/deps/requirements.in index 682be56..b81c766 100644 --- a/deps/requirements.in +++ b/deps/requirements.in @@ -5,4 +5,7 @@ sqlalchemy>=2.0.0 psycopg>=3.2.0 psycopg2>=2.9.0 pyarrow>=17.0.0 - +mysqlclient>=2.2.0 +connectorx>=0.3.0 +rdkit>=2024.3.1 +tqdm>=4.62.0 diff --git a/deps/requirements_docs.in b/deps/requirements_docs.in new file mode 100644 index 0000000..435b779 --- /dev/null +++ b/deps/requirements_docs.in @@ -0,0 +1,10 @@ +mkdocs +mkdocs-autorefs +mkdocs-coverage +mkdocs-gen-files +mkdocs-literate-nav +mkdocs-material[imaging] +mkdocs-material-extensions +mkdocstrings +mkdocstrings-python +mike diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..612c7a5 --- /dev/null +++ b/docs/index.md @@ -0,0 +1 @@ +--8<-- "README.md" diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..b95a704 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,103 @@ +site_name: bio-data-to-db +site_url: 'https://deargen.github.io/bio-data-to-db' +repo_url: 'https://github.com/deargen/bio-data-to-db' +copyright: | + © 2024 Deargen Inc. 
+watch: [mkdocs.yml, README.md, src/] +validation: + omitted_files: warn + absolute_links: warn + unrecognized_links: warn + +nav: + - Home: + - Overview: index.md + - Changelog: CHANGELOG.md + # defer to gen-files + literate-nav + - API reference: + - mkdocstrings-python: reference/ + +theme: + name: material + font: + text: Noto Sans Korean + code: Jetbrains Mono + features: + - toc.follow + - navigation.top + - navigation.footer + - navigation.sections + - navigation.tabs + - navigation.tabs.sticky + - navigation.indexes + - navigation.path + - search.suggest + - search.highlight + - content.tabs.link + - content.code.annotate + - content.code.copy + language: ko + palette: + - media: '(prefers-color-scheme: light)' + scheme: default + primary: teal + accent: purple + toggle: + icon: material/weather-sunny + name: Switch to dark mode + - media: '(prefers-color-scheme: dark)' + scheme: slate + primary: black + accent: lime + toggle: + icon: material/weather-night + name: Switch to system preference + +plugins: + - search + - gen-files: + scripts: + - scripts/gen_ref_nav.py + - literate-nav: + nav_file: SUMMARY.md + - mkdocstrings: + handlers: + python: + options: + show_symbol_type_heading: true + show_symbol_type_toc: true + members_order: source + allow_inspection: false # for .pyi stubs to work + paths: [src] # search packages in the src folder + +extra: + social: + - icon: fontawesome/brands/github-alt + link: https://github.com/deargen/bio-data-to-db + version: + provider: mike + +markdown_extensions: + - pymdownx.superfences: + custom_fences: + - name: mermaid + class: mermaid + format: !!python/name:pymdownx.superfences.fence_code_format + - pymdownx.highlight: + anchor_linenums: true + - pymdownx.inlinehilite + - pymdownx.snippets + - admonition + - pymdownx.arithmatex: + generic: true + - footnotes + - pymdownx.details + - pymdownx.superfences + - pymdownx.mark + - attr_list + - pymdownx.emoji: + emoji_index: 
!!python/name:material.extensions.emoji.twemoji + emoji_generator: !!python/name:material.extensions.emoji.to_svg + - pymdownx.tilde # strikethrough with ~~ ~~ + - toc: + permalink: true diff --git a/pyproject.toml b/pyproject.toml index ffe94ff..431c7da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -105,6 +105,7 @@ ignore = [ "NPY002", # legacy numpy random "UP017", # datetime.timezone.utc -> datetime.UTC "SIM108", # use ternary operator instead of if-else + "PYI021", # Docstrings should not be included in stubs ] [tool.ruff.lint.pydocstyle] diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 0cd881a..2133a7c 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -80,7 +80,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" [[package]] name = "bio-data-to-db" -version = "0.0.1+0.g4467862.dirty" +version = "0.1.1+7.g2bfd838.dirty" dependencies = [ "icecream", "pyo3", diff --git a/scripts/gen_ref_nav.py b/scripts/gen_ref_nav.py new file mode 100644 index 0000000..7abcf6b --- /dev/null +++ b/scripts/gen_ref_nav.py @@ -0,0 +1,50 @@ +"""Generate the code reference pages and navigation.""" + +from pathlib import Path + +import mkdocs_gen_files + +IGNORE_MODULES_EXACT = { + # "bio_data_to_db.__init__", +} + +IGNORE_MODULES_STARTSWITH = { + "bio_data_to_db.cli.", +} + +nav = mkdocs_gen_files.Nav() +mod_symbol = '' + +src = Path(__file__).parent.parent / "src" + +for path in sorted(src.rglob("*.py")): + module_path = path.relative_to(src).with_suffix("") + doc_path = path.relative_to(src).with_suffix(".md") + full_doc_path = Path("reference", doc_path) + + parts = tuple(module_path.parts) + module_str = ".".join(parts) + + if module_str in IGNORE_MODULES_EXACT or any( + module_str.startswith(prefix) for prefix in IGNORE_MODULES_STARTSWITH + ): + print(f"Skipping module: {module_str}") # noqa: T201 + continue + if parts[-1] == "__init__": + parts = parts[:-1] + doc_path = doc_path.with_name("index.md") + full_doc_path = 
def _unescape_and_strip(value: str) -> str:
    """Decode HTML entities in `value`, then strip leading/trailing whitespace."""
    # Unescape first: an entity at either end may itself decode to whitespace
    # (e.g. `&nbsp;`), which stripping beforehand would leave in place.
    return html.unescape(value).strip()


def fix_assay_table(uri: str):
    """
    Fix the assay table in MySQL by decoding HTML entities like `&#39;` and stripping surrounding whitespace.

    The `description` and `assay_name` columns are cleaned, then the whole
    table is written back and its original schema is restored.

    Notes:
        - the table is replaced.
        - types are preserved by manually changing them back to the original types.
          For example, varchar -> text -> varchar
        - primary key and foreign key constraints are preserved by manually
          adding them back just like the original table

    Args:
        uri: Connection URI of the MySQL database that holds the `assay` table,
            e.g. `mysql://username:password@localhost/bind`.
    """
    query = """
        SELECT
            *
        FROM
            assay
    """
    assay_df = pl.read_database_uri(query=query, uri=uri)

    # the column might be "binary" type if the type is "TEXT" in MySQL,
    # hence the explicit cast to Utf8 before cleaning.
    assay_df = assay_df.with_columns(
        [
            pl.col(column)
            .cast(pl.Utf8)
            .map_elements(_unescape_and_strip, return_dtype=pl.Utf8)
            for column in ("description", "assay_name")
        ]
    )

    assay_df.write_database(
        table_name="assay",
        connection=uri,
        if_table_exists="replace",
    )

    # Replacing the table loses the original column types and constraints, so
    # they are put back here. Each statement is executed on its own: sending
    # them as a single multi-statement string relies on driver-specific
    # behaviour and can hide a failure in any statement after the first.
    restore_schema_statements = (
        "ALTER TABLE assay MODIFY COLUMN `entryid` INT(11)",
        "ALTER TABLE assay MODIFY COLUMN `assayid` INT(11)",
        "ALTER TABLE assay MODIFY COLUMN `description` VARCHAR(4000)",
        "ALTER TABLE assay MODIFY COLUMN `assay_name` VARCHAR(200)",
        "ALTER TABLE assay ADD PRIMARY KEY (`entryid`,`assayid`)",
        "ALTER TABLE assay ADD CONSTRAINT `assay_ibfk_1` FOREIGN KEY (`entryid`) REFERENCES `entry` (`entryid`)",
    )

    engine = sqlalchemy.create_engine(uri)
    try:
        # `begin()` commits on success and rolls back on error.
        with engine.begin() as conn:
            for statement in restore_schema_statements:
                conn.execute(sqlalchemy.text(statement))
    finally:
        # Release the pooled connections created only for this call.
        engine.dispose()
class FixTableOption(str, enum.Enum):
    """Tables that the `fix-table` command can repair; each value is the table name."""

    assay = "assay"


@app.command(no_args_is_help=True)
def fix_table(
    table_name: Annotated[
        FixTableOption,
        typer.Argument(help="Table name to fix"),
    ],
    uri: Annotated[
        str,
        typer.Argument(help="URI to the MySQL database"),
    ],
):
    """
    Fix a table in MySQL by decoding HTML entities like `&#39;` and stripping empty spaces.

    Currently, only the `assay` table is supported.
    """
    # Kept function-local as in the original (presumably so that importing the
    # CLI module does not pull in the heavier data dependencies).
    from bio_data_to_db.bindingdb.fix_tables import fix_assay_table
    from bio_data_to_db.utils.log import setup_logging

    setup_logging()

    # Dispatch on the requested table instead of ignoring `table_name`, so a
    # future enum member can never silently run the assay fixer.
    fixers = {
        FixTableOption.assay: fix_assay_table,
    }
    fixers[table_name](uri)

    logger.info(
        "In `%s` table, HTML entities are decoded and empty spaces are stripped.",
        table_name.value,
    )


def main():
    """Run the BindingDB Typer application."""
    app()
def w_pbar(pbar: tqdm.std.tqdm, func: Callable[..., Any]) -> Callable[..., Any]:
    """
    Apply progress bar when using `map_elements` in `polars`.

    The returned callable advances `pbar` by one step and then forwards all
    positional and keyword arguments to `func`, returning its result.

    Args:
        pbar: Progress bar to advance once per call.
        func: Callable to wrap.

    Returns:
        A wrapper around `func` that keeps its name and docstring.

    Examples:
        >>> with tqdm(total=len(df)) as pbar:  # doctest: +SKIP
        ...     df = df.with_columns(
        ...         pl.col("in_col")
        ...         .map_elements(w_pbar(pbar, lambda x: x + 1), return_dtype=pl.Int64)
        ...     )

    Reference:
        - https://stackoverflow.com/questions/75550124/python-polars-how-to-add-a-progress-bars-to-apply-loops
    """
    # Local import so this block does not depend on a new module-level import.
    from functools import wraps

    @wraps(func)  # preserve func's __name__/__doc__ and expose __wrapped__
    def wrapper(*args, **kwargs):
        # The bar is advanced before the call, so it also moves when func raises.
        pbar.update(1)
        return func(*args, **kwargs)

    return wrapper
@cache
def canonical_smiles_wo_salt(smiles: str) -> str | None:
    """
    Return the canonical SMILES of the input with salts removed.

    Salt is a short part separated by "." in the SMILES, so when the canonical
    form has several "."-separated parts only the longest one is kept.
    Shared function with dti-pytorch

    Returns:
        The canonical SMILES without salt, or None when the input (or the
        retained longest part) cannot be parsed by RDKit.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    fragments = Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True).split(".")
    if len(fragments) == 1:
        # Single component: nothing to remove.
        return fragments[0]

    longest_fragment = max(fragments, key=len)
    # The retained part must still be a parsable molecule on its own.
    if Chem.MolFromSmiles(longest_fragment) is None:
        return None
    return longest_fragment


def polars_canonical_smiles_wo_salt(
    df: pl.DataFrame,
    *,
    smiles_col: str = "smiles",
    out_col: str = "canonical_smiles_wo_salt",
) -> pl.DataFrame:
    """
    Apply canonical_smiles_wo_salt on the DataFrame with tqdm.

    Args:
        df: DataFrame holding the SMILES strings.
        smiles_col: Name of the column to read SMILES from.
        out_col: Name of the column that receives the converted SMILES.

    Returns:
        The DataFrame with `out_col` added (or replaced).
    """
    progress = tqdm(
        total=df.shape[0], desc="Converting smiles to canonical smiles without salt"
    )
    with progress as pbar:
        # One progress-bar tick per converted element.
        convert = w_pbar(pbar, canonical_smiles_wo_salt)
        converted_expr = (
            pl.col(smiles_col).map_elements(convert, return_dtype=pl.Utf8).alias(out_col)
        )
        result = df.with_columns(converted_expr)

    return result