diff --git a/.github/workflows/deploy-mkdocs-on-latest.yml b/.github/workflows/deploy-mkdocs-on-latest.yml
new file mode 100644
index 0000000..32cc3cf
--- /dev/null
+++ b/.github/workflows/deploy-mkdocs-on-latest.yml
@@ -0,0 +1,14 @@
+name: Deploy MkDocs on latest commit
+
+on:
+ push:
+ branches:
+ - main
+ - master
+
+jobs:
+ deploy-mkdocs:
+ uses: deargen/workflows/.github/workflows/deploy-mkdocs.yml@master
+ with:
+ deploy-type: latest
+ requirements-file: deps/lock/x86_64-manylinux_2_28/requirements_docs.txt
diff --git a/README.md b/README.md
index 130b440..9a340cb 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,6 @@
# bio-data-to-db: make Uniprot PostgreSQL database
+
[![image](https://img.shields.io/pypi/v/bio-data-to-db.svg)](https://pypi.python.org/pypi/bio-data-to-db)
[![PyPI - Downloads](https://img.shields.io/pypi/dm/bio-data-to-db)](https://pypi.python.org/pypi/bio-data-to-db)
[![image](https://img.shields.io/pypi/l/bio-data-to-db.svg)](https://pypi.python.org/pypi/bio-data-to-db)
@@ -19,6 +20,8 @@ Written in Rust, thus equipped with extremely fast parsers. Packaged for python,
So far, there is only one function implemented: **convert uniprot data to postgresql**. This package focuses more on parsing the data and inserting it into the database, rather than curating the data.
+[📚 Documentation](https://deargen.github.io/bio-data-to-db/)
+
## 🛠️ Installation
```bash
@@ -29,6 +32,8 @@ pip install bio-data-to-db
You can use the command line interface or the python API.
+### Uniprot
+
```bash
# It will create a db 'uniprot' and a table named 'public.uniprot_info' in the database.
# If you want another name, you can optionally pass it as the last argument.
@@ -61,6 +66,49 @@ create_accession_to_pk_id_table("postgresql://user:password@localhost:5432/unipr
keywords_tsv_to_postgresql("~/Downloads/keywords_all_2024_06_26.tsv", "postgresql://user:password@localhost:5432/uniprot")
```
+### BindingDB
+
+```bash
+# Decode HTML entities and strip the strings in the `assay` table (columns: `description` and `assay_name`).
+# Currently, only the `assay` table is supported.
+bio-data-to-db bindingdb fix-table assay 'mysql://username:password@localhost/bind'
+```
+
+```python
+from bio_data_to_db.bindingdb.fix_tables import fix_assay_table
+
+fix_assay_table("mysql://username:password@localhost/bind")
+```
+
+### PostgreSQL Helpers, SMILES, Polars utils and more
+
+Some useful functions to work with PostgreSQL,
+SMILES strings, and Polars.
+
+```python
+from bio_data_to_db.utils.postgresql import (
+ create_db_if_not_exists,
+ create_schema_if_not_exists,
+ set_column_as_primary_key,
+ make_columns_unique,
+ make_large_columns_unique,
+ split_column_str_to_list,
+ polars_write_database,
+)
+
+from bio_data_to_db.utils.smiles import (
+ canonical_smiles_wo_salt,
+ polars_canonical_smiles_wo_salt,
+)
+
+from bio_data_to_db.utils.polars import (
+ w_pbar,
+)
+```
+
+You can find the usage in the [📚 documentation](https://deargen.github.io/bio-data-to-db/).
+
+
## 👨💻️ Maintenance Notes
### Install from source
@@ -72,10 +120,14 @@ bash scripts/install.sh
uv pip install -r deps/requirements_dev.in
```
-### Compile requirements (generate lockfiles)
+### Generate lockfiles
Use GitHub Actions: `apply-pip-compile.yml`. Manually launch the workflow and it will make a commit with the updated lockfiles.
+### Publish a new version to PyPI
+
+Use GitHub Actions: `deploy.yml`. Manually launch the workflow and it will compile on all architectures and publish the new version to PyPI.
+
### About sqlx
Sqlx offline mode should be configured so you can compile the code without a database present.
diff --git a/deps/lock/aarch64-apple-darwin/.requirements.in.sha256 b/deps/lock/aarch64-apple-darwin/.requirements.in.sha256
index f9355d5..2be7d1e 100644
--- a/deps/lock/aarch64-apple-darwin/.requirements.in.sha256
+++ b/deps/lock/aarch64-apple-darwin/.requirements.in.sha256
@@ -1 +1 @@
-816025c3ff73af3261b082ee7e0c71954aa6b20922e17344cfb2f29636733488 requirements.in
+2f65dd8deb2842edfead23a6aafb4f4f0b9e9e98982e39216069787d16327901 requirements.in
diff --git a/deps/lock/aarch64-apple-darwin/.requirements_docs.in.sha256 b/deps/lock/aarch64-apple-darwin/.requirements_docs.in.sha256
new file mode 100644
index 0000000..6fd7f42
--- /dev/null
+++ b/deps/lock/aarch64-apple-darwin/.requirements_docs.in.sha256
@@ -0,0 +1 @@
+f0f530946f38443ec95d76ac402dc3e3045fe8f7c26220e46b575aa56649503d requirements_docs.in
diff --git a/deps/lock/aarch64-apple-darwin/requirements.txt b/deps/lock/aarch64-apple-darwin/requirements.txt
index def352e..bacfba6 100644
--- a/deps/lock/aarch64-apple-darwin/requirements.txt
+++ b/deps/lock/aarch64-apple-darwin/requirements.txt
@@ -2,16 +2,23 @@
# uv pip compile requirements.in -o /home/runner/work/bio-data-to-db/bio-data-to-db/deps/lock/aarch64-apple-darwin/requirements.txt --python-platform aarch64-apple-darwin --python-version 3.10
click==8.1.7
# via typer
+connectorx==0.3.3
+ # via -r requirements.in
markdown-it-py==3.0.0
# via rich
mdurl==0.1.2
# via markdown-it-py
-numpy==2.0.0
+mysqlclient==2.2.4
+ # via -r requirements.in
+numpy==1.26.4
# via
# pandas
# pyarrow
+ # rdkit
pandas==2.2.2
# via -r requirements.in
+pillow==10.4.0
+ # via rdkit
polars==1.2.0
# via -r requirements.in
psycopg==3.2.1
@@ -26,6 +33,8 @@ python-dateutil==2.9.0.post0
# via pandas
pytz==2024.1
# via pandas
+rdkit==2024.3.3
+ # via -r requirements.in
rich==13.7.1
# via typer
shellingham==1.5.4
@@ -34,6 +43,8 @@ six==1.16.0
# via python-dateutil
sqlalchemy==2.0.31
# via -r requirements.in
+tqdm==4.66.4
+ # via -r requirements.in
typer==0.12.3
# via -r requirements.in
typing-extensions==4.12.2
diff --git a/deps/lock/aarch64-apple-darwin/requirements_dev.txt b/deps/lock/aarch64-apple-darwin/requirements_dev.txt
index b904385..58547d9 100644
--- a/deps/lock/aarch64-apple-darwin/requirements_dev.txt
+++ b/deps/lock/aarch64-apple-darwin/requirements_dev.txt
@@ -6,6 +6,8 @@ charset-normalizer==3.3.2
# via requests
click==8.1.7
# via typer
+connectorx==0.3.3
+ # via -r requirements.in
exceptiongroup==1.2.2
# via pytest
filelock==3.15.4
@@ -24,13 +26,16 @@ maturin==1.7.0
# via -r requirements_dev.in
mdurl==0.1.2
# via markdown-it-py
+mysqlclient==2.2.4
+ # via -r requirements.in
networkx==3.3
# via -r requirements_dev.in
-numpy==2.0.0
+numpy==1.26.4
# via
# -r requirements_dev.in
# pandas
# pyarrow
+ # rdkit
# scipy
# trimesh
packaging==24.1
@@ -39,6 +44,8 @@ packaging==24.1
# pytest
pandas==2.2.2
# via -r requirements.in
+pillow==10.4.0
+ # via rdkit
pluggy==1.5.0
# via pytest
polars==1.2.0
@@ -59,6 +66,8 @@ pytz==2024.1
# via pandas
pyyaml==6.0.1
# via huggingface-hub
+rdkit==2024.3.3
+ # via -r requirements.in
requests==2.32.3
# via huggingface-hub
rich==13.7.1
@@ -82,7 +91,9 @@ tomli==2.0.1
# maturin
# pytest
tqdm==4.66.4
- # via huggingface-hub
+ # via
+ # -r requirements.in
+ # huggingface-hub
trimesh==4.4.3
# via -r requirements_dev.in
typer==0.12.3
diff --git a/deps/lock/aarch64-apple-darwin/requirements_docs.txt b/deps/lock/aarch64-apple-darwin/requirements_docs.txt
new file mode 100644
index 0000000..4b8a53c
--- /dev/null
+++ b/deps/lock/aarch64-apple-darwin/requirements_docs.txt
@@ -0,0 +1,156 @@
+# This file was autogenerated by uv via the following command:
+# uv pip compile requirements_docs.in -o /home/runner/work/bio-data-to-db/bio-data-to-db/deps/lock/aarch64-apple-darwin/requirements_docs.txt --python-platform aarch64-apple-darwin --python-version 3.10
+babel==2.15.0
+ # via mkdocs-material
+backports-strenum==1.3.1
+ # via griffe
+cairocffi==1.7.1
+ # via cairosvg
+cairosvg==2.7.1
+ # via mkdocs-material
+certifi==2024.7.4
+ # via requests
+cffi==1.16.0
+ # via cairocffi
+charset-normalizer==3.3.2
+ # via requests
+click==8.1.7
+ # via
+ # mkdocs
+ # mkdocstrings
+colorama==0.4.6
+ # via
+ # griffe
+ # mkdocs-material
+cssselect2==0.7.0
+ # via cairosvg
+defusedxml==0.7.1
+ # via cairosvg
+ghp-import==2.1.0
+ # via mkdocs
+griffe==0.48.0
+ # via mkdocstrings-python
+idna==3.7
+ # via requests
+importlib-metadata==8.2.0
+ # via mike
+importlib-resources==6.4.0
+ # via mike
+jinja2==3.1.4
+ # via
+ # mike
+ # mkdocs
+ # mkdocs-material
+ # mkdocstrings
+markdown==3.6
+ # via
+ # mkdocs
+ # mkdocs-autorefs
+ # mkdocs-material
+ # mkdocstrings
+ # pymdown-extensions
+markupsafe==2.1.5
+ # via
+ # jinja2
+ # mkdocs
+ # mkdocs-autorefs
+ # mkdocstrings
+mergedeep==1.3.4
+ # via
+ # mkdocs
+ # mkdocs-get-deps
+mike==2.1.2
+ # via -r requirements_docs.in
+mkdocs==1.6.0
+ # via
+ # -r requirements_docs.in
+ # mike
+ # mkdocs-autorefs
+ # mkdocs-coverage
+ # mkdocs-gen-files
+ # mkdocs-literate-nav
+ # mkdocs-material
+ # mkdocstrings
+mkdocs-autorefs==1.0.1
+ # via
+ # -r requirements_docs.in
+ # mkdocstrings
+mkdocs-coverage==1.1.0
+ # via -r requirements_docs.in
+mkdocs-gen-files==0.5.0
+ # via -r requirements_docs.in
+mkdocs-get-deps==0.2.0
+ # via mkdocs
+mkdocs-literate-nav==0.6.1
+ # via -r requirements_docs.in
+mkdocs-material==9.5.30
+ # via -r requirements_docs.in
+mkdocs-material-extensions==1.3.1
+ # via
+ # -r requirements_docs.in
+ # mkdocs-material
+mkdocstrings==0.25.2
+ # via
+ # -r requirements_docs.in
+ # mkdocstrings-python
+mkdocstrings-python==1.10.7
+ # via -r requirements_docs.in
+packaging==24.1
+ # via mkdocs
+paginate==0.5.6
+ # via mkdocs-material
+pathspec==0.12.1
+ # via mkdocs
+pillow==10.4.0
+ # via
+ # cairosvg
+ # mkdocs-material
+platformdirs==4.2.2
+ # via
+ # mkdocs-get-deps
+ # mkdocstrings
+pycparser==2.22
+ # via cffi
+pygments==2.18.0
+ # via mkdocs-material
+pymdown-extensions==10.9
+ # via
+ # mkdocs-material
+ # mkdocstrings
+pyparsing==3.1.2
+ # via mike
+python-dateutil==2.9.0.post0
+ # via ghp-import
+pyyaml==6.0.1
+ # via
+ # mike
+ # mkdocs
+ # mkdocs-get-deps
+ # pymdown-extensions
+ # pyyaml-env-tag
+pyyaml-env-tag==0.1
+ # via
+ # mike
+ # mkdocs
+regex==2024.7.24
+ # via mkdocs-material
+requests==2.32.3
+ # via mkdocs-material
+six==1.16.0
+ # via python-dateutil
+tinycss2==1.3.0
+ # via
+ # cairosvg
+ # cssselect2
+urllib3==2.2.2
+ # via requests
+verspec==0.1.0
+ # via mike
+watchdog==4.0.1
+ # via mkdocs
+webencodings==0.5.1
+ # via
+ # cssselect2
+ # tinycss2
+zipp==3.19.2
+ # via importlib-metadata
diff --git a/deps/lock/x86_64-apple-darwin/.requirements.in.sha256 b/deps/lock/x86_64-apple-darwin/.requirements.in.sha256
index f9355d5..2be7d1e 100644
--- a/deps/lock/x86_64-apple-darwin/.requirements.in.sha256
+++ b/deps/lock/x86_64-apple-darwin/.requirements.in.sha256
@@ -1 +1 @@
-816025c3ff73af3261b082ee7e0c71954aa6b20922e17344cfb2f29636733488 requirements.in
+2f65dd8deb2842edfead23a6aafb4f4f0b9e9e98982e39216069787d16327901 requirements.in
diff --git a/deps/lock/x86_64-apple-darwin/.requirements_docs.in.sha256 b/deps/lock/x86_64-apple-darwin/.requirements_docs.in.sha256
new file mode 100644
index 0000000..6fd7f42
--- /dev/null
+++ b/deps/lock/x86_64-apple-darwin/.requirements_docs.in.sha256
@@ -0,0 +1 @@
+f0f530946f38443ec95d76ac402dc3e3045fe8f7c26220e46b575aa56649503d requirements_docs.in
diff --git a/deps/lock/x86_64-apple-darwin/requirements.txt b/deps/lock/x86_64-apple-darwin/requirements.txt
index 9ec9072..fcd3c42 100644
--- a/deps/lock/x86_64-apple-darwin/requirements.txt
+++ b/deps/lock/x86_64-apple-darwin/requirements.txt
@@ -2,18 +2,25 @@
# uv pip compile requirements.in -o /home/runner/work/bio-data-to-db/bio-data-to-db/deps/lock/x86_64-apple-darwin/requirements.txt --python-platform x86_64-apple-darwin --python-version 3.10
click==8.1.7
# via typer
+connectorx==0.3.3
+ # via -r requirements.in
greenlet==3.0.3
# via sqlalchemy
markdown-it-py==3.0.0
# via rich
mdurl==0.1.2
# via markdown-it-py
-numpy==2.0.0
+mysqlclient==2.2.4
+ # via -r requirements.in
+numpy==1.26.4
# via
# pandas
# pyarrow
+ # rdkit
pandas==2.2.2
# via -r requirements.in
+pillow==10.4.0
+ # via rdkit
polars==1.2.0
# via -r requirements.in
psycopg==3.2.1
@@ -28,6 +35,8 @@ python-dateutil==2.9.0.post0
# via pandas
pytz==2024.1
# via pandas
+rdkit==2024.3.3
+ # via -r requirements.in
rich==13.7.1
# via typer
shellingham==1.5.4
@@ -36,6 +45,8 @@ six==1.16.0
# via python-dateutil
sqlalchemy==2.0.31
# via -r requirements.in
+tqdm==4.66.4
+ # via -r requirements.in
typer==0.12.3
# via -r requirements.in
typing-extensions==4.12.2
diff --git a/deps/lock/x86_64-apple-darwin/requirements_dev.txt b/deps/lock/x86_64-apple-darwin/requirements_dev.txt
index 6979b43..afe2537 100644
--- a/deps/lock/x86_64-apple-darwin/requirements_dev.txt
+++ b/deps/lock/x86_64-apple-darwin/requirements_dev.txt
@@ -6,6 +6,8 @@ charset-normalizer==3.3.2
# via requests
click==8.1.7
# via typer
+connectorx==0.3.3
+ # via -r requirements.in
exceptiongroup==1.2.2
# via pytest
filelock==3.15.4
@@ -26,13 +28,16 @@ maturin==1.7.0
# via -r requirements_dev.in
mdurl==0.1.2
# via markdown-it-py
+mysqlclient==2.2.4
+ # via -r requirements.in
networkx==3.3
# via -r requirements_dev.in
-numpy==2.0.0
+numpy==1.26.4
# via
# -r requirements_dev.in
# pandas
# pyarrow
+ # rdkit
# scipy
# trimesh
packaging==24.1
@@ -41,6 +46,8 @@ packaging==24.1
# pytest
pandas==2.2.2
# via -r requirements.in
+pillow==10.4.0
+ # via rdkit
pluggy==1.5.0
# via pytest
polars==1.2.0
@@ -61,6 +68,8 @@ pytz==2024.1
# via pandas
pyyaml==6.0.1
# via huggingface-hub
+rdkit==2024.3.3
+ # via -r requirements.in
requests==2.32.3
# via huggingface-hub
rich==13.7.1
@@ -84,7 +93,9 @@ tomli==2.0.1
# maturin
# pytest
tqdm==4.66.4
- # via huggingface-hub
+ # via
+ # -r requirements.in
+ # huggingface-hub
trimesh==4.4.3
# via -r requirements_dev.in
typer==0.12.3
diff --git a/deps/lock/x86_64-apple-darwin/requirements_docs.txt b/deps/lock/x86_64-apple-darwin/requirements_docs.txt
new file mode 100644
index 0000000..0fd83dd
--- /dev/null
+++ b/deps/lock/x86_64-apple-darwin/requirements_docs.txt
@@ -0,0 +1,156 @@
+# This file was autogenerated by uv via the following command:
+# uv pip compile requirements_docs.in -o /home/runner/work/bio-data-to-db/bio-data-to-db/deps/lock/x86_64-apple-darwin/requirements_docs.txt --python-platform x86_64-apple-darwin --python-version 3.10
+babel==2.15.0
+ # via mkdocs-material
+backports-strenum==1.3.1
+ # via griffe
+cairocffi==1.7.1
+ # via cairosvg
+cairosvg==2.7.1
+ # via mkdocs-material
+certifi==2024.7.4
+ # via requests
+cffi==1.16.0
+ # via cairocffi
+charset-normalizer==3.3.2
+ # via requests
+click==8.1.7
+ # via
+ # mkdocs
+ # mkdocstrings
+colorama==0.4.6
+ # via
+ # griffe
+ # mkdocs-material
+cssselect2==0.7.0
+ # via cairosvg
+defusedxml==0.7.1
+ # via cairosvg
+ghp-import==2.1.0
+ # via mkdocs
+griffe==0.48.0
+ # via mkdocstrings-python
+idna==3.7
+ # via requests
+importlib-metadata==8.2.0
+ # via mike
+importlib-resources==6.4.0
+ # via mike
+jinja2==3.1.4
+ # via
+ # mike
+ # mkdocs
+ # mkdocs-material
+ # mkdocstrings
+markdown==3.6
+ # via
+ # mkdocs
+ # mkdocs-autorefs
+ # mkdocs-material
+ # mkdocstrings
+ # pymdown-extensions
+markupsafe==2.1.5
+ # via
+ # jinja2
+ # mkdocs
+ # mkdocs-autorefs
+ # mkdocstrings
+mergedeep==1.3.4
+ # via
+ # mkdocs
+ # mkdocs-get-deps
+mike==2.1.2
+ # via -r requirements_docs.in
+mkdocs==1.6.0
+ # via
+ # -r requirements_docs.in
+ # mike
+ # mkdocs-autorefs
+ # mkdocs-coverage
+ # mkdocs-gen-files
+ # mkdocs-literate-nav
+ # mkdocs-material
+ # mkdocstrings
+mkdocs-autorefs==1.0.1
+ # via
+ # -r requirements_docs.in
+ # mkdocstrings
+mkdocs-coverage==1.1.0
+ # via -r requirements_docs.in
+mkdocs-gen-files==0.5.0
+ # via -r requirements_docs.in
+mkdocs-get-deps==0.2.0
+ # via mkdocs
+mkdocs-literate-nav==0.6.1
+ # via -r requirements_docs.in
+mkdocs-material==9.5.30
+ # via -r requirements_docs.in
+mkdocs-material-extensions==1.3.1
+ # via
+ # -r requirements_docs.in
+ # mkdocs-material
+mkdocstrings==0.25.2
+ # via
+ # -r requirements_docs.in
+ # mkdocstrings-python
+mkdocstrings-python==1.10.7
+ # via -r requirements_docs.in
+packaging==24.1
+ # via mkdocs
+paginate==0.5.6
+ # via mkdocs-material
+pathspec==0.12.1
+ # via mkdocs
+pillow==10.4.0
+ # via
+ # cairosvg
+ # mkdocs-material
+platformdirs==4.2.2
+ # via
+ # mkdocs-get-deps
+ # mkdocstrings
+pycparser==2.22
+ # via cffi
+pygments==2.18.0
+ # via mkdocs-material
+pymdown-extensions==10.9
+ # via
+ # mkdocs-material
+ # mkdocstrings
+pyparsing==3.1.2
+ # via mike
+python-dateutil==2.9.0.post0
+ # via ghp-import
+pyyaml==6.0.1
+ # via
+ # mike
+ # mkdocs
+ # mkdocs-get-deps
+ # pymdown-extensions
+ # pyyaml-env-tag
+pyyaml-env-tag==0.1
+ # via
+ # mike
+ # mkdocs
+regex==2024.7.24
+ # via mkdocs-material
+requests==2.32.3
+ # via mkdocs-material
+six==1.16.0
+ # via python-dateutil
+tinycss2==1.3.0
+ # via
+ # cairosvg
+ # cssselect2
+urllib3==2.2.2
+ # via requests
+verspec==0.1.0
+ # via mike
+watchdog==4.0.1
+ # via mkdocs
+webencodings==0.5.1
+ # via
+ # cssselect2
+ # tinycss2
+zipp==3.19.2
+ # via importlib-metadata
diff --git a/deps/lock/x86_64-manylinux_2_28/.requirements.in.sha256 b/deps/lock/x86_64-manylinux_2_28/.requirements.in.sha256
index f9355d5..2be7d1e 100644
--- a/deps/lock/x86_64-manylinux_2_28/.requirements.in.sha256
+++ b/deps/lock/x86_64-manylinux_2_28/.requirements.in.sha256
@@ -1 +1 @@
-816025c3ff73af3261b082ee7e0c71954aa6b20922e17344cfb2f29636733488 requirements.in
+2f65dd8deb2842edfead23a6aafb4f4f0b9e9e98982e39216069787d16327901 requirements.in
diff --git a/deps/lock/x86_64-manylinux_2_28/.requirements_docs.in.sha256 b/deps/lock/x86_64-manylinux_2_28/.requirements_docs.in.sha256
new file mode 100644
index 0000000..6fd7f42
--- /dev/null
+++ b/deps/lock/x86_64-manylinux_2_28/.requirements_docs.in.sha256
@@ -0,0 +1 @@
+f0f530946f38443ec95d76ac402dc3e3045fe8f7c26220e46b575aa56649503d requirements_docs.in
diff --git a/deps/lock/x86_64-manylinux_2_28/requirements.txt b/deps/lock/x86_64-manylinux_2_28/requirements.txt
index 1eb8c87..65ba813 100644
--- a/deps/lock/x86_64-manylinux_2_28/requirements.txt
+++ b/deps/lock/x86_64-manylinux_2_28/requirements.txt
@@ -2,18 +2,25 @@
# uv pip compile requirements.in -o /home/runner/work/bio-data-to-db/bio-data-to-db/deps/lock/x86_64-manylinux_2_28/requirements.txt --python-platform x86_64-manylinux_2_28 --python-version 3.10
click==8.1.7
# via typer
+connectorx==0.3.3
+ # via -r requirements.in
greenlet==3.0.3
# via sqlalchemy
markdown-it-py==3.0.0
# via rich
mdurl==0.1.2
# via markdown-it-py
-numpy==2.0.0
+mysqlclient==2.2.4
+ # via -r requirements.in
+numpy==1.26.4
# via
# pandas
# pyarrow
+ # rdkit
pandas==2.2.2
# via -r requirements.in
+pillow==10.4.0
+ # via rdkit
polars==1.2.0
# via -r requirements.in
psycopg==3.2.1
@@ -28,6 +35,8 @@ python-dateutil==2.9.0.post0
# via pandas
pytz==2024.1
# via pandas
+rdkit==2024.3.3
+ # via -r requirements.in
rich==13.7.1
# via typer
shellingham==1.5.4
@@ -36,6 +45,8 @@ six==1.16.0
# via python-dateutil
sqlalchemy==2.0.31
# via -r requirements.in
+tqdm==4.66.4
+ # via -r requirements.in
typer==0.12.3
# via -r requirements.in
typing-extensions==4.12.2
diff --git a/deps/lock/x86_64-manylinux_2_28/requirements_dev.txt b/deps/lock/x86_64-manylinux_2_28/requirements_dev.txt
index f4473b2..aafabce 100644
--- a/deps/lock/x86_64-manylinux_2_28/requirements_dev.txt
+++ b/deps/lock/x86_64-manylinux_2_28/requirements_dev.txt
@@ -6,6 +6,8 @@ charset-normalizer==3.3.2
# via requests
click==8.1.7
# via typer
+connectorx==0.3.3
+ # via -r requirements.in
exceptiongroup==1.2.2
# via pytest
filelock==3.15.4
@@ -26,13 +28,16 @@ maturin==1.7.0
# via -r requirements_dev.in
mdurl==0.1.2
# via markdown-it-py
+mysqlclient==2.2.4
+ # via -r requirements.in
networkx==3.3
# via -r requirements_dev.in
-numpy==2.0.0
+numpy==1.26.4
# via
# -r requirements_dev.in
# pandas
# pyarrow
+ # rdkit
# scipy
# trimesh
packaging==24.1
@@ -41,6 +46,8 @@ packaging==24.1
# pytest
pandas==2.2.2
# via -r requirements.in
+pillow==10.4.0
+ # via rdkit
pluggy==1.5.0
# via pytest
polars==1.2.0
@@ -61,6 +68,8 @@ pytz==2024.1
# via pandas
pyyaml==6.0.1
# via huggingface-hub
+rdkit==2024.3.3
+ # via -r requirements.in
requests==2.32.3
# via huggingface-hub
rich==13.7.1
@@ -84,7 +93,9 @@ tomli==2.0.1
# maturin
# pytest
tqdm==4.66.4
- # via huggingface-hub
+ # via
+ # -r requirements.in
+ # huggingface-hub
trimesh==4.4.3
# via -r requirements_dev.in
typer==0.12.3
diff --git a/deps/lock/x86_64-manylinux_2_28/requirements_docs.txt b/deps/lock/x86_64-manylinux_2_28/requirements_docs.txt
new file mode 100644
index 0000000..4bc7758
--- /dev/null
+++ b/deps/lock/x86_64-manylinux_2_28/requirements_docs.txt
@@ -0,0 +1,156 @@
+# This file was autogenerated by uv via the following command:
+# uv pip compile requirements_docs.in -o /home/runner/work/bio-data-to-db/bio-data-to-db/deps/lock/x86_64-manylinux_2_28/requirements_docs.txt --python-platform x86_64-manylinux_2_28 --python-version 3.10
+babel==2.15.0
+ # via mkdocs-material
+backports-strenum==1.3.1
+ # via griffe
+cairocffi==1.7.1
+ # via cairosvg
+cairosvg==2.7.1
+ # via mkdocs-material
+certifi==2024.7.4
+ # via requests
+cffi==1.16.0
+ # via cairocffi
+charset-normalizer==3.3.2
+ # via requests
+click==8.1.7
+ # via
+ # mkdocs
+ # mkdocstrings
+colorama==0.4.6
+ # via
+ # griffe
+ # mkdocs-material
+cssselect2==0.7.0
+ # via cairosvg
+defusedxml==0.7.1
+ # via cairosvg
+ghp-import==2.1.0
+ # via mkdocs
+griffe==0.48.0
+ # via mkdocstrings-python
+idna==3.7
+ # via requests
+importlib-metadata==8.2.0
+ # via mike
+importlib-resources==6.4.0
+ # via mike
+jinja2==3.1.4
+ # via
+ # mike
+ # mkdocs
+ # mkdocs-material
+ # mkdocstrings
+markdown==3.6
+ # via
+ # mkdocs
+ # mkdocs-autorefs
+ # mkdocs-material
+ # mkdocstrings
+ # pymdown-extensions
+markupsafe==2.1.5
+ # via
+ # jinja2
+ # mkdocs
+ # mkdocs-autorefs
+ # mkdocstrings
+mergedeep==1.3.4
+ # via
+ # mkdocs
+ # mkdocs-get-deps
+mike==2.1.2
+ # via -r requirements_docs.in
+mkdocs==1.6.0
+ # via
+ # -r requirements_docs.in
+ # mike
+ # mkdocs-autorefs
+ # mkdocs-coverage
+ # mkdocs-gen-files
+ # mkdocs-literate-nav
+ # mkdocs-material
+ # mkdocstrings
+mkdocs-autorefs==1.0.1
+ # via
+ # -r requirements_docs.in
+ # mkdocstrings
+mkdocs-coverage==1.1.0
+ # via -r requirements_docs.in
+mkdocs-gen-files==0.5.0
+ # via -r requirements_docs.in
+mkdocs-get-deps==0.2.0
+ # via mkdocs
+mkdocs-literate-nav==0.6.1
+ # via -r requirements_docs.in
+mkdocs-material==9.5.30
+ # via -r requirements_docs.in
+mkdocs-material-extensions==1.3.1
+ # via
+ # -r requirements_docs.in
+ # mkdocs-material
+mkdocstrings==0.25.2
+ # via
+ # -r requirements_docs.in
+ # mkdocstrings-python
+mkdocstrings-python==1.10.7
+ # via -r requirements_docs.in
+packaging==24.1
+ # via mkdocs
+paginate==0.5.6
+ # via mkdocs-material
+pathspec==0.12.1
+ # via mkdocs
+pillow==10.4.0
+ # via
+ # cairosvg
+ # mkdocs-material
+platformdirs==4.2.2
+ # via
+ # mkdocs-get-deps
+ # mkdocstrings
+pycparser==2.22
+ # via cffi
+pygments==2.18.0
+ # via mkdocs-material
+pymdown-extensions==10.9
+ # via
+ # mkdocs-material
+ # mkdocstrings
+pyparsing==3.1.2
+ # via mike
+python-dateutil==2.9.0.post0
+ # via ghp-import
+pyyaml==6.0.1
+ # via
+ # mike
+ # mkdocs
+ # mkdocs-get-deps
+ # pymdown-extensions
+ # pyyaml-env-tag
+pyyaml-env-tag==0.1
+ # via
+ # mike
+ # mkdocs
+regex==2024.7.24
+ # via mkdocs-material
+requests==2.32.3
+ # via mkdocs-material
+six==1.16.0
+ # via python-dateutil
+tinycss2==1.3.0
+ # via
+ # cairosvg
+ # cssselect2
+urllib3==2.2.2
+ # via requests
+verspec==0.1.0
+ # via mike
+watchdog==4.0.1
+ # via mkdocs
+webencodings==0.5.1
+ # via
+ # cssselect2
+ # tinycss2
+zipp==3.19.2
+ # via importlib-metadata
diff --git a/deps/lock/x86_64-pc-windows-msvc/.requirements.in.sha256 b/deps/lock/x86_64-pc-windows-msvc/.requirements.in.sha256
index f9355d5..2be7d1e 100644
--- a/deps/lock/x86_64-pc-windows-msvc/.requirements.in.sha256
+++ b/deps/lock/x86_64-pc-windows-msvc/.requirements.in.sha256
@@ -1 +1 @@
-816025c3ff73af3261b082ee7e0c71954aa6b20922e17344cfb2f29636733488 requirements.in
+2f65dd8deb2842edfead23a6aafb4f4f0b9e9e98982e39216069787d16327901 requirements.in
diff --git a/deps/lock/x86_64-pc-windows-msvc/.requirements_docs.in.sha256 b/deps/lock/x86_64-pc-windows-msvc/.requirements_docs.in.sha256
new file mode 100644
index 0000000..6fd7f42
--- /dev/null
+++ b/deps/lock/x86_64-pc-windows-msvc/.requirements_docs.in.sha256
@@ -0,0 +1 @@
+f0f530946f38443ec95d76ac402dc3e3045fe8f7c26220e46b575aa56649503d requirements_docs.in
diff --git a/deps/lock/x86_64-pc-windows-msvc/requirements.txt b/deps/lock/x86_64-pc-windows-msvc/requirements.txt
index ac6b99f..8789f66 100644
--- a/deps/lock/x86_64-pc-windows-msvc/requirements.txt
+++ b/deps/lock/x86_64-pc-windows-msvc/requirements.txt
@@ -3,19 +3,28 @@
click==8.1.7
# via typer
colorama==0.4.6
- # via click
+ # via
+ # click
+ # tqdm
+connectorx==0.3.3
+ # via -r requirements.in
greenlet==3.0.3
# via sqlalchemy
markdown-it-py==3.0.0
# via rich
mdurl==0.1.2
# via markdown-it-py
-numpy==2.0.0
+mysqlclient==2.2.4
+ # via -r requirements.in
+numpy==1.26.4
# via
# pandas
# pyarrow
+ # rdkit
pandas==2.2.2
# via -r requirements.in
+pillow==10.4.0
+ # via rdkit
polars==1.2.0
# via -r requirements.in
psycopg==3.2.1
@@ -30,6 +39,8 @@ python-dateutil==2.9.0.post0
# via pandas
pytz==2024.1
# via pandas
+rdkit==2024.3.3
+ # via -r requirements.in
rich==13.7.1
# via typer
shellingham==1.5.4
@@ -38,6 +49,8 @@ six==1.16.0
# via python-dateutil
sqlalchemy==2.0.31
# via -r requirements.in
+tqdm==4.66.4
+ # via -r requirements.in
typer==0.12.3
# via -r requirements.in
typing-extensions==4.12.2
diff --git a/deps/lock/x86_64-pc-windows-msvc/requirements_dev.txt b/deps/lock/x86_64-pc-windows-msvc/requirements_dev.txt
index e4a4b05..3afa870 100644
--- a/deps/lock/x86_64-pc-windows-msvc/requirements_dev.txt
+++ b/deps/lock/x86_64-pc-windows-msvc/requirements_dev.txt
@@ -11,6 +11,8 @@ colorama==0.4.6
# click
# pytest
# tqdm
+connectorx==0.3.3
+ # via -r requirements.in
exceptiongroup==1.2.2
# via pytest
filelock==3.15.4
@@ -31,13 +33,16 @@ maturin==1.7.0
# via -r requirements_dev.in
mdurl==0.1.2
# via markdown-it-py
+mysqlclient==2.2.4
+ # via -r requirements.in
networkx==3.3
# via -r requirements_dev.in
-numpy==2.0.0
+numpy==1.26.4
# via
# -r requirements_dev.in
# pandas
# pyarrow
+ # rdkit
# scipy
# trimesh
packaging==24.1
@@ -46,6 +51,8 @@ packaging==24.1
# pytest
pandas==2.2.2
# via -r requirements.in
+pillow==10.4.0
+ # via rdkit
pluggy==1.5.0
# via pytest
polars==1.2.0
@@ -66,6 +73,8 @@ pytz==2024.1
# via pandas
pyyaml==6.0.1
# via huggingface-hub
+rdkit==2024.3.3
+ # via -r requirements.in
requests==2.32.3
# via huggingface-hub
rich==13.7.1
@@ -89,7 +98,9 @@ tomli==2.0.1
# maturin
# pytest
tqdm==4.66.4
- # via huggingface-hub
+ # via
+ # -r requirements.in
+ # huggingface-hub
trimesh==4.4.3
# via -r requirements_dev.in
typer==0.12.3
diff --git a/deps/lock/x86_64-pc-windows-msvc/requirements_docs.txt b/deps/lock/x86_64-pc-windows-msvc/requirements_docs.txt
new file mode 100644
index 0000000..646989c
--- /dev/null
+++ b/deps/lock/x86_64-pc-windows-msvc/requirements_docs.txt
@@ -0,0 +1,158 @@
+# This file was autogenerated by uv via the following command:
+# uv pip compile requirements_docs.in -o /home/runner/work/bio-data-to-db/bio-data-to-db/deps/lock/x86_64-pc-windows-msvc/requirements_docs.txt --python-platform x86_64-pc-windows-msvc --python-version 3.10
+babel==2.15.0
+ # via mkdocs-material
+backports-strenum==1.3.1
+ # via griffe
+cairocffi==1.7.1
+ # via cairosvg
+cairosvg==2.7.1
+ # via mkdocs-material
+certifi==2024.7.4
+ # via requests
+cffi==1.16.0
+ # via cairocffi
+charset-normalizer==3.3.2
+ # via requests
+click==8.1.7
+ # via
+ # mkdocs
+ # mkdocstrings
+colorama==0.4.6
+ # via
+ # click
+ # griffe
+ # mkdocs
+ # mkdocs-material
+cssselect2==0.7.0
+ # via cairosvg
+defusedxml==0.7.1
+ # via cairosvg
+ghp-import==2.1.0
+ # via mkdocs
+griffe==0.48.0
+ # via mkdocstrings-python
+idna==3.7
+ # via requests
+importlib-metadata==8.2.0
+ # via mike
+importlib-resources==6.4.0
+ # via mike
+jinja2==3.1.4
+ # via
+ # mike
+ # mkdocs
+ # mkdocs-material
+ # mkdocstrings
+markdown==3.6
+ # via
+ # mkdocs
+ # mkdocs-autorefs
+ # mkdocs-material
+ # mkdocstrings
+ # pymdown-extensions
+markupsafe==2.1.5
+ # via
+ # jinja2
+ # mkdocs
+ # mkdocs-autorefs
+ # mkdocstrings
+mergedeep==1.3.4
+ # via
+ # mkdocs
+ # mkdocs-get-deps
+mike==2.1.2
+ # via -r requirements_docs.in
+mkdocs==1.6.0
+ # via
+ # -r requirements_docs.in
+ # mike
+ # mkdocs-autorefs
+ # mkdocs-coverage
+ # mkdocs-gen-files
+ # mkdocs-literate-nav
+ # mkdocs-material
+ # mkdocstrings
+mkdocs-autorefs==1.0.1
+ # via
+ # -r requirements_docs.in
+ # mkdocstrings
+mkdocs-coverage==1.1.0
+ # via -r requirements_docs.in
+mkdocs-gen-files==0.5.0
+ # via -r requirements_docs.in
+mkdocs-get-deps==0.2.0
+ # via mkdocs
+mkdocs-literate-nav==0.6.1
+ # via -r requirements_docs.in
+mkdocs-material==9.5.30
+ # via -r requirements_docs.in
+mkdocs-material-extensions==1.3.1
+ # via
+ # -r requirements_docs.in
+ # mkdocs-material
+mkdocstrings==0.25.2
+ # via
+ # -r requirements_docs.in
+ # mkdocstrings-python
+mkdocstrings-python==1.10.7
+ # via -r requirements_docs.in
+packaging==24.1
+ # via mkdocs
+paginate==0.5.6
+ # via mkdocs-material
+pathspec==0.12.1
+ # via mkdocs
+pillow==10.4.0
+ # via
+ # cairosvg
+ # mkdocs-material
+platformdirs==4.2.2
+ # via
+ # mkdocs-get-deps
+ # mkdocstrings
+pycparser==2.22
+ # via cffi
+pygments==2.18.0
+ # via mkdocs-material
+pymdown-extensions==10.9
+ # via
+ # mkdocs-material
+ # mkdocstrings
+pyparsing==3.1.2
+ # via mike
+python-dateutil==2.9.0.post0
+ # via ghp-import
+pyyaml==6.0.1
+ # via
+ # mike
+ # mkdocs
+ # mkdocs-get-deps
+ # pymdown-extensions
+ # pyyaml-env-tag
+pyyaml-env-tag==0.1
+ # via
+ # mike
+ # mkdocs
+regex==2024.7.24
+ # via mkdocs-material
+requests==2.32.3
+ # via mkdocs-material
+six==1.16.0
+ # via python-dateutil
+tinycss2==1.3.0
+ # via
+ # cairosvg
+ # cssselect2
+urllib3==2.2.2
+ # via requests
+verspec==0.1.0
+ # via mike
+watchdog==4.0.1
+ # via mkdocs
+webencodings==0.5.1
+ # via
+ # cssselect2
+ # tinycss2
+zipp==3.19.2
+ # via importlib-metadata
diff --git a/deps/requirements.in b/deps/requirements.in
index 682be56..b81c766 100644
--- a/deps/requirements.in
+++ b/deps/requirements.in
@@ -5,4 +5,7 @@ sqlalchemy>=2.0.0
psycopg>=3.2.0
psycopg2>=2.9.0
pyarrow>=17.0.0
-
+mysqlclient>=2.2.0
+connectorx>=0.3.0
+rdkit>=2024.3.1
+tqdm>=4.62.0
diff --git a/deps/requirements_docs.in b/deps/requirements_docs.in
new file mode 100644
index 0000000..435b779
--- /dev/null
+++ b/deps/requirements_docs.in
@@ -0,0 +1,10 @@
+mkdocs
+mkdocs-autorefs
+mkdocs-coverage
+mkdocs-gen-files
+mkdocs-literate-nav
+mkdocs-material[imaging]
+mkdocs-material-extensions
+mkdocstrings
+mkdocstrings-python
+mike
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 0000000..612c7a5
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1 @@
+--8<-- "README.md"
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 0000000..b95a704
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,103 @@
+site_name: bio-data-to-db
+site_url: 'https://deargen.github.io/bio-data-to-db'
+repo_url: 'https://github.com/deargen/bio-data-to-db'
+copyright: |
+ © 2024 Deargen Inc.
+watch: [mkdocs.yml, README.md, src/]
+validation:
+ omitted_files: warn
+ absolute_links: warn
+ unrecognized_links: warn
+
+nav:
+ - Home:
+ - Overview: index.md
+ - Changelog: CHANGELOG.md
+ # defer to gen-files + literate-nav
+ - API reference:
+ - mkdocstrings-python: reference/
+
+theme:
+ name: material
+ font:
+ text: Noto Sans Korean
+ code: Jetbrains Mono
+ features:
+ - toc.follow
+ - navigation.top
+ - navigation.footer
+ - navigation.sections
+ - navigation.tabs
+ - navigation.tabs.sticky
+ - navigation.indexes
+ - navigation.path
+ - search.suggest
+ - search.highlight
+ - content.tabs.link
+ - content.code.annotate
+ - content.code.copy
+ language: ko
+ palette:
+ - media: '(prefers-color-scheme: light)'
+ scheme: default
+ primary: teal
+ accent: purple
+ toggle:
+ icon: material/weather-sunny
+ name: Switch to dark mode
+ - media: '(prefers-color-scheme: dark)'
+ scheme: slate
+ primary: black
+ accent: lime
+ toggle:
+ icon: material/weather-night
+ name: Switch to light mode
+
+plugins:
+ - search
+ - gen-files:
+ scripts:
+ - scripts/gen_ref_nav.py
+ - literate-nav:
+ nav_file: SUMMARY.md
+ - mkdocstrings:
+ handlers:
+ python:
+ options:
+ show_symbol_type_heading: true
+ show_symbol_type_toc: true
+ members_order: source
+ allow_inspection: false # for .pyi stubs to work
+ paths: [src] # search packages in the src folder
+
+extra:
+ social:
+ - icon: fontawesome/brands/github-alt
+ link: https://github.com/deargen/bio-data-to-db
+ version:
+ provider: mike
+
+markdown_extensions:
+ - pymdownx.superfences:
+ custom_fences:
+ - name: mermaid
+ class: mermaid
+ format: !!python/name:pymdownx.superfences.fence_code_format
+ - pymdownx.highlight:
+ anchor_linenums: true
+ - pymdownx.inlinehilite
+ - pymdownx.snippets
+ - admonition
+ - pymdownx.arithmatex:
+ generic: true
+ - footnotes
+ - pymdownx.details
+ - pymdownx.superfences
+ - pymdownx.mark
+ - attr_list
+ - pymdownx.emoji:
+ emoji_index: !!python/name:material.extensions.emoji.twemoji
+ emoji_generator: !!python/name:material.extensions.emoji.to_svg
+ - pymdownx.tilde # strikethrough with ~~ ~~
+ - toc:
+ permalink: true
diff --git a/pyproject.toml b/pyproject.toml
index ffe94ff..431c7da 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -105,6 +105,7 @@ ignore = [
"NPY002", # legacy numpy random
"UP017", # datetime.timezone.utc -> datetime.UTC
"SIM108", # use ternary operator instead of if-else
+ "PYI021", # Docstrings should not be included in stubs
]
[tool.ruff.lint.pydocstyle]
diff --git a/rust/Cargo.lock b/rust/Cargo.lock
index 0cd881a..2133a7c 100644
--- a/rust/Cargo.lock
+++ b/rust/Cargo.lock
@@ -80,7 +80,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
[[package]]
name = "bio-data-to-db"
-version = "0.0.1+0.g4467862.dirty"
+version = "0.1.1+7.g2bfd838.dirty"
dependencies = [
"icecream",
"pyo3",
diff --git a/scripts/gen_ref_nav.py b/scripts/gen_ref_nav.py
new file mode 100644
index 0000000..7abcf6b
--- /dev/null
+++ b/scripts/gen_ref_nav.py
@@ -0,0 +1,50 @@
+"""Generate the code reference pages and navigation."""
+
+from pathlib import Path
+
+import mkdocs_gen_files
+
+# Dotted module names that are skipped when they match exactly.
+IGNORE_MODULES_EXACT = {
+    # "bio_data_to_db.__init__",
+}
+
+# Dotted module name prefixes; a module starting with any of them is skipped.
+IGNORE_MODULES_STARTSWITH = {
+    "bio_data_to_db.cli.",
+}
+
+nav = mkdocs_gen_files.Nav()
+# HTML snippet placed in front of every navigation entry.
+# NOTE(review): the markup of this literal was lost (the string was split and
+# unterminated); restored from the mkdocstrings "automatic code reference
+# pages" recipe - confirm against the project template.
+mod_symbol = '<code class="doc-symbol doc-symbol-nav doc-symbol-module"></code>'
+
+root = Path(__file__).parent.parent
+src = root / "src"
+
+for path in sorted(src.rglob("*.py")):
+    module_path = path.relative_to(src).with_suffix("")
+    doc_path = path.relative_to(src).with_suffix(".md")
+    full_doc_path = Path("reference", doc_path)
+
+    parts = tuple(module_path.parts)
+    module_str = ".".join(parts)
+
+    # The check uses the full dotted name, so a package's `__init__` is matched
+    # as "<package>.__init__".
+    if module_str in IGNORE_MODULES_EXACT or module_str.startswith(
+        tuple(IGNORE_MODULES_STARTSWITH)
+    ):
+        print(f"Skipping module: {module_str}")  # noqa: T201
+        continue
+    if parts[-1] == "__init__":
+        # A package is documented on its index page, under the package name.
+        parts = parts[:-1]
+        doc_path = doc_path.with_name("index.md")
+        full_doc_path = full_doc_path.with_name("index.md")
+    elif parts[-1].startswith("_"):
+        # Modules whose name starts with an underscore get no reference page.
+        continue
+
+    nav_parts = [f"{mod_symbol} {part}" for part in parts]
+    nav[tuple(nav_parts)] = doc_path.as_posix()
+
+    with mkdocs_gen_files.open(full_doc_path, "w") as fd:
+        ident = ".".join(parts)
+        fd.write(f"::: {ident}")
+
+    # Build the edit path from the path relative to the repository root, so the
+    # result is "../src/..." whether `__file__` is relative or absolute
+    # (joining ".." with an absolute path would discard the "..").
+    mkdocs_gen_files.set_edit_path(
+        full_doc_path, Path("..") / path.relative_to(root)
+    )
+
+with mkdocs_gen_files.open("reference/SUMMARY.md", "w") as nav_file:
+    nav_file.writelines(nav.build_literate_nav())
diff --git a/src/bio_data_to_db/__init__.py b/src/bio_data_to_db/__init__.py
index f84101f..6139137 100644
--- a/src/bio_data_to_db/__init__.py
+++ b/src/bio_data_to_db/__init__.py
@@ -1,8 +1,6 @@
# Allow star imports
# ruff: noqa: F403 F405
-from __future__ import annotations
-
from .bio_data_to_db import *
__doc__ = bio_data_to_db.__doc__
diff --git a/src/bio_data_to_db/bindingdb/__init__.py b/src/bio_data_to_db/bindingdb/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/bio_data_to_db/bindingdb/fix_tables.py b/src/bio_data_to_db/bindingdb/fix_tables.py
new file mode 100644
index 0000000..ce56454
--- /dev/null
+++ b/src/bio_data_to_db/bindingdb/fix_tables.py
@@ -0,0 +1,54 @@
+import html
+
+import polars as pl
+import sqlalchemy
+
+
+def fix_assay_table(uri: str):
+    """
+    Fix the assay table in MySQL by decoding HTML entities like `&#39;` and stripping surrounding whitespace.
+
+    The `description` and `assay_name` columns are cleaned, the whole table is
+    written back, and the column types and key constraints are re-applied.
+
+    Args:
+        uri: Connection URI of the MySQL database that holds the `assay` table.
+
+    Notes:
+        - the table is replaced.
+        - types are preserved by manually changing them back to the original types. For example, varchar -> text -> varchar
+        - primary key and foreign key constraints are preserved by manually adding them back just like the original table
+    """
+    query = """
+        SELECT
+            *
+        FROM
+            assay
+    """
+    assay_df = pl.read_database_uri(query=query, uri=uri)
+
+    def _clean(column: str) -> pl.Expr:
+        # the column might be "binary" type if the type is "TEXT" in MySQL,
+        # hence the cast before the string operations
+        return (
+            pl.col(column)
+            .cast(pl.Utf8)
+            .map_elements(lambda s: html.unescape(s.strip()), return_dtype=pl.Utf8)
+        )
+
+    assay_df = assay_df.with_columns(_clean("description"), _clean("assay_name"))
+
+    assay_df.write_database(
+        table_name="assay",
+        connection=uri,
+        if_table_exists="replace",
+    )
+
+    # The table was replaced above, so the original column types and keys are
+    # put back here. Each statement is executed on its own: one string holding
+    # several statements relies on driver-specific multi-statement support and
+    # can hide an error raised by one of the later statements.
+    restore_statements = (
+        "ALTER TABLE assay MODIFY COLUMN `entryid` INT(11)",
+        "ALTER TABLE assay MODIFY COLUMN `assayid` INT(11)",
+        "ALTER TABLE assay MODIFY COLUMN `description` VARCHAR(4000)",
+        "ALTER TABLE assay MODIFY COLUMN `assay_name` VARCHAR(200)",
+        "ALTER TABLE assay ADD PRIMARY KEY (`entryid`,`assayid`)",
+        "ALTER TABLE assay ADD CONSTRAINT `assay_ibfk_1` FOREIGN KEY (`entryid`) REFERENCES `entry` (`entryid`)",
+    )
+    engine = sqlalchemy.create_engine(uri)
+    try:
+        with engine.begin() as conn:
+            for statement in restore_statements:
+                conn.execute(sqlalchemy.text(statement))
+    finally:
+        # Release the pooled connections held by this short-lived engine.
+        engine.dispose()
+
+
+# if __name__ == "__main__":
+# fix_assay_table("mysql://username:@localhost:3306/bind")
diff --git a/src/bio_data_to_db/bio_data_to_db.pyi b/src/bio_data_to_db/bio_data_to_db.pyi
index 79d2d14..3c53be3 100644
--- a/src/bio_data_to_db/bio_data_to_db.pyi
+++ b/src/bio_data_to_db/bio_data_to_db.pyi
@@ -2,4 +2,9 @@ def uniprot_xml_to_postgresql(
*,
uniprot_xml_path: str,
uri: str,
-) -> None: ...
+) -> None:
+ """
+ (🦀 Rust) Load UniProt XML file into PostgreSQL database.
+
+ This creates a `uniprot` database and a `uniprot_info` table.
+ """
diff --git a/src/bio_data_to_db/cli/bindingdb.py b/src/bio_data_to_db/cli/bindingdb.py
new file mode 100644
index 0000000..a399407
--- /dev/null
+++ b/src/bio_data_to_db/cli/bindingdb.py
@@ -0,0 +1,45 @@
+import enum
+import logging
+from typing import Annotated
+
+import typer
+
+# Module-level logger, used to report the outcome of the commands below.
+logger = logging.getLogger(__name__)
+
+# Typer application holding the BindingDB commands; running it without
+# arguments shows the help text.
+app = typer.Typer(no_args_is_help=True)
+
+
+class FixTableOption(str, enum.Enum):
+ """Table names accepted by the `fix_table` command."""
+
+ # The BindingDB `assay` table.
+ assay = "assay"
+
+
+@app.command(no_args_is_help=True)
+def fix_table(
+    table_name: Annotated[
+        FixTableOption,
+        typer.Argument(help="Table name to fix"),
+    ],
+    uri: Annotated[
+        str,
+        typer.Argument(help="URI to the MySQL database"),
+    ],
+):
+    """
+    Fix the assay table in MySQL by decoding HTML entities like '&#39;' and strip empty spaces.
+    """
+    # The docstring above doubles as the command's help text, so further
+    # documentation is kept in comments.
+    # Imports are done inside the command rather than at module import time.
+    from bio_data_to_db.bindingdb.fix_tables import fix_assay_table
+    from bio_data_to_db.utils.log import setup_logging
+
+    setup_logging()
+
+    # Only the `assay` table has a fix. Check the argument explicitly so that a
+    # table added to FixTableOption later cannot silently run the assay fix.
+    if table_name is not FixTableOption.assay:
+        raise typer.BadParameter(
+            f"No fix is implemented for table '{table_name.value}'."
+        )
+
+    fix_assay_table(uri)
+    logger.info(
+        "In `assay` table, HTML entities are decoded and empty spaces are stripped."
+    )
+
+
+def main():
+ """Run the BindingDB Typer application."""
+ app()
+
+
+# Allow running this module directly as a script.
+if __name__ == "__main__":
+ main()
diff --git a/src/bio_data_to_db/cli/main.py b/src/bio_data_to_db/cli/main.py
index 42b537d..0376418 100644
--- a/src/bio_data_to_db/cli/main.py
+++ b/src/bio_data_to_db/cli/main.py
@@ -1,10 +1,11 @@
import typer
-from . import uniprot
+from . import bindingdb, uniprot
app = typer.Typer(no_args_is_help=True)
app.add_typer(uniprot.app, name="uniprot")
+app.add_typer(bindingdb.app, name="bindingdb")
def main():
diff --git a/src/bio_data_to_db/uniprot/utils.py b/src/bio_data_to_db/uniprot/utils.py
index 4bc751d..c03216f 100644
--- a/src/bio_data_to_db/uniprot/utils.py
+++ b/src/bio_data_to_db/uniprot/utils.py
@@ -16,6 +16,32 @@
def create_empty_table(
uri: str,
):
+ """
+ Create an empty table in the database. Necessary to create the table structure before inserting data.
+
+ Note:
+ It runs the following SQL query:
+ ```sql
+ CREATE TABLE public.uniprot_info (
+ uniprot_pk_id BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY,
+ accessions TEXT[],
+ names TEXT[],
+ protein_names TEXT[],
+ gene_names TEXT[],
+ organism_scientific TEXT,
+ organism_commons TEXT[],
+ organism_synonyms TEXT[],
+ ncbi_taxonomy_id INT,
+ deargen_ncbi_taxonomy_id INT,
+ lineage TEXT[],
+ keywords TEXT[],
+ geneontology_ids TEXT[],
+ geneontology_names TEXT[],
+ sequence TEXT,
+ deargen_molecular_functions TEXT[]
+ )
+ ```
+ """
uri_wo_dbname, dbname = uri.rsplit("/", 1)
create_db_if_not_exists(uri_wo_dbname, dbname)
create_schema_if_not_exists(uri, "public")
@@ -55,6 +81,17 @@ def create_empty_table(
def create_accession_to_pk_id(uri: str):
+ """
+ Create a table to map accession to uniprot_pk_id, from the uniprot_info table.
+
+ It creates the following tables:
+
+ - accession_to_pk_id
+ - accession_to_pk_id_list
+
+ Note:
+ The mapping is not unique. It is possible to have multiple uniprot_pk_id for a single accession and vice versa.
+ """
with psycopg.connect(
conninfo=uri,
) as conn:
@@ -118,6 +155,9 @@ def keywords_tsv_to_postgresql(
schema_name="public",
table_name="keywords",
):
+ """
+ Load the keywords_all_2024_06_26.tsv (or similar version) file into the database.
+ """
tsv_columns = [
"Keyword ID",
"Name",
diff --git a/src/bio_data_to_db/utils/__init__.py b/src/bio_data_to_db/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/bio_data_to_db/utils/polars.py b/src/bio_data_to_db/utils/polars.py
new file mode 100644
index 0000000..7a6a00f
--- /dev/null
+++ b/src/bio_data_to_db/utils/polars.py
@@ -0,0 +1,26 @@
+from collections.abc import Callable
+from typing import Any
+
+import tqdm
+
+
+def w_pbar(pbar: tqdm.std.tqdm, func: Callable[..., Any]) -> Callable[..., Any]:
+    """
+    Wrap `func` so that every call first advances `pbar` by one step.
+
+    Intended for use with `map_elements` in `polars`.
+
+    Args:
+        pbar: Progress bar to advance on each call.
+        func: Function to wrap.
+
+    Returns:
+        A callable that updates the progress bar and then returns
+        `func(*args, **kwargs)`.
+
+    Examples:
+        >>> with tqdm(total=len(df)) as pbar:  # doctest: +SKIP
+        ...     df = df.with_columns(
+        ...         pl.col("in_col")
+        ...         .map_elements(w_pbar(pbar, lambda x: x + 1), return_dtype=pl.Int64)
+        ...     )
+
+    Reference:
+        - https://stackoverflow.com/questions/75550124/python-polars-how-to-add-a-progress-bars-to-apply-loops
+    """
+
+    def advance_then_call(*call_args, **call_kwargs):
+        # Tick first, then delegate unchanged to the wrapped function.
+        pbar.update(1)
+        outcome = func(*call_args, **call_kwargs)
+        return outcome
+
+    return advance_then_call
diff --git a/src/bio_data_to_db/utils/postgresql.py b/src/bio_data_to_db/utils/postgresql.py
index b9e0658..93da972 100644
--- a/src/bio_data_to_db/utils/postgresql.py
+++ b/src/bio_data_to_db/utils/postgresql.py
@@ -70,6 +70,9 @@ def polars_datatype_to_sqlalchemy_type(
def create_db_if_not_exists(uri_wo_db: str, db_name: str, comment: str | None = None):
+ """
+ Create a database if it doesn't exist.
+ """
with psycopg.connect(
conninfo=f"{uri_wo_db}",
) as conn:
@@ -110,6 +113,9 @@ def create_db_if_not_exists(uri_wo_db: str, db_name: str, comment: str | None =
def create_schema_if_not_exists(uri: str, schema_name: str, comment: str | None = None):
+ """
+ Create a schema if it doesn't exist. The DB should already exist.
+ """
db_name = uri.split("/")[-1]
with psycopg.connect(
conninfo=uri,
@@ -318,6 +324,9 @@ def split_column_str_to_list(
separator: str,
pg_element_type: str = "text",
):
+ """
+ Split a string column into a list column.
+ """
if pg_element_type.lower() not in {
"text",
}:
@@ -458,13 +467,13 @@ def polars_write_database(
"""
pl.DataFrame.write_database() but address the issue of writing unsigned and list columns to database.
- https://stackoverflow.com/questions/77098480/polars-psycopg2-write-column-of-lists-to-postgresql
+ Reference:
+ - https://stackoverflow.com/questions/77098480/polars-psycopg2-write-column-of-lists-to-postgresql
"""
if isinstance(connection, str):
connection = create_engine(connection)
columns_dtype = {col: df[col].dtype for col in df.columns}
-
column_name_to_sqlalchemy_type = {
col: polars_datatype_to_sqlalchemy_type(dtype)
for col, dtype in columns_dtype.items()
diff --git a/src/bio_data_to_db/utils/smiles.py b/src/bio_data_to_db/utils/smiles.py
new file mode 100644
index 0000000..d2d05d4
--- /dev/null
+++ b/src/bio_data_to_db/utils/smiles.py
@@ -0,0 +1,51 @@
+from functools import cache
+
+import polars as pl
+from rdkit import Chem
+from tqdm import tqdm
+
+from .polars import w_pbar
+
+
+@cache
+def canonical_smiles_wo_salt(smiles: str) -> str | None:
+    """
+    Return the canonical SMILES of the input with salt fragments removed.
+
+    The input is canonicalised and split on ".". When there are several
+    fragments, the longest one (by string length) is kept and the others are
+    treated as salt.
+    Shared function with dti-pytorch
+
+    Args:
+        smiles: SMILES string to convert.
+
+    Returns:
+        The canonical SMILES without salt, or None when the input cannot be
+        parsed or the kept fragment cannot be parsed on its own.
+    """
+    mol = Chem.MolFromSmiles(smiles)
+    if mol is None:
+        return None
+
+    fragments = Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True).split(".")
+    if len(fragments) == 1:
+        # No "." in the canonical form: nothing to strip.
+        return fragments[0]
+
+    longest = max(fragments, key=len)
+    # The kept fragment must still be a parsable molecule by itself.
+    return longest if Chem.MolFromSmiles(longest) is not None else None
+
+
+def polars_canonical_smiles_wo_salt(
+    df: pl.DataFrame,
+    *,
+    smiles_col: str = "smiles",
+    out_col: str = "canonical_smiles_wo_salt",
+) -> pl.DataFrame:
+    """
+    Add a column with the salt-free canonical SMILES, showing a tqdm progress bar.
+
+    Args:
+        df: DataFrame holding the SMILES strings.
+        smiles_col: Name of the column to read the SMILES from.
+        out_col: Name of the column that receives the converted SMILES.
+
+    Returns:
+        The DataFrame with `out_col` holding `canonical_smiles_wo_salt` applied
+        to `smiles_col`.
+    """
+    progress = tqdm(
+        total=df.shape[0], desc="Converting smiles to canonical smiles without salt"
+    )
+    with progress as pbar:
+        # Wrap the converter so each call advances the bar.
+        tracked_converter = w_pbar(pbar, canonical_smiles_wo_salt)
+        converted = (
+            pl.col(smiles_col)
+            .map_elements(tracked_converter, return_dtype=pl.Utf8)
+            .alias(out_col)
+        )
+        result = df.with_columns(converted)
+
+    return result