From 2a249f53359cf75438ceef02186d4181c2fd8c97 Mon Sep 17 00:00:00 2001
From: Kiyoon Kim
Date: Thu, 1 Aug 2024 14:08:26 +0900
Subject: [PATCH] feat: polars_canonical_smiles_wo_salt

---
 src/bio_data_to_db/utils/polars.py     | 26 +++++++++++++
 src/bio_data_to_db/utils/postgresql.py |  1 -
 src/bio_data_to_db/utils/smiles.py     | 51 ++++++++++++++++++++++++++
 3 files changed, 77 insertions(+), 1 deletion(-)
 create mode 100644 src/bio_data_to_db/utils/polars.py
 create mode 100644 src/bio_data_to_db/utils/smiles.py

diff --git a/src/bio_data_to_db/utils/polars.py b/src/bio_data_to_db/utils/polars.py
new file mode 100644
index 0000000..10b14b6
--- /dev/null
+++ b/src/bio_data_to_db/utils/polars.py
@@ -0,0 +1,26 @@
+from collections.abc import Callable
+from typing import Any
+
+import tqdm
+
+
+def w_pbar(pbar: tqdm.std.tqdm, func: Callable[..., Any]) -> Callable[..., Any]:
+    """
+    Apply progress bar when using `map_elements` in `polars`.
+
+    Example:
+        >>> with tqdm(total=len(df)) as pbar:  # doctest: +SKIP
+        ...     df = df.with_columns(
+        ...         pl.col("in_col")
+        ...         .map_elements(w_pbar(pbar, lambda x: x + 1), return_dtype=pl.Int64)
+        ...     )
+
+    Reference:
+        - https://stackoverflow.com/questions/75550124/python-polars-how-to-add-a-progress-bars-to-apply-loops
+    """
+
+    def foo(*args, **kwargs):
+        pbar.update(1)
+        return func(*args, **kwargs)
+
+    return foo
diff --git a/src/bio_data_to_db/utils/postgresql.py b/src/bio_data_to_db/utils/postgresql.py
index b9e0658..9c83377 100644
--- a/src/bio_data_to_db/utils/postgresql.py
+++ b/src/bio_data_to_db/utils/postgresql.py
@@ -464,7 +464,6 @@ def polars_write_database(
         connection = create_engine(connection)
 
     columns_dtype = {col: df[col].dtype for col in df.columns}
-
     column_name_to_sqlalchemy_type = {
         col: polars_datatype_to_sqlalchemy_type(dtype)
         for col, dtype in columns_dtype.items()
diff --git a/src/bio_data_to_db/utils/smiles.py b/src/bio_data_to_db/utils/smiles.py
new file mode 100644
index 0000000..1195a23
--- /dev/null
+++ b/src/bio_data_to_db/utils/smiles.py
@@ -0,0 +1,51 @@
+from functools import cache
+
+import polars as pl
+from rdkit import Chem
+from tqdm import tqdm
+
+from .polars import w_pbar
+
+
+@cache
+def canonical_smiles_wo_salt(smiles):
+    """
+    Get the canonical SMILES without salt from the input SMILES.
+
+    Salt is a short part separated by "." in the SMILES.
+    Shared function with dti-pytorch
+    """
+    m = Chem.MolFromSmiles(smiles)
+    if m is not None:
+        canonical_smiles = Chem.MolToSmiles(m, isomericSmiles=True, canonical=True)
+        split_smi = canonical_smiles.split(".")
+        if len(split_smi) > 1:
+            smiles_wo_salt = max(split_smi, key=len)
+            if Chem.MolFromSmiles(smiles_wo_salt) is None:
+                smiles_wo_salt = None
+        else:
+            smiles_wo_salt = split_smi[0]
+    else:
+        smiles_wo_salt = None
+    return smiles_wo_salt
+
+
+def polars_canonical_smiles_wo_salt(
+    df: pl.DataFrame,
+    *,
+    smiles_col: str = "smiles",
+    out_col: str = "canonical_smiles_wo_salt",
+):
+    """
+    Apply canonical_smiles_wo_salt on the DataFrame with tqdm.
+    """
+    with tqdm(
+        total=df.shape[0], desc="Converting smiles to canonical smiles without salt"
+    ) as pbar:
+        df = df.with_columns(
+            pl.col(smiles_col)
+            .map_elements(w_pbar(pbar, canonical_smiles_wo_salt), return_dtype=pl.Utf8)
+            .alias(out_col),
+        )
+
+    return df