Skip to content

Commit

Permalink
feat: polars_canonical_smiles_wo_salt
Browse files Browse the repository at this point in the history
  • Loading branch information
kiyoon committed Aug 1, 2024
1 parent 0e576ba commit 2a249f5
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 1 deletion.
26 changes: 26 additions & 0 deletions src/bio_data_to_db/utils/polars.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from collections.abc import Callable
from functools import wraps
from typing import Any

import tqdm


def w_pbar(pbar: tqdm.std.tqdm, func: Callable[..., Any]) -> Callable[..., Any]:
    """
    Wrap `func` so that every call advances `pbar` by one step.

    Apply progress bar when using `map_elements` in `polars`.

    Args:
        pbar: Progress bar to advance. Only its `update` method is used.
        func: Callable to wrap. Positional and keyword arguments are forwarded
            to it unchanged.

    Returns:
        A callable that calls `pbar.update(1)` and then returns
        `func(*args, **kwargs)`. The metadata of `func` (`__name__`,
        `__doc__`, `__wrapped__`, ...) is preserved on the wrapper.

    Example:
        >>> with tqdm(total=len(df)) as pbar:  # doctest: +SKIP
        ...     df = df.with_columns(
        ...         pl.col("in_col")
        ...         .map_elements(w_pbar(pbar, lambda x: x + 1), return_dtype=pl.Int64)
        ...     )

    Reference:
        - https://stackoverflow.com/questions/75550124/python-polars-how-to-add-a-progress-bars-to-apply-loops
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        # The bar is advanced before the call, so an element whose `func`
        # raises is still counted as processed.
        pbar.update(1)
        return func(*args, **kwargs)

    return wrapper
1 change: 0 additions & 1 deletion src/bio_data_to_db/utils/postgresql.py
Original file line number Diff line number Diff line change
Expand Up @@ -464,7 +464,6 @@ def polars_write_database(
connection = create_engine(connection)

columns_dtype = {col: df[col].dtype for col in df.columns}

column_name_to_sqlalchemy_type = {
col: polars_datatype_to_sqlalchemy_type(dtype)
for col, dtype in columns_dtype.items()
Expand Down
51 changes: 51 additions & 0 deletions src/bio_data_to_db/utils/smiles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from functools import cache

import polars as pl
from rdkit import Chem
from tqdm import tqdm

from .polars import w_pbar


@cache
def canonical_smiles_wo_salt(smiles):
    """
    Return the canonical SMILES of the input with salt components removed.

    The input is parsed with RDKit and written back as canonical, isomeric
    SMILES. Components are separated by "." in that string; when there is
    more than one, only the longest (by string length) is kept and the shorter
    parts are treated as salt. Results are memoized per input by
    `functools.cache`.

    Returns None when the input cannot be parsed, or when the kept component
    cannot be parsed on its own.

    Shared function with dti-pytorch
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    fragments = Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True).split(".")
    if len(fragments) == 1:
        # Single component: nothing to strip.
        return fragments[0]

    # On a tie in length, `max` keeps the first of the longest fragments.
    longest = max(fragments, key=len)
    if Chem.MolFromSmiles(longest) is None:
        return None
    return longest


def polars_canonical_smiles_wo_salt(
    df: pl.DataFrame,
    *,
    smiles_col: str = "smiles",
    out_col: str = "canonical_smiles_wo_salt",
):
    """
    Add a salt-free canonical SMILES column to the DataFrame, showing progress with tqdm.

    `canonical_smiles_wo_salt` is mapped over `smiles_col` element by element
    and the result is stored as a string column named `out_col`. The progress
    bar total is the number of rows in `df`.

    Returns the DataFrame produced by `with_columns`.
    """
    n_rows = df.shape[0]
    with tqdm(
        total=n_rows, desc="Converting smiles to canonical smiles without salt"
    ) as pbar:
        # Each element-wise call ticks the bar once before converting.
        convert = w_pbar(pbar, canonical_smiles_wo_salt)
        out_expr = (
            pl.col(smiles_col)
            .map_elements(convert, return_dtype=pl.Utf8)
            .alias(out_col)
        )
        result = df.with_columns(out_expr)

    return result

0 comments on commit 2a249f5

Please sign in to comment.