Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve how large molecules are automatically cut during mechanism generation #2660

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
152 changes: 94 additions & 58 deletions rmgpy/molecule/fragment.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
from rmgpy.molecule.molecule import Atom, Bond, Molecule
from rmgpy.molecule.atomtype import get_atomtype, AtomTypeError, ATOMTYPES, AtomType
from rdkit import Chem

from numpy.random import randint
# this variable is used to name atom IDs so that there are as few conflicts by
# using the entire space of integer objects
ATOM_ID_COUNTER = -(2**15)
Expand Down Expand Up @@ -836,7 +836,7 @@ def from_rdkit_mol(self, rdkitmol, atom_replace_dict=None):

return self

def cut_molecule(self, output_smiles=False, cut_through=True, size_threshold=None):
def cut_molecule(self, output_smiles=False, cut_through=True, size_threshold=5):
"""
For given input, output a list of cut fragments (either string or Fragment).
if output_smiles = True, the output list of fragments will be smiles.
Expand Down Expand Up @@ -888,15 +888,10 @@ def cut_molecule(self, output_smiles=False, cut_through=True, size_threshold=Non
frag_list.append(res_frag)
return frag_list

def sliceitup_arom(self, molecule, size_threshold=None):
def sliceitup_arom(self, molecule, size_threshold=5):
"""
Several specified aromatic patterns
"""
# set min size for each aliphatic fragment size
if size_threshold:
size_threshold = size_threshold
else:
size_threshold = 5
# if input is smiles string, output smiles
if isinstance(molecule, str):
molecule_smiles = molecule
Expand Down Expand Up @@ -950,29 +945,52 @@ def sliceitup_arom(self, molecule, size_threshold=None):
# mol_set contains new set of fragments
mol_set = Chem.GetMolFrags(new_mol, asMols=True)
# check all fragments' size
if all(
sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() == 6)
>= size_threshold
for mol in mol_set
):
# replace * at cutting position with cutting label
for ind, rdmol in enumerate(mol_set):
frag = Chem.MolToSmiles(rdmol)
if len(mol_set) > 2: # means it cut into 3 fragments
if frag.count("*") > 1:
# replace both with R
frag_smi = frag.replace("*", "R")
else:
frag_smi = frag.replace("*", "L")
else: # means it only cut once, generate 2 fragments
if ind == 0:
frag_smi = frag.replace("*", "R")
try:
if all(sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() == 6) >= size_threshold for mol in mol_set):
if len(mol_set) == 2:
frag1 = Chem.MolToSmiles(mol_set[0])
frag2 = Chem.MolToSmiles(mol_set[1])

frag1_R = frag1.count("Na")
frag1_L = frag1.count("K")
frag2_R = frag2.count("Na")
frag2_L = frag2.count("K")

if frag1_R > frag2_R and frag1_L <= frag2_L:
frag1_smi = frag1.replace("*", "L")
frag2_smi = frag2.replace("*", "R")
elif frag1_L > frag2_L and frag1_R <= frag2_R:
frag1_smi = frag1.replace("*", "R")
frag2_smi = frag2.replace("*", "L")
elif frag2_L > frag1_L and frag2_R <= frag1_R:
frag1_smi = frag1.replace("*", "L")
frag2_smi = frag2.replace("*", "R")
elif frag2_R > frag1_R and frag2_L <= frag1_L:
frag1_smi = frag1.replace("*", "R")
frag2_smi = frag2.replace("*", "L")
donerancl marked this conversation as resolved.
Show resolved Hide resolved
elif randint(0,1)==1:
donerancl marked this conversation as resolved.
Show resolved Hide resolved
frag1_smi = frag1.replace("*", "L")
frag2_smi = frag2.replace("*", "R")
else:
frag_smi = frag.replace("*", "L")
frag_list.append(frag_smi)
break
else:
# turn to next matched_atom_map
frag1_smi = frag1.replace("*", "R")
frag2_smi = frag2.replace("*", "L")

frag_list = [frag1_smi, frag2_smi]

elif len(mol_set) > 2: # means it cut into 3 fragments
frag_list = []
for ind, rdmol in enumerate(mol_set):
frag = Chem.MolToSmiles(rdmol)
if frag.count("*") > 1:
frag_smi = frag.replace("*", "R")
else:
frag_smi = frag.replace("*", "L")
frag_list.append(frag_smi)
break
else:
# turn to next matched_atom_map
continue
except:
donerancl marked this conversation as resolved.
Show resolved Hide resolved
continue
else:
# no match for this pattern
Expand Down Expand Up @@ -1014,15 +1032,10 @@ def sliceitup_arom(self, molecule, size_threshold=None):
frag_list_new.append(res_frag)
return frag_list_new

def sliceitup_aliph(self, molecule, size_threshold=None):
def sliceitup_aliph(self, molecule, size_threshold=5):
"""
Several specified aliphatic patterns
"""
# set min size for each aliphatic fragment size
if size_threshold:
size_threshold = size_threshold
else:
size_threshold = 5
# if input is smiles string, output smiles
if isinstance(molecule, str):
molecule_smiles = molecule
Expand Down Expand Up @@ -1079,29 +1092,52 @@ def sliceitup_aliph(self, molecule, size_threshold=None):
# mol_set contains new set of fragments
mol_set = Chem.GetMolFrags(new_mol, asMols=True)
# check all fragments' size
if all(
sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() == 6)
>= size_threshold
for mol in mol_set
):
# replace * at cutting position with cutting label
for ind, rdmol in enumerate(mol_set):
frag = Chem.MolToSmiles(rdmol)
if len(mol_set) > 2: # means it cut into 3 fragments
if frag.count("*") > 1:
# replace both with R
frag_smi = frag.replace("*", "R")
try:
if all(sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() == 6) >= size_threshold for mol in mol_set):
if len(mol_set) == 2:
frag1 = Chem.MolToSmiles(mol_set[0])
frag2 = Chem.MolToSmiles(mol_set[1])

frag1_R = frag1.count("Na")
frag1_L = frag1.count("K")
frag2_R = frag2.count("Na")
frag2_L = frag2.count("K")

if frag1_R > frag2_R and frag1_L <= frag2_L:
frag1_smi = frag1.replace("*", "L")
frag2_smi = frag2.replace("*", "R")
elif frag1_L > frag2_L and frag1_R <= frag2_R:
frag1_smi = frag1.replace("*", "R")
frag2_smi = frag2.replace("*", "L")
elif frag2_L > frag1_L and frag2_R <= frag1_R:
frag1_smi = frag1.replace("*", "L")
frag2_smi = frag2.replace("*", "R")
elif frag2_R > frag1_R and frag2_L <= frag1_L:
frag1_smi = frag1.replace("*", "R")
frag2_smi = frag2.replace("*", "L")
elif randint(0,1)==1:
frag1_smi = frag1.replace("*", "L")
frag2_smi = frag2.replace("*", "R")
else:
frag_smi = frag.replace("*", "L")
else: # means it only cut once, generate 2 fragments
if ind == 0:
frag_smi = frag.replace("*", "R")
else:
frag_smi = frag.replace("*", "L")
frag_list.append(frag_smi)
break
else:
# turn to next matched_atom_map
frag1_smi = frag1.replace("*", "R")
frag2_smi = frag2.replace("*", "L")

frag_list = [frag1_smi, frag2_smi]

elif len(mol_set) > 2: # means it cut into 3 fragments
frag_list = []
for ind, rdmol in enumerate(mol_set):
frag = Chem.MolToSmiles(rdmol)
if frag.count("*") > 1:
frag_smi = frag.replace("*", "R")
else:
frag_smi = frag.replace("*", "L")
frag_list.append(frag_smi)
break
else:
# turn to next matched_atom_map
continue
except:
continue
else:
# no match for this pattern
Expand Down
Loading
Loading