Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support of R-groups to the CDX loader. #36 #946

Merged
merged 2 commits into from
Dec 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions api/tests/integration/ref/formats/cdx_to_ket.py.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
*** CDXML to mol ***
simple_rgrp64.cdx
{"root":{"nodes":[{"$ref":"mol0"},{"$ref":"rg1"}]},"mol0":{"type":"molecule","atoms":[{"label":"C","location":[0.4969991147518158,-0.639666736125946,0.0]},{"type":"rg-label","$refs":["rg-1"],"location":[0.49533283710479739,-0.1623331755399704,0.0]},{"label":"C","location":[0.011334228329360485,-0.6336659789085388,0.0]},{"label":"C","location":[0.009999592788517475,-0.1623331755399704,0.0]}],"bonds":[{"type":1,"atoms":[0,1]},{"type":1,"atoms":[0,2]},{"type":1,"atoms":[1,3]},{"type":1,"atoms":[2,3]}]},"rg1":{"rlogic":{"number":1},"type":"rgroup","atoms":[{"label":"C","location":[27.818666458129884,-13.968999862670899,0.0]},{"label":"C","location":[28.23433494567871,-14.208999633789063,0.0]},{"label":"C","location":[28.649999618530275,-13.968999862670899,0.0]},{"label":"C","location":[29.06566619873047,-14.208999633789063,0.0]},{"label":"Cl","location":[29.481332778930665,-13.968999862670899,0.0]}],"bonds":[{"type":1,"atoms":[0,1]},{"type":1,"atoms":[1,2]},{"type":1,"atoms":[2,3]},{"type":1,"atoms":[3,4]}]}}
28 changes: 28 additions & 0 deletions api/tests/integration/tests/formats/cdx_to_ket.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import os
import sys

sys.path.append(
os.path.normpath(
os.path.join(os.path.abspath(__file__), "..", "..", "..", "common")
)
)
from env_indigo import * # noqa

# Integration test driver: walks the CDX samples in name order and prints each
# file name followed by the JSON produced by mol.json().
indigo = Indigo()
indigo.setOption("molfile-saving-skip-date", True)

print("*** CDXML to mol ***")

root = joinPathPy("molecules/cdx", __file__)
files = sorted(os.listdir(root))
for filename in files:
    print(filename)
    path = os.path.join(root, filename)
    try:
        mol = indigo.loadMoleculeFromFile(path)
        print(mol.json())
    except IndigoException as e:
        # Loading as a plain molecule failed: report why, then retry the
        # same file as a query molecule.
        print(getIndigoExceptionText(e))
        print("*** Try as Query ***")
        mol = indigo.loadQueryMoleculeFromFile(path)
        print(mol.json())
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
VmpDRDAxMDAEAwIBAAAAAAAAAAAAAACAAAAAAAMAFQAAAENoZW1EcmF3IEpTIDIu
MC4wLjkEAhAAvVl5ASOP3QJw/bgBHkWEAwEJCAAAAAAAAAAAAAIJCAAAANIEAADb
CQ0IAQABCAcBAAE6BAEAATsEAQAARQQBAAE8BAEAAEoEAQAADAYBAAEPBgEAAQ0G
AQAAQgQBAABDBAEAAEQEAQAADggCAJoCCggIABgAYADIAAMACwgIABgAAADIAAMA
CQgEAACAAgAICAQAmZkBAAcIBACZmQAABggEAAAAAgAFCAQAZmYOAAQIAgC0AAMI
BAAAAHgAIwgBAAUMCAEAACgIAQABKQgBAAEqCAEAATIIAQAAKwgBACgsCAEACi0I
AQABLggBAAACCBAAAAAkAAAAJAAAACQAAAAkAAEDAgAAAAIDAgABAAADMgAIAP//
/////wAAAAAAAP//AAAAAP////8AAAAA//8AAAAA/////wAAAAD/////AAD//wAB
DwAAAAEAGADp/QUAQXJpYWwACHgAAAMAAAEgASAAAAAAC2YIoP+E/4gL4wkYA2cF
JwP8AAIAAAEgASAAAAAAC2YIoAABAAAAZAAAAAEAAQEBAAAAAScPAAEAAQAAAAAA
AAAAAAAAAAACABkBkAAAAAAAQAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAtAsC
AAAAtQsUAAAAQ2hlbWljYWwgRm9ybXVsYTogtgsOAAAARXhhY3QgTWFzczogtwsU
AAAATW9sZWN1bGFyIFdlaWdodDoguAsHAAAAbS96OiC5CxYAAABFbGVtZW50YWwg
QW5hbHlzaXM6ILoLEQAAAEJvaWxpbmcgUG9pbnQ6ILsLEQAAAE1lbHRpbmcgUG9p
bnQ6ILwLEQAAAENyaXRpY2FsIFRlbXA6IL0LEQAAAENyaXRpY2FsIFByZXM6IL4L
EAAAAENyaXRpY2FsIFZvbDogvwsQAAAAR2liYnMgRW5lcmd5OiDACwkAAABMb2cg
UDogwQsGAAAATVI6IMILDwAAAEhlbnJ5J3MgTGF3OiDDCxAAAABIZWF0IG9mIEZv
cm06IMQLCAAAAHRQU0E6IMkLAgAAAMoLAgAAAAsMAgABAAoMAQAACQwBAAAMDAUA
AAAoIykBgBwAAAAEAhAAAAAAAAAAAACFqzYDAACSBhIIBAAAAJIGEwgEAIWrNgMW
CAQAAAAkABgIBAAAACQAFAgEAAAAAAAZCAAAEAgCAAEADwgCAAEAEQgBAAEDgAEA
AAAEAhAAvVl5ASOP3QL+14wBtTP0AgoAAgABAASAEAAAAAACCAA9iowBUXjsAgoA
AgACADcEAQABAAAEgBEAAAAAAggAUTh+AYVr7AIKAAIAAwAABAIACQBIBAAANwQB
AAE0BAQAAgAAADEECAAVAAAAFwAAAAaAAAAAAAACCAA0sYEBJc/oAgQCEAC9WXkB
Jc/oAjRxgwG1M/QCIwgBAAACBwIAAAAFBwEAAQAHDgABAAAAGABgAMgAAABSMQkH
DgABAAAAGABgAMgAAABSMQAAAAAEgBMAAAAAAggAKFyMAWbm3QIKAAIABAA3BAEA
AQAABIAUAAAAAAIIAFE4fgEo3N0CCgACAAUANwQBAAEAAAWAFQAAAAoAAgAGAAQG
BAAQAAAABQYEABEAAAAKBgEAAQAABYAWAAAACgACAAcABAYEABAAAAAFBgQAEwAA
AAoGAQABAAAFgBcAAAAKAAIACAAEBgQAEQAAAAUGBAAUAAAACgYBAAEAAAWAGAAA
AAoAAgAJAAQGBAATAAAABQYEABQAAAAKBgEAAQAAAAAKgAIAAAAEAhAAhat9Abhe
JwNw/bgBHkWEAwALEAAeRX4BUfgnAz2KlAFm5joDAQsQAD2KlAFR+CcD12O4AYWr
gwMCCwIAAQADgAQAAAAEAhAAVzOeAR/FMQOZma4Bw0V6AwoAAgALAASABQAAAAAC
CADrEaMBXI9CAwoAAgAMADcEAQABAAAEgAYAAAAAAggAHkWqAa4HTwMKAAIADQA3
BAEAAQAABIAHAAAAAAIIAOsRowEAgFsDCgACAA4ANwQBAAEAAASACAAAAAACCAAe
RaoBUfhnAwoAAgAPADcEAQABAAAEgAkAAAAAAggA6xGjAaNwdAMKAAIAEAACBAIA
EQArBAIAAABIBAAANwQBAAEGgAAAAAAAAggAzoqmAUPUcAMEAhAAVzOeAUPUcAMB
vqYBw0V6AyMIAQAAAgcCAAAABQcBAAEABw4AAQAAABgAYADIAAAAQ2wJBw4AAQAA
ABgAYADIAAAAQ2wAAAAABIAOAAAAAAIIAABAqgG4HjYDCgACABEAAAQCAAwASwQB
AAE3BAEAAQAABYAKAAAACgACABIABAYEAAUAAAAFBgQABgAAAAoGAQABAAAFgAsA
AAAKAAIAEwAEBgQABgAAAAUGBAAHAAAACgYBAAEAAAWADAAAAAoAAgAUAAQGBAAH
AAAABQYEAAgAAAAKBgEAAQAABYANAAAACgACABUABAYEAAgAAAAFBgQACQAAAAoG
AQABAAAFgA8AAAAKAAIAFgAEBgQADgAAAAUGBAAFAAAACgYBAAEAAAAABoAdAAAA
AAIIAJnZigEeRS0DBAIQAIWrfQG4XicDcP24AR5FhAMKAAIACgAHBwIAAAACBwIA
AAAABw4AAQAAABgAYADIAAAAUjEJBw4AAQAAABgAYADIAAAAUjEAAAAAAAAAAAAA
4 changes: 4 additions & 0 deletions core/indigo-core/molecule/molecule_cdxml_loader.h
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,9 @@ namespace indigo
AutoInt enhanced_stereo_group;
AutoInt index;
AutoInt geometry;
AutoInt alt_group_id;
AutoInt rg_index;

bool is_not_list;
std::vector<AutoInt> element_list;
std::unordered_map<int, int> bond_id_to_connection_idx;
Expand Down Expand Up @@ -893,6 +896,7 @@ namespace indigo

void _parseGraphic(CDXElement elem);
void _parseArrow(CDXElement elem);
void _parseAltGroup(CDXElement elem);

void _addAtomsAndBonds(BaseMolecule& mol, const std::vector<int>& atoms, const std::vector<CdxmlBond>& bonds);
void _addBracket(BaseMolecule& mol, const CdxmlBracket& bracket);
Expand Down
108 changes: 84 additions & 24 deletions core/indigo-core/molecule/src/molecule_cdxml_loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,14 +152,22 @@ void MoleculeCdxmlLoader::_parseCollections(BaseMolecule& mol)
int node_idx = _id_to_node_index.at(node.id);
switch (node.type)
{
case kCDXNodeType_NamedAlternativeGroup:
case kCDXNodeType_Element:
case kCDXNodeType_ElementList:
atoms.push_back(node_idx);
break;
case kCDXNodeType_ExternalConnectionPoint: {
auto& fn = nodes[_fragment_nodes.back()];
if (fn.connections.size() == 0)
fn.ext_connections.push_back(node.id);
if (_fragment_nodes.size())
{
auto& fn = nodes[_fragment_nodes.back()];
if (fn.connections.size() == 0)
fn.ext_connections.push_back(node.id);
}
else
{
// handle free external connection. attachment point?
}
}
break;
case kCDXNodeType_Nickname:
Expand Down Expand Up @@ -366,9 +374,11 @@ void MoleculeCdxmlLoader::_parseCDXMLElements(CDXElement elem, bool no_siblings,

auto arrow_lambda = [this](CDXElement elem) { this->_parseArrow(elem); };

auto altgroup_lambda = [this](CDXElement elem) { this->_parseAltGroup(elem); };

std::unordered_map<std::string, std::function<void(CDXElement elem)>> cdxml_dispatcher = {
{"n", node_lambda}, {"b", bond_lambda}, {"fragment", fragment_lambda}, {"group", group_lambda}, {"bracketedgroup", bracketed_lambda},
{"t", text_lambda}, {"graphic", graphic_lambda}, {"arrow", arrow_lambda}};
{"n", node_lambda}, {"b", bond_lambda}, {"fragment", fragment_lambda}, {"group", group_lambda}, {"bracketedgroup", bracketed_lambda},
{"t", text_lambda}, {"graphic", graphic_lambda}, {"arrow", arrow_lambda}, {"altgroup", altgroup_lambda}};

for (elem; elem.hasContent(); elem = elem.nextSiblingElement())
{
Expand Down Expand Up @@ -416,6 +426,10 @@ void MoleculeCdxmlLoader::_addAtomsAndBonds(BaseMolecule& mol, const std::vector
if (_pmol)
{
atom_idx = _pmol->addAtom(atom.element);

if (atom.type == kCDXNodeType_NamedAlternativeGroup)
mol.allowRGroupOnRSite(atom_idx, atom.rg_index);

_id_to_atom_idx.emplace(atom.id, atom_idx);
mol.setAtomXyz(atom_idx, atom.pos);
_pmol->setAtomCharge_Silent(atom_idx, atom.charge);
Expand Down Expand Up @@ -752,7 +766,11 @@ void MoleculeCdxmlLoader::_parseNode(CdxmlNode& node, CDXElement elem)

auto stereo_lambda = [&node](const std::string& data) { node.stereo = KCIPStereochemistryCharToIndex.at(data.front()); };

auto node_type_lambda = [&node](const std::string& data) { node.type = KNodeTypeNameToInt.at(data); };
auto node_type_lambda = [&node](const std::string& data) {
node.type = KNodeTypeNameToInt.at(data);
if (node.type == kCDXNodeType_NamedAlternativeGroup)
node.element = ELEM_RSITE;
};

auto element_list_lambda = [&node](const std::string& data) {
std::vector<std::string> elements = split(data, ' ');
Expand All @@ -770,25 +788,36 @@ void MoleculeCdxmlLoader::_parseNode(CdxmlNode& node, CDXElement elem)

auto enhanced_stereo_group_lambda = [&node](const std::string& data) { node.enhanced_stereo_group = data; };

std::unordered_map<std::string, std::function<void(const std::string&)>> node_dispatcher = {
{"id", id_lambda},
{"p", pos_lambda},
{"xyz", pos_lambda},
{"NumHydrogens", hydrogens_lambda},
{"Charge", charge_lambda},
{"Isotope", isotope_lambda},
{"Radical", radical_lambda},
{"AS", stereo_lambda},
{"NodeType", node_type_lambda},
{"Element", element_lambda},
{"GenericNickname", label_lambda},
{"ElementList", element_list_lambda},
{"BondOrdering", bond_ordering_lambda},
{"Geometry", geometry_lambda},
{"EnhancedStereoType", enhanced_stereo_type_lambda},
{"EnhancedStereoGroupNum", enhanced_stereo_group_lambda},
};
auto alt_group_id_lambda = [&node](const std::string& data) { node.alt_group_id = data; };

std::unordered_map<std::string, std::function<void(const std::string&)>> node_dispatcher = {{"id", id_lambda},
{"p", pos_lambda},
{"xyz", pos_lambda},
{"NumHydrogens", hydrogens_lambda},
{"Charge", charge_lambda},
{"Isotope", isotope_lambda},
{"Radical", radical_lambda},
{"AS", stereo_lambda},
{"NodeType", node_type_lambda},
{"Element", element_lambda},
{"GenericNickname", label_lambda},
{"ElementList", element_list_lambda},
{"BondOrdering", bond_ordering_lambda},
{"Geometry", geometry_lambda},
{"EnhancedStereoType", enhanced_stereo_type_lambda},
{"EnhancedStereoGroupNum", enhanced_stereo_group_lambda},
{"AltGroupID", alt_group_id_lambda}};
applyDispatcher(elem.firstProperty(), node_dispatcher);
for (auto child_elem = elem.firstChildElement(); child_elem.hasContent(); child_elem = child_elem.nextSiblingElement())
{
if (child_elem.name() == "t")
{
std::string label;
_parseLabel(child_elem, label);
if (label.find("R") == 0)
node.rg_index = label.substr(1);
}
}
}

void MoleculeCdxmlLoader::_addNode(CdxmlNode& node)
Expand Down Expand Up @@ -879,6 +908,37 @@ void MoleculeCdxmlLoader::parseBBox(const std::string& data, Rect2f& bbox)
throw Error("Not enought coordinates for text bounding box");
}

/**
 * Handles an "altgroup" element by turning its content into an R-group fragment.
 *
 * The direct children of @p elem are scanned: every "fragment" child is
 * remembered, and every "t" child whose label text starts with "R" contributes
 * the remainder of that text as an R-group number. If at least one fragment and
 * one number were found, the first fragment is loaded through a nested loader
 * into a fresh molecule and added to the R-group selected by the first number.
 * Any further fragments or labels are collected but not used.
 *
 * @param elem the "altgroup" element to parse.
 */
void MoleculeCdxmlLoader::_parseAltGroup(CDXElement elem)
{
    std::vector<AutoInt> rgroup_numbers;
    std::vector<CDXElement> fragment_elems;

    auto child = elem.firstChildElement();
    while (child.hasContent())
    {
        const auto tag = child.name();
        if (tag == "fragment")
        {
            fragment_elems.push_back(child);
        }
        else if (tag == "t")
        {
            std::string text;
            _parseLabel(child, text);
            // A label names an R-group only when it begins with "R"; what follows is its number.
            if (text.rfind("R", 0) == 0)
                rgroup_numbers.push_back(text.substr(1));
        }
        child = child.nextSiblingElement();
    }

    // Nothing to register unless both a fragment and an R-label are present.
    if (fragment_elems.empty() || rgroup_numbers.empty())
        return;

    MoleculeCdxmlLoader nested_loader(_scanner, _is_binary);
    // Work on whichever target molecule this loader was given.
    BaseMolecule& owner = _pmol ? *static_cast<BaseMolecule*>(_pmol) : *static_cast<BaseMolecule*>(_pqmol);
    std::unique_ptr<BaseMolecule> rgroup_fragment(owner.neu());
    nested_loader.stereochemistry_options = stereochemistry_options;
    nested_loader.loadMoleculeFromFragment(*rgroup_fragment, fragment_elems.front());

    RGroup& target = owner.rgroups.getRGroup(rgroup_numbers.front());
    // NOTE(review): the raw pointer is handed over here; presumably the fragment pool takes ownership — confirm.
    target.fragments.add(rgroup_fragment.release());
}

void MoleculeCdxmlLoader::_parseGraphic(CDXElement elem)
{
AutoInt superseded_id = 0;
Expand Down