From 23585459c6491ab5e61679433555f2e9d3681896 Mon Sep 17 00:00:00 2001 From: Roman Porozhnetov Date: Wed, 21 Dec 2022 14:22:48 +0100 Subject: [PATCH] rgroups --- .../integration/ref/formats/cdx_to_ket.py.out | 3 + .../integration/tests/formats/cdx_to_ket.py | 28 +++++ .../formats/molecules/cdx/simple_rgrp64.cdx | 43 +++++++ .../molecule/molecule_cdxml_loader.h | 4 + .../molecule/src/molecule_cdxml_loader.cpp | 108 ++++++++++++++---- 5 files changed, 162 insertions(+), 24 deletions(-) create mode 100644 api/tests/integration/ref/formats/cdx_to_ket.py.out create mode 100644 api/tests/integration/tests/formats/cdx_to_ket.py create mode 100644 api/tests/integration/tests/formats/molecules/cdx/simple_rgrp64.cdx diff --git a/api/tests/integration/ref/formats/cdx_to_ket.py.out b/api/tests/integration/ref/formats/cdx_to_ket.py.out new file mode 100644 index 0000000000..47c1a2826c --- /dev/null +++ b/api/tests/integration/ref/formats/cdx_to_ket.py.out @@ -0,0 +1,3 @@ +*** CDXML to mol *** +simple_rgrp64.cdx 
+{"root":{"nodes":[{"$ref":"mol0"},{"$ref":"rg1"}]},"mol0":{"type":"molecule","atoms":[{"label":"C","location":[0.4969991147518158,-0.639666736125946,0.0]},{"type":"rg-label","$refs":["rg-1"],"location":[0.49533283710479739,-0.1623331755399704,0.0]},{"label":"C","location":[0.011334228329360485,-0.6336659789085388,0.0]},{"label":"C","location":[0.009999592788517475,-0.1623331755399704,0.0]}],"bonds":[{"type":1,"atoms":[0,1]},{"type":1,"atoms":[0,2]},{"type":1,"atoms":[1,3]},{"type":1,"atoms":[2,3]}]},"rg1":{"rlogic":{"number":1},"type":"rgroup","atoms":[{"label":"C","location":[27.818666458129884,-13.968999862670899,0.0]},{"label":"C","location":[28.23433494567871,-14.208999633789063,0.0]},{"label":"C","location":[28.649999618530275,-13.968999862670899,0.0]},{"label":"C","location":[29.06566619873047,-14.208999633789063,0.0]},{"label":"Cl","location":[29.481332778930665,-13.968999862670899,0.0]}],"bonds":[{"type":1,"atoms":[0,1]},{"type":1,"atoms":[1,2]},{"type":1,"atoms":[2,3]},{"type":1,"atoms":[3,4]}]}}
diff --git a/api/tests/integration/tests/formats/cdx_to_ket.py b/api/tests/integration/tests/formats/cdx_to_ket.py
new file mode 100644
index 0000000000..598b35ecf5
--- /dev/null
+++ b/api/tests/integration/tests/formats/cdx_to_ket.py
@@ -0,0 +1,28 @@
+# Integration test: prints mol.json() for every file found under molecules/cdx.
+
+import os
+import sys
+
+# Extend sys.path with the "common" directory (resolved relative to this file) before importing env_indigo.
+_this_file = os.path.abspath(__file__)
+sys.path.append(os.path.normpath(os.path.join(_this_file, "..", "..", "..", "common")))
+from env_indigo import *  # noqa
+
+indigo = Indigo()
+indigo.setOption("molfile-saving-skip-date", True)
+
+# The banner is printed verbatim; the reference output file expects exactly this text.
+print("*** CDXML to mol ***")
+
+root = joinPathPy("molecules/cdx", __file__)
+# Iterate in sorted order so the printed sequence does not depend on directory order.
+for filename in sorted(os.listdir(root)):
+    path = os.path.join(root, filename)
+    print(filename)
+    try:
+        print(indigo.loadMoleculeFromFile(path).json())
+    except IndigoException as e:
+        # Report the failure, then load the same file as a query molecule instead.
+        print(getIndigoExceptionText(e))
+        print("*** Try as Query ***")
+        print(indigo.loadQueryMoleculeFromFile(path).json())
diff --git
a/api/tests/integration/tests/formats/molecules/cdx/simple_rgrp64.cdx b/api/tests/integration/tests/formats/molecules/cdx/simple_rgrp64.cdx new file mode 100644 index 0000000000..f7adcf0c6f --- /dev/null +++ b/api/tests/integration/tests/formats/molecules/cdx/simple_rgrp64.cdx @@ -0,0 +1,43 @@ +VmpDRDAxMDAEAwIBAAAAAAAAAAAAAACAAAAAAAMAFQAAAENoZW1EcmF3IEpTIDIu +MC4wLjkEAhAAvVl5ASOP3QJw/bgBHkWEAwEJCAAAAAAAAAAAAAIJCAAAANIEAADb +CQ0IAQABCAcBAAE6BAEAATsEAQAARQQBAAE8BAEAAEoEAQAADAYBAAEPBgEAAQ0G +AQAAQgQBAABDBAEAAEQEAQAADggCAJoCCggIABgAYADIAAMACwgIABgAAADIAAMA +CQgEAACAAgAICAQAmZkBAAcIBACZmQAABggEAAAAAgAFCAQAZmYOAAQIAgC0AAMI +BAAAAHgAIwgBAAUMCAEAACgIAQABKQgBAAEqCAEAATIIAQAAKwgBACgsCAEACi0I +AQABLggBAAACCBAAAAAkAAAAJAAAACQAAAAkAAEDAgAAAAIDAgABAAADMgAIAP// +/////wAAAAAAAP//AAAAAP////8AAAAA//8AAAAA/////wAAAAD/////AAD//wAB +DwAAAAEAGADp/QUAQXJpYWwACHgAAAMAAAEgASAAAAAAC2YIoP+E/4gL4wkYA2cF +JwP8AAIAAAEgASAAAAAAC2YIoAABAAAAZAAAAAEAAQEBAAAAAScPAAEAAQAAAAAA +AAAAAAAAAAACABkBkAAAAAAAQAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAtAsC +AAAAtQsUAAAAQ2hlbWljYWwgRm9ybXVsYTogtgsOAAAARXhhY3QgTWFzczogtwsU +AAAATW9sZWN1bGFyIFdlaWdodDoguAsHAAAAbS96OiC5CxYAAABFbGVtZW50YWwg +QW5hbHlzaXM6ILoLEQAAAEJvaWxpbmcgUG9pbnQ6ILsLEQAAAE1lbHRpbmcgUG9p +bnQ6ILwLEQAAAENyaXRpY2FsIFRlbXA6IL0LEQAAAENyaXRpY2FsIFByZXM6IL4L +EAAAAENyaXRpY2FsIFZvbDogvwsQAAAAR2liYnMgRW5lcmd5OiDACwkAAABMb2cg +UDogwQsGAAAATVI6IMILDwAAAEhlbnJ5J3MgTGF3OiDDCxAAAABIZWF0IG9mIEZv +cm06IMQLCAAAAHRQU0E6IMkLAgAAAMoLAgAAAAsMAgABAAoMAQAACQwBAAAMDAUA +AAAoIykBgBwAAAAEAhAAAAAAAAAAAACFqzYDAACSBhIIBAAAAJIGEwgEAIWrNgMW +CAQAAAAkABgIBAAAACQAFAgEAAAAAAAZCAAAEAgCAAEADwgCAAEAEQgBAAEDgAEA +AAAEAhAAvVl5ASOP3QL+14wBtTP0AgoAAgABAASAEAAAAAACCAA9iowBUXjsAgoA +AgACADcEAQABAAAEgBEAAAAAAggAUTh+AYVr7AIKAAIAAwAABAIACQBIBAAANwQB +AAE0BAQAAgAAADEECAAVAAAAFwAAAAaAAAAAAAACCAA0sYEBJc/oAgQCEAC9WXkB +Jc/oAjRxgwG1M/QCIwgBAAACBwIAAAAFBwEAAQAHDgABAAAAGABgAMgAAABSMQkH +DgABAAAAGABgAMgAAABSMQAAAAAEgBMAAAAAAggAKFyMAWbm3QIKAAIABAA3BAEA 
+AQAABIAUAAAAAAIIAFE4fgEo3N0CCgACAAUANwQBAAEAAAWAFQAAAAoAAgAGAAQG +BAAQAAAABQYEABEAAAAKBgEAAQAABYAWAAAACgACAAcABAYEABAAAAAFBgQAEwAA +AAoGAQABAAAFgBcAAAAKAAIACAAEBgQAEQAAAAUGBAAUAAAACgYBAAEAAAWAGAAA +AAoAAgAJAAQGBAATAAAABQYEABQAAAAKBgEAAQAAAAAKgAIAAAAEAhAAhat9Abhe +JwNw/bgBHkWEAwALEAAeRX4BUfgnAz2KlAFm5joDAQsQAD2KlAFR+CcD12O4AYWr +gwMCCwIAAQADgAQAAAAEAhAAVzOeAR/FMQOZma4Bw0V6AwoAAgALAASABQAAAAAC +CADrEaMBXI9CAwoAAgAMADcEAQABAAAEgAYAAAAAAggAHkWqAa4HTwMKAAIADQA3 +BAEAAQAABIAHAAAAAAIIAOsRowEAgFsDCgACAA4ANwQBAAEAAASACAAAAAACCAAe +RaoBUfhnAwoAAgAPADcEAQABAAAEgAkAAAAAAggA6xGjAaNwdAMKAAIAEAACBAIA +EQArBAIAAABIBAAANwQBAAEGgAAAAAAAAggAzoqmAUPUcAMEAhAAVzOeAUPUcAMB +vqYBw0V6AyMIAQAAAgcCAAAABQcBAAEABw4AAQAAABgAYADIAAAAQ2wJBw4AAQAA +ABgAYADIAAAAQ2wAAAAABIAOAAAAAAIIAABAqgG4HjYDCgACABEAAAQCAAwASwQB +AAE3BAEAAQAABYAKAAAACgACABIABAYEAAUAAAAFBgQABgAAAAoGAQABAAAFgAsA +AAAKAAIAEwAEBgQABgAAAAUGBAAHAAAACgYBAAEAAAWADAAAAAoAAgAUAAQGBAAH +AAAABQYEAAgAAAAKBgEAAQAABYANAAAACgACABUABAYEAAgAAAAFBgQACQAAAAoG +AQABAAAFgA8AAAAKAAIAFgAEBgQADgAAAAUGBAAFAAAACgYBAAEAAAAABoAdAAAA +AAIIAJnZigEeRS0DBAIQAIWrfQG4XicDcP24AR5FhAMKAAIACgAHBwIAAAACBwIA +AAAABw4AAQAAABgAYADIAAAAUjEJBw4AAQAAABgAYADIAAAAUjEAAAAAAAAAAAAA diff --git a/core/indigo-core/molecule/molecule_cdxml_loader.h b/core/indigo-core/molecule/molecule_cdxml_loader.h index 70ff3b4c41..d6ffa42b6c 100644 --- a/core/indigo-core/molecule/molecule_cdxml_loader.h +++ b/core/indigo-core/molecule/molecule_cdxml_loader.h @@ -132,6 +132,9 @@ namespace indigo AutoInt enhanced_stereo_group; AutoInt index; AutoInt geometry; + AutoInt alt_group_id; + AutoInt rg_index; + bool is_not_list; std::vector element_list; std::unordered_map bond_id_to_connection_idx; @@ -893,6 +896,7 @@ namespace indigo void _parseGraphic(CDXElement elem); void _parseArrow(CDXElement elem); + void _parseAltGroup(CDXElement elem); void _addAtomsAndBonds(BaseMolecule& mol, const std::vector& atoms, const std::vector& bonds); void _addBracket(BaseMolecule& mol, const CdxmlBracket& bracket); 
diff --git a/core/indigo-core/molecule/src/molecule_cdxml_loader.cpp b/core/indigo-core/molecule/src/molecule_cdxml_loader.cpp index 647eb9c7f8..f876f4422e 100644 --- a/core/indigo-core/molecule/src/molecule_cdxml_loader.cpp +++ b/core/indigo-core/molecule/src/molecule_cdxml_loader.cpp @@ -152,14 +152,22 @@ void MoleculeCdxmlLoader::_parseCollections(BaseMolecule& mol) int node_idx = _id_to_node_index.at(node.id); switch (node.type) { + case kCDXNodeType_NamedAlternativeGroup: case kCDXNodeType_Element: case kCDXNodeType_ElementList: atoms.push_back(node_idx); break; case kCDXNodeType_ExternalConnectionPoint: { - auto& fn = nodes[_fragment_nodes.back()]; - if (fn.connections.size() == 0) - fn.ext_connections.push_back(node.id); + if (_fragment_nodes.size()) + { + auto& fn = nodes[_fragment_nodes.back()]; + if (fn.connections.size() == 0) + fn.ext_connections.push_back(node.id); + } + else + { + // handle free external connection. attachment point? + } } break; case kCDXNodeType_Nickname: @@ -366,9 +374,11 @@ void MoleculeCdxmlLoader::_parseCDXMLElements(CDXElement elem, bool no_siblings, auto arrow_lambda = [this](CDXElement elem) { this->_parseArrow(elem); }; + auto altgroup_lambda = [this](CDXElement elem) { this->_parseAltGroup(elem); }; + std::unordered_map> cdxml_dispatcher = { - {"n", node_lambda}, {"b", bond_lambda}, {"fragment", fragment_lambda}, {"group", group_lambda}, {"bracketedgroup", bracketed_lambda}, - {"t", text_lambda}, {"graphic", graphic_lambda}, {"arrow", arrow_lambda}}; + {"n", node_lambda}, {"b", bond_lambda}, {"fragment", fragment_lambda}, {"group", group_lambda}, {"bracketedgroup", bracketed_lambda}, + {"t", text_lambda}, {"graphic", graphic_lambda}, {"arrow", arrow_lambda}, {"altgroup", altgroup_lambda}}; for (elem; elem.hasContent(); elem = elem.nextSiblingElement()) { @@ -416,6 +426,10 @@ void MoleculeCdxmlLoader::_addAtomsAndBonds(BaseMolecule& mol, const std::vector if (_pmol) { atom_idx = _pmol->addAtom(atom.element); + + if 
(atom.type == kCDXNodeType_NamedAlternativeGroup) + mol.allowRGroupOnRSite(atom_idx, atom.rg_index); + _id_to_atom_idx.emplace(atom.id, atom_idx); mol.setAtomXyz(atom_idx, atom.pos); _pmol->setAtomCharge_Silent(atom_idx, atom.charge); @@ -752,7 +766,11 @@ void MoleculeCdxmlLoader::_parseNode(CdxmlNode& node, CDXElement elem) auto stereo_lambda = [&node](const std::string& data) { node.stereo = KCIPStereochemistryCharToIndex.at(data.front()); }; - auto node_type_lambda = [&node](const std::string& data) { node.type = KNodeTypeNameToInt.at(data); }; + auto node_type_lambda = [&node](const std::string& data) { + node.type = KNodeTypeNameToInt.at(data); + if (node.type == kCDXNodeType_NamedAlternativeGroup) + node.element = ELEM_RSITE; + }; auto element_list_lambda = [&node](const std::string& data) { std::vector elements = split(data, ' '); @@ -770,25 +788,36 @@ void MoleculeCdxmlLoader::_parseNode(CdxmlNode& node, CDXElement elem) auto enhanced_stereo_group_lambda = [&node](const std::string& data) { node.enhanced_stereo_group = data; }; - std::unordered_map> node_dispatcher = { - {"id", id_lambda}, - {"p", pos_lambda}, - {"xyz", pos_lambda}, - {"NumHydrogens", hydrogens_lambda}, - {"Charge", charge_lambda}, - {"Isotope", isotope_lambda}, - {"Radical", radical_lambda}, - {"AS", stereo_lambda}, - {"NodeType", node_type_lambda}, - {"Element", element_lambda}, - {"GenericNickname", label_lambda}, - {"ElementList", element_list_lambda}, - {"BondOrdering", bond_ordering_lambda}, - {"Geometry", geometry_lambda}, - {"EnhancedStereoType", enhanced_stereo_type_lambda}, - {"EnhancedStereoGroupNum", enhanced_stereo_group_lambda}, - }; + auto alt_group_id_lambda = [&node](const std::string& data) { node.alt_group_id = data; }; + + std::unordered_map> node_dispatcher = {{"id", id_lambda}, + {"p", pos_lambda}, + {"xyz", pos_lambda}, + {"NumHydrogens", hydrogens_lambda}, + {"Charge", charge_lambda}, + {"Isotope", isotope_lambda}, + {"Radical", radical_lambda}, + {"AS", 
stereo_lambda},
+                                                                                                  {"NodeType", node_type_lambda},
+                                                                                                  {"Element", element_lambda},
+                                                                                                  {"GenericNickname", label_lambda},
+                                                                                                  {"ElementList", element_list_lambda},
+                                                                                                  {"BondOrdering", bond_ordering_lambda},
+                                                                                                  {"Geometry", geometry_lambda},
+                                                                                                  {"EnhancedStereoType", enhanced_stereo_type_lambda},
+                                                                                                  {"EnhancedStereoGroupNum", enhanced_stereo_group_lambda},
+                                                                                                  {"AltGroupID", alt_group_id_lambda}};
     applyDispatcher(elem.firstProperty(), node_dispatcher);
+    for (auto child_elem = elem.firstChildElement(); child_elem.hasContent(); child_elem = child_elem.nextSiblingElement())
+    {
+        if (child_elem.name() == "t")
+        {
+            std::string label;
+            _parseLabel(child_elem, label);
+            if (label.find("R") == 0)
+                node.rg_index = label.substr(1);
+        }
+    }
 }
 
 void MoleculeCdxmlLoader::_addNode(CdxmlNode& node)
@@ -879,6 +908,43 @@ void MoleculeCdxmlLoader::parseBBox(const std::string& data, Rect2f& bbox)
         throw Error("Not enought coordinates for text bounding box");
 }
 
+// Parses an altgroup element: gathers its "R"-prefixed captions and its fragment children, then
+// loads every fragment into the R-group numbered by the first caption of the target molecule.
+// Nothing is added when there is no fragment child or no caption starting with "R".
+void MoleculeCdxmlLoader::_parseAltGroup(CDXElement elem)
+{
+    std::vector<AutoInt> r_labels;
+    std::vector<CDXElement> r_fragments;
+    for (auto r_elem = elem.firstChildElement(); r_elem.hasContent(); r_elem = r_elem.nextSiblingElement())
+    {
+        auto el_name = r_elem.name();
+        if (el_name == "fragment")
+            r_fragments.push_back(r_elem);
+        else if (el_name == "t")
+        {
+            std::string rl;
+            _parseLabel(r_elem, rl);
+            // rfind with position 0 is a pure prefix test; it never scans the rest of the caption
+            if (rl.rfind("R", 0) == 0)
+                r_labels.push_back(rl.substr(1));
+        }
+    }
+
+    if (r_fragments.empty() || r_labels.empty())
+        return;
+
+    BaseMolecule& mol = _pmol ? *static_cast<BaseMolecule*>(_pmol) : *static_cast<BaseMolecule*>(_pqmol);
+    // One loader per fragment, so no parsing state is carried from one alternative to the next
+    for (auto& r_fragment : r_fragments)
+    {
+        MoleculeCdxmlLoader alt_loader(_scanner, _is_binary);
+        alt_loader.stereochemistry_options = stereochemistry_options;
+        std::unique_ptr<BaseMolecule> fragment(mol.neu());
+        alt_loader.loadMoleculeFromFragment(*fragment, r_fragment);
+        mol.rgroups.getRGroup(r_labels.front()).fragments.add(fragment.release());
+    }
+}
+
 void MoleculeCdxmlLoader::_parseGraphic(CDXElement elem)
 {
     AutoInt superseded_id = 0;