diff --git a/lib/mmseqs/src/commons/MathUtil.h b/lib/mmseqs/src/commons/MathUtil.h index 6808cf85..06eeb3b0 100644 --- a/lib/mmseqs/src/commons/MathUtil.h +++ b/lib/mmseqs/src/commons/MathUtil.h @@ -256,6 +256,13 @@ class MathUtil { return sum; } + static float squareDist(const float xx, const float xy, const float xz, + const float yx, const float yy, const float yz){ + float d1 = xx - yx; + float d2 = xy - yy; + float d3 = xz - yz; + return (d1 * d1 + d2 * d2 + d3 * d3); + } }; diff --git a/src/commons/LocalParameters.cpp b/src/commons/LocalParameters.cpp index eff70a4c..16dd72c4 100644 --- a/src/commons/LocalParameters.cpp +++ b/src/commons/LocalParameters.cpp @@ -36,6 +36,8 @@ LocalParameters::LocalParameters() : PARAM_PDB_OUTPUT_MODE(PARAM_PDB_OUTPUT_MODE_ID, "--pdb-output-mode", "PDB output mode", "PDB output mode:\n0: Single multi-model PDB file\n1: One PDB file per chain\n2: One PDB file per complex", typeid(int), (void *) &pdbOutputMode, "^[0-2]{1}$", MMseqsParameter::COMMAND_MISC), PARAM_PROSTT5_MODEL(PARAM_PROSTT5_MODEL_ID, "--prostt5-model", "Path to ProstT5", "Path to ProstT5 model", typeid(std::string), (void *) &prostt5Model, "^.*$", MMseqsParameter::COMMAND_COMMON), PARAM_GPU(PARAM_GPU_ID, "--gpu", "Use GPU", "Use GPU (CUDA) if possible", typeid(int), (void *) &gpu, "^[0-1]{1}$", MMseqsParameter::COMMAND_COMMON), + PARAM_DB_EXTRACTION_MODE(PARAM_DB_EXTRACTION_MODE_ID, "--db-extraction-mode", "Createdb extraction mode", "createdb extraction mode: 0: chain 1: interface", typeid(int), (void *) &dbExtractionMode, "^[0-1]{1}$"), + PARAM_DISTANCE_THRESHOLD(PARAM_DISTANCE_THRESHOLD_ID, "--distance-threshold", "Interface distance threshold", "Residues with C-beta below this threshold will be part of interface", typeid(float), (void *) &distanceThreshold, "^[0-9]*(\\.[0-9]+)?$"), PARAM_MULTIMER_TM_THRESHOLD(PARAM_MULTIMER_TM_THRESHOLD_ID,"--multimer-tm-threshold", "TMscore threshold for filtermultimer", "accept alignments with a tmsore > thr [0.0,1.0]",typeid(float), (void *) &filtMultimerTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), PARAM_CHAIN_TM_THRESHOLD(PARAM_CHAIN_TM_THRESHOLD_ID,"--chain-tm-threshold", "chain TMscore threshold for filtermultimer", "accept alignments with a tmsore > thr [0.0,1.0]",typeid(float), (void *) &filtChainTmThr, "^0(\\.[0-9]+)?|1(\\.0+)?$"), PARAM_INTERFACE_LDDT_THRESHOLD(PARAM_INTERFACE_LDDT_THRESHOLD_ID,"--interface-lddt-threshold", "Interface LDDT threshold", "accept alignments with a lddt > thr [0.0,1.0]",typeid(float), (void *) &filtInterfaceLddtThr, "^0(\\.[0-9]+)?|1(\\.0+)?$") @@ -88,6 +90,8 @@ LocalParameters::LocalParameters() : #endif structurecreatedb.push_back(&PARAM_PROSTT5_MODEL); structurecreatedb.push_back(&PARAM_CHAIN_NAME_MODE); + structurecreatedb.push_back(&PARAM_DB_EXTRACTION_MODE); + structurecreatedb.push_back(&PARAM_DISTANCE_THRESHOLD); structurecreatedb.push_back(&PARAM_WRITE_MAPPING); structurecreatedb.push_back(&PARAM_MASK_BFACTOR_THRESHOLD); structurecreatedb.push_back(&PARAM_COORD_STORE_MODE); @@ -272,6 +276,8 @@ LocalParameters::LocalParameters() : eValueThrExpandMultimer = 10000.0; prostt5Model = ""; gpu = 0; + dbExtractionMode = DB_EXTRACT_MODE_CHAIN; + distanceThreshold = 8.0; filtMultimerTmThr = 0.0; filtChainTmThr = 0.0; filtInterfaceLddtThr = 0.0; diff --git a/src/commons/LocalParameters.h b/src/commons/LocalParameters.h index 4d0eee25..042bf2b3 100644 --- a/src/commons/LocalParameters.h +++ b/src/commons/LocalParameters.h @@ -70,6 +70,9 @@ class LocalParameters : public Parameters { static const int OUTFMT_COMPLEX_U = 56; static const int OUTFMT_COMPLEX_T = 57; + static const int DB_EXTRACT_MODE_CHAIN = 0; + static const int DB_EXTRACT_MODE_INTERFACE = 1; + static const int COORD_STORE_MODE_CA_FLOAT = 1; static const int COORD_STORE_MODE_CA_DIFF = 2; static const int COORD_STORE_MODE_CA_PLAIN_TEXT = 3; @@ -143,6 +146,8 @@ class LocalParameters : public Parameters { PARAMETER(PARAM_PDB_OUTPUT_MODE) PARAMETER(PARAM_PROSTT5_MODEL) PARAMETER(PARAM_GPU) + PARAMETER(PARAM_DB_EXTRACTION_MODE) + PARAMETER(PARAM_DISTANCE_THRESHOLD) PARAMETER(PARAM_MULTIMER_TM_THRESHOLD) PARAMETER(PARAM_CHAIN_TM_THRESHOLD) PARAMETER(PARAM_INTERFACE_LDDT_THRESHOLD) @@ -176,6 +181,8 @@ class LocalParameters : public Parameters { float filtInterfaceLddtThr; std::string prostt5Model; int gpu; + int dbExtractionMode; + float distanceThreshold; static std::vector getOutputFormat(int formatMode, const std::string &outformat, bool &needSequences, bool &needBacktrace, bool &needFullHeaders, bool &needLookup, bool &needSource, bool &needTaxonomyMapping, bool &needTaxonomy, bool &needQCa, bool &needTCa, bool &needTMaligner, diff --git a/src/strucclustutils/GemmiWrapper.cpp b/src/strucclustutils/GemmiWrapper.cpp index c638e653..7d9b1e81 100644 --- a/src/strucclustutils/GemmiWrapper.cpp +++ b/src/strucclustutils/GemmiWrapper.cpp @@ -263,7 +263,8 @@ bool GemmiWrapper::loadFoldcompStructure(std::istream& stream, const std::string if (coordinates.size() == 0) { return false; } - + seq3di.clear(); + taxIds.clear(); title.clear(); chain.clear(); names.clear(); diff --git a/src/strucclustutils/GemmiWrapper.h b/src/strucclustutils/GemmiWrapper.h index 7ea09020..e06f47f5 100644 --- a/src/strucclustutils/GemmiWrapper.h +++ b/src/strucclustutils/GemmiWrapper.h @@ -40,6 +40,7 @@ class GemmiWrapper { std::vector c; std::vector cb; std::vector ami; + std::vector seq3di; std::vector names; std::vector chainNames; std::vector modelIndices; diff --git a/src/strucclustutils/structcreatedb.cpp b/src/strucclustutils/structcreatedb.cpp index cf224212..43d58288 100644 --- a/src/strucclustutils/structcreatedb.cpp +++ b/src/strucclustutils/structcreatedb.cpp @@ -15,6 +15,8 @@ #include "PatternCompiler.h" #include "Coordinate16.h" #include "itoa.h" +#include "MathUtil.h" + #ifdef HAVE_PROSTT5 #include "prostt5.h" #endif @@ -79,39 +81,24 @@ static inline bool compareByFirst(const std::pair& a, const std::pair & alphabet3di, std::vector & alphabetAA, - std::vector & camol, std::string & header, - DBWriter & aadbw, DBWriter & hdbw, DBWriter & torsiondbw, DBWriter & cadbw, int chainNameMode, - float maskBfactorThreshold, size_t & tooShort, size_t & notProtein, size_t & globalCnt, int thread_idx, int coordStoreMode, - std::string & filename, size_t & fileidCnt, - std::map> & entrynameToFileId, - std::map & filenameToFileId, - std::map & fileIdToName, - DBWriter* mappingWriter) { - size_t id = __sync_fetch_and_add(&globalCnt, readStructure.chain.size()); - size_t entriesAdded = 0; + +void addMissingAtomsInStructure(GemmiWrapper &readStructure, PulchraWrapper &pulchra) { + std::vector chainNames; for (size_t ch = 0; ch < readStructure.chain.size(); ch++) { - size_t dbKey = id + ch; size_t chainStart = readStructure.chain[ch].first; size_t chainEnd = readStructure.chain[ch].second; size_t chainLen = chainEnd - chainStart; - if (chainLen <= 3) { - tooShort++; - continue; - } - bool allX = true; for (size_t pos = 0; pos < chainLen; pos++) { const char aa = readStructure.ami[chainStart+pos]; @@ -121,10 +108,10 @@ writeStructureEntry(SubstitutionMatrix & mat, GemmiWrapper & readStructure, Stru } } if (allX) { - notProtein++; + chainNames.push_back("SKIP"); continue; } - + chainNames.push_back(readStructure.chainNames[ch]); // Detect if structure is Ca only if (std::isnan(readStructure.n[chainStart + 0].x) && std::isnan(readStructure.n[chainStart + 1].x) && @@ -133,73 +120,366 @@ writeStructureEntry(SubstitutionMatrix & mat, GemmiWrapper & readStructure, Stru std::isnan(readStructure.c[chainStart + 0].x) && std::isnan(readStructure.c[chainStart + 1].x) && std::isnan(readStructure.c[chainStart + 2].x) && - std::isnan(readStructure.c[chainStart + 3].x)) - { + std::isnan(readStructure.c[chainStart + 3].x)) { pulchra.rebuildBackbone(&readStructure.ca[chainStart], &readStructure.n[chainStart], &readStructure.c[chainStart], &readStructure.ami[chainStart], chainLen); + + } + } + readStructure.chainNames.clear(); + readStructure.chainNames.insert(readStructure.chainNames.begin(), chainNames.begin(), chainNames.end()); + chainNames.clear(); +} + +void findInterfaceResidues(GemmiWrapper &readStructure, std::pair res1, std::pair res2, + std::vector & resIdx1, float distanceThreshold) +{ + std::vector coord1, coord2; + size_t sameRes = 0; + size_t chainLen = res1.second - res1.first; + bool noSameRes = true; + const float squareThreshold = distanceThreshold * distanceThreshold; + for (size_t res1Idx = res1.first; res1Idx < res1.second; res1Idx++) { + float x1, y1, z1; + if (readStructure.ami[res1Idx] == 'G') { + x1 = readStructure.ca[res1Idx].x; + y1 = readStructure.ca[res1Idx].y; + z1 = readStructure.ca[res1Idx].z; + } + else { + x1 = readStructure.cb[res1Idx].x; + y1 = readStructure.cb[res1Idx].y; + z1 = readStructure.cb[res1Idx].z; } + for (size_t res2Idx = res2.first; res2Idx < res2.second; res2Idx++) { + float x2, y2, z2; + if (readStructure.ami[res2Idx] == 'G') { + x2 = readStructure.ca[res2Idx].x; + y2 = readStructure.ca[res2Idx].y; + z2 = readStructure.ca[res2Idx].z; + } + else { + x2 = readStructure.cb[res2Idx].x; + y2 = readStructure.cb[res2Idx].y; + z2 = readStructure.cb[res2Idx].z; + } + float distance = MathUtil::squareDist(x1, y1, z1, x2, y2, z2); + if (distance < 0.01) { + noSameRes = false; + sameRes++; + break; + } + if (distance < squareThreshold) { + resIdx1.push_back(res1Idx); + if (noSameRes) { + break; + } + } + } + } + if (sameRes / chainLen > 0.9){ + resIdx1.clear(); + } +} + +void compute3DiInterfaces(GemmiWrapper &readStructure, PulchraWrapper &pulchra, StructureTo3Di &structureTo3Di, SubstitutionMatrix & mat3Di, int chainNameMode, float distanceThreshold) { + size_t prevInterfaceChainLen = 0; + std::vector interfaceSeq3di, interfaceAmi; + std::vector resIdx1, resIdx2; + std::vector notProteinChains; + std::vector interfacetaxIds; + std::vector interfaceModelIndices; + std::vector ca, n, c, cb; + std::vector interfaceCa; + std::vector interfaceNames, interfaceChainNames; + std::vector> interfaceChain; + std::unordered_map modelToInterfaceNum; + addMissingAtomsInStructure(readStructure, pulchra); + for (size_t ch1 = 0; ch1 < readStructure.chain.size(); ch1++) { + if (readStructure.chainNames[ch1] == "SKIP") { + interfaceCa.push_back(Vec3(0,0,0)); + interfaceAmi.push_back('X'); + interfaceSeq3di.push_back('X'); + interfaceChain.push_back(std::make_pair(prevInterfaceChainLen, prevInterfaceChainLen+1)); + interfaceNames.push_back("ALLX"); + interfaceChainNames.push_back(readStructure.chainNames[ch1]); + interfacetaxIds.push_back(readStructure.taxIds[ch1]); + interfaceModelIndices.push_back(readStructure.modelIndices[ch1]); + prevInterfaceChainLen++; + continue; + } + for (size_t ch2 = ch1 + 1; ch2 < readStructure.chain.size(); ch2++) { + if (readStructure.chainNames[ch2] == "SKIP") { + continue; + } + if (readStructure.modelIndices[ch1] == readStructure.modelIndices[ch2]) { + findInterfaceResidues(readStructure, readStructure.chain[ch1], readStructure.chain[ch2], resIdx1, distanceThreshold); + findInterfaceResidues(readStructure, readStructure.chain[ch2], readStructure.chain[ch1], resIdx2, distanceThreshold); + if (resIdx1.size() >= 4 && resIdx2.size() >= 4) { + modelToInterfaceNum[readStructure.modelIndices[ch2]]++; + for (size_t i = 0; i < resIdx1.size(); i++) { + ca.push_back(readStructure.ca[resIdx1[i]]); + n.push_back(readStructure.n[resIdx1[i]]); + c.push_back(readStructure.c[resIdx1[i]]); + cb.push_back(readStructure.cb[resIdx1[i]]); + } + // std::sort(resIdx2.begin(), resIdx2.end()); + for (size_t i = 0; i < resIdx2.size(); i++) { + ca.push_back(readStructure.ca[resIdx2[i]]); + n.push_back(readStructure.n[resIdx2[i]]); + c.push_back(readStructure.c[resIdx2[i]]); + cb.push_back(readStructure.cb[resIdx2[i]]); + } + char *states = structureTo3Di.structure2states(ca.data(), + n.data(), + c.data(), + cb.data(), + resIdx1.size() + resIdx2.size()); + for (size_t i = 0; i < resIdx1.size(); i++) { + interfaceSeq3di.push_back(mat3Di.num2aa[static_cast(states[i])]); + interfaceAmi.push_back(readStructure.ami[resIdx1[i]]); + interfaceCa.push_back(readStructure.ca[resIdx1[i]]); + } + for (size_t i = 0; i < resIdx2.size(); i++) { + interfaceSeq3di.push_back(mat3Di.num2aa[static_cast(states[resIdx1.size()+i])]); + interfaceAmi.push_back(readStructure.ami[resIdx2[i]]); + interfaceCa.push_back(readStructure.ca[resIdx2[i]]); + } + interfaceChain.push_back(std::make_pair(prevInterfaceChainLen, prevInterfaceChainLen + resIdx1.size())); + prevInterfaceChainLen += resIdx1.size(); + interfaceChain.push_back(std::make_pair(prevInterfaceChainLen, prevInterfaceChainLen + resIdx2.size())); + prevInterfaceChainLen += resIdx2.size(); + std::string interfaceName; + if (Util::endsWith(".gz", readStructure.names[ch1] )) { + interfaceName.append(Util::remove_extension(Util::remove_extension(readStructure.names[ch1]))); + } else { + interfaceName.append(Util::remove_extension(readStructure.names[ch1])); + } + if (readStructure.modelCount > 1) { + interfaceName.append("_MODEL_"); + interfaceName.append(SSTR(readStructure.modelIndices[ch1])); + } + interfaceModelIndices.push_back(readStructure.modelIndices[ch1]); + interfaceModelIndices.push_back(readStructure.modelIndices[ch2]); + interfaceName.append("_INT_"); + interfaceName.append(SSTR(modelToInterfaceNum[readStructure.modelIndices[ch2]])); + std::string interfaceNameFirst = interfaceName; + std::string interfaceNameSecond = interfaceName; + if (chainNameMode == LocalParameters::CHAIN_MODE_ADD || + (chainNameMode == LocalParameters::CHAIN_MODE_AUTO && readStructure.names.size() > 1)) { + interfaceNameFirst.push_back('_'); + interfaceNameFirst.append(readStructure.chainNames[ch1]); + interfaceNameSecond.push_back('_'); + interfaceNameSecond.append(readStructure.chainNames[ch2]); + } + if (readStructure.title.size() > 0) { + interfaceNameFirst.push_back(' '); + interfaceNameFirst.append(readStructure.title); + interfaceNameSecond.push_back(' '); + interfaceNameSecond.append(readStructure.title); + } + interfaceNames.push_back(interfaceNameFirst); + interfaceNames.push_back(interfaceNameSecond); + interfaceChainNames.push_back(readStructure.chainNames[ch1]); + interfaceChainNames.push_back(readStructure.chainNames[ch2]); + interfacetaxIds.push_back(readStructure.taxIds[ch1]); + interfacetaxIds.push_back(readStructure.taxIds[ch2]); + } + else { + interfaceCa.push_back(Vec3(0,0,0)); + interfaceAmi.push_back('X'); + interfaceSeq3di.push_back('X'); + interfaceChain.push_back(std::make_pair(prevInterfaceChainLen, prevInterfaceChainLen+1)); + interfaceNames.push_back("SHORT"); + interfaceChainNames.push_back(readStructure.chainNames[ch1]); + interfacetaxIds.push_back(readStructure.taxIds[ch1]); + interfaceModelIndices.push_back(readStructure.modelIndices[ch1]); + prevInterfaceChainLen++; + } + resIdx1.clear(); + resIdx2.clear(); + ca.clear(); + n.clear(); + c.clear(); + cb.clear(); + } + } + } + // copy interface data to readStructure + // this overwrites the original data (clear and push_back) + readStructure.ca.clear(); + readStructure.ca.insert(readStructure.ca.begin(), interfaceCa.begin(), interfaceCa.end()); + readStructure.ami.clear(); + readStructure.ami.insert(readStructure.ami.begin(), interfaceAmi.begin(), interfaceAmi.end()); + readStructure.seq3di.clear(); + readStructure.seq3di.insert(readStructure.seq3di.begin(), interfaceSeq3di.begin(), interfaceSeq3di.end()); + readStructure.chain.clear(); + readStructure.chain.insert(readStructure.chain.begin(), interfaceChain.begin(), interfaceChain.end()); + readStructure.names.clear(); + readStructure.names.insert(readStructure.names.begin(), interfaceNames.begin(), interfaceNames.end()); + readStructure.chainNames.clear(); + readStructure.chainNames.insert(readStructure.chainNames.begin(), interfaceChainNames.begin(), interfaceChainNames.end()); + readStructure.taxIds.clear(); + readStructure.taxIds.insert(readStructure.taxIds.begin(), interfacetaxIds.begin(), interfacetaxIds.end()); + readStructure.modelIndices.clear(); + readStructure.modelIndices.insert(readStructure.modelIndices.begin(), interfaceModelIndices.begin(), interfaceModelIndices.end()); +} + +size_t +writeStructureEntry(SubstitutionMatrix & mat, GemmiWrapper & readStructure, StructureTo3Di & structureTo3Di, + PulchraWrapper & pulchra, std::vector & alphabet3di, std::vector & alphabetAA, + std::vector & camol, std::string & header, + DBWriter & aadbw, DBWriter & hdbw, DBWriter & torsiondbw, DBWriter & cadbw, int chainNameMode, + float maskBfactorThreshold, size_t & tooShort, size_t & notProtein, size_t & globalCnt, int thread_idx, int coordStoreMode, + size_t & fileidCnt, std::map> & entrynameToFileId, + std::map & filenameToFileId, + std::map & fileIdToName, + DBWriter* mappingWriter) { + LocalParameters &par = LocalParameters::getLocalInstance(); + if (par.dbExtractionMode == LocalParameters::DB_EXTRACT_MODE_INTERFACE) { + compute3DiInterfaces(readStructure, pulchra, structureTo3Di, mat, chainNameMode, par.distanceThreshold); + } + size_t id = __sync_fetch_and_add(&globalCnt, readStructure.chain.size()); + size_t entriesAdded = 0; + for (size_t ch = 0; ch < readStructure.chain.size(); ch++) { + size_t dbKey = id + ch; + size_t chainStart = readStructure.chain[ch].first; + size_t chainEnd = readStructure.chain[ch].second; + size_t chainLen = chainEnd - chainStart; + header.clear(); + if (par.dbExtractionMode == LocalParameters::DB_EXTRACT_MODE_CHAIN) { + if (chainLen <= 3) { + tooShort++; + continue; + } + bool allX = true; + for (size_t pos = 0; pos < chainLen; pos++) { + const char aa = readStructure.ami[chainStart+pos]; + if (aa != 'X' && aa != 'x') { + allX = false; + break; + } + } + if (allX) { + notProtein++; + continue; + } + // Detect if structure is Ca only + if (std::isnan(readStructure.n[chainStart + 0].x) && + std::isnan(readStructure.n[chainStart + 1].x) && + std::isnan(readStructure.n[chainStart + 2].x) && + std::isnan(readStructure.n[chainStart + 3].x) && + std::isnan(readStructure.c[chainStart + 0].x) && + std::isnan(readStructure.c[chainStart + 1].x) && + std::isnan(readStructure.c[chainStart + 2].x) && + std::isnan(readStructure.c[chainStart + 3].x)) { + pulchra.rebuildBackbone(&readStructure.ca[chainStart], + &readStructure.n[chainStart], + &readStructure.c[chainStart], + &readStructure.ami[chainStart], + chainLen); - char * states = structureTo3Di.structure2states(&readStructure.ca[chainStart], - &readStructure.n[chainStart], - &readStructure.c[chainStart], - &readStructure.cb[chainStart], - chainLen); - for(size_t pos = 0; pos < chainLen; pos++){ - if(readStructure.ca_bfactor[pos] < maskBfactorThreshold){ - alphabet3di.push_back(tolower(mat.num2aa[static_cast(states[pos])])); - alphabetAA.push_back(tolower(readStructure.ami[chainStart+pos])); - }else{ - alphabet3di.push_back(mat.num2aa[static_cast(states[pos])]); + } + char * states = structureTo3Di.structure2states(&readStructure.ca[chainStart], + &readStructure.n[chainStart], + &readStructure.c[chainStart], + &readStructure.cb[chainStart], + chainLen); + for (size_t pos = 0; pos < chainLen; pos++) { + if (readStructure.ca_bfactor[pos] < maskBfactorThreshold) { + alphabet3di.push_back(tolower(mat.num2aa[static_cast(states[pos])])); + alphabetAA.push_back(tolower(readStructure.ami[chainStart+pos])); + } else { + alphabet3di.push_back(mat.num2aa[static_cast(states[pos])]); + alphabetAA.push_back(readStructure.ami[chainStart+pos]); + } + } + if (Util::endsWith(".gz", readStructure.names[ch] )) { + header.append(Util::remove_extension(Util::remove_extension(readStructure.names[ch]))); + } + else { + header.append(Util::remove_extension(readStructure.names[ch])); + } + if (readStructure.modelCount > 1) { + header.append("_MODEL_"); + header.append(SSTR(readStructure.modelIndices[ch])); + } + if (chainNameMode == LocalParameters::CHAIN_MODE_ADD || + (chainNameMode == LocalParameters::CHAIN_MODE_AUTO && readStructure.names.size() > 1)) { + header.push_back('_'); + header.append(readStructure.chainNames[ch]); + } + if (readStructure.title.size() > 0) { + header.push_back(' '); + header.append(readStructure.title); + } + } + else if (par.dbExtractionMode == LocalParameters::DB_EXTRACT_MODE_INTERFACE) { + if (readStructure.names[ch] == "ALLX") { + notProtein++; + continue; + } + if (readStructure.names[ch] == "SHORT") { + tooShort++; + continue; + } + for (size_t pos = 0; pos < chainLen; pos++) { + alphabet3di.push_back(readStructure.seq3di[chainStart+pos]); alphabetAA.push_back(readStructure.ami[chainStart+pos]); } + header.append(readStructure.names[ch]); } alphabet3di.push_back('\n'); alphabetAA.push_back('\n'); torsiondbw.writeData(alphabet3di.data(), alphabet3di.size(), dbKey, thread_idx); aadbw.writeData(alphabetAA.data(), alphabetAA.size(), dbKey, thread_idx); - header.clear(); - if (Util::endsWith(".gz", readStructure.names[ch])){ - header.append(Util::remove_extension(Util::remove_extension(readStructure.names[ch]))); - } - else{ - header.append(Util::remove_extension(readStructure.names[ch])); - } - if(readStructure.modelCount > 1){ - header.append("_MODEL_"); - header.append(std::to_string(readStructure.modelIndices[ch])); - } - if(chainNameMode == LocalParameters::CHAIN_MODE_ADD || - (chainNameMode == LocalParameters::CHAIN_MODE_AUTO && readStructure.names.size() > 1)){ - header.push_back('_'); - header.append(readStructure.chainNames[ch]); - } - if(readStructure.title.size() > 0){ - header.push_back(' '); - header.append(readStructure.title); - } header.push_back('\n'); std::string entryName = Util::parseFastaHeader(header.c_str()); #pragma omp critical { - std::string filenameWithExtension = filename; - if (Util::endsWith(".gz", filename)){ - filenameWithExtension = Util::remove_extension(filename); - } - std::string filenameWithoutExtension = Util::remove_extension(filenameWithExtension); - std::map::iterator it = filenameToFileId.find(filenameWithoutExtension); - size_t fileid; - if (it != filenameToFileId.end()) { - fileid = it->second; - } else { - fileid = fileidCnt; - filenameToFileId[filenameWithoutExtension] = fileid; - fileIdToName[fileid] = filenameWithoutExtension; - fileidCnt++; + if (par.dbExtractionMode == LocalParameters::DB_EXTRACT_MODE_CHAIN) { + std::string filenameWithExtension; + if (Util::endsWith(".gz", readStructure.names[ch] )) { + filenameWithExtension = Util::remove_extension(Util::remove_extension(readStructure.names[ch])); + } + else { + filenameWithExtension = Util::remove_extension(readStructure.names[ch]); + } + std::string filenameWithoutExtension = Util::remove_extension(filenameWithExtension); + std::map::iterator it = filenameToFileId.find(filenameWithoutExtension); + size_t fileid; + if (it != filenameToFileId.end()) { + fileid = it->second; + } else { + fileid = fileidCnt; + filenameToFileId[filenameWithoutExtension] = fileid; + fileIdToName[fileid] = filenameWithoutExtension; + fileidCnt++; + } + entrynameToFileId[entryName] = std::make_pair(fileid, readStructure.modelIndices[ch]); + } else if (par.dbExtractionMode == LocalParameters::DB_EXTRACT_MODE_INTERFACE) { + std::string filenameWithoutExtension; + if (chainNameMode == LocalParameters::CHAIN_MODE_ADD || chainNameMode == LocalParameters::CHAIN_MODE_AUTO) { + size_t firstUnderscore = readStructure.names[ch].find('_'); + filenameWithoutExtension = readStructure.names[ch].substr(0, firstUnderscore); + } else { + filenameWithoutExtension = readStructure.names[ch]; + } + std::map::iterator it = filenameToFileId.find(filenameWithoutExtension); + size_t fileid; + if (it != filenameToFileId.end()) { + fileid = it->second; + } else { + fileid = fileidCnt; + filenameToFileId[filenameWithoutExtension] = fileid; + fileIdToName[fileid] = filenameWithoutExtension; + fileidCnt++; + } + entrynameToFileId[entryName] = std::make_pair(fileid, readStructure.modelIndices[ch]); } - entrynameToFileId[entryName] = std::make_pair(fileid, readStructure.modelIndices[ch]); } hdbw.writeData(header.c_str(), header.size(), dbKey, thread_idx); @@ -487,7 +767,7 @@ int structcreatedb(int argc, const char **argv, const Command& command) { int inputFormat = par.inputFormat; // Process tar files! - for(size_t i = 0; i < tarFiles.size(); i++) { + for (size_t i = 0; i < tarFiles.size(); i++) { mtar_t tar; if (Util::endsWith(".tar.gz", tarFiles[i]) || Util::endsWith(".tgz", tarFiles[i])) { #ifdef HAVE_ZLIB @@ -508,7 +788,7 @@ int structcreatedb(int argc, const char **argv, const Command& command) { progress.updateProgress(); #ifdef OPENMP int localThreads = par.threads; - if(localThreads > 1){ + if (localThreads > 1) { needsReorderingAtTheEnd = true; } #endif @@ -649,7 +929,7 @@ int structcreatedb(int argc, const char **argv, const Command& command) { mat, readStructure, structureTo3Di, pulchra, alphabet3di, alphabetAA, camol, header, aadbw, hdbw, torsiondbw, cadbw, par.chainNameMode, par.maskBfactorThreshold, tooShort, notProtein, globalCnt, thread_idx, par.coordStoreMode, - name, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, + globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, mappingWriter ); } @@ -682,7 +962,7 @@ int structcreatedb(int argc, const char **argv, const Command& command) { for (size_t i = 0; i < looseFiles.size(); i++) { progress.updateProgress(); - if(readStructure.load(looseFiles[i], (GemmiWrapper::Format)inputFormat) == false){ + if (readStructure.load(looseFiles[i], (GemmiWrapper::Format)inputFormat) == false) { incorrectFiles++; continue; } @@ -692,7 +972,7 @@ int structcreatedb(int argc, const char **argv, const Command& command) { mat, readStructure, structureTo3Di, pulchra, alphabet3di, alphabetAA, camol, header, aadbw, hdbw, torsiondbw, cadbw, par.chainNameMode, par.maskBfactorThreshold, tooShort, notProtein, globalCnt, thread_idx, par.coordStoreMode, - looseFiles[i], globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, + globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, mappingWriter ); } @@ -755,7 +1035,7 @@ int structcreatedb(int argc, const char **argv, const Command& command) { mat, readStructure, structureTo3Di, pulchra, alphabet3di, alphabetAA, camol, header, aadbw, hdbw, torsiondbw, cadbw, par.chainNameMode, par.maskBfactorThreshold, tooShort, notProtein, globalCnt, thread_idx, par.coordStoreMode, - obj_name, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, + globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, mappingWriter ); } @@ -806,7 +1086,7 @@ int structcreatedb(int argc, const char **argv, const Command& command) { mat, readStructure, structureTo3Di, pulchra, alphabet3di, alphabetAA, camol, header, aadbw, hdbw, torsiondbw, cadbw, par.chainNameMode, par.maskBfactorThreshold, tooShort, notProtein, globalCnt, thread_idx, par.coordStoreMode, - dbname, globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, + globalFileidCnt, entrynameToFileId, filenameToFileId, fileIdToName, mappingWriter ); } @@ -972,16 +1252,15 @@ int structcreatedb(int argc, const char **argv, const Command& command) { entry.entryName = removeModel(entryNameWithModel); std::pair fileIdModelEntry = entrynameToFileId[entryNameWithModel]; size_t fileId = fileIdModelEntry.first; - if(modelFileIdLookup.find(fileIdModelEntry) == modelFileIdLookup.end()){ + if (modelFileIdLookup.find(fileIdModelEntry) == modelFileIdLookup.end()) { modelFileIdLookup[fileIdModelEntry] = globalFileNumber; entry.fileNumber = globalFileNumber; globalFileNumber++; needToWriteSource = true; - }else{ + } else { entry.fileNumber = modelFileIdLookup[fileIdModelEntry]; needToWriteSource = false; } - maxFileId = std::max(fileId, maxFileId); readerHeader.lookupEntryToBuffer(buffer, entry); size_t written = fwrite(buffer.c_str(), sizeof(char), buffer.size(), file); @@ -990,9 +1269,9 @@ int structcreatedb(int argc, const char **argv, const Command& command) { EXIT(EXIT_FAILURE); } buffer.clear(); - if(needToWriteSource) { + if (needToWriteSource) { std::string filename = FileUtil::baseName(fileIdToName[fileId]); - if(needToWriteModel) { + if (needToWriteModel) { filename.append("_MODEL_"); filename.append(SSTR(fileIdModelEntry.second)); }