From 5e0c50c92d3546f425787068a8428390b25aadfb Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Tue, 23 Aug 2022 19:17:56 +0200 Subject: [PATCH 1/6] Add first rudimentary support for path alignment. --- command_line/image.cpp | 1 + command_line/querypaths.cpp | 1 + command_line/reduce.cpp | 1 + graph/path.cpp | 81 +++++++++++---- graph/path.h | 22 ++++- graphsearch/blast/blastsearch.cpp | 76 +++++++------- graphsearch/blast/blastsearch.h | 11 ++- graphsearch/graphsearch.cpp | 63 ++++++++++++ graphsearch/graphsearch.h | 15 ++- graphsearch/hmmer/hmmersearch.cpp | 125 ++++++++++-------------- graphsearch/hmmer/hmmersearch.h | 16 ++- graphsearch/minimap2/minimap2search.cpp | 76 +++++++------- graphsearch/minimap2/minimap2search.h | 11 ++- graphsearch/query.h | 3 +- graphsearch/querypath.cpp | 13 ++- graphsearch/querypath.h | 5 +- program/globals.cpp | 4 +- program/globals.h | 7 +- program/main.cpp | 2 +- tests/bandagetests.cpp | 2 +- 20 files changed, 341 insertions(+), 194 deletions(-) diff --git a/command_line/image.cpp b/command_line/image.cpp index fd02c1e6..3563245d 100644 --- a/command_line/image.cpp +++ b/command_line/image.cpp @@ -123,6 +123,7 @@ int bandageImage(QStringList arguments) QString blastError = g_blastSearch->doAutoGraphSearch(*g_assemblyGraph, g_settings->blastQueryFilename, + false, /* include paths */ g_settings->blastSearchParameters); if (!blastError.isEmpty()) { err << blastError << Qt::endl; diff --git a/command_line/querypaths.cpp b/command_line/querypaths.cpp index e0ea90dd..653bca9a 100644 --- a/command_line/querypaths.cpp +++ b/command_line/querypaths.cpp @@ -128,6 +128,7 @@ int bandageQueryPaths(QStringList arguments) out << "(" << QDateTime::currentDateTime().toString("dd MMM yyyy hh:mm:ss") << ") Running BLAST search... " << Qt::flush; QString blastError = g_blastSearch->doAutoGraphSearch(*g_assemblyGraph, g_settings->blastQueryFilename, + false, /* include paths */ g_settings->blastSearchParameters); if (!blastError.isEmpty()) { err << Qt::endl << blastError << Qt::endl; diff --git a/command_line/reduce.cpp b/command_line/reduce.cpp index 3e5cc386..d36fe940 100644 --- a/command_line/reduce.cpp +++ b/command_line/reduce.cpp @@ -92,6 +92,7 @@ int bandageReduce(QStringList arguments) QString blastError = g_blastSearch->doAutoGraphSearch(*g_assemblyGraph, g_settings->blastQueryFilename, + false, /* include paths */ g_settings->blastSearchParameters); if (!blastError.isEmpty()) { err << blastError << Qt::endl; diff --git a/graph/path.cpp b/graph/path.cpp index e1bb19dd..65512a87 100644 --- a/graph/path.cpp +++ b/graph/path.cpp @@ -157,7 +157,7 @@ Path Path::makeFromString(const QString& pathString, const AssemblyGraph &graph, return path; } - //Find which node names are and are not actually in the graph. + //Find which node names are and are not actually in the graph. std::vector nodesInGraph; QStringList nodesNotInGraph; for (auto & i : nodeNameList) { @@ -433,33 +433,32 @@ int Path::getLength() const { } -QString Path::getFasta() const { - //The description line is a comma-delimited list of the nodes in the path - QString fasta = ">" + getString(false); +QByteArray Path::getFasta(QString name) const { + // The description line is a comma-delimited list of the nodes in the path + QByteArray fasta = ">" + (name.isEmpty() ? getString(false).toLatin1() : name.toLatin1()); if (isCircular()) fasta += " (circular)"; fasta += "\n"; + fasta += utils::addNewlinesToSequence(getPathSequence()); - QString pathSequence = getPathSequence(); - int charactersOnLine = 0; - for (auto i : pathSequence) - { - fasta += i; - ++charactersOnLine; - if (charactersOnLine >= 70) - { - fasta += "\n"; - charactersOnLine = 0; - } - } + return fasta; +} + +QByteArray Path::getAAFasta(unsigned shift, QString name) const { + // The description line is a comma-delimited list of the nodes in the path + QByteArray fasta = ">" + (name.isEmpty() ? getString(false).toLatin1() : name.toLatin1()); + + if (isCircular()) + fasta += " (circular)"; + fasta += "/" + std::to_string(shift); fasta += "\n"; + fasta += utils::addNewlinesToSequence(getPathSequence()); return fasta; } - QString Path::getString(bool spaces) const { QString output; for (int i = 0; i < m_nodes.size(); ++i) { @@ -773,11 +772,11 @@ std::vector Path::getPosition(const DeBruijnNode *node) const { return res; } -// Checks whether [leftx, rightx) intersects [lefty, righty) +// Checks whether [leftx, rightx] intersects [lefty, righty] static bool intersects(int leftx, int rightx, int lefty, int righty) { - return (leftx <= lefty && lefty < rightx) || - (lefty <= leftx && leftx < righty); + return (leftx <= lefty && lefty <= rightx) || + (lefty <= leftx && leftx <= righty); } @@ -791,7 +790,7 @@ std::vector Path::getNodesAt(int startPosition, int endPosition) if (i > 0) pos -= m_edges[i-1]->getOverlap(); int nodeLen = m_nodes[i]->getLength(); - if (intersects(pos, pos+nodeLen, + if (intersects(pos, pos + nodeLen - 1, startPosition, endPosition)) res.push_back(m_nodes[i]); @@ -800,3 +799,43 @@ std::vector Path::getNodesAt(int startPosition, int endPosition) return res; } + +Path::MappingPath Path::getNodeCovering(int startPosition, int endPosition) const { + int pos = -m_startLocation.getPosition() + 1; // all UI positions are 1-based, convert to 0-based + + startPosition -= 1; endPosition -= 1; // all UI positions are 1-based, convert to 0-based + + MappingPath res; + for (size_t i = 0; i < m_nodes.size(); ++i) { + if (i > 0) + pos -= m_edges[i-1]->getOverlap(); + int nodeLen = m_nodes[i]->getLength(); + if (intersects(pos, pos + nodeLen - 1, + startPosition, endPosition)) { + MappingRange range; + // Determing the mapping range for a node + // Note that MappingRange is 1-based. + if (pos < startPosition) { + range.initial_range.from = startPosition + 1; + range.mapped_range.from = startPosition - pos + 1; + } else { + range.initial_range.from = pos + 1; + range.mapped_range.from = 1; + } + + if (pos + nodeLen - 1 > endPosition) { + range.initial_range.to = endPosition + 1; + range.mapped_range.to = endPosition - pos + 1; + } else { + range.initial_range.to = pos + nodeLen; + range.mapped_range.to = nodeLen; + } + + res.emplace_back(m_nodes[i], range); + } + + pos += nodeLen; + } + + return res; +} diff --git a/graph/path.h b/graph/path.h index a491c246..ce46325d 100644 --- a/graph/path.h +++ b/graph/path.h @@ -31,9 +31,22 @@ class DeBruijnNode; class DeBruijnEdge; class AssemblyGraph; -class Path -{ +class Path { public: + // [from, to] since UI does this + struct Range { + int from; int to; + }; + + struct MappingRange { + // on genome/contig/whatever + Range initial_range; + // on node + Range mapped_range; + }; + + using MappingPath = std::vector>; + //CREATORS Path() {} Path(GraphLocation location); @@ -55,7 +68,8 @@ class Path bool haveSameNodes(const Path& other) const; bool hasNodeSubset(const Path& other) const; [[nodiscard]] QByteArray getPathSequence() const; - [[nodiscard]] QString getFasta() const; + [[nodiscard]] QByteArray getFasta(QString name = "") const; + [[nodiscard]] QByteArray getAAFasta(unsigned shift, QString name = "") const; [[nodiscard]] QString getString(bool spaces) const; int getLength() const; QList extendPathInAllPossibleWays() const; @@ -64,6 +78,8 @@ class Path std::vector getPosition(const DeBruijnNode *node) const; std::vector getNodesAt(int startPosition, int endPosition) const; + MappingPath getNodeCovering(int startPosition, int endPosition) const; + bool containsNode(const DeBruijnNode * node) const; bool containsEntireNode(const DeBruijnNode * node) const; bool isInMiddleOfPath(const DeBruijnNode * node) const; diff --git a/graphsearch/blast/blastsearch.cpp b/graphsearch/blast/blastsearch.cpp index 7c66b91e..b7ca8c87 100644 --- a/graphsearch/blast/blastsearch.cpp +++ b/graphsearch/blast/blastsearch.cpp @@ -52,9 +52,9 @@ bool BlastSearch::findTools() { return true; } -QString BlastSearch::buildDatabase(const AssemblyGraph &graph) { +QString BlastSearch::buildDatabase(const AssemblyGraph &graph, bool includePaths) { DbBuildFinishedRAII watcher(this); - + m_lastError = ""; if (!findTools()) return m_lastError; @@ -68,14 +68,24 @@ QString BlastSearch::buildDatabase(const AssemblyGraph &graph) { if (!file.open(QIODevice::WriteOnly | QIODevice::Text)) return (m_lastError = "Failed to open: " + file.fileName()); - QTextStream out(&file); - for (const auto *node : graph.m_deBruijnGraphNodes) { - if (m_cancelBuildDatabase) - return (m_lastError = "Build cancelled."); + { + QTextStream out(&file); + for (const auto *node : graph.m_deBruijnGraphNodes) { + if (m_cancelBuildDatabase) + return (m_lastError = "Build cancelled."); + + out << node->getFasta(true, false, false); + } + + if (includePaths) { + for (auto it = graph.m_deBruijnGraphPaths.begin(); it != graph.m_deBruijnGraphPaths.end(); ++it) { + if (m_cancelBuildDatabase) + return (m_lastError = "Build cancelled."); - out << node->getFasta(true, false, false); + out << it.value()->getFasta(it.key().c_str()); + } + } } - file.close(); // Make sure the graph has sequences bool atLeastOneSequence = false; @@ -182,7 +192,7 @@ static void buildHitsFromBlastOutput(QString blastOutput, Queries &queries); QString BlastSearch::doSearch(Queries &queries, QString extraParameters) { GraphSearchFinishedRAII watcher(this); - + m_lastError = ""; if (!findTools()) return m_lastError; @@ -222,10 +232,11 @@ QString BlastSearch::doSearch(Queries &queries, QString extraParameters) { //This function carries out the entire BLAST search procedure automatically, without user input. //It returns an error string which is empty if all goes well. QString BlastSearch::doAutoGraphSearch(const AssemblyGraph &graph, QString queriesFilename, + bool includePaths, QString extraParameters) { cleanUp(); - QString maybeError = buildDatabase(graph); // It is expected that buildDatabase will setup last error as well + QString maybeError = buildDatabase(graph, includePaths); // It is expected that buildDatabase will setup last error as well if (!maybeError.isEmpty()) return maybeError; @@ -315,8 +326,8 @@ static QString getNodeNameFromString(const QString &nodeString) { // BLAST search) to construct the Hit objects. // It looks at the filters to possibly exclude hits which fail to meet user- // defined thresholds. -static void buildHitsFromBlastOutput(QString blastOutput, - Queries &queries) { +void BlastSearch::buildHitsFromBlastOutput(QString blastOutput, + Queries &queries) const { QStringList blastHitList = blastOutput.split("\n", Qt::SkipEmptyParts); for (const auto &hitString : blastHitList) { @@ -338,27 +349,11 @@ static void buildHitsFromBlastOutput(QString blastOutput, SciNot eValue(alignmentParts[10]); double bitScore = alignmentParts[11].toDouble(); - //Only save BLAST hits that are on forward strands. - if (nodeStart > nodeEnd) - continue; - - QString nodeName = getNodeNameFromString(nodeLabel); - DeBruijnNode *node = nullptr; - auto it = g_assemblyGraph->m_deBruijnGraphNodes.find(nodeName.toStdString()); - if (it == g_assemblyGraph->m_deBruijnGraphNodes.end()) - continue; - - node = it.value(); - Query *query = queries.getQueryFromName(queryName); if (query == nullptr) continue; // Check the user-defined filters. - if (g_settings->blastAlignmentLengthFilter.on && - alignmentLength < g_settings->blastAlignmentLengthFilter) - continue; - if (g_settings->blastIdentityFilter.on && percentIdentity < g_settings->blastIdentityFilter) continue; @@ -371,16 +366,25 @@ static void buildHitsFromBlastOutput(QString blastOutput, bitScore < g_settings->blastBitScoreFilter) continue; - Hit hit(query, node, percentIdentity, alignmentLength, - numberMismatches, numberGapOpens, queryStart, queryEnd, - nodeStart, nodeEnd, eValue, bitScore); - - if (g_settings->blastQueryCoverageFilter.on) { - double hitCoveragePercentage = 100.0 * hit.getQueryCoverageFraction(); - if (hitCoveragePercentage < g_settings->blastQueryCoverageFilter) + auto nodeIt = g_assemblyGraph->m_deBruijnGraphNodes.find(getNodeNameFromString(nodeLabel).toStdString()); + if (nodeIt != g_assemblyGraph->m_deBruijnGraphNodes.end()) { + // Only save BLAST hits that are on forward strands. + if (nodeStart > nodeEnd) continue; + + addNodeHit(query, nodeIt.value(), + queryStart, queryEnd, + nodeStart, nodeEnd, + percentIdentity, numberMismatches, numberGapOpens, + alignmentLength, eValue, bitScore); + } + + auto pathIt = g_assemblyGraph->m_deBruijnGraphPaths.find(nodeLabel.toStdString()); + if (pathIt != g_assemblyGraph->m_deBruijnGraphPaths.end()) { + addPathHit(query, pathIt.value(), + queryStart, queryEnd, + nodeStart, nodeEnd); } - query->addHit(hit); } } diff --git a/graphsearch/blast/blastsearch.h b/graphsearch/blast/blastsearch.h index a25279ee..e1fdc2f3 100644 --- a/graphsearch/blast/blastsearch.h +++ b/graphsearch/blast/blastsearch.h @@ -29,6 +29,9 @@ class QProcess; +namespace search { +class Queries; + class BlastSearch : public search::GraphSearch { Q_OBJECT public: @@ -37,9 +40,11 @@ class BlastSearch : public search::GraphSearch { QString doAutoGraphSearch(const AssemblyGraph &graph, QString queriesFilename, + bool includePaths = false, QString extraParameters = "") override; int loadQueriesFromFile(QString fullFileName) override; - QString buildDatabase(const AssemblyGraph &graph) override; + QString buildDatabase(const AssemblyGraph &graph, + bool includePaths = true) override; QString doSearch(QString extraParameters) override; QString doSearch(search::Queries &queries, QString extraParameters) override; @@ -53,6 +58,8 @@ public slots: private: bool findTools(); + void buildHitsFromBlastOutput(QString blastOutput, + Queries &queries) const; QString runOneBlastSearch(search::QuerySequenceType sequenceType, const search::Queries &queries, @@ -63,3 +70,5 @@ public slots: QProcess *m_buildDb = nullptr, *m_doSearch = nullptr; QString m_makeblastdbCommand, m_blastnCommand, m_tblastnCommand; }; + +} diff --git a/graphsearch/graphsearch.cpp b/graphsearch/graphsearch.cpp index b0ce9655..1aef3e96 100644 --- a/graphsearch/graphsearch.cpp +++ b/graphsearch/graphsearch.cpp @@ -17,7 +17,10 @@ #include "graphsearch.h" #include "graph/annotationsmanager.h" + +#include "graph/assemblygraph.h" #include "program/globals.h" +#include "program/settings.h" #include #include @@ -131,3 +134,63 @@ GraphSearch::DbBuildFinishedRAII::~DbBuildFinishedRAII() { GraphSearch::GraphSearchFinishedRAII::~GraphSearchFinishedRAII() { emit m_search->finishedSearch(m_search->lastError()); } + + +void GraphSearch::addNodeHit(Query *query, DeBruijnNode *node, + int queryStart, int queryEnd, + int nodeStart, int nodeEnd, + double percentIdentity, + int numberMismatches, int numberGapOpens, + int alignmentLength, SciNot eValue, double bitScore) { + if (g_settings->blastAlignmentLengthFilter.on && + alignmentLength < g_settings->blastAlignmentLengthFilter) + return;; + + Hit hit(query, node, + percentIdentity, alignmentLength, + numberMismatches, numberGapOpens, + queryStart, queryEnd, + nodeStart, nodeEnd, eValue, bitScore); + + if (g_settings->blastQueryCoverageFilter.on) { + double hitCoveragePercentage = 100.0 * hit.getQueryCoverageFraction(); + if (hitCoveragePercentage < g_settings->blastQueryCoverageFilter) + return; + } + + query->addHit(hit); +} + +void GraphSearch::addPathHit(Query *query, Path *path, + int queryStart, int queryEnd, + int pathStart, int pathEnd) { + bool invert = pathStart > pathEnd; + if (invert) + std::swap(pathStart, pathEnd); + + for (auto entry: path->getNodeCovering(pathStart, pathEnd)) { + DeBruijnNode *node = entry.first; + int nodeStart = entry.second.mapped_range.from, nodeEnd = entry.second.mapped_range.to; + if (invert) { + int nodeLength = node->getLength(); + node = node->getReverseComplement(); + nodeEnd = nodeLength - nodeEnd + 1; + nodeStart = nodeLength - nodeStart + 1; + std::swap(nodeEnd, nodeStart); + } + + Hit hit(query, node, -1, nodeEnd - nodeStart + 1, + -1, -1, + queryStart, queryEnd, + nodeStart, nodeEnd, + 0, 0); + + if (g_settings->blastQueryCoverageFilter.on) { + double hitCoveragePercentage = 100.0 * hit.getQueryCoverageFraction(); + if (hitCoveragePercentage < g_settings->blastQueryCoverageFilter) + continue; + } + + query->addHit(hit); + } +} diff --git a/graphsearch/graphsearch.h b/graphsearch/graphsearch.h index 4d17e46d..1cfaa868 100644 --- a/graphsearch/graphsearch.h +++ b/graphsearch/graphsearch.h @@ -82,10 +82,11 @@ class GraphSearch : public QObject { void emptyTempDirectory() const; virtual int loadQueriesFromFile(QString fullFileName) = 0; - virtual QString buildDatabase(const AssemblyGraph &graph) = 0; + virtual QString buildDatabase(const AssemblyGraph &graph, bool includePaths = true) = 0; virtual QString doSearch(QString extraParameters) = 0; virtual QString doSearch(search::Queries &queries, QString extraParameters) = 0; virtual QString doAutoGraphSearch(const AssemblyGraph &graph, QString queriesFilename, + bool includePaths = false, QString extraParameters = "") = 0; [[nodiscard]] virtual QString name() const = 0; [[nodiscard]] virtual QString queryFormat() const = 0; @@ -95,6 +96,18 @@ class GraphSearch : public QObject { static std::unique_ptr get(GraphSearchKind kind, const QDir &workDir = QDir::temp(), QObject *parent = nullptr); +protected: + static void addNodeHit(Query *query, DeBruijnNode *node, + int queryStart, int queryEnd, + int nodeStart, int nodeEnd, + double percentIdentity, + int numberMismatches, int numberGapOpens, + int alignmentLength, SciNot eValue, double bitScore); + + static void addPathHit(Query *query, Path *path, + int queryStart, int queryEnd, + int pathStart, int pathEnd); + public slots: virtual void cancelDatabaseBuild() {}; virtual void cancelSearch() {}; diff --git a/graphsearch/hmmer/hmmersearch.cpp b/graphsearch/hmmer/hmmersearch.cpp index c58e45f5..1ace30db 100644 --- a/graphsearch/hmmer/hmmersearch.cpp +++ b/graphsearch/hmmer/hmmersearch.cpp @@ -20,6 +20,7 @@ #include "graph/debruijnnode.h" #include "graphsearch/query.h" +#include "program/globals.h" #include "program/settings.h" #include "graph/assemblygraph.h" @@ -52,7 +53,7 @@ bool HmmerSearch::findTools() { return true; } -QString HmmerSearch::buildDatabase(const AssemblyGraph &graph) { +QString HmmerSearch::buildDatabase(const AssemblyGraph &graph, bool includePaths) { DbBuildFinishedRAII watcher(this); m_lastError = ""; @@ -93,6 +94,15 @@ QString HmmerSearch::buildDatabase(const AssemblyGraph &graph) { out << node->getFasta(true, false, false); } + + if (includePaths) { + for (auto it = graph.m_deBruijnGraphPaths.begin(); it != graph.m_deBruijnGraphPaths.end(); ++it) { + if (m_cancelBuildDatabase) + return (m_lastError = "Build cancelled."); + + out << it.value()->getFasta(it.key().c_str()); + } + } } // No need to perform empty checks for AAs as they all are handled above @@ -158,11 +168,6 @@ static QString getNodeNameFromString(const QString &nodeString) { return nodeName; } -static void buildHitsFromTblOut(QString hmmerOutput, - Queries &queries); -static void buildHitsFromDomTblOut(QString hmmerOutput, - Queries &queries); - QString HmmerSearch::doSearch(Queries &queries, QString extraParameters) { GraphSearchFinishedRAII watcher(this); m_lastError = ""; @@ -254,10 +259,11 @@ QString HmmerSearch::doOneSearch(search::QuerySequenceType sequenceType, } QString HmmerSearch::doAutoGraphSearch(const AssemblyGraph &graph, QString queriesFilename, - QString extraParameters) { + bool includePaths, + QString extraParameters) { cleanUp(); - QString maybeError = buildDatabase(graph); // It is expected that buildDatabase will setup last error as well + QString maybeError = buildDatabase(graph, includePaths); // It is expected that buildDatabase will setup last error as well if (!maybeError.isEmpty()) return maybeError; @@ -316,9 +322,9 @@ void HmmerSearch::cancelSearch() { m_doSearch->kill(); } -static void buildHitsFromTblOut(QString hmmerOutput, - Queries &queries) { - QStringList hmmerHitList = hmmerOutput.split("\n", Qt::SkipEmptyParts); +void HmmerSearch::buildHitsFromTblOut(QString hmmerOutput, + Queries &queries) const { + QStringList hmmerHitList = hmmerOutput.split('\n', Qt::SkipEmptyParts); for (const auto &hitString : hmmerHitList) { if (hitString.startsWith('#')) @@ -331,10 +337,6 @@ static void buildHitsFromTblOut(QString hmmerOutput, QString nodeLabel = alignmentParts[0]; QString queryName = alignmentParts[2]; - double percentIdentity = -1; - int numberMismatches = -1; - int numberGapOpens = -1; - int queryStart = alignmentParts[4].toInt(); int queryEnd = alignmentParts[5].toInt(); @@ -346,27 +348,11 @@ static void buildHitsFromTblOut(QString hmmerOutput, SciNot eValue(alignmentParts[12]); double bitScore = alignmentParts[13].toDouble(); - // Only save hits that are on forward strands. - if (nodeStart > nodeEnd) - continue; - - QString nodeName = getNodeNameFromString(nodeLabel); - DeBruijnNode *node = nullptr; - auto it = g_assemblyGraph->m_deBruijnGraphNodes.find(nodeName.toStdString()); - if (it == g_assemblyGraph->m_deBruijnGraphNodes.end()) - continue; - - node = it.value(); - Query *query = queries.getQueryFromName(queryName); if (query == nullptr) continue; // Check the user-defined filters. - if (g_settings->blastAlignmentLengthFilter.on && - alignmentLength < g_settings->blastAlignmentLengthFilter) - continue; - if (g_settings->blastEValueFilter.on && eValue > g_settings->blastEValueFilter) continue; @@ -375,22 +361,30 @@ static void buildHitsFromTblOut(QString hmmerOutput, bitScore < g_settings->blastBitScoreFilter) continue; - Hit hit(query, node, percentIdentity, alignmentLength, - numberMismatches, numberGapOpens, queryStart, queryEnd, - nodeStart, nodeEnd, eValue, bitScore); - - if (g_settings->blastQueryCoverageFilter.on) { - double hitCoveragePercentage = 100.0 * hit.getQueryCoverageFraction(); - if (hitCoveragePercentage < g_settings->blastQueryCoverageFilter) + auto nodeIt = g_assemblyGraph->m_deBruijnGraphNodes.find(getNodeNameFromString(nodeLabel).toStdString()); + if (nodeIt != g_assemblyGraph->m_deBruijnGraphNodes.end()) { + // Only save hits that are on forward strands. + if (nodeStart > nodeEnd) continue; + + addNodeHit(query, nodeIt.value(), + queryStart, queryEnd, + nodeStart, nodeEnd, + -1, -1, -1, + alignmentLength, eValue, bitScore); } - query->addHit(hit); + auto pathIt = g_assemblyGraph->m_deBruijnGraphPaths.find(nodeLabel.toStdString()); + if (pathIt != g_assemblyGraph->m_deBruijnGraphPaths.end()) { + addPathHit(query, pathIt.value(), + queryStart, queryEnd, + nodeStart, nodeEnd); + } } } -static void buildHitsFromDomTblOut(QString hmmerOutput, - Queries &queries) { +void HmmerSearch::buildHitsFromDomTblOut(QString hmmerOutput, + Queries &queries) const { QStringList hmmerHitList = hmmerOutput.split("\n", Qt::SkipEmptyParts); for (const auto &hitString : hmmerHitList) { @@ -419,35 +413,11 @@ static void buildHitsFromDomTblOut(QString hmmerOutput, SciNot eValue(alignmentParts[6]); double bitScore = alignmentParts[7].toDouble(); - // Only save hits that are on forward strands. - if (nodeStart > nodeEnd) - continue; - - QString nodeName = getNodeNameFromString(nodeLabel); - DeBruijnNode *node = nullptr; - auto it = g_assemblyGraph->m_deBruijnGraphNodes.find(nodeName.toStdString()); - if (it == g_assemblyGraph->m_deBruijnGraphNodes.end()) - continue; - - node = it.value(); - bool ok = false; - unsigned shift = nodeLabel.last(1).toInt(&ok); - if (!ok || shift > 2) - continue; - - nodeStart = (nodeStart - 1) * 3 + shift + 1; - nodeEnd = (nodeEnd - 1) * 3 + shift + 1; - - Query *query = queries.getQueryFromName(queryName); if (query == nullptr) continue; // Check the user-defined filters. - if (g_settings->blastAlignmentLengthFilter.on && - alignmentLength < g_settings->blastAlignmentLengthFilter) - continue; - if (g_settings->blastEValueFilter.on && eValue > g_settings->blastEValueFilter) continue; @@ -456,16 +426,25 @@ static void buildHitsFromDomTblOut(QString hmmerOutput, bitScore < g_settings->blastBitScoreFilter) continue; - Hit hit(query, node, percentIdentity, alignmentLength, - numberMismatches, numberGapOpens, queryStart, queryEnd, - nodeStart, nodeEnd, eValue, bitScore); + auto nodeIt = g_assemblyGraph->m_deBruijnGraphNodes.find(getNodeNameFromString(nodeLabel).toStdString()); + if (nodeIt != g_assemblyGraph->m_deBruijnGraphNodes.end()) { + // Only save hits that are on forward strands. + if (nodeStart > nodeEnd) + continue; - if (g_settings->blastQueryCoverageFilter.on) { - double hitCoveragePercentage = 100.0 * hit.getQueryCoverageFraction(); - if (hitCoveragePercentage < g_settings->blastQueryCoverageFilter) + bool ok = false; + unsigned shift = nodeLabel.last(1).toInt(&ok); + if (!ok || shift > 2) continue; - } - query->addHit(hit); + nodeStart = (nodeStart - 1) * 3 + shift + 1; + nodeEnd = (nodeEnd - 1) * 3 + shift + 1; + + addNodeHit(query, nodeIt.value(), + queryStart, queryEnd, + nodeStart, nodeEnd, + -1, -1, -1, + alignmentLength, eValue, bitScore); + } } } diff --git a/graphsearch/hmmer/hmmersearch.h b/graphsearch/hmmer/hmmersearch.h index 07b38300..8a58e577 100644 --- a/graphsearch/hmmer/hmmersearch.h +++ b/graphsearch/hmmer/hmmersearch.h @@ -24,7 +24,11 @@ class QProcess; -class HmmerSearch : public search::GraphSearch { +namespace search { + +class Queries; + +class HmmerSearch : public GraphSearch { Q_OBJECT public: explicit HmmerSearch(const QDir &workDir = QDir::temp(), QObject *parent = nullptr); @@ -32,9 +36,11 @@ class HmmerSearch : public search::GraphSearch { QString doAutoGraphSearch(const AssemblyGraph &graph, QString queriesFilename, + bool includePaths = false, QString extraParameters = "") override; int loadQueriesFromFile(QString fullFileName) override; - QString buildDatabase(const AssemblyGraph &graph) override; + QString buildDatabase(const AssemblyGraph &graph, + bool includePaths = true) override; QString doSearch(QString extraParameters) override; QString doSearch(search::Queries &queries, QString extraParameters) override; @@ -49,6 +55,10 @@ public slots: private: bool findTools(); + void buildHitsFromTblOut(QString hmmerOutput, + Queries &queries) const; + void buildHitsFromDomTblOut(QString hmmerOutput, + Queries &queries) const; QString doOneSearch(search::QuerySequenceType sequenceType, search::Queries &queries, QString extraParameters); @@ -58,3 +68,5 @@ public slots: QProcess *m_buildDb = nullptr, *m_doSearch = nullptr; QString m_nhmmerCommand, m_hmmerCommand; }; + +} diff --git a/graphsearch/minimap2/minimap2search.cpp b/graphsearch/minimap2/minimap2search.cpp index 6553c50e..f3f616aa 100644 --- a/graphsearch/minimap2/minimap2search.cpp +++ b/graphsearch/minimap2/minimap2search.cpp @@ -42,7 +42,7 @@ bool Minimap2Search::findTools() { return true; } -QString Minimap2Search::buildDatabase(const AssemblyGraph &graph) { +QString Minimap2Search::buildDatabase(const AssemblyGraph &graph, bool includePaths) { DbBuildFinishedRAII watcher(this); m_lastError = ""; if (!findTools()) @@ -57,14 +57,24 @@ QString Minimap2Search::buildDatabase(const AssemblyGraph &graph) { if (!file.open(QIODevice::WriteOnly | QIODevice::Text)) return (m_lastError = "Failed to open: " + file.fileName()); - QTextStream out(&file); - for (const auto *node : graph.m_deBruijnGraphNodes) { - if (m_cancelBuildDatabase) - return (m_lastError = "Build cancelled."); + { + QTextStream out(&file); + for (const auto *node : graph.m_deBruijnGraphNodes) { + if (m_cancelBuildDatabase) + return (m_lastError = "Build cancelled."); + + out << node->getFasta(true, false, false); + } + + if (includePaths) { + for (auto it = graph.m_deBruijnGraphPaths.begin(); it != graph.m_deBruijnGraphPaths.end(); ++it) { + if (m_cancelBuildDatabase) + return (m_lastError = "Build cancelled."); - out << node->getFasta(true, false, false); + out << it.value()->getFasta(it.key().c_str()); + } + } } - file.close(); // Make sure the graph has sequences bool atLeastOneSequence = false; @@ -141,8 +151,8 @@ static QString getNodeNameFromString(const QString &nodeString) { return nodeName; } -static void buildHitsFromPAF(const QString &PAF, - Queries &queries) { +void Minimap2Search::buildHitsFromPAF(const QString &PAF, + Queries &queries) const { QStringList hitList = PAF.split("\n", Qt::SkipEmptyParts); for (const auto &hitString : hitList) { @@ -161,44 +171,29 @@ static void buildHitsFromPAF(const QString &PAF, int alignmentLength = alignmentParts[10].toInt(); - double percentIdentity = -1; - int numberMismatches = -1; - int numberGapOpens = -1; - - SciNot eValue(0); - double bitScore = 0; - - if (!strand) - continue; - - QString nodeName = getNodeNameFromString(nodeLabel); - DeBruijnNode *node = nullptr; - auto it = g_assemblyGraph->m_deBruijnGraphNodes.find(nodeName.toStdString()); - if (it == g_assemblyGraph->m_deBruijnGraphNodes.end()) - continue; - - node = it.value(); - Query *query = queries.getQueryFromName(queryName); if (query == nullptr) continue; - // Check the user-defined filters. - if (g_settings->blastAlignmentLengthFilter.on && - alignmentLength < g_settings->blastAlignmentLengthFilter) - continue; + auto nodeIt = g_assemblyGraph->m_deBruijnGraphNodes.find(getNodeNameFromString(nodeLabel).toStdString()); + if (nodeIt != g_assemblyGraph->m_deBruijnGraphNodes.end()) { + if (!strand) + continue; - Hit hit(query, node, percentIdentity, alignmentLength, - numberMismatches, numberGapOpens, queryStart, queryEnd, - nodeStart, nodeEnd, eValue, bitScore); + addNodeHit(query, nodeIt.value(), + queryStart, queryEnd, + nodeStart, nodeEnd, + -1, -1, -1, + alignmentLength, 0, 0); + } - if (g_settings->blastQueryCoverageFilter.on) { - double hitCoveragePercentage = 100.0 * hit.getQueryCoverageFraction(); - if (hitCoveragePercentage < g_settings->blastQueryCoverageFilter) - continue; + auto pathIt = g_assemblyGraph->m_deBruijnGraphPaths.find(nodeLabel.toStdString()); + if (pathIt != g_assemblyGraph->m_deBruijnGraphPaths.end()) { + addPathHit(query, pathIt.value(), + queryStart, queryEnd, + nodeStart, nodeEnd); } - query->addHit(hit); } } @@ -267,10 +262,11 @@ QString Minimap2Search::doSearch(Queries &queries, QString extraParameters) { } QString Minimap2Search::doAutoGraphSearch(const AssemblyGraph &graph, QString queriesFilename, + bool includePaths, QString extraParameters) { cleanUp(); - QString maybeError = buildDatabase(graph); // It is expected that buildDatabase will setup last error as well + QString maybeError = buildDatabase(graph, includePaths); // It is expected that buildDatabase will setup last error as well if (!maybeError.isEmpty()) return maybeError; diff --git a/graphsearch/minimap2/minimap2search.h b/graphsearch/minimap2/minimap2search.h index 9c6f2414..0bd3f882 100644 --- a/graphsearch/minimap2/minimap2search.h +++ b/graphsearch/minimap2/minimap2search.h @@ -24,6 +24,10 @@ class QProcess; +namespace search { + +class Queries; + class Minimap2Search : public search::GraphSearch { Q_OBJECT public: @@ -32,9 +36,11 @@ class Minimap2Search : public search::GraphSearch { QString doAutoGraphSearch(const AssemblyGraph &graph, QString queriesFilename, + bool includePaths = false, QString extraParameters = "") override; int loadQueriesFromFile(QString fullFileName) override; - QString buildDatabase(const AssemblyGraph &graph) override; + QString buildDatabase(const AssemblyGraph &graph, + bool includePaths = true) override; QString doSearch(QString extraParameters) override; QString doSearch(search::Queries &queries, QString extraParameters) override; @@ -48,9 +54,12 @@ public slots: private: bool findTools(); + void buildHitsFromPAF(const QString &PAF, Queries &queries) const; bool m_cancelBuildDatabase = false, m_cancelSearch = false; QProcess *m_buildDb = nullptr, *m_doSearch = nullptr; QString m_minimap2Command; }; + +} diff --git a/graphsearch/query.h b/graphsearch/query.h index 2580133e..373abf50 100644 --- a/graphsearch/query.h +++ b/graphsearch/query.h @@ -19,7 +19,7 @@ #pragma once #include "querypath.h" -#include "graphsearch/hit.h" +#include "hit.h" #include #include @@ -68,6 +68,7 @@ namespace search { void setAsSearchedFor() { m_searchedFor = true; } void findQueryPaths(); + void addQueryPath(QueryPath path) { m_paths.emplace_back(path); } void setColour(QColor newColour) { m_colour = newColour; } void setShown(bool newShown) { m_shown = newShown; } diff --git a/graphsearch/querypath.cpp b/graphsearch/querypath.cpp index f42958b5..478d1f22 100644 --- a/graphsearch/querypath.cpp +++ b/graphsearch/querypath.cpp @@ -29,8 +29,8 @@ using namespace search; -QueryPath::QueryPath(Path path, Query *query) : - m_path(std::move(path)), m_query(query) { +QueryPath::QueryPath(Path path, Query *query) + : m_path(std::move(path)), m_query(query) { //This function follows the path, returning the BLAST hits it finds for the //query. It requires that the hits occur in order, i.e. that each hit in //the path begins later in the query than the previous hit. @@ -68,6 +68,9 @@ QueryPath::QueryPath(Path path, Query *query) : } } +QueryPath::QueryPath(Path path, Query *query, std::vector hits) + : m_path(std::move(path)), m_query(query), m_hits(std::move(hits)) {} + double QueryPath::getMeanHitPercIdentity() const { int totalHitLength = 0; double sum = 0.0; @@ -83,9 +86,6 @@ double QueryPath::getMeanHitPercIdentity() const { return totalHitLength == 0 ? 0.0 : sum / totalHitLength; } - - - //This function looks at all of the hits in the path for this query and //multiplies the e-values together. If the hits overlap each other, then //this function reduces the e-values accoringly (effectively to prevent @@ -124,8 +124,7 @@ SciNot QueryPath::getEvalueProduct() const { } -int QueryPath::getHitOverlap(const Hit * hit1, const Hit * hit2) const -{ +int QueryPath::getHitOverlap(const Hit * hit1, const Hit * hit2) const { int hit1Start, hit1End, hit2Start, hit2End; QPair possibleEdge(hit1->m_node, hit2->m_node); diff --git a/graphsearch/querypath.h b/graphsearch/querypath.h index 8042d475..6cd06fb4 100644 --- a/graphsearch/querypath.h +++ b/graphsearch/querypath.h @@ -28,8 +28,9 @@ namespace search { class QueryPath { public: - //CREATORS + // CREATORS QueryPath(Path path, Query *query); + QueryPath(Path path, Query *query, std::vector hits); //ACCESSORS Path getPath() const { return m_path; } @@ -56,4 +57,4 @@ namespace search { int getHitQueryLength() const; int getHitOverlap(const Hit *hit1, const Hit *hit2) const; }; -} \ No newline at end of file +} diff --git a/program/globals.cpp b/program/globals.cpp index 6567c863..46ecae1f 100644 --- a/program/globals.cpp +++ b/program/globals.cpp @@ -24,7 +24,7 @@ QSharedPointer g_settings; QSharedPointer g_memory; BandageGraphicsView * g_graphicsView; double g_absoluteZoom; -QSharedPointer g_blastSearch; +QSharedPointer g_blastSearch; QSharedPointer g_assemblyGraph; std::shared_ptr g_annotationsManager; @@ -82,4 +82,4 @@ QString formatDepthForDisplay(double depth) { decimals += 1; } return formatDoubleForDisplay(depth, decimals) + "x"; -} \ No newline at end of file +} diff --git a/program/globals.h b/program/globals.h index 7732421f..b15600b2 100644 --- a/program/globals.h +++ b/program/globals.h @@ -27,10 +27,13 @@ class Settings; class Memory; class BandageGraphicsView; -class BlastSearch; class AssemblyGraph; class AnnotationsManager; +namespace search { +class BlastSearch; +}; + // Some of the program's common components are made global, so they don't have // to be passed around as parameters. // FIXME: This is bad. Factor this out when possible @@ -38,7 +41,7 @@ extern QSharedPointer g_settings; extern QSharedPointer g_memory; extern BandageGraphicsView * g_graphicsView; extern double g_absoluteZoom; -extern QSharedPointer g_blastSearch; +extern QSharedPointer g_blastSearch; extern QSharedPointer g_assemblyGraph; extern std::shared_ptr g_annotationsManager; diff --git a/program/main.cpp b/program/main.cpp index 8666e959..d3a8335b 100644 --- a/program/main.cpp +++ b/program/main.cpp @@ -111,7 +111,7 @@ int main(int argc, char *argv[]) //Create the important global objects. g_settings.reset(new Settings()); g_memory.reset(new Memory()); - g_blastSearch.reset(new BlastSearch()); + g_blastSearch.reset(new search::BlastSearch()); g_assemblyGraph.reset(new AssemblyGraph()); g_graphicsView = new BandageGraphicsView(); g_annotationsManager = std::make_shared(); diff --git a/tests/bandagetests.cpp b/tests/bandagetests.cpp index 69267f03..0a424856 100644 --- a/tests/bandagetests.cpp +++ b/tests/bandagetests.cpp @@ -89,7 +89,7 @@ private slots: void init() { g_settings.reset(new Settings()); g_memory.reset(new Memory()); - g_blastSearch.reset(new BlastSearch(QDir("."))); + g_blastSearch.reset(new search::BlastSearch(QDir("."))); g_assemblyGraph.reset(new AssemblyGraph()); g_annotationsManager = std::make_shared(); } From 899760e6268b2eced8fd25d97c0e5e30098a7553 Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Wed, 24 Aug 2022 14:11:13 +0200 Subject: [PATCH 2/6] Ensure Hit pointers are always stable --- graph/annotationsmanager.cpp | 10 +++++----- graphsearch/graphsearch.cpp | 4 ++-- graphsearch/queries.cpp | 12 ++++++------ graphsearch/queries.h | 4 ++-- graphsearch/query.cpp | 26 +++++++++++++------------- graphsearch/query.h | 11 +++++++++-- graphsearch/querypath.cpp | 6 +++--- graphsearch/querypath.h | 4 ++-- tests/bandagetests.cpp | 26 +++++++++++++------------- ui/dialogs/graphsearchdialog.cpp | 32 ++++++++++++++++---------------- ui/dialogs/graphsearchdialog.h | 8 ++++---- 11 files changed, 75 insertions(+), 68 deletions(-) diff --git a/graph/annotationsmanager.cpp b/graph/annotationsmanager.cpp index a16a6a0f..674fe2c3 100644 --- a/graph/annotationsmanager.cpp +++ b/graph/annotationsmanager.cpp @@ -96,14 +96,14 @@ void AnnotationsManager::updateGroupFromHits(const QString &name, const std::vec auto &group = createAnnotationGroup(name); for (auto *query: queries) { for (const auto &hit: query->getHits()) { - auto &annotation = group.annotationMap[hit.m_node].emplace_back( + auto &annotation = group.annotationMap[hit->m_node].emplace_back( std::make_unique( - hit.m_nodeStart, - hit.m_nodeEnd, + hit->m_nodeStart, + hit->m_nodeEnd, query->getName().toStdString())); annotation->addView(std::make_unique(1.0, query->getColour())); - annotation->addView(std::make_unique(hit.queryStartFraction(), - hit.queryEndFraction())); + annotation->addView(std::make_unique(hit->queryStartFraction(), + hit->queryEndFraction())); } } diff --git a/graphsearch/graphsearch.cpp b/graphsearch/graphsearch.cpp index 1aef3e96..e6a42eb5 100644 --- a/graphsearch/graphsearch.cpp +++ b/graphsearch/graphsearch.cpp @@ -158,7 +158,7 @@ void GraphSearch::addNodeHit(Query *query, DeBruijnNode *node, return; } - query->addHit(hit); + query->emplaceHit(hit); } void GraphSearch::addPathHit(Query *query, Path *path, @@ -191,6 +191,6 @@ void GraphSearch::addPathHit(Query *query, Path *path, continue; } - query->addHit(hit); + query->emplaceHit(hit); } } diff --git a/graphsearch/queries.cpp b/graphsearch/queries.cpp index f37bba62..c0098f6c 100644 --- a/graphsearch/queries.cpp +++ b/graphsearch/queries.cpp @@ -149,12 +149,12 @@ void Queries::findQueryPaths() { query->findQueryPaths(); } -std::vector Queries::allHits() const { - std::vector res; +Query::Hits Queries::allHits() const { + Query::Hits res; for (const auto *query : m_queries) { - const auto &hits = query->getHits(); - res.insert(res.end(), hits.begin(), hits.end()); + for (const auto &hit : query->getHits()) + res.push_back(hit.get()); } return res; @@ -171,12 +171,12 @@ std::vector Queries::getNodesFromHits(const QString& queryName) // Add pointers to nodes that have a hit for the selected target(s). for (auto *currentQuery: m_queries) { for (const auto &hit: currentQuery->getHits()) - returnVector.push_back(hit.m_node); + returnVector.push_back(hit->m_node); } } else { if (Query *query = getQueryFromName(queryName)) { for (const auto &hit: query->getHits()) - returnVector.push_back(hit.m_node); + returnVector.push_back(hit->m_node); } } diff --git a/graphsearch/queries.h b/graphsearch/queries.h index b5dcb284..f8063562 100644 --- a/graphsearch/queries.h +++ b/graphsearch/queries.h @@ -51,7 +51,7 @@ class Queries { size_t getQueryPathCount() const ; size_t getQueryCount(QuerySequenceType sequenceType) const; bool isQueryPresent(const Query * query) const; - std::vector allHits() const; + Query::Hits allHits() const; const auto &queries() const { return m_queries; } auto &queries() { return m_queries; } Query *query(size_t idx) { return m_queries[idx]; } @@ -67,4 +67,4 @@ class Queries { std::vector m_presetColours; }; -} \ No newline at end of file +} diff --git a/graphsearch/query.cpp b/graphsearch/query.cpp index bddaf1af..36e50718 100644 --- a/graphsearch/query.cpp +++ b/graphsearch/query.cpp @@ -88,24 +88,24 @@ void Query::findQueryPaths() { if (m_sequenceType == PROTEIN) queryLength *= 3; - //Find all possible path starts within an acceptable distance from the query - //start. - std::vector possibleStarts; + // Find all possible path starts within an acceptable distance from the query + // start. + Hits possibleStarts; double acceptableStartFraction = 1.0 - g_settings->minQueryCoveredByPath; - for (auto &hit : m_hits) { - if (hit.queryStartFraction() <= acceptableStartFraction) - possibleStarts.push_back(&hit); + for (const auto &hit : m_hits) { + if (hit->queryStartFraction() <= acceptableStartFraction) + possibleStarts.push_back(hit.get()); } - //Find all possible path ends. + // Find all possible path ends. std::vector possibleEnds; double acceptableEndFraction = g_settings->minQueryCoveredByPath; - for (auto &hit : m_hits) { - if (hit.queryEndFraction() >= acceptableEndFraction) - possibleEnds.push_back(&hit); + for (const auto &hit : m_hits) { + if (hit->queryEndFraction() >= acceptableEndFraction) + possibleEnds.push_back(hit.get()); } - //For each possible start, find paths to each possible end. + // For each possible start, find paths to each possible end. QList possiblePaths; for (auto start : possibleStarts) { GraphLocation startLocation = start->getHitStart(); @@ -213,7 +213,7 @@ void Query::findQueryPaths() { //If a list of BLAST hits is passed to the function, it only looks in those //hits. If no such list is passed, it looks in all hits for this query. // http://stackoverflow.com/questions/5276686/merging-ranges-in-c -double Query::fractionCoveredByHits(const std::vector &hitsToCheck) const { +double Query::fractionCoveredByHits(const Hits &hitsToCheck) const { int hitBases = 0; int queryLength = getLength(); if (queryLength == 0) @@ -222,7 +222,7 @@ double Query::fractionCoveredByHits(const std::vector &hitsToCheck) std::vector > ranges; if (hitsToCheck.empty()) { for (const auto &hit : m_hits) - ranges.emplace_back(hit.m_queryStart - 1, hit.m_queryEnd); + ranges.emplace_back(hit->m_queryStart - 1, hit->m_queryEnd); } else { for (auto *hit : hitsToCheck) { ranges.emplace_back(hit->m_queryStart - 1, hit->m_queryEnd); diff --git a/graphsearch/query.h b/graphsearch/query.h index 373abf50..5a6bd482 100644 --- a/graphsearch/query.h +++ b/graphsearch/query.h @@ -23,6 +23,7 @@ #include #include +#include #include namespace search { @@ -33,6 +34,8 @@ namespace search { class Query { public: + using Hits = std::vector; + //CREATORS Query(QString name, QString sequence, QByteArray aux = QByteArray()); @@ -62,7 +65,11 @@ namespace search { // MODIFIERS void setName(QString newName) { m_name = std::move(newName); } - void addHit(Hit newHit) { m_hits.emplace_back(newHit); } + template + void emplaceHit(Args&&... args) { + m_hits.emplace_back(new Hit(std::forward(args)...)); + } + void addHit(Hit *newHit) { m_hits.emplace_back(newHit); } void clearSearchResults(); void setAsSearchedFor() { m_searchedFor = true; } @@ -76,7 +83,7 @@ namespace search { QString m_name; QString m_sequence; QByteArray m_aux; - std::vector m_hits; + std::vector> m_hits; QuerySequenceType m_sequenceType; bool m_searchedFor = false; bool m_shown = true; diff --git a/graphsearch/querypath.cpp b/graphsearch/querypath.cpp index 478d1f22..f0812b20 100644 --- a/graphsearch/querypath.cpp +++ b/graphsearch/querypath.cpp @@ -40,10 +40,10 @@ QueryPath::QueryPath(Path path, Query *query) for (int i = 0; i < pathNodes.size(); ++i) { DeBruijnNode * node = pathNodes[i]; - std::vector hitsThisNode; + Query::Hits hitsThisNode; for (auto &queryHit : query->getHits()) { - if (queryHit.m_node == node) - hitsThisNode.push_back(&queryHit); + if (queryHit->m_node == node) + hitsThisNode.push_back(queryHit.get()); } std::sort(hitsThisNode.begin(), hitsThisNode.end(), diff --git a/graphsearch/querypath.h b/graphsearch/querypath.h index 6cd06fb4..1d655843 100644 --- a/graphsearch/querypath.h +++ b/graphsearch/querypath.h @@ -18,7 +18,7 @@ #pragma once -#include "graphsearch/hit.h" +#include "hit.h" #include "graph/path.h" #include "program/scinot.h" @@ -52,7 +52,7 @@ namespace search { private: Path m_path; Query *m_query; - std::vector m_hits; + std::vector m_hits; int getHitQueryLength() const; int getHitOverlap(const Hit *hit1, const Hit *hit2) const; diff --git a/tests/bandagetests.cpp b/tests/bandagetests.cpp index 0a424856..3e7a02c5 100644 --- a/tests/bandagetests.cpp +++ b/tests/bandagetests.cpp @@ -548,19 +548,19 @@ void BandageTests::blastSearch() const auto &one_insertionHit = one_insertion->getHits().at(0); const auto &one_deletionHit = one_deletion->getHits().at(0); - QCOMPARE(exactHit.m_numberMismatches, 0); - QCOMPARE(exactHit.m_numberGapOpens, 0); - QCOMPARE(one_mismatchHit.m_numberMismatches, 1); - QCOMPARE(one_mismatchHit.m_numberGapOpens, 0); - QCOMPARE(one_insertionHit.m_numberMismatches, 0); - QCOMPARE(one_insertionHit.m_numberGapOpens, 1); - QCOMPARE(one_deletionHit.m_numberMismatches, 0); - QCOMPARE(one_deletionHit.m_numberGapOpens, 1); - - QCOMPARE(exactHit.m_percentIdentity < 100.0, false); - QCOMPARE(one_mismatchHit.m_percentIdentity < 100.0, true); - QCOMPARE(one_insertionHit.m_percentIdentity < 100.0, true); - QCOMPARE(one_deletionHit.m_percentIdentity < 100.0, true); + QCOMPARE(exactHit->m_numberMismatches, 0); + QCOMPARE(exactHit->m_numberGapOpens, 0); + QCOMPARE(one_mismatchHit->m_numberMismatches, 1); + QCOMPARE(one_mismatchHit->m_numberGapOpens, 0); + QCOMPARE(one_insertionHit->m_numberMismatches, 0); + QCOMPARE(one_insertionHit->m_numberGapOpens, 1); + QCOMPARE(one_deletionHit->m_numberMismatches, 0); + QCOMPARE(one_deletionHit->m_numberGapOpens, 1); + + QCOMPARE(exactHit->m_percentIdentity < 100.0, false); + QCOMPARE(one_mismatchHit->m_percentIdentity < 100.0, true); + QCOMPARE(one_insertionHit->m_percentIdentity < 100.0, true); + QCOMPARE(one_deletionHit->m_percentIdentity < 100.0, true); } diff --git a/ui/dialogs/graphsearchdialog.cpp b/ui/dialogs/graphsearchdialog.cpp index a648e672..33372c96 100644 --- a/ui/dialogs/graphsearchdialog.cpp +++ b/ui/dialogs/graphsearchdialog.cpp @@ -821,7 +821,7 @@ QVariant HitsListModel::data(const QModelIndex &index, int role) const { auto column = HitsColumns(index.column()); const auto &hit = m_hits[index.row()]; - const auto &hitQuery = *hit.m_query; + const auto &hitQuery = *hit->m_query; if (role == Qt::BackgroundRole) { if (hitQuery.isHidden()) // Hide disabled queries @@ -839,35 +839,35 @@ QVariant HitsListModel::data(const QModelIndex &index, int role) const { case HitsColumns::QueryName: return hitQuery.getName(); case HitsColumns::NodeName: - return hit.m_node->getName(); + return hit->m_node->getName(); case HitsColumns::PercentIdentity: - if (hit.m_percentIdentity > 0) - return formatDoubleForDisplay(hit.m_percentIdentity, 2) + "%"; + if (hit->m_percentIdentity > 0) + return formatDoubleForDisplay(hit->m_percentIdentity, 2) + "%"; return "N/A"; case HitsColumns::AlignmentLength: - return hit.m_alignmentLength; + return hit->m_alignmentLength; case HitsColumns::QueryCover: - return formatDoubleForDisplay(100.0 * hit.getQueryCoverageFraction(), 2) + "%"; + return formatDoubleForDisplay(100.0 * hit->getQueryCoverageFraction(), 2) + "%"; case HitsColumns::Mismatches: - if (hit.m_numberMismatches < 0) + if (hit->m_numberMismatches < 0) return "N/A"; - return hit.m_numberMismatches; + return hit->m_numberMismatches; case HitsColumns::GapOpens: - if (hit.m_numberGapOpens < 0) + if (hit->m_numberGapOpens < 0) return "N/A"; - return hit.m_numberGapOpens; + return hit->m_numberGapOpens; case HitsColumns::QueryStart: - return hit.m_queryStart; + return hit->m_queryStart; case HitsColumns::QueryEnd: - return hit.m_queryEnd; + return hit->m_queryEnd; case HitsColumns::NodeStart: - return hit.m_nodeStart; + return hit->m_nodeStart; case HitsColumns::NodeEnd: - return hit.m_nodeEnd; + return hit->m_nodeEnd; case HitsColumns::Evalue: - return hit.m_eValue.asString(false); + return hit->m_eValue.asString(false); case HitsColumns::BitScore: - return hit.m_bitScore; + return hit->m_bitScore; } return {}; diff --git a/ui/dialogs/graphsearchdialog.h b/ui/dialogs/graphsearchdialog.h index c9811827..5faecfd5 100644 --- a/ui/dialogs/graphsearchdialog.h +++ b/ui/dialogs/graphsearchdialog.h @@ -19,6 +19,8 @@ #pragma once +#include "graphsearch/query.h" + #include #include #include @@ -30,8 +32,6 @@ namespace search { class GraphSearch; } -using Hits = std::vector; - namespace Ui { class GraphSearchDialog; } @@ -100,7 +100,7 @@ class HitsListModel : public QAbstractTableModel { void endUpdate() { endResetModel(); } bool empty() const { return m_hits.empty(); } - Hits m_hits; + search::Query::Hits m_hits; }; class GraphSearchDialog : public QDialog { @@ -147,4 +147,4 @@ private slots: signals: void changed(); void queryPathSelectionChanged(); -}; \ No newline at end of file +}; From 3acfa00cb118e844faba114000d19add23ec73aa Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Wed, 24 Aug 2022 14:36:03 +0200 Subject: [PATCH 3/6] Simplify filtering --- graphsearch/blast/blastsearch.cpp | 11 +++++++ graphsearch/graphsearch.cpp | 41 ++++++------------------- graphsearch/hit.cpp | 15 ++++++--- graphsearch/hit.h | 4 ++- graphsearch/hmmer/hmmersearch.cpp | 22 +++++++++++++ graphsearch/minimap2/minimap2search.cpp | 11 +++++++ 6 files changed, 68 insertions(+), 36 deletions(-) diff --git a/graphsearch/blast/blastsearch.cpp b/graphsearch/blast/blastsearch.cpp index b7ca8c87..8f933612 100644 --- a/graphsearch/blast/blastsearch.cpp +++ b/graphsearch/blast/blastsearch.cpp @@ -366,6 +366,17 @@ void BlastSearch::buildHitsFromBlastOutput(QString blastOutput, bitScore < g_settings->blastBitScoreFilter) continue; + if (g_settings->blastAlignmentLengthFilter.on && + alignmentLength < g_settings->blastAlignmentLengthFilter) + continue; + + if (g_settings->blastQueryCoverageFilter.on) { + double hitCoveragePercentage = 100.0 * Hit::getQueryCoverageFraction(query, + queryStart, queryEnd); + if (hitCoveragePercentage < g_settings->blastQueryCoverageFilter) + continue; + } + auto nodeIt = g_assemblyGraph->m_deBruijnGraphNodes.find(getNodeNameFromString(nodeLabel).toStdString()); if (nodeIt != g_assemblyGraph->m_deBruijnGraphNodes.end()) { // Only save BLAST hits that are on forward strands. diff --git a/graphsearch/graphsearch.cpp b/graphsearch/graphsearch.cpp index e6a42eb5..d0079442 100644 --- a/graphsearch/graphsearch.cpp +++ b/graphsearch/graphsearch.cpp @@ -20,7 +20,6 @@ #include "graph/assemblygraph.h" #include "program/globals.h" -#include "program/settings.h" #include #include @@ -142,23 +141,11 @@ void GraphSearch::addNodeHit(Query *query, DeBruijnNode *node, double percentIdentity, int numberMismatches, int numberGapOpens, int alignmentLength, SciNot eValue, double bitScore) { - if (g_settings->blastAlignmentLengthFilter.on && - alignmentLength < g_settings->blastAlignmentLengthFilter) - return;; - - Hit hit(query, node, - percentIdentity, alignmentLength, - numberMismatches, numberGapOpens, - queryStart, queryEnd, - nodeStart, nodeEnd, eValue, bitScore); - - if (g_settings->blastQueryCoverageFilter.on) { - double hitCoveragePercentage = 100.0 * hit.getQueryCoverageFraction(); - if (hitCoveragePercentage < g_settings->blastQueryCoverageFilter) - return; - } - - query->emplaceHit(hit); + query->emplaceHit(query, node, + percentIdentity, alignmentLength, + numberMismatches, numberGapOpens, + queryStart, queryEnd, + nodeStart, nodeEnd, eValue, bitScore); } void GraphSearch::addPathHit(Query *query, Path *path, @@ -179,18 +166,10 @@ void GraphSearch::addPathHit(Query *query, Path *path, std::swap(nodeEnd, nodeStart); } - Hit hit(query, node, -1, nodeEnd - nodeStart + 1, - -1, -1, - queryStart, queryEnd, - nodeStart, nodeEnd, - 0, 0); - - if (g_settings->blastQueryCoverageFilter.on) { - double hitCoveragePercentage = 100.0 * hit.getQueryCoverageFraction(); - if (hitCoveragePercentage < g_settings->blastQueryCoverageFilter) - continue; - } - - query->emplaceHit(hit); + query->emplaceHit(query, node, -1, nodeEnd - nodeStart + 1, + -1, -1, + queryStart, queryEnd, + nodeStart, nodeEnd, + 0, 0); } } diff --git a/graphsearch/hit.cpp b/graphsearch/hit.cpp index 421997ca..9479e856 100644 --- a/graphsearch/hit.cpp +++ b/graphsearch/hit.cpp @@ -53,9 +53,10 @@ double Hit::queryEndFraction() const { return double(m_queryEnd - 1) / m_query->getLength(); } -double Hit::getQueryCoverageFraction() const { - int queryRegionSize = m_queryEnd - m_queryStart + 1; - int queryLength = m_query->getLength(); +double Hit::getQueryCoverageFraction(Query *query, + int queryStart, int queryEnd) { + int queryRegionSize = queryEnd - queryStart + 1; + int queryLength = query->getLength(); if (queryLength == 0) return 0.0; @@ -64,6 +65,12 @@ double Hit::getQueryCoverageFraction() const { } +double Hit::getQueryCoverageFraction() const { + return getQueryCoverageFraction(m_query, + m_queryStart, m_queryEnd); +} + + GraphLocation Hit::getHitStart() const { return {m_node, m_nodeStart}; } @@ -75,4 +82,4 @@ GraphLocation Hit::getHitEnd() const { //This function returns the node sequence for this hit. QByteArray Hit::getNodeSequence() const { return utils::sequenceToQByteArray(m_node->getSequence().Subseq(m_nodeStart-1, m_nodeEnd)); -} \ No newline at end of file +} diff --git a/graphsearch/hit.h b/graphsearch/hit.h index 1d34cd1b..a3220ac6 100644 --- a/graphsearch/hit.h +++ b/graphsearch/hit.h @@ -51,6 +51,8 @@ namespace search { double m_bitScore; double getQueryCoverageFraction() const; + static double getQueryCoverageFraction(Query *query, + int queryStart, int queryEnd); GraphLocation getHitStart() const; GraphLocation getHitEnd() const; @@ -63,4 +65,4 @@ namespace search { double queryStartFraction() const; double queryEndFraction() const; }; -} \ No newline at end of file +} diff --git a/graphsearch/hmmer/hmmersearch.cpp b/graphsearch/hmmer/hmmersearch.cpp index 1ace30db..0107e4b6 100644 --- a/graphsearch/hmmer/hmmersearch.cpp +++ b/graphsearch/hmmer/hmmersearch.cpp @@ -361,6 +361,17 @@ void HmmerSearch::buildHitsFromTblOut(QString hmmerOutput, bitScore < g_settings->blastBitScoreFilter) continue; + if (g_settings->blastAlignmentLengthFilter.on && + alignmentLength < g_settings->blastAlignmentLengthFilter) + continue; + + if (g_settings->blastQueryCoverageFilter.on) { + double hitCoveragePercentage = 100.0 * Hit::getQueryCoverageFraction(query, + queryStart, queryEnd); + if (hitCoveragePercentage < g_settings->blastQueryCoverageFilter) + continue; + } + auto nodeIt = g_assemblyGraph->m_deBruijnGraphNodes.find(getNodeNameFromString(nodeLabel).toStdString()); if (nodeIt != g_assemblyGraph->m_deBruijnGraphNodes.end()) { // Only save hits that are on forward strands. @@ -426,6 +437,17 @@ void HmmerSearch::buildHitsFromDomTblOut(QString hmmerOutput, bitScore < g_settings->blastBitScoreFilter) continue; + if (g_settings->blastAlignmentLengthFilter.on && + alignmentLength < g_settings->blastAlignmentLengthFilter) + continue; + + if (g_settings->blastQueryCoverageFilter.on) { + double hitCoveragePercentage = 100.0 * Hit::getQueryCoverageFraction(query, + queryStart, queryEnd); + if (hitCoveragePercentage < g_settings->blastQueryCoverageFilter) + continue; + } + auto nodeIt = g_assemblyGraph->m_deBruijnGraphNodes.find(getNodeNameFromString(nodeLabel).toStdString()); if (nodeIt != g_assemblyGraph->m_deBruijnGraphNodes.end()) { // Only save hits that are on forward strands. diff --git a/graphsearch/minimap2/minimap2search.cpp b/graphsearch/minimap2/minimap2search.cpp index f3f616aa..2bf58120 100644 --- a/graphsearch/minimap2/minimap2search.cpp +++ b/graphsearch/minimap2/minimap2search.cpp @@ -175,6 +175,17 @@ void Minimap2Search::buildHitsFromPAF(const QString &PAF, if (query == nullptr) continue; + if (g_settings->blastAlignmentLengthFilter.on && + alignmentLength < g_settings->blastAlignmentLengthFilter) + continue; + + if (g_settings->blastQueryCoverageFilter.on) { + double hitCoveragePercentage = 100.0 * Hit::getQueryCoverageFraction(query, + queryStart, queryEnd); + if (hitCoveragePercentage < g_settings->blastQueryCoverageFilter) + continue; + } + auto nodeIt = g_assemblyGraph->m_deBruijnGraphNodes.find(getNodeNameFromString(nodeLabel).toStdString()); if (nodeIt != g_assemblyGraph->m_deBruijnGraphNodes.end()) { if (!strand) From 815ed8e106fb75e3d02641a91093f1c779865857 Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Wed, 24 Aug 2022 15:36:24 +0200 Subject: [PATCH 4/6] Factor out parsing and filtering --- graphsearch/blast/blastsearch.cpp | 36 ++++++++----- graphsearch/blast/blastsearch.h | 2 - graphsearch/graphsearch.h | 3 ++ graphsearch/hmmer/hmmersearch.cpp | 69 ++++++++++++++++--------- graphsearch/hmmer/hmmersearch.h | 4 -- graphsearch/minimap2/minimap2search.cpp | 37 ++++++++----- graphsearch/minimap2/minimap2search.h | 1 - 7 files changed, 96 insertions(+), 56 deletions(-) diff --git a/graphsearch/blast/blastsearch.cpp b/graphsearch/blast/blastsearch.cpp index 8f933612..42f8a8ee 100644 --- a/graphsearch/blast/blastsearch.cpp +++ b/graphsearch/blast/blastsearch.cpp @@ -188,7 +188,9 @@ QString BlastSearch::runOneBlastSearch(QuerySequenceType sequenceType, return blastOutput; } -static void buildHitsFromBlastOutput(QString blastOutput, Queries &queries); +static std::pair +buildHitsFromBlastOutput(QString blastOutput, + Queries &queries); QString BlastSearch::doSearch(Queries &queries, QString extraParameters) { GraphSearchFinishedRAII watcher(this); @@ -221,8 +223,12 @@ QString BlastSearch::doSearch(Queries &queries, QString extraParameters) { return (m_lastError = "BLAST search cancelled"); // If the code got here, then the search completed successfully. - buildHitsFromBlastOutput(blastOutput, queries); + auto [nodeHits, pathHits] = buildHitsFromBlastOutput(blastOutput, queries); + // Simply glue hits to queries. Now query owns hit. + for (auto &entry : nodeHits) + entry.first->addHit(entry.second); queries.findQueryPaths(); + queries.searchOccurred(); m_lastError = ""; @@ -326,8 +332,11 @@ static QString getNodeNameFromString(const QString &nodeString) { // BLAST search) to construct the Hit objects. // It looks at the filters to possibly exclude hits which fail to meet user- // defined thresholds. -void BlastSearch::buildHitsFromBlastOutput(QString blastOutput, - Queries &queries) const { +static std::pair +buildHitsFromBlastOutput(QString blastOutput, + Queries &queries) { + GraphSearch::NodeHits nodeHits; GraphSearch::PathHits pathHits; + QStringList blastHitList = blastOutput.split("\n", Qt::SkipEmptyParts); for (const auto &hitString : blastHitList) { @@ -383,19 +392,22 @@ void BlastSearch::buildHitsFromBlastOutput(QString blastOutput, if (nodeStart > nodeEnd) continue; - addNodeHit(query, nodeIt.value(), - queryStart, queryEnd, - nodeStart, nodeEnd, - percentIdentity, numberMismatches, numberGapOpens, - alignmentLength, eValue, bitScore); + nodeHits.emplace_back(query, + new Hit(query, nodeIt.value(), + percentIdentity, alignmentLength, + numberMismatches, numberGapOpens, + queryStart, queryEnd, + nodeStart, nodeEnd, eValue, bitScore)); } auto pathIt = g_assemblyGraph->m_deBruijnGraphPaths.find(nodeLabel.toStdString()); if (pathIt != g_assemblyGraph->m_deBruijnGraphPaths.end()) { - addPathHit(query, pathIt.value(), - queryStart, queryEnd, - nodeStart, nodeEnd); + pathHits.emplace_back(pathIt.value(), + Path::MappingRange{queryStart, queryEnd, + nodeStart, nodeEnd}); } } + + return { nodeHits, pathHits }; } diff --git a/graphsearch/blast/blastsearch.h b/graphsearch/blast/blastsearch.h index e1fdc2f3..d9928ebf 100644 --- a/graphsearch/blast/blastsearch.h +++ b/graphsearch/blast/blastsearch.h @@ -58,8 +58,6 @@ public slots: private: bool findTools(); - void buildHitsFromBlastOutput(QString blastOutput, - Queries &queries) const; QString runOneBlastSearch(search::QuerySequenceType sequenceType, const search::Queries &queries, diff --git a/graphsearch/graphsearch.h b/graphsearch/graphsearch.h index 1cfaa868..4eb40315 100644 --- a/graphsearch/graphsearch.h +++ b/graphsearch/graphsearch.h @@ -37,6 +37,9 @@ class GraphSearch : public QObject { explicit GraphSearch(const QDir &workDir = QDir::temp(), QObject *parent = nullptr); ~GraphSearch(); + using NodeHits = std::vector>; + using PathHits = std::vector>; + // Small RAII helpers to emit signals on destruction class DbBuildFinishedRAII { public: diff --git a/graphsearch/hmmer/hmmersearch.cpp b/graphsearch/hmmer/hmmersearch.cpp index 0107e4b6..ac447dbc 100644 --- a/graphsearch/hmmer/hmmersearch.cpp +++ b/graphsearch/hmmer/hmmersearch.cpp @@ -168,24 +168,37 @@ static QString getNodeNameFromString(const QString &nodeString) { return nodeName; } +static std::pair +buildHitsFromTblOut(QString hmmerOutput, Queries &queries); +static std::pair +buildHitsFromDomTblOut(QString hmmerOutput, Queries &queries); + QString HmmerSearch::doSearch(Queries &queries, QString extraParameters) { GraphSearchFinishedRAII watcher(this); m_lastError = ""; if (!findTools()) return m_lastError; + GraphSearch::NodeHits nnodeHits; GraphSearch::PathHits npathHits; if (queries.getQueryCount(NUCLEOTIDE) > 0 && !m_cancelSearch) { QString hmmerOutput = doOneSearch(NUCLEOTIDE, queries, extraParameters); if (!m_lastError.isEmpty()) return m_lastError; - buildHitsFromTblOut(hmmerOutput, queries); + std::tie(nnodeHits, npathHits) = buildHitsFromTblOut(hmmerOutput, queries); + // Simply glue hits to queries. Now query owns hit. + for (auto &entry : nnodeHits) + entry.first->addHit(entry.second); } + GraphSearch::NodeHits pnodeHits; GraphSearch::PathHits ppathHits; if (queries.getQueryCount(PROTEIN) > 0 && !m_cancelSearch) { QString hmmerOutput = doOneSearch(PROTEIN, queries, extraParameters); if (!m_lastError.isEmpty()) return m_lastError; - buildHitsFromDomTblOut(hmmerOutput, queries); + std::tie(pnodeHits, ppathHits) = buildHitsFromDomTblOut(hmmerOutput, queries); + // Simply glue hits to queries. Now query owns hit. + for (auto &entry : pnodeHits) + entry.first->addHit(entry.second); } queries.findQueryPaths(); @@ -322,11 +335,12 @@ void HmmerSearch::cancelSearch() { m_doSearch->kill(); } -void HmmerSearch::buildHitsFromTblOut(QString hmmerOutput, - Queries &queries) const { - QStringList hmmerHitList = hmmerOutput.split('\n', Qt::SkipEmptyParts); +static std::pair +buildHitsFromTblOut(QString hmmerOutput, + Queries &queries) { + GraphSearch::NodeHits nodeHits; GraphSearch::PathHits pathHits; - for (const auto &hitString : hmmerHitList) { + for (const auto &hitString : hmmerOutput.split('\n', Qt::SkipEmptyParts)) { if (hitString.startsWith('#')) continue; @@ -378,27 +392,32 @@ void HmmerSearch::buildHitsFromTblOut(QString hmmerOutput, if (nodeStart > nodeEnd) continue; - addNodeHit(query, nodeIt.value(), - queryStart, queryEnd, - nodeStart, nodeEnd, - -1, -1, -1, - alignmentLength, eValue, bitScore); + nodeHits.emplace_back(query, + new Hit(query, nodeIt.value(), + -1, alignmentLength, + -1, -1, + queryStart, queryEnd, + nodeStart, nodeEnd, + eValue, bitScore)); } auto pathIt = g_assemblyGraph->m_deBruijnGraphPaths.find(nodeLabel.toStdString()); if (pathIt != g_assemblyGraph->m_deBruijnGraphPaths.end()) { - addPathHit(query, pathIt.value(), - queryStart, queryEnd, - nodeStart, nodeEnd); + pathHits.emplace_back(pathIt.value(), + Path::MappingRange{queryStart, queryEnd, + nodeStart, nodeEnd}); } } + + return { nodeHits, pathHits }; } -void HmmerSearch::buildHitsFromDomTblOut(QString hmmerOutput, - Queries &queries) const { - QStringList hmmerHitList = hmmerOutput.split("\n", Qt::SkipEmptyParts); +static std::pair +buildHitsFromDomTblOut(QString hmmerOutput, + Queries &queries) { + GraphSearch::NodeHits nodeHits; GraphSearch::PathHits pathHits; - for (const auto &hitString : hmmerHitList) { + for (const auto &hitString : hmmerOutput.split("\n", Qt::SkipEmptyParts)) { if (hitString.startsWith('#')) continue; @@ -462,11 +481,15 @@ void HmmerSearch::buildHitsFromDomTblOut(QString hmmerOutput, nodeStart = (nodeStart - 1) * 3 + shift + 1; nodeEnd = (nodeEnd - 1) * 3 + shift + 1; - addNodeHit(query, nodeIt.value(), - queryStart, queryEnd, - nodeStart, nodeEnd, - -1, -1, -1, - alignmentLength, eValue, bitScore); + nodeHits.emplace_back(query, + new Hit(query, nodeIt.value(), + -1, alignmentLength, + -1, -1, + queryStart, queryEnd, + nodeStart, nodeEnd, + eValue, bitScore)); } } + + return { nodeHits, pathHits }; } diff --git a/graphsearch/hmmer/hmmersearch.h b/graphsearch/hmmer/hmmersearch.h index 8a58e577..8e30fec6 100644 --- a/graphsearch/hmmer/hmmersearch.h +++ b/graphsearch/hmmer/hmmersearch.h @@ -55,10 +55,6 @@ public slots: private: bool findTools(); - void buildHitsFromTblOut(QString hmmerOutput, - Queries &queries) const; - void buildHitsFromDomTblOut(QString hmmerOutput, - Queries &queries) const; QString doOneSearch(search::QuerySequenceType sequenceType, search::Queries &queries, QString extraParameters); diff --git a/graphsearch/minimap2/minimap2search.cpp b/graphsearch/minimap2/minimap2search.cpp index 2bf58120..2ad9a8a5 100644 --- a/graphsearch/minimap2/minimap2search.cpp +++ b/graphsearch/minimap2/minimap2search.cpp @@ -18,6 +18,7 @@ #include "minimap2search.h" +#include "graphsearch/graphsearch.h" #include "program/settings.h" #include "graph/assemblygraph.h" @@ -151,11 +152,12 @@ static QString getNodeNameFromString(const QString &nodeString) { return nodeName; } -void Minimap2Search::buildHitsFromPAF(const QString &PAF, - Queries &queries) const { - QStringList hitList = PAF.split("\n", Qt::SkipEmptyParts); +static std::pair +buildHitsFromPAF(const QString &PAF, + Queries &queries) { + GraphSearch::NodeHits nodeHits; GraphSearch::PathHits pathHits; - for (const auto &hitString : hitList) { + for (const auto &hitString : PAF.split("\n", Qt::SkipEmptyParts)) { QStringList alignmentParts = hitString.split('\t'); if (alignmentParts.size() < 12) continue; @@ -191,26 +193,29 @@ void Minimap2Search::buildHitsFromPAF(const QString &PAF, if (!strand) continue; - addNodeHit(query, nodeIt.value(), - queryStart, queryEnd, - nodeStart, nodeEnd, - -1, -1, -1, - alignmentLength, 0, 0); + nodeHits.emplace_back(query, + new Hit(query, nodeIt.value(), + -1, alignmentLength, + -1, -1, + queryStart, queryEnd, + nodeStart, nodeEnd, 0, 0)); } auto pathIt = g_assemblyGraph->m_deBruijnGraphPaths.find(nodeLabel.toStdString()); if (pathIt != g_assemblyGraph->m_deBruijnGraphPaths.end()) { - addPathHit(query, pathIt.value(), - queryStart, queryEnd, - nodeStart, nodeEnd); + pathHits.emplace_back(pathIt.value(), + Path::MappingRange{queryStart, queryEnd, + nodeStart, nodeEnd}); } } + + return { nodeHits, pathHits }; } QString Minimap2Search::doSearch(Queries &queries, QString extraParameters) { GraphSearchFinishedRAII watcher(this); - + m_lastError = ""; if (!findTools()) return m_lastError; @@ -263,8 +268,12 @@ QString Minimap2Search::doSearch(Queries &queries, QString extraParameters) { if (m_cancelSearch) return (m_lastError = "Minimap2 search cancelled"); - buildHitsFromPAF(minimap2Output, queries); + auto [nodeHits, pathHits] = buildHitsFromPAF(minimap2Output, queries); + // Simply glue hits to queries. Now query owns hit. + for (auto &entry : nodeHits) + entry.first->addHit(entry.second); queries.findQueryPaths(); + queries.searchOccurred(); m_lastError = ""; diff --git a/graphsearch/minimap2/minimap2search.h b/graphsearch/minimap2/minimap2search.h index 0bd3f882..f4c54882 100644 --- a/graphsearch/minimap2/minimap2search.h +++ b/graphsearch/minimap2/minimap2search.h @@ -54,7 +54,6 @@ public slots: private: bool findTools(); - void buildHitsFromPAF(const QString &PAF, Queries &queries) const; bool m_cancelBuildDatabase = false, m_cancelSearch = false; From a64b00ceea8d16c686faeb6662c480c9a85b4a47 Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Wed, 24 Aug 2022 17:04:34 +0200 Subject: [PATCH 5/6] Build minimap index every time as it depends on -x option --- graphsearch/minimap2/minimap2search.cpp | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/graphsearch/minimap2/minimap2search.cpp b/graphsearch/minimap2/minimap2search.cpp index 2ad9a8a5..5dcd4498 100644 --- a/graphsearch/minimap2/minimap2search.cpp +++ b/graphsearch/minimap2/minimap2search.cpp @@ -89,26 +89,6 @@ QString Minimap2Search::buildDatabase(const AssemblyGraph &graph, bool includePa if (!atLeastOneSequence) return (m_lastError = "Cannot build the Minimap2 database as this graph contains no sequences"); - QStringList minimap2Options; - minimap2Options << "-d" << temporaryDir().filePath("all_nodes.idx") - << temporaryDir().filePath("all_nodes.fasta"); - - m_buildDb = new QProcess(); - m_buildDb->start(m_minimap2Command, minimap2Options); - - bool finished = m_buildDb->waitForFinished(-1); - if (m_buildDb->exitCode() != 0 || !finished) { - m_lastError = "There was a problem building minimap2 database"; - QString stdErr = m_buildDb->readAllStandardError(); - m_lastError += stdErr.isEmpty() ? "." : ":\n\n" + stdErr; - } else if (m_cancelBuildDatabase) - m_lastError = "Build cancelled."; - else - m_lastError = ""; - - m_buildDb->deleteLater(); - m_buildDb = nullptr; - return m_lastError; } @@ -237,7 +217,7 @@ QString Minimap2Search::doSearch(Queries &queries, QString extraParameters) { QStringList minimap2Options; minimap2Options << extraParameters.split(" ", Qt::SkipEmptyParts) - << temporaryDir().filePath("all_nodes.idx") + << temporaryDir().filePath("all_nodes.fasta") << tmpFile.fileName(); m_cancelSearch = false; From b2459c0cb6b27fd2f6055c8303961b795e165300 Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Wed, 24 Aug 2022 19:12:44 +0200 Subject: [PATCH 6/6] Do mapping paths properly --- graphsearch/blast/blastsearch.cpp | 14 ++++---- graphsearch/graphsearch.cpp | 13 ------- graphsearch/graphsearch.h | 12 +------ graphsearch/hits.h | 30 ++++++++++++++++ graphsearch/hmmer/hmmersearch.cpp | 28 +++++++-------- graphsearch/minimap2/minimap2search.cpp | 12 +++---- graphsearch/queries.cpp | 46 +++++++++++++++++++++++++ graphsearch/queries.h | 3 ++ graphsearch/query.h | 7 +++- graphsearch/querypath.cpp | 2 +- graphsearch/querypath.h | 3 +- 11 files changed, 112 insertions(+), 58 deletions(-) create mode 100644 graphsearch/hits.h diff --git a/graphsearch/blast/blastsearch.cpp b/graphsearch/blast/blastsearch.cpp index 42f8a8ee..14e91ac0 100644 --- a/graphsearch/blast/blastsearch.cpp +++ b/graphsearch/blast/blastsearch.cpp @@ -188,7 +188,7 @@ QString BlastSearch::runOneBlastSearch(QuerySequenceType sequenceType, return blastOutput; } -static std::pair +static std::pair buildHitsFromBlastOutput(QString blastOutput, Queries &queries); @@ -224,11 +224,9 @@ QString BlastSearch::doSearch(Queries &queries, QString extraParameters) { // If the code got here, then the search completed successfully. auto [nodeHits, pathHits] = buildHitsFromBlastOutput(blastOutput, queries); - // Simply glue hits to queries. Now query owns hit. - for (auto &entry : nodeHits) - entry.first->addHit(entry.second); + queries.addNodeHits(nodeHits); queries.findQueryPaths(); - + queries.addPathHits(pathHits); queries.searchOccurred(); m_lastError = ""; @@ -332,10 +330,10 @@ static QString getNodeNameFromString(const QString &nodeString) { // BLAST search) to construct the Hit objects. // It looks at the filters to possibly exclude hits which fail to meet user- // defined thresholds. -static std::pair +static std::pair buildHitsFromBlastOutput(QString blastOutput, Queries &queries) { - GraphSearch::NodeHits nodeHits; GraphSearch::PathHits pathHits; + NodeHits nodeHits; PathHits pathHits; QStringList blastHitList = blastOutput.split("\n", Qt::SkipEmptyParts); @@ -402,7 +400,7 @@ buildHitsFromBlastOutput(QString blastOutput, auto pathIt = g_assemblyGraph->m_deBruijnGraphPaths.find(nodeLabel.toStdString()); if (pathIt != g_assemblyGraph->m_deBruijnGraphPaths.end()) { - pathHits.emplace_back(pathIt.value(), + pathHits.emplace_back(query, pathIt.value(), Path::MappingRange{queryStart, queryEnd, nodeStart, nodeEnd}); } diff --git a/graphsearch/graphsearch.cpp b/graphsearch/graphsearch.cpp index d0079442..d5f20ce9 100644 --- a/graphsearch/graphsearch.cpp +++ b/graphsearch/graphsearch.cpp @@ -135,19 +135,6 @@ GraphSearch::GraphSearchFinishedRAII::~GraphSearchFinishedRAII() { } -void GraphSearch::addNodeHit(Query *query, DeBruijnNode *node, - int queryStart, int queryEnd, - int nodeStart, int nodeEnd, - double percentIdentity, - int numberMismatches, int numberGapOpens, - int alignmentLength, SciNot eValue, double bitScore) { - query->emplaceHit(query, node, - percentIdentity, alignmentLength, - numberMismatches, numberGapOpens, - queryStart, queryEnd, - nodeStart, nodeEnd, eValue, bitScore); -} - void GraphSearch::addPathHit(Query *query, Path *path, int queryStart, int queryEnd, int pathStart, int pathEnd) { diff --git a/graphsearch/graphsearch.h b/graphsearch/graphsearch.h index 4eb40315..80a7dcac 100644 --- a/graphsearch/graphsearch.h +++ b/graphsearch/graphsearch.h @@ -17,7 +17,7 @@ #pragma once -#include "graphsearch/queries.h" +#include "queries.h" #include #include @@ -37,9 +37,6 @@ class GraphSearch : public QObject { explicit GraphSearch(const QDir &workDir = QDir::temp(), QObject *parent = nullptr); ~GraphSearch(); - using NodeHits = std::vector>; - using PathHits = std::vector>; - // Small RAII helpers to emit signals on destruction class DbBuildFinishedRAII { public: @@ -100,13 +97,6 @@ class GraphSearch : public QObject { const QDir &workDir = QDir::temp(), QObject *parent = nullptr); protected: - static void addNodeHit(Query *query, DeBruijnNode *node, - int queryStart, int queryEnd, - int nodeStart, int nodeEnd, - double percentIdentity, - int numberMismatches, int numberGapOpens, - int alignmentLength, SciNot eValue, double bitScore); - static void addPathHit(Query *query, Path *path, int queryStart, int queryEnd, int pathStart, int pathEnd); diff --git a/graphsearch/hits.h b/graphsearch/hits.h new file mode 100644 index 00000000..08e9c1b9 --- /dev/null +++ b/graphsearch/hits.h @@ -0,0 +1,30 @@ +// Copyright 2022 Anton Korobeynikov + +// This file is part of Bandage + +// Bandage is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Bandage is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Bandage. If not, see . + +#pragma once + +#include "graph/path.h" +#include +#include + +namespace search { +class Query; +class Hit; + +using NodeHits = std::vector>; +using PathHits = std::vector>; +} diff --git a/graphsearch/hmmer/hmmersearch.cpp b/graphsearch/hmmer/hmmersearch.cpp index ac447dbc..914beca8 100644 --- a/graphsearch/hmmer/hmmersearch.cpp +++ b/graphsearch/hmmer/hmmersearch.cpp @@ -168,9 +168,9 @@ static QString getNodeNameFromString(const QString &nodeString) { return nodeName; } -static std::pair +static std::pair buildHitsFromTblOut(QString hmmerOutput, Queries &queries); -static std::pair +static std::pair buildHitsFromDomTblOut(QString hmmerOutput, Queries &queries); QString HmmerSearch::doSearch(Queries &queries, QString extraParameters) { @@ -179,29 +179,27 @@ QString HmmerSearch::doSearch(Queries &queries, QString extraParameters) { if (!findTools()) return m_lastError; - GraphSearch::NodeHits nnodeHits; GraphSearch::PathHits npathHits; + NodeHits nnodeHits; PathHits npathHits; if (queries.getQueryCount(NUCLEOTIDE) > 0 && !m_cancelSearch) { QString hmmerOutput = doOneSearch(NUCLEOTIDE, queries, extraParameters); if (!m_lastError.isEmpty()) return m_lastError; std::tie(nnodeHits, npathHits) = buildHitsFromTblOut(hmmerOutput, queries); - // Simply glue hits to queries. Now query owns hit. - for (auto &entry : nnodeHits) - entry.first->addHit(entry.second); + queries.addNodeHits(nnodeHits); } - GraphSearch::NodeHits pnodeHits; GraphSearch::PathHits ppathHits; + NodeHits pnodeHits; PathHits ppathHits; if (queries.getQueryCount(PROTEIN) > 0 && !m_cancelSearch) { QString hmmerOutput = doOneSearch(PROTEIN, queries, extraParameters); if (!m_lastError.isEmpty()) return m_lastError; std::tie(pnodeHits, ppathHits) = buildHitsFromDomTblOut(hmmerOutput, queries); - // Simply glue hits to queries. Now query owns hit. - for (auto &entry : pnodeHits) - entry.first->addHit(entry.second); + queries.addNodeHits(pnodeHits); } queries.findQueryPaths(); + queries.addPathHits(npathHits); + queries.addPathHits(ppathHits); queries.searchOccurred(); return m_lastError; @@ -335,10 +333,10 @@ void HmmerSearch::cancelSearch() { m_doSearch->kill(); } -static std::pair +static std::pair buildHitsFromTblOut(QString hmmerOutput, Queries &queries) { - GraphSearch::NodeHits nodeHits; GraphSearch::PathHits pathHits; + NodeHits nodeHits; PathHits pathHits; for (const auto &hitString : hmmerOutput.split('\n', Qt::SkipEmptyParts)) { if (hitString.startsWith('#')) @@ -403,7 +401,7 @@ buildHitsFromTblOut(QString hmmerOutput, auto pathIt = g_assemblyGraph->m_deBruijnGraphPaths.find(nodeLabel.toStdString()); if (pathIt != g_assemblyGraph->m_deBruijnGraphPaths.end()) { - pathHits.emplace_back(pathIt.value(), + pathHits.emplace_back(query, pathIt.value(), Path::MappingRange{queryStart, queryEnd, nodeStart, nodeEnd}); } @@ -412,10 +410,10 @@ buildHitsFromTblOut(QString hmmerOutput, return { nodeHits, pathHits }; } -static std::pair +static std::pair buildHitsFromDomTblOut(QString hmmerOutput, Queries &queries) { - GraphSearch::NodeHits nodeHits; GraphSearch::PathHits pathHits; + NodeHits nodeHits; PathHits pathHits; for (const auto &hitString : hmmerOutput.split("\n", Qt::SkipEmptyParts)) { if (hitString.startsWith('#')) diff --git a/graphsearch/minimap2/minimap2search.cpp b/graphsearch/minimap2/minimap2search.cpp index 5dcd4498..16ec3688 100644 --- a/graphsearch/minimap2/minimap2search.cpp +++ b/graphsearch/minimap2/minimap2search.cpp @@ -132,10 +132,10 @@ static QString getNodeNameFromString(const QString &nodeString) { return nodeName; } -static std::pair +static std::pair buildHitsFromPAF(const QString &PAF, Queries &queries) { - GraphSearch::NodeHits nodeHits; GraphSearch::PathHits pathHits; + NodeHits nodeHits; PathHits pathHits; for (const auto &hitString : PAF.split("\n", Qt::SkipEmptyParts)) { QStringList alignmentParts = hitString.split('\t'); @@ -183,7 +183,7 @@ buildHitsFromPAF(const QString &PAF, auto pathIt = g_assemblyGraph->m_deBruijnGraphPaths.find(nodeLabel.toStdString()); if (pathIt != g_assemblyGraph->m_deBruijnGraphPaths.end()) { - pathHits.emplace_back(pathIt.value(), + pathHits.emplace_back(query, pathIt.value(), Path::MappingRange{queryStart, queryEnd, nodeStart, nodeEnd}); } @@ -249,11 +249,9 @@ QString Minimap2Search::doSearch(Queries &queries, QString extraParameters) { return (m_lastError = "Minimap2 search cancelled"); auto [nodeHits, pathHits] = buildHitsFromPAF(minimap2Output, queries); - // Simply glue hits to queries. Now query owns hit. - for (auto &entry : nodeHits) - entry.first->addHit(entry.second); + queries.addNodeHits(nodeHits); queries.findQueryPaths(); - + queries.addPathHits(pathHits); queries.searchOccurred(); m_lastError = ""; diff --git a/graphsearch/queries.cpp b/graphsearch/queries.cpp index c0098f6c..7f163557 100644 --- a/graphsearch/queries.cpp +++ b/graphsearch/queries.cpp @@ -17,6 +17,9 @@ #include "queries.h" +#include "hits.h" + +#include "graph/debruijnnode.h" #include "program/globals.h" #include "program/settings.h" @@ -182,3 +185,46 @@ std::vector Queries::getNodesFromHits(const QString& queryName) return returnVector; } + +void Queries::addNodeHits(const NodeHits &nodeHits) { + // Simply glue hits to queries. Now query owns hit. + for (auto &entry : nodeHits) + entry.first->addHit(entry.second); +} + +void Queries::addPathHits(const PathHits &hits) { + for (const auto &hit : hits) { + Query *query; + const Path *path; + Path::MappingRange range; + std::tie(query, path, range) = hit; + std::vector pathHits; + + int pathStart = range.mapped_range.from, pathEnd = range.mapped_range.to; + int queryStart = range.initial_range.from, queryEnd = range.initial_range.to; + + bool invert = pathStart > pathEnd; + if (invert) + std::swap(pathStart, pathEnd); + + for (auto entry: path->getNodeCovering(pathStart, pathEnd)) { + DeBruijnNode *node = entry.first; + int nodeStart = entry.second.mapped_range.from, nodeEnd = entry.second.mapped_range.to; + if (invert) { + int nodeLength = node->getLength(); + node = node->getReverseComplement(); + nodeEnd = nodeLength - nodeEnd + 1; + nodeStart = nodeLength - nodeStart + 1; + std::swap(nodeEnd, nodeStart); + } + + pathHits.push_back(query->emplaceHit(query, node, -1, nodeEnd - nodeStart + 1, + -1, -1, + queryStart, queryEnd, + nodeStart, nodeEnd, + 0, 0)); + } + + query->emplaceQueryPath(*path, query, pathHits); + } +} diff --git a/graphsearch/queries.h b/graphsearch/queries.h index f8063562..e0f8b84d 100644 --- a/graphsearch/queries.h +++ b/graphsearch/queries.h @@ -19,6 +19,7 @@ #pragma once #include "query.h" +#include "hits.h" #include namespace search { @@ -58,6 +59,8 @@ class Queries { const Query *query(size_t idx) const { return m_queries[idx]; } std::vector getNodesFromHits(const QString& queryName = "") const; + void addNodeHits(const NodeHits &hits); + void addPathHits(const PathHits &hits); void findQueryPaths(); private: QString getUniqueName(QString name); diff --git a/graphsearch/query.h b/graphsearch/query.h index 5a6bd482..39269ad6 100644 --- a/graphsearch/query.h +++ b/graphsearch/query.h @@ -66,8 +66,9 @@ namespace search { // MODIFIERS void setName(QString newName) { m_name = std::move(newName); } template - void emplaceHit(Args&&... args) { + const Hit* emplaceHit(Args&&... args) { m_hits.emplace_back(new Hit(std::forward(args)...)); + return m_hits.back().get(); } void addHit(Hit *newHit) { m_hits.emplace_back(newHit); } @@ -76,6 +77,10 @@ namespace search { void findQueryPaths(); void addQueryPath(QueryPath path) { m_paths.emplace_back(path); } + template + void emplaceQueryPath(Args&... args) { + m_paths.emplace_back(std::forward(args)...); + } void setColour(QColor newColour) { m_colour = newColour; } void setShown(bool newShown) { m_shown = newShown; } diff --git a/graphsearch/querypath.cpp b/graphsearch/querypath.cpp index f0812b20..0373e372 100644 --- a/graphsearch/querypath.cpp +++ b/graphsearch/querypath.cpp @@ -35,7 +35,7 @@ QueryPath::QueryPath(Path path, Query *query) //query. It requires that the hits occur in order, i.e. that each hit in //the path begins later in the query than the previous hit. - const Hit * previousHit = nullptr; + const Hit *previousHit = nullptr; const auto &pathNodes = m_path.nodes(); for (int i = 0; i < pathNodes.size(); ++i) { DeBruijnNode * node = pathNodes[i]; diff --git a/graphsearch/querypath.h b/graphsearch/querypath.h index 1d655843..7bd719b9 100644 --- a/graphsearch/querypath.h +++ b/graphsearch/querypath.h @@ -33,8 +33,7 @@ namespace search { QueryPath(Path path, Query *query, std::vector hits); //ACCESSORS - Path getPath() const { return m_path; } - + const Path &getPath() const { return m_path; } const auto &getHits() const { return m_hits; } SciNot getEvalueProduct() const;