diff --git a/src/mode3-Tangle.cpp b/src/mode3-Tangle.cpp index 69d732ed..64d2f35d 100644 --- a/src/mode3-Tangle.cpp +++ b/src/mode3-Tangle.cpp @@ -1,5 +1,6 @@ // Shasta. #include "mode3-Tangle.hpp" +#include "mode3-TangleGraph.hpp" #include "deduplicate.hpp" using namespace shasta; using namespace mode3; @@ -8,6 +9,55 @@ using namespace mode3; #include +Tangle::Tangle( + bool debug, + AssemblyGraph& assemblyGraph, + uint64_t maxOffset, + const vector& tangleVerticesArgument) : + debug(debug), + assemblyGraph(assemblyGraph), + tangleVertices(tangleVerticesArgument) +{ + if(debug) { + cout << "Working on a tangle with " << tangleVertices.size() << " vertices." << endl; + } + + // Sort the tangleVertices so we can do binary searches in them + // in isTangleVertex. + sort(tangleVertices.begin(), tangleVertices.end()); + if(debug) { + writeTangleVertices(); + } + + findTangleEdges(maxOffset); + if(debug) { + writeTangleEdges(); + } + + findEntrances(); + findExits(); + if(debug) { + writeEntrances(); + writeExits(); + } + + // Create the TangleGraph. + vector entranceAnchorIds; + for(const Entrance& entrance: entrances) { + entranceAnchorIds.push_back(entrance.anchorId); + } + vector exitAnchorIds; + for(const Exit& exit: exits) { + exitAnchorIds.push_back(exit.anchorId); + } + const bool bidirectional = true; + TangleGraph tangleGraph(debug, assemblyGraph.anchors, + entranceAnchorIds, exitAnchorIds, bidirectional); +} + + +#if 0 +// Old code that does not use the TangleGraph. Tangle::Tangle( bool debug, AssemblyGraph& assemblyGraph, @@ -65,7 +115,7 @@ Tangle::Tangle( } } - +#endif bool Tangle::isTangleVertex(AssemblyGraph::vertex_descriptor v) const @@ -139,16 +189,30 @@ void Tangle::writeTangleEdges() const Tangle::EntranceOrExit::EntranceOrExit( AssemblyGraph::edge_descriptor e, - AnchorId anchorId, - const Anchor& anchor) : + AnchorId anchorId) : e(e), anchorId(anchorId) { - copy(anchor.begin(), anchor.end(), back_inserter(anchorMarkerIntervals)); } +// The entrances are AssemblyGraph edges that are not in the Tangle +// but whose target vertex is in the Tangle. +void Tangle::findEntrances() +{ + for(const AssemblyGraph::vertex_descriptor v: tangleVertices) { + BGL_FORALL_INEDGES(v, e, assemblyGraph, AssemblyGraph) { + if(not isTangleEdge(e)) { + const Chain& chain = assemblyGraph[e].getOnlyChain(); + const AnchorId anchorId = chain.secondToLast(); + entrances.push_back(Entrance(e, anchorId)); + } + } + } +} + +#if 0 // The entrances are AssemblyGraph edges that are not in the Tangle // but whose target vertex is in the Tangle. void Tangle::findEntrances() @@ -203,9 +267,26 @@ void Tangle::findEntrances() } } } +#endif +// The exits are AssemblyGraph edges that are not in the Tangle +// but whose source vertex is in the Tangle. +void Tangle::findExits() +{ + for(const AssemblyGraph::vertex_descriptor v: tangleVertices) { + BGL_FORALL_OUTEDGES(v, e, assemblyGraph, AssemblyGraph) { + if(not isTangleEdge(e)) { + const Chain& chain = assemblyGraph[e].getOnlyChain(); + const AnchorId anchorId = chain.second(); + exits.push_back(Exit(e, anchorId)); + } + } + } +} + +#if 0 // The exits are AssemblyGraph edges that are not in the Tangle // but whose source vertex is in the Tangle. void Tangle::findExits() @@ -284,7 +365,7 @@ bool Tangle::isExit(AssemblyGraph::edge_descriptor e) const } return false; } - +#endif void Tangle::writeEntrances() const @@ -308,7 +389,7 @@ void Tangle::writeExits() const } - +#if 0 void Tangle::readFollowingFromEntrances() { for(Entrance& entrance: entrances) { @@ -426,3 +507,6 @@ void Tangle::readFollowingFromExit(Exit& exit) cout << "After deduplication, read following found " << exit.journeyAnchorIds.size() << " anchors." << endl; } } + +#endif + diff --git a/src/mode3-Tangle.hpp b/src/mode3-Tangle.hpp index 5261c882..514b3fbb 100644 --- a/src/mode3-Tangle.hpp +++ b/src/mode3-Tangle.hpp @@ -60,6 +60,7 @@ class shasta::mode3::Tangle { AssemblyGraph::edge_descriptor e; AnchorId anchorId; +#if 0 // The AnchorMarkerIntervals on that AnchorId. // These are initially copies from class Anchors. // But later, for entrances we remove AnchorMarkerIntervals @@ -76,11 +77,11 @@ class shasta::mode3::Tangle { // The AnchorIds encountered during read following starting from this Entrance // and no other entrance. vector uniqueJourneyAnchorIds; +#endif EntranceOrExit( AssemblyGraph::edge_descriptor, - AnchorId, - const Anchor&); + AnchorId); }; class Entrance : public EntranceOrExit {using EntranceOrExit::EntranceOrExit;}; class Exit : public EntranceOrExit {using EntranceOrExit::EntranceOrExit;}; diff --git a/src/mode3-TangleGraph.cpp b/src/mode3-TangleGraph.cpp new file mode 100644 index 00000000..c6678ab3 --- /dev/null +++ b/src/mode3-TangleGraph.cpp @@ -0,0 +1,304 @@ +#include "mode3-TangleGraph.hpp" +#include "deduplicate.hpp" +using namespace shasta; +using namespace mode3; + + +TangleGraph::TangleGraph( + bool debug, + const Anchors& anchors, + const vector& entranceAnchors, + const vector& exitAnchors, + bool bidirectional) : + debug(debug), + anchors(anchors), + bidirectional(bidirectional) +{ + if(debug) { + cout << "Creating a tangle graph with " << entranceAnchors.size() << + " entrances and " << exitAnchors.size() << " exits." << endl; + cout << "Entrances:"; + for(const AnchorId anchorId: entranceAnchors) { + cout << " " << anchorIdToString(anchorId); + } + cout << endl; + cout << "Exits:"; + for(const AnchorId anchorId: exitAnchors) { + cout << " " << anchorIdToString(anchorId); + } + cout << endl; + } + + constructEntrances(entranceAnchors); + constructExits(exitAnchors); + computeTangleMatrix(); +} + + + +void TangleGraph::constructEntrances(const vector& entranceAnchors) +{ + // Initialize the anchorMarkerIntervals of the entrances. + for(const AnchorId anchorId: entranceAnchors) { + entrances.push_back(Entrance(anchorId, anchors[anchorId])); + } + + // If an OrientedReadId appears in more than one entrance, + // we want to remove the corresponding AnchorMarkerIntervals + // from all entrances. + + // Find the duplicate OrientedReadIds. + vector orientedReadIds; + for(const Entrance& entrance: entrances) { + for(const AnchorMarkerInterval& anchorMarkerInterval: entrance.anchorMarkerIntervals) { + orientedReadIds.push_back(anchorMarkerInterval.orientedReadId); + } + } + vector count; + deduplicateAndCountWithThreshold(orientedReadIds, count, 2UL); + + if(debug) { + if(orientedReadIds.empty()) { + cout << "The entrances contain no duplicate oriented reads." << endl; + } else { + cout << "The following oriented reads appear in more than once entrance " + " will be neglected during read following from the entrances:"; + for(const OrientedReadId orientedReadId: orientedReadIds) { + cout << " " << orientedReadId; + } + cout << endl; + } + } + + // If we found any duplicate OrientedReadIds, remove them from all entrances. + if(not orientedReadIds.empty()) { + for(Entrance& entrance: entrances) { + vector newAnchorMarkerIntervals; + for(const AnchorMarkerInterval& anchorMarkerInterval: entrance.anchorMarkerIntervals) { + if(not binary_search(orientedReadIds.begin(), orientedReadIds.end(), anchorMarkerInterval.orientedReadId)) { + newAnchorMarkerIntervals.push_back(anchorMarkerInterval); + } + } + entrance.anchorMarkerIntervals.swap(newAnchorMarkerIntervals); + } + } + + + + // Read following for each entrance. + // This fills in the journeyAnchorIds of each entrance. + for(Entrance& entrance: entrances) { + entrance.readFollowing(debug, anchors, bidirectional); + } + + + + // Find the AnchorIds that were found in more than one Entrance. + vector duplicateAnchorIds; + for(Entrance& entrance: entrances) { + copy(entrance.journeyAnchorIds.begin(), entrance.journeyAnchorIds.end(), + back_inserter(duplicateAnchorIds)); + } + deduplicateAndCountWithThreshold(duplicateAnchorIds, count, 2UL); + + if(debug) { + cout << duplicateAnchorIds.size() << + " anchors were found by read following from more than one entrance." << endl; + } + + + // Now we can fill in the uniqueJourneyAnchorIds for each Entrance. + for(Entrance& entrance: entrances) { + std::set_difference( + entrance.journeyAnchorIds.begin(), entrance.journeyAnchorIds.end(), + duplicateAnchorIds.begin(), duplicateAnchorIds.end(), + back_inserter(entrance.uniqueJourneyAnchorIds)); + + if(debug) { + cout << "Read following for entrance " << anchorIdToString(entrance.anchorId) << + " found " << entrance.uniqueJourneyAnchorIds.size() << " anchors unique to this entrance." << endl; + } + } +} + + + +void TangleGraph::constructExits(const vector& exitAnchors) +{ + // Initialize the anchorMarkerIntervals of the entrances. + for(const AnchorId anchorId: exitAnchors) { + exits.push_back(Exit(anchorId, anchors[anchorId])); + } + + // If an OrientedReadId appears in more than one exit, + // we want to remove the corresponding AnchorMarkerIntervals + // from all exits. + + // Find the duplicate OrientedReadIds. + vector orientedReadIds; + for(const Exit& exit: exits) { + for(const AnchorMarkerInterval& anchorMarkerInterval: exit.anchorMarkerIntervals) { + orientedReadIds.push_back(anchorMarkerInterval.orientedReadId); + } + } + vector count; + deduplicateAndCountWithThreshold(orientedReadIds, count, 2UL); + + if(debug) { + if(orientedReadIds.empty()) { + cout << "The exits contain no duplicate oriented reads." << endl; + } else { + cout << "The following oriented reads appear in more than once exits " + " and will be neglected during read following from the exits:"; + for(const OrientedReadId orientedReadId: orientedReadIds) { + cout << " " << orientedReadId; + } + cout << endl; + } + } + + // If we found any duplicate OrientedReadIds, remove them from all exits. + if(not orientedReadIds.empty()) { + for(Exit& exit: exits) { + vector newAnchorMarkerIntervals; + for(const AnchorMarkerInterval& anchorMarkerInterval: exit.anchorMarkerIntervals) { + if(not binary_search(orientedReadIds.begin(), orientedReadIds.end(), anchorMarkerInterval.orientedReadId)) { + newAnchorMarkerIntervals.push_back(anchorMarkerInterval); + } + } + exit.anchorMarkerIntervals.swap(newAnchorMarkerIntervals); + } + } + + + + // Read following for each exit. + // This fills in the journeyAnchorIds of each exit. + for(Exit& exit: exits) { + exit.readFollowing(debug, anchors, bidirectional); + } + + + + // Find the AnchorIds that were found in more than one Exit. + vector duplicateAnchorIds; + for(Exit& exit: exits) { + copy(exit.journeyAnchorIds.begin(), exit.journeyAnchorIds.end(), + back_inserter(duplicateAnchorIds)); + } + deduplicateAndCountWithThreshold(duplicateAnchorIds, count, 2UL); + + if(debug) { + cout << duplicateAnchorIds.size() << + " anchors were found by read following from more than one exit." << endl; + } + + + // Now we can fill in the uniqueJourneyAnchorIds for each Entrance. + for(Exit& exit: exits) { + std::set_difference( + exit.journeyAnchorIds.begin(), exit.journeyAnchorIds.end(), + duplicateAnchorIds.begin(), duplicateAnchorIds.end(), + back_inserter(exit.uniqueJourneyAnchorIds)); + + if(debug) { + cout << "Read following for exit " << anchorIdToString(exit.anchorId) << + " found " << exit.uniqueJourneyAnchorIds.size() << " anchors unique to this exit." << endl; + } + } +} + + + +TangleGraph::EntranceOrExit::EntranceOrExit( + AnchorId anchorId, + const Anchor& anchor) : + anchorId(anchorId) +{ + copy(anchor.begin(), anchor.end(), back_inserter(anchorMarkerIntervals)); +} + + + +// This fills in the journeyAnchorIds. +void TangleGraph::Entrance::readFollowing( + bool debug, const Anchors& anchors, bool bidirectional) +{ + for(const AnchorMarkerInterval& anchorMarkerInterval: anchorMarkerIntervals) { + const OrientedReadId orientedReadId = anchorMarkerInterval.orientedReadId; + const auto journey = anchors.journeys[orientedReadId.getValue()]; + + const uint64_t begin = (bidirectional ? 0 : anchorMarkerInterval.positionInJourney + 1); + const uint64_t end = journey.size(); + for(uint64_t position = begin; position != end; position++) { + journeyAnchorIds.push_back(journey[position]); + } + } + + deduplicate(journeyAnchorIds); + + if(debug) { + cout << "Read following for entrance " << anchorIdToString(anchorId) << + " found " << journeyAnchorIds.size() << " anchors after deduplication." << endl; + } + +} + + + +// This fills in the journeyAnchorIds. +void TangleGraph::Exit::readFollowing( + bool debug, const Anchors& anchors, bool bidirectional) +{ + for(const AnchorMarkerInterval& anchorMarkerInterval: anchorMarkerIntervals) { + const OrientedReadId orientedReadId = anchorMarkerInterval.orientedReadId; + const auto journey = anchors.journeys[orientedReadId.getValue()]; + + const uint64_t begin = 0; + const uint64_t end = (bidirectional ? journey.size() : anchorMarkerInterval.positionInJourney); + for(uint64_t position = begin; position != end; position++) { + journeyAnchorIds.push_back(journey[position]); + } + } + + deduplicate(journeyAnchorIds); + + if(debug) { + cout << "Read following for exit " << anchorIdToString(anchorId) << + " found " << journeyAnchorIds.size() << " anchors after deduplication." << endl; + } + +} + + + + +void TangleGraph::computeTangleMatrix() +{ + if(debug) { + cout << "Tangle matrix:" << endl; + } + + tangleMatrix.resize(entrances.size(), vector(exits.size())); + + for(uint64_t entranceIndex=0; entranceIndex commonUniqueAnchors; + std::set_intersection( + entrance.uniqueJourneyAnchorIds.begin(), entrance.uniqueJourneyAnchorIds.end(), + exit.uniqueJourneyAnchorIds.begin(), exit.uniqueJourneyAnchorIds.end(), + back_inserter(commonUniqueAnchors)); + tangleMatrix[entranceIndex][exitIndex] = commonUniqueAnchors.size(); + + if(debug) { + cout << anchorIdToString(entrance.anchorId) << " " << + anchorIdToString(exit.anchorId) << " " << tangleMatrix[entranceIndex][exitIndex] << endl; + } + + } + } + +} diff --git a/src/mode3-TangleGraph.hpp b/src/mode3-TangleGraph.hpp new file mode 100644 index 00000000..484d5504 --- /dev/null +++ b/src/mode3-TangleGraph.hpp @@ -0,0 +1,86 @@ +#pragma once + +// Class TangleGraph is used for read following in the AnchorGraph +// with a given set of entrances and exits. + +// Shasta. +#include "mode3-Anchor.hpp" + +// Standard library. +#include +#include + +namespace shasta { + namespace mode3 { + class TangleGraph; + } +} + + + +class shasta::mode3::TangleGraph { +public: + TangleGraph( + bool debug, + const Anchors&, + const vector& entranceAnchors, + const vector& exitAnchors, + bool bidirectional + ); + +private: + bool debug; + const Anchors& anchors; + bool bidirectional; + + // A base class to describe an Entrance or Exit to the TangleGraph. + class EntranceOrExit { + public: + AnchorId anchorId; + + // The AnchorMarkerIntervals on that AnchorId. + // These are initially copied from class Anchors. + // But later, for entrances we remove AnchorMarkerIntervals + // for which the same OrientedReadId appears in another entrance; + // and for exits we remove AnchorMarkerIntervals + // for which the same OrientedReadId appears in another exit. + vector anchorMarkerIntervals; + + // The AnchorIds encountered during read following. + // For an entrance, read following moves forward, starting at the entrance. + // For an exit, read following moves backward, starting at the exit. + // However, if bidirectional is true read following moves in both directions + // for both entrances and exits. + vector journeyAnchorIds; + + // The AnchorIds encountered during read following starting from this Entrance/Exit + // and no other Entrance/Exit. + vector uniqueJourneyAnchorIds; + + EntranceOrExit(AnchorId, const Anchor&); + }; + + class Entrance : public EntranceOrExit { + public: + using EntranceOrExit::EntranceOrExit; + void readFollowing(bool debug, const Anchors&, bool bidirectional); + }; + + class Exit : public EntranceOrExit { + public: + using EntranceOrExit::EntranceOrExit; + void readFollowing(bool debug, const Anchors&, bool bidirectional); + }; + + vector entrances; + vector exits; + void constructEntrances(const vector& entranceAnchors); + void constructExits(const vector& entranceAnchors); + + // The tangle matrix is the number of common unique anchors between each + // entrance/exit pair. + // Indexed by [entranceIndex][exitIndex] + vector< vector > tangleMatrix; + void computeTangleMatrix(); + +};