Skip to content

Commit

Permalink
Reorder header file by clusters too
Browse files Browse the repository at this point in the history
  • Loading branch information
martin-steinegger committed Aug 1, 2023
1 parent 95e1c10 commit 8310cd6
Showing 1 changed file with 47 additions and 11 deletions.
58 changes: 47 additions & 11 deletions src/util/createclusterdb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,6 @@ int createclusearchdb(int argc, const char **argv, const Command& command) {
#ifdef OPENMP
thread_idx = static_cast<unsigned int>(omp_get_thread_num());
#endif
std::string resultBuffer;
// write output file
#pragma omp for schedule(dynamic, 1)
for (size_t id = 0; id < clusterReader.getSize(); id++) {
progress.updateProgress();
Expand All @@ -65,7 +63,6 @@ int createclusearchdb(int argc, const char **argv, const Command& command) {
reader.getEntryLen(readerId) - 1, dbKey, thread_idx);
data = Util::skipLine(data);
}
resultBuffer.clear();
}
}
dbwRep.close(true);
Expand All @@ -76,20 +73,20 @@ int createclusearchdb(int argc, const char **argv, const Command& command) {
DBReader<unsigned int> dbrRep(repDbSeq.c_str(), repDbSeqIdx.c_str(), par.threads,
DBReader<unsigned int>::USE_INDEX);
dbrRep.open(DBReader<unsigned int>::NOSORT);
DBReader<unsigned int> dbrClu(seqsDbSeq.c_str(), seqsDbSeqIdx.c_str(), par.threads,
DBReader<unsigned int> dbrSeq(seqsDbSeq.c_str(), seqsDbSeqIdx.c_str(), par.threads,
DBReader<unsigned int>::USE_INDEX);
dbrClu.open(DBReader<unsigned int>::NOSORT);
dbrSeq.open(DBReader<unsigned int>::NOSORT);
std::string seqsDbSeqIdxTmp = seqsDbSeqIdx + "_tmp";

FILE *sIndex = FileUtil::openAndDelete(seqsDbSeqIdxTmp.c_str(), "w");
std::vector<DBReader<unsigned int>::Index> allIndex(dbrClu.getSize() + dbrRep.getSize());
std::vector<DBReader<unsigned int>::Index> allIndex(dbrSeq.getSize() + dbrRep.getSize());
size_t dataSize = 0;
for (size_t i = 0; i < dbrRep.getSize(); i++) {
allIndex[i] = *dbrRep.getIndex(i);
dataSize += allIndex[i].length;
}
for (size_t i = 0; i < dbrClu.getSize(); i++) {
DBReader<unsigned int>::Index *index = dbrClu.getIndex(i);
for (size_t i = 0; i < dbrSeq.getSize(); i++) {
DBReader<unsigned int>::Index *index = dbrSeq.getIndex(i);
index->offset += dataSize;
allIndex[dbrRep.getSize() + i] = *index;
}
Expand All @@ -110,8 +107,9 @@ int createclusearchdb(int argc, const char **argv, const Command& command) {
FileUtil::move(seqsDbSeqIdxTmp.c_str(), seqsDbSeqIdx.c_str());
FileUtil::symlinkAlias(repDbSeq, seqsDbSeq + ".0");
FileUtil::move(seqsDbSeq.c_str(), (seqsDbSeq + ".1").c_str());
dbrRep.close();
dbrSeq.close();
}
clusterReader.close();
DBReader<unsigned int>::copyDb(par.db2, par.db3 + "_clu");

struct DBSuffix {
Expand All @@ -130,7 +128,41 @@ int createclusearchdb(int argc, const char **argv, const Command& command) {
{DBFiles::TAX_MERGED, "_taxonomy"},
};

for (size_t i = 0; i < ARRAY_SIZE(suffices); ++i) {
Debug::Progress progress2(clusterReader.getSize());
DBReader<unsigned int> headerreader(par.hdr1.c_str(), par.hdr1Index.c_str(), par.threads,
DBReader<unsigned int>::USE_DATA | DBReader<unsigned int>::USE_INDEX);
headerreader.open(DBReader<unsigned int>::NOSORT);
headerreader.readMmapedDataInMemory();

DBWriter dbwRep(par.hdr3.c_str(), par.hdr3Index.c_str(), static_cast<unsigned int>(par.threads), par.compressed,
headerreader.getDbtype());
dbwRep.open();
#pragma omp parallel
{
unsigned int thread_idx = 0;
#ifdef OPENMP
thread_idx = static_cast<unsigned int>(omp_get_thread_num());
#endif
#pragma omp for schedule(dynamic, 1)
for (size_t id = 0; id < clusterReader.getSize(); id++) {
progress2.updateProgress();
char *data = clusterReader.getData(id, thread_idx);
while (*data != '\0') {
// parse dbkey
size_t dbKey = Util::fast_atoi<unsigned int>(data);
size_t readerId = headerreader.getId(dbKey);
dbwRep.writeData(headerreader.getData(readerId, thread_idx),
headerreader.getEntryLen(readerId) - 1, dbKey, thread_idx);
data = Util::skipLine(data);
}
}
}
dbwRep.close(true);
headerreader.close();
clusterReader.close();

// dont copy header file
for (size_t i = 1; i < ARRAY_SIZE(suffices); ++i) {
std::string file = par.db1 + suffices[i].suffix;
if (suffices[i].flag && FileUtil::fileExists(file.c_str())) {
DBReader<unsigned int>::copyDb(file, par.db3 + suffices[i].suffix);
Expand All @@ -139,7 +171,11 @@ int createclusearchdb(int argc, const char **argv, const Command& command) {
for (size_t i = 0; i < ARRAY_SIZE(suffices); ++i) {
std::string file = par.db3 + suffices[i].suffix;
if (suffices[i].flag && FileUtil::fileExists(file.c_str())) {
DBReader<unsigned int>::aliasDb(file, par.db3 + "_seq" + suffices[i].suffix);
std::string fileToLinkTo = par.db3 + "_seq" + suffices[i].suffix;
if (FileUtil::fileExists(fileToLinkTo.c_str())){
DBReader<unsigned int>::removeDb(fileToLinkTo);
}
DBReader<unsigned int>::aliasDb(file, fileToLinkTo);
}
}
return EXIT_SUCCESS;
Expand Down

0 comments on commit 8310cd6

Please sign in to comment.