Skip to content

Commit

Permalink
Squashed commit of the following:
Browse files Browse the repository at this point in the history
commit 1c07e03013ddf80c01b78419c98fde60122a65da
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Thu Feb 16 16:41:35 2023 +0100

    Updated version.

commit d655b42d095fd5fc187b7734a17b360b897e63dd
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Thu Feb 16 12:22:35 2023 +0100

    Use linsearch for backmapping.

commit 330becf3ffca6bbef91c33490dbb2c55adb5379c
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Thu Feb 16 10:17:38 2023 +0100

    Added linsearch option.

commit c7d7cffa920950a6927a91ff1903cb795076f104
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Wed Feb 15 20:51:35 2023 +0100

    Allow lin suffix.

commit b7e01fc21aa2adee3a6f8d08a59c828e2d0c74c1
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Wed Feb 15 10:19:52 2023 +0100

    Use linear clustering only for linclust.

commit 2e82ce30f52bc4d76a02290c3641e9f1761f15a8
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Feb 14 16:49:58 2023 +0100

    Added check.

commit 46966470bf2e2bd6ba4e5fce9a6f73fb4ee53007
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Feb 14 16:02:42 2023 +0100

    Added linear search to --iterate.

commit 746d8a4addc0c9b5357c120f1768368380181140
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Feb 14 12:39:31 2023 +0100

    Fixed timer.

commit 30233debd687e414d51e5f1bc904648b5540f06f
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Tue Feb 14 11:15:01 2023 +0100

    Added linear stage for targets.

commit 1c9bf349fb07ba92b46a1f7ab719d54fb81e8b11
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Mon Feb 13 16:51:30 2023 +0100

    No -DEXTRA=ON build required for the lin-stage1 option.

commit 190cef799c9877603f3f86b62515090323c3b909
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Mon Feb 13 16:34:00 2023 +0100

    Added linclust command.

commit fa888a19422daea83e8124ea3092aa7c3d0b180e
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Mon Feb 13 14:59:01 2023 +0100

    Fixed linclust bug.

commit 20a9bd2efe2a78f4c24cbd9317b4537c7d292dc5
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Mon Feb 13 13:33:01 2023 +0100

    Add tests to cmake.

commit a95bfc459b79ddd42911f0df7ac6441d1d7961e5
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Mon Feb 13 13:11:17 2023 +0100

    Added ctest file.

commit c73bc9d8c1f6648e350c5e81b7b43048c3715f63
Merge: d8475271 1de45b3
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Mon Feb 13 13:10:04 2023 +0100

    Merge branch 'master' into dev

commit d847527139fcbb9a63ebb35a8a80586d4ef7f937
Merge: 06361689 579b497
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Mon Feb 13 10:36:45 2023 +0100

    merged w/master

commit 06361689da3aab9cbb094559e990c1e508e5e8dc
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Mon Feb 13 10:30:58 2023 +0100

    Fixed macos errors.

commit 65911a2cc276db2f50dbbdc55ea1a37dcf1ff325
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Fri Feb 10 17:26:36 2023 +0100

    Use temp files for db blocks.

commit 5350c59abf3d2e5be661edc56f0fa33d5ab0d1dd
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Fri Feb 10 15:16:10 2023 +0100

    Fixed -k for view.

commit f6cc746b206be49939eef8c339448d0c8b220c2b
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Fri Feb 10 15:13:23 2023 +0100

    Added merge-daa command.

commit fb573774f12267218e80d2691bbbc8843b93dfa1
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Fri Feb 10 15:03:34 2023 +0100

    Fixed config.

commit 0da23199907de5f01abeb453a17e34d4d7903d31
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Fri Feb 10 14:42:03 2023 +0100

    Added merge-daa command.

commit de5db9380d527cfaeb63cc729389070fedf94e1d
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Fri Feb 10 14:22:56 2023 +0100

    Fixed error in view.

commit 3e265c1b226020aa3c18e326de726730246b55e3
Author: Benjamin Buchfink <buchfink@gmail.com>
Date:   Fri Feb 10 13:59:45 2023 +0100

    Added option to turn off avx2.
  • Loading branch information
bbuchfink committed Feb 16, 2023
1 parent 1de45b3 commit 9006cd7
Show file tree
Hide file tree
Showing 24 changed files with 230 additions and 69 deletions.
6 changes: 5 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -509,4 +509,8 @@ target_link_libraries(diamond ${ZLIB_LIBRARY} ${CMAKE_THREAD_LIBS_INIT})

install(TARGETS diamond DESTINATION bin)

add_test(NAME diamond COMMAND "diamond test")
enable_testing()
add_test(
NAME diamond
COMMAND diamond test
)
17 changes: 16 additions & 1 deletion src/ChangeLog
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
[2.1.2]
- The iterated search mode (option `--iterate`) now uses a linear-time feature as
the first search round.
- Added the `linclust` command to cluster using only a single linear-time search
round.
- Fixed compiler errors on macOS.
- Fixed a bug that caused invalid alignment traceback output for the DAA `view`
workflow.
- Added the `merge-daa` workflow to merge DAA files.
Expand All @@ -9,6 +14,15 @@
- Permitted the `--ignore-warnings` option for the `cluster` and `deepclust`
workflows.
- Use unlinked temporary files for database blocks in clustering workflows.
- Fixed a bug that could cause invalid results when using a clustering step with
linearization as the final round in combination with database processing in
multiple super blocks.
- The `--lin-stage1` option can now be used without compiling with the
`-DEXTRA=ON` cmake option.
- Added the option to specify the `_lin` suffix for sensitivity keywords for the
`--iterate` option to activate the linear-time feature.
- Added the option `--linsearch` to activate the linear-time feature for the
search workflows.

[2.1.1]
- Fixed compilation errors on non-x86 systems and for the clang compiler.
Expand Down Expand Up @@ -50,7 +64,8 @@
- Added the output fields `approx_pident` and `corrected_bitscore` to the tabular
format.
- Added the `--lin-stage1` option to linearize comparisons in the seeding stage
by only considering hits against the longest query sequence for identical seeds.
by only considering hits against the longest query sequence for identical seeds
(only supported when compiled with `-DEXTRA=ON`).
- Added the `--kmer-ranking` option to rank sequences when `--lin-stage1` is used
(only supported when compiled with `-DKEEP_TARGET_ID=ON`).
- Added the option `--no-block-size-limit` to deactivate upper limits for the block
Expand Down
2 changes: 1 addition & 1 deletion src/basic/basic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "../util/util.h"
#include "../stats/standard_matrix.h"

const char* Const::version_string = "2.1.1";
const char* Const::version_string = "2.1.2";
using std::string;
using std::vector;
using std::count;
Expand Down
16 changes: 10 additions & 6 deletions src/basic/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,7 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
.add_command("blastp", "Align amino acid query sequences against a protein reference database", blastp)
.add_command("blastx", "Align DNA query sequences against a protein reference database", blastx)
.add_command("cluster", "Cluster protein sequences", cluster)
.add_command("linclust", "Cluster protein sequences in linear time", LINCLUST)
.add_command("realign", "Realign clustered sequences against their centroids", CLUSTER_REALIGN)
.add_command("recluster", "Recompute clustering to fix errors", RECLUSTER)
.add_command("reassign", "Reassign clustered sequences to the closest centroid", CLUSTER_REASSIGN)
Expand Down Expand Up @@ -248,7 +249,7 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
#endif
;

auto& general = parser.add_group("General options", { makedb, blastp, blastx, cluster, view, prep_db, getseq, dbinfo, makeidx, CLUSTER_REALIGN, GREEDY_VERTEX_COVER, DEEPCLUST, RECLUSTER, MERGE_DAA });
auto& general = parser.add_group("General options", { makedb, blastp, blastx, cluster, view, prep_db, getseq, dbinfo, makeidx, CLUSTER_REALIGN, GREEDY_VERTEX_COVER, DEEPCLUST, RECLUSTER, MERGE_DAA, LINCLUST });
general.add()
("threads", 'p', "number of CPU threads", threads_)
("db", 'd', "database file", database)
Expand All @@ -266,7 +267,7 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
("taxonnodes", 0, "taxonomy nodes.dmp from NCBI", nodesdmp)
("taxonnames", 0, "taxonomy names.dmp from NCBI", namesdmp);

auto& align_clust = parser.add_group("Aligner/Clustering options", { blastp, blastx, cluster, RECLUSTER, CLUSTER_REASSIGN, DEEPCLUST, CLUSTER_REALIGN });
auto& align_clust = parser.add_group("Aligner/Clustering options", { blastp, blastx, cluster, RECLUSTER, CLUSTER_REASSIGN, DEEPCLUST, CLUSTER_REALIGN, LINCLUST });
align_clust.add()
("evalue", 'e', "maximum e-value to report alignments (default=0.001)", max_evalue, 0.001)
("tmpdir", 't', "directory for temporary files", tmpdir)
Expand Down Expand Up @@ -381,7 +382,7 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
\tqstrand means Query strand\n\
\n\tDefault: qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore", output_format);

auto& cluster_opt = parser.add_group("Clustering options", { cluster, RECLUSTER, CLUSTER_REASSIGN, GREEDY_VERTEX_COVER, DEEPCLUST });
auto& cluster_opt = parser.add_group("Clustering options", { cluster, RECLUSTER, CLUSTER_REASSIGN, GREEDY_VERTEX_COVER, DEEPCLUST, LINCLUST });
kmer_ranking = false;
cluster_opt.add()
("member-cover", 0, "Minimum coverage% of the cluster member sequence (default=80.0)", member_cover, 80.0)
Expand All @@ -400,7 +401,7 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa

string algo_str;

auto& advanced = parser.add_group("Advanced options", { blastp, blastx, makeidx, CLUSTER_REASSIGN, regression_test, cluster, DEEPCLUST });
auto& advanced = parser.add_group("Advanced options", { blastp, blastx, makeidx, CLUSTER_REASSIGN, regression_test, cluster, DEEPCLUST, LINCLUST });
advanced.add()
("algo", 0, "Seed search algorithm (0=double-indexed/1=query-indexed/ctg=contiguous-seed)", algo_str)
("bin", 0, "number of query bins for seed search", query_bins_)
Expand All @@ -409,6 +410,8 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
("freq-masking", 0, "mask seeds based on frequency", freq_masking)
("freq-sd", 0, "number of standard deviations for ignoring frequent seeds", freq_sd_, 0.0)
("id2", 0, "minimum number of identities for stage 1 hit", min_identities_)
("linsearch", 0, "only consider seed hits against longest target for identical seeds", linsearch)
("lin-stage1", 0, "only consider seed hits against longest query for identical seeds", lin_stage1)
("xdrop", 'x', "xdrop for ungapped alignment", ungapped_xdrop, 12.3)
("gapped-filter-evalue", 0, "E-value threshold for gapped filter (auto)", gapped_filter_evalue_, -1.0)
("band", 0, "band for dynamic programming computation", padding)
Expand Down Expand Up @@ -597,7 +600,6 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
("chaining-stacked-hsp-ratio", 0, "", chaining_stacked_hsp_ratio, 0.5)
("swipe-task-size", 0, "", swipe_task_size, (int64_t)100000000)
("minimizer-window", 0, "", minimizer_window_)
("lin-stage1", 0, "", lin_stage1)
("min_task_trace_pts", 0, "", min_task_trace_pts, (int64_t)1024)
("sketch-size", 0, "", sketch_size)
("oid-list", 0, "", oid_list)
Expand Down Expand Up @@ -780,6 +782,7 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
case Config::view:
case Config::cluster:
case Config::DEEPCLUST:
case Config::LINCLUST:
case Config::regression_test:
case Config::compute_medoids:
case Config::CLUSTER_REASSIGN:
Expand All @@ -800,6 +803,7 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
case Config::makedb:
case Config::cluster:
case Config::DEEPCLUST:
case Config::LINCLUST:
case Config::regression_test:
case Config::compute_medoids:
case Config::LIST_SEEDS:
Expand Down Expand Up @@ -828,7 +832,7 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa

if (command == Config::blastp || command == Config::blastx || command == Config::blastn || command == Config::benchmark || command == Config::model_sim || command == Config::opt
|| command == Config::mask || command == Config::cluster || command == Config::compute_medoids || command == Config::regression_test || command == Config::CLUSTER_REASSIGN
|| command == Config::RECLUSTER || command == Config::DEEPCLUST) {
|| command == Config::RECLUSTER || command == Config::DEEPCLUST || command == Config::LINCLUST) {
if (tmpdir == "")
tmpdir = extract_dir(output_file);

Expand Down
3 changes: 2 additions & 1 deletion src/basic/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,7 @@ struct Config
bool recluster_bd;
bool pipeline_short;
string graph_algo;
bool linsearch;

SequenceType dbtype;

Expand All @@ -350,7 +351,7 @@ struct Config
match_file_stat = 14, model_seqs = 15, opt = 16, mask = 17, fastq2fasta = 18, dbinfo = 19, test_extra = 20, test_io = 21, db_annot_stats = 22, read_sim = 23, info = 24, seed_stat = 25,
smith_waterman = 26, cluster = 27, translate = 28, filter_blasttab = 29, show_cbs = 30, simulate_seqs = 31, split = 32, upgma = 33, upgma_mc = 34, regression_test = 35,
reverse_seqs = 36, compute_medoids = 37, mutate = 38, rocid = 40, makeidx = 41, find_shapes, prep_db, composition, JOIN, HASH_SEQS, LIST_SEEDS, CLUSTER_REALIGN,
GREEDY_VERTEX_COVER, INDEX_FASTA, FETCH_SEQ, CLUSTER_REASSIGN, blastn, RECLUSTER, LENGTH_SORT, MERGE_DAA, DEEPCLUST
GREEDY_VERTEX_COVER, INDEX_FASTA, FETCH_SEQ, CLUSTER_REASSIGN, blastn, RECLUSTER, LENGTH_SORT, MERGE_DAA, DEEPCLUST, LINCLUST
};
unsigned command;

Expand Down
2 changes: 1 addition & 1 deletion src/basic/const.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ struct Const
{

enum {
build_version = 155,
build_version = 156,
#ifdef SINGLE_THREADED
seedp_bits = 0,
#else
Expand Down
10 changes: 7 additions & 3 deletions src/cluster/cascaded/cascaded.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
/****
DIAMOND protein aligner
Copyright (C) 2013-2018 Benjamin Buchfink <buchfink@gmail.com>
Copyright (C) 2016-2023 Max Planck Society for the Advancement of Science e.V.
Benjamin Buchfink
Code developed by Benjamin Buchfink <buchfink@gmail.com>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
Expand Down Expand Up @@ -66,6 +69,7 @@ vector<SuperBlockId> cluster(shared_ptr<SequenceFile>& db, const shared_ptr<BitV
config.self = true;
config.iterate.unset();
config.mapany = false;
config.linsearch = false;
tie(config.chunk_size, config.lowmem_) = block_size(Util::String::interpret_number(config.memory_limit.get(DEFAULT_MEMORY_LIMIT)), config.sensitivity, config.lin_stage1);

shared_ptr<Callback> callback(new Callback);
Expand Down Expand Up @@ -99,10 +103,10 @@ static pair<vector<SuperBlockId>, BitVector> update_clustering(const BitVector&
return { current_centroids, oid_filter };
}

vector<SuperBlockId> cascaded(shared_ptr<SequenceFile>& db) {
vector<SuperBlockId> cascaded(shared_ptr<SequenceFile>& db, bool linear) {
if (db->sequence_count() > (int64_t)numeric_limits<SuperBlockId>::max())
throw runtime_error("Workflow supports a maximum of " + to_string(numeric_limits<SuperBlockId>::max()) + " input sequences.");
const auto steps = cluster_steps(config.approx_min_id);
const auto steps = cluster_steps(config.approx_min_id, linear);
shared_ptr<BitVector> oid_filter(new BitVector);
int64_t cluster_count = db->sequence_count();
vector<SuperBlockId> centroids(cluster_count);
Expand Down
9 changes: 6 additions & 3 deletions src/cluster/cascaded/cascaded.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
/****
DIAMOND protein aligner
Copyright (C) 2013-2018 Benjamin Buchfink <buchfink@gmail.com>
Copyright (C) 2016-2023 Max Planck Society for the Advancement of Science e.V.
Benjamin Buchfink
Code developed by Benjamin Buchfink <buchfink@gmail.com>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
Expand Down Expand Up @@ -30,8 +33,8 @@ struct Cascaded : public ClusteringAlgorithm {
}
};

std::vector<SuperBlockId> cascaded(std::shared_ptr<SequenceFile>& db);
std::vector<std::string> cluster_steps(double approx_id);
std::vector<SuperBlockId> cascaded(std::shared_ptr<SequenceFile>& db, bool linear);
std::vector<std::string> cluster_steps(double approx_id, bool linear);

struct Callback : public Consumer {
using Edge = Util::Algo::Edge<SuperBlockId>;
Expand Down
27 changes: 25 additions & 2 deletions src/cluster/cascaded/helpers.cpp
Original file line number Diff line number Diff line change
@@ -1,14 +1,37 @@
/****
DIAMOND protein aligner
Copyright (C) 2019-2023 Max Planck Society for the Advancement of Science e.V.
Code developed by Benjamin Buchfink <buchfink@gmail.com>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
****/

#include "cascaded.h"

using std::string;
using std::vector;

namespace Cluster {

vector<string> cluster_steps(double approx_id) {
vector<string> cluster_steps(double approx_id, bool linear) {
if (!config.cluster_steps.empty())
return config.cluster_steps;
vector<string> v = { "faster_lin", "fast" };
vector<string> v = { "faster_lin" };
if (linear)
return v;
v.push_back("fast");
if (approx_id < 90)
v.push_back("default");
if (approx_id < 50)
Expand Down
10 changes: 6 additions & 4 deletions src/cluster/cascaded/recluster.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
/****
DIAMOND protein aligner
Copyright (C) 2022 Max Planck Society for the Advancement of Science e.V.
Copyright (C) 2022-2023 Max Planck Society for the Advancement of Science e.V.
Code developed by Benjamin Buchfink <benjamin.buchfink@tue.mpg.de>
Code developed by Benjamin Buchfink <buchfink@gmail.com>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
Expand Down Expand Up @@ -83,10 +83,12 @@ static vector<OId> recluster(shared_ptr<SequenceFile>& db, const vector<OId>& cl
config.query_cover = config.recluster_bd ? 0 : config.member_cover;
config.subject_cover = 0;
config.query_or_target_cover = config.recluster_bd ? config.member_cover : 0;
config.sensitivity = from_string<Sensitivity>(cluster_steps(config.approx_min_id).back());
config.sensitivity = from_string<Sensitivity>(cluster_steps(config.approx_min_id, false).back());
//tie(config.chunk_size, config.lowmem_) = block_size(Util::String::interpret_number(config.memory_limit.get(DEFAULT_MEMORY_LIMIT)), Search::iterated_sens.at(config.sensitivity).front(), false);
config.lowmem_ = 1;
config.chunk_size = 4.0;
config.lin_stage1 = false;
config.linsearch = false;
shared_ptr<Mapback> mapback = make_shared<Mapback>(unal_members.size());
Search::run(centroid_db, unaligned, mapback);

Expand Down Expand Up @@ -157,7 +159,7 @@ static vector<OId> recluster(shared_ptr<SequenceFile>& db, const vector<OId>& cl
unaligned.reset();
timer.finish();

const vector<OId> reclust = recluster(unmapped, convert_mapping(cascaded(unmapped), OId()), iteration + 1);
const vector<OId> reclust = recluster(unmapped, convert_mapping(cascaded(unmapped, false), OId()), iteration + 1);

timer.go("Deallocating memory");
unmapped.reset();
Expand Down
38 changes: 34 additions & 4 deletions src/cluster/cascaded/wrapper.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,23 @@
/****
DIAMOND protein aligner
Copyright (C) 2019-2023 Max Planck Society for the Advancement of Science e.V.
Code developed by Benjamin Buchfink <buchfink@gmail.com>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
****/

#include <numeric>
#include "cascaded.h"
#include "../data/fasta/fasta_file.h"
Expand Down Expand Up @@ -26,16 +46,18 @@ namespace Cluster {

struct Config {
Config(shared_ptr<SequenceFile>& db) :
linclust(config.command == ::Config::LINCLUST),
message_stream(true),
verbosity(1),
sens(from_string<Sensitivity>(rstrip(cluster_steps(config.approx_min_id).back(), "_lin"))),
sens(from_string<Sensitivity>(rstrip(cluster_steps(config.approx_min_id, linclust).back(), "_lin"))),
output_format(init_output(-1)),
centroids(new FastaFile("", true, FastaFile::WriteAccess())),
seqs_processed(0),
letters_processed(0),
oid_to_centroid_oid(new File(Schema{ Type::INT64, Type::INT64 }, "", Flags::TEMP))
{
}
bool linclust;
MessageStream message_stream;
int verbosity;
Sensitivity sens;
Expand Down Expand Up @@ -78,7 +100,15 @@ static vector<SuperBlockId> search_vs_centroids(shared_ptr<FastaFile>& super_blo
config.query_cover = config.member_cover;
config.subject_cover = 0;
config.query_or_target_cover = 0;
config.iterate = vector<string>();
if (cfg.linclust) {
config.iterate.unset();
config.linsearch = true;
}
else {
config.iterate = vector<string>();
config.linsearch = false;
}
config.lin_stage1 = false;
tie(config.chunk_size, config.lowmem_) = block_size(Util::String::interpret_number(config.memory_limit.get(DEFAULT_MEMORY_LIMIT)), cfg.sens, false);
cfg.centroids->set_seqinfo_ptr(0);
shared_ptr<BestCentroid> best_centroid(new BestCentroid(super_block->sequence_count()));
Expand Down Expand Up @@ -119,7 +149,7 @@ void Cascaded::run() {
unique_ptr<Util::Tsv::File> out(open_out_tsv());

if (block_size >= (double)db->letters() && db->sequence_count() < numeric_limits<SuperBlockId>::max()) {
const auto centroids = cascaded(db);
const auto centroids = cascaded(db, config.command == ::Config::LINCLUST);
timer.go("Generating output");
output_mem<SuperBlockId>(*out, *db, centroids);
}
Expand Down Expand Up @@ -159,7 +189,7 @@ void Cascaded::run() {
seqs.reset();
timer.finish();
}
const vector<SuperBlockId> clustering = cascaded(unaligned_db);
const vector<SuperBlockId> clustering = cascaded(unaligned_db, cfg.linclust);
timer.go("Updating clustering");
vector<SuperBlockId> centroids;
for (SuperBlockId i = 0; i < (SuperBlockId)unaligned.size(); ++i) {
Expand Down
Loading

0 comments on commit 9006cd7

Please sign in to comment.