diff --git a/CHANGELOG.txt b/CHANGELOG.txt
index 489d716..61518af 100644
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@@ -1,23 +1,10 @@
 ============================================================================================================
-v0.1.3
-- Changed the database name convention (omega parameter)
-- PhyML parser rolled back to slow and naive solution due to breaking updates in last PhyML versions
+v0.2.0
-============================================================================================================
-v0.1.2
-- The database name convention has changed to be consistent to the latest versions of RAPPASv1 (java)
-- The default omega value now is 1.5
-- Updated core to v0.1.7, which supports ambiguous bases
-
-============================================================================================================
-v0.1.1
+This is the pre-publication release, intended to be the reference version for further improvements. Previous releases are kept mostly for historical reference and should not be used.
-- New construction algorithm: now the construction is done by completing two steps.
-In the first one, hashmaps of group nodes are built independently and stored on disk in ${workdir}/hashmaps.
-After, they are merged into a database hashmap.
-- AR Probability matrix is destroyed after the first stage of the algorithm.
-It makes the peak RAM consumption approximately to be max(proba matrix size, final database size).
-
-============================================================================================================
-v0.1.0
-First released version.
+- Added support for partial loading of databases with phylo-k-mer filtering (--mu, --max-ram)
+- Reintroduced parallelism (--threads)
+- Reworked the LWR formula so that it is normalized over all branches of the tree
+- Various improvements and fixes
+- Removed the old changelog since it described functionality that has migrated to IPK
\ No newline at end of file
diff --git a/README.md b/README.md
index 87458c2..bb20bd6 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,11 @@
 # EPIK: Evolutionary Placement with Informative K-mers
+
+
+[![build](https://github.com/phylo42/EPIK/actions/workflows/build.yml/badge.svg)](https://github.com/phylo42/EPIK/actions/workflows/build.yml)
+
+
+
+
 EPIK is a program for rapid alignment-free phylogenetic placement, the successor of [RAPPAS](https://github.com/phylo42/RAPPAS).
 
 ## Installation
@@ -26,16 +33,56 @@
 cmake ..
 make -j4
 ```
+### Install
+You can run `epik.py` from the directory where it was built, or install it system-wide or for a single user to make `epik.py` visible from any directory.
+
+For a system-wide installation (requires elevated permissions):
+```
+sudo cmake --install .
+```
+
+Alternatively, to install for the current user, choose a directory where you want to install the tool (for instance, `/home/$USER/opt`) and replace `DIRECTORY` in the commands below with your chosen directory path:
+
+```
+cmake --install . --prefix DIRECTORY
+export PATH=DIRECTORY/bin:$PATH
+```
+Remember to add `DIRECTORY/bin` to your `PATH`. You can do this manually in each session or add the export command to your shell initialization scripts (e.g., `.bashrc`).
+
+
 ## Usage
 
 ### Phylogenetic placement
+To place queries on a phylogenetic tree, you first need to preprocess the tree with IPK to build a phylo-k-mer database (see [here](https://github.com/phylo42/IPK) for details). Queries should be in uncompressed FASTA format.
 An example of placement command (see below for possible parameters values):
 ```
-python epik.py place -i DATABASE -s [nucl|amino] -o OUTPUT_DIR INPUT_FASTA
+epik.py place -i DATABASE -s [nucl|amino] -o OUTPUT_DIR INPUT_FASTA
 ```
-See `python epik.py place --help` for more information.
+If EPIK is not installed, run `./epik.py` from the EPIK directory instead.
+
+### Parameters
+
+| Option    | Meaning                                                                                                                                                        | Default |
+|-----------|----------------------------------------------------------------------------------------------------------------------------------------------------------------|---------|
+| -i        | The path to the phylo-k-mer database to use for placement.                                                                                                     |         |
+| -s        | States: `nucl` for DNA, `amino` for proteins.                                                                                                                  | nucl    |
+| --omega   | The user-defined threshold. Can be set higher than the one used when the database was created. (If you are not sure, ignore this parameter.)                   | 1.5     |
+| --mu      | The proportion of the database to keep when filtering. Mutually exclusive with `--max-ram`. Should be a value in (0.0, 1.0].                                   | 1.0     |
+| --max-ram | The maximum amount of memory used to keep the database content. Mutually exclusive with `--mu`. Sets an approximate limit on EPIK's RAM consumption.           |         |
+| --threads | Number of parallel threads used for placement. Requires EPIK compiled with OpenMP support, i.e. `EPIK_OMP=ON` (enabled if you compile as recommended).         | 1       |
+
+See also `epik.py place --help` for more information.
+
 
-### Building databases
+## Other
+
+### Code quality
+
+Code quality evaluation with [softwipe](https://github.com/adrianzap/softwipe) [1]:
+```
+softwipe --cmake --cpp -x third-party,i2l/third-party,i2l/tests/catch2,i2l/examples --no-execution .
+```
 
-To compute phylo-k-mer databases, use [IPK](https://github.com/phylo42/IPK).
+## References
+[1] A. Zapletal, D. Höhler, C. Sinz, A. Stamatakis (2021). The SoftWipe tool and benchmark for assessing coding standards adherence of scientific software. Sci Rep 11, 10015. https://doi.org/10.1038/s41598-021-89495-8
diff --git a/epik.py b/epik.py
index 8f0f1cf..3efff43 100755
--- a/epik.py
+++ b/epik.py
@@ -12,7 +12,11 @@
 import subprocess
 
+__version__ = "0.2.0"
+
+
 @click.group()
+@click.version_option(__version__)
 def epik():
     """
     EPIK: Evolutionary Placement with Informative K-mers
@@ -46,7 +50,7 @@ def epik():
               help="Output directory.")
 @click.option('--threads',
               type=int,
-              default=4, show_default=True,
+              default=1, show_default=True,
               help="Number of threads used.")
 @click.option('--max-ram',
               type=str,
@@ -57,7 +61,10 @@ def place(database, states, omega, mu, outputdir, threads, max_ram, input_file):
     """
     Places .fasta files using the input IPK database.
 
-    \tpython epik.py place -s [nucl|amino] -i db.ipk -o output file.fasta [file2.fasta ...]
+    epik.py place -s [nucl|amino] -i DB.ipk -o output file.fasta [file2.fasta ...]
+
+    Examples:
+    \tepik.py place -i DB.ipk -o temp --max-ram 4G --threads 8 query.fasta
     """
     place_queries(database, states, omega, mu, outputdir, threads, max_ram, input_file)
 
diff --git a/epik/src/epik/main.cpp b/epik/src/epik/main.cpp
index 6f1ba9c..82a8fa2 100644
--- a/epik/src/epik/main.cpp
+++ b/epik/src/epik/main.cpp
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include <cmath>
 #include
 #include
 #include
@@ -63,29 +64,51 @@ void print_intruction_set()
 /// Float-to-humanized string for better output
 template
-std::string humanize(T num)
+std::string to_human_readable(T num)
 {
     std::ostringstream oss;
-    oss.precision(1);
-    if (num < 1000.0)
+    if (num < 1024)
     {
         oss << std::fixed << num;
     }
-    else if (num < 1000000.0)
-    {
-        oss << std::fixed << num / 1000.0 << "K";
-    }
-    else if (num < 1000000000.0)
-    {
-        oss << std::fixed << num / 1000000.0 << "M";
-    }
     else
     {
-        oss << std::fixed << num / 1000000000.0 << "B";
+        double value;
+        std::string suffix;
+
+        if (num < 1024 * 1024)
+        {
+            value = num / 1024.0;
+            suffix = "K";
+        }
+        else if (num < 1024 * 1024 * 1024)
+        {
+            value = num / (1024.0 * 1024.0);
+            suffix = "M";
+        }
+        else
+        {
+            value = num / (1024.0 * 1024.0 * 1024.0);
+            suffix = "B";
+        }
+
+        // Check if the fractional part is zero
+        double int_part;
+        double frac_part = std::modf(value, &int_part);
+        if (frac_part == 0.0)
+        {
+            oss << static_cast(int_part) << suffix;
+        }
+        else
+        {
+            oss.precision(1);
+            oss << std::fixed << value << suffix;
+        }
     }
     return oss.str();
+
 }
 
 /// Size_t-to-string that translates milliseconds to humanized time
@@ -128,9 +151,9 @@ std::string humanize_time(size_t milliseconds)
     return oss.str();
 }
 
-/// Parse the humanized RAM size to a number.
-/// I know that the name of this function is unfortunate.
-size_t dehumanize_ram(const std::string& max_ram)
+/// Parse a human-readable --max-ram value as the number of bytes,
+/// e.g. 128K, 50M, 4.2Gb, etc.
+size_t parse_human_readable(const std::string& max_ram)
 {
     double value;
     char unit = 0;
@@ -140,7 +163,7 @@
     ss >> value;
     if (ss.fail())
     {
-        throw std::runtime_error("Can't parse max_ram parameter: wrong numerical part");
+        throw std::runtime_error("Could not parse --max-ram parameter: wrong numerical part");
     }
 
     // Check if there is a memory unit
@@ -149,7 +172,7 @@
     ss >> unit;
     if (ss.fail())
     {
-        throw std::runtime_error("Can't parse max_ram parameter: wrong unit");
+        throw std::runtime_error("Could not parse --max-ram parameter: wrong unit");
     }
 }
@@ -230,10 +253,15 @@ int main(int argc, char** argv)
     if (parsed_options.count("max-ram"))
     {
         const auto max_ram_string = parsed_options["max-ram"].as<std::string>();
-        const auto max_ram = dehumanize_ram(max_ram_string);
-        max_entries = max_ram / sizeof(i2l::pkdb_value);
+        const auto max_ram = parse_human_readable(max_ram_string);
+        max_entries = static_cast(max_ram / sizeof(i2l::pkdb_value));
+
+        if (max_entries == 0)
+        {
+            throw std::runtime_error("Memory limit is too low");
+        }
         std::cout << "Max-RAM provided: will be loaded not more than "
-                  << humanize(max_entries) << " phylo-k-mers." << std::endl;
+                  << to_human_readable(max_entries) << " phylo-k-mers." << std::endl;
     }
 
 #ifndef EPIK_OMP
@@ -259,8 +287,8 @@ int main(int argc, char** argv)
                   << "\tk: " << db.kmer_size() << std::endl
                   << "\tomega: " << db.omega() << std::endl
                   << "\tPositions loaded: " << (db.positions_loaded() ? "true" : "false") << std::endl << std::endl;
-        std::cout << "Loaded " << humanize(db.get_num_entries_loaded())
-                  << " of " << humanize(db.get_num_entries_total())
+        std::cout << "Loaded " << to_human_readable(db.get_num_entries_loaded())
+                  << " of " << to_human_readable(db.get_num_entries_total())
                   << " phylo-k-mers. " << std::endl << std::endl;
 
         const auto tree = i2l::io::parse_newick(db.tree());
@@ -325,7 +353,7 @@ int main(int argc, char** argv)
             average_speed += seq_per_second;
 
             // Update progress bar
-            bar.set_option(option::PrefixText{humanize(seq_per_second) + " seq/s "});
+            bar.set_option(option::PrefixText{to_human_readable(seq_per_second) + " seq/s "});
             bar.set_option(option::PostfixText{std::to_string(num_seq_placed) + " / ?"});
             bar.set_progress(reader.bytes_read());
 
@@ -339,12 +367,12 @@ int main(int argc, char** argv)
         average_speed /= (double)num_iterations;
 
         bar.set_option(option::PrefixText{"Done. "});
-        bar.set_option(option::PostfixText{std::to_string(num_seq_placed)});
+        bar.set_option(option::PostfixText{to_human_readable(num_seq_placed)});
         bar.set_progress(reader.bytes_read());
 
         std::cout << std::endl << termcolor::bold << termcolor::white
                   << "Placed " << num_seq_placed << " sequences.\nAverage speed: "
-                  << humanize(average_speed) << " seq/s.\n";
+                  << to_human_readable(average_speed) << " seq/s.\n";
         std::cout << "Output: " << jplace_filename << std::endl;
 
         const auto placement_time = std::chrono::duration_cast(
diff --git a/epik/src/epik/place.cpp b/epik/src/epik/place.cpp
index 3a14283..9ca42c7 100644
--- a/epik/src/epik/place.cpp
+++ b/epik/src/epik/place.cpp
@@ -200,6 +200,8 @@ std::vector filter_by_ratio(const std::vector& placements,
 placed_collection placer::place(const std::vector& seq_records, size_t num_threads)
 {
+    (void)num_threads;
+
     /// There may be identical sequences with different headers. We group them
     /// by the sequence content to not to place the same sequences more than once
     const auto sequence_map = group_by_sequence_content(seq_records);
diff --git a/scripts/config.json b/scripts/config.json
deleted file mode 100644
index 0a19ce4..0000000
--- a/scripts/config.json
+++ /dev/null
@@ -1,31 +0,0 @@
-{
-    "soft":
-    {
-        "rappas": "/home/nikolai/miniconda3/envs/PEWO/bin/RAPPAS.jar",
-        "xpas": "xpas.py",
-        "rappas2": "rappas2.py",
-        "arbinary": "/home/nikolai/miniconda3/envs/PEWO/bin/raxml-ng"
-    },
-
-    "args":
-    {
-        "alignment": "/ngs/data/D652/reference.fasta",
-        "tree": "/ngs/data/D652/tree.newick",
-        "queries": "/ngs/rappas/D652/EMP_92_studies_10000.fas",
-
-        "states": "nucl",
-
-        "ardir": "/ngs/rappas/D652/temp/AR",
-        "categories": "4",
-        "workdir": "/ngs/ppdiff/D652",
-
-        "k": "5",
-        "omega": "1.5",
-        "reduction-ratio": "0.99",
-        "filter": "no-filter",
-        "mu": "1.0",
-
-        "use-unrooted": "True"
-
-    }
-}
\ No newline at end of file