diff --git a/CHANGELOG.txt b/CHANGELOG.txt
index 489d716..61518af 100644
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@@ -1,23 +1,10 @@
============================================================================================================
-v0.1.3
-- Changed the database name convention (omega parameter)
-- PhyML parser rolled back to slow and naive solution due to breaking updates in last PhyML versions
+v0.2.0
-============================================================================================================
-v0.1.2
-- The database name convention has changed to be consistent to the latest versions of RAPPASv1 (java)
-- The default omega value now is 1.5
-- Updated core to v0.1.7, which supports ambiguous bases
-
-============================================================================================================
-v0.1.1
+This is a pre-publication release intended to serve as the reference version for further improvements. Previous releases are kept mostly for historical purposes and should not be used.
-- New construction algorithm: now the construction is done by completing two steps.
-In the first one, hashmaps of group nodes are built independently and stored on disk in ${workdir}/hashmaps.
-After, they are merged into a database hashmap.
-- AR Probability matrix is destroyed after the first stage of the algorithm.
-It makes the peak RAM consumption approximately to be max(proba matrix size, final database size).
-
-============================================================================================================
-v0.1.0
-First released version.
+- Added support for partial loading of databases with phylo-k-mer filtering (--mu, --max-ram)
+- Reintroduced parallelism (--threads)
+- Reworked LWR formula so that it is normalized over all branches of the tree
+- Various improvements and fixes
+- Removed the old changelog, as it described functionality that has migrated to IPK
\ No newline at end of file
diff --git a/README.md b/README.md
index 87458c2..bb20bd6 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,11 @@
# EPIK: Evolutionary Placement with Informative K-mers
+
+
+[![build](https://github.com/phylo42/EPIK/actions/workflows/build.yml/badge.svg)](https://github.com/phylo42/EPIK/actions/workflows/build.yml)
+
+
+
+
EPIK is a program for rapid alignment-free phylogenetic placement, the successor of [RAPPAS](https://github.com/phylo42/RAPPAS).
## Installation
@@ -26,16 +33,56 @@ cmake ..
make -j4
```
+### Install
+You can run `epik.py` from the directory where it was built, or install it system-wide or for a single user so that `epik.py` is visible from any directory.
+
+For a system-wide installation (requires elevated permissions):
+```
+sudo cmake --install .
+```
+
+Alternatively, to install for the current user, choose a directory where you want to install the tool, for instance `/home/$USER/opt` or any other directory you prefer, and replace `DIRECTORY` in the commands below with your chosen path:
+
+```
+cmake --install . --prefix DIRECTORY
+export PATH=DIRECTORY/bin:$PATH
+```
+Remember to add `DIRECTORY/bin` to your `PATH`. You can do this manually in each session or add the export command to your shell initialization script (e.g., `.bashrc`), as shown below.
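+
+For example, assuming you chose `/home/$USER/opt` as `DIRECTORY`, a minimal sketch of making the change permanent could be:
+```
+# Append the export to .bashrc so that new shells find epik.py automatically
+echo 'export PATH=/home/$USER/opt/bin:$PATH' >> ~/.bashrc
+source ~/.bashrc
+```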
+
+
## Usage
### Phylogenetic placement
+To place queries on a phylogenetic tree, first preprocess the tree with IPK to build a phylo-k-mer database (see [IPK](https://github.com/phylo42/IPK) for details). Queries should be in uncompressed FASTA format. An example placement command (see below for possible parameter values):
```
-python epik.py place -i DATABASE -s [nucl|amino] -o OUTPUT_DIR INPUT_FASTA
+epik.py place -i DATABASE -s [nucl|amino] -o OUTPUT_DIR INPUT_FASTA
```
-See `python epik.py place --help` for more information.
+If EPIK is not installed, run `./epik.py` from the EPIK directory instead.
+
+### Parameters
+
+| Option    | Meaning                                                                                                                                                           | Default |
+|-----------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------|
+| -i        | The path to the phylo-k-mer database to use for placement.                                                                                                          |         |
+| -s        | Sequence states: `nucl` for DNA, `amino` for proteins.                                                                                                              | nucl    |
+| --omega   | The user-defined omega threshold. It can be set higher than the value used when the database was created. (If you are not sure, ignore this parameter.)             | 1.5     |
+| --mu      | The proportion of the database to keep when filtering. Mutually exclusive with `--max-ram`. Must be a value in (0.0, 1.0].                                          | 1.0     |
+| --max-ram | The maximum amount of memory used to hold the database content (e.g., `4G`). Mutually exclusive with `--mu`. Sets an approximate limit on EPIK's RAM consumption.   |         |
+| --threads | The number of parallel threads used for placement. Requires EPIK to be compiled with OpenMP support, i.e. `EPIK_OMP=ON` (enabled if you compile as recommended).    | 1       |
+
+Also, see `epik.py place --help` for more information.
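+
+For example, a run that combines these options (the database and query file names below are placeholders) could look like:
+```
+epik.py place -i DB.ipk -s nucl -o output_dir --max-ram 4G --threads 8 query.fasta
+```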
+
-### Building databases
+## Other
+
+### Code quality
+
+Code quality evaluation with [softwipe](https://github.com/adrianzap/softwipe) [1]:
+```
+softwipe --cmake --cpp -x third-party,i2l/third-party,i2l/tests/catch2,i2l/examples --no-execution .
+```
-To compute phylo-k-mer databases, use [IPK](https://github.com/phylo42/IPK).
+## References
+[1] A. Zapletal, D. Höhler, C. Sinz, A. Stamatakis (2021). The SoftWipe tool and benchmark for assessing coding standards adherence of scientific software. Sci Rep 11, 10015. https://doi.org/10.1038/s41598-021-89495-8
diff --git a/epik.py b/epik.py
index 8f0f1cf..3efff43 100755
--- a/epik.py
+++ b/epik.py
@@ -12,7 +12,11 @@
import subprocess
+__version__ = "0.2.0"
+
+
@click.group()
+@click.version_option(__version__)
def epik():
"""
EPIK: Evolutionary Placement with Informative K-mers
@@ -46,7 +50,7 @@ def epik():
help="Output directory.")
@click.option('--threads',
type=int,
- default=4, show_default=True,
+ default=1, show_default=True,
help="Number of threads used.")
@click.option('--max-ram',
type=str,
@@ -57,7 +61,10 @@ def place(database, states, omega, mu, outputdir, threads, max_ram, input_file):
"""
Places .fasta files using the input IPK database.
- \tpython epik.py place -s [nucl|amino] -i db.ipk -o output file.fasta [file2.fasta ...]
+ epik.py place -s [nucl|amino] -i DB.ipk -o output file.fasta [file2.fasta ...]
+
+ Examples:
+ \tepik.py place -i DB.ipk -o temp --max-ram 4G --threads 8 query.fasta
"""
place_queries(database, states, omega, mu, outputdir, threads, max_ram, input_file)
diff --git a/epik/src/epik/main.cpp b/epik/src/epik/main.cpp
index 6f1ba9c..82a8fa2 100644
--- a/epik/src/epik/main.cpp
+++ b/epik/src/epik/main.cpp
@@ -5,6 +5,7 @@
#include
#include
#include
+#include <cmath>
#include
#include
#include
@@ -63,29 +64,51 @@ void print_intruction_set()
/// Float-to-humanized string for better output
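+/// e.g. 1536 -> "1.5K", 1048576 -> "1M"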
template <typename T>
-std::string humanize(T num)
+std::string to_human_readable(T num)
{
std::ostringstream oss;
- oss.precision(1);
- if (num < 1000.0)
+ if (num < 1024)
{
oss << std::fixed << num;
}
- else if (num < 1000000.0)
- {
- oss << std::fixed << num / 1000.0 << "K";
- }
- else if (num < 1000000000.0)
- {
- oss << std::fixed << num / 1000000.0 << "M";
- }
else
{
- oss << std::fixed << num / 1000000000.0 << "B";
+ double value;
+ std::string suffix;
+
+ if (num < 1024 * 1024)
+ {
+ value = num / 1024.0;
+ suffix = "K";
+ }
+ else if (num < 1024 * 1024 * 1024)
+ {
+ value = num / (1024.0 * 1024.0);
+ suffix = "M";
+ }
+ else
+ {
+ value = num / (1024.0 * 1024.0 * 1024.0);
+ suffix = "B";
+ }
+
+ // Check if the fractional part is zero
+ double int_part;
+ double frac_part = std::modf(value, &int_part);
+ if (frac_part == 0.0)
+ {
+            oss << static_cast<long long>(int_part) << suffix;
+ }
+ else
+ {
+ oss.precision(1);
+ oss << std::fixed << value << suffix;
+ }
}
return oss.str();
+
}
/// Size_t-to-string that translates milliseconds to humanized time
@@ -128,9 +151,9 @@ std::string humanize_time(size_t milliseconds)
return oss.str();
}
-/// Parse the humanized RAM size to a number.
-/// I know that the name of this function is unfortunate.
-size_t dehumanize_ram(const std::string& max_ram)
+/// Parse a human-readable --max-ram value as the number of bytes
+/// e.g. 128K, 50M, 4.2Gb etc.
+size_t parse_human_readable(const std::string& max_ram)
{
double value;
char unit = 0;
@@ -140,7 +163,7 @@ size_t dehumanize_ram(const std::string& max_ram)
ss >> value;
if (ss.fail())
{
- throw std::runtime_error("Can't parse max_ram parameter: wrong numerical part");
+ throw std::runtime_error("Could not parse --max-ram parameter: wrong numerical part");
}
// Check if there is a memory unit
@@ -149,7 +172,7 @@ size_t dehumanize_ram(const std::string& max_ram)
ss >> unit;
if (ss.fail())
{
- throw std::runtime_error("Can't parse max_ram parameter: wrong unit");
+ throw std::runtime_error("Could not parse --max-ram parameter: wrong unit");
}
}
@@ -230,10 +253,15 @@ int main(int argc, char** argv)
if (parsed_options.count("max-ram"))
{
const auto max_ram_string = parsed_options["max-ram"].as<std::string>();
- const auto max_ram = dehumanize_ram(max_ram_string);
- max_entries = max_ram / sizeof(i2l::pkdb_value);
+ const auto max_ram = parse_human_readable(max_ram_string);
+        max_entries = static_cast<size_t>(max_ram / sizeof(i2l::pkdb_value));
+
+ if (max_entries == 0)
+ {
+ throw std::runtime_error("Memory limit is too low");
+ }
std::cout << "Max-RAM provided: will be loaded not more than "
- << humanize(max_entries) << " phylo-k-mers." << std::endl;
+ << to_human_readable(max_entries) << " phylo-k-mers." << std::endl;
}
#ifndef EPIK_OMP
@@ -259,8 +287,8 @@ int main(int argc, char** argv)
<< "\tk: " << db.kmer_size() << std::endl
<< "\tomega: " << db.omega() << std::endl
<< "\tPositions loaded: " << (db.positions_loaded() ? "true" : "false") << std::endl << std::endl;
- std::cout << "Loaded " << humanize(db.get_num_entries_loaded())
- << " of " << humanize(db.get_num_entries_total())
+ std::cout << "Loaded " << to_human_readable(db.get_num_entries_loaded())
+ << " of " << to_human_readable(db.get_num_entries_total())
<< " phylo-k-mers. " << std::endl << std::endl;
const auto tree = i2l::io::parse_newick(db.tree());
@@ -325,7 +353,7 @@ int main(int argc, char** argv)
average_speed += seq_per_second;
// Update progress bar
- bar.set_option(option::PrefixText{humanize(seq_per_second) + " seq/s "});
+ bar.set_option(option::PrefixText{to_human_readable(seq_per_second) + " seq/s "});
bar.set_option(option::PostfixText{std::to_string(num_seq_placed) + " / ?"});
bar.set_progress(reader.bytes_read());
@@ -339,12 +367,12 @@ int main(int argc, char** argv)
average_speed /= (double)num_iterations;
bar.set_option(option::PrefixText{"Done. "});
- bar.set_option(option::PostfixText{std::to_string(num_seq_placed)});
+ bar.set_option(option::PostfixText{to_human_readable(num_seq_placed)});
bar.set_progress(reader.bytes_read());
std::cout << std::endl << termcolor::bold << termcolor::white
<< "Placed " << num_seq_placed << " sequences.\nAverage speed: "
- << humanize(average_speed) << " seq/s.\n";
+ << to_human_readable(average_speed) << " seq/s.\n";
std::cout << "Output: " << jplace_filename << std::endl;
const auto placement_time = std::chrono::duration_cast<std::chrono::milliseconds>(
diff --git a/epik/src/epik/place.cpp b/epik/src/epik/place.cpp
index 3a14283..9ca42c7 100644
--- a/epik/src/epik/place.cpp
+++ b/epik/src/epik/place.cpp
@@ -200,6 +200,8 @@ std::vector filter_by_ratio(const std::vector& placements,
placed_collection placer::place(const std::vector& seq_records, size_t num_threads)
{
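+    // Cast to void to mark num_threads as intentionally unused and silence compiler warnings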
+ (void)num_threads;
+
/// There may be identical sequences with different headers. We group them
/// by the sequence content to not to place the same sequences more than once
const auto sequence_map = group_by_sequence_content(seq_records);
diff --git a/scripts/config.json b/scripts/config.json
deleted file mode 100644
index 0a19ce4..0000000
--- a/scripts/config.json
+++ /dev/null
@@ -1,31 +0,0 @@
-{
- "soft":
- {
- "rappas": "/home/nikolai/miniconda3/envs/PEWO/bin/RAPPAS.jar",
- "xpas": "xpas.py",
- "rappas2": "rappas2.py",
- "arbinary": "/home/nikolai/miniconda3/envs/PEWO/bin/raxml-ng"
- },
-
- "args":
- {
- "alignment": "/ngs/data/D652/reference.fasta",
- "tree": "/ngs/data/D652/tree.newick",
- "queries": "/ngs/rappas/D652/EMP_92_studies_10000.fas",
-
- "states": "nucl",
-
- "ardir": "/ngs/rappas/D652/temp/AR",
- "categories": "4",
- "workdir": "/ngs/ppdiff/D652",
-
- "k": "5",
- "omega": "1.5",
- "reduction-ratio": "0.99",
- "filter": "no-filter",
- "mu": "1.0",
-
- "use-unrooted": "True"
-
- }
-}
\ No newline at end of file