-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
296 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
#!/usr/bin/env python | ||
import csv | ||
import sys | ||
from typing import List, NamedTuple | ||
from collections import defaultdict, Counter | ||
|
||
class ToolData(NamedTuple):
    """One benchmark record for a tool, as read from a hyperfine CSV row.

    All timing fields are in seconds, mirroring hyperfine's CSV export
    columns. `min`/`max` shadow the builtins but keep the CSV column names.
    """
    command: str
    mean: float
    stddev: float
    median: float
    user: float
    system: float
    min: float
    max: float
|
||
def parse_csv(file_path: str) -> List[ToolData]:
    """Parse one hyperfine CSV export into a list of ToolData records.

    The first word of each benchmarked command is kept as the tool name,
    with any leading path stripped to its basename. An empty file yields
    an empty list. On a malformed CSV the process exits with a diagnostic.
    """
    data: List[ToolData] = []
    # newline='' is the csv-module-recommended way to open CSV files.
    with open(file_path, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        # csv.Error is raised while *iterating* the reader, not when it is
        # constructed, so the whole read loop must sit inside the handler
        # (the original guarded only csv.reader(), which never raises it).
        try:
            header = next(reader, None)  # Skip header
            if header is None:
                return data  # empty file: no header, no rows
            for row in reader:
                command, mean, stddev, median, user, system, min_time, max_time = row
                tool = command.split()[0]
                # if tool is a path, extract the basename
                if '/' in tool:
                    tool = tool.split('/')[-1]
                data.append(ToolData(
                    tool,
                    float(mean),
                    float(stddev),
                    float(median),
                    float(user),
                    float(system),
                    float(min_time),
                    float(max_time),
                ))
        except csv.Error as e:
            sys.exit('file {}, line {}: {}'.format(file_path, reader.line_num, e))
    return data
|
||
def process_files(file_paths: List[str]) -> List[ToolData]:
    """Parse every CSV file in *file_paths* and concatenate the records."""
    return [record for path in file_paths for record in parse_csv(path)]
|
||
def rank_tools(data: List[ToolData]) -> List[ToolData]:
    """Average the measurements per command and rank fastest-first by mean.

    min/max are taken over all runs of a command; every other field is the
    arithmetic mean across that command's records.
    """
    # Group records by command name.
    grouped = defaultdict(list)
    for record in data:
        grouped[record.command].append(record)

    def _avg(values) -> float:
        # Arithmetic mean of a non-empty sequence of floats.
        items = list(values)
        return sum(items) / len(items)

    averaged = [
        ToolData(
            command=name,
            mean=_avg(r.mean for r in records),
            stddev=_avg(r.stddev for r in records),
            median=_avg(r.median for r in records),
            user=_avg(r.user for r in records),
            system=_avg(r.system for r in records),
            min=min(r.min for r in records),
            max=max(r.max for r in records),
        )
        for name, records in grouped.items()
    ]
    # Fastest (lowest mean) first.
    averaged.sort(key=lambda t: t.mean)
    return averaged
|
||
def count_fastest_tools(file_paths: List[str]) -> Counter:
    """Count, per CSV file, which tool had the lowest mean run time."""
    winners: Counter = Counter()
    for path in file_paths:
        records = parse_csv(path)
        if not records:
            continue  # empty exports contribute nothing
        best = min(records, key=lambda r: r.mean)
        winners[best.command] += 1
    return winners
|
||
def print_ranked_tools(ranked_tools: List[ToolData]):
    """Print a fixed-width table of tools ordered fastest to slowest."""
    rule = "-" * 80
    header = f"{'Rank':<5}{'Command':<20}{'Mean (s)':<12}{'Median (s)':<12}{'StdDev (s)':<12}{'Min (s)':<12}{'Max (s)':<12}"
    print("Ranked list of tools (from fastest to slowest):")
    print(rule)
    print(header)
    print(rule)
    for i, tool in enumerate(ranked_tools, 1):
        row = f"{i:<5}{tool.command:<20}{tool.mean:<12.6f}{tool.median:<12.6f}{tool.stddev:<12.6f}{tool.min:<12.6f}{tool.max:<12.6f}"
        print(row)
|
||
def print_fastest_tool_frequency(fastest_tools: Counter):
    """Print how often each tool was the fastest, most frequent first."""
    separator = "-" * 50
    print("\nRanked list of tools by frequency of being fastest:")
    print(separator)
    print(f"{'Rank':<5}{'Command':<20}{'Frequency':<10}")
    print(separator)
    rank = 1
    for tool, count in fastest_tools.most_common():
        print(f"{rank:<5}{tool:<20}{count:<10}")
        rank += 1
|
||
def main():
    """CLI entry point: rank tools across one or more hyperfine CSV files."""
    csv_files = sys.argv[1:]
    if not csv_files:
        print("Usage: python script.py <csv_file1> <csv_file2> ...")
        sys.exit(1)

    # Overall ranking across all files combined.
    print_ranked_tools(rank_tools(process_files(csv_files)))

    # Per-file winners, tallied.
    print_fastest_tool_frequency(count_fastest_tools(csv_files))


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#!/usr/bin/env bash
# Benchmark this repo's n50 against seqfu and seqkit on every file in
# INPUT_DIR, exporting one CSV and one Markdown report per input file.
set -euo pipefail
SELF_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
PARENT_DIR="$(dirname "$SELF_DIR")"

# Get argument: INPUT_DIR
if [ "$#" -ne 1 ]; then
  echo "Usage: $0 INPUT_DIR"
  exit 1
fi
INPUT_DIR=$1

# Check binary presence.
# The repo binary is addressed through $PARENT_DIR so the script works from
# any working directory (the original used the CWD-relative path bin/n50).
for BIN in "$PARENT_DIR"/bin/n50 seqfu seqkit hyperfine;
do
  # Check binary presence with command -v
  if ! command -v "$BIN" &> /dev/null; then
    echo "Binary not found: $BIN"
    exit 1
  else
    echo "OK: Binary found: $BIN"
  fi
done

# hyperfine does not create the export directory; make sure it exists.
mkdir -p "$PARENT_DIR"/test/benchmark

for FILE in "$INPUT_DIR"/*;
do
  echo "== TEST $FILE"
  BASE=$(basename "$FILE" | sed 's/\./_/g')
  hyperfine --warmup 1 --max-runs 5 \
    --export-csv "$PARENT_DIR"/test/benchmark/single_"$BASE".csv \
    --export-markdown "$PARENT_DIR"/test/benchmark/single_"$BASE".md \
    -n "n50" -n "seqfu" -n "seqkit" \
    "$PARENT_DIR/bin/n50 $FILE" \
    "seqfu stats $FILE" \
    "seqkit stats --all $FILE"
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,142 @@ | ||
#!/usr/bin/env bash
# Simulate read sets with n50_simreads, verify bin/n50 against the expected
# values encoded in the generated filenames, then benchmark against
# seqfu/seqkit where that tooling is available.
set -euo pipefail
SELF_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
PARENT_DIR="$(dirname "$SELF_DIR")"
BIN_DIR="$PARENT_DIR/bin"

# Check binary presence
for BIN in n50 n50_simreads gen; do
  if [ -f "$BIN_DIR/$BIN" ]; then
    echo "OK: Binary found: $BIN"
  else
    echo "Binary not found: $BIN"
    exit 1
  fi
done
|
||
# get 1 argument to perform deep test
DEEP=0
if [ "$#" -eq 1 ]; then
  DEEP=1
fi

# Simulate reads
OUT_DIR="$PARENT_DIR/test/sim/"
if [ -d "$OUT_DIR" ]; then
  echo "Cleaning $OUT_DIR"
  # Remove all files if not empty already
  rm -f "$OUT_DIR"/* || true
fi
mkdir -p "$OUT_DIR"
COUNTER=0
# $COMPRESSOR = pigz if available, gzip otherwise.
# Under `set -e` a plain COMPRESSOR=$(which pigz) aborts the whole script
# when pigz is missing (an assignment's exit status is the substitution's),
# so the gzip fallback below could never run; `|| true` defuses that.
COMPRESSOR=$(which pigz || true)
if [ -z "$COMPRESSOR" ]; then
  COMPRESSOR=$(which gzip)
fi
|
||
# Generate simulated read sets in both formats, then compress copies.
# $OUT_DIR is quoted throughout: the original left it unquoted, so any
# space in the repository path would word-split the arguments.
for FORMAT in fasta fastq;
do
  # NOTE(review): COUNTER assumes 4 output files per format regardless of
  # the deep run adding more — confirm against n50_simreads' output.
  COUNTER=$((COUNTER+4))
  # Small/Medium
  "${BIN_DIR}"/n50_simreads --${FORMAT} -o "$OUT_DIR" 100*250 20*1k 1*50k 2> /dev/null
  "${BIN_DIR}"/n50_simreads --${FORMAT} -o "$OUT_DIR" 100*1000 10*100k 1*10M 2> /dev/null
  # Large
  if [[ $FORMAT == "fastq" ]] && [[ $DEEP == 1 ]]; then
    "${BIN_DIR}"/n50_simreads --${FORMAT} -o "$OUT_DIR" 100*5k 1*1G 2> /dev/null
  fi
  # -k keeps the originals next to the .gz copies.
  ${COMPRESSOR} -k "$OUT_DIR"/*."$FORMAT"
done
|
||
# Verify bin/n50 against the expectations encoded in each simulated filename.
# n50_simreads appears to name files <N50>_<SEQS>_<BASES>.<ext> — TODO
# confirm against n50_simreads — so the expected values can be cut out of
# the basename and compared with n50's actual report.
I=0
for FILE in "$OUT_DIR"/*;
do
  I=$((I+1))
  echo "$I/$COUNTER: Simulated reads: $FILE"
  # n50's output is tab-separated; normalize tabs to commas for cut below.
  OUTPUT=$("${BIN_DIR}"/n50 "$FILE" | sed 's/\t/,/g')
  # First field
  FILENAME=$(basename $(echo $OUTPUT | cut -d' ' -f1))
  # Expected values parsed from the filename: split on '_', then drop
  # everything after the first '.' (the extension).
  EXP_N50=$(echo "${FILENAME}" | cut -f 1 -d '_' | cut -f 1 -d '.')
  EXP_SEQS=$(echo "${FILENAME}" | cut -f 2 -d '_' | cut -f 1 -d '.')
  EXP_BASES=$(echo "${FILENAME}" | cut -f 3 -d '_' | cut -f 1 -d '.')
  # Actual values from n50's comma-ified output; presumably the columns are
  # 3=bases, 4=sequences, 5=N50 — verify against n50's output format.
  REAL_N50=$(echo "$OUTPUT" | cut -d',' -f 5)
  REAL_SEQS=$(echo "$OUTPUT" | cut -d',' -f 4)
  REAL_BASES=$(echo "$OUTPUT" | cut -d',' -f 3)
  # Any mismatch fails the whole test run immediately.
  if [ "$EXP_N50" != "$REAL_N50" ]; then
    echo "ERROR: N50 mismatch: $EXP_N50 != $REAL_N50"
    exit 1
  else
    echo "OK: N50 match: $EXP_N50 == $REAL_N50"
  fi

  if [ "$EXP_SEQS" != "$REAL_SEQS" ]; then
    echo "ERROR: Sequences mismatch: $EXP_SEQS != $REAL_SEQS"
    exit 1
  else
    echo "OK: Sequences match: $EXP_SEQS == $REAL_SEQS"
  fi

  if [ "$EXP_BASES" != "$REAL_BASES" ]; then
    echo "ERROR: Bases mismatch: $EXP_BASES != $REAL_BASES"
    exit 1
  else
    echo "OK: Bases match: $EXP_BASES == $REAL_BASES"
  fi
  echo ""
done
|
||
# if hyperfine and seqkit and seqfu are available go on
if [ -z "$(which hyperfine)" ] || [ -z "$(which seqkit)" ] || [ -z "$(which seqfu)" ]; then
  echo "Skipping benchmark"
  exit 0
fi

mkdir -p "$PARENT_DIR"/test/benchmark/

# Run one hyperfine comparison of n50 vs seqfu vs seqkit.
#   $1 = CSV basename (without extension)
#   $2 = glob string handed verbatim to each benchmarked command line
bench_glob() {
  hyperfine --warmup 1 --max-runs 9 \
    --export-csv "$PARENT_DIR"/test/benchmark/"$1".csv \
    -n "n50" -n "seqfu" -n "seqkit" \
    "${BIN_DIR}/n50 $2" \
    "seqfu stats $2" \
    "seqkit stats --all $2"
}

bench_glob all_files    "$OUT_DIR/*"
bench_glob all_fastq    "$OUT_DIR/*.fastq"
bench_glob all_fasta    "$OUT_DIR/*.fasta"
bench_glob all_fasta_gz "$OUT_DIR/*.fasta.gz"
bench_glob all_fastq_gz "$OUT_DIR/*.fastq.gz"
|
||
for FILE in "$OUT_DIR"/*; | ||
do | ||
echo "Benchmarking single file: $FILE" | ||
BASE=$(basename $FILE | sed 's/\./_/g') | ||
hyperfine --warmup 1 --max-runs 9 \ | ||
--export-csv $PARENT_DIR/test/benchmark/single_${BASE}.csv \ | ||
-n "n50" -n "seqfu" -n "seqkit" \ | ||
"${BIN_DIR}/n50 $FILE" \ | ||
"seqfu stats $FILE" \ | ||
"seqkit stats --all $FILE" | ||
done |