Skip to content

Commit

Permalink
update test
Browse files Browse the repository at this point in the history
  • Loading branch information
telatin committed Sep 20, 2024
1 parent d9bdfdf commit ae54809
Show file tree
Hide file tree
Showing 4 changed files with 296 additions and 0 deletions.
Empty file modified src/bench.sh
100644 → 100755
Empty file.
117 changes: 117 additions & 0 deletions src/bench_comp_2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#!/usr/bin/env python
import csv
import sys
from typing import List, NamedTuple
from collections import defaultdict, Counter

class ToolData(NamedTuple):
    """One benchmark record: a tool name plus its hyperfine timing stats.

    Populated from a hyperfine CSV export row (all times in seconds).
    """

    command: str   # tool name (basename of the benchmarked command's first word)
    mean: float    # mean wall-clock time
    stddev: float  # standard deviation of the runs
    median: float  # median wall-clock time
    user: float    # user CPU time
    system: float  # system CPU time
    min: float     # fastest run
    max: float     # slowest run

def parse_csv(file_path: str) -> List[ToolData]:
    """Parse a hyperfine CSV export into a list of ToolData records.

    The expected header/column order is:
    command,mean,stddev,median,user,system,min,max

    The command's first word is reduced to its basename so that e.g.
    ``bin/n50 file.fa`` and ``n50 file.fa`` group under the same tool name.

    Args:
        file_path: path of the CSV file to read.

    Returns:
        One ToolData per data row; an empty list for a header-only or
        empty file.

    Exits the process with a diagnostic message on malformed CSV input.
    """
    data: List[ToolData] = []
    # newline='' is the documented way to open files for the csv module,
    # so embedded newlines inside quoted fields are handled correctly.
    with open(file_path, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        try:
            next(reader)  # Skip header
        except StopIteration:
            return data  # file is empty
        # BUGFIX: csv.Error is raised while iterating rows, not when the
        # reader object is constructed, so the error handler must wrap the
        # row loop (the original wrapped csv.reader() and was dead code).
        try:
            for row in reader:
                if not row:
                    continue  # tolerate blank lines between records
                command, mean, stddev, median, user, system, min_time, max_time = row
                tool = command.split()[0]
                # if tool is a path, extract the basename
                if '/' in tool:
                    tool = tool.split('/')[-1]
                data.append(ToolData(
                    tool,
                    float(mean),
                    float(stddev),
                    float(median),
                    float(user),
                    float(system),
                    float(min_time),
                    float(max_time)
                ))
        except csv.Error as e:
            sys.exit('file {}, line {}: {}'.format(file_path, reader.line_num, e))
    return data

def process_files(file_paths: List[str]) -> List[ToolData]:
    """Parse every CSV file and return all records as one flat list."""
    return [record for path in file_paths for record in parse_csv(path)]

def rank_tools(data: List[ToolData]) -> List[ToolData]:
    """Aggregate records per tool and rank them by average mean time.

    Timing statistics (mean, stddev, median, user, system) are averaged
    across all records of the same tool; min/max are the overall extremes.

    Returns:
        One aggregated ToolData per tool, sorted fastest-first by mean.
    """
    # Bucket the records by tool name.
    grouped = defaultdict(list)
    for record in data:
        grouped[record.command].append(record)

    def _avg(values):
        # Simple arithmetic mean; each bucket is guaranteed non-empty.
        return sum(values) / len(values)

    aggregated = []
    for name, records in grouped.items():
        aggregated.append(ToolData(
            command=name,
            mean=_avg([r.mean for r in records]),
            stddev=_avg([r.stddev for r in records]),
            median=_avg([r.median for r in records]),
            user=_avg([r.user for r in records]),
            system=_avg([r.system for r in records]),
            min=min(r.min for r in records),
            max=max(r.max for r in records),
        ))

    # Fastest (smallest average mean) first.
    return sorted(aggregated, key=lambda record: record.mean)

def count_fastest_tools(file_paths: List[str]) -> Counter:
    """Count, per tool, how many CSV files it was the fastest in.

    Each file contributes one "win" to the tool with the smallest mean
    time in that file; empty files contribute nothing.
    """
    winners = Counter()
    for path in file_paths:
        records = parse_csv(path)
        if not records:
            continue
        best = min(records, key=lambda record: record.mean)
        winners[best.command] += 1
    return winners

def print_ranked_tools(ranked_tools: List[ToolData]):
    """Print the ranked tools as a fixed-width table, fastest first."""
    divider = "-" * 80
    print("Ranked list of tools (from fastest to slowest):")
    print(divider)
    print(f"{'Rank':<5}{'Command':<20}{'Mean (s)':<12}{'Median (s)':<12}{'StdDev (s)':<12}{'Min (s)':<12}{'Max (s)':<12}")
    print(divider)
    rank = 0
    for entry in ranked_tools:
        rank += 1
        row = f"{rank:<5}{entry.command:<20}"
        row += f"{entry.mean:<12.6f}{entry.median:<12.6f}{entry.stddev:<12.6f}"
        row += f"{entry.min:<12.6f}{entry.max:<12.6f}"
        print(row)

def print_fastest_tool_frequency(fastest_tools: Counter):
    """Print how often each tool was the fastest, most frequent first."""
    divider = "-" * 50
    print("\nRanked list of tools by frequency of being fastest:")
    print(divider)
    print(f"{'Rank':<5}{'Command':<20}{'Frequency':<10}")
    print(divider)
    rank = 0
    for tool, count in fastest_tools.most_common():
        rank += 1
        print(f"{rank:<5}{tool:<20}{count:<10}")

def main():
    """CLI entry point: rank tools across one or more hyperfine CSVs."""
    csv_files = sys.argv[1:]
    if not csv_files:
        print("Usage: python script.py <csv_file1> <csv_file2> ...")
        sys.exit(1)

    # Overall ranking across all files combined.
    print_ranked_tools(rank_tools(process_files(csv_files)))

    # Per-file "who was fastest" tally.
    print_fastest_tool_frequency(count_fastest_tools(csv_files))

# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
37 changes: 37 additions & 0 deletions test/benchmark.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env bash
# Benchmark n50 against seqfu and seqkit on every file in INPUT_DIR,
# exporting per-file hyperfine results (CSV + Markdown) to test/benchmark/.
set -euo pipefail
SELF_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
PARENT_DIR="$(dirname "$SELF_DIR")"


# Get argument: INPUT_DIR
if [ "$#" -ne 1 ]; then
    echo "Usage: $0 INPUT_DIR"
    exit 1
fi
INPUT_DIR=$1
if [ ! -d "$INPUT_DIR" ]; then
    echo "Not a directory: $INPUT_DIR"
    exit 1
fi

# Check binary presence with command -v.
# BUGFIX: n50 is addressed via the repo root so the script works from any
# working directory (the original "bin/n50" only resolved from the root).
for BIN in "$PARENT_DIR"/bin/n50 seqfu seqkit hyperfine;
do
    if ! command -v "$BIN" &> /dev/null; then
        echo "Binary not found: $BIN"
        exit 1
    else
        echo "OK: Binary found: $BIN"
    fi
done

# hyperfine does not create the export directory; make sure it exists
# (test/test.sh does the same before exporting).
mkdir -p "$PARENT_DIR"/test/benchmark

for FILE in "$INPUT_DIR"/*;
do
    echo "== TEST $FILE"
    # Flatten dots so the file name is a safe export-file suffix.
    BASE=$(basename "$FILE" | sed 's/\./_/g')
    hyperfine --warmup 1 --max-runs 5 \
        --export-csv "$PARENT_DIR"/test/benchmark/single_"$BASE".csv \
        --export-markdown "$PARENT_DIR"/test/benchmark/single_"$BASE".md \
        -n "n50" -n "seqfu" -n "seqkit" \
        "$PARENT_DIR/bin/n50 $FILE" \
        "seqfu stats $FILE" \
        "seqkit stats --all $FILE"
done
142 changes: 142 additions & 0 deletions test/test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
#!/usr/bin/env bash
# End-to-end test for n50: simulate reads, check n50's output against the
# expected values encoded in each simulated file name, then (optionally)
# benchmark n50 against seqfu and seqkit.
set -euo pipefail
# Resolve the repository root relative to this script's own location.
SELF_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
PARENT_DIR="$(dirname "$SELF_DIR")"
BIN_DIR="$PARENT_DIR/bin"
# Check binary presence (project binaries are expected as files in bin/,
# not on PATH, hence the -f test rather than command -v)
for BIN in n50 n50_simreads gen;
do
if [ ! -f "$BIN_DIR/$BIN" ]; then
echo "Binary not found: $BIN"
exit 1
else
echo "OK: Binary found: $BIN"
fi
done

# Any single argument enables the deep (large-input) test.
DEEP=0
if [ "$#" -eq 1 ]; then
    DEEP=1
fi

# Simulate reads into test/sim/, starting from a clean directory.
OUT_DIR="$PARENT_DIR/test/sim/"
if [ -d "$OUT_DIR" ]; then
    echo "Cleaning $OUT_DIR"
    # Remove all files if not empty already
    rm -f "$OUT_DIR"/* || true
fi
mkdir -p "$OUT_DIR"
COUNTER=0
# $COMPRESSOR = pigz if available, else gzip.
# BUGFIX: under `set -e`, a bare COMPRESSOR=$(which pigz) aborts the whole
# script when pigz is missing (an assignment's exit status is the command
# substitution's), so the gzip fallback was unreachable. Do the fallback
# inside one guarded command list instead.
COMPRESSOR=$(command -v pigz || command -v gzip || true)
if [ -z "$COMPRESSOR" ]; then
    echo "No compressor found (need pigz or gzip)"
    exit 1
fi

for FORMAT in fasta fastq;
do
    # Each n50_simreads invocation produces one file; compressing with -k
    # keeps the original, so 2 invocations => 4 files per format.
    COUNTER=$((COUNTER+4))
    # Small/Medium. Size specs are quoted so `100*250` etc. cannot be
    # expanded as shell globs against files in the working directory.
    "${BIN_DIR}"/n50_simreads --${FORMAT} -o "$OUT_DIR" "100*250" "20*1k" "1*50k" 2> /dev/null
    "${BIN_DIR}"/n50_simreads --${FORMAT} -o "$OUT_DIR" "100*1000" "10*100k" "1*10M" 2> /dev/null
    # Large (deep mode, FASTQ only)
    if [[ $FORMAT == "fastq" ]] && [[ $DEEP == 1 ]]; then
        "${BIN_DIR}"/n50_simreads --${FORMAT} -o "$OUT_DIR" "100*5k" "1*1G" 2> /dev/null
        # BUGFIX: the deep run adds one more file plus its compressed copy;
        # count them so the "$I/$COUNTER" progress display stays correct.
        COUNTER=$((COUNTER+2))
    fi
    # -k keeps the plain files so both plain and .gz variants are tested
    ${COMPRESSOR} -k "$OUT_DIR"/*."$FORMAT"
done

# Verify n50's report for every simulated file. The simulated file names
# appear to encode the expected values as <n50>_<seqs>_<bases>.<ext>
# (that is what the cut chains below recover) — the expectations are
# parsed back out of each basename and compared with n50's output.
I=0
for FILE in "$OUT_DIR"/*;
do
I=$((I+1))
echo "$I/$COUNTER: Simulated reads: $FILE"
# n50 prints a tab-separated record; normalize tabs to commas for cut.
OUTPUT=$("${BIN_DIR}"/n50 "$FILE" | sed 's/\t/,/g')
# First field
FILENAME=$(basename $(echo $OUTPUT | cut -d' ' -f1))
# Expected values parsed from the file name; the trailing cut on '.'
# strips the extension from the last underscore-separated token.
EXP_N50=$(echo "${FILENAME}" | cut -f 1 -d '_' | cut -f 1 -d '.')
EXP_SEQS=$(echo "${FILENAME}" | cut -f 2 -d '_' | cut -f 1 -d '.')
EXP_BASES=$(echo "${FILENAME}" | cut -f 3 -d '_' | cut -f 1 -d '.')
# Observed values from n50's record: bases=col 3, seqs=col 4, N50=col 5.
REAL_N50=$(echo "$OUTPUT" | cut -d',' -f 5)
REAL_SEQS=$(echo "$OUTPUT" | cut -d',' -f 4)
REAL_BASES=$(echo "$OUTPUT" | cut -d',' -f 3)
# Any mismatch fails the whole test run immediately.
if [ "$EXP_N50" != "$REAL_N50" ]; then
echo "ERROR: N50 mismatch: $EXP_N50 != $REAL_N50"
exit 1
else
echo "OK: N50 match: $EXP_N50 == $REAL_N50"
fi

if [ "$EXP_SEQS" != "$REAL_SEQS" ]; then
echo "ERROR: Sequences mismatch: $EXP_SEQS != $REAL_SEQS"
exit 1
else
echo "OK: Sequences match: $EXP_SEQS == $REAL_SEQS"
fi


if [ "$EXP_BASES" != "$REAL_BASES" ]; then
echo "ERROR: Bases mismatch: $EXP_BASES != $REAL_BASES"
exit 1
else
echo "OK: Bases match: $EXP_BASES == $REAL_BASES"
fi
echo ""
done

# Benchmarks run only when hyperfine, seqkit and seqfu are all available;
# otherwise the test ends here (successfully). command -v is used for
# consistency with the checks at the top of the repo's scripts.
if ! command -v hyperfine &> /dev/null || ! command -v seqkit &> /dev/null || ! command -v seqfu &> /dev/null; then
echo "Skipping benchmark"
exit 0
fi

# hyperfine does not create the export directory itself.
mkdir -p "$PARENT_DIR"/test/benchmark/

# Aggregate benchmarks: all files, then per format, then per compressed
# format. The globs are intentionally left unexpanded inside the quoted
# command strings — hyperfine's shell expands them at run time.
hyperfine --warmup 1 --max-runs 9 \
--export-csv "$PARENT_DIR"/test/benchmark/all_files.csv \
-n "n50" -n "seqfu" -n "seqkit" \
"${BIN_DIR}/n50 $OUT_DIR/*" \
"seqfu stats $OUT_DIR/*" \
"seqkit stats --all $OUT_DIR/*"

hyperfine --warmup 1 --max-runs 9 \
--export-csv "$PARENT_DIR"/test/benchmark/all_fastq.csv \
-n "n50" -n "seqfu" -n "seqkit" \
"${BIN_DIR}/n50 $OUT_DIR/*.fastq" \
"seqfu stats $OUT_DIR/*.fastq" \
"seqkit stats --all $OUT_DIR/*.fastq"

hyperfine --warmup 1 --max-runs 9 \
--export-csv "$PARENT_DIR"/test/benchmark/all_fasta.csv \
-n "n50" -n "seqfu" -n "seqkit" \
"${BIN_DIR}/n50 $OUT_DIR/*.fasta" \
"seqfu stats $OUT_DIR/*.fasta" \
"seqkit stats --all $OUT_DIR/*.fasta"

hyperfine --warmup 1 --max-runs 9 \
--export-csv "$PARENT_DIR"/test/benchmark/all_fasta_gz.csv \
-n "n50" -n "seqfu" -n "seqkit" \
"${BIN_DIR}/n50 $OUT_DIR/*.fasta.gz" \
"seqfu stats $OUT_DIR/*.fasta.gz" \
"seqkit stats --all $OUT_DIR/*.fasta.gz"

hyperfine --warmup 1 --max-runs 9 \
--export-csv "$PARENT_DIR"/test/benchmark/all_fastq_gz.csv \
-n "n50" -n "seqfu" -n "seqkit" \
"${BIN_DIR}/n50 $OUT_DIR/*.fastq.gz" \
"seqfu stats $OUT_DIR/*.fastq.gz" \
"seqkit stats --all $OUT_DIR/*.fastq.gz"

# Per-file benchmarks, one CSV per input file.
for FILE in "$OUT_DIR"/*;
do
echo "Benchmarking single file: $FILE"
# BUGFIX: quote $FILE and the export path so file names with spaces
# survive word splitting (consistent with test/benchmark.sh).
BASE=$(basename "$FILE" | sed 's/\./_/g')
hyperfine --warmup 1 --max-runs 9 \
--export-csv "$PARENT_DIR"/test/benchmark/single_"${BASE}".csv \
-n "n50" -n "seqfu" -n "seqkit" \
"${BIN_DIR}/n50 $FILE" \
"seqfu stats $FILE" \
"seqkit stats --all $FILE"
done

0 comments on commit ae54809

Please sign in to comment.