Skip to content

Commit 35ffdb4

Browse files
committed
Only create files that are needed
1 parent 03fd2f2 commit 35ffdb4

File tree

1 file changed

+33
-36
lines changed

1 file changed

+33
-36
lines changed

benchmarks/bench.sh

Lines changed: 33 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -203,26 +203,22 @@ main() {
203203
# nlj uses range() function, no data generation needed
204204
;;
205205
tpch)
206-
data_tpch "1"
206+
data_tpch "1" "parquet"
207207
;;
208208
tpch_mem)
209-
# same data as for tpch
210-
data_tpch "1"
209+
data_tpch "1" "parquet"
211210
;;
212211
tpch_csv)
213-
# same data as for tpch
214-
data_tpch "1"
212+
data_tpch "1" "csv"
215213
;;
216214
tpch10)
217-
data_tpch "10"
215+
data_tpch "10" "parquet"
218216
;;
219217
tpch_mem10)
220-
# same data as for tpch10
221-
data_tpch "10"
218+
data_tpch "10" "parquet"
222219
;;
223220
tpch_csv10)
224-
# same data as for tpch10
225-
data_tpch "10"
221+
data_tpch "10" "csv"
226222
;;
227223
clickbench_1)
228224
data_clickbench_1
@@ -537,7 +533,7 @@ main() {
537533
# Creates TPCH data at a certain scale factor, if it doesn't already
538534
# exist
539535
#
540-
# call like: data_tpch($scale_factor)
536+
# call like: data_tpch($scale_factor, $format) where $format is 'parquet' or 'csv'
541537
#
542538
# Creates data in $DATA_DIR/tpch_sf1 for scale factor 1
543539
# Creates data in $DATA_DIR/tpch_sf10 for scale factor 10
@@ -548,9 +544,10 @@ data_tpch() {
548544
echo "Internal error: Scale factor not specified"
549545
exit 1
550546
fi
547+
FORMAT=$2
551548

552549
TPCH_DIR="${DATA_DIR}/tpch_sf${SCALE_FACTOR}"
553-
echo "Creating tpch dataset at Scale Factor ${SCALE_FACTOR} in ${TPCH_DIR}..."
550+
echo "Creating tpch $FORMAT dataset at Scale Factor ${SCALE_FACTOR} in ${TPCH_DIR}..."
554551

555552
# Ensure the target data directory exists
556553
mkdir -p "${TPCH_DIR}"
@@ -562,15 +559,6 @@ data_tpch() {
562559
exit 1
563560
fi
564561

565-
# Create 'tbl' (CSV format) data into $DATA_DIR if it does not already exist
566-
FILE="${TPCH_DIR}/supplier.tbl"
567-
if test -f "${FILE}"; then
568-
echo " tbl files exist ($FILE exists)."
569-
else
570-
echo " creating tbl files with tpchgen-cli..."
571-
tpchgen-cli --scale-factor "${SCALE_FACTOR}" --format tbl --output-dir "${TPCH_DIR}"
572-
fi
573-
574562
# Copy expected answers into the ./data/answers directory if it does not already exist
575563
FILE="${TPCH_DIR}/answers/q1.out"
576564
if test -f "${FILE}"; then
@@ -581,23 +569,32 @@ data_tpch() {
581569
docker run -v "${TPCH_DIR}":/data -it --entrypoint /bin/bash --rm ghcr.io/scalytics/tpch-docker:main -c "cp -f /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/"
582570
fi
583571

584-
# Create 'parquet' files, one directory per file
585-
FILE="${TPCH_DIR}/supplier"
586-
if test -d "${FILE}"; then
587-
echo " parquet files exist ($FILE exists)."
588-
else
589-
echo " creating parquet files using tpchgen-cli ..."
590-
tpchgen-cli --scale-factor "${SCALE_FACTOR}" --format parquet --parquet-compression='ZSTD(1)' --parts=1 --output-dir "${TPCH_DIR}"
572+
if [ "$FORMAT" = "parquet" ]; then
573+
# Create 'parquet' files, one directory per file
574+
FILE="${TPCH_DIR}/supplier"
575+
if test -d "${FILE}"; then
576+
echo " parquet files exist ($FILE exists)."
577+
else
578+
echo " creating parquet files using tpchgen-cli ..."
579+
tpchgen-cli --scale-factor "${SCALE_FACTOR}" --format parquet --parquet-compression='ZSTD(1)' --parts=1 --output-dir "${TPCH_DIR}"
580+
fi
581+
return
591582
fi
592583

593-
# Create 'csv' files, one direcotry per file
594-
FILE="${TPCH_DIR}/csv/supplier"
595-
if test -d "${FILE}"; then
596-
echo " csv files exist ($FILE exists)."
597-
else
598-
echo " creating csv files using tpchgen-cli binary ..."
599-
tpchgen-cli --scale-factor "${SCALE_FACTOR}" --format csv --parts=1 --output-dir "${TPCH_DIR}/csv"
584+
# Create 'csv' files, one directory per file
585+
if [ "$FORMAT" = "csv" ]; then
586+
FILE="${TPCH_DIR}/csv/supplier"
587+
if test -d "${FILE}"; then
588+
echo " csv files exist ($FILE exists)."
589+
else
590+
echo " creating csv files using tpchgen-cli binary ..."
591+
tpchgen-cli --scale-factor "${SCALE_FACTOR}" --format csv --parts=1 --output-dir "${TPCH_DIR}/csv"
592+
fi
593+
return
600594
fi
595+
596+
echo "Error: unknown format '$FORMAT' for tpch data generation, expected 'parquet' or 'csv'"
597+
exit 1
601598
}
602599

603600
# Runs the tpch benchmark
@@ -617,7 +614,7 @@ run_tpch() {
617614
debug_run $CARGO_COMMAND --bin dfbench -- tpch --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format ${FORMAT} -o "${RESULTS_FILE}" ${QUERY_ARG}
618615
}
619616

620-
# Runs the tpch in memory
617+
# Runs the tpch benchmark in memory (requires tpch parquet data)
621618
run_tpch_mem() {
622619
SCALE_FACTOR=$1
623620
if [ -z "$SCALE_FACTOR" ] ; then

0 commit comments

Comments
 (0)