Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 19 additions & 28 deletions benchmarks/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -629,22 +629,24 @@ data_tpch() {
exit 1
}

# Points to TPCDS data generation instructions
# Downloads TPC-DS data
data_tpcds() {
TPCDS_DIR="${DATA_DIR}"

# Check if TPCDS data directory exists
if [ ! -d "${TPCDS_DIR}" ]; then
echo ""
echo "For TPC-DS data generation, please clone the datafusion-benchmarks repository:"
echo " git clone https://github.com/apache/datafusion-benchmarks"
echo ""
return 1
TPCDS_DIR="${DATA_DIR}/tpcds_sf1"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Move this to line 43 ?!
This way it will be defined once and reused by data_tpcds() and run_tpcds().

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a good idea and it would avoid deuplication. However, none of the other datasets follow this pattern (they all duplicate the paths), so in this case I would prefer to keep the code consistent (we can refactor the common locations into variables as a follow on PR if we want)


# Check if `web_site.parquet` exists in the TPCDS data directory to verify data presence
echo "Checking TPC-DS data directory: ${TPCDS_DIR}"
if [ ! -f "${TPCDS_DIR}/web_site.parquet" ]; then
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Extract a variable for "${TPCDS_DIR}/web_site.parquet" at line 44 ? And reuse it in data_tpcds() and run_tpcds()

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above

mkdir -p "${TPCDS_DIR}"
# Download the DataFusion benchmarks repository zip if it is not already downloaded
if [ ! -f "${DATA_DIR}/datafusion-benchmarks.zip" ]; then
echo "Downloading DataFusion benchmarks repository zip to: ${DATA_DIR}/datafusion-benchmarks.zip"
wget --timeout=30 --tries=3 -O "${DATA_DIR}/datafusion-benchmarks.zip" https://github.com/apache/datafusion-benchmarks/archive/refs/heads/main.zip
fi
echo "Extracting TPC-DS parquet data to ${TPCDS_DIR}..."
unzip -o -j -d "${TPCDS_DIR}" "${DATA_DIR}/datafusion-benchmarks.zip" datafusion-benchmarks-main/tpcds/data/sf1/*
echo "TPC-DS data extracted."
fi

echo ""
echo "TPC-DS data already exists in ${TPCDS_DIR}"
echo ""
echo "Done."
}

# Runs the tpch benchmark
Expand Down Expand Up @@ -682,21 +684,10 @@ run_tpch_mem() {

# Runs the tpcds benchmark
run_tpcds() {
TPCDS_DIR="${DATA_DIR}"

# Check if TPCDS data directory exists
if [ ! -d "${TPCDS_DIR}" ]; then
echo "Error: TPC-DS data directory does not exist: ${TPCDS_DIR}" >&2
echo "" >&2
echo "Please prepare TPC-DS data first by following instructions:" >&2
echo " ./bench.sh data tpcds" >&2
echo "" >&2
exit 1
fi
TPCDS_DIR="${DATA_DIR}/tpcds_sf1"

# Check if directory contains parquet files
if ! find "${TPCDS_DIR}" -name "*.parquet" -print -quit | grep -q .; then
echo "Error: TPC-DS data directory exists but contains no parquet files: ${TPCDS_DIR}" >&2
# Check if TPCDS data directory and representative file exists
if [ ! -f "${TPCDS_DIR}/web_site.parquet" ]; then
echo "" >&2
echo "Please prepare TPC-DS data first by following instructions:" >&2
echo " ./bench.sh data tpcds" >&2
Expand Down