Skip to content

Commit

Permalink
Rename data prep tool
Browse files (browse the repository at this point in the history)
  • Loading branch information
ankoh committed Oct 7, 2021
1 parent bda2c78 commit 996ff7d
Show file tree
Hide file tree
Showing 12 changed files with 67 additions and 67 deletions.
22 changes: 11 additions & 11 deletions .github/workflows/benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ on:
workflow_dispatch:

jobs:
tpch_generator:
tpchgen:
name: TPCH Generator
runs-on: ubuntu-latest
steps:
Expand Down Expand Up @@ -33,8 +33,8 @@ jobs:
./submodules/tpch-dbgen/dbgen/dbgen
retention-days: 1

parquet_generator:
name: Parquet Generator
dataprep:
name: Dataprep
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
Expand All @@ -47,23 +47,23 @@ jobs:
path: |
./.cargo
./target
key: ${{ runner.os }}-datagen-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./tools/parquetgen/src/*.rs') }}
key: ${{ runner.os }}-dataprep-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./tools/dataprep/src/*.rs') }}
restore-keys: |
${{ runner.os }}-datagen-
${{ runner.os }}-dataprep-
- name: Build generator
uses: duckdb/duckdb-wasm-ci-env@v0.4
if: steps.cache-generator.outputs.cache-hit != 'true'
with:
script: |-
cargo build --manifest-path=./Cargo.toml --release -p parquetgen
cargo build --manifest-path=./Cargo.toml --release -p dataprep
- name: Upload artifact
uses: actions/upload-artifact@v2
with:
name: parquetgen
name: dataprep
path: |
./target/release/parquetgen
./target/release/dataprep
retention-days: 1

duckdb_shell:
Expand Down Expand Up @@ -106,8 +106,8 @@ jobs:
name: Dummy Benchmark
runs-on: ubuntu-latest
needs:
- parquet_generator
- tpch_generator
- dataprep
- tpchgen
- duckdb_shell
steps:
- uses: actions/checkout@v2
Expand All @@ -117,7 +117,7 @@ jobs:

- uses: actions/download-artifact@v2
with:
name: parquetgen
name: dataprep
path: ./target/release/

- uses: actions/download-artifact@v2
Expand Down
36 changes: 18 additions & 18 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ jobs:
script: |-
yarn workspace @duckdb/duckdb-wasm run lint
tpch_generator:
tpchgen:
name: TPCH Generator
runs-on: ubuntu-latest
steps:
Expand All @@ -68,7 +68,7 @@ jobs:
id: cache-generator
with:
path: ./submodules/tpch-dbgen/dbgen/dbgen
key: ${{ runner.os }}-tpch-dben
key: ${{ runner.os }}-tpch-dbgen

- name: Build generator
if: steps.cache-generator.outputs.cache-hit != 'true'
Expand All @@ -85,8 +85,8 @@ jobs:
./submodules/tpch-dbgen/dbgen/dbgen
retention-days: 1

parquet_generator:
name: Parquet Generator
dataprep:
name: Dataprep
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
Expand All @@ -99,23 +99,23 @@ jobs:
path: |
./.cargo
./target
key: ${{ runner.os }}-datagen-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./tools/parquetgen/src/*.rs') }}
key: ${{ runner.os }}-dataprep-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./tools/dataprep/src/*.rs') }}
restore-keys: |
${{ runner.os }}-datagen-
${{ runner.os }}-dataprep-
- name: Build generator
uses: duckdb/duckdb-wasm-ci-env@v0.4
if: steps.cache-generator.outputs.cache-hit != 'true'
with:
script: |-
cargo build --manifest-path=./Cargo.toml --release -p parquetgen
cargo build --manifest-path=./Cargo.toml --release -p dataprep
- name: Upload artifact
uses: actions/upload-artifact@v2
with:
name: parquetgen
name: dataprep
path: |
./target/release/parquetgen
./target/release/dataprep
retention-days: 1

duckdb_shell:
Expand Down Expand Up @@ -160,8 +160,8 @@ jobs:
needs:
- clang_format
- eslint
- parquet_generator
- tpch_generator
- dataprep
- tpchgen
steps:
- uses: actions/checkout@v2
with:
Expand All @@ -170,7 +170,7 @@ jobs:

- uses: actions/download-artifact@v2
with:
name: parquetgen
name: dataprep
path: ./target/release/

- uses: actions/download-artifact@v2
Expand Down Expand Up @@ -250,8 +250,8 @@ jobs:
needs:
- clang_format
- eslint
- parquet_generator
- tpch_generator
- dataprep
- tpchgen
steps:
- uses: actions/checkout@v2
with:
Expand All @@ -260,7 +260,7 @@ jobs:

- uses: actions/download-artifact@v2
with:
name: parquetgen
name: dataprep
path: ./target/release/

- uses: actions/download-artifact@v2
Expand Down Expand Up @@ -498,8 +498,8 @@ jobs:
- wasm_default
- wasm_next
- wasm_next_coi
- parquet_generator
- tpch_generator
- dataprep
- tpchgen
- duckdb_shell
steps:
- uses: actions/checkout@v2
Expand All @@ -509,7 +509,7 @@ jobs:

- uses: actions/download-artifact@v2
with:
name: parquetgen
name: dataprep
path: ./target/release/

- uses: actions/download-artifact@v2
Expand Down
18 changes: 9 additions & 9 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[workspace]

members = [
"tools/parquetgen",
"tools/dataprep",
"packages/duckdb-wasm-shell/crate",
]
40 changes: 20 additions & 20 deletions scripts/generate_tpch_raw.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,50 +5,50 @@ set -euo pipefail
trap exit SIGINT

PROJECT_ROOT="$(cd $(dirname "$BASH_SOURCE[0]") && cd .. && pwd)" &> /dev/null
DBGEN_DIR="${PROJECT_ROOT}/submodules/tpch-dbgen/dbgen"
DBGEN=${DBGEN_DIR}/dbgen
PARQUETGEN=${PROJECT_ROOT}/target/release/parquetgen
TPCH_DBGEN_DIR="${PROJECT_ROOT}/submodules/tpch-dbgen/dbgen"
TPCH_DBGEN=${TPCH_DBGEN_DIR}/dbgen
DATAPREP=${PROJECT_ROOT}/target/release/dataprep
SCALE_FACTOR=${1:-0.01}
SCALE_FACTOR_DIR=${SCALE_FACTOR/./_}
TPCH_DIR=${PROJECT_ROOT}/data/tpch
TPCH_SF_OUT=${TPCH_DIR}/${SCALE_FACTOR_DIR}
TPCH_SF_OUT_TBL=${TPCH_SF_OUT}/tbl
TPCH_SF_OUT_PARQUET=${TPCH_SF_OUT}/parquet
TPCH_SF_OUT_DATA=${TPCH_SF_OUT}/parquet

echo "SCALE_FACTOR=${SCALE_FACTOR}"

if [ ! -f ${DBGEN} ]; then
cd ${DBGEN_DIR}
if [ ! -f ${TPCH_DBGEN} ]; then
cd ${TPCH_DBGEN_DIR}
case "$(uname)" in
'Linux');;
'Darwin')
echo "Patch Makefile"
sed -i '.bak' -e "s/LINUX/MACOS/g" ./Makefile
;;
esac
make -C ${DBGEN_DIR} dbgen
make -C ${TPCH_DBGEN_DIR} dbgen
fi
chmod +x ${DBGEN}
echo "DBGEN=${DBGEN}"
chmod +x ${TPCH_DBGEN}
echo "TPCH_DBGEN=${TPCH_DBGEN}"

if [ ! -f ${PARQUETGEN} ]; then
cargo build --manifest-path=${PROJECT_ROOT}/Cargo.toml --release -p parquetgen
if [ ! -f ${DATAPREP} ]; then
cargo build --manifest-path=${PROJECT_ROOT}/Cargo.toml --release -p dataprep
fi
chmod +x ${PARQUETGEN}
echo "PARQUETGEN=${PARQUETGEN}"
chmod +x ${DATAPREP}
echo "DATAPREP=${DATAPREP}"

mkdir -p ${TPCH_SF_OUT_TBL}
TBL_COUNT=$(find ${TPCH_SF_OUT_TBL} -name "*.tbl" | wc -l)
if [[ ${TBL_COUNT} -ne 8 ]]; then
cd ${DBGEN_DIR}
cd ${TPCH_DBGEN_DIR}
DSS_PATH=${TPCH_SF_OUT_TBL} ./dbgen -vf -s ${SCALE_FACTOR}
fi
echo "TPCH_SF_OUT_TBL=${TPCH_SF_OUT_TBL}"

mkdir -p ${TPCH_SF_OUT_PARQUET}
PARQUET_COUNT=$(find ${TPCH_SF_OUT_TBL} -name "*.parquet" | wc -l)
if [[ ${PARQUET_COUNT} -ne 8 ]]; then
rm -rf ${TPCH_SF_OUT_PARQUET}/*.parquet
${PARQUETGEN} tpch -i ${TPCH_SF_OUT_TBL} -o ${TPCH_SF_OUT_PARQUET}
mkdir -p ${TPCH_SF_OUT_DATA}
DATA_COUNT=$(find ${TPCH_SF_OUT_TBL} -name "*.parquet" | wc -l)
if [[ ${DATA_COUNT} -ne 8 ]]; then
rm -rf ${TPCH_SF_OUT_DATA}/*.parquet
${DATAPREP} tpch -i ${TPCH_SF_OUT_TBL} -o ${TPCH_SF_OUT_DATA}
fi
echo "TPCH_SF_OUT_PARQUET=${TPCH_SF_OUT_PARQUET}"
echo "TPCH_SF_OUT_DATA=${TPCH_SF_OUT_DATA}"
10 changes: 5 additions & 5 deletions scripts/generate_uni.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,15 @@ trap exit SIGINT
PROJECT_ROOT="$(cd $(dirname "$BASH_SOURCE[0]") && cd .. && pwd)" &> /dev/null

UNI_DIR=${PROJECT_ROOT}/data/uni
PARQUETGEN=${PROJECT_ROOT}/target/release/parquetgen
DATAPREP=${PROJECT_ROOT}/target/release/dataprep

if [ ! -f ${PARQUETGEN} ]; then
cargo build --manifest-path=${PROJECT_ROOT}/Cargo.toml --release -p parquetgen
if [ ! -f ${DATAPREP} ]; then
cargo build --manifest-path=${PROJECT_ROOT}/Cargo.toml --release -p dataprep
fi
echo "PARQUETGEN=${PARQUETGEN}"
echo "DATAPREP=${DATAPREP}"

if [ ! -f "${UNI_DIR}/studenten.parquet" ]; then
mkdir -p ${UNI_DIR}
${PARQUETGEN} uni -o ${UNI_DIR}
${DATAPREP} uni -o ${UNI_DIR}
fi
echo "UNI_DIR=${UNI_DIR}"
File renamed without changes.
2 changes: 1 addition & 1 deletion tools/parquetgen/Cargo.toml → tools/dataprep/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[package]
name = "parquetgen"
name = "dataprep"
version = "0.1.0"
authors = ["Andre Kohn <kohn.a@outlook.com>"]
edition = "2018"
Expand Down
File renamed without changes.
4 changes: 2 additions & 2 deletions tools/parquetgen/src/main.rs → tools/dataprep/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ mod tpch;
mod uni;

fn main() -> Result<(), Box<dyn std::error::Error>> {
let matches = App::new("Parquet Generator")
let matches = App::new("Data Preparation")
.version("0.1")
.author("Andre Kohn. <kohn.a@outlook.com>")
.subcommand(
Expand All @@ -25,7 +25,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
)
.subcommand(
App::new("tpch")
.about("Generates parquet files for the university schema")
.about("Generates parquet files from the TPCH TBL files")
.arg(
Arg::new("out")
.short('o')
Expand Down
File renamed without changes.
File renamed without changes.

0 comments on commit 996ff7d

Please sign in to comment.