Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ jobs:
"subcommand": "tpch",
"name": "TPC-H SF=1 on S3",
"local_dir": "bench-vortex/data/tpch/1.0",
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/tpch/1.0/",
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/1.0/",
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
"scale_factor": "--scale-factor 1.0",
"build_args": "--features lance"
Expand All @@ -142,7 +142,7 @@ jobs:
"subcommand": "tpch",
"name": "TPC-H SF=10 on S3",
"local_dir": "bench-vortex/data/tpch/10.0",
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/tpch/10.0/",
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/10.0/",
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
"scale_factor": "--scale-factor 10.0",
"build_args": "--features lance"
Expand Down Expand Up @@ -174,7 +174,7 @@ jobs:
"subcommand": "fineweb",
"name": "FineWeb S3",
"local_dir": "bench-vortex/data/fineweb",
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/fineweb/",
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/fineweb/",
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
"scale_factor": "--scale-factor 100"
},
Expand All @@ -190,7 +190,7 @@ jobs:
"subcommand": "gharchive",
"name": "GitHub Archive (S3)",
"local_dir": "bench-vortex/data/gharchive",
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/gharchive/",
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/gharchive/",
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
"scale_factor": "--scale-factor 100"
},
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/nightly-bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:
"subcommand": "tpch",
"name": "TPC-H on S3",
"local_dir": "bench-vortex/data/tpch/10.0",
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/tpch/10.0/",
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/10.0/",
"targets": "datafusion:parquet,datafusion:vortex,datafusion:lance,duckdb:parquet,duckdb:vortex",
"scale_factor": "--scale-factor 10.0",
"build_args": "--features lance"
Expand All @@ -58,7 +58,7 @@ jobs:
"subcommand": "tpch",
"name": "TPC-H on S3",
"local_dir": "bench-vortex/data/tpch/100.0",
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/tpch/100.0/",
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/100.0/",
"targets": "datafusion:parquet,duckdb:parquet,duckdb:vortex",
"scale_factor": "--scale-factor 100.0"
},
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/sql-benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ on:
"subcommand": "tpch",
"name": "TPC-H SF=1 on S3",
"local_dir": "bench-vortex/data/tpch/1.0",
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/tpch/1.0/",
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/1.0/",
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
"scale_factor": "--scale-factor 1.0"
},
Expand All @@ -51,7 +51,7 @@ on:
"subcommand": "tpch",
"name": "TPC-H SF=10 on S3",
"local_dir": "bench-vortex/data/tpch/10.0",
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/tpch/10.0/",
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/10.0/",
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
"scale_factor": "--scale-factor 10.0"
},
Expand Down Expand Up @@ -81,7 +81,7 @@ on:
"subcommand": "fineweb",
"name": "FineWeb S3",
"local_dir": "bench-vortex/data/fineweb",
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/fineweb/",
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/fineweb/",
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
"scale_factor": "--scale-factor 100"
},
Expand All @@ -97,7 +97,7 @@ on:
"subcommand": "gharchive",
"name": "GitHub Archive (S3)",
"local_dir": "bench-vortex/data/gharchive",
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/gharchive/",
"remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/gharchive/",
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
"scale_factor": "--scale-factor 100"
},
Expand Down
10 changes: 2 additions & 8 deletions bench-vortex/src/bin/query_bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -425,10 +425,7 @@ fn run_statpopgen(args: StatPopGenArgs) -> anyhow::Result<()> {
fn run_fineweb(args: FinewebArgs) -> anyhow::Result<()> {
setup_logging_and_tracing(args.common.verbose, args.common.tracing)?;

let data_url = Url::from_directory_path("fineweb".to_data_path())
.map_err(|_| anyhow::anyhow!("bad data path"))?;

let benchmark = Fineweb::new(data_url);
let benchmark = Fineweb::with_remote_data_dir(args.common.use_remote_data_dir)?;

let config = DriverConfig {
targets: args.targets,
Expand Down Expand Up @@ -456,10 +453,7 @@ fn run_fineweb(args: FinewebArgs) -> anyhow::Result<()> {
fn run_gharchive(args: GhArchiveArgs) -> anyhow::Result<()> {
setup_logging_and_tracing(args.common.verbose, args.common.tracing)?;

let data_url = Url::from_directory_path("gharchive".to_data_path())
.map_err(|_| anyhow::anyhow!("bad data path"))?;

let benchmark = GithubArchive::new(data_url);
let benchmark = GithubArchive::with_remote_data_dir(args.common.use_remote_data_dir)?;

let config = DriverConfig {
targets: args.targets,
Expand Down
43 changes: 43 additions & 0 deletions bench-vortex/src/fineweb/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,38 @@ impl Fineweb {
/// Creates a `Fineweb` benchmark whose `data_url` field is the given URL.
///
/// The URL is stored as-is; no validation or normalization happens here.
pub fn new(data_url: Url) -> Self {
Self { data_url }
}

/// Builds a `Fineweb` benchmark from an optional remote data directory.
///
/// The data URL is resolved by `Self::create_data_url`; any error it reports is returned
/// unchanged.
pub fn with_remote_data_dir(use_remote_data_dir: Option<String>) -> anyhow::Result<Self> {
    Self::create_data_url(&use_remote_data_dir).map(|data_url| Self { data_url })
}

fn create_data_url(remote_data_dir: &Option<String>) -> anyhow::Result<Url> {
match remote_data_dir {
None => {
let data_dir = crate::IdempotentPath::to_data_path("fineweb");
Url::from_directory_path(&data_dir).map_err(|_| {
anyhow::anyhow!("Failed to create URL from directory path: {:?}", &data_dir)
})
}
Some(remote_data_dir) => {
if !remote_data_dir.ends_with("/") {
log::warn!(
"Supply a --use-remote-data-dir argument which ends in a slash e.g. s3://vortex-bench-dev-eu/develop/12345/fineweb/"
);
}
log::info!(
concat!(
"Assuming data already exists at this remote (e.g. S3, GCS) URL: {}.\n",
"If it does not, you should kill this command, locally generate the files (by running without\n",
"--use-remote-data-dir) and upload data/fineweb/ to some remote location.",
),
remote_data_dir,
);
Ok(Url::parse(remote_data_dir)?)
}
}
}
}

impl Fineweb {
Expand Down Expand Up @@ -92,6 +124,17 @@ impl Benchmark for Fineweb {
}

fn generate_data(&self, target: &Target) -> anyhow::Result<()> {
// Skip generation if using remote storage
match self.data_url.scheme() {
"file" => {
// Continue with local generation
}
_ => {
// Remote storage - data should already be uploaded
return Ok(());
}
}

// Before downloading anything, make sure we are using a supported target.
anyhow::ensure!(
matches!(
Expand Down
45 changes: 44 additions & 1 deletion bench-vortex/src/realnest/gharchive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,38 @@ impl GithubArchive {
/// Creates a `GithubArchive` benchmark whose `data_url` field is the given URL.
///
/// The URL is stored as-is; no validation or normalization happens here.
pub fn new(data_url: Url) -> Self {
Self { data_url }
}

/// Builds a `GithubArchive` benchmark from an optional remote data directory.
///
/// The data URL is resolved by `Self::create_data_url`; any error it reports is returned
/// unchanged.
pub fn with_remote_data_dir(use_remote_data_dir: Option<String>) -> anyhow::Result<Self> {
    Self::create_data_url(&use_remote_data_dir).map(|data_url| Self { data_url })
}

fn create_data_url(remote_data_dir: &Option<String>) -> anyhow::Result<Url> {
match remote_data_dir {
None => {
let data_dir = crate::IdempotentPath::to_data_path("gharchive");
Url::from_directory_path(&data_dir).map_err(|_| {
anyhow::anyhow!("Failed to create URL from directory path: {:?}", &data_dir)
})
}
Some(remote_data_dir) => {
if !remote_data_dir.ends_with("/") {
log::warn!(
"Supply a --use-remote-data-dir argument which ends in a slash e.g. s3://vortex-bench-dev-eu/develop/12345/gharchive/"
);
}
log::info!(
concat!(
"Assuming data already exists at this remote (e.g. S3, GCS) URL: {}.\n",
"If it does not, you should kill this command, locally generate the files (by running without\n",
"--use-remote-data-dir) and upload data/gharchive/ to some remote location.",
),
remote_data_dir,
);
Ok(Url::parse(remote_data_dir)?)
}
}
}
}

impl GithubArchive {
Expand Down Expand Up @@ -95,13 +127,24 @@ impl Benchmark for GithubArchive {
}

fn generate_data(&self, target: &Target) -> anyhow::Result<()> {
// Skip generation if using remote storage
match self.data_url.scheme() {
"file" => {
// Continue with local generation
}
_ => {
// Remote storage - data should already be uploaded
return Ok(());
}
}

// Before downloading anything, make sure we are using a supported target.
anyhow::ensure!(
matches!(
target.format,
Format::Parquet | Format::OnDiskVortex | Format::VortexCompact
),
"unsupported format for `fineweb` bench: {}",
"unsupported format for `gharchive` bench: {}",
target.format
);

Expand Down
Loading