diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 50152a8f722..5ec983f51f7 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -124,7 +124,7 @@ jobs: "subcommand": "tpch", "name": "TPC-H SF=1 on S3", "local_dir": "bench-vortex/data/tpch/1.0", - "remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/tpch/1.0/", + "remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/1.0/", "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact", "scale_factor": "--scale-factor 1.0", "build_args": "--features lance" @@ -142,7 +142,7 @@ jobs: "subcommand": "tpch", "name": "TPC-H SF=10 on S3", "local_dir": "bench-vortex/data/tpch/10.0", - "remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/tpch/10.0/", + "remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/10.0/", "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact", "scale_factor": "--scale-factor 10.0", "build_args": "--features lance" @@ -174,7 +174,7 @@ jobs: "subcommand": "fineweb", "name": "FineWeb S3", "local_dir": "bench-vortex/data/fineweb", - "remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/fineweb/", + "remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/fineweb/", "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact", "scale_factor": "--scale-factor 100" }, @@ -190,7 +190,7 @@ jobs: "subcommand": "gharchive", "name": "GitHub Archive (S3)", "local_dir": "bench-vortex/data/gharchive", - "remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/gharchive/", + "remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/gharchive/", "targets": 
"datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact", "scale_factor": "--scale-factor 100" }, diff --git a/.github/workflows/nightly-bench.yml b/.github/workflows/nightly-bench.yml index 379f3a999ad..b58bf0a92ad 100644 --- a/.github/workflows/nightly-bench.yml +++ b/.github/workflows/nightly-bench.yml @@ -41,7 +41,7 @@ jobs: "subcommand": "tpch", "name": "TPC-H on S3", "local_dir": "bench-vortex/data/tpch/10.0", - "remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/tpch/10.0/", + "remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/10.0/", "targets": "datafusion:parquet,datafusion:vortex,datafusion:lance,duckdb:parquet,duckdb:vortex", "scale_factor": "--scale-factor 10.0", "build_args": "--features lance" @@ -58,7 +58,7 @@ jobs: "subcommand": "tpch", "name": "TPC-H on S3", "local_dir": "bench-vortex/data/tpch/100.0", - "remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/tpch/100.0/", + "remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/100.0/", "targets": "datafusion:parquet,duckdb:parquet,duckdb:vortex", "scale_factor": "--scale-factor 100.0" }, diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml index e79077e80ae..536034e11a5 100644 --- a/.github/workflows/sql-benchmarks.yml +++ b/.github/workflows/sql-benchmarks.yml @@ -35,7 +35,7 @@ on: "subcommand": "tpch", "name": "TPC-H SF=1 on S3", "local_dir": "bench-vortex/data/tpch/1.0", - "remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/tpch/1.0/", + "remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/1.0/", "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact", "scale_factor": "--scale-factor 1.0" }, @@ -51,7 +51,7 @@ on: "subcommand": "tpch", "name": "TPC-H SF=10 on S3", "local_dir": 
"bench-vortex/data/tpch/10.0", - "remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/tpch/10.0/", + "remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/10.0/", "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact", "scale_factor": "--scale-factor 10.0" }, @@ -81,7 +81,7 @@ on: "subcommand": "fineweb", "name": "FineWeb S3", "local_dir": "bench-vortex/data/fineweb", - "remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/fineweb/", + "remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/fineweb/", "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact", "scale_factor": "--scale-factor 100" }, @@ -97,7 +97,7 @@ on: "subcommand": "gharchive", "name": "GitHub Archive (S3)", "local_dir": "bench-vortex/data/gharchive", - "remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/gharchive/", + "remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/gharchive/", "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact", "scale_factor": "--scale-factor 100" }, diff --git a/bench-vortex/src/bin/query_bench.rs b/bench-vortex/src/bin/query_bench.rs index 378a1fdad79..6a41301478f 100644 --- a/bench-vortex/src/bin/query_bench.rs +++ b/bench-vortex/src/bin/query_bench.rs @@ -425,10 +425,7 @@ fn run_statpopgen(args: StatPopGenArgs) -> anyhow::Result<()> { fn run_fineweb(args: FinewebArgs) -> anyhow::Result<()> { setup_logging_and_tracing(args.common.verbose, args.common.tracing)?; - let data_url = Url::from_directory_path("fineweb".to_data_path()) - .map_err(|_| anyhow::anyhow!("bad data path"))?; - - let benchmark = Fineweb::new(data_url); + let benchmark = Fineweb::with_remote_data_dir(args.common.use_remote_data_dir)?; let config = DriverConfig { 
targets: args.targets, @@ -456,10 +453,7 @@ fn run_fineweb(args: FinewebArgs) -> anyhow::Result<()> { fn run_gharchive(args: GhArchiveArgs) -> anyhow::Result<()> { setup_logging_and_tracing(args.common.verbose, args.common.tracing)?; - let data_url = Url::from_directory_path("gharchive".to_data_path()) - .map_err(|_| anyhow::anyhow!("bad data path"))?; - - let benchmark = GithubArchive::new(data_url); + let benchmark = GithubArchive::with_remote_data_dir(args.common.use_remote_data_dir)?; let config = DriverConfig { targets: args.targets, diff --git a/bench-vortex/src/fineweb/mod.rs b/bench-vortex/src/fineweb/mod.rs index ec8b58975dd..9c5039c4cff 100644 --- a/bench-vortex/src/fineweb/mod.rs +++ b/bench-vortex/src/fineweb/mod.rs @@ -58,6 +58,38 @@ impl Fineweb { pub fn new(data_url: Url) -> Self { Self { data_url } } + + pub fn with_remote_data_dir(use_remote_data_dir: Option<String>) -> anyhow::Result<Self> { + let data_url = Self::create_data_url(&use_remote_data_dir)?; + Ok(Self { data_url }) + } + + fn create_data_url(remote_data_dir: &Option<String>) -> anyhow::Result<Url> { + match remote_data_dir { + None => { + let data_dir = crate::IdempotentPath::to_data_path("fineweb"); + Url::from_directory_path(&data_dir).map_err(|_| { + anyhow::anyhow!("Failed to create URL from directory path: {:?}", &data_dir) + }) + } + Some(remote_data_dir) => { + if !remote_data_dir.ends_with("/") { + log::warn!( + "Supply a --use-remote-data-dir argument which ends in a slash e.g. s3://vortex-bench-dev-eu/develop/12345/fineweb/" + ); + } + log::info!( + concat!( + "Assuming data already exists at this remote (e.g. S3, GCS) URL: {}.\n", + "If it does not, you should kill this command, locally generate the files (by running without\n", + "--use-remote-data-dir) and upload data/fineweb/ to some remote location.", + ), + remote_data_dir, + ); + Ok(Url::parse(remote_data_dir)?) 
+ } + } + } } impl Fineweb { @@ -92,6 +124,17 @@ impl Benchmark for Fineweb { } fn generate_data(&self, target: &Target) -> anyhow::Result<()> { + // Skip generation if using remote storage + match self.data_url.scheme() { + "file" => { + // Continue with local generation + } + _ => { + // Remote storage - data should already be uploaded + return Ok(()); + } + } + // Before downloading anything, make sure we are using a supported target. anyhow::ensure!( matches!( diff --git a/bench-vortex/src/realnest/gharchive.rs b/bench-vortex/src/realnest/gharchive.rs index ad6f9233f63..59ad3d7072c 100644 --- a/bench-vortex/src/realnest/gharchive.rs +++ b/bench-vortex/src/realnest/gharchive.rs @@ -49,6 +49,38 @@ impl GithubArchive { pub fn new(data_url: Url) -> Self { Self { data_url } } + + pub fn with_remote_data_dir(use_remote_data_dir: Option<String>) -> anyhow::Result<Self> { + let data_url = Self::create_data_url(&use_remote_data_dir)?; + Ok(Self { data_url }) + } + + fn create_data_url(remote_data_dir: &Option<String>) -> anyhow::Result<Url> { + match remote_data_dir { + None => { + let data_dir = crate::IdempotentPath::to_data_path("gharchive"); + Url::from_directory_path(&data_dir).map_err(|_| { + anyhow::anyhow!("Failed to create URL from directory path: {:?}", &data_dir) + }) + } + Some(remote_data_dir) => { + if !remote_data_dir.ends_with("/") { + log::warn!( + "Supply a --use-remote-data-dir argument which ends in a slash e.g. s3://vortex-bench-dev-eu/develop/12345/gharchive/" + ); + } + log::info!( + concat!( + "Assuming data already exists at this remote (e.g. S3, GCS) URL: {}.\n", + "If it does not, you should kill this command, locally generate the files (by running without\n", + "--use-remote-data-dir) and upload data/gharchive/ to some remote location.", + ), + remote_data_dir, + ); + Ok(Url::parse(remote_data_dir)?) 
+ } + } + } } impl GithubArchive { @@ -95,13 +127,24 @@ impl Benchmark for GithubArchive { } fn generate_data(&self, target: &Target) -> anyhow::Result<()> { + // Skip generation if using remote storage + match self.data_url.scheme() { + "file" => { + // Continue with local generation + } + _ => { + // Remote storage - data should already be uploaded + return Ok(()); + } + } + // Before downloading anything, make sure we are using a supported target. anyhow::ensure!( matches!( target.format, Format::Parquet | Format::OnDiskVortex | Format::VortexCompact ), - "unsupported format for `fineweb` bench: {}", + "unsupported format for `gharchive` bench: {}", target.format );