diff --git a/README.md b/README.md
index a9103f0f9..a17fae599 100644
--- a/README.md
+++ b/README.md
@@ -220,9 +220,9 @@ DROP DATABASE my_pg;
 | Newline Delimited JSON | ✅ | ✅\* | ✅ | ✅ | ➖ |
 | Apache Parquet | ✅ | ✅\* | ✅ | ✅ | ➖ |
 | BSON | ✅ | ✅\* | ✅ | ✅ | ➖ |
+| Lance | ✅ | ✅\* | ✅ | ✅ | ➖ |
 | Delta | ✅ | 🚧 | ✅ | ✅ | ➖ |
 | Iceberg | ✅ | 🚧 | ✅ | ✅ | ➖ |
-| Lance | ✅ | 🚧 | ✅ | ✅ | ➖ |
 | Microsoft Excel | ✅ | 🚧 | ✅ | 🚧 | ➖ |
 | JSON | 🚧 | 🚧 | 🚧 | 🚧 | ➖ |
 | Apache Avro | 🚧 | 🚧 | 🚧 | 🚧 | ➖ |
diff --git a/crates/datasources/src/common/sink/lance.rs b/crates/datasources/src/common/sink/lance.rs
new file mode 100644
index 000000000..e10bca87b
--- /dev/null
+++ b/crates/datasources/src/common/sink/lance.rs
@@ -0,0 +1,126 @@
+use std::fmt;
+use std::sync::Arc;
+
+use async_trait::async_trait;
+use datafusion::arrow::record_batch::RecordBatchIterator;
+use datafusion::common::Result as DfResult;
+use datafusion::error::DataFusionError;
+use datafusion::execution::TaskContext;
+use datafusion::physical_plan::insert::DataSink;
+use datafusion::physical_plan::DisplayAs;
+use datafusion::physical_plan::{DisplayFormatType, SendableRecordBatchStream};
+use futures::StreamExt;
+use lance::dataset::WriteMode;
+use lance::Dataset;
+use object_store::{path::Path as ObjectPath, ObjectStore};
+
+pub type LanceWriteParams = lance::dataset::WriteParams;
+
+#[derive(Debug, Clone)]
+pub struct LanceSinkOpts {
+    pub url: Option<url::Url>,
+    pub max_rows_per_file: usize,
+    pub max_rows_per_group: usize,
+    pub max_bytes_per_file: usize,
+    pub input_batch_size: usize,
+}
+
+/// Writes lance files to object storage.
+#[derive(Debug, Clone)]
+pub struct LanceSink {
+    store: Arc<dyn ObjectStore>,
+    loc: ObjectPath,
+    opts: LanceSinkOpts,
+}
+
+impl fmt::Display for LanceSink {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "LanceSink({}:{})", self.store, self.loc)
+    }
+}
+
+impl DisplayAs for LanceSink {
+    fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
+        match t {
+            DisplayFormatType::Default => write!(f, "{self}"),
+            DisplayFormatType::Verbose => write!(f, "{self}"),
+        }
+    }
+}
+
+impl LanceSink {
+    pub fn from_obj_store(
+        store: Arc<dyn ObjectStore>,
+        loc: impl Into<ObjectPath>,
+        opts: LanceSinkOpts,
+    ) -> Self {
+        LanceSink {
+            store,
+            loc: loc.into(),
+            opts,
+        }
+    }
+
+    async fn stream_into_inner(
+        &self,
+        stream: SendableRecordBatchStream,
+        mut ds: Option<Dataset>,
+    ) -> DfResult<Option<Dataset>> {
+        let table = match self.opts.url.clone() {
+            Some(opts_url) => opts_url.join(self.loc.as_ref()),
+            None => url::Url::parse(self.loc.as_ref()),
+        }
+        .map_err(|e| DataFusionError::External(Box::new(e)))?;
+
+        let schema = stream.schema().clone();
+        let mut chunks = stream.chunks(32);
+        let write_opts = LanceWriteParams {
+            mode: WriteMode::Overwrite,
+            ..Default::default()
+        };
+
+        while let Some(batches) = chunks.next().await {
+            let batch_iter =
+                RecordBatchIterator::new(batches.into_iter().map(|item| Ok(item?)), schema.clone());
+
+            match ds.clone() {
+                Some(mut d) => {
+                    d.append(batch_iter, Some(write_opts.clone())).await?;
+                }
+                None => {
+                    ds.replace(
+                        Dataset::write(batch_iter, table.as_str(), Some(write_opts.clone()))
+                            .await?,
+                    );
+                }
+            }
+        }
+
+        Ok(ds)
+    }
+}
+
+#[async_trait]
+impl DataSink for LanceSink {
+    // the dataset is the handle to the lance database.
+    //
+    // There's no way to construct an empty dataset except by writing
+    // to it, so we thread an Option<Dataset> through this method: if
+    // it's None we create the dataset on the first write, otherwise we
+    // append to the dataset created by the previous call, and we
+    // return it so the next call can pick it up.
+    async fn write_all(
+        &self,
+        data: Vec<SendableRecordBatchStream>,
+        _context: &Arc<TaskContext>,
+    ) -> DfResult<u64> {
+        let mut ds: Option<Dataset> = None;
+        for stream in data {
+            ds = self.stream_into_inner(stream, ds).await?;
+        }
+        match ds {
+            Some(ds) => Ok(ds.count_rows().await? as u64),
+            None => Ok(0),
+        }
+    }
+}
diff --git a/crates/datasources/src/common/sink/mod.rs b/crates/datasources/src/common/sink/mod.rs
index 3dfa87200..9ee08909d 100644
--- a/crates/datasources/src/common/sink/mod.rs
+++ b/crates/datasources/src/common/sink/mod.rs
@@ -1,6 +1,7 @@
 pub mod bson;
 pub mod csv;
 pub mod json;
+pub mod lance;
 pub mod parquet;
 
 use std::io::{self, Write};
diff --git a/crates/datasources/src/lance/mod.rs b/crates/datasources/src/lance/mod.rs
index 09ef19923..a204b73fd 100644
--- a/crates/datasources/src/lance/mod.rs
+++ b/crates/datasources/src/lance/mod.rs
@@ -3,9 +3,8 @@ use lance::{dataset::builder::DatasetBuilder, Dataset};
 use protogen::metastore::types::options::StorageOptions;
 
 pub async fn scan_lance_table(location: &str, options: StorageOptions) -> Result<Dataset> {
-    DatasetBuilder::from_uri(location)
+    Ok(DatasetBuilder::from_uri(location)
         .with_storage_options(options.inner.into_iter().collect())
         .load()
-        .await
-        .map_err(|e| e.into())
+        .await?)
 }
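The sink and scan halves above are enough for an end-to-end round trip. As a usage sketch (not part of the patch): the `COPY ... FORMAT lance` statement and the `lance_scan` table function are exercised by the tests later in this diff, while the connection parameters here are assumptions about a locally running GlareDB.

```python
# Round-trip sketch against a locally running GlareDB; host/port/dbname
# are assumptions, the SQL mirrors the slt tests added later in this diff.
import psycopg2

conn = psycopg2.connect(host="localhost", port=6543, dbname="glaredb")
conn.autocommit = True

with conn.cursor() as curr:
    curr.execute("create temp table t (amount int)")
    curr.execute("insert into t values (1), (2), (3)")
    # LanceSink streams the result batches out as a Lance dataset directory.
    curr.execute("COPY t TO '/tmp/lance_demo' FORMAT lance")
    # scan_lance_table reads the dataset back through lance_scan.
    curr.execute("select count(*) from lance_scan('/tmp/lance_demo')")
    assert curr.fetchone()[0] == 3
```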
diff --git a/crates/protogen/src/metastore/types/options.rs b/crates/protogen/src/metastore/types/options.rs
index 0ee7907a3..d34426d2b 100644
--- a/crates/protogen/src/metastore/types/options.rs
+++ b/crates/protogen/src/metastore/types/options.rs
@@ -1536,6 +1536,7 @@ pub struct CopyToDestinationOptionsAzure {
 pub enum CopyToFormatOptions {
     Csv(CopyToFormatOptionsCsv),
     Parquet(CopyToFormatOptionsParquet),
+    Lance(CopyToFormatOptionsLance),
     Json(CopyToFormatOptionsJson),
     Bson,
 }
@@ -1554,6 +1555,7 @@ impl CopyToFormatOptions {
     pub const PARQUET: &'static str = "parquet";
     pub const JSON: &'static str = "json";
     pub const BSON: &'static str = "bson";
+    pub const LANCE: &'static str = "lance";
 
     pub fn as_str(&self) -> &'static str {
         match self {
@@ -1561,6 +1563,7 @@ impl CopyToFormatOptions {
             Self::Parquet(_) => Self::PARQUET,
             Self::Json(_) => Self::JSON,
             Self::Bson => Self::BSON,
+            Self::Lance(_) => Self::LANCE,
         }
     }
 }
@@ -1580,3 +1583,11 @@ pub struct CopyToFormatOptionsParquet {
 pub struct CopyToFormatOptionsJson {
     pub array: bool,
 }
+
+#[derive(Clone, Debug, Hash, PartialEq, Eq)]
+pub struct CopyToFormatOptionsLance {
+    pub max_rows_per_file: Option<usize>,
+    pub max_rows_per_group: Option<usize>,
+    pub max_bytes_per_file: Option<usize>,
+    pub input_batch_size: Option<usize>,
+}
diff --git a/crates/protogen/src/sqlexec/copy_to.rs b/crates/protogen/src/sqlexec/copy_to.rs
index afb37757b..64acff794 100644
--- a/crates/protogen/src/sqlexec/copy_to.rs
+++ b/crates/protogen/src/sqlexec/copy_to.rs
@@ -74,6 +74,8 @@ pub enum CopyToFormatOptionsEnum {
     Json(CopyToFormatOptionsJson),
     #[prost(message, tag = "3")]
     Parquet(CopyToFormatOptionsParquet),
+    #[prost(message, tag = "4")]
+    Lance(CopyToFormatOptionsLance),
 }
 
 #[derive(Clone, PartialEq, Message)]
@@ -96,6 +98,21 @@ pub struct CopyToFormatOptionsParquet {
     pub row_group_size: u64,
 }
 
+#[derive(Clone, PartialEq, Message)]
+pub struct CopyToFormatOptionsLance {
+    #[prost(uint64, optional, tag = "1")]
+    pub max_rows_per_file: Option<u64>,
+    #[prost(uint64, optional, tag = "2")]
+    pub max_rows_per_group: Option<u64>,
+    #[prost(uint64, optional, tag = "3")]
+    pub max_bytes_per_file: Option<u64>,
+    #[prost(uint64, optional, tag = "4")]
+    pub input_batch_size: Option<u64>,
+}
+
+#[derive(Clone, PartialEq, Message)]
+pub struct CopyToFormatOptionsBson {}
+
 impl TryFrom<crate::metastore::types::options::CopyToFormatOptions> for CopyToFormatOptions {
     type Error = crate::errors::ProtoConvError;
     fn try_from(
@@ -105,6 +122,18 @@ impl TryFrom<crate::metastore::types::options::CopyToFormatOptions> for CopyToFo
             crate::metastore::types::options::CopyToFormatOptions::Bson => {
                 Ok(CopyToFormatOptions::default())
             }
+            crate::metastore::types::options::CopyToFormatOptions::Lance(opts) => {
+                Ok(CopyToFormatOptions {
+                    copy_to_format_options_enum: Some(CopyToFormatOptionsEnum::Lance(
+                        CopyToFormatOptionsLance {
+                            max_rows_per_file: opts.max_rows_per_file.map(|v| v as u64),
+                            max_rows_per_group: opts.max_rows_per_group.map(|v| v as u64),
+                            max_bytes_per_file: opts.max_bytes_per_file.map(|v| v as u64),
+                            input_batch_size: opts.input_batch_size.map(|v| v as u64),
+                        },
+                    )),
+                })
+            }
             crate::metastore::types::options::CopyToFormatOptions::Csv(csv) => {
                 Ok(CopyToFormatOptions {
                     copy_to_format_options_enum: Some(CopyToFormatOptionsEnum::Csv(
@@ -154,12 +183,21 @@ impl TryFrom<CopyToFormatOptions> for crate::metastore::types::options::CopyToFo
                 },
             ))
         }
+        CopyToFormatOptionsEnum::Lance(lance) => Ok(
+            crate::metastore::types::options::CopyToFormatOptions::Lance(
+                crate::metastore::types::options::CopyToFormatOptionsLance {
+                    max_rows_per_file: lance.max_rows_per_file.map(|v| v as usize),
+                    max_rows_per_group: lance.max_rows_per_group.map(|v| v as usize),
+                    max_bytes_per_file: lance.max_bytes_per_file.map(|v| v as usize),
+                    input_batch_size: lance.input_batch_size.map(|v| v as usize),
+                },
+            ),
+        ),
         CopyToFormatOptionsEnum::Json(json) => {
             Ok(crate::metastore::types::options::CopyToFormatOptions::Json(
                 crate::metastore::types::options::CopyToFormatOptionsJson { array: json.array },
             ))
         }
-
         CopyToFormatOptionsEnum::Parquet(parquet) => Ok(
             crate::metastore::types::options::CopyToFormatOptions::Parquet(
                 crate::metastore::types::options::CopyToFormatOptionsParquet {
diff --git a/crates/sqlexec/src/parser/options.rs b/crates/sqlexec/src/parser/options.rs
index 80d035192..fb66d194c 100644
--- a/crates/sqlexec/src/parser/options.rs
+++ b/crates/sqlexec/src/parser/options.rs
@@ -58,6 +58,17 @@ impl ParseOptionValue for OptionValue {
     }
 }
 
+impl ParseOptionValue<Vec<String>> for OptionValue {
+    fn parse_opt(self) -> Result<Vec<String>, ParserError> {
+        match self {
+            Self::QuotedLiteral(s) | Self::UnquotedLiteral(s) => {
+                Ok(s.split(',').map(|s| s.to_string()).collect())
+            }
+            o => Err(unexpected_type_err!("string slice", o)),
+        }
+    }
+}
+
 impl ParseOptionValue for OptionValue {
     fn parse_opt(self) -> Result {
         let opt = match self {
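Note the conversion back from proto now maps each field to its own counterpart; the four optional fields round-trip metastore → protobuf → metastore, and the planner (further down in this diff) fills them from the COPY TO options map. A hedged sketch of passing them through SQL — the parenthesized options clause after FORMAT is an assumption about the COPY TO grammar, and anything left unset falls back to lance's `WriteParams` defaults in the sink:

```python
# Hypothetical options-passing sketch; the option names come from
# CopyToFormatOptionsLance, but the clause placement is an assumption.
import psycopg2

conn = psycopg2.connect(host="localhost", port=6543, dbname="glaredb")
with conn.cursor() as curr:
    curr.execute(
        "COPY t TO '/tmp/lance_tuned' FORMAT lance "
        "(max_rows_per_file '100000', max_rows_per_group '1024')"
    )
```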
diff --git a/crates/sqlexec/src/planner/physical_plan/copy_to.rs b/crates/sqlexec/src/planner/physical_plan/copy_to.rs
index 19ad972d7..1bef8f63e 100644
--- a/crates/sqlexec/src/planner/physical_plan/copy_to.rs
+++ b/crates/sqlexec/src/planner/physical_plan/copy_to.rs
@@ -13,6 +13,7 @@ use datafusion_ext::metrics::WriteOnlyDataSourceMetricsExecAdapter;
 use datasources::common::sink::bson::BsonSink;
 use datasources::common::sink::csv::{CsvSink, CsvSinkOpts};
 use datasources::common::sink::json::{JsonSink, JsonSinkOpts};
+use datasources::common::sink::lance::{LanceSink, LanceSinkOpts, LanceWriteParams};
 use datasources::common::sink::parquet::{ParquetSink, ParquetSinkOpts};
 use datasources::common::url::DatasourceUrl;
 use datasources::object_store::gcs::GcsStoreAccess;
@@ -106,6 +107,13 @@ impl CopyToExec {
     async fn copy_to(self, context: Arc<TaskContext>) -> DataFusionResult {
         let sink = match (self.dest, self.format) {
+            (CopyToDestinationOptions::Local(local_options), CopyToFormatOptions::Lance(opts)) => {
+                get_sink_for_obj(
+                    CopyToFormatOptions::Lance(opts),
+                    &LocalStoreAccess {},
+                    &local_options.location,
+                )?
+            }
             (CopyToDestinationOptions::Local(local_options), format) => {
                 {
                     // Create the path if it doesn't exist (for local).
@@ -177,6 +185,7 @@ fn get_sink_for_obj(
     let store = access
         .create_store()
         .map_err(|e| DataFusionError::External(Box::new(e)))?;
+
     let path = access
         .path(location)
         .map_err(|e| DataFusionError::External(Box::new(e)))?;
@@ -197,6 +206,29 @@ fn get_sink_for_obj(
                 row_group_size: parquet_opts.row_group_size,
             },
         )),
+        CopyToFormatOptions::Lance(opts) => {
+            let wp = LanceWriteParams::default();
+
+            Box::new(LanceSink::from_obj_store(
+                store,
+                path,
+                LanceSinkOpts {
+                    url: Some(
+                        url::Url::parse(
+                            access
+                                .base_url()
+                                .map_err(|e| DataFusionError::External(Box::new(e)))?
+                                .as_str(),
+                        )
+                        .map_err(|e| DataFusionError::External(Box::new(e)))?,
+                    ),
+                    max_rows_per_file: opts.max_rows_per_file.unwrap_or(wp.max_rows_per_file),
+                    max_rows_per_group: opts.max_rows_per_group.unwrap_or(wp.max_rows_per_group),
+                    max_bytes_per_file: opts.max_bytes_per_file.unwrap_or(wp.max_bytes_per_file),
+                    input_batch_size: opts.input_batch_size.unwrap_or(64),
+                },
+            ))
+        }
         CopyToFormatOptions::Json(json_opts) => Box::new(JsonSink::from_obj_store(
             store,
             path,
diff --git a/crates/sqlexec/src/planner/session_planner.rs b/crates/sqlexec/src/planner/session_planner.rs
index 4f73e8d0b..7a827f7d1 100644
--- a/crates/sqlexec/src/planner/session_planner.rs
+++ b/crates/sqlexec/src/planner/session_planner.rs
@@ -41,17 +41,17 @@ use protogen::metastore::types::catalog::{
 use protogen::metastore::types::options::{
     CopyToDestinationOptions, CopyToDestinationOptionsAzure, CopyToDestinationOptionsGcs,
     CopyToDestinationOptionsLocal, CopyToDestinationOptionsS3, CopyToFormatOptions,
-    CopyToFormatOptionsCsv, CopyToFormatOptionsJson, CopyToFormatOptionsParquet,
-    CredentialsOptions, CredentialsOptionsAws, CredentialsOptionsAzure, CredentialsOptionsDebug,
-    CredentialsOptionsGcp, DatabaseOptions, DatabaseOptionsBigQuery, DatabaseOptionsCassandra,
-    DatabaseOptionsClickhouse, DatabaseOptionsDebug, DatabaseOptionsDeltaLake,
-    DatabaseOptionsMongoDb, DatabaseOptionsMysql, DatabaseOptionsPostgres,
-    DatabaseOptionsSnowflake, DatabaseOptionsSqlServer, DeltaLakeCatalog, DeltaLakeUnityCatalog,
-    StorageOptions, TableOptions, TableOptionsBigQuery, TableOptionsCassandra,
-    TableOptionsClickhouse, TableOptionsDebug, TableOptionsGcs, TableOptionsLocal,
-    TableOptionsMongoDb, TableOptionsMysql, TableOptionsObjectStore, TableOptionsPostgres,
-    TableOptionsS3, TableOptionsSnowflake, TableOptionsSqlServer, TunnelOptions,
-    TunnelOptionsDebug, TunnelOptionsInternal, TunnelOptionsSsh,
+    CopyToFormatOptionsCsv, CopyToFormatOptionsJson, CopyToFormatOptionsLance,
+    CopyToFormatOptionsParquet, CredentialsOptions, CredentialsOptionsAws, CredentialsOptionsAzure,
+    CredentialsOptionsDebug, CredentialsOptionsGcp, DatabaseOptions, DatabaseOptionsBigQuery,
+    DatabaseOptionsCassandra, DatabaseOptionsClickhouse, DatabaseOptionsDebug,
+    DatabaseOptionsDeltaLake, DatabaseOptionsMongoDb, DatabaseOptionsMysql,
+    DatabaseOptionsPostgres, DatabaseOptionsSnowflake, DatabaseOptionsSqlServer, DeltaLakeCatalog,
+    DeltaLakeUnityCatalog, StorageOptions, TableOptions, TableOptionsBigQuery,
+    TableOptionsCassandra, TableOptionsClickhouse, 
TableOptionsDebug, TableOptionsGcs, + TableOptionsLocal, TableOptionsMongoDb, TableOptionsMysql, TableOptionsObjectStore, + TableOptionsPostgres, TableOptionsS3, TableOptionsSnowflake, TableOptionsSqlServer, + TunnelOptions, TunnelOptionsDebug, TunnelOptionsInternal, TunnelOptionsSsh, }; use protogen::metastore::types::service::{AlterDatabaseOperation, AlterTableOperation}; use sqlbuiltins::builtins::{CURRENT_SESSION_SCHEMA, DEFAULT_CATALOG}; @@ -1790,6 +1790,14 @@ impl<'a> SessionPlanner<'a> { CopyToFormatOptions::Json(CopyToFormatOptionsJson { array }) } Some(CopyToFormatOptions::BSON) => CopyToFormatOptions::Bson {}, + Some(CopyToFormatOptions::LANCE) => { + CopyToFormatOptions::Lance(CopyToFormatOptionsLance { + max_rows_per_file: m.remove_optional("max_rows_per_file")?, + max_rows_per_group: m.remove_optional("max_rows_per_group")?, + max_bytes_per_file: m.remove_optional("max_bytes_per_file")?, + input_batch_size: m.remove_optional("input_batch_size")?, + }) + } Some(other) => return Err(internal!("unsupported output format: {other}")), }; diff --git a/testdata/sqllogictests_object_store/local/lance.slt b/testdata/sqllogictests_object_store/local/lance.slt index 130f0363e..7146fe3bb 100644 --- a/testdata/sqllogictests_object_store/local/lance.slt +++ b/testdata/sqllogictests_object_store/local/lance.slt @@ -12,10 +12,27 @@ select * from lance_tbl order by point.lat; 0.2,1.8 {lat:42.1,long:-74.1} 1.1,1.2 {lat:45.5,long:-122.7} - statement error create external table lance_tbl from lance options ( location 'file://${PWD}/testdata/lance/not-a-real-table/' -); \ No newline at end of file +); + +statement ok +copy (select * from lance_tbl) to 'file://${TMP}' format lance; + +query IT +select * from lance_scan('file://${TMP}') order by point.lat; +---- +0.2,1.8 {lat:42.1,long:-74.1} +1.1,1.2 {lat:45.5,long:-122.7} + +statement ok +copy (select * from lance_tbl) to '${TMP}' format lance; + +query IT +select * from lance_scan('${TMP}') order by point.lat; +---- +0.2,1.8 {lat:42.1,long:-74.1} +1.1,1.2 {lat:45.5,long:-122.7} diff --git a/tests/fixtures/glaredb.py b/tests/fixtures/glaredb.py index f0e5dadd3..b9cdfbc8c 100644 --- a/tests/fixtures/glaredb.py +++ b/tests/fixtures/glaredb.py @@ -40,6 +40,7 @@ def glaredb_connection( ], cwd=tmp_path_factory.mktemp("cwd").absolute(), close_fds="posix" in sys.builtin_module_names, + env={"RUST_BACKTRACE": "1"}, ) as p: time.sleep(0.5) assert not p.poll(), p.stdout.read().decode("utf-8") diff --git a/tests/poetry.lock b/tests/poetry.lock index 4657a5a35..6d2a0498a 100644 --- a/tests/poetry.lock +++ b/tests/poetry.lock @@ -41,6 +41,51 @@ files = [ {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, ] +[[package]] +name = "numpy" +version = "1.26.3" +description = "Fundamental package for array computing in Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "numpy-1.26.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:806dd64230dbbfaca8a27faa64e2f414bf1c6622ab78cc4264f7f5f028fee3bf"}, + {file = "numpy-1.26.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02f98011ba4ab17f46f80f7f8f1c291ee7d855fcef0a5a98db80767a468c85cd"}, + {file = "numpy-1.26.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6d45b3ec2faed4baca41c76617fcdcfa4f684ff7a151ce6fc78ad3b6e85af0a6"}, + {file = "numpy-1.26.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bdd2b45bf079d9ad90377048e2747a0c82351989a2165821f0c96831b4a2a54b"}, + {file 
= "numpy-1.26.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:211ddd1e94817ed2d175b60b6374120244a4dd2287f4ece45d49228b4d529178"}, + {file = "numpy-1.26.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b1240f767f69d7c4c8a29adde2310b871153df9b26b5cb2b54a561ac85146485"}, + {file = "numpy-1.26.3-cp310-cp310-win32.whl", hash = "sha256:21a9484e75ad018974a2fdaa216524d64ed4212e418e0a551a2d83403b0531d3"}, + {file = "numpy-1.26.3-cp310-cp310-win_amd64.whl", hash = "sha256:9e1591f6ae98bcfac2a4bbf9221c0b92ab49762228f38287f6eeb5f3f55905ce"}, + {file = "numpy-1.26.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b831295e5472954104ecb46cd98c08b98b49c69fdb7040483aff799a755a7374"}, + {file = "numpy-1.26.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9e87562b91f68dd8b1c39149d0323b42e0082db7ddb8e934ab4c292094d575d6"}, + {file = "numpy-1.26.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c66d6fec467e8c0f975818c1796d25c53521124b7cfb760114be0abad53a0a2"}, + {file = "numpy-1.26.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f25e2811a9c932e43943a2615e65fc487a0b6b49218899e62e426e7f0a57eeda"}, + {file = "numpy-1.26.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:af36e0aa45e25c9f57bf684b1175e59ea05d9a7d3e8e87b7ae1a1da246f2767e"}, + {file = "numpy-1.26.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:51c7f1b344f302067b02e0f5b5d2daa9ed4a721cf49f070280ac202738ea7f00"}, + {file = "numpy-1.26.3-cp311-cp311-win32.whl", hash = "sha256:7ca4f24341df071877849eb2034948459ce3a07915c2734f1abb4018d9c49d7b"}, + {file = "numpy-1.26.3-cp311-cp311-win_amd64.whl", hash = "sha256:39763aee6dfdd4878032361b30b2b12593fb445ddb66bbac802e2113eb8a6ac4"}, + {file = "numpy-1.26.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a7081fd19a6d573e1a05e600c82a1c421011db7935ed0d5c483e9dd96b99cf13"}, + {file = "numpy-1.26.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:12c70ac274b32bc00c7f61b515126c9205323703abb99cd41836e8125ea0043e"}, + {file = "numpy-1.26.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f784e13e598e9594750b2ef6729bcd5a47f6cfe4a12cca13def35e06d8163e3"}, + {file = "numpy-1.26.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f24750ef94d56ce6e33e4019a8a4d68cfdb1ef661a52cdaee628a56d2437419"}, + {file = "numpy-1.26.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:77810ef29e0fb1d289d225cabb9ee6cf4d11978a00bb99f7f8ec2132a84e0166"}, + {file = "numpy-1.26.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8ed07a90f5450d99dad60d3799f9c03c6566709bd53b497eb9ccad9a55867f36"}, + {file = "numpy-1.26.3-cp312-cp312-win32.whl", hash = "sha256:f73497e8c38295aaa4741bdfa4fda1a5aedda5473074369eca10626835445511"}, + {file = "numpy-1.26.3-cp312-cp312-win_amd64.whl", hash = "sha256:da4b0c6c699a0ad73c810736303f7fbae483bcb012e38d7eb06a5e3b432c981b"}, + {file = "numpy-1.26.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1666f634cb3c80ccbd77ec97bc17337718f56d6658acf5d3b906ca03e90ce87f"}, + {file = "numpy-1.26.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:18c3319a7d39b2c6a9e3bb75aab2304ab79a811ac0168a671a62e6346c29b03f"}, + {file = "numpy-1.26.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b7e807d6888da0db6e7e75838444d62495e2b588b99e90dd80c3459594e857b"}, + {file = "numpy-1.26.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4d362e17bcb0011738c2d83e0a65ea8ce627057b2fdda37678f4374a382a137"}, + {file = 
"numpy-1.26.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b8c275f0ae90069496068c714387b4a0eba5d531aace269559ff2b43655edd58"}, + {file = "numpy-1.26.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:cc0743f0302b94f397a4a65a660d4cd24267439eb16493fb3caad2e4389bccbb"}, + {file = "numpy-1.26.3-cp39-cp39-win32.whl", hash = "sha256:9bc6d1a7f8cedd519c4b7b1156d98e051b726bf160715b769106661d567b3f03"}, + {file = "numpy-1.26.3-cp39-cp39-win_amd64.whl", hash = "sha256:867e3644e208c8922a3be26fc6bbf112a035f50f0a86497f98f228c50c607bb2"}, + {file = "numpy-1.26.3-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:3c67423b3703f8fbd90f5adaa37f85b5794d3366948efe9a5190a5f3a83fc34e"}, + {file = "numpy-1.26.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46f47ee566d98849323f01b349d58f2557f02167ee301e5e28809a8c0e27a2d0"}, + {file = "numpy-1.26.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a8474703bffc65ca15853d5fd4d06b18138ae90c17c8d12169968e998e448bb5"}, + {file = "numpy-1.26.3.tar.gz", hash = "sha256:697df43e2b6310ecc9d95f05d5ef20eacc09c7c4ecc9da3f235d39e71b7da1e4"}, +] + [[package]] name = "packaging" version = "23.2" @@ -89,6 +134,77 @@ files = [ {file = "psycopg2-2.9.9.tar.gz", hash = "sha256:d1454bde93fb1e224166811694d600e746430c006fbb031ea06ecc2ea41bf156"}, ] +[[package]] +name = "pyarrow" +version = "14.0.2" +description = "Python library for Apache Arrow" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyarrow-14.0.2-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:ba9fe808596c5dbd08b3aeffe901e5f81095baaa28e7d5118e01354c64f22807"}, + {file = "pyarrow-14.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:22a768987a16bb46220cef490c56c671993fbee8fd0475febac0b3e16b00a10e"}, + {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2dbba05e98f247f17e64303eb876f4a80fcd32f73c7e9ad975a83834d81f3fda"}, + {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a898d134d00b1eca04998e9d286e19653f9d0fcb99587310cd10270907452a6b"}, + {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:87e879323f256cb04267bb365add7208f302df942eb943c93a9dfeb8f44840b1"}, + {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:76fc257559404ea5f1306ea9a3ff0541bf996ff3f7b9209fc517b5e83811fa8e"}, + {file = "pyarrow-14.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:b0c4a18e00f3a32398a7f31da47fefcd7a927545b396e1f15d0c85c2f2c778cd"}, + {file = "pyarrow-14.0.2-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:87482af32e5a0c0cce2d12eb3c039dd1d853bd905b04f3f953f147c7a196915b"}, + {file = "pyarrow-14.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:059bd8f12a70519e46cd64e1ba40e97eae55e0cbe1695edd95384653d7626b23"}, + {file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f16111f9ab27e60b391c5f6d197510e3ad6654e73857b4e394861fc79c37200"}, + {file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06ff1264fe4448e8d02073f5ce45a9f934c0f3db0a04460d0b01ff28befc3696"}, + {file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:6dd4f4b472ccf4042f1eab77e6c8bce574543f54d2135c7e396f413046397d5a"}, + {file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:32356bfb58b36059773f49e4e214996888eeea3a08893e7dbde44753799b2a02"}, + {file = "pyarrow-14.0.2-cp311-cp311-win_amd64.whl", hash = 
"sha256:52809ee69d4dbf2241c0e4366d949ba035cbcf48409bf404f071f624ed313a2b"}, + {file = "pyarrow-14.0.2-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:c87824a5ac52be210d32906c715f4ed7053d0180c1060ae3ff9b7e560f53f944"}, + {file = "pyarrow-14.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a25eb2421a58e861f6ca91f43339d215476f4fe159eca603c55950c14f378cc5"}, + {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c1da70d668af5620b8ba0a23f229030a4cd6c5f24a616a146f30d2386fec422"}, + {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2cc61593c8e66194c7cdfae594503e91b926a228fba40b5cf25cc593563bcd07"}, + {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:78ea56f62fb7c0ae8ecb9afdd7893e3a7dbeb0b04106f5c08dbb23f9c0157591"}, + {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:37c233ddbce0c67a76c0985612fef27c0c92aef9413cf5aa56952f359fcb7379"}, + {file = "pyarrow-14.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:e4b123ad0f6add92de898214d404e488167b87b5dd86e9a434126bc2b7a5578d"}, + {file = "pyarrow-14.0.2-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:e354fba8490de258be7687f341bc04aba181fc8aa1f71e4584f9890d9cb2dec2"}, + {file = "pyarrow-14.0.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:20e003a23a13da963f43e2b432483fdd8c38dc8882cd145f09f21792e1cf22a1"}, + {file = "pyarrow-14.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc0de7575e841f1595ac07e5bc631084fd06ca8b03c0f2ecece733d23cd5102a"}, + {file = "pyarrow-14.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66e986dc859712acb0bd45601229021f3ffcdfc49044b64c6d071aaf4fa49e98"}, + {file = "pyarrow-14.0.2-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:f7d029f20ef56673a9730766023459ece397a05001f4e4d13805111d7c2108c0"}, + {file = "pyarrow-14.0.2-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:209bac546942b0d8edc8debda248364f7f668e4aad4741bae58e67d40e5fcf75"}, + {file = "pyarrow-14.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:1e6987c5274fb87d66bb36816afb6f65707546b3c45c44c28e3c4133c010a881"}, + {file = "pyarrow-14.0.2-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:a01d0052d2a294a5f56cc1862933014e696aa08cc7b620e8c0cce5a5d362e976"}, + {file = "pyarrow-14.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a51fee3a7db4d37f8cda3ea96f32530620d43b0489d169b285d774da48ca9785"}, + {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:64df2bf1ef2ef14cee531e2dfe03dd924017650ffaa6f9513d7a1bb291e59c15"}, + {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c0fa3bfdb0305ffe09810f9d3e2e50a2787e3a07063001dcd7adae0cee3601a"}, + {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:c65bf4fd06584f058420238bc47a316e80dda01ec0dfb3044594128a6c2db794"}, + {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:63ac901baec9369d6aae1cbe6cca11178fb018a8d45068aaf5bb54f94804a866"}, + {file = "pyarrow-14.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:75ee0efe7a87a687ae303d63037d08a48ef9ea0127064df18267252cfe2e9541"}, + {file = "pyarrow-14.0.2.tar.gz", hash = "sha256:36cef6ba12b499d864d1def3e990f97949e0b79400d08b7cf74504ffbd3eb025"}, +] + +[package.dependencies] +numpy = ">=1.16.6" + +[[package]] +name = "pylance" +version = "0.9.6" +description = "python wrapper for Lance columnar format" +optional = false 
+python-versions = ">=3.8"
+files = [
+    {file = "pylance-0.9.6-cp38-abi3-macosx_10_15_x86_64.whl", hash = "sha256:d77d592d443b30eb6eb6ba702dee025cc31ae428d395ee50a38ba0bd3661f34f"},
+    {file = "pylance-0.9.6-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:5e1757410f3762d427a0eb2ebd24d5a0f0e394486478609f4100ea2c82a6cc99"},
+    {file = "pylance-0.9.6-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89ddfaa44bb6112fa28295c095e40e53ad2e2502ead52774897bc17f7a612f09"},
+    {file = "pylance-0.9.6-cp38-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:25ac671c7cccae05756a8922e38cc4e7f4010e750b25691d6b6aa3ff0e4f699a"},
+    {file = "pylance-0.9.6-cp38-abi3-win_amd64.whl", hash = "sha256:7a6ba6e7642cc962724b26eccd6bf21f0b71bdc9d53b14a66e3a9df32064e85f"},
+]
+
+[package.dependencies]
+numpy = ">=1.22"
+pyarrow = ">=12"
+
+[package.extras]
+benchmarks = ["pytest-benchmark"]
+tests = ["duckdb", "ml_dtypes", "pandas", "polars[pandas,pyarrow]", "pytest", "tensorflow", "tqdm"]
+torch = ["torch"]
+
 [[package]]
 name = "pymongo"
 version = "4.6.1"
@@ -215,4 +331,4 @@ testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "no
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "a205b5904910e3d371ed5d219aed81bab23ca13724c8a9e9f9a356893e4bb522"
+content-hash = "c815a4b9cbe67158141b6469d3c2fda089a11744abd7d2c507f7fbea2881db0b"
diff --git a/tests/pyproject.toml b/tests/pyproject.toml
index 0dcdd231d..eaa62cc27 100644
--- a/tests/pyproject.toml
+++ b/tests/pyproject.toml
@@ -10,6 +10,8 @@ python = "^3.11"
 pytest = "^7.4.3"
 pymongo = "^4.6.1"
 psycopg2 = "^2.9.9"
+pyarrow = "^14.0.2"
+pylance = "^0.9.6"
 
 [build-system]
 requires = ["poetry-core"]
diff --git a/tests/tests/test_bson.py b/tests/tests/test_bson.py
index 14da0f380..e103cfe0c 100644
--- a/tests/tests/test_bson.py
+++ b/tests/tests/test_bson.py
@@ -31,15 +31,12 @@ def test_copy_to(
     assert not os.path.exists(output_path)
 
     with glaredb_connection.cursor() as curr:
-        print(output_path)
         curr.execute(f"COPY( SELECT * FROM bson_test ) TO '{output_path}'")
 
     assert os.path.exists(output_path)
 
     with open(output_path, "rb") as f:
         for idx, doc in enumerate(bson.decode_file_iter(f)):
-            print(doc)
-
             assert len(doc) == 1
             assert "amount" in doc
             assert doc["amount"] == idx
diff --git a/tests/tests/test_lance.py b/tests/tests/test_lance.py
new file mode 100644
index 000000000..c60884419
--- /dev/null
+++ b/tests/tests/test_lance.py
@@ -0,0 +1,72 @@
+import os.path
+
+import lance
+import pyarrow as pa
+
+import psycopg2.extensions
+import psycopg2.extras
+import pytest
+
+
+import tools
+from fixtures.glaredb import glaredb_connection, debug_path
+
+
+def test_sanity_check(
+    tmp_path_factory: pytest.TempPathFactory,
+):
+    test_path = tmp_path_factory.mktemp("lance-sanity")
+
+    table = pa.table(
+        {
+            "id": pa.array([1, 2, 4]),
+            "values": pa.array([2, 4, 8]),
+        }
+    )
+
+    assert test_path.exists(), test_path
+    dataset = lance.write_dataset(table, test_path)
+
+    assert dataset.count_rows() == 3
+
+    files = os.listdir(test_path)
+    assert len(files) == 4
+    assert "data" in files
+    assert "_latest.manifest" in files
+    assert "_transactions" in files
+    assert "_versions" in files
+
+
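Because `COPY ... FORMAT lance` produces an ordinary Lance dataset on disk, the output can also be opened directly with pylance rather than through `lance_scan`. A small reading sketch (not part of the test suite; `path` stands in for wherever the COPY wrote):

```python
# Open a dataset written by GlareDB with pylance and materialize it.
import lance

def read_back(path: str) -> None:
    ds = lance.dataset(path)   # open the Lance dataset directory
    print(ds.count_rows())     # row count without materializing the data
    print(ds.to_table())       # materialize as a pyarrow.Table
```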
+def test_copy_to_round_trip(
+    glaredb_connection: psycopg2.extensions.connection,
+    tmp_path_factory: pytest.TempPathFactory,
+):
+    with glaredb_connection.cursor() as curr:
+        curr.execute("create temp table lance_test (amount int)")
+
+        for i in range(10):
+            curr.execute("insert into lance_test values (%s)", (i,))
+
+    output_path_abs = tmp_path_factory.mktemp("lance-abs")
+
+    with glaredb_connection.cursor() as curr:
+        curr.execute("select count(*) from lance_test;")
+        res = curr.fetchone()
+        assert res[0] == 10
+
+        curr.execute(f"COPY lance_test TO '{output_path_abs}' FORMAT lance")
+
+    with glaredb_connection.cursor() as curr:
+        curr.execute(f"select count(*) from lance_scan('{output_path_abs}')")
+        res = curr.fetchone()
+        assert res[0] == 10
+
+    output_path_rel = tmp_path_factory.mktemp("lance-rel")
+
+    with tools.cd(output_path_rel):
+        with glaredb_connection.cursor() as curr:
+            curr.execute("COPY lance_test TO './' FORMAT lance")
+
+            curr.execute("select count(*) from lance_scan('./')")
+            res = curr.fetchone()
+            assert res[0] == 10
diff --git a/tests/tools.py b/tests/tools.py
new file mode 100644
index 000000000..803114432
--- /dev/null
+++ b/tests/tools.py
@@ -0,0 +1,14 @@
+import contextlib
+import os
+import pathlib
+
+@contextlib.contextmanager
+def cd(path: pathlib.Path):
+    cur = os.getcwd()
+
+    os.chdir(path)
+
+    try:
+        yield
+    finally:
+        os.chdir(cur)
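The `cd` helper exists so the relative-destination COPY test above can run against a temporary working directory and always restore the old one. A standalone usage sketch, assuming `tools` is importable from the tests directory:

```python
# Usage sketch for tools.cd; the temporary directory is illustrative.
import os
import pathlib
import tempfile

from tools import cd

with tempfile.TemporaryDirectory() as tmp:
    before = os.getcwd()
    with cd(pathlib.Path(tmp)):
        # Inside the block, relative paths resolve against tmp.
        assert os.getcwd() == os.path.realpath(tmp)
    # On exit the previous working directory is restored, even on error.
    assert os.getcwd() == before
```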