-
Notifications
You must be signed in to change notification settings - Fork 42
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: copy to support for lance (#2342)
- Loading branch information
Showing
16 changed files
with
468 additions
and
22 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
use std::fmt; | ||
use std::sync::Arc; | ||
|
||
use async_trait::async_trait; | ||
use datafusion::arrow::record_batch::RecordBatchIterator; | ||
use datafusion::common::Result as DfResult; | ||
use datafusion::error::DataFusionError; | ||
use datafusion::execution::TaskContext; | ||
use datafusion::physical_plan::insert::DataSink; | ||
use datafusion::physical_plan::DisplayAs; | ||
use datafusion::physical_plan::{DisplayFormatType, SendableRecordBatchStream}; | ||
use futures::StreamExt; | ||
use lance::dataset::WriteMode; | ||
use lance::Dataset; | ||
use object_store::{path::Path as ObjectPath, ObjectStore}; | ||
|
||
pub type LanceWriteParams = lance::dataset::WriteParams; | ||
|
||
#[derive(Debug, Clone)] | ||
pub struct LanceSinkOpts { | ||
pub url: Option<url::Url>, | ||
pub max_rows_per_file: usize, | ||
pub max_rows_per_group: usize, | ||
pub max_bytes_per_file: usize, | ||
pub input_batch_size: usize, | ||
} | ||
|
||
/// Writes lance files to object storage. | ||
#[derive(Debug, Clone)] | ||
pub struct LanceSink { | ||
store: Arc<dyn ObjectStore>, | ||
loc: ObjectPath, | ||
opts: LanceSinkOpts, | ||
} | ||
|
||
impl fmt::Display for LanceSink { | ||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { | ||
write!(f, "LanceSink({}:{})", self.store, self.loc) | ||
} | ||
} | ||
|
||
impl DisplayAs for LanceSink { | ||
fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { | ||
match t { | ||
DisplayFormatType::Default => write!(f, "{self}"), | ||
DisplayFormatType::Verbose => write!(f, "{self}"), | ||
} | ||
} | ||
} | ||
|
||
impl LanceSink { | ||
pub fn from_obj_store( | ||
store: Arc<dyn ObjectStore>, | ||
loc: impl Into<ObjectPath>, | ||
opts: LanceSinkOpts, | ||
) -> Self { | ||
LanceSink { | ||
store, | ||
loc: loc.into(), | ||
opts, | ||
} | ||
} | ||
|
||
async fn stream_into_inner( | ||
&self, | ||
stream: SendableRecordBatchStream, | ||
mut ds: Option<Dataset>, | ||
) -> DfResult<Option<Dataset>> { | ||
let table = match self.opts.url.clone() { | ||
Some(opts_url) => opts_url.join(self.loc.as_ref()), | ||
None => url::Url::parse(self.loc.as_ref()), | ||
} | ||
.map_err(|e| DataFusionError::External(Box::new(e)))?; | ||
|
||
let schema = stream.schema().clone(); | ||
let mut chunks = stream.chunks(32); | ||
let write_opts = LanceWriteParams { | ||
mode: WriteMode::Overwrite, | ||
..Default::default() | ||
}; | ||
|
||
while let Some(batches) = chunks.next().await { | ||
let batch_iter = | ||
RecordBatchIterator::new(batches.into_iter().map(|item| Ok(item?)), schema.clone()); | ||
|
||
match ds.clone() { | ||
Some(mut d) => { | ||
d.append(batch_iter, Some(write_opts.clone())).await?; | ||
} | ||
None => { | ||
ds.replace( | ||
Dataset::write(batch_iter, table.as_str(), Some(write_opts.clone())) | ||
.await?, | ||
); | ||
} | ||
} | ||
} | ||
|
||
Ok(ds) | ||
} | ||
} | ||
|
||
#[async_trait] | ||
impl DataSink for LanceSink { | ||
// the dataset is the handle to the lance database. | ||
// | ||
// there's no way to construct an empty dataset except by writing | ||
// to it, so we pass this optional wrapped dataset to this method, | ||
// if it's none, we create a new one, and if it's not we use the | ||
// dataset we constructed before from the optional, and return it, | ||
// and pass it into the next call. | ||
async fn write_all( | ||
&self, | ||
data: Vec<SendableRecordBatchStream>, | ||
_context: &Arc<TaskContext>, | ||
) -> DfResult<u64> { | ||
let mut ds: Option<Dataset> = None; | ||
for stream in data { | ||
ds = self.stream_into_inner(stream, ds).await?; | ||
} | ||
match ds { | ||
Some(ds) => Ok(ds.count_rows().await? as u64), | ||
None => Ok(0), | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
pub mod bson; | ||
pub mod csv; | ||
pub mod json; | ||
pub mod lance; | ||
pub mod parquet; | ||
|
||
use std::io::{self, Write}; | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.