Skip to content

Commit

Permalink
simplify lazy loading feature
Browse files Browse the repository at this point in the history
  • Loading branch information
PengLiVectra committed Dec 5, 2023
1 parent 4031c32 commit 6ba7634
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 69 deletions.
8 changes: 1 addition & 7 deletions python/deltalake/_internal.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,9 @@ class RawDeltaTable:
storage_options: Optional[Dict[str, str]],
without_files: bool,
log_buffer_size: Optional[int],
load_lazy: bool,
) -> None: ...
@staticmethod
def load_lazy(
table_uri: str,
storage_options: Optional[Dict[str, str]],
without_files: bool,
log_buffer_size: Optional[int],
) -> RawDeltaTable: ...
@staticmethod
def get_table_uri_from_data_catalog(
data_catalog: str,
database_name: str,
Expand Down
19 changes: 7 additions & 12 deletions python/deltalake/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ def __init__(
storage_options: Optional[Dict[str, str]] = None,
without_files: bool = False,
log_buffer_size: Optional[int] = None,
lazy_load: bool = False,
load_lazy: bool = False,
):
"""
Create the Delta Table from a path with an optional version.
Expand All @@ -254,26 +254,21 @@ def __init__(
This can decrease latency if there are many files in the log since the last checkpoint,
but will also increase memory usage. Possible rate limits of the storage backend should
also be considered for optimal performance. Defaults to 4 * number of cpus.
lazy_load: when true the table metadata isn't loaded
load_lazy: when true the table metadata isn't loaded
"""
self._storage_options = storage_options
if lazy_load:
self._table = RawDeltaTable.load_lazy(
str(table_uri),
storage_options=storage_options,
without_files=without_files,
log_buffer_size=log_buffer_size,
)
self._metadata = None
return
self._table = RawDeltaTable(
str(table_uri),
version=version,
storage_options=storage_options,
without_files=without_files,
log_buffer_size=log_buffer_size,
load_lazy=load_lazy,
)
self._metadata = Metadata(self._table)
if load_lazy:
self._metadata = None
else:
self._metadata = Metadata(self._table)

@classmethod
def from_data_catalog(
Expand Down
68 changes: 20 additions & 48 deletions python/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,67 +85,39 @@ struct RawDeltaTableMetaData {
configuration: HashMap<String, Option<String>>,
}

#[inline]
fn build_table(
table_uri: &str,
storage_options: Option<HashMap<String, String>>,
without_files: bool,
log_buffer_size: Option<usize>,
) -> Result<DeltaTableBuilder, PythonError> {
let mut builder = deltalake::DeltaTableBuilder::from_uri(table_uri);
if let Some(storage_options) = storage_options {
builder = builder.with_storage_options(storage_options)
}
if without_files {
builder = builder.without_files()
}
if let Some(buf_size) = log_buffer_size {
builder = builder
.with_log_buffer_size(buf_size)
.map_err(PythonError::from)?;
}
Ok(builder)
}

#[pymethods]
impl RawDeltaTable {
#[new]
#[pyo3(signature = (table_uri, version = None, storage_options = None, without_files = false, log_buffer_size = None))]
#[pyo3(signature = (table_uri, version = None, storage_options = None, without_files = false, log_buffer_size = None, load_lazy = false))]
fn new(
table_uri: &str,
version: Option<i64>,
storage_options: Option<HashMap<String, String>>,
without_files: bool,
log_buffer_size: Option<usize>,
load_lazy: bool,
) -> PyResult<Self> {
let mut builder = deltalake::DeltaTableBuilder::from_uri(table_uri);
let options = storage_options.clone().unwrap_or_default();
let mut builder = build_table(table_uri, storage_options, without_files, log_buffer_size);
if let Some(storage_options) = storage_options {
builder = builder.with_storage_options(storage_options)
}
if without_files {
builder = builder.without_files()
}
if let Some(version) = version {
builder = Ok(builder?.with_version(version))
builder = builder.with_version(version)
}

let table = rt()?.block_on(builder?.load()).map_err(PythonError::from)?;
Ok(RawDeltaTable {
_table: table,
_config: FsConfig {
root_url: table_uri.into(),
options,
},
})
}

#[classmethod]
#[pyo3(signature = (table_uri, storage_options = None, without_files = false, log_buffer_size = None))]
fn load_lazy(
_cls: &PyType,
table_uri: &str,
storage_options: Option<HashMap<String, String>>,
without_files: bool,
log_buffer_size: Option<usize>,
) -> PyResult<Self> {
let options = storage_options.clone().unwrap_or_default();
let builder = build_table(table_uri, storage_options, without_files, log_buffer_size);
let table = builder?.build().map_err(PythonError::from)?;
if let Some(buf_size) = log_buffer_size {
builder = builder
.with_log_buffer_size(buf_size)
.map_err(PythonError::from)?;
}
let table = if !load_lazy {
rt()?.block_on(builder.load()).map_err(PythonError::from)?
} else {
builder.build().map_err(PythonError::from)?
};
Ok(RawDeltaTable {
_table: table,
_config: FsConfig {
Expand Down
4 changes: 2 additions & 2 deletions python/tests/test_table_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def test_read_simple_table_using_options_to_dict():

def test_simple_table_lazy_loading():
table_path = "../crates/deltalake-core/tests/data/simple_table"
dt = DeltaTable(table_path, lazy_load=True)
dt = DeltaTable(table_path, load_lazy=True)
dt.load_version(2)
assert dt.version() == 2

Expand All @@ -77,7 +77,7 @@ def test_simple_table_lazy_loading_with_options():
storage_options={},
without_files=False,
log_buffer_size=1,
lazy_load=True,
load_lazy=True,
)
assert isinstance(dt, DeltaTable)

Expand Down

0 comments on commit 6ba7634

Please sign in to comment.