From 0bd3cdc13af606cdc9cea717481fb16721fb803a Mon Sep 17 00:00:00 2001 From: ion-elgreco <15728914+ion-elgreco@users.noreply.github.com> Date: Thu, 28 Mar 2024 22:20:10 +0100 Subject: [PATCH 1/4] add missing --- crates/core/src/operations/optimize.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core/src/operations/optimize.rs b/crates/core/src/operations/optimize.rs index af3541ca2c..babe17a6a0 100644 --- a/crates/core/src/operations/optimize.rs +++ b/crates/core/src/operations/optimize.rs @@ -492,7 +492,7 @@ impl MergePlan { &batch, task_parameters.file_schema.clone(), false, - false, + true, )?; partial_metrics.num_batches += 1; writer.write(&batch).await.map_err(DeltaTableError::from)?; From b051e4956a76968a36f4c1c08a2f737caf1a8b1a Mon Sep 17 00:00:00 2001 From: ion-elgreco <15728914+ion-elgreco@users.noreply.github.com> Date: Thu, 28 Mar 2024 22:27:46 +0100 Subject: [PATCH 2/4] tests --- python/tests/test_optimize.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/python/tests/test_optimize.py b/python/tests/test_optimize.py index 1be0654836..8cb0902dae 100644 --- a/python/tests/test_optimize.py +++ b/python/tests/test_optimize.py @@ -99,3 +99,35 @@ def test_optimize_min_commit_interval( # independently. So with min_commit_interval=0, each will get its # own commit. assert dt.version() == old_version + 5 + + +def test_optimize_schema_evolved_table( + tmp_path: pathlib.Path, + sample_data: pa.Table, +): + data = pa.table({"foo": pa.array(["1"])}) + + write_deltalake(tmp_path, data, engine="rust", mode="append", schema_mode="merge") + + data = pa.table({"bar": pa.array(["1"])}) + write_deltalake(tmp_path, data, engine="rust", mode="append", schema_mode="merge") + + dt = DeltaTable(tmp_path) + old_version = dt.version() + + dt.optimize.compact() + + last_action = dt.history(1)[0] + assert last_action["operation"] == "OPTIMIZE" + assert dt.version() == old_version + 1 + + data = pa.table( + { + "foo": pa.array([None, "1"]), + "bar": pa.array(["1", None]), + } + ) + + assert dt.to_pyarrow_table().sort_by([("foo", "ascending")]) == data.sort_by( + [("foo", "ascending")] + ) From b65850e8c60b2381f16821e1c8589ef744c5d7f3 Mon Sep 17 00:00:00 2001 From: ion-elgreco <15728914+ion-elgreco@users.noreply.github.com> Date: Thu, 28 Mar 2024 22:20:10 +0100 Subject: [PATCH 3/4] add missing --- crates/core/src/operations/optimize.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core/src/operations/optimize.rs b/crates/core/src/operations/optimize.rs index af3541ca2c..babe17a6a0 100644 --- a/crates/core/src/operations/optimize.rs +++ b/crates/core/src/operations/optimize.rs @@ -492,7 +492,7 @@ impl MergePlan { &batch, task_parameters.file_schema.clone(), false, - false, + true, )?; partial_metrics.num_batches += 1; writer.write(&batch).await.map_err(DeltaTableError::from)?; From 10f11e9d955c91c1041874bfb6517565b84161fd Mon Sep 17 00:00:00 2001 From: ion-elgreco <15728914+ion-elgreco@users.noreply.github.com> Date: Thu, 28 Mar 2024 22:27:46 +0100 Subject: [PATCH 4/4] tests --- python/tests/test_optimize.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/python/tests/test_optimize.py b/python/tests/test_optimize.py index 1be0654836..8cb0902dae 100644 --- a/python/tests/test_optimize.py +++ b/python/tests/test_optimize.py @@ -99,3 +99,35 @@ def test_optimize_min_commit_interval( # independently. So with min_commit_interval=0, each will get its # own commit. assert dt.version() == old_version + 5 + + +def test_optimize_schema_evolved_table( + tmp_path: pathlib.Path, + sample_data: pa.Table, +): + data = pa.table({"foo": pa.array(["1"])}) + + write_deltalake(tmp_path, data, engine="rust", mode="append", schema_mode="merge") + + data = pa.table({"bar": pa.array(["1"])}) + write_deltalake(tmp_path, data, engine="rust", mode="append", schema_mode="merge") + + dt = DeltaTable(tmp_path) + old_version = dt.version() + + dt.optimize.compact() + + last_action = dt.history(1)[0] + assert last_action["operation"] == "OPTIMIZE" + assert dt.version() == old_version + 1 + + data = pa.table( + { + "foo": pa.array([None, "1"]), + "bar": pa.array(["1", None]), + } + ) + + assert dt.to_pyarrow_table().sort_by([("foo", "ascending")]) == data.sort_by( + [("foo", "ascending")] + )