diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md index e904662871..e72f9e0a78 100644 --- a/mkdocs/docs/api.md +++ b/mkdocs/docs/api.md @@ -1194,6 +1194,24 @@ with table.update_spec() as update: update.rename_field("bucketed_id", "sharded_id") ``` +## Sort order updates + +Users can update the sort order on existing tables for new data. See [sorting](https://iceberg.apache.org/spec/#sorting) for more details. + +The API to use when updating a sort order is the `update_sort_order` API on the table. + +Sort orders can only be updated by adding a new sort order. They cannot be deleted or modified. + +### Updating a sort order on a table + +To create a new sort order, you can use either the `asc` or `desc` API depending on whether you want your data sorted in ascending or descending order. Both take the name of the field, the sort order transform, and a null order that describes the order of null values when sorted. + +```python +with table.update_sort_order() as update: + update.desc("event_ts", DayTransform(), NullOrder.NULLS_FIRST) + update.asc("some_field", IdentityTransform(), NullOrder.NULLS_LAST) +``` + ## Table properties Set and remove properties through the `Transaction` API: diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 93edf70f46..59abcf3a43 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -120,6 +120,7 @@ UpdateSnapshot, _FastAppendFiles, ) +from pyiceberg.table.update.sorting import UpdateSortOrder from pyiceberg.table.update.spec import UpdateSpec from pyiceberg.table.update.statistics import UpdateStatistics from pyiceberg.transforms import IdentityTransform @@ -431,6 +432,20 @@ def update_schema(self, allow_incompatible_changes: bool = False, case_sensitive name_mapping=self.table_metadata.name_mapping(), ) + def update_sort_order(self, case_sensitive: bool = True) -> UpdateSortOrder: + """Create a new UpdateSortOrder to update the sort order of this table.
+ + Args: + case_sensitive: If field names are case-sensitive. + + Returns: + A new UpdateSortOrder. + """ + return UpdateSortOrder( + self, + case_sensitive=case_sensitive, + ) + def update_snapshot(self, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> UpdateSnapshot: """Create a new UpdateSnapshot to produce a new snapshot for the table. @@ -1102,6 +1117,14 @@ def update_schema(self, allow_incompatible_changes: bool = False, case_sensitive name_mapping=self.name_mapping(), ) + def update_sort_order(self, case_sensitive: bool = True) -> UpdateSortOrder: + """Create a new UpdateSortOrder to update the sort order of this table. + + Returns: + A new UpdateSortOrder. + """ + return UpdateSortOrder(transaction=Transaction(self, autocommit=True), case_sensitive=case_sensitive) + def name_mapping(self) -> Optional[NameMapping]: """Return the table's field-id NameMapping.""" return self.metadata.name_mapping() diff --git a/pyiceberg/table/update/sorting.py b/pyiceberg/table/update/sorting.py new file mode 100644 index 0000000000..4df17d700c --- /dev/null +++ b/pyiceberg/table/update/sorting.py @@ -0,0 +1,117 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
from __future__ import annotations

from typing import TYPE_CHECKING, Any, List, Tuple

from pyiceberg.table.sorting import NullOrder, SortDirection, SortField, SortOrder
from pyiceberg.table.update import (
    AddSortOrderUpdate,
    AssertDefaultSortOrderId,
    SetDefaultSortOrderUpdate,
    TableRequirement,
    TableUpdate,
    UpdatesAndRequirements,
    UpdateTableMetadata,
)
from pyiceberg.transforms import Transform

if TYPE_CHECKING:
    from pyiceberg.table import Transaction


class UpdateSortOrder(UpdateTableMetadata["UpdateSortOrder"]):
    """Fluent builder that stages a new sort order for a table.

    Sort fields are collected, in call order, through `asc` and `desc`.
    `_commit` turns the collected fields into table updates plus the
    requirement that the default sort order id is unchanged.
    """

    _transaction: Transaction
    _case_sensitive: bool
    _fields: List[SortField]
    _last_sort_order_id: int

    def __init__(self, transaction: Transaction, case_sensitive: bool = True) -> None:
        """Start an empty sort-order update bound to ``transaction``.

        Args:
            transaction: Transaction whose table metadata is read and updated.
            case_sensitive: If field names are case-sensitive when resolved.
        """
        super().__init__(transaction)
        self._fields = []
        self._case_sensitive = case_sensitive
        # Captured once, at construction time; `_apply` derives the new id from it.
        self._last_sort_order_id = transaction.table_metadata.default_sort_order_id

    def _column_name_to_id(self, column_name: str) -> int:
        """Map the column name to the column field id.

        The lookup goes through the current table schema and honours the
        ``case_sensitive`` flag given to the constructor.
        """
        schema = self._transaction.table_metadata.schema()
        field = schema.find_field(
            name_or_id=column_name,
            case_sensitive=self._case_sensitive,
        )
        return field.field_id

    def _add_sort_field(
        self,
        source_id: int,
        transform: Transform[Any, Any],
        direction: SortDirection,
        null_order: NullOrder,
    ) -> UpdateSortOrder:
        """Append one sort field to the pending list and return ``self`` for chaining."""
        self._fields.append(
            SortField(
                source_id=source_id,
                transform=transform,
                direction=direction,
                null_order=null_order,
            )
        )
        return self

    def asc(
        self,
        source_column_name: str,
        transform: Transform[Any, Any],
        null_order: NullOrder = NullOrder.NULLS_LAST,
    ) -> UpdateSortOrder:
        """Add a sort field with ascending order.

        Args:
            source_column_name: Name of the schema column to sort on.
            transform: Transform applied to the column before sorting.
            null_order: Placement of nulls; defaults to ``NULLS_LAST``.

        Returns:
            This update, so calls can be chained.
        """
        return self._add_sort_field(
            source_id=self._column_name_to_id(source_column_name),
            transform=transform,
            direction=SortDirection.ASC,
            null_order=null_order,
        )

    def desc(
        self,
        source_column_name: str,
        transform: Transform[Any, Any],
        null_order: NullOrder = NullOrder.NULLS_LAST,
    ) -> UpdateSortOrder:
        """Add a sort field with descending order.

        Args:
            source_column_name: Name of the schema column to sort on.
            transform: Transform applied to the column before sorting.
            null_order: Placement of nulls; defaults to ``NULLS_LAST``.

        Returns:
            This update, so calls can be chained.
        """
        return self._add_sort_field(
            source_id=self._column_name_to_id(source_column_name),
            transform=transform,
            direction=SortDirection.DESC,
            null_order=null_order,
        )

    def _apply(self) -> SortOrder:
        """Return the sort order built from the fields added so far."""
        # NOTE(review): the id is "default id seen at construction + 1". This assumes
        # no existing sort order already uses that id - TODO confirm against the
        # table metadata (e.g. when the default is not the highest order id).
        return SortOrder(*self._fields, order_id=self._last_sort_order_id + 1)

    def _commit(self) -> UpdatesAndRequirements:
        """Apply the pending changes and commit.

        Returns:
            A ``(updates, requirements)`` pair. The requirement asserts that the
            table's default sort order id still equals the value read here.
        """
        new_sort_order = self._apply()
        current_default_id = self._transaction.table_metadata.default_sort_order_id

        updates: Tuple[TableUpdate, ...]
        if current_default_id != new_sort_order.order_id:
            # -1 asks the catalog to make the sort order added in this same change
            # set (the one just above) the default.
            updates = (
                AddSortOrderUpdate(sort_order=new_sort_order),
                SetDefaultSortOrderUpdate(sort_order_id=-1),
            )
        else:
            updates = (SetDefaultSortOrderUpdate(sort_order_id=new_sort_order.order_id),)

        requirements: Tuple[TableRequirement, ...] = (
            AssertDefaultSortOrderId(default_sort_order_id=current_default_id),
        )
        return updates, requirements


# diff --git a/tests/integration/test_sort_order_update.py b/tests/integration/test_sort_order_update.py
# new file mode 100644
# index 0000000000..bfac783e9e
# --- /dev/null
# +++ b/tests/integration/test_sort_order_update.py
# @@ -0,0 +1,109 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint:disable=redefined-outer-name

import pytest

from pyiceberg.catalog import Catalog
from pyiceberg.exceptions import NoSuchTableError
from pyiceberg.schema import Schema
from pyiceberg.table import Table
from pyiceberg.table.sorting import NullOrder, SortDirection, SortField, SortOrder
from pyiceberg.transforms import (
    IdentityTransform,
)

# Every test runs against both catalogs and both table format versions,
# in the order: (rest, v1), (hive, v1), (rest, v2), (hive, v2).
_each_catalog_and_format_version = pytest.mark.parametrize(
    "catalog, format_version",
    [
        (pytest.lazy_fixture(fixture_name), version)
        for version in ("1", "2")
        for fixture_name in ("session_catalog", "session_catalog_hive")
    ],
)


def _simple_table(catalog: Catalog, table_schema_simple: Schema, format_version: str) -> Table:
    """Create a fresh table that uses the simple test schema."""
    return _create_table_with_schema(catalog, table_schema_simple, format_version)


def _create_table_with_schema(catalog: Catalog, schema: Schema, format_version: str) -> Table:
    """Drop the test table if it exists, then create it again with ``schema``."""
    # NOTE(review): the identifier mentions "schema_evolution" although this module
    # exercises sort orders - presumably copied from another suite; confirm the
    # name cannot clash with tests running against the same catalog.
    identifier = "default.test_schema_evolution"
    try:
        catalog.drop_table(identifier)
    except NoSuchTableError:
        # Nothing to clean up.
        pass
    table_properties = {"format-version": format_version}
    return catalog.create_table(identifier=identifier, schema=schema, properties=table_properties)


@pytest.mark.integration
@_each_catalog_and_format_version
def test_map_column_name_to_id(catalog: Catalog, format_version: str, table_schema_simple: Schema) -> None:
    """Column names of the simple schema resolve to their field ids."""
    simple_table = _simple_table(catalog, table_schema_simple, format_version)
    expected_ids = {"foo": 1, "bar": 2, "baz": 3}
    for col_name in expected_ids:
        resolved_id = simple_table.update_sort_order()._column_name_to_id(col_name)
        assert expected_ids[col_name] == resolved_id


@pytest.mark.integration
@_each_catalog_and_format_version
def test_replace_sort_order(catalog: Catalog, format_version: str, table_schema_simple: Schema) -> None:
    """Committing a sort order on a new table makes it the table's sort order (id 1)."""
    simple_table = _simple_table(catalog, table_schema_simple, format_version)

    update = simple_table.update_sort_order()
    update = update.asc("foo", IdentityTransform(), NullOrder.NULLS_FIRST)
    update = update.desc("bar", IdentityTransform(), NullOrder.NULLS_LAST)
    update.commit()

    foo_ascending = SortField(
        source_id=1, transform=IdentityTransform(), direction=SortDirection.ASC, null_order=NullOrder.NULLS_FIRST
    )
    bar_descending = SortField(
        source_id=2, transform=IdentityTransform(), direction=SortDirection.DESC, null_order=NullOrder.NULLS_LAST
    )
    assert simple_table.sort_order() == SortOrder(foo_ascending, bar_descending, order_id=1)


@pytest.mark.integration
@_each_catalog_and_format_version
def test_replace_existing_sort_order(catalog: Catalog, format_version: str, table_schema_simple: Schema) -> None:
    """A second committed sort order is added next to the first and becomes the default."""
    simple_table = _simple_table(catalog, table_schema_simple, format_version)

    first_update = simple_table.update_sort_order()
    first_update = first_update.asc("foo", IdentityTransform(), NullOrder.NULLS_FIRST)
    first_update.commit()

    first_expected = SortOrder(
        SortField(source_id=1, transform=IdentityTransform(), direction=SortDirection.ASC, null_order=NullOrder.NULLS_FIRST),
        order_id=1,
    )
    assert simple_table.sort_order() == first_expected

    second_update = simple_table.update_sort_order()
    second_update = second_update.asc("foo", IdentityTransform(), NullOrder.NULLS_LAST)
    second_update = second_update.desc("bar", IdentityTransform(), NullOrder.NULLS_FIRST)
    second_update.commit()

    # 0: empty sort order from creating tables, 1: first sort order, 2: second sort order
    known_sort_orders = simple_table.sort_orders()
    assert len(known_sort_orders) == 3

    second_expected = SortOrder(
        SortField(source_id=1, transform=IdentityTransform(), direction=SortDirection.ASC, null_order=NullOrder.NULLS_LAST),
        SortField(source_id=2, transform=IdentityTransform(), direction=SortDirection.DESC, null_order=NullOrder.NULLS_FIRST),
        order_id=2,
    )
    assert simple_table.sort_order() == second_expected