From 11b7622cc8c7f1212c157fac994bd26b04495211 Mon Sep 17 00:00:00 2001 From: Nikolay Ulmasov Date: Sun, 15 Oct 2023 21:32:43 +0100 Subject: [PATCH 1/8] implement 1708 --- .gitignore | 3 +- docs/python_api.md | 14 +- docs/usage/examining-table.md | 10 +- docs/usage/index.md | 2 +- mkdocs.yml | 44 ++-- python/deltalake/_internal.pyi | 426 +++++++++++++++++++++++++++++--- python/deltalake/fs.py | 30 ++- python/deltalake/schema.py | 7 +- python/deltalake/table.py | 429 ++++++++++++++++++++------------- python/deltalake/writer.py | 91 +++---- python/src/schema.rs | 279 +-------------------- 11 files changed, 782 insertions(+), 553 deletions(-) diff --git a/.gitignore b/.gitignore index 8642b9722a..5ecd5a627d 100644 --- a/.gitignore +++ b/.gitignore @@ -27,4 +27,5 @@ Cargo.lock !/delta-inspect/Cargo.lock !/proofs/Cargo.lock -justfile \ No newline at end of file +justfile +site \ No newline at end of file diff --git a/docs/python_api.md b/docs/python_api.md index 4837122c43..e10feb8b41 100644 --- a/docs/python_api.md +++ b/docs/python_api.md @@ -1,6 +1,6 @@ # Python API Reference -## DeltaTable +## Delta Table ::: deltalake.table @@ -13,16 +13,28 @@ Schemas, fields, and data types are provided in the ``deltalake.schema`` submodule. ::: deltalake.schema.Schema + options: + show_root_heading: true ::: deltalake.schema.PrimitiveType + options: + show_root_heading: true ::: deltalake.schema.ArrayType + options: + show_root_heading: true ::: deltalake.schema.MapType + options: + show_root_heading: true ::: deltalake.schema.Field + options: + show_root_heading: true ::: deltalake.schema.StructType + options: + show_root_heading: true ## Data Catalog diff --git a/docs/usage/examining-table.md b/docs/usage/examining-table.md index c4cbfb0836..0d91153953 100644 --- a/docs/usage/examining-table.md +++ b/docs/usage/examining-table.md @@ -14,7 +14,7 @@ The delta log maintains basic metadata about a table, including: to have data deleted from it. Get metadata from a table with the -[DeltaTable.metadata()][] method: +[DeltaTable.metadata()][deltalake.table.DeltaTable.metadata] method: ``` python >>> from deltalake import DeltaTable @@ -27,12 +27,12 @@ Metadata(id: 5fba94ed-9794-4965-ba6e-6ee3c0d22af9, name: None, description: None The schema for the table is also saved in the transaction log. It can either be retrieved in the Delta Lake form as -[deltalake.schema.Schema][] or as a +[Schema][deltalake.schema.Schema] or as a PyArrow schema. The first allows you to introspect any column-level metadata stored in the schema, while the latter represents the schema the table will be loaded into. -Use [DeltaTable.schema][] to retrieve the delta lake schema: +Use [DeltaTable.schema][deltalake.table.DeltaTable.schema] to retrieve the delta lake schema: ``` python >>> from deltalake import DeltaTable @@ -43,14 +43,14 @@ Schema([Field(id, PrimitiveType("long"), nullable=True)]) These schemas have a JSON representation that can be retrieved. To reconstruct from json, use -[deltalake.schema.Schema.from_json()][]. +[DeltaTable.schema.json()][deltalake.schema.Schema.json]. 
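+For example, the JSON string shown below can be parsed back into a
+[Schema][deltalake.schema.Schema] with [Schema.from_json][deltalake.schema.Schema.from_json];
+a rough sketch reusing the schema from this page:
+
+``` python
+>>> from deltalake.schema import Schema
+>>> Schema.from_json('{"type":"struct","fields":[{"name":"id","type":"long","nullable":true,"metadata":{}}]}')
+Schema([Field(id, PrimitiveType("long"), nullable=True)])
+```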
``` python >>> dt.schema().json() '{"type":"struct","fields":[{"name":"id","type":"long","nullable":true,"metadata":{}}]}' ``` -Use [deltalake.schema.Schema.to_pyarrow()][] to retrieve the PyArrow schema: +Use [DeltaTable.schema.to_pyarrow()][deltalake.schema.Schema.to_pyarrow] to retrieve the PyArrow schema: ``` python >>> dt.schema().to_pyarrow() diff --git a/docs/usage/index.md b/docs/usage/index.md index 5f9624653a..6b1ba86879 100644 --- a/docs/usage/index.md +++ b/docs/usage/index.md @@ -1,6 +1,6 @@ # Usage -A [DeltaTable][] represents the state of a +A [DeltaTable][deltalake.table.DeltaTable] represents the state of a delta table at a particular version. This includes which files are currently part of the table, the schema of the table, and other metadata such as creation time. diff --git a/mkdocs.yml b/mkdocs.yml index dd25578b4e..2ae13250b7 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,6 +1,8 @@ site_name: Python Delta Lake Documentation theme: - name: readthedocs + name: mkdocs + navigation_depth: 2 + hljs_style: nord nav: - Home: index.md - Installation: installation.md @@ -14,22 +16,30 @@ nav: - API Reference: python_api.md plugins: -- autorefs -- mkdocstrings: - handlers: - python: - path: [../python] - rendering: - heading_level: 4 - show_source: false - show_symbol_type_in_heading: true - show_signature_annotations: true - show_root_heading: true - members_order: source - import: - # for cross references - - https://arrow.apache.org/docs/objects.inv - - https://pandas.pydata.org/docs/objects.inv + - autorefs + - mkdocstrings: + handlers: + python: + path: [../python] + options: + # docstring_style: sphinx + docstring_section_style: table + ocstring_section_style: litabst + filters: ["!^_", "^__init__$"] + heading_level: 3 + show_source: false + show_symbol_type_in_heading: true + show_signature_annotations: true + show_root_heading: false + show_root_full_path: true + separate_signature: true + docstring_options: + ignore_init_summary: false + merge_init_into_class: true + import: + # for cross references + - https://arrow.apache.org/docs/objects.inv + - https://pandas.pydata.org/docs/objects.inv markdown_extensions: - admonition \ No newline at end of file diff --git a/python/deltalake/_internal.pyi b/python/deltalake/_internal.pyi index d1fc616359..f57129637d 100644 --- a/python/deltalake/_internal.pyi +++ b/python/deltalake/_internal.pyi @@ -6,7 +6,7 @@ if sys.version_info >= (3, 8): else: from typing_extensions import Literal -import pyarrow as pa +import pyarrow import pyarrow.fs as fs from deltalake.writer import AddAction @@ -83,10 +83,10 @@ class RawDeltaTable: def history(self, limit: Optional[int]) -> List[str]: ... def update_incremental(self) -> None: ... def dataset_partitions( - self, schema: pa.Schema, partition_filters: Optional[FilterType] + self, schema: pyarrow.Schema, partition_filters: Optional[FilterType] ) -> List[Any]: ... def create_checkpoint(self) -> None: ... - def get_add_actions(self, flatten: bool) -> pa.RecordBatch: ... + def get_add_actions(self, flatten: bool) -> pyarrow.RecordBatch: ... def delete(self, predicate: Optional[str]) -> str: ... def update( self, @@ -103,14 +103,14 @@ class RawDeltaTable: add_actions: List[AddAction], mode: str, partition_by: List[str], - schema: pa.Schema, + schema: pyarrow.Schema, partitions_filters: Optional[FilterType], ) -> None: ... def rust_core_version() -> str: ... 
def write_new_deltalake( table_uri: str, - schema: pa.Schema, + schema: pyarrow.Schema, add_actions: List[AddAction], _mode: str, partition_by: List[str], @@ -119,41 +119,171 @@ def write_new_deltalake( configuration: Optional[Mapping[str, Optional[str]]], storage_options: Optional[Dict[str, str]], ) -> None: ... -def batch_distinct(batch: pa.RecordBatch) -> pa.RecordBatch: ... +def batch_distinct(batch: pyarrow.RecordBatch) -> pyarrow.RecordBatch: ... # Can't implement inheritance (see note in src/schema.rs), so this is next # best thing. DataType = Union["PrimitiveType", "MapType", "StructType", "ArrayType"] class PrimitiveType: + """ A primitive datatype, such as a string or number. + + Can be initialized with a string value: + + ``` + PrimitiveType("integer") + ``` + + Valid primitive data types include: + + * "string", + * "long", + * "integer", + * "short", + * "byte", + * "float", + * "double", + * "boolean", + * "binary", + * "date", + * "timestamp", + * "decimal(, )" + + Args: + data_type: string representation of the data type + """ + def __init__(self, data_type: str) -> None: ... type: str + """ The inner type + """ def to_json(self) -> str: ... @staticmethod - def from_json(json: str) -> "PrimitiveType": ... - def to_pyarrow(self) -> pa.DataType: ... + def from_json(json: str) -> PrimitiveType: + """ Create a PrimitiveType from a JSON string + + The JSON representation for a primitive type is just a quoted string: `PrimitiveType.from_json('"integer"')` + + Args: + json: A JSON string + + Returns a [PrimitiveType][deltalake.schema.PrimitiveType] type + """ + + def to_pyarrow(self) -> pyarrow.DataType: + """ Get the equivalent PyArrow type (pyarrow.DataType) + """ + @staticmethod - def from_pyarrow(type: pa.DataType) -> "PrimitiveType": ... + def from_pyarrow(type: pyarrow.DataType) -> PrimitiveType: + """ Create a [PrimitiveType][deltalake.schema.PrimitiveType] from a PyArrow type + + Will raise `TypeError` if the PyArrow type is not a primitive type. + + Args: + type: A PyArrow [DataType][pyarrow.DataType] type + + Returns: a [PrimitiveType][deltalake.schema.PrimitiveType] type + """ + class ArrayType: + """ An Array (List) DataType + + Can either pass the element type explicitly or can pass a string + if it is a primitive type: + ``` + ArrayType(PrimitiveType("integer")) + # Returns ArrayType(PrimitiveType("integer"), contains_null=True) + + ArrayType("integer", contains_null=False) + # Returns ArrayType(PrimitiveType("integer"), contains_null=False) + ``` + """ + def __init__( self, element_type: DataType, *, contains_null: bool = True ) -> None: ... type: Literal["array"] + """ The string "array" + """ + element_type: DataType + """ The type of the element, of type: + Union[ + [PrimitiveType][deltalake.schema.PrimitiveType], + [ArrayType][deltalake.schema.ArrayType], + [MapType][deltalake.schema.MapType], + [StructType][deltalake.schema.StructType] + ] + """ + contains_null: bool + """ Whether the arrays may contain null values + """ + + def to_json(self) -> str: + """ Get the JSON string representation of the type. + """ - def to_json(self) -> str: ... @staticmethod - def from_json(json: str) -> "ArrayType": ... 
+ def from_json(json: str) -> "ArrayType": + """ Create an ArrayType from a JSON string + + The JSON representation for an array type is an object with `type` (set to + `"array"`), `elementType`, and `containsNull`: + ``` + ArrayType.from_json( + '''{ + "type": "array", + "elementType": "integer", + "containsNull": false + }''' + ) + # Returns ArrayType(PrimitiveType("integer"), contains_null=False) + ``` + + Args: + json: A JSON string + + Returns: an [ArrayType][deltalake.schema.ArrayType] type + """ + def to_pyarrow( self, - ) -> pa.ListType: ... + ) -> pyarrow.ListType: + """ Get the equivalent PyArrow type. + """ + @staticmethod - def from_pyarrow(type: pa.ListType) -> "ArrayType": ... + def from_pyarrow(type: pyarrow.ListType) -> ArrayType: + """ Create an ArrayType from a pyarrow.ListType. + + Will raise `TypeError` if a different PyArrow DataType is provided. + + Args: + type: The PyArrow [ListType][pyarrow.ListType] + + Returns: an [ArrayType][deltalake.schema.ArrayType] type + """ class MapType: + """ A map data type + + `key_type` and `value_type` should be [PrimitiveType][deltalake.schema.PrimitiveType], [ArrayType][deltalake.schema.ArrayType], + or [StructType][deltalake.schema.StructType]. A string can also be passed, which will be + parsed as a primitive type: + + ``` + MapType(PrimitiveType("integer"), PrimitiveType("string")) + # Returns MapType(PrimitiveType("integer"), PrimitiveType("string"), value_contains_null=True) + + MapType("integer", "string", value_contains_null=False) + # Returns MapType(PrimitiveType("integer"), PrimitiveType("string"), value_contains_null=False) + ``` + """ + def __init__( self, key_type: DataType, @@ -163,17 +293,91 @@ class MapType: ) -> None: ... type: Literal["map"] key_type: DataType + """ The type of the keys, of type: + Union[ + [PrimitiveType][deltalake.schema.PrimitiveType], + [ArrayType][deltalake.schema.ArrayType], + [MapType][deltalake.schema.MapType], + [StructType][deltalake.schema.StructType] + ] + """ + value_type: DataType + """The type of the values, of type: + Union[ + [PrimitiveType][deltalake.schema.PrimitiveType], + [ArrayType][deltalake.schema.ArrayType], + [MapType][deltalake.schema.MapType], + [StructType][deltalake.schema.StructType] + ] + """ + value_contains_null: bool + """ Whether the values in a map may be null + """ + + def to_json(self) -> str: + """ Get JSON string representation of map type. + """ - def to_json(self) -> str: ... @staticmethod - def from_json(json: str) -> "MapType": ... - def to_pyarrow(self) -> pa.MapType: ... + def from_json(json: str) -> MapType: + """ Create a MapType from a JSON string + + The JSON representation for a map type is an object with `type` (set to `map`), + `keyType`, `valueType`, and `valueContainsNull`: + ``` + MapType.from_json( + '''{ + "type": "map", + "keyType": "integer", + "valueType": "string", + "valueContainsNull": true + }''' + ) + # Returns MapType(PrimitiveType("integer"), PrimitiveType("string"), value_contains_null=True) + ``` + + Args: + json: A JSON string + + Returns: a [MapType][deltalake.schema.MapType] type + """ + + def to_pyarrow(self) -> pyarrow.MapType: + """ Get the equivalent PyArrow data type. + """ + @staticmethod - def from_pyarrow(type: pa.MapType) -> "MapType": ... + def from_pyarrow(type: pyarrow.MapType) -> MapType: + """ Create a MapType from a PyArrow MapType. + + Will raise `TypeError` if passed a different type. 
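+
+        A rough usage sketch (assuming `import pyarrow as pa`; the printed repr is indicative):
+        ```
+        MapType.from_pyarrow(pa.map_(pa.int32(), pa.string()))
+        # Returns MapType(PrimitiveType("integer"), PrimitiveType("string"), value_contains_null=True)
+        ```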
+ + Args: + type: the PyArrow MapType + + Returns: a [MapType][deltalake.schema.MapType] type + """ class Field: + """ A field in a Delta StructType or Schema + + Can create with just a name and a type: + ``` + Field("my_int_col", "integer") + # Returns Field("my_int_col", PrimitiveType("integer"), nullable=True, metadata=None) + ``` + + Can also attach metadata to the field. Metadata should be a dictionary with + string keys and JSON-serializable values (str, list, int, float, dict): + + ``` + Field("my_col", "integer", metadata={"custom_metadata": {"test": 2}}) + # Returns Field("my_col", PrimitiveType("integer"), nullable=True, metadata={"custom_metadata": {"test": 2}}) + ``` + """ + def __init__( self, name: str, @@ -183,40 +387,200 @@ class Field: metadata: Optional[Dict[str, Any]] = None, ) -> None: ... name: str + """ The name of the field + """ + type: DataType + """ The type of the field, of type: + Union[ + [PrimitiveType][deltalake.schema.PrimitiveType], + [ArrayType][deltalake.schema.ArrayType], + [MapType][deltalake.schema.MapType], + [StructType][deltalake.schema.StructType] + ] + """ + nullable: bool + """ Whether there may be null values in the field + """ + metadata: Dict[str, Any] + """ The metadata of the field + """ + + def to_json(self) -> str: + """ Get the field as JSON string. + ``` + Field("col", "integer").to_json() + # Returns '{"name":"col","type":"integer","nullable":true,"metadata":{}}' + ``` + """ - def to_json(self) -> str: ... @staticmethod - def from_json(json: str) -> "Field": ... - def to_pyarrow(self) -> pa.Field: ... + def from_json(json: str) -> Field: + """ Create a Field from a JSON string. + + Args: + json: the JSON string. + + Returns: Field + + Example: + ``` + Field.from_json('''{ + "name": "col", + "type": "integer", + "nullable": true, + "metadata": {} + }''' + ) + # Returns Field(col, PrimitiveType("integer"), nullable=True) + ``` + """ + + def to_pyarrow(self) -> pyarrow.Field: + """ Convert to an equivalent PyArrow field + Note: This currently doesn't preserve field metadata. + + Returns: a [pyarrow.Field][pyarrow.Field] type + """ + @staticmethod - def from_pyarrow(type: pa.Field) -> "Field": ... + def from_pyarrow(field: pyarrow.Field) -> Field: + """Create a Field from a PyArrow field + Note: This currently doesn't preserve field metadata. + + Args: + field: a PyArrow Field type + + Returns: a [Field][deltalake.schema.Field] type + """ class StructType: + """ A struct datatype, containing one or more subfields + + Example: + + Create with a list of :class:`Field`: + ``` + StructType([Field("x", "integer"), Field("y", "string")]) + # Creates: StructType([Field(x, PrimitiveType("integer"), nullable=True), Field(y, PrimitiveType("string"), nullable=True)]) + ``` + """ + def __init__(self, fields: List[Field]) -> None: ... type: Literal["struct"] fields: List[Field] + """ The fields within the struct + """ + + def to_json(self) -> str: + """ Get the JSON representation of the type. + ``` + StructType([Field("x", "integer")]).to_json() + # Returns '{"type":"struct","fields":[{"name":"x","type":"integer","nullable":true,"metadata":{}}]}' + ``` + """ - def to_json(self) -> str: ... @staticmethod - def from_json(json: str) -> "StructType": ... - def to_pyarrow(self) -> pa.StructType: ... + def from_json(json: str) -> StructType: + """ Create a new StructType from a JSON string. 
+ ``` + StructType.from_json( + '''{ + "type": "struct", + "fields": [{"name": "x", "type": "integer", "nullable": true, "metadata": {}}] + }''' + ) + # Returns StructType([Field(x, PrimitiveType("integer"), nullable=True)]) + ``` + + Args: + json: a JSON string + + Returns: a [StructType][deltalake.schema.StructType] type + """ + + def to_pyarrow(self) -> pyarrow.StructType: + """ Get the equivalent PyArrow StructType + + Returns: a PyArrow [StructType][pyarrow.StructType] type + """ + @staticmethod - def from_pyarrow(type: pa.StructType) -> "StructType": ... + def from_pyarrow(type: pyarrow.StructType) -> StructType: + """ Create a new StructType from a PyArrow struct type. + + Will raise `TypeError` if a different data type is provided. + + Args: + type: a PyArrow struct type. + + Returns: a [StructType][deltalake.schema.StructType] type + """ class Schema: def __init__(self, fields: List[Field]) -> None: ... fields: List[Field] + invariants: List[Tuple[str, str]] + """ The list of invariants on the table. Each invarint is a tuple of strings. The first string is the + field path and the second is the SQL of the invariant. + """ + + def json(self): + """ + !!! warning "DEPRECATED" + Convert to JSON dictionary representation + """ + def to_json(self) -> str: + """ Get the JSON string representation of the Schema. + A schema has the same JSON format as a StructType. + ``` + Schema([Field("x", "integer")]).to_json() + # Returns '{"type":"struct","fields":[{"name":"x","type":"integer","nullable":true,"metadata":{}}]}' + ``` + Returns: a JSON string + """ - def to_json(self) -> str: ... @staticmethod - def from_json(json: str) -> "Schema": ... - def to_pyarrow(self, as_large_types: bool = False) -> pa.Schema: ... + def from_json(json: str) -> Schema: + """ Create a new Schema from a JSON string. + + A schema has the same JSON format as a StructType. + ``` + Schema.from_json('''{ + "type": "struct", + "fields": [{"name": "x", "type": "integer", "nullable": true, "metadata": {}}] + } + )''' + # Returns Schema([Field(x, PrimitiveType("integer"), nullable=True)]) + ``` + + Args: + json: a JSON string + """ + + def to_pyarrow(self, as_large_types: bool = False) -> pyarrow.Schema: + """ Return equivalent PyArrow schema + + Args: + as_large_types: get schema with all variable size types (list, binary, string) as large variants (with int64 indices). This is for compatibility with systems like Polars that only support the large versions of Arrow types. + + Returns: + a PyArrow [Schema][pyarrow.Schema] type + """ @staticmethod - def from_pyarrow(type: pa.Schema) -> "Schema": ... + def from_pyarrow(type: pyarrow.Schema) -> Schema: + """ Create a [Schema][deltalake.schema.Schema] from a PyArrow Schema type + + Will raise `TypeError` if the PyArrow type is not a primitive type. + + Args: + type: A PyArrow [Schema][pyarrow.Schema] type + + Returns: a [Schema][deltalake.schema.Schema] type + """ class ObjectInputFile: @property @@ -247,7 +611,7 @@ class ObjectOutputStream: def write(self, data: bytes) -> int: ... class DeltaFileSystemHandler: - """Implementation of pyarrow.fs.FileSystemHandler for use with pyarrow.fs.PyFileSystem""" + """Implementation of [pyarrow.fs.FileSystemHandler][pyarrow.fs.FileSystemHandler] for use with [pyarrow.fs.PyFileSystem][pyarrow.fs.PyFileSystem]""" def __init__( self, @@ -312,7 +676,7 @@ class DeltaFileSystemHandler: class DeltaDataChecker: def __init__(self, invariants: List[Tuple[str, str]]) -> None: ... - def check_batch(self, batch: pa.RecordBatch) -> None: ... 
+ def check_batch(self, batch: pyarrow.RecordBatch) -> None: ... class DeltaError(Exception): """The base class for Delta-specific errors.""" diff --git a/python/deltalake/fs.py b/python/deltalake/fs.py index efb92fe239..12e33f40e3 100644 --- a/python/deltalake/fs.py +++ b/python/deltalake/fs.py @@ -16,8 +16,11 @@ def open_input_file(self, path: str) -> pa.PythonFile: """ Open an input file for random access reading. - :param source: The source to open for reading. - :return: NativeFile + Args: + path: The source to open for reading. + + Returns: + NativeFile """ return pa.PythonFile(DeltaFileSystemHandler.open_input_file(self, path)) @@ -25,8 +28,11 @@ def open_input_stream(self, path: str) -> pa.PythonFile: """ Open an input stream for sequential reading. - :param source: The source to open for reading. - :return: NativeFile + Args: + path: The source to open for reading. + + Returns: + NativeFile """ return pa.PythonFile(DeltaFileSystemHandler.open_input_file(self, path)) @@ -38,9 +44,12 @@ def open_output_stream( If the target already exists, existing data is truncated. - :param path: The source to open for writing. - :param metadata: If not None, a mapping of string keys to string values. - :return: NativeFile + Args: + path: The source to open for writing. + metadata: If not None, a mapping of string keys to string values. + + Returns: + NativeFile """ return pa.PythonFile( DeltaFileSystemHandler.open_output_stream(self, path, metadata) @@ -50,8 +59,11 @@ def get_file_info_selector(self, selector: FileSelector) -> List[FileInfo]: # t """ Get info for the files defined by FileSelector. - :param selector: FileSelector object - :return: list of file info objects + Args: + selector: FileSelector object + + Returns: + list of file info objects """ return DeltaFileSystemHandler.get_file_info_selector( self, selector.base_dir, selector.allow_not_found, selector.recursive diff --git a/python/deltalake/schema.py b/python/deltalake/schema.py index ccc0bbe65b..d529e01e14 100644 --- a/python/deltalake/schema.py +++ b/python/deltalake/schema.py @@ -24,8 +24,11 @@ def delta_arrow_schema_from_pandas( Infers the schema for the delta table from the Pandas DataFrame. Necessary because of issues such as: https://github.com/delta-io/delta-rs/issues/686 - :param data: Data to write. - :return: A PyArrow Table and the inferred schema for the Delta Table + Args: + data: Data to write. + + Returns: + A PyArrow Table and the inferred schema for the Delta Table """ table = pa.Table.from_pandas(data) diff --git a/python/deltalake/table.py b/python/deltalake/table.py index 80a48f619e..64daeea93a 100644 --- a/python/deltalake/table.py +++ b/python/deltalake/table.py @@ -150,7 +150,7 @@ def _check_dnf( def _convert_single_predicate(column: str, op: str, value: Any) -> Expression: """ - Convert given `tuple` to `pyarrow.dataset.Expression`. + Convert given `tuple` to [pyarrow.dataset.Expression]. """ import pyarrow.dataset as ds @@ -179,7 +179,7 @@ def _convert_single_predicate(column: str, op: str, value: Any) -> Expression: def _filters_to_expression(filters: FilterType) -> Expression: """ - Check if filters are well-formed and convert to an ``pyarrow.dataset.Expression``. + Check if filters are well-formed and convert to an [pyarrow.dataset.Expression]. """ if isinstance(filters[0][0], str): # We have encountered the situation where we have one nesting level too few: @@ -208,16 +208,18 @@ def _filters_to_expression(filters: FilterType) -> Expression: The supported type for value is str. 
Use empty string `''` for Null partition value. Examples: +``` ("x", "=", "a") ("x", "!=", "a") ("y", "in", ["a", "b", "c"]) ("z", "not in", ["a","b"]) +``` """ @dataclass(init=False) class DeltaTable: - """Create a DeltaTable instance.""" + """Represents a Delta Table""" def __init__( self, @@ -232,13 +234,14 @@ def __init__( Multiple StorageBackends are currently supported: AWS S3, Azure Data Lake Storage Gen2, Google Cloud Storage (GCS) and local URI. Depending on the storage backend used, you could provide options values using the ``storage_options`` parameter. - :param table_uri: the path of the DeltaTable - :param version: version of the DeltaTable - :param storage_options: a dictionary of the options to use for the storage backend - :param without_files: If True, will load table without tracking files. - Some append-only applications might have no need of tracking any files. So, the - DeltaTable will be loaded with a significant memory reduction. - :param log_buffer_size: Number of files to buffer when reading the commit log. A positive integer. + Args: + table_uri: the path of the DeltaTable + version: version of the DeltaTable + storage_options: a dictionary of the options to use for the storage backend + without_files: If True, will load table without tracking files. + Some append-only applications might have no need of tracking any files. So, the + DeltaTable will be loaded with a significant memory reduction. + log_buffer_size: Number of files to buffer when reading the commit log. A positive integer. Setting a value greater than 1 results in concurrent calls to the storage api. This can decrease latency if there are many files in the log since the last checkpoint, but will also increase memory usage. Possible rate limits of the storage backend should @@ -268,12 +271,13 @@ def from_data_catalog( """ Create the Delta Table from a Data Catalog. - :param data_catalog: the Catalog to use for getting the storage location of the Delta Table - :param database_name: the database name inside the Data Catalog - :param table_name: the table name inside the Data Catalog - :param data_catalog_id: the identifier of the Data Catalog - :param version: version of the DeltaTable - :param log_buffer_size: Number of files to buffer when reading the commit log. A positive integer. + Args: + data_catalog: the Catalog to use for getting the storage location of the Delta Table + database_name: the database name inside the Data Catalog + table_name: the table name inside the Data Catalog + data_catalog_id: the identifier of the Data Catalog + version: version of the DeltaTable + log_buffer_size: Number of files to buffer when reading the commit log. A positive integer. Setting a value greater than 1 results in concurrent calls to the storage api. This can decrease latency if there are many files in the log since the last checkpoint, but will also increase memory usage. Possible rate limits of the storage backend should @@ -293,27 +297,46 @@ def version(self) -> int: """ Get the version of the DeltaTable. - :return: The current version of the DeltaTable + Returns: + The current version of the DeltaTable """ return self._table.version() def files( self, partition_filters: Optional[List[Tuple[str, str, Any]]] = None ) -> List[str]: - return self._table.files(self.__stringify_partition_values(partition_filters)) + """ + Get the .parquet files of the DeltaTable. - files.__doc__ = f""" -Get the .parquet files of the DeltaTable. 
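+
+        A rough sketch of the call (the returned file names are illustrative):
+        ```
+        DeltaTable("tmp").files()
+        # ['0-91820cbf-f698-45fb-886d-5d5f5669530b-0.parquet']
+        ```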
+ The paths are as they are saved in the delta log, which may either be + relative to the table root or absolute URIs. -The paths are as they are saved in the delta log, which may either be -relative to the table root or absolute URIs. + Args: + partition_filters: the partition filters that will be used for + getting the matched files + + Returns: + list of the .parquet files referenced for the current version of the DeltaTable + + Predicates are expressed in disjunctive normal form (DNF), like [("x", "=", "a"), ...]. + DNF allows arbitrary boolean logical combinations of single partition predicates. + The innermost tuples each describe a single partition predicate. The list of inner + predicates is interpreted as a conjunction (AND), forming a more selective and + multiple partition predicates. Each tuple has format: (key, op, value) and compares + the key with the value. The supported op are: `=`, `!=`, `in`, and `not in`. If + the op is in or not in, the value must be a collection such as a list, a set or a tuple. + The supported type for value is str. Use empty string `''` for Null partition value. + + Examples: + ``` + ("x", "=", "a") + ("x", "!=", "a") + ("y", "in", ["a", "b", "c"]) + ("z", "not in", ["a","b"]) + ``` + """ + return self._table.files(self.__stringify_partition_values(partition_filters)) -:param partition_filters: the partition filters that will be used for - getting the matched files -:return: list of the .parquet files referenced for the current version - of the DeltaTable -{_DNF_filter_doc} - """ def files_by_partitions( self, partition_filters: List[Tuple[str, str, Any]] @@ -321,8 +344,14 @@ def files_by_partitions( """ Get the files that match a given list of partitions filters. - .. deprecated:: 0.7.0 - Use :meth:`file_uris` instead. + !!! warning "DEPRECATED 0.7.0" + Use [file_uris][deltalake.table.DeltaTable.file_uris] instead. + + Args: + partition_filters: the partition filters that will be used for getting the matched files + + Returns: + list of the .parquet files after applying the partition filters referenced for the current version of the DeltaTable. Partitions which do not match the filter predicate will be removed from scanned data. Predicates are expressed in disjunctive normal form (DNF), like [("x", "=", "a"), ...]. @@ -335,13 +364,12 @@ def files_by_partitions( The supported type for value is str. Use empty string `''` for Null partition value. Examples: + ``` ("x", "=", "a") ("x", "!=", "a") ("y", "in", ["a", "b", "c"]) ("z", "not in", ["a","b"]) - - :param partition_filters: the partition filters that will be used for getting the matched files - :return: list of the .parquet files after applying the partition filters referenced for the current version of the DeltaTable. + ``` """ warnings.warn( "Call to deprecated method files_by_partitions. Please use file_uris instead.", @@ -353,29 +381,50 @@ def files_by_partitions( def file_uris( self, partition_filters: Optional[List[Tuple[str, str, Any]]] = None ) -> List[str]: - return self._table.file_uris( - self.__stringify_partition_values(partition_filters) - ) + """ + Get the list of files as absolute URIs, including the scheme (e.g. "s3://"). - file_uris.__doc__ = f""" -Get the list of files as absolute URIs, including the scheme (e.g. "s3://"). + Local files will be just plain absolute paths, without a scheme. (That is, + no 'file://' prefix.) -Local files will be just plain absolute paths, without a scheme. (That is, -no 'file://' prefix.) 
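+
+        A rough sketch (the bucket and file names are illustrative):
+        ```
+        DeltaTable("s3://bucket/tmp").file_uris()
+        # ['s3://bucket/tmp/0-91820cbf-f698-45fb-886d-5d5f5669530b-0.parquet']
+        ```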
+ Use the partition_filters parameter to retrieve a subset of files that match the + given filters. -Use the partition_filters parameter to retrieve a subset of files that match the -given filters. + Args: + partition_filters: the partition filters that will be used for getting the matched files -:param partition_filters: the partition filters that will be used for getting the matched files -:return: list of the .parquet files with an absolute URI referenced for the current version of the DeltaTable -{_DNF_filter_doc} - """ + Returns: + list of the .parquet files with an absolute URI referenced for the current version of the DeltaTable + Predicates are expressed in disjunctive normal form (DNF), like [("x", "=", "a"), ...]. + DNF allows arbitrary boolean logical combinations of single partition predicates. + The innermost tuples each describe a single partition predicate. The list of inner + predicates is interpreted as a conjunction (AND), forming a more selective and + multiple partition predicates. Each tuple has format: (key, op, value) and compares + the key with the value. The supported op are: `=`, `!=`, `in`, and `not in`. If + the op is in or not in, the value must be a collection such as a list, a set or a tuple. + The supported type for value is str. Use empty string `''` for Null partition value. + + Examples: + ``` + ("x", "=", "a") + ("x", "!=", "a") + ("y", "in", ["a", "b", "c"]) + ("z", "not in", ["a","b"]) + ``` + """ + return self._table.file_uris( + self.__stringify_partition_values(partition_filters) + ) + + file_uris.__doc__ = "" + def load_version(self, version: int) -> None: """ Load a DeltaTable with a specified version. - :param version: the identifier of the version of the DeltaTable to load + Args: + version: the identifier of the version of the DeltaTable to load """ self._table.load_version(version) @@ -384,12 +433,15 @@ def load_with_datetime(self, datetime_string: str) -> None: Time travel Delta table to the latest version that's created at or before provided `datetime_string` argument. The `datetime_string` argument should be an RFC 3339 and ISO 8601 date and time string. - Examples: - `2018-01-26T18:30:09Z` - `2018-12-19T16:39:57-08:00` - `2018-01-26T18:30:09.453+00:00` + Args: + datetime_string: the identifier of the datetime point of the DeltaTable to load - :param datetime_string: the identifier of the datetime point of the DeltaTable to load + Examples: + ``` + "2018-01-26T18:30:09Z" + "2018-12-19T16:39:57-08:00" + "2018-01-26T18:30:09.453+00:00" + ``` """ self._table.load_with_datetime(datetime_string) @@ -401,7 +453,8 @@ def schema(self) -> Schema: """ Get the current schema of the DeltaTable. - :return: the current Schema registered in the transaction log + Returns: + the current Schema registered in the transaction log """ return self._table.schema @@ -409,7 +462,8 @@ def metadata(self) -> Metadata: """ Get the current metadata of the DeltaTable. - :return: the current Metadata registered in the transaction log + Returns: + the current Metadata registered in the transaction log """ return self._metadata @@ -417,7 +471,8 @@ def protocol(self) -> ProtocolVersions: """ Get the reader and writer protocol versions of the DeltaTable. 
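+
+        A rough sketch (the reported versions depend on the table):
+        ```
+        DeltaTable("tmp").protocol()
+        # ProtocolVersions(min_reader_version=1, min_writer_version=2)
+        ```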
- :return: the current ProtocolVersions registered in the transaction log + Returns: + the current ProtocolVersions registered in the transaction log """ return ProtocolVersions(*self._table.protocol_versions()) @@ -426,8 +481,11 @@ def history(self, limit: Optional[int] = None) -> List[Dict[str, Any]]: Run the history command on the DeltaTable. The operations are returned in reverse chronological order. - :param limit: the commit info limit to return - :return: list of the commit infos registered in the transaction log + Args: + limit: the commit info limit to return + + Returns: + list of the commit infos registered in the transaction log """ def _backwards_enumerate( @@ -458,10 +516,13 @@ def vacuum( """ Run the Vacuum command on the Delta Table: list and delete files no longer referenced by the Delta table and are older than the retention threshold. - :param retention_hours: the retention threshold in hours, if none then the value from `configuration.deletedFileRetentionDuration` is used or default of 1 week otherwise. - :param dry_run: when activated, list only the files, delete otherwise - :param enforce_retention_duration: when disabled, accepts retention hours smaller than the value from `configuration.deletedFileRetentionDuration`. - :return: the list of files no longer referenced by the Delta Table and are older than the retention threshold. + Args: + retention_hours: the retention threshold in hours, if none then the value from `configuration.deletedFileRetentionDuration` is used or default of 1 week otherwise. + dry_run: when activated, list only the files, delete otherwise + enforce_retention_duration: when disabled, accepts retention hours smaller than the value from `configuration.deletedFileRetentionDuration`. + + Returns: + the list of files no longer referenced by the Delta Table and are older than the retention threshold. """ if retention_hours: if retention_hours < 0: @@ -482,36 +543,42 @@ def update( ) -> Dict[str, Any]: """UPDATE records in the Delta Table that matches an optional predicate. - :param updates: a mapping of column name to update SQL expression. - :param predicate: a logical expression, defaults to None - :writer_properties: Pass writer properties to the Rust parquet writer, see options https://arrow.apache.org/rust/parquet/file/properties/struct.WriterProperties.html, - only the fields: data_page_size_limit, dictionary_page_size_limit, data_page_row_count_limit, write_batch_size, max_row_group_size are supported. - :error_on_type_mismatch: specify if merge will return error if data types are mismatching :default = True - :return: the metrics from delete + Args: + updates: a mapping of column name to update SQL expression. + predicate: a logical expression, defaults to None + writer_properties: Pass writer properties to the Rust parquet writer, see options https://arrow.apache.org/rust/parquet/file/properties/struct.WriterProperties.html, + only the following fields are supported: `data_page_size_limit`, `dictionary_page_size_limit`, `data_page_row_count_limit`, `write_batch_size`, `max_row_group_size`. + error_on_type_mismatch: specify if merge will return error if data types are mismatching :default = True + + Returns: + the metrics from delete Examples: Update some row values with SQL predicate. This is equivalent to - ``UPDATE table SET deleted = true WHERE id = '5'`` - - >>> from deltalake import DeltaTable - >>> dt = DeltaTable("tmp") - >>> dt.update(predicate="id = '5'", - ... updates = { - ... "deleted": True, - ... } - ... 
) + `UPDATE table SET deleted = true WHERE id = '5'` + + ``` + from deltalake import DeltaTable + dt = DeltaTable("tmp") + dt.update( + predicate="id = '5'", + updates = {"deleted": True,} + ) + ``` Update all row values. This is equivalent to - ``UPDATE table SET id = concat(id, '_old')``. - >>> from deltalake import DeltaTable - >>> dt = DeltaTable("tmp") - >>> dt.update(updates = { - ... "deleted": True, - ... "id": "concat(id, '_old')" - ... } - ... ) - + `UPDATE table SET id = concat(id, '_old')`. + ``` + from deltalake import DeltaTable + dt = DeltaTable("tmp") + dt.update( + updates = { + "deleted": True, + "id": "concat(id, '_old')" + } + ) + ``` """ metrics = self._table.update( @@ -529,9 +596,11 @@ def pyarrow_schema(self) -> pyarrow.Schema: """ Get the current schema of the DeltaTable with the Parquet PyArrow format. - DEPRECATED: use DeltaTable.schema().to_pyarrow() instead. + !!! warning "DEPRECATED" + use DeltaTable.schema().to_pyarrow() instead. - :return: the current Schema with the Parquet PyArrow format + Returns: + the current Schema with the Parquet PyArrow format """ warnings.warn( "DeltaTable.pyarrow_schema() is deprecated. Use DeltaTable.schema().to_pyarrow() instead.", @@ -550,10 +619,13 @@ def restore( """ Run the Restore command on the Delta Table: restore table to a given version or datetime. - :param target: the expected version will restore, which represented by int, date str or datetime. - :param ignore_missing_files: whether the operation carry on when some data files missing. - :param protocol_downgrade_allowed: whether the operation when protocol version upgraded. - :return: the metrics from restore. + Args: + target: the expected version will restore, which represented by int, date str or datetime. + ignore_missing_files: whether the operation carry on when some data files missing. + protocol_downgrade_allowed: whether the operation when protocol version upgraded. + + Returns: + the metrics from restore. """ if isinstance(target, datetime): metrics = self._table.restore( @@ -578,11 +650,15 @@ def to_pyarrow_dataset( """ Build a PyArrow Dataset using data from the DeltaTable. - :param partitions: A list of partition filters, see help(DeltaTable.files_by_partitions) for filter syntax - :param filesystem: A concrete implementation of the Pyarrow FileSystem or a fsspec-compatible interface. If None, the first file path will be used to determine the right FileSystem - :param parquet_read_options: Optional read options for Parquet. Use this to handle INT96 to timestamp conversion for edge cases like 0001-01-01 or 9999-12-31 + Args: + partitions: A list of partition filters, see help(DeltaTable.files_by_partitions) for filter syntax + filesystem: A concrete implementation of the Pyarrow FileSystem or a fsspec-compatible interface. If None, the first file path will be used to determine the right FileSystem + parquet_read_options: Optional read options for Parquet. Use this to handle INT96 to timestamp conversion for edge cases like 0001-01-01 or 9999-12-31 + More info: https://arrow.apache.org/docs/python/generated/pyarrow.dataset.ParquetReadOptions.html - :return: the PyArrow dataset in PyArrow + + Returns: + the PyArrow dataset in PyArrow """ if self.protocol().min_reader_version > MAX_SUPPORTED_READER_VERSION: raise DeltaProtocolError( @@ -640,10 +716,11 @@ def to_pyarrow_table( """ Build a PyArrow Table using data from the DeltaTable. 
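+
+        A minimal sketch (the column name and filter value are illustrative):
+        ```
+        dt = DeltaTable("tmp")
+        dt.to_pyarrow_table(columns=["id"], filters=[("id", "=", 1)])
+        ```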
- :param partitions: A list of partition filters, see help(DeltaTable.files_by_partitions) for filter syntax - :param columns: The columns to project. This can be a list of column names to include (order and duplicates will be preserved) - :param filesystem: A concrete implementation of the Pyarrow FileSystem or a fsspec-compatible interface. If None, the first file path will be used to determine the right FileSystem - :param filters: A disjunctive normal form (DNF) predicate for filtering rows. If you pass a filter you do not need to pass ``partitions`` + Args: + partitions: A list of partition filters, see help(DeltaTable.files_by_partitions) for filter syntax + columns: The columns to project. This can be a list of column names to include (order and duplicates will be preserved) + filesystem: A concrete implementation of the Pyarrow FileSystem or a fsspec-compatible interface. If None, the first file path will be used to determine the right FileSystem + filters: A disjunctive normal form (DNF) predicate for filtering rows. If you pass a filter you do not need to pass ``partitions`` """ if filters is not None: filters = _filters_to_expression(filters) @@ -661,10 +738,11 @@ def to_pandas( """ Build a pandas dataframe using data from the DeltaTable. - :param partitions: A list of partition filters, see help(DeltaTable.files_by_partitions) for filter syntax - :param columns: The columns to project. This can be a list of column names to include (order and duplicates will be preserved) - :param filesystem: A concrete implementation of the Pyarrow FileSystem or a fsspec-compatible interface. If None, the first file path will be used to determine the right FileSystem - :param filters: A disjunctive normal form (DNF) predicate for filtering rows. If you pass a filter you do not need to pass ``partitions`` + Args: + partitions: A list of partition filters, see help(DeltaTable.files_by_partitions) for filter syntax + columns: The columns to project. This can be a list of column names to include (order and duplicates will be preserved) + filesystem: A concrete implementation of the Pyarrow FileSystem or a fsspec-compatible interface. If None, the first file path will be used to determine the right FileSystem + filters: A disjunctive normal form (DNF) predicate for filtering rows. If you pass a filter you do not need to pass ``partitions`` """ return self.to_pyarrow_table( partitions=partitions, @@ -705,30 +783,40 @@ def get_add_actions(self, flatten: bool = False) -> pyarrow.RecordBatch: Add actions represent the files that currently make up the table. This data is a low-level representation parsed from the transaction log. - :param flatten: whether to flatten the schema. Partition values columns are - given the prefix `partition.`, statistics (null_count, min, and max) are - given the prefix `null_count.`, `min.`, and `max.`, and tags the - prefix `tags.`. Nested field names are concatenated with `.`. - - :returns: a PyArrow RecordBatch containing the add action data. - - Examples: - - >>> from deltalake import DeltaTable, write_deltalake - >>> import pyarrow as pa - >>> data = pa.table({"x": [1, 2, 3], "y": [4, 5, 6]}) - >>> write_deltalake("tmp", data, partition_by=["x"]) - >>> dt = DeltaTable("tmp") - >>> dt.get_add_actions().to_pandas() + Args: + flatten: whether to flatten the schema. Partition values columns are + given the prefix `partition.`, statistics (null_count, min, and max) are + given the prefix `null_count.`, `min.`, and `max.`, and tags the + prefix `tags.`. 
Nested field names are concatenated with `.`. + + Returns: + a PyArrow RecordBatch containing the add action data. + + Example: + ``` + from deltalake import DeltaTable, write_deltalake + import pyarrow as pa + data = pa.table({"x": [1, 2, 3], "y": [4, 5, 6]}) + write_deltalake("tmp", data, partition_by=["x"]) + dt = DeltaTable("tmp") + dt.get_add_actions().to_pandas() + ``` + + ``` path size_bytes modification_time data_change partition_values num_records null_count min max 0 x=2/0-91820cbf-f698-45fb-886d-5d5f5669530b-0.p... 565 1970-01-20 08:40:08.071 True {'x': 2} 1 {'y': 0} {'y': 5} {'y': 5} 1 x=3/0-91820cbf-f698-45fb-886d-5d5f5669530b-0.p... 565 1970-01-20 08:40:08.071 True {'x': 3} 1 {'y': 0} {'y': 6} {'y': 6} 2 x=1/0-91820cbf-f698-45fb-886d-5d5f5669530b-0.p... 565 1970-01-20 08:40:08.071 True {'x': 1} 1 {'y': 0} {'y': 4} {'y': 4} - >>> dt.get_add_actions(flatten=True).to_pandas() + ``` + ``` + dt.get_add_actions(flatten=True).to_pandas() + ``` + ``` path size_bytes modification_time data_change partition.x num_records null_count.y min.y max.y 0 x=2/0-91820cbf-f698-45fb-886d-5d5f5669530b-0.p... 565 1970-01-20 08:40:08.071 True 2 1 0 5 5 1 x=3/0-91820cbf-f698-45fb-886d-5d5f5669530b-0.p... 565 1970-01-20 08:40:08.071 True 3 1 0 6 6 2 x=1/0-91820cbf-f698-45fb-886d-5d5f5669530b-0.p... 565 1970-01-20 08:40:08.071 True 1 1 0 4 4 + ``` """ return self._table.get_add_actions(flatten) @@ -740,8 +828,11 @@ def delete(self, predicate: Optional[str] = None) -> Dict[str, Any]: that contain records that satisfy the predicate. Once files are determined they are rewritten without the records. - :param predicate: a SQL where clause. If not passed, will delete all rows. - :return: the metrics from delete. + Args: + predicate: a SQL where clause. If not passed, will delete all rows. + + Returns: + the metrics from delete. """ metrics = self._table.delete(predicate) return json.loads(metrics) @@ -760,8 +851,8 @@ def __call__( max_concurrent_tasks: Optional[int] = None, ) -> Dict[str, Any]: """ - .. deprecated:: 0.10.0 - Use :meth:`compact` instead, which has the same signature. + !!! warning "DEPRECATED 0.10.0" + Use [compact][deltalake.table.DeltaTable.compact] instead, which has the same signature. """ warnings.warn( @@ -788,26 +879,30 @@ def compact( If this operation happens concurrently with any operations other than append, it will fail. - :param partition_filters: the partition filters that will be used for getting the matched files - :param target_size: desired file size after bin-packing files, in bytes. If not - provided, will attempt to read the table configuration value ``delta.targetFileSize``. - If that value isn't set, will use default value of 256MB. - :param max_concurrent_tasks: the maximum number of concurrent tasks to use for - file compaction. Defaults to number of CPUs. More concurrent tasks can make compaction - faster, but will also use more memory. - :param min_commit_interval: minimum interval in seconds or as timedeltas before a new commit is - created. Interval is useful for long running executions. Set to 0 or timedelta(0), if you - want a commit per partition. - :return: the metrics from optimize - - Examples: - - Use a timedelta object to specify the seconds, minutes or hours of the interval. 
- >>> from deltalake import DeltaTable - >>> from datetime import timedelta - >>> dt = DeltaTable("tmp") - >>> time_delta = timedelta(minutes=10) - >>> dt.optimize.z_order(["timestamp"], min_commit_interval=time_delta) + Args: + partition_filters: the partition filters that will be used for getting the matched files + target_size: desired file size after bin-packing files, in bytes. If not + provided, will attempt to read the table configuration value ``delta.targetFileSize``. + If that value isn't set, will use default value of 256MB. + max_concurrent_tasks: the maximum number of concurrent tasks to use for + file compaction. Defaults to number of CPUs. More concurrent tasks can make compaction + faster, but will also use more memory. + min_commit_interval: minimum interval in seconds or as timedeltas before a new commit is + created. Interval is useful for long running executions. Set to 0 or timedelta(0), if you + want a commit per partition. + + Returns: + the metrics from optimize + + Example: + ``` + # Use a timedelta object to specify the seconds, minutes or hours of the interval. + from deltalake import DeltaTable + from datetime import timedelta + dt = DeltaTable("tmp") + time_delta = timedelta(minutes=10) + dt.optimize.z_order(["timestamp"], min_commit_interval=time_delta) + ``` """ if isinstance(min_commit_interval, timedelta): min_commit_interval = int(min_commit_interval.total_seconds()) @@ -832,28 +927,32 @@ def z_order( This also performs compaction, so the same parameters as compact() apply. - :param columns: the columns to use for Z-ordering. There must be at least one column. - :param partition_filters: the partition filters that will be used for getting the matched files - :param target_size: desired file size after bin-packing files, in bytes. If not - provided, will attempt to read the table configuration value ``delta.targetFileSize``. - If that value isn't set, will use default value of 256MB. - :param max_concurrent_tasks: the maximum number of concurrent tasks to use for - file compaction. Defaults to number of CPUs. More concurrent tasks can make compaction - faster, but will also use more memory. - :param max_spill_size: the maximum number of bytes to spill to disk. Defaults to 20GB. - :param min_commit_interval: minimum interval in seconds or as timedeltas before a new commit is - created. Interval is useful for long running executions. Set to 0 or timedelta(0), if you - want a commit per partition. - :return: the metrics from optimize - - Examples: - - Use a timedelta object to specify the seconds, minutes or hours of the interval. - >>> from deltalake import DeltaTable - >>> from datetime import timedelta - >>> dt = DeltaTable("tmp") - >>> time_delta = timedelta(minutes=10) - >>> dt.optimize.compact(min_commit_interval=time_delta) + Args: + columns: the columns to use for Z-ordering. There must be at least one column. + partition_filters: the partition filters that will be used for getting the matched files + target_size: desired file size after bin-packing files, in bytes. If not + provided, will attempt to read the table configuration value ``delta.targetFileSize``. + If that value isn't set, will use default value of 256MB. + max_concurrent_tasks: the maximum number of concurrent tasks to use for + file compaction. Defaults to number of CPUs. More concurrent tasks can make compaction + faster, but will also use more memory. + max_spill_size: the maximum number of bytes to spill to disk. Defaults to 20GB. 
+ min_commit_interval: minimum interval in seconds or as timedeltas before a new commit is + created. Interval is useful for long running executions. Set to 0 or timedelta(0), if you + want a commit per partition. + + Returns: + the metrics from optimize + + Example: + ``` + # Use a timedelta object to specify the seconds, minutes or hours of the interval. + from deltalake import DeltaTable + from datetime import timedelta + dt = DeltaTable("tmp") + time_delta = timedelta(minutes=10) + dt.optimize.compact(min_commit_interval=time_delta) + ``` """ if isinstance(min_commit_interval, timedelta): min_commit_interval = int(min_commit_interval.total_seconds()) diff --git a/python/deltalake/writer.py b/python/deltalake/writer.py index db399e857e..944c499dbc 100644 --- a/python/deltalake/writer.py +++ b/python/deltalake/writer.py @@ -104,45 +104,46 @@ def write_deltalake( Note that this function does NOT register this table in a data catalog. - :param table_or_uri: URI of a table or a DeltaTable object. - :param data: Data to write. If passing iterable, the schema must also be given. - :param schema: Optional schema to write. - :param partition_by: List of columns to partition the table by. Only required - when creating a new table. - :param filesystem: Optional filesystem to pass to PyArrow. If not provided will - be inferred from uri. The file system has to be rooted in the table root. - Use the pyarrow.fs.SubTreeFileSystem, to adopt the root of pyarrow file systems. - :param mode: How to handle existing data. Default is to error if table already exists. - If 'append', will add new data. - If 'overwrite', will replace table with new data. - If 'ignore', will not write anything if table already exists. - :param file_options: Optional write options for Parquet (ParquetFileWriteOptions). - Can be provided with defaults using ParquetFileWriteOptions().make_write_options(). - Please refer to https://github.com/apache/arrow/blob/master/python/pyarrow/_dataset_parquet.pyx#L492-L533 - for the list of available options - :param max_partitions: the maximum number of partitions that will be used. - :param max_open_files: Limits the maximum number of - files that can be left open while writing. If an attempt is made to open - too many files then the least recently used file will be closed. - If this setting is set too low you may end up fragmenting your - data into many small files. - :param max_rows_per_file: Maximum number of rows per file. - If greater than 0 then this will limit how many rows are placed in any single file. - Otherwise there will be no limit and one file will be created in each output directory - unless files need to be closed to respect max_open_files - :param min_rows_per_group: Minimum number of rows per group. When the value is set, - the dataset writer will batch incoming data and only write the row groups to the disk - when sufficient rows have accumulated. - :param max_rows_per_group: Maximum number of rows per group. - If the value is set, then the dataset writer may split up large incoming batches into multiple row groups. - If this value is set, then min_rows_per_group should also be set. - :param name: User-provided identifier for this table. - :param description: User-provided description for this table. - :param configuration: A map containing configuration options for the metadata action. - :param overwrite_schema: If True, allows updating the schema of the table. - :param storage_options: options passed to the native delta filesystem. Unused if 'filesystem' is defined. 
- :param partition_filters: the partition filters that will be used for partition overwrite. - :param large_dtypes: If True, the table schema is checked against large_dtypes + Args: + table_or_uri: URI of a table or a DeltaTable object. + data: Data to write. If passing iterable, the schema must also be given. + schema: Optional schema to write. + partition_by: List of columns to partition the table by. Only required + when creating a new table. + filesystem: Optional filesystem to pass to PyArrow. If not provided will + be inferred from uri. The file system has to be rooted in the table root. + Use the pyarrow.fs.SubTreeFileSystem, to adopt the root of pyarrow file systems. + mode: How to handle existing data. Default is to error if table already exists. + If 'append', will add new data. + If 'overwrite', will replace table with new data. + If 'ignore', will not write anything if table already exists. + file_options: Optional write options for Parquet (ParquetFileWriteOptions). + Can be provided with defaults using ParquetFileWriteOptions().make_write_options(). + Please refer to https://github.com/apache/arrow/blob/master/python/pyarrow/_dataset_parquet.pyx#L492-L533 + for the list of available options + max_partitions: the maximum number of partitions that will be used. + max_open_files: Limits the maximum number of + files that can be left open while writing. If an attempt is made to open + too many files then the least recently used file will be closed. + If this setting is set too low you may end up fragmenting your + data into many small files. + max_rows_per_file: Maximum number of rows per file. + If greater than 0 then this will limit how many rows are placed in any single file. + Otherwise there will be no limit and one file will be created in each output directory + unless files need to be closed to respect max_open_files + min_rows_per_group: Minimum number of rows per group. When the value is set, + the dataset writer will batch incoming data and only write the row groups to the disk + when sufficient rows have accumulated. + max_rows_per_group: Maximum number of rows per group. + If the value is set, then the dataset writer may split up large incoming batches into multiple row groups. + If this value is set, then min_rows_per_group should also be set. + name: User-provided identifier for this table. + description: User-provided description for this table. + configuration: A map containing configuration options for the metadata action. + overwrite_schema: If True, allows updating the schema of the table. + storage_options: options passed to the native delta filesystem. Unused if 'filesystem' is defined. + partition_filters: the partition filters that will be used for partition overwrite. + large_dtypes: If True, the table schema is checked against large_dtypes """ if _has_pandas and isinstance(data, pd.DataFrame): if schema is not None: @@ -403,12 +404,14 @@ def try_get_table_and_table_uri( storage_options: Optional[Dict[str, str]] = None, ) -> Tuple[Optional[DeltaTable], str]: """Parses the `table_or_uri`. + Raises [ValueError] If `table_or_uri` is not of type str, Path or DeltaTable. - :param table_or_uri: URI of a table or a DeltaTable object. - :param storage_options: Options passed to the native delta filesystem. - :raises ValueError: If `table_or_uri` is not of type str, Path or DeltaTable. - :returns table: DeltaTable object - :return table_uri: URI of the table + Args: + table_or_uri: URI of a table or a DeltaTable object. 
+ storage_options: Options passed to the native delta filesystem. + + Returns: + (DeltaTable object, URI of the table) """ if not isinstance(table_or_uri, (str, Path, DeltaTable)): raise ValueError("table_or_uri must be a str, Path or DeltaTable") diff --git a/python/src/schema.rs b/python/src/schema.rs index 425d775a9e..334c705d0e 100644 --- a/python/src/schema.rs +++ b/python/src/schema.rs @@ -90,29 +90,6 @@ fn python_type_to_schema(ob: PyObject, py: Python) -> PyResult { Err(PyValueError::new_err("Invalid data type")) } -/// A primitive datatype, such as a string or number. -/// -/// Can be initialized with a string value: -/// -/// >>> PrimitiveType("integer") -/// PrimitiveType("integer") -/// -/// Valid primitive data types include: -/// -/// * "string", -/// * "long", -/// * "integer", -/// * "short", -/// * "byte", -/// * "float", -/// * "double", -/// * "boolean", -/// * "binary", -/// * "date", -/// * "timestamp", -/// * "decimal(, )" -/// -/// :param data_type: string representation of the data type #[pyclass(module = "deltalake._internal")] #[derive(Clone)] pub struct PrimitiveType { @@ -159,9 +136,6 @@ impl PrimitiveType { } } - /// The inner type - /// - /// :rtype: str #[getter] fn get_type(&self) -> PyResult { Ok(self.inner_type.clone()) @@ -181,25 +155,12 @@ impl PrimitiveType { Ok(format!("PrimitiveType(\"{}\")", &self.inner_type)) } - /// Get the JSON string representation of the type. - /// - /// :rtype: str #[pyo3(text_signature = "($self)")] fn to_json(&self) -> PyResult { let inner_type = SchemaDataType::primitive(self.inner_type.clone()); serde_json::to_string(&inner_type).map_err(|err| PyException::new_err(err.to_string())) } - /// Create a PrimitiveType from a JSON string - /// - /// The JSON representation for a primitive type is just a quoted string: - /// - /// >>> PrimitiveType.from_json('"integer"') - /// PrimitiveType("integer") - /// - /// :param type_json: A JSON string - /// :type type_json: str - /// :rtype: PrimitiveType #[staticmethod] #[pyo3(text_signature = "(type_json)")] fn from_json(type_json: String) -> PyResult { @@ -209,9 +170,6 @@ impl PrimitiveType { data_type.try_into() } - /// Get the equivalent PyArrow type. - /// - /// :rtype: pyarrow.DataType #[pyo3(text_signature = "($self)")] fn to_pyarrow(&self) -> PyResult> { let inner_type = SchemaDataType::primitive(self.inner_type.clone()); @@ -220,13 +178,7 @@ impl PrimitiveType { )?)) } - /// Create a PrimitiveType from a PyArrow type - /// - /// Will raise ``TypeError`` if the PyArrow type is not a primitive type. 
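The Sphinx comments removed from the Rust source here reappear as stub docstrings later in the patch. As a sketch of the `PrimitiveType` conversions they describe (the `int32` mapping for `"integer"` is assumed; expected output is noted in comments):

``` python
import pyarrow as pa

from deltalake.schema import PrimitiveType

pt = PrimitiveType("integer")
print(pt.to_json())                            # '"integer"'
print(PrimitiveType.from_json('"integer"'))    # PrimitiveType("integer")

# PyArrow round trip; from_pyarrow raises TypeError for non-primitive types.
print(pt.to_pyarrow())                         # int32 (assumed mapping)
print(PrimitiveType.from_pyarrow(pa.int32()))  # PrimitiveType("integer")
```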
- /// - /// :param data_type: A PyArrow DataType - /// :type data_type: pyarrow.DataType - /// :rtype: PrimitiveType + #[pyo3(text_signature = "(data_type)")] #[staticmethod] fn from_pyarrow(data_type: PyArrowType) -> PyResult { @@ -238,15 +190,6 @@ impl PrimitiveType { } } -/// An Array (List) DataType -/// -/// Can either pass the element type explicitly or can pass a string -/// if it is a primitive type: -/// -/// >>> ArrayType(PrimitiveType("integer")) -/// ArrayType(PrimitiveType("integer"), contains_null=True) -/// >>> ArrayType("integer", contains_null=False) -/// ArrayType(PrimitiveType("integer"), contains_null=False) #[pyclass(module = "deltalake._internal")] #[derive(Clone)] pub struct ArrayType { @@ -313,53 +256,26 @@ impl ArrayType { } } - /// The string "array" - /// - /// :rtype: str #[getter] fn get_type(&self) -> String { "array".to_string() } - /// The type of the element - /// - /// :rtype: Union[PrimitiveType, ArrayType, MapType, StructType] #[getter] fn element_type(&self, py: Python) -> PyResult { schema_type_to_python(self.inner_type.get_element_type().to_owned(), py) } - /// Whether the arrays may contain null values - /// - /// :rtype: bool #[getter] fn contains_null(&self, py: Python) -> PyResult { Ok(self.inner_type.contains_null().into_py(py)) } - /// Get the JSON string representation of the type. - /// - /// :rtype: str #[pyo3(text_signature = "($self)")] fn to_json(&self) -> PyResult { serde_json::to_string(&self.inner_type).map_err(|err| PyException::new_err(err.to_string())) } - /// Create an ArrayType from a JSON string - /// - /// The JSON representation for an array type is an object with ``type`` (set to - /// ``"array"``), ``elementType``, and ``containsNull``: - /// - /// >>> ArrayType.from_json("""{ - /// ... "type": "array", - /// ... "elementType": "integer", - /// ... "containsNull": false - /// ... }""") - /// ArrayType(PrimitiveType("integer"), contains_null=False) - /// - /// :param type_json: A JSON string - /// :type type_json: str - /// :rtype: ArrayType #[staticmethod] #[pyo3(text_signature = "(type_json)")] fn from_json(type_json: String) -> PyResult { @@ -369,9 +285,6 @@ impl ArrayType { data_type.try_into() } - /// Get the equivalent PyArrow type. - /// - /// :rtype: pyarrow.DataType #[pyo3(text_signature = "($self)")] fn to_pyarrow(&self) -> PyResult> { Ok(PyArrowType( @@ -381,13 +294,6 @@ impl ArrayType { )) } - /// Create an ArrayType from a pyarrow.ListType. - /// - /// Will raise ``TypeError`` if a different PyArrow DataType is provided. - /// - /// :param data_type: The PyArrow datatype - /// :type data_type: pyarrow.ListType - /// :rtype: ArrayType #[staticmethod] #[pyo3(text_signature = "(data_type)")] fn from_pyarrow(data_type: PyArrowType) -> PyResult { @@ -399,16 +305,6 @@ impl ArrayType { } } -/// A map data type -/// -/// ``key_type`` and ``value_type`` should be :class PrimitiveType:, :class ArrayType:, -/// :class ListType:, or :class StructType:. 
A string can also be passed, which will be -/// parsed as a primitive type: -/// -/// >>> MapType(PrimitiveType("integer"), PrimitiveType("string")) -/// MapType(PrimitiveType("integer"), PrimitiveType("string"), value_contains_null=True) -/// >>> MapType("integer", "string", value_contains_null=False) -/// MapType(PrimitiveType("integer"), PrimitiveType("string"), value_contains_null=False) #[pyclass(module = "deltalake._internal")] #[derive(Clone)] pub struct MapType { @@ -485,62 +381,31 @@ impl MapType { } } - /// The string "map" - /// - /// :rtype: str #[getter] fn get_type(&self) -> String { "map".to_string() } - /// The type of the keys - /// - /// :rtype: Union[PrimitiveType, ArrayType, MapType, StructType] #[getter] fn key_type(&self, py: Python) -> PyResult { schema_type_to_python(self.inner_type.get_key_type().to_owned(), py) } - /// The type of the values - /// - /// :rtype: Union[PrimitiveType, ArrayType, MapType, StructType] #[getter] fn value_type(&self, py: Python) -> PyResult { schema_type_to_python(self.inner_type.get_value_type().to_owned(), py) } - /// Whether the values in a map may be null - /// - /// :rtype: bool #[getter] fn value_contains_null(&self, py: Python) -> PyResult { Ok(self.inner_type.get_value_contains_null().into_py(py)) } - /// Get JSON string representation of map type. - /// - /// :rtype: str #[pyo3(text_signature = "($self)")] fn to_json(&self) -> PyResult { serde_json::to_string(&self.inner_type).map_err(|err| PyException::new_err(err.to_string())) } - /// Create a MapType from a JSON string - /// - /// The JSON representation for a map type is an object with ``type`` (set to ``map``), - /// ``keyType``, ``valueType``, and ``valueContainsNull``: - /// - /// >>> MapType.from_json("""{ - /// ... "type": "map", - /// ... "keyType": "integer", - /// ... "valueType": "string", - /// ... "valueContainsNull": true - /// ... }""") - /// MapType(PrimitiveType("integer"), PrimitiveType("string"), value_contains_null=True) - /// - /// :param type_json: A JSON string - /// :type type_json: str - /// :rtype: MapType #[staticmethod] #[pyo3(text_signature = "(type_json)")] fn from_json(type_json: String) -> PyResult { @@ -550,9 +415,6 @@ impl MapType { data_type.try_into() } - /// Get the equivalent PyArrow data type. - /// - /// :rtype: pyarrow.MapType #[pyo3(text_signature = "($self)")] fn to_pyarrow(&self) -> PyResult> { Ok(PyArrowType( @@ -562,13 +424,6 @@ impl MapType { )) } - /// Create a MapType from a PyArrow MapType. - /// - /// Will raise ``TypeError`` if passed a different type. - /// - /// :param data_type: the PyArrow MapType - /// :type data_type: pyarrow.MapType - /// :rtype: MapType #[staticmethod] #[pyo3(text_signature = "(data_type)")] fn from_pyarrow(data_type: PyArrowType) -> PyResult { @@ -580,18 +435,6 @@ impl MapType { } } -/// A field in a Delta StructType or Schema -/// -/// Can create with just a name and a type: -/// -/// >>> Field("my_int_col", "integer") -/// Field("my_int_col", PrimitiveType("integer"), nullable=True, metadata=None) -/// -/// Can also attach metadata to the field. 
Metadata should be a dictionary with -/// string keys and JSON-serializable values (str, list, int, float, dict): -/// -/// >>> Field("my_col", "integer", metadata={"custom_metadata": {"test": 2}}) -/// Field("my_col", PrimitiveType("integer"), nullable=True, metadata={"custom_metadata": {"test": 2}}) #[pyclass(module = "deltalake._internal")] #[derive(Clone)] pub struct Field { @@ -629,33 +472,21 @@ impl Field { }) } - /// The name of the field - /// - /// :rtype: str #[getter] fn name(&self) -> String { self.inner.get_name().to_string() } - /// The type of the field - /// - /// :rtype: Union[PrimitiveType, ArrayType, MapType, StructType] #[getter] fn get_type(&self, py: Python) -> PyResult { schema_type_to_python(self.inner.get_type().clone(), py) } - /// Whether there may be null values in the field - /// - /// :rtype: bool #[getter] fn nullable(&self) -> bool { self.inner.is_nullable() } - /// The metadata of the field - /// - /// :rtype: dict #[getter] fn metadata(&self, py: Python) -> PyResult { let json_loads = PyModule::import(py, "json")?.getattr("loads")?; @@ -702,30 +533,12 @@ impl Field { } } - /// Get the field as JSON string. - /// - /// >>> Field("col", "integer").to_json() - /// '{"name":"col","type":"integer","nullable":true,"metadata":{}}' - /// - /// :rtype: str #[pyo3(text_signature = "($self)")] fn to_json(&self) -> PyResult { serde_json::to_string(&self.inner).map_err(|err| PyException::new_err(err.to_string())) } - /// Create a Field from a JSON string. - /// - /// >>> Field.from_json("""{ - /// ... "name": "col", - /// ... "type": "integer", - /// ... "nullable": true, - /// ... "metadata": {} - /// ... }""") - /// Field(col, PrimitiveType("integer"), nullable=True) - /// - /// :param field_json: the JSON string. - /// :type field_json: str - /// :rtype: Field + #[staticmethod] #[pyo3(text_signature = "(field_json)")] fn from_json(field_json: String) -> PyResult { @@ -735,11 +548,6 @@ impl Field { Ok(Self { inner: field }) } - /// Convert to an equivalent PyArrow field - /// - /// Note: This currently doesn't preserve field metadata. - /// - /// :rtype: pyarrow.Field #[pyo3(text_signature = "($self)")] fn to_pyarrow(&self) -> PyResult> { Ok(PyArrowType((&self.inner).try_into().map_err( @@ -747,13 +555,6 @@ impl Field { )?)) } - /// Create a Field from a PyArrow field - /// - /// Note: This currently doesn't preserve field metadata. - /// - /// :param field: a field - /// :type: pyarrow.Field - /// :rtype: Field #[staticmethod] #[pyo3(text_signature = "(field)")] fn from_pyarrow(field: PyArrowType) -> PyResult { @@ -764,12 +565,6 @@ impl Field { } } -/// A struct datatype, containing one or more subfields -/// -/// Create with a list of :class:`Field`: -/// -/// >>> StructType([Field("x", "integer"), Field("y", "string")]) -/// StructType([Field(x, PrimitiveType("integer"), nullable=True), Field(y, PrimitiveType("string"), nullable=True)]) #[pyclass(subclass, module = "deltalake._internal")] #[derive(Clone)] pub struct StructType { @@ -840,9 +635,6 @@ impl StructType { "struct".to_string() } - /// The fields within the struct - /// - /// :rtype: List[Field] #[getter] fn fields(&self) -> Vec { self.inner_type @@ -854,28 +646,11 @@ impl StructType { .collect::>() } - /// Get the JSON representation of the type. 
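A short sketch of the `Field` and `StructType` behaviour these removed comments describe; the printed JSON matches the example strings that the stubs keep:

``` python
from deltalake.schema import Field, StructType

# Metadata is a dict of string keys to JSON-serializable values.
f = Field("my_col", "integer", metadata={"custom_metadata": {"test": 2}})
print(f.name, f.nullable)   # my_col True
print(f.metadata)           # {'custom_metadata': {'test': 2}}

# A struct's JSON form has the same shape a Schema uses.
st = StructType([Field("x", "integer")])
print(st.to_json())
# '{"type":"struct","fields":[{"name":"x","type":"integer","nullable":true,"metadata":{}}]}'
```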
- /// - /// >>> StructType([Field("x", "integer")]).to_json() - /// '{"type":"struct","fields":[{"name":"x","type":"integer","nullable":true,"metadata":{}}]}' - /// - /// :rtype: str #[pyo3(text_signature = "($self)")] fn to_json(&self) -> PyResult { serde_json::to_string(&self.inner_type).map_err(|err| PyException::new_err(err.to_string())) } - /// Create a new StructType from a JSON string. - /// - /// >>> StructType.from_json("""{ - /// ... "type": "struct", - /// ... "fields": [{"name": "x", "type": "integer", "nullable": true, "metadata": {}}] - /// ... }""") - /// StructType([Field(x, PrimitiveType("integer"), nullable=True)]) - /// - /// :param type_json: a JSON string - /// :type type_json: str - /// :rtype: StructType #[staticmethod] #[pyo3(text_signature = "(type_json)")] fn from_json(type_json: String) -> PyResult { @@ -885,9 +660,6 @@ impl StructType { data_type.try_into() } - /// Get the equivalent PyArrow StructType - /// - /// :rtype: pyarrow.StructType #[pyo3(text_signature = "($self)")] fn to_pyarrow(&self) -> PyResult> { Ok(PyArrowType( @@ -897,13 +669,6 @@ impl StructType { )) } - /// Create a new StructType from a PyArrow struct type. - /// - /// Will raise ``TypeError`` if a different data type is provided. - /// - /// :param data_type: a PyArrow struct type. - /// :type data_type: pyarrow.StructType - /// :rtype: StructType #[staticmethod] #[pyo3(text_signature = "(data_type)")] fn from_pyarrow(data_type: PyArrowType) -> PyResult { @@ -975,7 +740,6 @@ impl PySchema { Ok(format!("Schema([{}])", inner_data.join(", "))) } - /// DEPRECATED: Convert to JSON dictionary representation fn json(self_: PyRef<'_, Self>, py: Python) -> PyResult { let warnings_warn = PyModule::import(py, "warnings")?.getattr("warn")?; let deprecation_warning = PyModule::import(py, "builtins")? @@ -998,14 +762,6 @@ impl PySchema { .map(|obj| obj.to_object(py)) } - /// Return equivalent PyArrow schema - /// - /// :param as_large_types: get schema with all variable size types (list, - /// binary, string) as large variants (with int64 indices). This is for - /// compatibility with systems like Polars that only support the large - /// versions of Arrow types. - /// - /// :rtype: pyarrow.Schema #[pyo3(signature = (as_large_types = false))] fn to_pyarrow( self_: PyRef<'_, Self>, @@ -1079,11 +835,6 @@ impl PySchema { } } - /// Create from a PyArrow schema - /// - /// :param data_type: a PyArrow schema - /// :type data_type: pyarrow.Schema - /// :rtype: Schema #[staticmethod] #[pyo3(text_signature = "(data_type)")] fn from_pyarrow(data_type: PyArrowType, py: Python) -> PyResult { @@ -1094,33 +845,12 @@ impl PySchema { schema_to_pyobject(&inner_type, py) } - /// Get the JSON representation of the schema. - /// - /// A schema has the same JSON format as a StructType. - /// - /// >>> Schema([Field("x", "integer")]).to_json() - /// '{"type":"struct","fields":[{"name":"x","type":"integer","nullable":true,"metadata":{}}]}' - /// - /// :rtype: str #[pyo3(text_signature = "($self)")] fn to_json(self_: PyRef<'_, Self>) -> PyResult { let super_ = self_.as_ref(); super_.to_json() } - /// Create a new Schema from a JSON string. - /// - /// A schema has the same JSON format as a StructType. - /// - /// >>> Schema.from_json("""{ - /// ... "type": "struct", - /// ... "fields": [{"name": "x", "type": "integer", "nullable": true, "metadata": {}}] - /// ... 
}""") - /// Schema([Field(x, PrimitiveType("integer"), nullable=True)]) - /// - /// :param schema_json: a JSON string - /// :type schema_json: str - /// :rtype: Schema #[staticmethod] #[pyo3(text_signature = "(schema_json)")] fn from_json(schema_json: String, py: Python) -> PyResult> { @@ -1134,11 +864,6 @@ impl PySchema { } } - /// The list of invariants on the table. - /// - /// :rtype: List[Tuple[str, str]] - /// :return: a tuple of strings for each invariant. The first string is the - /// field path and the second is the SQL of the invariant. #[getter] fn invariants(self_: PyRef<'_, Self>) -> PyResult> { let super_ = self_.as_ref(); From 519a71702d4736a744461edd9f65f068b04a2cff Mon Sep 17 00:00:00 2001 From: Nikolay Ulmasov Date: Wed, 18 Oct 2023 18:52:42 +0100 Subject: [PATCH 2/8] fix format --- python/deltalake/_internal.pyi | 148 ++++++++++++++------------------- python/deltalake/schema.py | 2 +- python/deltalake/table.py | 17 ++-- python/src/schema.rs | 2 - 4 files changed, 72 insertions(+), 97 deletions(-) diff --git a/python/deltalake/_internal.pyi b/python/deltalake/_internal.pyi index f57129637d..bd43a44f2b 100644 --- a/python/deltalake/_internal.pyi +++ b/python/deltalake/_internal.pyi @@ -126,7 +126,7 @@ def batch_distinct(batch: pyarrow.RecordBatch) -> pyarrow.RecordBatch: ... DataType = Union["PrimitiveType", "MapType", "StructType", "ArrayType"] class PrimitiveType: - """ A primitive datatype, such as a string or number. + """A primitive datatype, such as a string or number. Can be initialized with a string value: @@ -161,8 +161,8 @@ class PrimitiveType: def to_json(self) -> str: ... @staticmethod def from_json(json: str) -> PrimitiveType: - """ Create a PrimitiveType from a JSON string - + """Create a PrimitiveType from a JSON string + The JSON representation for a primitive type is just a quoted string: `PrimitiveType.from_json('"integer"')` Args: @@ -170,33 +170,29 @@ class PrimitiveType: Returns a [PrimitiveType][deltalake.schema.PrimitiveType] type """ - def to_pyarrow(self) -> pyarrow.DataType: - """ Get the equivalent PyArrow type (pyarrow.DataType) - """ - + """Get the equivalent PyArrow type (pyarrow.DataType)""" @staticmethod def from_pyarrow(type: pyarrow.DataType) -> PrimitiveType: - """ Create a [PrimitiveType][deltalake.schema.PrimitiveType] from a PyArrow type - + """Create a [PrimitiveType][deltalake.schema.PrimitiveType] from a PyArrow type + Will raise `TypeError` if the PyArrow type is not a primitive type. - + Args: type: A PyArrow [DataType][pyarrow.DataType] type Returns: a [PrimitiveType][deltalake.schema.PrimitiveType] type """ - class ArrayType: - """ An Array (List) DataType + """An Array (List) DataType Can either pass the element type explicitly or can pass a string if it is a primitive type: ``` ArrayType(PrimitiveType("integer")) # Returns ArrayType(PrimitiveType("integer"), contains_null=True) - + ArrayType("integer", contains_null=False) # Returns ArrayType(PrimitiveType("integer"), contains_null=False) ``` @@ -208,7 +204,7 @@ class ArrayType: type: Literal["array"] """ The string "array" """ - + element_type: DataType """ The type of the element, of type: Union[ @@ -224,13 +220,11 @@ class ArrayType: """ def to_json(self) -> str: - """ Get the JSON string representation of the type. 
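For the container types in this stub, a sketch of `ArrayType` and `MapType` construction and round trips, with expected reprs noted as comments per the docstring examples (defaults such as `contains_null=True` from PyArrow list fields are assumptions):

``` python
import pyarrow as pa

from deltalake.schema import ArrayType, MapType

arr = ArrayType("integer", contains_null=False)
print(arr.element_type, arr.contains_null)     # PrimitiveType("integer") False
print(arr.to_json())                           # e.g. '{"type":"array","elementType":"integer","containsNull":false}'
print(ArrayType.from_pyarrow(pa.list_(pa.int32())))

m = MapType("integer", "string", value_contains_null=False)
print(m.key_type, m.value_type)                # PrimitiveType("integer") PrimitiveType("string")
```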
- """ - + """Get the JSON string representation of the type.""" @staticmethod def from_json(json: str) -> "ArrayType": - """ Create an ArrayType from a JSON string - + """Create an ArrayType from a JSON string + The JSON representation for an array type is an object with `type` (set to `"array"`), `elementType`, and `containsNull`: ``` @@ -243,25 +237,22 @@ class ArrayType: ) # Returns ArrayType(PrimitiveType("integer"), contains_null=False) ``` - - Args: + + Args: json: A JSON string - + Returns: an [ArrayType][deltalake.schema.ArrayType] type """ - def to_pyarrow( self, ) -> pyarrow.ListType: - """ Get the equivalent PyArrow type. - """ - + """Get the equivalent PyArrow type.""" @staticmethod def from_pyarrow(type: pyarrow.ListType) -> ArrayType: - """ Create an ArrayType from a pyarrow.ListType. - + """Create an ArrayType from a pyarrow.ListType. + Will raise `TypeError` if a different PyArrow DataType is provided. - + Args: type: The PyArrow [ListType][pyarrow.ListType] @@ -269,12 +260,12 @@ class ArrayType: """ class MapType: - """ A map data type + """A map data type `key_type` and `value_type` should be [PrimitiveType][deltalake.schema.PrimitiveType], [ArrayType][deltalake.schema.ArrayType], or [StructType][deltalake.schema.StructType]. A string can also be passed, which will be parsed as a primitive type: - + ``` MapType(PrimitiveType("integer"), PrimitiveType("string")) # Returns MapType(PrimitiveType("integer"), PrimitiveType("string"), value_contains_null=True) @@ -317,13 +308,11 @@ class MapType: """ def to_json(self) -> str: - """ Get JSON string representation of map type. - """ - + """Get JSON string representation of map type.""" @staticmethod def from_json(json: str) -> MapType: - """ Create a MapType from a JSON string - + """Create a MapType from a JSON string + The JSON representation for a map type is an object with `type` (set to `map`), `keyType`, `valueType`, and `valueContainsNull`: ``` @@ -337,31 +326,28 @@ class MapType: ) # Returns MapType(PrimitiveType("integer"), PrimitiveType("string"), value_contains_null=True) ``` - + Args: json: A JSON string - + Returns: a [MapType][deltalake.schema.MapType] type """ - def to_pyarrow(self) -> pyarrow.MapType: - """ Get the equivalent PyArrow data type. - """ - + """Get the equivalent PyArrow data type.""" @staticmethod def from_pyarrow(type: pyarrow.MapType) -> MapType: - """ Create a MapType from a PyArrow MapType. - + """Create a MapType from a PyArrow MapType. + Will raise `TypeError` if passed a different type. - + Args: type: the PyArrow MapType - + Returns: a [MapType][deltalake.schema.MapType] type """ class Field: - """ A field in a Delta StructType or Schema + """A field in a Delta StructType or Schema Can create with just a name and a type: ``` @@ -409,17 +395,16 @@ class Field: """ def to_json(self) -> str: - """ Get the field as JSON string. + """Get the field as JSON string. ``` Field("col", "integer").to_json() # Returns '{"name":"col","type":"integer","nullable":true,"metadata":{}}' ``` """ - @staticmethod def from_json(json: str) -> Field: - """ Create a Field from a JSON string. - + """Create a Field from a JSON string. + Args: json: the JSON string. @@ -437,19 +422,17 @@ class Field: # Returns Field(col, PrimitiveType("integer"), nullable=True) ``` """ - def to_pyarrow(self) -> pyarrow.Field: - """ Convert to an equivalent PyArrow field + """Convert to an equivalent PyArrow field Note: This currently doesn't preserve field metadata. 
- + Returns: a [pyarrow.Field][pyarrow.Field] type """ - @staticmethod def from_pyarrow(field: pyarrow.Field) -> Field: """Create a Field from a PyArrow field Note: This currently doesn't preserve field metadata. - + Args: field: a PyArrow Field type @@ -457,7 +440,7 @@ class Field: """ class StructType: - """ A struct datatype, containing one or more subfields + """A struct datatype, containing one or more subfields Example: @@ -475,16 +458,15 @@ class StructType: """ def to_json(self) -> str: - """ Get the JSON representation of the type. + """Get the JSON representation of the type. ``` StructType([Field("x", "integer")]).to_json() # Returns '{"type":"struct","fields":[{"name":"x","type":"integer","nullable":true,"metadata":{}}]}' ``` """ - @staticmethod def from_json(json: str) -> StructType: - """ Create a new StructType from a JSON string. + """Create a new StructType from a JSON string. ``` StructType.from_json( '''{ @@ -494,28 +476,26 @@ class StructType: ) # Returns StructType([Field(x, PrimitiveType("integer"), nullable=True)]) ``` - - Args: + + Args: json: a JSON string - + Returns: a [StructType][deltalake.schema.StructType] type """ - def to_pyarrow(self) -> pyarrow.StructType: - """ Get the equivalent PyArrow StructType + """Get the equivalent PyArrow StructType Returns: a PyArrow [StructType][pyarrow.StructType] type """ - @staticmethod def from_pyarrow(type: pyarrow.StructType) -> StructType: - """ Create a new StructType from a PyArrow struct type. - + """Create a new StructType from a PyArrow struct type. + Will raise `TypeError` if a different data type is provided. - + Args: type: a PyArrow struct type. - + Returns: a [StructType][deltalake.schema.StructType] type """ @@ -527,14 +507,14 @@ class Schema: """ The list of invariants on the table. Each invarint is a tuple of strings. The first string is the field path and the second is the SQL of the invariant. """ - + def json(self): - """ + """ !!! warning "DEPRECATED" Convert to JSON dictionary representation """ def to_json(self) -> str: - """ Get the JSON string representation of the Schema. + """Get the JSON string representation of the Schema. A schema has the same JSON format as a StructType. ``` Schema([Field("x", "integer")]).to_json() @@ -542,11 +522,10 @@ class Schema: ``` Returns: a JSON string """ - @staticmethod def from_json(json: str) -> Schema: - """ Create a new Schema from a JSON string. - + """Create a new Schema from a JSON string. + A schema has the same JSON format as a StructType. ``` Schema.from_json('''{ @@ -560,22 +539,21 @@ class Schema: Args: json: a JSON string """ - - def to_pyarrow(self, as_large_types: bool = False) -> pyarrow.Schema: - """ Return equivalent PyArrow schema + def to_pyarrow(self, as_large_types: bool = False) -> pyarrow.Schema: + """Return equivalent PyArrow schema Args: as_large_types: get schema with all variable size types (list, binary, string) as large variants (with int64 indices). This is for compatibility with systems like Polars that only support the large versions of Arrow types. - - Returns: + + Returns: a PyArrow [Schema][pyarrow.Schema] type """ @staticmethod - def from_pyarrow(type: pyarrow.Schema) -> Schema: - """ Create a [Schema][deltalake.schema.Schema] from a PyArrow Schema type - + def from_pyarrow(type: pyarrow.Schema) -> Schema: + """Create a [Schema][deltalake.schema.Schema] from a PyArrow Schema type + Will raise `TypeError` if the PyArrow type is not a primitive type. 
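Pulling the `Schema` stubs together, a brief sketch of the JSON and PyArrow round trips, assuming the schema-level constructor shown in the `to_json` example:

``` python
import pyarrow as pa

from deltalake.schema import Field, Schema

s = Schema([Field("x", "integer")])
print(s.to_json())
# '{"type":"struct","fields":[{"name":"x","type":"integer","nullable":true,"metadata":{}}]}'
print(Schema.from_json(s.to_json()))  # Schema([Field(x, PrimitiveType("integer"), nullable=True)])

# as_large_types swaps in large string/binary/list variants for engines such as Polars.
print(s.to_pyarrow(as_large_types=False))
print(Schema.from_pyarrow(pa.schema([pa.field("x", pa.int32())])))
```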
- + Args: type: A PyArrow [Schema][pyarrow.Schema] type diff --git a/python/deltalake/schema.py b/python/deltalake/schema.py index d529e01e14..e6854c3779 100644 --- a/python/deltalake/schema.py +++ b/python/deltalake/schema.py @@ -26,7 +26,7 @@ def delta_arrow_schema_from_pandas( Args: data: Data to write. - + Returns: A PyArrow Table and the inferred schema for the Delta Table """ diff --git a/python/deltalake/table.py b/python/deltalake/table.py index 64daeea93a..08305b2969 100644 --- a/python/deltalake/table.py +++ b/python/deltalake/table.py @@ -317,7 +317,7 @@ def files( Returns: list of the .parquet files referenced for the current version of the DeltaTable - + Predicates are expressed in disjunctive normal form (DNF), like [("x", "=", "a"), ...]. DNF allows arbitrary boolean logical combinations of single partition predicates. The innermost tuples each describe a single partition predicate. The list of inner @@ -337,7 +337,6 @@ def files( """ return self._table.files(self.__stringify_partition_values(partition_filters)) - def files_by_partitions( self, partition_filters: List[Tuple[str, str, Any]] ) -> List[str]: @@ -418,7 +417,7 @@ def file_uris( ) file_uris.__doc__ = "" - + def load_version(self, version: int) -> None: """ Load a DeltaTable with a specified version. @@ -549,8 +548,8 @@ def update( writer_properties: Pass writer properties to the Rust parquet writer, see options https://arrow.apache.org/rust/parquet/file/properties/struct.WriterProperties.html, only the following fields are supported: `data_page_size_limit`, `dictionary_page_size_limit`, `data_page_row_count_limit`, `write_batch_size`, `max_row_group_size`. error_on_type_mismatch: specify if merge will return error if data types are mismatching :default = True - - Returns: + + Returns: the metrics from delete Examples: @@ -789,7 +788,7 @@ def get_add_actions(self, flatten: bool = False) -> pyarrow.RecordBatch: given the prefix `null_count.`, `min.`, and `max.`, and tags the prefix `tags.`. Nested field names are concatenated with `.`. - Returns: + Returns: a PyArrow RecordBatch containing the add action data. Example: @@ -801,7 +800,7 @@ def get_add_actions(self, flatten: bool = False) -> pyarrow.RecordBatch: dt = DeltaTable("tmp") dt.get_add_actions().to_pandas() ``` - + ``` path size_bytes modification_time data_change partition_values num_records null_count min max 0 x=2/0-91820cbf-f698-45fb-886d-5d5f5669530b-0.p... 565 1970-01-20 08:40:08.071 True {'x': 2} 1 {'y': 0} {'y': 5} {'y': 5} @@ -830,7 +829,7 @@ def delete(self, predicate: Optional[str] = None) -> Dict[str, Any]: Args: predicate: a SQL where clause. If not passed, will delete all rows. - + Returns: the metrics from delete. """ @@ -929,7 +928,7 @@ def z_order( Args: columns: the columns to use for Z-ordering. There must be at least one column. - partition_filters: the partition filters that will be used for getting the matched files + partition_filters: the partition filters that will be used for getting the matched files target_size: desired file size after bin-packing files, in bytes. If not provided, will attempt to read the table configuration value ``delta.targetFileSize``. If that value isn't set, will use default value of 256MB. 
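Z-ordering takes the arguments above and, like `compact`, is assumed here to be reached through the same `dt.optimize` accessor used in the `compact` example earlier in the patch; a rough sketch against a placeholder table:

``` python
from deltalake import DeltaTable

dt = DeltaTable("tmp/my_table")

# Rewrite files ordered by a Z-curve over the given columns (at least one required).
dt.optimize.z_order(["id"])

# Inspect the rewritten files and their statistics afterwards.
print(dt.get_add_actions(flatten=True).to_pandas())
```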
diff --git a/python/src/schema.rs b/python/src/schema.rs index 334c705d0e..77e5f0d4da 100644 --- a/python/src/schema.rs +++ b/python/src/schema.rs @@ -178,7 +178,6 @@ impl PrimitiveType { )?)) } - #[pyo3(text_signature = "(data_type)")] #[staticmethod] fn from_pyarrow(data_type: PyArrowType) -> PyResult { @@ -538,7 +537,6 @@ impl Field { serde_json::to_string(&self.inner).map_err(|err| PyException::new_err(err.to_string())) } - #[staticmethod] #[pyo3(text_signature = "(field_json)")] fn from_json(field_json: String) -> PyResult { From c7131958c6b7b9c162023fbf727ad92d37774264 Mon Sep 17 00:00:00 2001 From: Nikolay Ulmasov Date: Wed, 18 Oct 2023 21:41:11 +0100 Subject: [PATCH 3/8] format docstrings --- python/deltalake/table.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/deltalake/table.py b/python/deltalake/table.py index 6c4dddd5ad..a669f44f01 100644 --- a/python/deltalake/table.py +++ b/python/deltalake/table.py @@ -607,7 +607,6 @@ def validate_batch(batch: pyarrow.RecordBatch) -> pyarrow.RecordBatch: safe_cast=not error_on_type_mismatch, ) - def restore( self, target: Union[int, datetime, str], From 5e4a6b8f00bc585d665694e1a60fa49714475fd3 Mon Sep 17 00:00:00 2001 From: Nikolay Ulmasov Date: Wed, 18 Oct 2023 22:42:40 +0100 Subject: [PATCH 4/8] fix import alias --- python/deltalake/_internal.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/deltalake/_internal.pyi b/python/deltalake/_internal.pyi index e0f9523310..13bbc006cc 100644 --- a/python/deltalake/_internal.pyi +++ b/python/deltalake/_internal.pyi @@ -97,7 +97,7 @@ class RawDeltaTable: ) -> str: ... def merge_execute( self, - source: pa.RecordBatchReader, + source: pyarrow.RecordBatchReader, predicate: str, source_alias: Optional[str], target_alias: Optional[str], From f25253e9372bfe0a0e9ec456cd3e508dd6e5af9e Mon Sep 17 00:00:00 2001 From: Nikolay Ulmasov Date: Thu, 19 Oct 2023 08:48:13 +0100 Subject: [PATCH 5/8] remove deprecated --- python/deltalake/_internal.pyi | 6 ------ 1 file changed, 6 deletions(-) diff --git a/python/deltalake/_internal.pyi b/python/deltalake/_internal.pyi index 13bbc006cc..91c30fda13 100644 --- a/python/deltalake/_internal.pyi +++ b/python/deltalake/_internal.pyi @@ -526,12 +526,6 @@ class Schema: """ The list of invariants on the table. Each invarint is a tuple of strings. The first string is the field path and the second is the SQL of the invariant. """ - - def json(self): - """ - !!! warning "DEPRECATED" - Convert to JSON dictionary representation - """ def to_json(self) -> str: """Get the JSON string representation of the Schema. A schema has the same JSON format as a StructType. From d33485f1db6da5986d4431c05265954089df016c Mon Sep 17 00:00:00 2001 From: Nikolay Ulmasov Date: Thu, 19 Oct 2023 12:01:10 +0100 Subject: [PATCH 6/8] minor docstring fixes --- docs/usage/examining-table.md | 4 ++-- python/deltalake/table.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/usage/examining-table.md b/docs/usage/examining-table.md index 0d91153953..9c99450078 100644 --- a/docs/usage/examining-table.md +++ b/docs/usage/examining-table.md @@ -43,10 +43,10 @@ Schema([Field(id, PrimitiveType("long"), nullable=True)]) These schemas have a JSON representation that can be retrieved. To reconstruct from json, use -[DeltaTable.schema.json()][deltalake.schema.Schema.json]. +[DeltaTable.schema.to_json()][deltalake.schema.Schema.to_json]. 
``` python ->>> dt.schema().json() +>>> dt.schema().to_json() '{"type":"struct","fields":[{"name":"id","type":"long","nullable":true,"metadata":{}}]}' ``` diff --git a/python/deltalake/table.py b/python/deltalake/table.py index a669f44f01..7dc34675a9 100644 --- a/python/deltalake/table.py +++ b/python/deltalake/table.py @@ -274,7 +274,7 @@ def from_data_catalog( Args: data_catalog: the Catalog to use for getting the storage location of the Delta Table - database_name: the database name inside the Data Catalog + database_name: the database name inside the Data Catalog table_name: the table name inside the Data Catalog data_catalog_id: the identifier of the Data Catalog version: version of the DeltaTable @@ -1097,7 +1097,6 @@ def when_not_matched_by_source_delete( ``predicate`` (if specified) is true for the target row. Args: - updates (dict): a mapping of column name to update SQL expression. predicate (str | None, optional): SQL like predicate on when to delete when not matched by source. Defaults to None. Returns: From 535b7c7fce4521a2cc7933befb5327fcf89adbb9 Mon Sep 17 00:00:00 2001 From: Nikolay Ulmasov Date: Thu, 19 Oct 2023 18:08:20 +0100 Subject: [PATCH 7/8] fix sphinx build --- python/deltalake/table.py | 59 +++++++++++++++--------------------- python/deltalake/writer.py | 7 +---- python/docs/source/conf.py | 1 + python/docs/source/usage.rst | 4 +-- 4 files changed, 29 insertions(+), 42 deletions(-) diff --git a/python/deltalake/table.py b/python/deltalake/table.py index 7dc34675a9..e8f622847b 100644 --- a/python/deltalake/table.py +++ b/python/deltalake/table.py @@ -506,41 +506,32 @@ def update( Args: updates: a mapping of column name to update SQL expression. predicate: a logical expression, defaults to None - writer_properties: Pass writer properties to the Rust parquet writer, see options https://arrow.apache.org/rust/parquet/file/properties/struct.WriterProperties.html, - only the following fields are supported: `data_page_size_limit`, `dictionary_page_size_limit`, `data_page_row_count_limit`, `write_batch_size`, `max_row_group_size`. - error_on_type_mismatch: specify if merge will return error if data types are mismatching :default = True + writer_properties: Pass writer properties to the Rust parquet writer, see options + https://arrow.apache.org/rust/parquet/file/properties/struct.WriterProperties.html, + only the following fields are supported: `data_page_size_limit`, `dictionary_page_size_limit`, + `data_page_row_count_limit`, `write_batch_size`, `max_row_group_size`. + error_on_type_mismatch: specify if merge will return error if data types are mismatching, default = True Returns: the metrics from delete Examples: - Update some row values with SQL predicate. This is equivalent to - `UPDATE table SET deleted = true WHERE id = '5'` + Update some row values with SQL predicate. This is equivalent to `UPDATE table SET deleted = true WHERE id = '5'` ``` from deltalake import DeltaTable dt = DeltaTable("tmp") - dt.update( - predicate="id = '5'", - updates = {"deleted": True,} - ) + dt.update(predicate="id = '5'", updates = {"deleted": True}) ``` - Update all row values. This is equivalent to - `UPDATE table SET id = concat(id, '_old')`. + Update all row values. This is equivalent to `UPDATE table SET id = concat(id, '_old')`. 
``` from deltalake import DeltaTable dt = DeltaTable("tmp") - dt.update( - updates = { - "deleted": True, - "id": "concat(id, '_old')" - } - ) + dt.update(updates={"deleted": True, "id": "concat(id, '_old')"}) ``` """ - metrics = self._table.update( updates, predicate, writer_properties, safe_cast=not error_on_type_mismatch ) @@ -799,18 +790,18 @@ def get_add_actions(self, flatten: bool = False) -> pyarrow.RecordBatch: dt = DeltaTable("tmp") dt.get_add_actions().to_pandas() ``` - ``` - path size_bytes modification_time data_change partition_values num_records null_count min max + path size_bytes modification_time data_change partition_values num_records null_count min max 0 x=2/0-91820cbf-f698-45fb-886d-5d5f5669530b-0.p... 565 1970-01-20 08:40:08.071 True {'x': 2} 1 {'y': 0} {'y': 5} {'y': 5} 1 x=3/0-91820cbf-f698-45fb-886d-5d5f5669530b-0.p... 565 1970-01-20 08:40:08.071 True {'x': 3} 1 {'y': 0} {'y': 6} {'y': 6} 2 x=1/0-91820cbf-f698-45fb-886d-5d5f5669530b-0.p... 565 1970-01-20 08:40:08.071 True {'x': 1} 1 {'y': 0} {'y': 4} {'y': 4} ``` + ``` dt.get_add_actions(flatten=True).to_pandas() ``` ``` - path size_bytes modification_time data_change partition.x num_records null_count.y min.y max.y + path size_bytes modification_time data_change partition.x num_records null_count.y min.y max.y 0 x=2/0-91820cbf-f698-45fb-886d-5d5f5669530b-0.p... 565 1970-01-20 08:40:08.071 True 2 1 0 5 5 1 x=3/0-91820cbf-f698-45fb-886d-5d5f5669530b-0.p... 565 1970-01-20 08:40:08.071 True 3 1 0 6 6 2 x=1/0-91820cbf-f698-45fb-886d-5d5f5669530b-0.p... 565 1970-01-20 08:40:08.071 True 1 1 0 4 4 @@ -877,11 +868,11 @@ def with_writer_properties( """Pass writer properties to the Rust parquet writer, see options https://arrow.apache.org/rust/parquet/file/properties/struct.WriterProperties.html: Args: - data_page_size_limit (int|None, optional): Limit DataPage size to this in bytes. Defaults to None. - dictionary_page_size_limit (int|None, optional): Limit the size of each DataPage to store dicts to this amount in bytes. Defaults to None. - data_page_row_count_limit (int|None, optional): Limit the number of rows in each DataPage. Defaults to None. - write_batch_size (int|None, optional): Splits internally to smaller batch size. Defaults to None. - max_row_group_size (int|None, optional): Max number of rows in row group. Defaults to None. + data_page_size_limit (int|None, Optional): Limit DataPage size to this in bytes. Defaults to None. + dictionary_page_size_limit (int|None, Optional): Limit the size of each DataPage to store dicts to this amount in bytes. Defaults to None. + data_page_row_count_limit (int|None, Optional): Limit the number of rows in each DataPage. Defaults to None. + write_batch_size (int|None, Optional): Splits internally to smaller batch size. Defaults to None. + max_row_group_size (int|None, Optional): Max number of rows in row group. Defaults to None. Returns: TableMerger: TableMerger Object @@ -904,7 +895,7 @@ def when_matched_update( Args: updates (dict): a mapping of column name to update SQL expression. - predicate (str | None, optional): SQL like predicate on when to update. Defaults to None. + predicate (str | None, Optional): SQL like predicate on when to update. Defaults to None. Returns: TableMerger: TableMerger Object @@ -932,7 +923,7 @@ def when_matched_update_all(self, predicate: Optional[str] = None) -> "TableMerg If a ``predicate`` is specified, then it must evaluate to true for the row to be updated. 
Args: - predicate (str | None, optional): SQL like predicate on when to update all columns. Defaults to None. + predicate (str | None, Optional): SQL like predicate on when to update all columns. Defaults to None. Returns: TableMerger: TableMerger Object @@ -962,7 +953,7 @@ def when_matched_delete(self, predicate: Optional[str] = None) -> "TableMerger": true for the matched row. If not specified it deletes all matches. Args: - predicate (str | None, optional): SQL like predicate on when to delete. Defaults to None. + predicate (str | None, Optional): SQL like predicate on when to delete. Defaults to None. Returns: TableMerger: TableMerger Object @@ -1004,7 +995,7 @@ def when_not_matched_insert( Args: updates (dict): a mapping of column name to insert SQL expression. - predicate (str | None, optional): SQL like predicate on when to insert. Defaults to None. + predicate (str | None, Optional): SQL like predicate on when to insert. Defaults to None. Returns: TableMerger: TableMerger Object @@ -1037,7 +1028,7 @@ def when_not_matched_insert_all( the new row to be inserted. Args: - predicate (str | None, optional): SQL like predicate on when to insert. Defaults to None. + predicate (str | None, Optional): SQL like predicate on when to insert. Defaults to None. Returns: TableMerger: TableMerger Object @@ -1069,7 +1060,7 @@ def when_not_matched_by_source_update( Args: updates (dict): a mapping of column name to update SQL expression. - predicate (str | None, optional): SQL like predicate on when to update. Defaults to None. + predicate (str | None, Optional): SQL like predicate on when to update. Defaults to None. Returns: TableMerger: TableMerger Object @@ -1097,7 +1088,7 @@ def when_not_matched_by_source_delete( ``predicate`` (if specified) is true for the target row. Args: - predicate (str | None, optional): SQL like predicate on when to delete when not matched by source. Defaults to None. + predicate (str | None, Optional): SQL like predicate on when to delete when not matched by source. Defaults to None. Returns: TableMerger: TableMerger Object @@ -1113,7 +1104,7 @@ def execute(self) -> Dict[str, Any]: """Executes MERGE with the previously provided settings in Rust with Apache Datafusion query engine. Returns: - Dict[str, any]: metrics + Dict[str, Any]: metrics """ metrics = self.table._table.merge_execute( source=self.source, diff --git a/python/deltalake/writer.py b/python/deltalake/writer.py index 944c499dbc..1e7bc2ed87 100644 --- a/python/deltalake/writer.py +++ b/python/deltalake/writer.py @@ -1,11 +1,11 @@ import json +import sys import uuid from dataclasses import dataclass from datetime import date, datetime from decimal import Decimal from pathlib import Path from typing import ( - TYPE_CHECKING, Any, Dict, FrozenSet, @@ -23,11 +23,6 @@ from ._util import encode_partition_value -if TYPE_CHECKING: - import pandas as pd - -import sys - if sys.version_info >= (3, 8): from typing import Literal else: diff --git a/python/docs/source/conf.py b/python/docs/source/conf.py index cea081579d..e0c8e424b6 100644 --- a/python/docs/source/conf.py +++ b/python/docs/source/conf.py @@ -42,6 +42,7 @@ def get_release_version() -> str: # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. 
extensions = [ + "sphinx.ext.napoleon", "sphinx_rtd_theme", "sphinx.ext.autodoc", "sphinx.ext.intersphinx", diff --git a/python/docs/source/usage.rst b/python/docs/source/usage.rst index 569546dce8..5718c99533 100644 --- a/python/docs/source/usage.rst +++ b/python/docs/source/usage.rst @@ -193,14 +193,14 @@ Use :meth:`DeltaTable.schema` to retrieve the delta lake schema: Schema([Field(id, PrimitiveType("long"), nullable=True)]) These schemas have a JSON representation that can be retrieved. To reconstruct -from json, use :meth:`deltalake.schema.Schema.from_json()`. +from json, use `schema.Schema.from_json()`. .. code-block:: python >>> dt.schema().json() '{"type":"struct","fields":[{"name":"id","type":"long","nullable":true,"metadata":{}}]}' -Use :meth:`deltalake.schema.Schema.to_pyarrow()` to retrieve the PyArrow schema: +Use `deltalake.schema.Schema.to_pyarrow()` to retrieve the PyArrow schema: .. code-block:: python From 0adebda6e1b5c2a794c9f4b106e735eb75e7843b Mon Sep 17 00:00:00 2001 From: Niko Date: Sat, 21 Oct 2023 19:49:53 +0100 Subject: [PATCH 8/8] fix a docstring_section_style typo Co-authored-by: Robert Pack <42610831+roeap@users.noreply.github.com> --- mkdocs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs.yml b/mkdocs.yml index 2ae13250b7..432a8762e0 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -24,7 +24,7 @@ plugins: options: # docstring_style: sphinx docstring_section_style: table - ocstring_section_style: litabst + docstring_section_style: litabst filters: ["!^_", "^__init__$"] heading_level: 3 show_source: false
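With the rename of `json()` to `to_json()` settled in the docs above, the documented schema-inspection flow ends up as in this sketch (the table path is a placeholder):

``` python
from deltalake import DeltaTable
from deltalake.schema import Schema

dt = DeltaTable("tmp/my_table")

delta_schema = dt.schema()
print(delta_schema)               # Schema([Field(...), ...])
print(delta_schema.to_json())     # same JSON shape as a StructType
print(delta_schema.to_pyarrow())  # pyarrow.Schema

# The JSON form round-trips through Schema.from_json.
print(Schema.from_json(delta_schema.to_json()))
```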