From e8252ebcc65547c38b37f1c63522d6445b7f4cea Mon Sep 17 00:00:00 2001 From: Brayan Jules Date: Sun, 22 Oct 2023 02:16:01 -0300 Subject: [PATCH 1/3] doc: documented dynamodb lock configuration in python deltalake write function --- python/deltalake/writer.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/python/deltalake/writer.py b/python/deltalake/writer.py index db399e857e..a43718322c 100644 --- a/python/deltalake/writer.py +++ b/python/deltalake/writer.py @@ -104,6 +104,24 @@ def write_deltalake( Note that this function does NOT register this table in a data catalog. + A locking mechanism is needed to prevent unsafe concurrent writes to a + delta lake directory when writing to S3. DynamoDB is the only available + locking provider at the moment in delta-rs. To enable DynamoDB as the + locking provider, you need to set the `AWS_S3_LOCKING_PROVIDER` to 'dynamodb' + in `storage_options` or as an environment variable. + + Additionally, you must create a DynamoDB table with the name 'delta_rs_lock_table' + so that it can be automatically discovered by delta-rs. Alternatively, you can + use a table name of your choice, but you must set the `DYNAMO_LOCK_TABLE_NAME` + variable to match your chosen table name. The required schema for the DynamoDB + table is as follows: + + - Key Schema: AttributeName=key, KeyType=HASH + - Attribute Definitions: AttributeName=key, AttributeType=S + + Please note that this locking mechanism is not compatible with any other + locking mechanisms, including the one used by Spark. + :param table_or_uri: URI of a table or a DeltaTable object. :param data: Data to write. If passing iterable, the schema must also be given. :param schema: Optional schema to write. 
From 29611aeacd0366f84abab2277d8a7c860d21f72e Mon Sep 17 00:00:00 2001 From: Brayan Jules Date: Sun, 22 Oct 2023 02:16:01 -0300 Subject: [PATCH 2/3] doc: documented dynamodb lock configuration in python deltalake write function --- python/deltalake/writer.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/python/deltalake/writer.py b/python/deltalake/writer.py index db399e857e..a43718322c 100644 --- a/python/deltalake/writer.py +++ b/python/deltalake/writer.py @@ -104,6 +104,24 @@ def write_deltalake( Note that this function does NOT register this table in a data catalog. + A locking mechanism is needed to prevent unsafe concurrent writes to a + delta lake directory when writing to S3. DynamoDB is the only available + locking provider at the moment in delta-rs. To enable DynamoDB as the + locking provider, you need to set the `AWS_S3_LOCKING_PROVIDER` to 'dynamodb' + in `storage_options` or as an environment variable. + + Additionally, you must create a DynamoDB table with the name 'delta_rs_lock_table' + so that it can be automatically discovered by delta-rs. Alternatively, you can + use a table name of your choice, but you must set the `DYNAMO_LOCK_TABLE_NAME` + variable to match your chosen table name. The required schema for the DynamoDB + table is as follows: + + - Key Schema: AttributeName=key, KeyType=HASH + - Attribute Definitions: AttributeName=key, AttributeType=S + + Please note that this locking mechanism is not compatible with any other + locking mechanisms, including the one used by Spark. + :param table_or_uri: URI of a table or a DeltaTable object. :param data: Data to write. If passing iterable, the schema must also be given. :param schema: Optional schema to write. 
From 95f6a626be10c862a2183a2dffcd1d7c58491bad Mon Sep 17 00:00:00 2001 From: Brayan Jules Date: Sun, 22 Oct 2023 03:15:00 -0300 Subject: [PATCH 3/3] doc: documented dynamodb lock configuration in python usage documentation --- python/docs/source/usage.rst | 50 ++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/python/docs/source/usage.rst b/python/docs/source/usage.rst index 569546dce8..e1a5e96b3b 100644 --- a/python/docs/source/usage.rst +++ b/python/docs/source/usage.rst @@ -483,6 +483,56 @@ to append pass in ``mode='append'``: the data passed to it differs from the existing table's schema. If you wish to alter the schema as part of an overwrite pass in ``overwrite_schema=True``. +Writing to S3 +~~~~~~~~~~~~~ + +A locking mechanism is needed to prevent unsafe concurrent writes to a +delta lake directory when writing to S3. DynamoDB is the only available +locking provider at the moment in delta-rs. To enable DynamoDB as the +locking provider, you need to set the **AWS_S3_LOCKING_PROVIDER** to 'dynamodb' +in ``storage_options`` or as an environment variable. + +Additionally, you must create a DynamoDB table with the name ``delta_rs_lock_table`` +so that it can be automatically recognized by delta-rs. Alternatively, you can +use a table name of your choice, but you must set the **DYNAMO_LOCK_TABLE_NAME** +variable to match your chosen table name. The required schema for the DynamoDB +table is as follows: + +.. code-block:: json + + + { + "AttributeDefinitions": [ + { + "AttributeName": "key", + "AttributeType": "S" + } + ], + "TableName": "delta_rs_lock_table", + "KeySchema": [ + { + "AttributeName": "key", + "KeyType": "HASH" + } + ] + } + +Here is an example writing to S3 using this mechanism: + +.. 
code-block:: python + + >>> from deltalake import write_deltalake + >>> df = pd.DataFrame({'x': [1, 2, 3]}) + >>> storage_options = {'AWS_S3_LOCKING_PROVIDER': 'dynamodb', 'DYNAMO_LOCK_TABLE_NAME': 'custom_table_name'} + >>> write_deltalake('s3://path/to/table', df, storage_options=storage_options) + +.. note:: + If for some reason you don't want to use DynamoDB as your locking mechanism, you can + choose to set the ``AWS_S3_ALLOW_UNSAFE_RENAME`` variable to ``true`` in order to enable + S3 unsafe writes. + +Please note that this locking mechanism is not compatible with any other +locking mechanisms, including the one used by Spark. Updating Delta Tables ---------------------