feat(python): expose large_dtype param in merge (#2003)
# Description
This helps avoid this [error](#1998), since you can now set `large_dtypes=False`.

Once there is better type coercion upstream in arrow-rs, this param should be removable entirely from both the writer and the merge operation.
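
For context, here is a minimal sketch of opting out of large dtypes during a merge. The table path, schema, and predicate are hypothetical; the builder calls (`when_matched_update_all`, `when_not_matched_insert_all`, `execute`) are the existing `TableMerger` API:

```python
import pyarrow as pa
from deltalake import DeltaTable

# Hypothetical table path and source data, purely for illustration.
dt = DeltaTable("path/to/delta_table")
source = pa.table({"id": [1, 2], "value": ["a", "b"]})

(
    dt.merge(
        source=source,
        predicate="t.id = s.id",
        source_alias="s",
        target_alias="t",
        large_dtypes=False,  # new param: keep the source schema in regular (non-large) dtypes
    )
    .when_matched_update_all()
    .when_not_matched_insert_all()
    .execute()
)
```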
ion-elgreco authored Jan 2, 2024
1 parent 7add491 commit f54bb28
Showing 1 changed file with 7 additions and 5 deletions.
python/deltalake/table.py
@@ -808,6 +808,7 @@ def merge(
         target_alias: Optional[str] = None,
         error_on_type_mismatch: bool = True,
         writer_properties: Optional[WriterProperties] = None,
+        large_dtypes: bool = True,
     ) -> "TableMerger":
         """Pass the source data which you want to merge on the target delta table, providing a
         predicate in SQL query like format. You can also specify on what to do when the underlying data types do not
@@ -820,6 +821,7 @@ def merge(
             target_alias: Alias for the target table
             error_on_type_mismatch: specify if merge will return error if data types are mismatching :default = True
             writer_properties: Pass writer properties to the Rust parquet writer
+            large_dtypes: If True, the data schema is kept in large_dtypes.
 
         Returns:
             TableMerger: TableMerger Object
@@ -835,16 +837,16 @@ def merge(
         )
 
         if isinstance(source, pyarrow.RecordBatchReader):
-            source = convert_pyarrow_recordbatchreader(source, large_dtypes=True)
+            source = convert_pyarrow_recordbatchreader(source, large_dtypes)
         elif isinstance(source, pyarrow.RecordBatch):
-            source = convert_pyarrow_recordbatch(source, large_dtypes=True)
+            source = convert_pyarrow_recordbatch(source, large_dtypes)
         elif isinstance(source, pyarrow.Table):
-            source = convert_pyarrow_table(source, large_dtypes=True)
+            source = convert_pyarrow_table(source, large_dtypes)
         elif isinstance(source, ds.Dataset):
-            source = convert_pyarrow_dataset(source, large_dtypes=True)
+            source = convert_pyarrow_dataset(source, large_dtypes)
         elif isinstance(source, pandas.DataFrame):
             source = convert_pyarrow_table(
-                pyarrow.Table.from_pandas(source), large_dtypes=True
+                pyarrow.Table.from_pandas(source), large_dtypes
             )
         else:
             raise TypeError(
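
For readers unfamiliar with the term: "large" dtypes are Arrow's 64-bit-offset type variants (`large_string`, `large_binary`, `large_list`). A small, standalone pyarrow sketch of the distinction; these are plain pyarrow calls, not deltalake internals:

```python
import pyarrow as pa

# Regular string/binary arrays use 32-bit offsets; the "large" variants
# use 64-bit offsets and are distinct Arrow types, so a schema built with
# one variant will not match data carried in the other.
regular = pa.schema([("name", pa.string()), ("payload", pa.binary())])
large = pa.schema([("name", pa.large_string()), ("payload", pa.large_binary())])

tbl = pa.table({"name": ["a"], "payload": [b"x"]})  # inferred as regular types
tbl_large = tbl.cast(large)                         # lossless cast to the large variants
assert tbl_large.schema.equals(large)
```

As the diff suggests, `large_dtypes` is simply forwarded to the `convert_pyarrow_*` helpers, so passing `False` keeps the source in regular dtypes instead of casting it to the large variants, which is what avoids the type-mismatch error referenced above.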
