diff --git a/frontend/lib/src/components/widgets/DataFrame/columns/utils.ts b/frontend/lib/src/components/widgets/DataFrame/columns/utils.ts
index f64adba5d649e..022af0d9a88c5 100644
--- a/frontend/lib/src/components/widgets/DataFrame/columns/utils.ts
+++ b/frontend/lib/src/components/widgets/DataFrame/columns/utils.ts
@@ -282,6 +282,14 @@ export function toSafeArray(data: any): any[] {
     return [data]
   }
 
+  if (data instanceof Uint8Array) {
+    // Stlite: fastparquet stores list data as a JSON string
+    // serialized into a Uint8Array, so decode the bytes to a
+    // string first so that the value can be parsed as JSON
+    // below.
+    data = new TextDecoder("utf-8").decode(data)
+  }
+
   if (typeof data === "string") {
     if (data === "") {
       // Empty string
diff --git a/frontend/lib/src/dataframes/Quiver.ts b/frontend/lib/src/dataframes/Quiver.ts
index 5c417e873e385..01093db2fb4ac 100644
--- a/frontend/lib/src/dataframes/Quiver.ts
+++ b/frontend/lib/src/dataframes/Quiver.ts
@@ -41,6 +41,27 @@ import { IArrow, Styler as StylerProto } from "@streamlit/lib/src/proto"
 import { notNullOrUndefined } from "@streamlit/lib/src/util/utils"
 import { logWarning } from "@streamlit/lib/src/util/log"
 
+import type { readParquet as readParquetType } from "parquet-wasm"
+
+// Stlite: Use Parquet to bypass Arrow IPC, which is unavailable in the Wasm Python environment.
+// See https://github.com/whitphx/stlite/issues/509#issuecomment-1657957887
+// NOTE: The async import is necessary for the `parquet-wasm` package to work;
+// if it's imported statically, calling `readParquet` throws
+// `TypeError: Cannot read properties of undefined (reading '__wbindgen_add_to_stack_pointer')`.
+// Ref: https://github.com/kylebarron/parquet-wasm/issues/27
+// HACK: Strictly speaking, there is no guarantee that the async-imported `readParquet`
+// will be ready by the time the `Quiver` constructor calls it,
+// but it works fine in practice.
+let readParquet: typeof readParquetType | undefined = undefined
+setTimeout(() =>
+  // `setTimeout()` is required for this lazy loading to work in the mountable package,
+  // where `__webpack_public_path__` is set at runtime: it ensures that the `import()`
+  // runs only after `__webpack_public_path__` has been patched.
+  import("parquet-wasm").then(parquet => {
+    readParquet = parquet.readParquet
+  })
+)
+
 /** Data types used by ArrowJS. */
 export type DataType =
   | null
@@ -435,7 +456,9 @@ export class Quiver {
   private readonly _styler?: Styler
 
   constructor(element: IArrow) {
-    const table = tableFromIPC(element.data)
+    const table = tableFromIPC(
+      element.data ? readParquet!(element.data) : element.data
+    )
     const schema = Quiver.parseSchema(table)
     const rawColumns = Quiver.getRawColumns(schema)
     const fields = Quiver.parseFields(table.schema)
@@ -1049,7 +1072,9 @@ but was expecting \`${JSON.stringify(expectedIndexTypes)}\`.
   public static getTypeName(type: Type): IndexTypeName | string {
     // For `PeriodType` and `IntervalType` types are kept in `numpy_type`,
     // for the rest of the indexes in `pandas_type`.
-    return type.pandas_type === "object" ? type.numpy_type : type.pandas_type
+    const typeName =
+      type.pandas_type === "object" ? type.numpy_type : type.pandas_type
+    return typeName.toLowerCase().trim()
   }
 
   /** Takes the data and it's type and nicely formats it. */
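Why the `Uint8Array` branch in `toSafeArray` is needed: fastparquet encodes list-typed object columns as JSON strings, and Arrow-based Parquet readers such as parquet-wasm surface those values as raw bytes. A minimal Python sketch of that round trip (assuming fastparquet's default `object_encoding="infer"` picks JSON for list values; depending on the reader, the value may come back as `bytes` or `str`):

```python
import json
import tempfile

import pandas as pd
import pyarrow.parquet as pq

df = pd.DataFrame({"tags": [["a", "b"], ["c"]]})

with tempfile.NamedTemporaryFile(suffix=".parquet") as tmp:
    # fastparquet writes the list column as a JSON-encoded byte array.
    df.to_parquet(tmp.name, engine="fastparquet")
    # Read it back with an Arrow-based reader, as parquet-wasm does in the browser.
    table = pq.read_table(tmp.name)

value = table.column("tags")[0].as_py()
if isinstance(value, (bytes, bytearray)):
    # This mirrors the frontend's `new TextDecoder("utf-8").decode(data)`.
    value = value.decode("utf-8")
assert json.loads(value) == ["a", "b"]
```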
*/ diff --git a/lib/setup.py b/lib/setup.py index ebd2badb1f5ab..90dc85397ac47 100644 --- a/lib/setup.py +++ b/lib/setup.py @@ -69,6 +69,9 @@ if not os.getenv("SNOWPARK_CONDA_BUILD"): INSTALL_REQUIRES.extend(SNOWPARK_CONDA_EXCLUDED_DEPENDENCIES) +# stlite: See https://github.com/whitphx/stlite/issues/509#issuecomment-1657957887 +INSTALL_REQUIRES.extend(["fastparquet"]) + EXTRA_REQUIRES = { "snowflake": [ "snowflake-snowpark-python>=0.9.0; python_version<'3.12'", diff --git a/lib/streamlit/dataframe_util.py b/lib/streamlit/dataframe_util.py index ffcf6a286352e..275232eae0f26 100644 --- a/lib/streamlit/dataframe_util.py +++ b/lib/streamlit/dataframe_util.py @@ -17,6 +17,7 @@ from __future__ import annotations import contextlib +import io import math from enum import Enum, EnumMeta, auto from typing import ( @@ -367,8 +368,22 @@ def convert_arrow_table_to_arrow_bytes(table: pa.Table) -> bytes: return cast(bytes, sink.getvalue().to_pybytes()) +# `pd.DataFrame.to_parquet()` always closes the file handle, +# but we need to keep it open to get the written data. +# So we use this custom class to prevent the closing. +# https://github.com/dask/fastparquet/issues/868 +class UnclosableBytesIO(io.BytesIO): + def close(self): + pass + + def really_close(self): + super().close() + + def convert_pandas_df_to_arrow_bytes(df: DataFrame) -> bytes: """Serialize pandas.DataFrame to Arrow IPC bytes. + This function is customized from the original one to use Parquet instead of Arrow + for stlite. See https://github.com/whitphx/stlite/issues/509 Parameters ---------- @@ -380,20 +395,24 @@ def convert_pandas_df_to_arrow_bytes(df: DataFrame) -> bytes: bytes The serialized Arrow IPC bytes. """ - import pyarrow as pa + buf = UnclosableBytesIO() try: - table = pa.Table.from_pandas(df) - except (pa.ArrowTypeError, pa.ArrowInvalid, pa.ArrowNotImplementedError) as ex: + df.to_parquet(buf, engine="fastparquet") + except (ValueError, TypeError) as ex: _LOGGER.info( - "Serialization of dataframe to Arrow table was unsuccessful due to: %s. " + "Serialization of dataframe to Parquet table was unsuccessful due to: %s. 
" "Applying automatic fixes for column types to make the dataframe " "Arrow-compatible.", ex, ) df = fix_arrow_incompatible_column_types(df) - table = pa.Table.from_pandas(df) - return convert_arrow_table_to_arrow_bytes(table) + df.to_parquet(buf, engine="fastparquet") + + data = buf.getvalue() + buf.really_close() + + return data def convert_arrow_bytes_to_pandas_df(source: bytes) -> DataFrame: @@ -580,6 +599,7 @@ def _maybe_truncate_table( def is_colum_type_arrow_incompatible(column: Series[Any] | Index) -> bool: """Return True if the column type is known to cause issues during Arrow conversion.""" + import pandas as pd from pandas.api.types import infer_dtype, is_dict_like, is_list_like if column.dtype.kind in [ @@ -598,6 +618,14 @@ def is_colum_type_arrow_incompatible(column: Series[Any] | Index) -> bool: }: return True + # Stlite: not supported by fastparquet: + if isinstance(column.dtype, pd.IntervalDtype): + return True + + # Stlite: not supported by fastparquet: + if isinstance(column.dtype, pd.PeriodDtype): + return True + if column.dtype == "object": # The dtype of mixed type columns is always object, the actual type of the column # values can be determined via the infer_dtype function: @@ -607,6 +635,10 @@ def is_colum_type_arrow_incompatible(column: Series[Any] | Index) -> bool: if inferred_type in [ "mixed-integer", "complex", + # Stlite: not supported by fastparquet (as object types): + "date", + "time", + "datetime", ]: return True elif inferred_type == "mixed": @@ -626,6 +658,10 @@ def is_colum_type_arrow_incompatible(column: Series[Any] | Index) -> bool: or is_dict_like(first_value) # Frozensets are list-like, but are not compatible with pyarrow. or isinstance(first_value, frozenset) + # Stlite: not supported by fastparquet: + or isinstance(first_value, set) + or isinstance(first_value, tuple) + or infer_dtype(first_value, skipna=True) in ["datetime"] ): # This seems to be an incompatible list-like type return True @@ -658,6 +694,7 @@ def fix_arrow_incompatible_column_types( The fixed dataframe. """ import pandas as pd + from pandas.api.types import infer_dtype # Make a copy, but only initialize if necessary to preserve memory. df_copy: DataFrame | None = None @@ -681,6 +718,13 @@ def fix_arrow_incompatible_column_types( if df_copy is None: df_copy = df.copy() df_copy.index = df.index.astype("string") + + # Stlite: fastparquet does not support non-string column names: + if infer_dtype(df.columns) != "string": + if df_copy is None: + df_copy = df.copy() + df_copy.columns = df.columns.astype("string") + return df_copy if df_copy is not None else df diff --git a/lib/streamlit/elements/lib/column_config_utils.py b/lib/streamlit/elements/lib/column_config_utils.py index 8e39d3f0f4a86..2d8e72b0ffd2f 100644 --- a/lib/streamlit/elements/lib/column_config_utils.py +++ b/lib/streamlit/elements/lib/column_config_utils.py @@ -361,7 +361,7 @@ def _determine_data_kind( def determine_dataframe_schema( - data_df: DataFrame, arrow_schema: pa.Schema + data_df: DataFrame, arrow_schema: pa.Schema | None ) -> DataframeSchema: """Determine the schema of a dataframe. @@ -390,7 +390,7 @@ def determine_dataframe_schema( for i, column in enumerate(data_df.items()): column_name, column_data = column dataframe_schema[column_name] = _determine_data_kind( - column_data, arrow_schema.field(i) + column_data, arrow_schema.field(i) if arrow_schema else None ) return dataframe_schema @@ -490,10 +490,65 @@ def apply_data_specific_configs( Whether to check if the data is compatible with arrow. 
""" import pandas as pd + from pandas.api.types import infer_dtype, is_categorical_dtype # Deactivate editing for columns that are not compatible with arrow if check_arrow_compatibility: + # Stlite: Fix non-string column names (not supported by fastparquet): + if infer_dtype(data_df.columns) != "string": + data_df.columns = data_df.columns.astype("string") + for column_name, column_data in data_df.items(): + # Stlite: Configure column types for some aspects + # that are not working out of the box with the parquet serialization. + if column_name not in columns_config: + if is_categorical_dtype(column_data.dtype): + update_column_config( + columns_config, + column_name, + { + "type_config": { + "type": "selectbox", + "options": column_data.cat.categories.tolist(), + }, + }, + ) + if column_data.dtype == "object": + inferred_type = infer_dtype(column_data, skipna=True) + if inferred_type in ["string", "empty"]: + update_column_config( + columns_config, + column_name, + {"type_config": {"type": "text"}}, + ) + elif inferred_type == "boolean": + update_column_config( + columns_config, + column_name, + {"type_config": {"type": "checkbox"}}, + ) + elif inferred_type == "date": + data_df[column_name] = pd.to_datetime( + column_data.astype("string"), errors="coerce" + ) + column_data = data_df[column_name] + update_column_config( + columns_config, + column_name, + {"type_config": {"type": "date"}}, + ) + continue + elif inferred_type == "time": + data_df[column_name] = pd.to_datetime( + column_data.astype("string"), errors="coerce" + ) + column_data = data_df[column_name] + update_column_config( + columns_config, + column_name, + {"type_config": {"type": "time"}}, + ) + if is_colum_type_arrow_incompatible(column_data): update_column_config(columns_config, column_name, {"disabled": True}) # Convert incompatible type to string diff --git a/lib/streamlit/elements/widgets/data_editor.py b/lib/streamlit/elements/widgets/data_editor.py index d9241790b80c2..2a89bca5c8d92 100644 --- a/lib/streamlit/elements/widgets/data_editor.py +++ b/lib/streamlit/elements/widgets/data_editor.py @@ -777,7 +777,6 @@ def data_editor( """ # Lazy-loaded import import pandas as pd - import pyarrow as pa key = to_key(key) @@ -841,20 +840,25 @@ def data_editor( for column in disabled: update_column_config(column_config_mapping, column, {"disabled": True}) - # Convert the dataframe to an arrow table which is used as the main - # serialization format for sending the data to the frontend. - # We also utilize the arrow schema to determine the data kinds of every column. - arrow_table = pa.Table.from_pandas(data_df) + # stlite: Don't use Arrow + # # Convert the dataframe to an arrow table which is used as the main + # # serialization format for sending the data to the frontend. + # # We also utilize the arrow schema to determine the data kinds of every column. + # arrow_table = pa.Table.from_pandas(data_df) - # Determine the dataframe schema which is required for parsing edited values - # and for checking type compatibilities. - dataframe_schema = determine_dataframe_schema(data_df, arrow_table.schema) + # stlite: arrow_table.schema can't be used as Arrow is not available. + # # Determine the dataframe schema which is required for parsing edited values + # # and for checking type compatibilities. + # dataframe_schema = determine_dataframe_schema(data_df, arrow_table.schema) + dataframe_schema = determine_dataframe_schema(data_df, None) # Check if all configured column types are compatible with the underlying data. 
@@ -864,7 +868,8 @@
     id = compute_widget_id(
         "data_editor",
         user_key=key,
-        data=arrow_bytes,
+        # data=arrow_bytes,
+        data=df_bytes,
         width=width,
         height=height,
         use_container_width=use_container_width,
@@ -915,7 +920,9 @@
         data.set_uuid(styler_uuid)
         marshall_styler(proto, data, styler_uuid)
 
-    proto.data = arrow_bytes
+    # stlite: Don't use Arrow; `dataframe_util.convert_arrow_table_to_arrow_bytes` is polyfilled to use Parquet for stlite.
+    # proto.data = arrow_bytes
+    proto.data = df_bytes
 
     marshall_column_config(proto, column_config_mapping)
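End to end, the backend now puts Parquet bytes into `proto.data`, and the frontend's `Quiver` constructor rehydrates them with `readParquet` (Parquet to Arrow IPC) followed by `tableFromIPC`. That pipeline can be emulated in Python to sanity-check what the browser receives, with pyarrow standing in for parquet-wasm:

```python
import tempfile

import pandas as pd
import pyarrow.parquet as pq

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

with tempfile.NamedTemporaryFile(suffix=".parquet") as tmp:
    # Backend: serialize with fastparquet, as the patched
    # convert_pandas_df_to_arrow_bytes() does for proto.data.
    df.to_parquet(tmp.name, engine="fastparquet")
    # Frontend: parquet-wasm's readParquet() + tableFromIPC(), emulated here.
    table = pq.read_table(tmp.name)

print(table.to_pandas())
```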