diff --git a/frontend/package.json b/frontend/package.json
index e52f59b222d2..2c16614cba18 100644
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -79,6 +79,7 @@
     "moment-timezone": "^0.5.40",
     "node-emoji": "^1.11.0",
     "numbro": "^2.3.6",
+    "parquet-wasm": "^0.4.0",
    "plotly.js": "^2.18.1",
     "prismjs": "^1.29.0",
     "protobufjs": "^7.2.0",
diff --git a/frontend/src/lib/components/widgets/DataFrame/columns/utils.ts b/frontend/src/lib/components/widgets/DataFrame/columns/utils.ts
index d803ea1837dd..71246237a26f 100644
--- a/frontend/src/lib/components/widgets/DataFrame/columns/utils.ts
+++ b/frontend/src/lib/components/widgets/DataFrame/columns/utils.ts
@@ -274,6 +274,14 @@ export function toSafeArray(data: any): any[] {
     return [data]
   }
 
+  if (data instanceof Uint8Array) {
+    // Stlite: Uint8Array is used for any list data in fastparquet.
+    // It stores a JSON string representation in the Uint8Array,
+    // so we need to decode it to a string first
+    // so that it can later be parsed as JSON.
+    data = new TextDecoder("utf-8").decode(data)
+  }
+
   if (typeof data === "string") {
     if (data === "") {
       // Empty string
diff --git a/frontend/src/lib/dataframes/Quiver.ts b/frontend/src/lib/dataframes/Quiver.ts
index 30c78df3b0d3..cce72a44748b 100644
--- a/frontend/src/lib/dataframes/Quiver.ts
+++ b/frontend/src/lib/dataframes/Quiver.ts
@@ -36,6 +36,27 @@ import numbro from "numbro"
 import { IArrow, Styler as StylerProto } from "src/lib/proto"
 import { notNullOrUndefined } from "src/lib/util/utils"
 
+import type { readParquet as readParquetType } from "parquet-wasm"
+
+// Stlite: Use Parquet to bypass the Arrow implementation, which is unavailable in the Wasm Python environment.
+// See https://github.com/whitphx/stlite/issues/509#issuecomment-1657957887
+// NOTE: Async import is necessary for the `parquet-wasm` package to work.
+// If it's imported statically, the following error is thrown when `readParquet` is called:
+// `TypeError: Cannot read properties of undefined (reading '__wbindgen_add_to_stack_pointer')`
+// Ref: https://github.com/kylebarron/parquet-wasm/issues/27
+// HACK: Strictly speaking, there is no guarantee that the async-imported `readParquet`
+// function below will be ready by the time it's called in the `Quiver` class's constructor,
+// but it seems to work fine in practice.
+let readParquet: typeof readParquetType | undefined = undefined
+setTimeout(() =>
+  // `setTimeout()` is required for this lazy loading to work in the mountable package,
+  // where `__webpack_public_path__` is set at runtime: it ensures that
+  // this `import()` runs after `__webpack_public_path__` has been patched.
+  import("parquet-wasm").then(parquet => {
+    readParquet = parquet.readParquet
+  })
+)
+
 /** Data types used by ArrowJS. */
 export type DataType =
   | null
@@ -384,7 +405,9 @@ export class Quiver {
   private readonly _styler?: Styler
 
   constructor(element: IArrow) {
-    const table = tableFromIPC(element.data)
+    const table = tableFromIPC(
+      element.data ? readParquet!(element.data) : element.data
+    )
     const schema = Quiver.parseSchema(table)
     const rawColumns = Quiver.getRawColumns(schema)
     const fields = Quiver.parseFields(table.schema)
@@ -937,7 +960,9 @@ but was expecting \`${JSON.stringify(expectedIndexTypes)}\`.
   public static getTypeName(type: Type): IndexTypeName | string {
     // For `PeriodType` and `IntervalType` types are kept in `numpy_type`,
     // for the rest of the indexes in `pandas_type`.
-    return type.pandas_type === "object" ? type.numpy_type : type.pandas_type
+    const typeName =
+      type.pandas_type === "object" ? type.numpy_type : type.pandas_type
+    return typeName.toLowerCase().trim()
   }
 
   /** Takes the data and it's type and nicely formats it. */
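
The `toSafeArray()` change above mirrors, on the JS side, how fastparquet ships list cells as JSON-encoded bytes. Here is the same decode step sketched in Python terms; the payload below is hypothetical, standing in for the `Uint8Array` the frontend actually receives:

```python
import json

# Hypothetical cell payload: what arrives on the JS side as a Uint8Array
# when fastparquet JSON-encodes a list column.
raw = b'["a", "b"]'

# Equivalent of `new TextDecoder("utf-8").decode(data)` followed by the
# JSON parsing that toSafeArray() performs later:
value = json.loads(raw.decode("utf-8"))
assert value == ["a", "b"]
```
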
diff --git a/lib/setup.py b/lib/setup.py
index 489056d4489c..bd9664120a34 100644
--- a/lib/setup.py
+++ b/lib/setup.py
@@ -78,6 +78,9 @@
 if not os.getenv("SNOWPARK_CONDA_BUILD"):
     INSTALL_REQUIRES.extend(SNOWPARK_CONDA_EXCLUDED_DEPENDENCIES)
 
+# stlite: See https://github.com/whitphx/stlite/issues/509#issuecomment-1657957887
+INSTALL_REQUIRES.extend(["fastparquet"])
+
 EXTRA_REQUIRES = {"snowflake": ["snowflake-snowpark-python; python_version=='3.8'"]}
diff --git a/lib/streamlit/elements/data_editor.py b/lib/streamlit/elements/data_editor.py
index 36bad8594936..872caa45c486 100644
--- a/lib/streamlit/elements/data_editor.py
+++ b/lib/streamlit/elements/data_editor.py
@@ -772,14 +772,17 @@ def data_editor(
     for column in disabled:
         update_column_config(column_config_mapping, column, {"disabled": True})
 
-    # Convert the dataframe to an arrow table which is used as the main
-    # serialization format for sending the data to the frontend.
-    # We also utilize the arrow schema to determine the data kinds of every column.
-    arrow_table = pa.Table.from_pandas(data_df)
-
-    # Determine the dataframe schema which is required for parsing edited values
-    # and for checking type compatibilities.
-    dataframe_schema = determine_dataframe_schema(data_df, arrow_table.schema)
+    # stlite: Don't use Arrow.
+    # # Convert the dataframe to an arrow table which is used as the main
+    # # serialization format for sending the data to the frontend.
+    # # We also utilize the arrow schema to determine the data kinds of every column.
+    # arrow_table = pa.Table.from_pandas(data_df)
+
+    # stlite: arrow_table.schema can't be used as Arrow is not available.
+    # # Determine the dataframe schema which is required for parsing edited values
+    # # and for checking type compatibilities.
+    # dataframe_schema = determine_dataframe_schema(data_df, arrow_table.schema)
+    dataframe_schema = determine_dataframe_schema(data_df, None)
 
     # Check if all configured column types are compatible with the underlying data.
     # Throws an exception if any of the configured types are incompatible.
@@ -815,7 +818,9 @@ def data_editor(
         default_uuid = str(hash(delta_path))
         marshall_styler(proto, data, default_uuid)
 
-    proto.data = type_util.pyarrow_table_to_bytes(arrow_table)
+    # stlite: Don't use Arrow. `type_util.data_frame_to_bytes` is polyfilled to use Parquet instead for stlite.
+    # proto.data = type_util.pyarrow_table_to_bytes(arrow_table)
+    proto.data = type_util.data_frame_to_bytes(data_df)
 
     marshall_column_config(proto, column_config_mapping)
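
With the two `data_editor` changes above, `proto.data` now carries a Parquet file instead of an Arrow IPC stream. A minimal round-trip sketch of that wire format (assumes `fastparquet` is installed; the file name is illustrative and not part of the patch):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
# Same engine the patched data_frame_to_bytes() uses:
df.to_parquet("payload.parquet", engine="fastparquet")

with open("payload.parquet", "rb") as f:
    payload = f.read()

# Parquet files begin (and end) with the b"PAR1" magic bytes; the frontend
# decodes this payload with parquet-wasm's readParquet() and feeds the
# resulting Arrow IPC data to tableFromIPC().
assert payload[:4] == b"PAR1"
```
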
diff --git a/lib/streamlit/elements/lib/column_config_utils.py b/lib/streamlit/elements/lib/column_config_utils.py
index aa0b452c5d56..a25f7162f54e 100644
--- a/lib/streamlit/elements/lib/column_config_utils.py
+++ b/lib/streamlit/elements/lib/column_config_utils.py
@@ -20,6 +20,7 @@
 
 import pandas as pd
 import pyarrow as pa
+from pandas.api.types import infer_dtype, is_categorical_dtype
 from typing_extensions import Final, Literal, TypeAlias
 
 from streamlit.elements.lib.column_types import ColumnConfig, ColumnType
@@ -347,7 +348,7 @@ def _determine_data_kind(
 
 
 def determine_dataframe_schema(
-    data_df: pd.DataFrame, arrow_schema: pa.Schema
+    data_df: pd.DataFrame, arrow_schema: Optional[pa.Schema]
 ) -> DataframeSchema:
     """Determine the schema of a dataframe.
@@ -376,7 +377,7 @@ def determine_dataframe_schema(
     for i, column in enumerate(data_df.items()):
         column_name, column_data = column
         dataframe_schema[column_name] = _determine_data_kind(
-            column_data, arrow_schema.field(i)
+            column_data, arrow_schema.field(i) if arrow_schema else None
         )
     return dataframe_schema
@@ -475,11 +476,65 @@ def apply_data_specific_configs(
     """
     # Deactivate editing for columns that are not compatible with arrow
     if check_arrow_compatibility:
+        # Stlite: Fix non-string column names (not supported by fastparquet):
+        if infer_dtype(data_df.columns) != "string":
+            data_df.columns = data_df.columns.astype("string")
+
         for column_name, column_data in data_df.items():
+            # Stlite: Configure column types for some aspects that do not work
+            # out of the box with the Parquet serialization.
+            if column_name not in columns_config:
+                if is_categorical_dtype(column_data.dtype):
+                    update_column_config(
+                        columns_config,
+                        column_name,
+                        {
+                            "type_config": {
+                                "type": "selectbox",
+                                "options": column_data.cat.categories.tolist(),
+                            },
+                        },
+                    )
+                if column_data.dtype == "object":
+                    inferred_type = infer_dtype(column_data, skipna=True)
+                    if inferred_type in ["string", "empty"]:
+                        update_column_config(
+                            columns_config,
+                            column_name,
+                            {"type_config": {"type": "text"}},
+                        )
+                    elif inferred_type == "boolean":
+                        update_column_config(
+                            columns_config,
+                            column_name,
+                            {"type_config": {"type": "checkbox"}},
+                        )
+                    elif inferred_type == "date":
+                        data_df[column_name] = pd.to_datetime(
+                            column_data.astype("string"), errors="coerce"
+                        )
+                        column_data = data_df[column_name]
+                        update_column_config(
+                            columns_config,
+                            column_name,
+                            {"type_config": {"type": "date"}},
+                        )
+                        continue
+                    elif inferred_type == "time":
+                        data_df[column_name] = pd.to_datetime(
+                            column_data.astype("string"), errors="coerce"
+                        )
+                        column_data = data_df[column_name]
+                        update_column_config(
+                            columns_config,
+                            column_name,
+                            {"type_config": {"type": "time"}},
+                        )
+
             if is_colum_type_arrow_incompatible(column_data):
                 update_column_config(columns_config, column_name, {"disabled": True})
                 # Convert incompatible type to string
-                data_df[column_name] = column_data.astype(str)
+                data_df[column_name] = column_data.astype("string")
 
     # Pandas adds a range index as default to all datastructures
     # but for most of the non-pandas data objects it is unnecessary
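
The branching above keys off pandas dtype probing rather than an Arrow schema. A small sketch of the probes it relies on (pure pandas, no Streamlit imports; the asserted values reflect standard `infer_dtype` behavior):

```python
import datetime

import pandas as pd
from pandas.api.types import infer_dtype, is_categorical_dtype

# Categorical columns become selectbox configs with the categories as options.
assert is_categorical_dtype(pd.Categorical(["red", "blue"]).dtype)

# Object columns are dispatched on their inferred element type.
assert infer_dtype(pd.Series(["a", "b"]), skipna=True) == "string"
assert infer_dtype(pd.Series([True, None], dtype="object"), skipna=True) == "boolean"
assert infer_dtype(pd.Series([datetime.date(2023, 1, 1)]), skipna=True) == "date"
assert infer_dtype(pd.Series([datetime.time(12, 30)]), skipna=True) == "time"
```
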
diff --git a/lib/streamlit/type_util.py b/lib/streamlit/type_util.py
index c95543aa0136..6846e5e28904 100644
--- a/lib/streamlit/type_util.py
+++ b/lib/streamlit/type_util.py
@@ -17,6 +17,7 @@
 from __future__ import annotations
 
 import contextlib
+import io
 import re
 import types
 from enum import Enum, auto
@@ -40,7 +41,13 @@
 import numpy as np
 import pyarrow as pa
 from pandas import DataFrame, Index, MultiIndex, Series
-from pandas.api.types import infer_dtype, is_dict_like, is_list_like
+from pandas.api.types import (
+    infer_dtype,
+    is_dict_like,
+    is_interval_dtype,
+    is_list_like,
+    is_period_dtype,
+)
 from typing_extensions import Final, Literal, Protocol, TypeAlias, TypeGuard, get_args
 
 import streamlit as st
@@ -658,6 +665,14 @@ def is_colum_type_arrow_incompatible(column: Union[Series, Index]) -> bool:
     ]:
         return True
 
+    # Stlite: not supported by fastparquet:
+    if is_interval_dtype(column.dtype):
+        return True
+
+    # Stlite: not supported by fastparquet:
+    if is_period_dtype(column.dtype):
+        return True
+
     if column.dtype == "object":
         # The dtype of mixed type columns is always object, the actual type of the column
         # values can be determined via the infer_dtype function:
@@ -669,6 +684,10 @@ def is_colum_type_arrow_incompatible(column: Union[Series, Index]) -> bool:
             "complex",
             "timedelta",
             "timedelta64",
+            # Stlite: not supported by fastparquet (as object types):
+            "date",
+            "time",
+            "datetime",
         ]:
             return True
         elif inferred_type == "mixed":
@@ -688,6 +707,10 @@ def is_colum_type_arrow_incompatible(column: Union[Series, Index]) -> bool:
                 or is_dict_like(first_value)
                 # Frozensets are list-like, but are not compatible with pyarrow.
                 or isinstance(first_value, frozenset)
+                # Stlite: not supported by fastparquet:
+                or isinstance(first_value, set)
+                or isinstance(first_value, tuple)
+                or infer_dtype(first_value, skipna=True) in ["datetime"]
             ):
                 # This seems to be an incompatible list-like type
                 return True
@@ -725,7 +748,7 @@ def fix_arrow_incompatible_column_types(
         if is_colum_type_arrow_incompatible(df[col]):
             if df_copy is None:
                 df_copy = df.copy()
-            df_copy[col] = df[col].astype(str)
+            df_copy[col] = df[col].astype("string")
 
     # The index can also contain mixed types
     # causing Arrow issues during conversion.
@@ -740,12 +763,33 @@ def fix_arrow_incompatible_column_types(
     ):
         if df_copy is None:
             df_copy = df.copy()
-        df_copy.index = df.index.astype(str)
+        df_copy.index = df.index.astype("string")
+
+    # Stlite: fastparquet does not support non-string column names:
+    if infer_dtype(df.columns) != "string":
+        if df_copy is None:
+            df_copy = df.copy()
+        df_copy.columns = df.columns.astype("string")
+
     return df_copy if df_copy is not None else df
 
 
+# `pd.DataFrame.to_parquet()` always closes the file handle,
+# but we need to keep it open to retrieve the written data,
+# so we use this custom class to prevent the buffer from being closed.
+# https://github.com/dask/fastparquet/issues/868
+class UnclosableBytesIO(io.BytesIO):
+    def close(self):
+        pass
+
+    def really_close(self):
+        super().close()
+
+
 def data_frame_to_bytes(df: DataFrame) -> bytes:
-    """Serialize pandas.DataFrame to bytes using Apache Arrow.
+    """Serialize pandas.DataFrame to bytes using Apache ~~Arrow~~ Parquet.
+
+    This function is customized from the original one to use Parquet instead of
+    Arrow for stlite. See https://github.com/whitphx/stlite/issues/509
 
     Parameters
     ----------
@@ -753,17 +797,23 @@ def data_frame_to_bytes(df: DataFrame) -> bytes:
         A dataframe to convert.
 
     """
+    buf = UnclosableBytesIO()
+
     try:
-        table = pa.Table.from_pandas(df)
-    except (pa.ArrowTypeError, pa.ArrowInvalid, pa.ArrowNotImplementedError) as ex:
+        df.to_parquet(buf, engine="fastparquet")
+    except ValueError as ex:
         _LOGGER.info(
-            "Serialization of dataframe to Arrow table was unsuccessful due to: %s. "
+            "Serialization of dataframe to Parquet table was unsuccessful due to: %s. "
             "Applying automatic fixes for column types to make the dataframe Arrow-compatible.",
             ex,
         )
         df = fix_arrow_incompatible_column_types(df)
-        table = pa.Table.from_pandas(df)
-    return pyarrow_table_to_bytes(table)
+        df.to_parquet(buf, engine="fastparquet")
+
+    data = buf.getvalue()
+    buf.really_close()
+
+    return data
 
 
 def bytes_to_data_frame(source: bytes) -> DataFrame:
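
A standalone sketch of why `UnclosableBytesIO` is needed: `DataFrame.to_parquet()` closes the handle it writes to (see the fastparquet issue linked above), and `getvalue()` raises on a closed plain `BytesIO`, so the override swallows the close until the data has been read out:

```python
import io

class UnclosableBytesIO(io.BytesIO):
    def close(self):
        # Swallow the close() that DataFrame.to_parquet() issues.
        pass

    def really_close(self):
        super().close()

buf = UnclosableBytesIO()
buf.write(b"PAR1...")  # stand-in for the Parquet bytes
buf.close()            # no-op: the buffer stays readable
assert buf.getvalue() == b"PAR1..."
buf.really_close()     # actually releases the buffer
```
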