From 413d41e48d5ba5237456e8e7c74dcd3bf7781550 Mon Sep 17 00:00:00 2001 From: Stalin Sabu Thomas <44336699+legendof-selda@users.noreply.github.com> Date: Tue, 13 Jun 2023 01:44:32 +0530 Subject: [PATCH 1/6] perf: fix pandas PerformanceWarning caused due to `frame.insert` --- .../python/plotly/plotly/express/_core.py | 61 +++++++++++-------- 1 file changed, 37 insertions(+), 24 deletions(-) diff --git a/packages/python/plotly/plotly/express/_core.py b/packages/python/plotly/plotly/express/_core.py index cd51b86e15..38debea926 100644 --- a/packages/python/plotly/plotly/express/_core.py +++ b/packages/python/plotly/plotly/express/_core.py @@ -321,7 +321,6 @@ def make_trace_kwargs(args, trace_spec, trace_data, mapping_labels, sizeref): and args["y"] and len(trace_data[[args["x"], args["y"]]].dropna()) > 1 ): - # sorting is bad but trace_specs with "trendline" have no other attrs sorted_trace_data = trace_data.sort_values(by=args["x"]) y = sorted_trace_data[args["y"]].values @@ -562,7 +561,6 @@ def set_cartesian_axis_opts(args, axis, letter, orders): def configure_cartesian_marginal_axes(args, fig, orders): - if "histogram" in [args["marginal_x"], args["marginal_y"]]: fig.layout["barmode"] = "overlay" @@ -885,8 +883,8 @@ def make_trace_spec(args, constructor, attrs, trace_patch): def make_trendline_spec(args, constructor): trace_spec = TraceSpec( constructor=go.Scattergl - if constructor == go.Scattergl # could be contour - else go.Scatter, + if constructor == go.Scattergl + else go.Scatter, # could be contour attrs=["trendline"], trace_patch=dict(mode="lines"), marginal=None, @@ -1064,14 +1062,25 @@ def _escape_col_name(df_input, col_name, extra): return col_name -def to_unindexed_series(x): +def to_unindexed_series(x, name=None): """ - assuming x is list-like or even an existing pd.Series, return a new pd.Series with - no index, without extracting the data from an existing Series via numpy, which + assuming x is list-like or even an existing pd.Series, return a new pd.DataFrame + with no index, without extracting the data from an existing Series via numpy, which seems to mangle datetime columns. Stripping the index from existing pd.Series is - required to get things to match up right in the new DataFrame we're building + required to get things to match up right in the new DataFrame we're building. + It's converted to a frame so that it can be concated easily and it contains + `columns` attribute, so `_get_cols` can be used. """ - return pd.Series(x).reset_index(drop=True) + return pd.Series(x, name=name).reset_index(drop=True).to_frame() + + +def _get_cols(df_list): + """ + get all the columns in the current df_list. + Since this func is called when we raise error, the func is called once. + So inefficiency here can be tolerated. + """ + return [column for df in df_list for column in df.columns] def process_args_into_dataframe(args, wide_mode, var_name, value_name): @@ -1086,9 +1095,11 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name): df_input = args["data_frame"] df_provided = df_input is not None - df_output = pd.DataFrame() - constants = dict() - ranges = list() + # we use append it as list to avoid performance issues in pandas + # when dealing with large dataframes. + df_outputs = [] + constants = {} + ranges = [] wide_id_vars = set() reserved_names = _get_reserved_col_names(args) if df_provided else set() @@ -1099,7 +1110,7 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name): "No data were provided. Please provide data either with the `data_frame` or with the `dimensions` argument." ) else: - df_output[df_input.columns] = df_input[df_input.columns] + df_outputs.append(df_input[df_input.columns]) # hover_data is a dict hover_data_is_dict = ( @@ -1140,7 +1151,7 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name): # argument_list and field_list ready, iterate over them # Core of the loop starts here for i, (argument, field) in enumerate(zip(argument_list, field_list)): - length = len(df_output) + length = len(df_outputs[0]) if len(df_outputs) else 0 if argument is None: continue col_name = None @@ -1181,11 +1192,11 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name): % ( argument, len(real_argument), - str(list(df_output.columns)), + str(_get_cols(df_outputs)), length, ) ) - df_output[col_name] = to_unindexed_series(real_argument) + df_outputs.append(to_unindexed_series(real_argument, col_name)) elif not df_provided: raise ValueError( "String or int arguments are only possible when a " @@ -1214,13 +1225,13 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name): % ( field, len(df_input[argument]), - str(list(df_output.columns)), + str(_get_cols(df_outputs)), length, ) ) else: col_name = str(argument) - df_output[col_name] = to_unindexed_series(df_input[argument]) + df_outputs.append(to_unindexed_series(df_input[argument], col_name)) # ----------------- argument is likely a column / array / list.... ------- else: if df_provided and hasattr(argument, "name"): @@ -1247,9 +1258,9 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name): "All arguments should have the same length. " "The length of argument `%s` is %d, whereas the " "length of previously-processed arguments %s is %d" - % (field, len(argument), str(list(df_output.columns)), length) + % (field, len(argument), str(_get_cols(df_outputs)), length) ) - df_output[str(col_name)] = to_unindexed_series(argument) + df_outputs.append(to_unindexed_series(argument, str(col_name))) # Finally, update argument with column name now that column exists assert col_name is not None, ( @@ -1267,12 +1278,14 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name): if field_name != "wide_variable": wide_id_vars.add(str(col_name)) - for col_name in ranges: - df_output[col_name] = range(len(df_output)) + length = len(df_outputs[0]) + df_outputs.extend([pd.Series(range(length), name=col_name) for col_name in ranges]) - for col_name in constants: - df_output[col_name] = constants[col_name] + df_outputs.extend( + [pd.Series(constants[col_name], name=col_name) for col_name in constants] + ) + df_output = pd.concat(df_outputs, axis=1) return df_output, wide_id_vars From fbfd4a8e22aa2e3fca95c6f75c682a7228f39a5f Mon Sep 17 00:00:00 2001 From: Stalin Sabu Thomas <44336699+legendof-selda@users.noreply.github.com> Date: Tue, 13 Jun 2023 01:47:06 +0530 Subject: [PATCH 2/6] chore: fix flake8 and black maxlen not match --- .flake8 | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .flake8 diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000000..1d36346c0d --- /dev/null +++ b/.flake8 @@ -0,0 +1,2 @@ +[flake8] +max-line-length = 88 \ No newline at end of file From 3a4b466b041cabddc07b98de0d20ba7cdeb2cf48 Mon Sep 17 00:00:00 2001 From: Stalin Sabu Thomas <44336699+legendof-selda@users.noreply.github.com> Date: Tue, 13 Jun 2023 02:33:36 +0530 Subject: [PATCH 3/6] Revert "perf: fix pandas PerformanceWarning caused due to `frame.insert`" This reverts commit 413d41e48d5ba5237456e8e7c74dcd3bf7781550. --- .../python/plotly/plotly/express/_core.py | 61 ++++++++----------- 1 file changed, 24 insertions(+), 37 deletions(-) diff --git a/packages/python/plotly/plotly/express/_core.py b/packages/python/plotly/plotly/express/_core.py index 38debea926..cd51b86e15 100644 --- a/packages/python/plotly/plotly/express/_core.py +++ b/packages/python/plotly/plotly/express/_core.py @@ -321,6 +321,7 @@ def make_trace_kwargs(args, trace_spec, trace_data, mapping_labels, sizeref): and args["y"] and len(trace_data[[args["x"], args["y"]]].dropna()) > 1 ): + # sorting is bad but trace_specs with "trendline" have no other attrs sorted_trace_data = trace_data.sort_values(by=args["x"]) y = sorted_trace_data[args["y"]].values @@ -561,6 +562,7 @@ def set_cartesian_axis_opts(args, axis, letter, orders): def configure_cartesian_marginal_axes(args, fig, orders): + if "histogram" in [args["marginal_x"], args["marginal_y"]]: fig.layout["barmode"] = "overlay" @@ -883,8 +885,8 @@ def make_trace_spec(args, constructor, attrs, trace_patch): def make_trendline_spec(args, constructor): trace_spec = TraceSpec( constructor=go.Scattergl - if constructor == go.Scattergl - else go.Scatter, # could be contour + if constructor == go.Scattergl # could be contour + else go.Scatter, attrs=["trendline"], trace_patch=dict(mode="lines"), marginal=None, @@ -1062,25 +1064,14 @@ def _escape_col_name(df_input, col_name, extra): return col_name -def to_unindexed_series(x, name=None): +def to_unindexed_series(x): """ - assuming x is list-like or even an existing pd.Series, return a new pd.DataFrame - with no index, without extracting the data from an existing Series via numpy, which + assuming x is list-like or even an existing pd.Series, return a new pd.Series with + no index, without extracting the data from an existing Series via numpy, which seems to mangle datetime columns. Stripping the index from existing pd.Series is - required to get things to match up right in the new DataFrame we're building. - It's converted to a frame so that it can be concated easily and it contains - `columns` attribute, so `_get_cols` can be used. + required to get things to match up right in the new DataFrame we're building """ - return pd.Series(x, name=name).reset_index(drop=True).to_frame() - - -def _get_cols(df_list): - """ - get all the columns in the current df_list. - Since this func is called when we raise error, the func is called once. - So inefficiency here can be tolerated. - """ - return [column for df in df_list for column in df.columns] + return pd.Series(x).reset_index(drop=True) def process_args_into_dataframe(args, wide_mode, var_name, value_name): @@ -1095,11 +1086,9 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name): df_input = args["data_frame"] df_provided = df_input is not None - # we use append it as list to avoid performance issues in pandas - # when dealing with large dataframes. - df_outputs = [] - constants = {} - ranges = [] + df_output = pd.DataFrame() + constants = dict() + ranges = list() wide_id_vars = set() reserved_names = _get_reserved_col_names(args) if df_provided else set() @@ -1110,7 +1099,7 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name): "No data were provided. Please provide data either with the `data_frame` or with the `dimensions` argument." ) else: - df_outputs.append(df_input[df_input.columns]) + df_output[df_input.columns] = df_input[df_input.columns] # hover_data is a dict hover_data_is_dict = ( @@ -1151,7 +1140,7 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name): # argument_list and field_list ready, iterate over them # Core of the loop starts here for i, (argument, field) in enumerate(zip(argument_list, field_list)): - length = len(df_outputs[0]) if len(df_outputs) else 0 + length = len(df_output) if argument is None: continue col_name = None @@ -1192,11 +1181,11 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name): % ( argument, len(real_argument), - str(_get_cols(df_outputs)), + str(list(df_output.columns)), length, ) ) - df_outputs.append(to_unindexed_series(real_argument, col_name)) + df_output[col_name] = to_unindexed_series(real_argument) elif not df_provided: raise ValueError( "String or int arguments are only possible when a " @@ -1225,13 +1214,13 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name): % ( field, len(df_input[argument]), - str(_get_cols(df_outputs)), + str(list(df_output.columns)), length, ) ) else: col_name = str(argument) - df_outputs.append(to_unindexed_series(df_input[argument], col_name)) + df_output[col_name] = to_unindexed_series(df_input[argument]) # ----------------- argument is likely a column / array / list.... ------- else: if df_provided and hasattr(argument, "name"): @@ -1258,9 +1247,9 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name): "All arguments should have the same length. " "The length of argument `%s` is %d, whereas the " "length of previously-processed arguments %s is %d" - % (field, len(argument), str(_get_cols(df_outputs)), length) + % (field, len(argument), str(list(df_output.columns)), length) ) - df_outputs.append(to_unindexed_series(argument, str(col_name))) + df_output[str(col_name)] = to_unindexed_series(argument) # Finally, update argument with column name now that column exists assert col_name is not None, ( @@ -1278,14 +1267,12 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name): if field_name != "wide_variable": wide_id_vars.add(str(col_name)) - length = len(df_outputs[0]) - df_outputs.extend([pd.Series(range(length), name=col_name) for col_name in ranges]) + for col_name in ranges: + df_output[col_name] = range(len(df_output)) - df_outputs.extend( - [pd.Series(constants[col_name], name=col_name) for col_name in constants] - ) + for col_name in constants: + df_output[col_name] = constants[col_name] - df_output = pd.concat(df_outputs, axis=1) return df_output, wide_id_vars From f028571cbf1430e92fbfa48338763aff8a4b4bc3 Mon Sep 17 00:00:00 2001 From: Stalin Sabu Thomas <44336699+legendof-selda@users.noreply.github.com> Date: Tue, 13 Jun 2023 03:27:47 +0530 Subject: [PATCH 4/6] perf: fix pandas PerformanceWarning caused due to `frame.insert` --- .../python/plotly/plotly/express/_core.py | 50 +++++++++++-------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/packages/python/plotly/plotly/express/_core.py b/packages/python/plotly/plotly/express/_core.py index cd51b86e15..93ca17edcc 100644 --- a/packages/python/plotly/plotly/express/_core.py +++ b/packages/python/plotly/plotly/express/_core.py @@ -321,7 +321,6 @@ def make_trace_kwargs(args, trace_spec, trace_data, mapping_labels, sizeref): and args["y"] and len(trace_data[[args["x"], args["y"]]].dropna()) > 1 ): - # sorting is bad but trace_specs with "trendline" have no other attrs sorted_trace_data = trace_data.sort_values(by=args["x"]) y = sorted_trace_data[args["y"]].values @@ -562,7 +561,6 @@ def set_cartesian_axis_opts(args, axis, letter, orders): def configure_cartesian_marginal_axes(args, fig, orders): - if "histogram" in [args["marginal_x"], args["marginal_y"]]: fig.layout["barmode"] = "overlay" @@ -1064,14 +1062,14 @@ def _escape_col_name(df_input, col_name, extra): return col_name -def to_unindexed_series(x): +def to_unindexed_series(x, name=None): """ assuming x is list-like or even an existing pd.Series, return a new pd.Series with no index, without extracting the data from an existing Series via numpy, which seems to mangle datetime columns. Stripping the index from existing pd.Series is required to get things to match up right in the new DataFrame we're building """ - return pd.Series(x).reset_index(drop=True) + return pd.Series(x, name=name).reset_index(drop=True) def process_args_into_dataframe(args, wide_mode, var_name, value_name): @@ -1086,9 +1084,12 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name): df_input = args["data_frame"] df_provided = df_input is not None - df_output = pd.DataFrame() - constants = dict() - ranges = list() + # we use a dict instead of a dataframe directly so that it doesn't cause + # PerformanceWarning by pandas by repeatedly setting the columns. + # a dict is used instead of a list as the columns needs to be overwritten. + df_output = {} + constants = {} + ranges = [] wide_id_vars = set() reserved_names = _get_reserved_col_names(args) if df_provided else set() @@ -1099,7 +1100,7 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name): "No data were provided. Please provide data either with the `data_frame` or with the `dimensions` argument." ) else: - df_output[df_input.columns] = df_input[df_input.columns] + df_output = {col: series for col, series in df_input.items()} # hover_data is a dict hover_data_is_dict = ( @@ -1140,7 +1141,7 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name): # argument_list and field_list ready, iterate over them # Core of the loop starts here for i, (argument, field) in enumerate(zip(argument_list, field_list)): - length = len(df_output) + length = len(df_output[next(iter(df_output))]) if len(df_output) else 0 if argument is None: continue col_name = None @@ -1181,11 +1182,11 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name): % ( argument, len(real_argument), - str(list(df_output.columns)), + str(list(df_output.keys())), length, ) ) - df_output[col_name] = to_unindexed_series(real_argument) + df_output[col_name] = to_unindexed_series(real_argument, col_name) elif not df_provided: raise ValueError( "String or int arguments are only possible when a " @@ -1214,13 +1215,15 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name): % ( field, len(df_input[argument]), - str(list(df_output.columns)), + str(list(df_output.keys())), length, ) ) else: col_name = str(argument) - df_output[col_name] = to_unindexed_series(df_input[argument]) + df_output[col_name] = to_unindexed_series( + df_input[argument], col_name + ) # ----------------- argument is likely a column / array / list.... ------- else: if df_provided and hasattr(argument, "name"): @@ -1247,9 +1250,9 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name): "All arguments should have the same length. " "The length of argument `%s` is %d, whereas the " "length of previously-processed arguments %s is %d" - % (field, len(argument), str(list(df_output.columns)), length) + % (field, len(argument), str(list(df_output.keys())), length) ) - df_output[str(col_name)] = to_unindexed_series(argument) + df_output[str(col_name)] = to_unindexed_series(argument, str(col_name)) # Finally, update argument with column name now that column exists assert col_name is not None, ( @@ -1267,12 +1270,19 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name): if field_name != "wide_variable": wide_id_vars.add(str(col_name)) - for col_name in ranges: - df_output[col_name] = range(len(df_output)) - - for col_name in constants: - df_output[col_name] = constants[col_name] + length = len(df_output[next(iter(df_output))]) if len(df_output) else 0 + df_output.update( + {col_name: pd.Series(range(length), name=col_name) for col_name in ranges} + ) + df_output.update( + { + # constant is single value. repeat by len to avoid creating NaN on concating + col_name: pd.Series([constants[col_name]] * length, name=col_name) + for col_name in constants + } + ) + df_output = pd.DataFrame(df_output) return df_output, wide_id_vars From d4955b10ffa6bf124ad1f7f3ca11f1cb717f8718 Mon Sep 17 00:00:00 2001 From: Stalin Sabu Thomas <44336699+legendof-selda@users.noreply.github.com> Date: Tue, 13 Jun 2023 22:59:47 +0530 Subject: [PATCH 5/6] refactor: reuse to_unindexed_series --- packages/python/plotly/plotly/express/_core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/python/plotly/plotly/express/_core.py b/packages/python/plotly/plotly/express/_core.py index 93ca17edcc..d528937c10 100644 --- a/packages/python/plotly/plotly/express/_core.py +++ b/packages/python/plotly/plotly/express/_core.py @@ -1272,12 +1272,12 @@ def process_args_into_dataframe(args, wide_mode, var_name, value_name): length = len(df_output[next(iter(df_output))]) if len(df_output) else 0 df_output.update( - {col_name: pd.Series(range(length), name=col_name) for col_name in ranges} + {col_name: to_unindexed_series(range(length), col_name) for col_name in ranges} ) df_output.update( { # constant is single value. repeat by len to avoid creating NaN on concating - col_name: pd.Series([constants[col_name]] * length, name=col_name) + col_name: to_unindexed_series([constants[col_name]] * length, col_name) for col_name in constants } ) From 97b214eba83e4234d2719cddfc1a75c0fec8e4d1 Mon Sep 17 00:00:00 2001 From: Stalin Sabu Thomas <44336699+legendof-selda@users.noreply.github.com> Date: Tue, 25 Jul 2023 13:02:28 +0530 Subject: [PATCH 6/6] ci: add test for perf warning --- .../test_optional/test_px/test_px_wide.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/packages/python/plotly/plotly/tests/test_optional/test_px/test_px_wide.py b/packages/python/plotly/plotly/tests/test_optional/test_px/test_px_wide.py index 9aef665760..1aac7b70ea 100644 --- a/packages/python/plotly/plotly/tests/test_optional/test_px/test_px_wide.py +++ b/packages/python/plotly/plotly/tests/test_optional/test_px/test_px_wide.py @@ -1,9 +1,11 @@ import plotly.express as px import plotly.graph_objects as go import pandas as pd +import numpy as np from plotly.express._core import build_dataframe, _is_col_list from pandas.testing import assert_frame_equal import pytest +import warnings def test_is_col_list(): @@ -847,3 +849,29 @@ def test_line_group(): assert len(fig.data) == 4 fig = px.scatter(df, x="x", y=["miss", "score"], color="who") assert len(fig.data) == 2 + + +def test_no_pd_perf_warning(): + n_cols = 1000 + n_rows = 1000 + + columns = list(f"col_{c}" for c in range(n_cols)) + index = list(f"i_{r}" for r in range(n_rows)) + + df = pd.DataFrame( + np.random.uniform(size=(n_rows, n_cols)), index=index, columns=columns + ) + + with warnings.catch_warnings(record=True) as warn_list: + _ = px.bar( + df, + x=df.index, + y=df.columns[:-2], + labels=df.columns[:-2], + ) + performance_warnings = [ + warn + for warn in warn_list + if issubclass(warn.category, pd.errors.PerformanceWarning) + ] + assert len(performance_warnings) == 0, "PerformanceWarning(s) raised!"