Skip to content

Commit

Permalink
fix pandas pivot null values
Browse files Browse the repository at this point in the history
  • Loading branch information
eschutho committed Aug 9, 2024
1 parent e77f755 commit 0e44172
Show file tree
Hide file tree
Showing 2 changed files with 344 additions and 1 deletion.
5 changes: 4 additions & 1 deletion superset/charts/post_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from io import StringIO
from typing import Any, Optional, TYPE_CHECKING, Union

import numpy as np
import pandas as pd
from flask_babel import gettext as __

Expand Down Expand Up @@ -150,6 +151,8 @@ def pivot_df( # pylint: disable=too-many-locals, too-many-arguments, too-many-s
if show_rows_total:
# add subtotal for each group and overall total; we start from the
# overall group, and iterate deeper into subgroups
# Ensure "NULL" strings are replaced with NaN
df.replace("NULL", np.nan, inplace=True)
groups = df.columns
for level in range(df.columns.nlevels):
subgroups = {group[:level] for group in groups}
Expand All @@ -171,7 +174,7 @@ def pivot_df( # pylint: disable=too-many-locals, too-many-arguments, too-many-s
for subgroup in subgroups:
slice_ = df.index.get_loc(subgroup)
subtotal = pivot_v2_aggfunc_map[aggfunc](
df.iloc[slice_, :].apply(pd.to_numeric), axis=0
df.iloc[slice_, :].apply(pd.to_numeric, errors="coerce"), axis=0
)
depth = df.index.nlevels - len(subgroup) - 1
total = metric_name if level == 0 else __("Subtotal")
Expand Down
340 changes: 340 additions & 0 deletions tests/unit_tests/charts/test_post_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,346 @@ def test_pivot_df_single_row_two_metrics():
)


def test_pivot_df_single_row_null_values():
    """
    Pivot table with a single row dimension and 2 metrics, where one row is
    missing (``None``) for both metrics.
    """
    # NOTE(review): cell padding in the expected tables is narrower than their
    # separator rows imply; confirm the literals against actual
    # `to_markdown()` output.

    # Both metrics are missing for "boy". They must be real nulls (not the
    # "NULL" string) so that the source frame renders them as `nan`, which is
    # what the first assertion below expects.
    df = pd.DataFrame.from_dict(
        {
            "gender": {0: "girl", 1: "boy"},
            "SUM(num)": {0: 118065, 1: None},
            "MAX(num)": {0: 2588, 1: None},
        }
    )
    assert (
        df.to_markdown()
        == """
| | gender | SUM(num) | MAX(num) |
|---:|:---------|-----------:|-----------:|
| 0 | girl | 118065 | 2588 |
| 1 | boy | nan | nan |
""".strip()
    )

    # plain pivot, no totals
    pivoted = pivot_df(
        df,
        rows=["gender"],
        columns=[],
        metrics=["SUM(num)", "MAX(num)"],
        aggfunc="Sum",
        transpose_pivot=False,
        combine_metrics=False,
        show_rows_total=False,
        show_columns_total=False,
        apply_metrics_on_rows=False,
    )
    assert (
        pivoted.to_markdown()
        == """
| | ('SUM(num)',) | ('MAX(num)',) |
|:----------|:----------------|:----------------|
| ('boy',) | NULL | NULL |
| ('girl',) | 118065.0 | 2588.0 |
""".strip()
    )

    # transpose_pivot
    pivoted = pivot_df(
        df,
        rows=["gender"],
        columns=[],
        metrics=["SUM(num)", "MAX(num)"],
        aggfunc="Sum",
        transpose_pivot=True,
        combine_metrics=False,
        show_rows_total=False,
        show_columns_total=False,
        apply_metrics_on_rows=False,
    )
    assert (
        pivoted.to_markdown()
        == """
| | ('SUM(num)', 'boy') | ('SUM(num)', 'girl') | ('MAX(num)', 'boy') | ('MAX(num)', 'girl') |
|:-----------------|:----------------------|-----------------------:|:----------------------|-----------------------:|
| ('Total (Sum)',) | NULL | 118065 | NULL | 2588 |
""".strip()
    )

    # combine_metrics does nothing in this case
    pivoted = pivot_df(
        df,
        rows=["gender"],
        columns=[],
        metrics=["SUM(num)", "MAX(num)"],
        aggfunc="Sum",
        transpose_pivot=False,
        combine_metrics=True,
        show_rows_total=False,
        show_columns_total=False,
        apply_metrics_on_rows=False,
    )
    assert (
        pivoted.to_markdown()
        == """
| | ('SUM(num)',) | ('MAX(num)',) |
|:----------|:----------------|:----------------|
| ('boy',) | NULL | NULL |
| ('girl',) | 118065.0 | 2588.0 |
""".strip()
    )

    # show totals
    pivoted = pivot_df(
        df,
        rows=["gender"],
        columns=[],
        metrics=["SUM(num)", "MAX(num)"],
        aggfunc="Sum",
        transpose_pivot=False,
        combine_metrics=False,
        show_rows_total=True,
        show_columns_total=True,
        apply_metrics_on_rows=False,
    )
    assert (
        pivoted.to_markdown()
        == """
| | ('SUM(num)',) | ('MAX(num)',) | ('Total (Sum)',) |
|:-----------------|----------------:|----------------:|-------------------:|
| ('boy',) | nan | nan | 0 |
| ('girl',) | 118065 | 2588 | 120653 |
| ('Total (Sum)',) | 118065 | 2588 | 120653 |
""".strip()
    )

    # apply_metrics_on_rows
    pivoted = pivot_df(
        df,
        rows=["gender"],
        columns=[],
        metrics=["SUM(num)", "MAX(num)"],
        aggfunc="Sum",
        transpose_pivot=False,
        combine_metrics=False,
        show_rows_total=True,
        show_columns_total=True,
        apply_metrics_on_rows=True,
    )
    assert (
        pivoted.to_markdown()
        == f"""
| | ('{_("Total")} (Sum)',) |
|:-------------------------|-------------------:|
| ('SUM(num)', 'boy') | nan |
| ('SUM(num)', 'girl') | 118065 |
| ('SUM(num)', 'Subtotal') | 118065 |
| ('MAX(num)', 'boy') | nan |
| ('MAX(num)', 'girl') | 2588 |
| ('MAX(num)', 'Subtotal') | 2588 |
| ('{_("Total")} (Sum)', '') | 120653 |
""".strip()
    )

    # apply_metrics_on_rows with combine_metrics
    pivoted = pivot_df(
        df,
        rows=["gender"],
        columns=[],
        metrics=["SUM(num)", "MAX(num)"],
        aggfunc="Sum",
        transpose_pivot=False,
        combine_metrics=True,
        show_rows_total=True,
        show_columns_total=True,
        apply_metrics_on_rows=True,
    )
    assert (
        pivoted.to_markdown()
        == f"""
| | ('{_("Total")} (Sum)',) |
|:---------------------|-------------------:|
| ('boy', 'SUM(num)') | nan |
| ('boy', 'MAX(num)') | nan |
| ('boy', 'Subtotal') | 0 |
| ('girl', 'SUM(num)') | 118065 |
| ('girl', 'MAX(num)') | 2588 |
| ('girl', 'Subtotal') | 120653 |
| ('{_("Total")} (Sum)', '') | 120653 |
""".strip()
    )


def test_pivot_df_single_row_null_mix_values():
    """
    Pivot table with a single row dimension and 2 metrics, where the missing
    values are a mix of the literal ``"NULL"`` string and ``None``.
    """
    # NOTE(review): cell padding in the expected tables is narrower than their
    # separator rows imply; confirm the literals against actual
    # `to_markdown()` output.

    # "boy" has a "NULL" string in one metric and a real null in the other.
    df = pd.DataFrame.from_dict(
        {
            "gender": {0: "girl", 1: "boy"},
            "SUM(num)": {0: 118065, 1: "NULL"},
            "MAX(num)": {0: 2588, 1: None},
        }
    )

    def render(*, transpose, combine, totals, metrics_on_rows):
        # Every scenario shares the same rows/columns/metrics/aggfunc and
        # toggles row and column totals together; only the flags vary.
        return pivot_df(
            df,
            rows=["gender"],
            columns=[],
            metrics=["SUM(num)", "MAX(num)"],
            aggfunc="Sum",
            transpose_pivot=transpose,
            combine_metrics=combine,
            show_rows_total=totals,
            show_columns_total=totals,
            apply_metrics_on_rows=metrics_on_rows,
        ).to_markdown()

    # sanity-check the input frame itself
    assert (
        df.to_markdown()
        == """
| | gender | SUM(num) | MAX(num) |
|---:|:---------|:-----------|-----------:|
| 0 | girl | 118065 | 2588 |
| 1 | boy | NULL | nan |
""".strip()
    )

    # plain pivot, no totals
    assert (
        render(transpose=False, combine=False, totals=False, metrics_on_rows=False)
        == """
| | ('SUM(num)',) | ('MAX(num)',) |
|:----------|:----------------|:----------------|
| ('boy',) | NULL | NULL |
| ('girl',) | 118065 | 2588.0 |
""".strip()
    )

    # transpose_pivot
    assert (
        render(transpose=True, combine=False, totals=False, metrics_on_rows=False)
        == """
| | ('SUM(num)', 'boy') | ('SUM(num)', 'girl') | ('MAX(num)', 'boy') | ('MAX(num)', 'girl') |
|:-----------------|:----------------------|-----------------------:|:----------------------|-----------------------:|
| ('Total (Sum)',) | NULL | 118065 | NULL | 2588 |
""".strip()
    )

    # combine_metrics does nothing in this case
    assert (
        render(transpose=False, combine=True, totals=False, metrics_on_rows=False)
        == """
| | ('SUM(num)',) | ('MAX(num)',) |
|:----------|:----------------|:----------------|
| ('boy',) | NULL | NULL |
| ('girl',) | 118065 | 2588.0 |
""".strip()
    )

    # show totals
    assert (
        render(transpose=False, combine=False, totals=True, metrics_on_rows=False)
        == """
| | ('SUM(num)',) | ('MAX(num)',) | ('Total (Sum)',) |
|:-----------------|----------------:|----------------:|-------------------:|
| ('boy',) | nan | nan | 0 |
| ('girl',) | 118065 | 2588 | 120653 |
| ('Total (Sum)',) | 118065 | 2588 | 120653 |
""".strip()
    )

    # apply_metrics_on_rows
    assert (
        render(transpose=False, combine=False, totals=True, metrics_on_rows=True)
        == f"""
| | ('{_("Total")} (Sum)',) |
|:-------------------------|-------------------:|
| ('SUM(num)', 'boy') | nan |
| ('SUM(num)', 'girl') | 118065 |
| ('SUM(num)', 'Subtotal') | 118065 |
| ('MAX(num)', 'boy') | nan |
| ('MAX(num)', 'girl') | 2588 |
| ('MAX(num)', 'Subtotal') | 2588 |
| ('{_("Total")} (Sum)', '') | 120653 |
""".strip()
    )

    # apply_metrics_on_rows with combine_metrics
    assert (
        render(transpose=False, combine=True, totals=True, metrics_on_rows=True)
        == f"""
| | ('{_("Total")} (Sum)',) |
|:---------------------|-------------------:|
| ('boy', 'SUM(num)') | nan |
| ('boy', 'MAX(num)') | nan |
| ('boy', 'Subtotal') | 0 |
| ('girl', 'SUM(num)') | 118065 |
| ('girl', 'MAX(num)') | 2588 |
| ('girl', 'Subtotal') | 120653 |
| ('{_("Total")} (Sum)', '') | 120653 |
""".strip()
    )


def test_pivot_df_complex():
"""
Pivot table when a column, rows and 2 metrics are selected.
Expand Down

0 comments on commit 0e44172

Please sign in to comment.