Skip to content

Commit

Permalink
fix: cardinality to return the number of unique values per dimension (#137)
Browse files Browse the repository at this point in the history
* fix cardinality to return the number of unique values per dimension

* incorporate PR feedback
  • Loading branch information
axiomofjoy authored Dec 30, 2022
1 parent 2e0df4b commit ead0a44
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 32 deletions.
7 changes: 3 additions & 4 deletions src/phoenix/metrics/cardinality.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,18 @@
"""

import concurrent.futures as cf
from typing import Any, Dict, List, Optional
from typing import Dict, List, Optional

import pandas as pd
from pandas.core.algorithms import value_counts


def cardinality(
df: pd.DataFrame, column_names: List[str], max_workers: Optional[int] = None
) -> Dict[str, "pd.Series[Any]"]:
) -> Dict[str, int]:
data = {}
with cf.ThreadPoolExecutor(max_workers=max_workers) as executor:
future_to_column_name = {
executor.submit(value_counts, df[col], dropna=False): col for col in column_names
executor.submit(lambda x: x.nunique(dropna=False), df[col]): col for col in column_names
}
for future in cf.as_completed(future_to_column_name):
column_name = future_to_column_name[future]
Expand Down
45 changes: 17 additions & 28 deletions tests/metrics/test_cardinality.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import random
import string
from typing import Dict, Union

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -71,45 +72,33 @@ def test_cardinality_produces_correct_counts_for_columns_of_various_data_types(
):
max_count = 30
value_to_count = {value: random.randint(1, max_count) for value in unique_values}
column, expected_counts_column = _get_data_column_and_expected_counts_column(value_to_count)
column, expected_cardinality = _get_data_column_and_expected_cardinality(value_to_count)
expected_cardinality_data = {"feature0": expected_cardinality}
input_df = pd.DataFrame.from_dict({"feature0": column})
output_data = cardinality(input_df, input_df.columns)
assert set(output_data.keys()) == set(input_df.columns)
output_counts_column = output_data["feature0"].sort_index(key=lambda x: x.astype("str"))
assert output_counts_column.equals(expected_counts_column)
cardinality_data = cardinality(input_df, input_df.columns)
assert cardinality_data == expected_cardinality_data


@pytest.mark.parametrize("unique_ints, unique_strings", [(4, 4), (6, 6), (9, 9)], indirect=True)
def test_cardinality_produces_correct_counts_for_dataframe_with_multiple_columns(
unique_ints, unique_strings
):
first_column, first_expected_counts_column = _get_data_column_and_expected_counts_column(
{value: (index + 1) ** 2 for index, value in enumerate(unique_ints)}
)
second_column, second_expected_counts_column = _get_data_column_and_expected_counts_column(
{value: (index + 1) ** 2 for index, value in enumerate(unique_strings)}
)
third_column = pd.Series(np.zeros(first_column.shape[0], dtype=np.int8)) # omitted column
def test_cardinality_produces_correct_counts_for_dataframe_with_multiple_columns():
first_column = pd.Series(["a", "b", "a", "a", "c"])
second_column = pd.Series([1, 2, 3, 4, 5], dtype=np.int8)
third_column = pd.Series([0, 0, 0, 0, 0], dtype=np.int8) # omitted column
input_df = pd.DataFrame.from_dict(
{"feature0": first_column, "feature1": second_column, "feature2": third_column}
)
column_names = ["feature0", "feature1"]
output_data = cardinality(input_df, column_names)
assert set(output_data.keys()) == set(column_names)
first_counts_column = output_data["feature0"].sort_index(key=lambda x: x.astype("str"))
second_counts_column = output_data["feature1"].sort_index(key=lambda x: x.astype("str"))
assert first_counts_column.equals(first_expected_counts_column)
assert second_counts_column.equals(second_expected_counts_column)
cardinality_data = cardinality(input_df, column_names)
assert cardinality_data == {
"feature0": 3,
"feature1": 5,
}


def _get_data_column_and_expected_counts_column(value_to_count):
def _get_data_column_and_expected_cardinality(value_to_count: Dict[Union[str, float], int]):
column = []
for value, count in value_to_count.items():
column.extend([value] * count)
random.shuffle(column)
column = pd.Series(column)
expected_column_values, expected_counts = zip(*value_to_count.items())
expected_column = pd.Series(expected_counts, index=expected_column_values).sort_index(
key=lambda x: x.astype("str")
)
return column, expected_column
expected_cardinality = len(value_to_count)
return column, expected_cardinality

0 comments on commit ead0a44

Please sign in to comment.