Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: percent empty #118

Merged
merged 9 commits into from
Feb 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion app/schema.graphql
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ type Cluster {

# Metrics accepted by the `metric` argument of `Dimension.dataQualityMetric`.
enum DataQualityMetric {
  cardinality
  # Share of empty entries recorded for the dimension.
  percentEmpty
}

type Dataset {
Expand All @@ -21,7 +22,7 @@ type Dimension implements Node {
name: String!
type: DimensionType!
dataType: DimensionDataType!
dataQualityMetric(metric: DataQualityMetric!): Int
dataQualityMetric(metric: DataQualityMetric!): Float
}

type DimensionConnection {
Expand Down
5 changes: 5 additions & 0 deletions app/src/components/model/ModelSchemaTable.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ export function ModelSchemaTable(props: ModelSchemaTableProps) {
type
dataType
cardinality: dataQualityMetric(metric: cardinality)
percentEmpty: dataQualityMetric(metric: percentEmpty)
}
}
}
Expand Down Expand Up @@ -64,6 +65,10 @@ export function ModelSchemaTable(props: ModelSchemaTableProps) {
Header: "Cardinality",
accessor: "cardinality",
},
{
Header: "Percent Empty",
accessor: "percentEmpty",
},
];
return cols;
}, []);
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 16 additions & 3 deletions app/src/pages/__generated__/HomeQuery.graphql.ts

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 0 additions & 4 deletions src/phoenix/metrics/cardinality.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@
"""
Cardinality metrics
"""

import concurrent.futures as cf
from typing import Dict, List, Optional

Expand Down
14 changes: 14 additions & 0 deletions src/phoenix/metrics/percent_empty.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from typing import Dict, List, Optional

from pandas import DataFrame


def percent_empty(dataframe: DataFrame, column_names: List[str]) -> Dict[str, Optional[float]]:
    """
    Returns a map of the dataframe column names to the fraction of empty
    (null) entries in each column.

    Despite the name, values are fractions in the range [0.0, 1.0] (null
    count divided by row count), not percentages in the range [0, 100].

    Args:
        dataframe: The dataframe whose columns are inspected.
        column_names: The columns to report on. Only these columns appear in
            the result.

    Returns:
        A dictionary keyed by column name. Every value is None when the
        dataframe has no rows, because the fraction is undefined in that case;
        otherwise each value is a built-in float.
    """
    num_records = dataframe.shape[0]
    if num_records == 0:
        # Avoid dividing by zero: an empty dataframe has no meaningful fraction.
        return {col: None for col in column_names}
    null_counts = dataframe[column_names].isnull().sum()
    # Convert the numpy integer counts to built-in floats so the values match
    # the declared return type instead of leaking numpy scalars to callers.
    return {col: float(count) / num_records for col, count in null_counts.items()}
13 changes: 13 additions & 0 deletions src/phoenix/server/api/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,19 @@
from phoenix.core import DimensionDataType
from phoenix.core.model import Model
from phoenix.metrics.cardinality import cardinality
from phoenix.metrics.percent_empty import percent_empty


@dataclass
class Loaders:
    """
    Bundle of the DataLoader instances constructed by `create_loaders`.

    Each loader is keyed by a column name (str).
    """

    # Loads the cardinality metric for a column name; the value may be None.
    cardinality: DataLoader[str, Optional[int]]
    # Loads the percent-empty metric for a column name; the value may be None.
    percent_empty: DataLoader[str, Optional[float]]


def create_loaders(model: Model) -> Loaders:
    """
    Builds one dataloader per data quality metric for the given model and
    returns them bundled in a `Loaders` instance.
    """
    cardinality_loader = _get_cardinality_dataloader(model=model)
    percent_empty_loader = _get_percent_empty_dataloader(model=model)
    return Loaders(
        cardinality=cardinality_loader,
        percent_empty=percent_empty_loader,
    )


Expand All @@ -39,3 +42,13 @@ async def _cardinality_load_function(column_names: List[str]) -> List[Optional[i
return [column_name_to_cardinality[col] for col in column_names]

return DataLoader(load_fn=_cardinality_load_function)


def _get_percent_empty_dataloader(model: Model) -> DataLoader[str, Optional[float]]:
    """
    Creates a dataloader that resolves column names to the percent-empty
    metric computed over the model's primary dataset.
    """

    async def _percent_empty_load_function(column_names: List[str]) -> List[Optional[float]]:
        # Compute the metric for all requested columns in a single call, then
        # return the values in the same order as the requested names.
        primary_dataframe = model.primary_dataset.dataframe
        metric_by_column = percent_empty(
            dataframe=primary_dataframe,
            column_names=column_names,
        )
        return [metric_by_column[name] for name in column_names]

    return DataLoader(load_fn=_percent_empty_load_function)
Comment on lines +47 to +54
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

might not be worth a dataloader tbh

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will address in a separate pr

1 change: 1 addition & 0 deletions src/phoenix/server/api/types/DataQualityMetric.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@
@strawberry.enum
class DataQualityMetric(Enum):
    # Metrics that can be requested through the `dataQualityMetric` field of a
    # dimension. Each member's value is identical to its name.

    # Served by the `cardinality` dataloader.
    cardinality = "cardinality"
    # Served by the `percent_empty` dataloader.
    percentEmpty = "percentEmpty"
7 changes: 5 additions & 2 deletions src/phoenix/server/api/types/Dimension.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,12 @@ class Dimension(Node):
@strawberry.field
async def dataQualityMetric(
self, metric: DataQualityMetric, info: Info[Context, None]
) -> Optional[int]:
) -> Optional[float]:
dimension_name = self.name
if metric is DataQualityMetric.cardinality:
return await info.context.loaders.cardinality.load(self.name)
return await info.context.loaders.cardinality.load(dimension_name)
elif metric is DataQualityMetric.percentEmpty:
return await info.context.loaders.percent_empty.load(dimension_name)
raise NotImplementedError(f"Metric {metric} is not implemented.")


Expand Down
41 changes: 41 additions & 0 deletions tests/metrics/drift/test_percent_empty.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from numpy.testing import assert_array_almost_equal
from pandas import DataFrame

from phoenix.metrics.percent_empty import percent_empty


def test_percent_empty_returns_correct_percents_including_for_empty_and_full_columns():
    """
    Checks the computed values for a fully empty column, two partially empty
    columns (numeric and string), and a column with no empty entries.
    """
    expected_column_names = ["col0", "col1", "col2", "col3"]
    expected_column_name_to_percent_empty = {
        "col0": 1.0,
        "col1": 2 / 3,
        "col2": 1 / 3,
        "col3": 0.0,
    }
    frame = DataFrame(
        {
            "col0": [None, None, None],
            "col1": [1.0, None, None],
            "col2": ["string-entry", None, "another-string-entry"],
            "col3": [0.1, 0.2, 0.3],
        }
    )

    actual = percent_empty(dataframe=frame, column_names=["col0", "col1", "col2", "col3"])

    assert expected_column_names == sorted(actual.keys())
    actual_values = [actual[name] for name in expected_column_names]
    expected_values = [
        expected_column_name_to_percent_empty[name] for name in expected_column_names
    ]
    assert_array_almost_equal(actual_values, expected_values)


def test_percent_empty_returns_only_input_columns():
    """
    Checks that columns present in the dataframe but not requested are left
    out of the result.
    """
    frame = DataFrame({"col0": [1, 2, None], "col1": [1.0, None, None]})
    requested_columns = ["col0"]

    result = percent_empty(dataframe=frame, column_names=requested_columns)

    assert ["col0"] == list(result.keys())