Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve dtypes benchmark #2393

Draft
wants to merge 36 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
287da18
skip unsupported data types
rwedge Feb 18, 2025
ae05291
use excluded_tests file; add rdt test
rwedge Feb 19, 2025
a562911
switch to debug folder / slack channel
rwedge Feb 19, 2025
0f2e84a
filter transformers by sdtype
rwedge Feb 19, 2025
3538b68
run workflow on commit
rwedge Feb 19, 2025
0bd9ad9
include column name in fit; update report names
rwedge Feb 20, 2025
efe942a
separate rdt and constraint percentages in summary
rwedge Feb 20, 2025
247cdc1
prune push actions; fix summary logic
rwedge Feb 20, 2025
d58dde4
starts with RDT
rwedge Feb 20, 2025
879735d
don't overwrite results df with summary df
rwedge Feb 20, 2025
0606dfe
aggregate fit and sample as well
rwedge Feb 20, 2025
8f99379
summary shows averages only
rwedge Feb 20, 2025
698b720
fix KeyError
rwedge Feb 21, 2025
67d6b46
update summary col order; record None values if rdt input type mismatch
rwedge Feb 21, 2025
da6a7b1
use floats instead of bools
rwedge Feb 21, 2025
4f3b0fa
load_temp_results astype float
rwedge Feb 21, 2025
9513ed1
try boolean dtype
rwedge Feb 21, 2025
4a1eded
handle pd.NA equality bug
rwedge Feb 21, 2025
bedc3a8
revert to skipping
rwedge Feb 21, 2025
1aa10df
transformer kwargs
rwedge Feb 24, 2025
2fe06fb
LogitScaler kwargs
rwedge Feb 24, 2025
6b2a3aa
use int8 min/max
rwedge Feb 24, 2025
a08c9a7
cast previous results to Boolean
rwedge Feb 24, 2025
419cfa6
warn when get_previous_dtype fails to get previous value
rwedge Feb 24, 2025
13a9b0b
use float instead of nullable boolean
rwedge Feb 24, 2025
01a27c9
update compare_previous to work with floats
rwedge Feb 25, 2025
10cc837
add back on push workflow triggers
rwedge Feb 25, 2025
eed8b6a
lint
rwedge Feb 25, 2025
be9f309
formatting
rwedge Feb 25, 2025
d5d902a
exclude ordinal transformers
rwedge Feb 25, 2025
6182f97
add id transformers
rwedge Feb 26, 2025
de2e73e
don't force to bool when coloring a cell
rwedge Feb 26, 2025
e8e5092
remove constraints that are planned to be deprecated
rwedge Feb 27, 2025
8991f2e
filter constraint tests by list of valid combinations
rwedge Feb 27, 2025
73e6634
try to fix xlsxwriter error
rwedge Feb 28, 2025
676a59c
remove other on push triggers
rwedge Feb 28, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/dtypes_benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ on:
push:
branches:
- main
- issue-2365-improve-dtypes-benchmark

jobs:
run_dtypes_benchmark:
Expand Down
1 change: 0 additions & 1 deletion .github/workflows/integration.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
name: Integration Tests

on:
push:
pull_request:
types: [opened, reopened]

Expand Down
1 change: 0 additions & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
name: Style Checks

on:
push:
pull_request:
types: [opened, reopened]

Expand Down
1 change: 0 additions & 1 deletion .github/workflows/minimum.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
name: Unit Tests Minimum Versions

on:
push:
pull_request:
types: [opened, reopened]

Expand Down
1 change: 0 additions & 1 deletion .github/workflows/unit.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
name: Unit Tests

on:
push:
pull_request:
types: [opened, reopened]

Expand Down
4 changes: 2 additions & 2 deletions tests/_external/gdrive_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def _set_color_fields(worksheet, data, marked_data, writer, color_code):
if data.loc[data_row, 'dtype'] == dtype and data.loc[data_row, 'sdtype'] == sdtype:
method_col = data.columns.get_loc(method)
worksheet.write(
data_row + 1, method_col, bool(data.loc[data_row, method]), format_code
data_row + 1, method_col, data.loc[data_row, method], format_code
)


Expand Down Expand Up @@ -172,7 +172,7 @@ def save_to_gdrive(output_folder, results, output_filename=None, mark_results=No
output_filename = _generate_filename()

output = io.BytesIO()
with pd.ExcelWriter(output, engine='xlsxwriter') as writer: # pylint: disable=E0110
with pd.ExcelWriter(output, engine='xlsxwriter', engine_kwargs={'options': {'nan_inf_to_errors': True}}) as writer: # pylint: disable=E0110
for sheet_name, data in results.items():
data.to_excel(writer, sheet_name=sheet_name, index=False)
_set_column_width(writer, data, sheet_name)
Expand Down
145 changes: 20 additions & 125 deletions tests/benchmark/excluded_tests.py
Original file line number Diff line number Diff line change
@@ -1,126 +1,21 @@
"""Excluded tests from constraints due to hard crashing from NumPy or Pandas."""
"""Excluded test combinations."""

EXCLUDED_CONSTRAINT_TESTS = [
('numerical', 'pd.boolean', 'FixedIncrements'),
('numerical', 'pd.object', 'Positive'),
('numerical', 'pd.object', 'Negative'),
('numerical', 'pd.object', 'ScalarInequality'),
('numerical', 'pd.object', 'ScalarRange'),
('numerical', 'pd.string', 'Positive'),
('numerical', 'pd.string', 'Negative'),
('numerical', 'pd.string', 'ScalarInequality'),
('numerical', 'pd.category', 'Positive'),
('numerical', 'pd.category', 'Negative'),
('numerical', 'pd.category', 'ScalarInequality'),
('numerical', 'pd.category', 'ScalarRange'),
('numerical', 'pd.datetime64', 'Positive'),
('numerical', 'pd.datetime64', 'Negative'),
('numerical', 'pd.datetime64', 'ScalarInequality'),
('numerical', 'pd.timedelta64', 'Positive'),
('numerical', 'pd.timedelta64', 'Negative'),
('numerical', 'pd.timedelta64', 'ScalarInequality'),
('numerical', 'pd.Period', 'Positive'),
('numerical', 'pd.Period', 'Negative'),
('numerical', 'pd.Period', 'ScalarInequality'),
('numerical', 'pd.Period', 'FixedIncrements'),
('numerical', 'np.object', 'Positive'),
('numerical', 'np.object', 'Negative'),
('numerical', 'np.object', 'ScalarInequality'),
('numerical', 'np.string', 'Positive'),
('numerical', 'np.string', 'Negative'),
('numerical', 'np.string', 'ScalarInequality'),
('numerical', 'np.bytes', 'Positive'),
('numerical', 'np.bytes', 'Negative'),
('numerical', 'np.bytes', 'ScalarInequality'),
('numerical', 'np.unicode', 'Positive'),
('numerical', 'np.unicode', 'Negative'),
('numerical', 'np.unicode', 'ScalarInequality'),
('numerical', 'np.datetime64', 'Positive'),
('numerical', 'np.datetime64', 'Negative'),
('numerical', 'np.datetime64', 'ScalarInequality'),
('numerical', 'np.timedelta64', 'Positive'),
('numerical', 'np.timedelta64', 'Negative'),
('numerical', 'np.timedelta64', 'ScalarInequality'),
('numerical', 'pa.string', 'Positive'),
('numerical', 'pa.string', 'Negative'),
('numerical', 'pa.string', 'ScalarInequality'),
('numerical', 'pa.utf8', 'Positive'),
('numerical', 'pa.utf8', 'Negative'),
('numerical', 'pa.utf8', 'ScalarInequality'),
('numerical', 'pa.binary', 'Positive'),
('numerical', 'pa.binary', 'Negative'),
('numerical', 'pa.binary', 'ScalarInequality'),
('numerical', 'pa.binary', 'FixedIncrements'),
('numerical', 'pa.large_binary', 'Positive'),
('numerical', 'pa.large_binary', 'Negative'),
('numerical', 'pa.large_binary', 'ScalarInequality'),
('numerical', 'pa.large_binary', 'FixedIncrements'),
('numerical', 'pa.large_string', 'Positive'),
('numerical', 'pa.large_string', 'Negative'),
('numerical', 'pa.large_string', 'ScalarInequality'),
('numerical', 'pa.date32', 'Positive'),
('numerical', 'pa.date32', 'Negative'),
('numerical', 'pa.date32', 'ScalarInequality'),
('numerical', 'pa.date64', 'Positive'),
('numerical', 'pa.date64', 'Negative'),
('numerical', 'pa.date64', 'ScalarInequality'),
('numerical', 'pa.timestamp', 'Positive'),
('numerical', 'pa.timestamp', 'Negative'),
('numerical', 'pa.timestamp', 'ScalarInequality'),
('numerical', 'pa.duration', 'Positive'),
('numerical', 'pa.duration', 'Negative'),
('numerical', 'pa.duration', 'ScalarInequality'),
('numerical', 'pa.time32', 'Positive'),
('numerical', 'pa.time32', 'Negative'),
('numerical', 'pa.time32', 'ScalarInequality'),
('numerical', 'pa.time64', 'Positive'),
('numerical', 'pa.time64', 'Negative'),
('numerical', 'pa.time64', 'ScalarInequality'),
('numerical', 'pa.binary_view', 'Positive'),
('numerical', 'pa.binary_view', 'Negative'),
('numerical', 'pa.binary_view', 'ScalarInequality'),
('numerical', 'pa.binary_view', 'FixedIncrements'),
('numerical', 'pa.string_view', 'Positive'),
('numerical', 'pa.string_view', 'Negative'),
('numerical', 'pa.string_view', 'ScalarInequality'),
('datetime', 'pd.object', 'ScalarRange'),
('datetime', 'pd.category', 'ScalarRange'),
('numerical', 'pd.category', 'Inequality'),
('numerical', 'pd.category', 'Range'),
('numerical', 'pd.datetime64', 'Inequality'),
('numerical', 'pd.datetime64', 'Range'),
('numerical', 'pd.Period', 'Inequality'),
('numerical', 'pd.Period', 'Range'),
('numerical', 'np.datetime64', 'Inequality'),
('numerical', 'np.datetime64', 'Range'),
('numerical', 'pa.bool', 'Inequality'),
('numerical', 'pa.bool', 'Range'),
('numerical', 'pa.large_binary', 'Inequality'),
('numerical', 'pa.large_binary', 'Range'),
('numerical', 'pa.date32', 'Inequality'),
('numerical', 'pa.date32', 'Range'),
('numerical', 'pa.date64', 'Inequality'),
('numerical', 'pa.date64', 'Range'),
('numerical', 'pa.timestamp', 'Inequality'),
('numerical', 'pa.timestamp', 'Range'),
('numerical', 'pa.time32', 'Inequality'),
('numerical', 'pa.time32', 'Range'),
('numerical', 'pa.time64', 'Inequality'),
('numerical', 'pa.time64', 'Range'),
('numerical', 'pa.string', 'FixedIncrements'),
('numerical', 'pa.utf8', 'FixedIncrements'),
('numerical', 'pa.large_string', 'FixedIncrements'),
('numerical', 'pa.string_view', 'FixedIncrements'),
('numerical', 'pa.string', 'Inequality'),
('numerical', 'pa.string', 'Range'),
('numerical', 'pa.utf8', 'Inequality'),
('numerical', 'pa.utf8', 'Range'),
('numerical', 'pa.binary', 'Inequality'),
('numerical', 'pa.binary', 'Range'),
('numerical', 'pa.large_string', 'Inequality'),
('numerical', 'pa.large_string', 'Range'),
('numerical', 'pa.binary_view', 'Inequality'),
('numerical', 'pa.binary_view', 'Range'),
('numerical', 'pa.string_view', 'Inequality'),
('numerical', 'pa.string_view', 'Range'),
]
# Pairs excluded from the benchmark. Each entry is a two-string tuple;
# presumably (dtype label, sdtype) -- TODO confirm against the benchmark code
# that consumes this set.
EXCLUDED_DATA_TYPES = {
    # Entries whose second element is 'numerical'.
    ('pd.boolean', 'numerical'),
    ('pd.string', 'numerical'),
    ('pd.category', 'numerical'),
    ('pd.Period', 'numerical'),
    ('np.bool', 'numerical'),
    ('np.object', 'numerical'),
    ('np.string', 'numerical'),
    ('np.unicode', 'numerical'),
    # Entries whose second element is 'datetime'.
    ('pd.boolean', 'datetime'),
    ('pd.timedelta64', 'datetime'),
    ('pd.Period', 'datetime'),
    ('pd.Complex', 'datetime'),
    ('np.complex64', 'datetime'),
    ('np.complex128', 'datetime'),
    ('np.bool', 'datetime'),
    ('np.unicode', 'datetime'),
    ('np.timedelta64', 'datetime'),
}
9 changes: 9 additions & 0 deletions tests/benchmark/included_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""Included test combinations."""

# Allow-list of constraint test combinations. Each entry is a two-string
# tuple; presumably (sdtype, constraint name) -- TODO confirm against the
# benchmark code that filters on this list.
INCLUDED_CONSTRAINT_TESTS = [
    ('numerical', 'FixedIncrements'),
    ('categorical', 'FixedCombinations'),
    ('boolean', 'FixedCombinations'),
    ('numerical', 'Inequality'),
    ('datetime', 'Inequality'),
    ('numerical', 'Range'),
    ('datetime', 'Range'),
]
Loading
Loading