Skip to content

Commit

Permalink
Add support for numpy 2.0.0 (#2269)
Browse files Browse the repository at this point in the history
  • Loading branch information
R-Palazzo authored Oct 30, 2024
1 parent 8651241 commit 5d76780
Show file tree
Hide file tree
Showing 11 changed files with 142 additions and 52 deletions.
15 changes: 9 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,16 @@ dependencies = [
'botocore>=1.31,<2.0.0',
'cloudpickle>=2.1.0',
'graphviz>=0.13.2',
"numpy>=1.21.0,<2.0.0;python_version<'3.10'",
"numpy>=1.23.3,<2.0.0;python_version>='3.10' and python_version<'3.12'",
"numpy>=1.26.0,<2.0.0;python_version>='3.12'",
"numpy>=1.21.0;python_version<'3.10'",
"numpy>=1.23.3;python_version>='3.10' and python_version<'3.12'",
"numpy>=1.26.0;python_version>='3.12'",
"pandas>=1.4.0;python_version<'3.11'",
"pandas>=1.5.0;python_version>='3.11' and python_version<'3.12'",
"pandas>=2.1.1;python_version>='3.12'",
'tqdm>=4.29',
'copulas>=0.11.0',
'ctgan>=0.10.0',
'deepecho>=0.6.0',
'ctgan>=0.10.2',
'deepecho>=0.6.1',
'rdt>=1.12.3',
'sdmetrics>=0.16.0',
'platformdirs>=4.0',
Expand Down Expand Up @@ -207,7 +207,10 @@ select = [
# print statements
"T201",
# pandas-vet
"PD"
"PD",
# numpy 2.0
"NPY201"

]
ignore = [
# pydocstyle
Expand Down
1 change: 1 addition & 0 deletions sdv/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ def _is_datetime_type(value):
bool(_get_datetime_format([value]))
or isinstance(value, pd.Timestamp)
or isinstance(value, datetime)
or isinstance(value, pd.Period)
or (isinstance(value, str) and pd.notna(pd.to_datetime(value, errors='coerce')))
):
return False
Expand Down
8 changes: 8 additions & 0 deletions sdv/constraints/tabular.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,14 @@ def _transform(self, table_data):
pandas.DataFrame:
Transformed data.
"""
# To make the NaN to None mapping work for pd.Categorical data, we need to convert
# the columns to object before replacing NaNs with None.
table_data[self._columns] = table_data[self._columns].astype({
col: object
for col in self._columns
if pd.api.types.is_categorical_dtype(table_data[col])
})

table_data[self._columns] = table_data[self._columns].replace({np.nan: None})
combinations = table_data[self._columns].itertuples(index=False, name=None)
uuids = map(self._combinations_to_uuids.get, combinations)
Expand Down
23 changes: 17 additions & 6 deletions sdv/data_processing/data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,13 @@
)
from sdv.data_processing.datetime_formatter import DatetimeFormatter
from sdv.data_processing.errors import InvalidConstraintsError, NotFittedError
from sdv.data_processing.numerical_formatter import NumericalFormatter
from sdv.data_processing.numerical_formatter import INTEGER_BOUNDS, NumericalFormatter
from sdv.data_processing.utils import load_module_from_path
from sdv.errors import SynthesizerInputError, log_exc_stacktrace
from sdv.metadata.single_table import SingleTableMetadata

LOGGER = logging.getLogger(__name__)
INTEGER_BOUNDS = {str(key).lower(): value for key, value in INTEGER_BOUNDS.items()}


class DataProcessor:
Expand Down Expand Up @@ -561,26 +562,36 @@ def _create_config(self, data, columns_created_by_constraints):
)

if sdtype == 'id':
is_numeric = pd.api.types.is_numeric_dtype(data[column].dtype)
function_name = 'bothify'
column_dtype = data[column].dtype
is_numeric = pd.api.types.is_numeric_dtype(column_dtype)
if column_metadata.get('regex_format', False):
transformers[column] = self.create_regex_generator(
column, sdtype, column_metadata, is_numeric
)
sdtypes[column] = 'text'

else:
bothify_format = 'sdv-id-??????'
if is_numeric:
bothify_format = '#########'
function_name = 'random_int'
column_dtype = str(column_dtype).lower()
function_kwargs = {'min': 0, 'max': 9999999}
for key in INTEGER_BOUNDS:
if key in column_dtype:
_, max_value = INTEGER_BOUNDS[key]
function_kwargs = {'min': 0, 'max': max_value}

else:
function_kwargs = {'text': 'sdv-id-??????'}

cardinality_rule = None
if column in self._keys:
cardinality_rule = 'unique'

transformers[column] = AnonymizedFaker(
provider_name=None,
function_name='bothify',
function_kwargs={'text': bothify_format},
function_name=function_name,
function_kwargs=function_kwargs,
cardinality_rule=cardinality_rule,
)

Expand Down
3 changes: 3 additions & 0 deletions tests/benchmark/excluded_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@
('numerical', 'np.string', 'Positive'),
('numerical', 'np.string', 'Negative'),
('numerical', 'np.string', 'ScalarInequality'),
('numerical', 'np.bytes', 'Positive'),
('numerical', 'np.bytes', 'Negative'),
('numerical', 'np.bytes', 'ScalarInequality'),
('numerical', 'np.unicode', 'Positive'),
('numerical', 'np.unicode', 'Negative'),
('numerical', 'np.unicode', 'ScalarInequality'),
Expand Down
11 changes: 7 additions & 4 deletions tests/benchmark/numpy_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,17 @@
}),
'np.string': pd.DataFrame({
'np.string': pd.Series([
np.string_('string1'),
np.string_('string2'),
np.string_('string3'),
np.str_('string1'),
np.str_('string2'),
np.str_('string3'),
])
}),
'np.bytes': pd.DataFrame({
'np.bytes': pd.Series([np.bytes_('bytes1'), np.bytes_('bytes2'), np.bytes_('bytes3')])
}),
'np.unicode': pd.DataFrame({
'np.unicode': pd.Series(
[np.unicode_('unicode1'), np.unicode_('unicode2'), np.unicode_('unicode3')],
[np.str_('unicode1'), np.str_('unicode2'), np.str_('unicode3')],
dtype='string',
)
}),
Expand Down
2 changes: 1 addition & 1 deletion tests/benchmark/supported_dtypes_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@
'np.object': 'categorical',
'np.bool': 'categorical',
'np.string': 'categorical',
'np.bytes': 'categorical',
'np.unicode': 'categorical',
# PyArrow
'pa.int8': 'numerical',
Expand Down Expand Up @@ -378,7 +379,6 @@ def _create_single_column_constraint_and_data(constraint, data, dtype, sdtype):


def _create_multi_column_constraint_data_and_metadata(constraint, data, dtype, sdtype, metadata):
_dtype = data.dtypes[dtype]
constraint_class = constraint.get('constraint_class')
constraints = []
if constraint_class == 'FixedCombinations':
Expand Down
50 changes: 25 additions & 25 deletions tests/integration/single_table/test_copulas.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,31 +347,31 @@ def test_numerical_columns_gets_pii():

# Assert
expected_sampled = pd.DataFrame({
'id': {
0: 807994768,
1: 746439230,
2: 201363792,
3: 364823003,
4: 726973888,
5: 693331380,
6: 795819284,
7: 607278621,
8: 783746695,
9: 162118876,
},
'city': {
0: 'Danielfort',
1: 'Glendaside',
2: 'Port Jenniferchester',
3: 'Port Susan',
4: 'West Michellemouth',
5: 'West Jason',
6: 'Ryanfort',
7: 'West Stephenland',
8: 'Davidland',
9: 'Port Christopher',
},
'numerical': {0: 22, 1: 24, 2: 22, 3: 23, 4: 22, 5: 24, 6: 23, 7: 24, 8: 24, 9: 24},
'id': [
1089619006166876142,
8373046707753416652,
9070705361670139280,
7227045982112645011,
3461931576753619633,
1005734164466301683,
3312031189447929384,
82456842876428117,
1819741328868365520,
8019169766233150107,
],
'city': [
'Danielfort',
'Glendaside',
'Port Jenniferchester',
'Port Susan',
'West Michellemouth',
'West Jason',
'Ryanfort',
'West Stephenland',
'Davidland',
'Port Christopher',
],
'numerical': [22, 24, 22, 23, 22, 24, 23, 24, 24, 24],
})
pd.testing.assert_frame_equal(expected_sampled, sampled)

Expand Down
22 changes: 22 additions & 0 deletions tests/unit/constraints/test_tabular.py
Original file line number Diff line number Diff line change
Expand Up @@ -889,6 +889,28 @@ def test_transform_non_string(self):
expected_out_a = pd.Series(['a', 'b', 'c'], name='a')
pd.testing.assert_series_equal(expected_out_a, out['a'])

def test_transform_categorical_dtype(self):
    """Test that ``transform`` handles ``pd.Categorical`` columns with missing values.

    Categorical columns containing ``None``/``NaN`` should still produce a fully
    populated combination column (no NaNs in ``b#c``), and unrelated columns
    must pass through untouched.
    """
    # Setup
    data = pd.DataFrame({
        'a': ['a', 'b', 'c'],
        'b': pd.Categorical(['d', None, 'f']),
        'c': pd.Categorical(['g', 'h', np.nan]),
    })
    categorical_columns = ['b', 'c']
    instance = FixedCombinations(column_names=categorical_columns)
    instance.fit(data)

    # Run
    transformed = instance.transform(data)

    # Assert
    assert not transformed['b#c'].isna().any()
    assert instance._combinations_to_uuids is not None
    assert instance._uuids_to_combinations is not None
    expected_column_a = pd.Series(['a', 'b', 'c'], name='a')
    pd.testing.assert_series_equal(expected_column_a, transformed['a'])

def test_transform_not_all_columns_provided(self):
"""Test the ``FixedCombinations.transform`` method.
Expand Down
39 changes: 29 additions & 10 deletions tests/unit/data_processing/test_data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1137,7 +1137,9 @@ def test__create_config(self):
'first_name': ['John', 'Doe', 'Johanna'],
'id': ['ID_001', 'ID_002', 'ID_003'],
'id_no_regex': ['ID_001', 'ID_002', 'ID_003'],
'id_numeric': [0, 1, 2],
'id_numeric_int8': pd.Series([1, 2, 3], dtype='Int8'),
'id_numeric_int16': pd.Series([1, 2, 3], dtype='Int16'),
'id_numeric_int32': pd.Series([1, 2, 3], dtype='Int32'),
'id_column': ['ID_999', 'ID_999', 'ID_007'],
'date': ['2021-02-01', '2022-03-05', '2023-01-31'],
'unknown': ['a', 'b', 'c'],
Expand All @@ -1151,9 +1153,9 @@ def test__create_config(self):
dp.create_anonymized_transformer.return_value = 'AnonymizedFaker'
dp.create_regex_generator.return_value = 'RegexGenerator'
dp.metadata.primary_key = 'id'
dp.metadata.alternate_keys = ['id_no_regex', 'id_numeric']
dp.metadata.alternate_keys = ['id_no_regex', 'id_numeric_int8']
dp._primary_key = 'id'
dp._keys = ['id', 'id_no_regex', 'id_numeric']
dp._keys = ['id', 'id_no_regex', 'id_numeric_int8']
dp.metadata.columns = {
'int': {'sdtype': 'numerical'},
'float': {'sdtype': 'numerical'},
Expand All @@ -1163,7 +1165,9 @@ def test__create_config(self):
'first_name': {'sdtype': 'first_name'},
'id': {'sdtype': 'id', 'regex_format': 'ID_\\d{3}[0-9]'},
'id_no_regex': {'sdtype': 'id'},
'id_numeric': {'sdtype': 'id'},
'id_numeric_int8': {'sdtype': 'id'},
'id_numeric_int16': {'sdtype': 'id'},
'id_numeric_int32': {'sdtype': 'id'},
'id_column': {'sdtype': 'id'},
'date': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
'unknown': {'sdtype': 'unknown'},
Expand All @@ -1188,7 +1192,9 @@ def test__create_config(self):
'first_name': 'pii',
'id': 'text',
'id_no_regex': 'text',
'id_numeric': 'text',
'id_numeric_int8': 'text',
'id_numeric_int16': 'text',
'id_numeric_int32': 'text',
'id_column': 'text',
'date': 'datetime',
'unknown': 'pii',
Expand Down Expand Up @@ -1236,11 +1242,24 @@ def test__create_config(self):
assert id_no_regex_transformer.function_kwargs == {'text': 'sdv-id-??????'}
assert id_no_regex_transformer.cardinality_rule == 'unique'

id_numeric_transformer = config['transformers']['id_numeric']
assert isinstance(id_numeric_transformer, AnonymizedFaker)
assert id_numeric_transformer.function_name == 'bothify'
assert id_numeric_transformer.function_kwargs == {'text': '#########'}
assert id_numeric_transformer.cardinality_rule == 'unique'
id_numeric_int_8_transformer = config['transformers']['id_numeric_int8']
assert isinstance(id_numeric_int_8_transformer, AnonymizedFaker)
assert id_numeric_int_8_transformer.function_name == 'random_int'
assert id_numeric_int_8_transformer.function_kwargs == {'min': 0, 'max': 127}
assert id_numeric_int_8_transformer.cardinality_rule == 'unique'

id_numeric_int_16_transformer = config['transformers']['id_numeric_int16']
assert isinstance(id_numeric_int_16_transformer, AnonymizedFaker)
assert id_numeric_int_16_transformer.function_name == 'random_int'
assert id_numeric_int_16_transformer.function_kwargs == {'min': 0, 'max': 32767}

id_numeric_int_32_transformer = config['transformers']['id_numeric_int32']
assert isinstance(id_numeric_int_32_transformer, AnonymizedFaker)
assert id_numeric_int_32_transformer.function_name == 'random_int'
assert id_numeric_int_32_transformer.function_kwargs == {
'min': 0,
'max': 2147483647,
}

id_column_transformer = config['transformers']['id_column']
assert isinstance(id_column_transformer, AnonymizedFaker)
Expand Down
20 changes: 20 additions & 0 deletions tests/unit/test__utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,26 @@ def test__is_datetime_type_with_datetime_series():
assert is_datetime


def test__is_datetime_type_with_period():
    """Test ``_is_datetime_type`` with a ``pd.Period`` series.

    A series built from ``pd.period_range`` should be recognized as a
    datetime type, so the function must return ``True``.
    """
    # Setup
    period_series = pd.Series(pd.period_range('2023-01', periods=3, freq='M'))

    # Run
    result = _is_datetime_type(period_series)

    # Assert
    assert result


def test__is_datetime_type_with_mixed_array():
"""Test the ``_is_datetime_type`` function with a list of mixed datetime types."""
# Setup
Expand Down

0 comments on commit 5d76780

Please sign in to comment.