Skip to content

Commit

Permalink
Add support for numpy 2.0.0 (#2269)
Browse files Browse the repository at this point in the history
  • Loading branch information
R-Palazzo authored Oct 30, 2024
1 parent 8651241 commit 5d76780
Show file tree
Hide file tree
Showing 11 changed files with 142 additions and 52 deletions.
15 changes: 9 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,16 @@ dependencies = [
'botocore>=1.31,<2.0.0',
'cloudpickle>=2.1.0',
'graphviz>=0.13.2',
"numpy>=1.21.0,<2.0.0;python_version<'3.10'",
"numpy>=1.23.3,<2.0.0;python_version>='3.10' and python_version<'3.12'",
"numpy>=1.26.0,<2.0.0;python_version>='3.12'",
"numpy>=1.21.0;python_version<'3.10'",
"numpy>=1.23.3;python_version>='3.10' and python_version<'3.12'",
"numpy>=1.26.0;python_version>='3.12'",
"pandas>=1.4.0;python_version<'3.11'",
"pandas>=1.5.0;python_version>='3.11' and python_version<'3.12'",
"pandas>=2.1.1;python_version>='3.12'",
'tqdm>=4.29',
'copulas>=0.11.0',
'ctgan>=0.10.0',
'deepecho>=0.6.0',
'ctgan>=0.10.2',
'deepecho>=0.6.1',
'rdt>=1.12.3',
'sdmetrics>=0.16.0',
'platformdirs>=4.0',
Expand Down Expand Up @@ -207,7 +207,10 @@ select = [
# print statements
"T201",
# pandas-vet
"PD"
"PD",
# numpy 2.0
"NPY201"

]
ignore = [
# pydocstyle
Expand Down
1 change: 1 addition & 0 deletions sdv/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ def _is_datetime_type(value):
bool(_get_datetime_format([value]))
or isinstance(value, pd.Timestamp)
or isinstance(value, datetime)
or isinstance(value, pd.Period)
or (isinstance(value, str) and pd.notna(pd.to_datetime(value, errors='coerce')))
):
return False
Expand Down
8 changes: 8 additions & 0 deletions sdv/constraints/tabular.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,14 @@ def _transform(self, table_data):
pandas.DataFrame:
Transformed data.
"""
# To make the NaN to None mapping work for pd.Categorical data, we need to convert
# the columns to object before replacing NaNs with None.
table_data[self._columns] = table_data[self._columns].astype({
col: object
for col in self._columns
if pd.api.types.is_categorical_dtype(table_data[col])
})

table_data[self._columns] = table_data[self._columns].replace({np.nan: None})
combinations = table_data[self._columns].itertuples(index=False, name=None)
uuids = map(self._combinations_to_uuids.get, combinations)
Expand Down
23 changes: 17 additions & 6 deletions sdv/data_processing/data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,13 @@
)
from sdv.data_processing.datetime_formatter import DatetimeFormatter
from sdv.data_processing.errors import InvalidConstraintsError, NotFittedError
from sdv.data_processing.numerical_formatter import NumericalFormatter
from sdv.data_processing.numerical_formatter import INTEGER_BOUNDS, NumericalFormatter
from sdv.data_processing.utils import load_module_from_path
from sdv.errors import SynthesizerInputError, log_exc_stacktrace
from sdv.metadata.single_table import SingleTableMetadata

LOGGER = logging.getLogger(__name__)
INTEGER_BOUNDS = {str(key).lower(): value for key, value in INTEGER_BOUNDS.items()}


class DataProcessor:
Expand Down Expand Up @@ -561,26 +562,36 @@ def _create_config(self, data, columns_created_by_constraints):
)

if sdtype == 'id':
is_numeric = pd.api.types.is_numeric_dtype(data[column].dtype)
function_name = 'bothify'
column_dtype = data[column].dtype
is_numeric = pd.api.types.is_numeric_dtype(column_dtype)
if column_metadata.get('regex_format', False):
transformers[column] = self.create_regex_generator(
column, sdtype, column_metadata, is_numeric
)
sdtypes[column] = 'text'

else:
bothify_format = 'sdv-id-??????'
if is_numeric:
bothify_format = '#########'
function_name = 'random_int'
column_dtype = str(column_dtype).lower()
function_kwargs = {'min': 0, 'max': 9999999}
for key in INTEGER_BOUNDS:
if key in column_dtype:
_, max_value = INTEGER_BOUNDS[key]
function_kwargs = {'min': 0, 'max': max_value}

else:
function_kwargs = {'text': 'sdv-id-??????'}

cardinality_rule = None
if column in self._keys:
cardinality_rule = 'unique'

transformers[column] = AnonymizedFaker(
provider_name=None,
function_name='bothify',
function_kwargs={'text': bothify_format},
function_name=function_name,
function_kwargs=function_kwargs,
cardinality_rule=cardinality_rule,
)

Expand Down
3 changes: 3 additions & 0 deletions tests/benchmark/excluded_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@
('numerical', 'np.string', 'Positive'),
('numerical', 'np.string', 'Negative'),
('numerical', 'np.string', 'ScalarInequality'),
('numerical', 'np.bytes', 'Positive'),
('numerical', 'np.bytes', 'Negative'),
('numerical', 'np.bytes', 'ScalarInequality'),
('numerical', 'np.unicode', 'Positive'),
('numerical', 'np.unicode', 'Negative'),
('numerical', 'np.unicode', 'ScalarInequality'),
Expand Down
11 changes: 7 additions & 4 deletions tests/benchmark/numpy_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,17 @@
}),
'np.string': pd.DataFrame({
'np.string': pd.Series([
np.string_('string1'),
np.string_('string2'),
np.string_('string3'),
np.str_('string1'),
np.str_('string2'),
np.str_('string3'),
])
}),
'np.bytes': pd.DataFrame({
'np.bytes': pd.Series([np.bytes_('bytes1'), np.bytes_('bytes2'), np.bytes_('bytes3')])
}),
'np.unicode': pd.DataFrame({
'np.unicode': pd.Series(
[np.unicode_('unicode1'), np.unicode_('unicode2'), np.unicode_('unicode3')],
[np.str_('unicode1'), np.str_('unicode2'), np.str_('unicode3')],
dtype='string',
)
}),
Expand Down
2 changes: 1 addition & 1 deletion tests/benchmark/supported_dtypes_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@
'np.object': 'categorical',
'np.bool': 'categorical',
'np.string': 'categorical',
'np.bytes': 'categorical',
'np.unicode': 'categorical',
# PyArrow
'pa.int8': 'numerical',
Expand Down Expand Up @@ -378,7 +379,6 @@ def _create_single_column_constraint_and_data(constraint, data, dtype, sdtype):


def _create_multi_column_constraint_data_and_metadata(constraint, data, dtype, sdtype, metadata):
_dtype = data.dtypes[dtype]
constraint_class = constraint.get('constraint_class')
constraints = []
if constraint_class == 'FixedCombinations':
Expand Down
50 changes: 25 additions & 25 deletions tests/integration/single_table/test_copulas.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,31 +347,31 @@ def test_numerical_columns_gets_pii():

# Assert
expected_sampled = pd.DataFrame({
'id': {
0: 807994768,
1: 746439230,
2: 201363792,
3: 364823003,
4: 726973888,
5: 693331380,
6: 795819284,
7: 607278621,
8: 783746695,
9: 162118876,
},
'city': {
0: 'Danielfort',
1: 'Glendaside',
2: 'Port Jenniferchester',
3: 'Port Susan',
4: 'West Michellemouth',
5: 'West Jason',
6: 'Ryanfort',
7: 'West Stephenland',
8: 'Davidland',
9: 'Port Christopher',
},
'numerical': {0: 22, 1: 24, 2: 22, 3: 23, 4: 22, 5: 24, 6: 23, 7: 24, 8: 24, 9: 24},
'id': [
1089619006166876142,
8373046707753416652,
9070705361670139280,
7227045982112645011,
3461931576753619633,
1005734164466301683,
3312031189447929384,
82456842876428117,
1819741328868365520,
8019169766233150107,
],
'city': [
'Danielfort',
'Glendaside',
'Port Jenniferchester',
'Port Susan',
'West Michellemouth',
'West Jason',
'Ryanfort',
'West Stephenland',
'Davidland',
'Port Christopher',
],
'numerical': [22, 24, 22, 23, 22, 24, 23, 24, 24, 24],
})
pd.testing.assert_frame_equal(expected_sampled, sampled)

Expand Down
22 changes: 22 additions & 0 deletions tests/unit/constraints/test_tabular.py
Original file line number Diff line number Diff line change
Expand Up @@ -889,6 +889,28 @@ def test_transform_non_string(self):
expected_out_a = pd.Series(['a', 'b', 'c'], name='a')
pd.testing.assert_series_equal(expected_out_a, out['a'])

def test_transform_categorical_dtype(self):
    """Test that ``transform`` handles ``pd.Categorical`` columns with missing values.

    Categorical columns containing ``None``/``NaN`` should still produce a fully
    populated combination column (no NaNs in ``b#c``), and unrelated columns
    must pass through untouched.
    """
    # Setup
    data = pd.DataFrame({
        'a': ['a', 'b', 'c'],
        'b': pd.Categorical(['d', None, 'f']),
        'c': pd.Categorical(['g', 'h', np.nan]),
    })
    categorical_columns = ['b', 'c']
    instance = FixedCombinations(column_names=categorical_columns)
    instance.fit(data)

    # Run
    transformed = instance.transform(data)

    # Assert
    assert not transformed['b#c'].isna().any()
    assert instance._combinations_to_uuids is not None
    assert instance._uuids_to_combinations is not None
    expected_column_a = pd.Series(['a', 'b', 'c'], name='a')
    pd.testing.assert_series_equal(expected_column_a, transformed['a'])

def test_transform_not_all_columns_provided(self):
"""Test the ``FixedCombinations.transform`` method.
Expand Down
39 changes: 29 additions & 10 deletions tests/unit/data_processing/test_data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1137,7 +1137,9 @@ def test__create_config(self):
'first_name': ['John', 'Doe', 'Johanna'],
'id': ['ID_001', 'ID_002', 'ID_003'],
'id_no_regex': ['ID_001', 'ID_002', 'ID_003'],
'id_numeric': [0, 1, 2],
'id_numeric_int8': pd.Series([1, 2, 3], dtype='Int8'),
'id_numeric_int16': pd.Series([1, 2, 3], dtype='Int16'),
'id_numeric_int32': pd.Series([1, 2, 3], dtype='Int32'),
'id_column': ['ID_999', 'ID_999', 'ID_007'],
'date': ['2021-02-01', '2022-03-05', '2023-01-31'],
'unknown': ['a', 'b', 'c'],
Expand All @@ -1151,9 +1153,9 @@ def test__create_config(self):
dp.create_anonymized_transformer.return_value = 'AnonymizedFaker'
dp.create_regex_generator.return_value = 'RegexGenerator'
dp.metadata.primary_key = 'id'
dp.metadata.alternate_keys = ['id_no_regex', 'id_numeric']
dp.metadata.alternate_keys = ['id_no_regex', 'id_numeric_int8']
dp._primary_key = 'id'
dp._keys = ['id', 'id_no_regex', 'id_numeric']
dp._keys = ['id', 'id_no_regex', 'id_numeric_int8']
dp.metadata.columns = {
'int': {'sdtype': 'numerical'},
'float': {'sdtype': 'numerical'},
Expand All @@ -1163,7 +1165,9 @@ def test__create_config(self):
'first_name': {'sdtype': 'first_name'},
'id': {'sdtype': 'id', 'regex_format': 'ID_\\d{3}[0-9]'},
'id_no_regex': {'sdtype': 'id'},
'id_numeric': {'sdtype': 'id'},
'id_numeric_int8': {'sdtype': 'id'},
'id_numeric_int16': {'sdtype': 'id'},
'id_numeric_int32': {'sdtype': 'id'},
'id_column': {'sdtype': 'id'},
'date': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
'unknown': {'sdtype': 'unknown'},
Expand All @@ -1188,7 +1192,9 @@ def test__create_config(self):
'first_name': 'pii',
'id': 'text',
'id_no_regex': 'text',
'id_numeric': 'text',
'id_numeric_int8': 'text',
'id_numeric_int16': 'text',
'id_numeric_int32': 'text',
'id_column': 'text',
'date': 'datetime',
'unknown': 'pii',
Expand Down Expand Up @@ -1236,11 +1242,24 @@ def test__create_config(self):
assert id_no_regex_transformer.function_kwargs == {'text': 'sdv-id-??????'}
assert id_no_regex_transformer.cardinality_rule == 'unique'

id_numeric_transformer = config['transformers']['id_numeric']
assert isinstance(id_numeric_transformer, AnonymizedFaker)
assert id_numeric_transformer.function_name == 'bothify'
assert id_numeric_transformer.function_kwargs == {'text': '#########'}
assert id_numeric_transformer.cardinality_rule == 'unique'
id_numeric_int_8_transformer = config['transformers']['id_numeric_int8']
assert isinstance(id_numeric_int_8_transformer, AnonymizedFaker)
assert id_numeric_int_8_transformer.function_name == 'random_int'
assert id_numeric_int_8_transformer.function_kwargs == {'min': 0, 'max': 127}
assert id_numeric_int_8_transformer.cardinality_rule == 'unique'

id_numeric_int_16_transformer = config['transformers']['id_numeric_int16']
assert isinstance(id_numeric_int_16_transformer, AnonymizedFaker)
assert id_numeric_int_16_transformer.function_name == 'random_int'
assert id_numeric_int_16_transformer.function_kwargs == {'min': 0, 'max': 32767}

id_numeric_int_32_transformer = config['transformers']['id_numeric_int32']
assert isinstance(id_numeric_int_32_transformer, AnonymizedFaker)
assert id_numeric_int_32_transformer.function_name == 'random_int'
assert id_numeric_int_32_transformer.function_kwargs == {
'min': 0,
'max': 2147483647,
}

id_column_transformer = config['transformers']['id_column']
assert isinstance(id_column_transformer, AnonymizedFaker)
Expand Down
20 changes: 20 additions & 0 deletions tests/unit/test__utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,26 @@ def test__is_datetime_type_with_datetime_series():
assert is_datetime


def test__is_datetime_type_with_period():
    """Test ``_is_datetime_type`` with a ``pd.Period`` series.

    A series built from ``pd.period_range`` should be recognized as a
    datetime type, so the function must return ``True``.
    """
    # Setup
    period_series = pd.Series(pd.period_range('2023-01', periods=3, freq='M'))

    # Run
    result = _is_datetime_type(period_series)

    # Assert
    assert result


def test__is_datetime_type_with_mixed_array():
"""Test the ``_is_datetime_type`` function with a list of mixed datetime types."""
# Setup
Expand Down

0 comments on commit 5d76780

Please sign in to comment.