Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Align test/id sdtypes to match SDV #881

Merged
merged 9 commits into from
Sep 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DEVELOPMENT.rst
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,7 @@ method. The times are specified in seconds and the memory in bytes.

@staticmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
return {
'fit': {
'time': 1,
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ exclude = [
'.tox',
'.git',
'__pycache__',
'*.ipynb',
'.ipynb_checkpoints',
'tasks.py',
'tests/contributing.py'
Expand Down Expand Up @@ -235,4 +236,4 @@ convention = "google"

[tool.ruff.lint.pycodestyle]
max-doc-length = 100
max-line-length = 100
max-line-length = 100
4 changes: 3 additions & 1 deletion rdt/performance/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
boolean,
categorical,
datetime,
id,
numerical,
pii,
text,
Expand All @@ -16,9 +17,10 @@
'boolean',
'categorical',
'datetime',
'id',
'numerical',
'text',
'pii',
'text',
'BaseDatasetGenerator',
]

Expand Down
2 changes: 1 addition & 1 deletion rdt/performance/datasets/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,5 +42,5 @@ def get_subclasses(cls):
@staticmethod
@abstractmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
raise NotImplementedError()
12 changes: 6 additions & 6 deletions rdt/performance/datasets/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def generate(num_rows):

@staticmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
return {
'fit': {'time': 2e-5, 'memory': 400.0},
'transform': {'time': 1e-5, 'memory': 400.0},
Expand Down Expand Up @@ -63,7 +63,7 @@ def generate(num_rows):

@staticmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
return {
'fit': {'time': 2e-5, 'memory': 400.0},
'transform': {'time': 1e-5, 'memory': 1000.0},
Expand All @@ -90,7 +90,7 @@ def generate(num_rows):

@staticmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
return {
'fit': {'time': 1e-5, 'memory': 400.0},
'transform': {'time': 1e-5, 'memory': 400.0},
Expand Down Expand Up @@ -119,7 +119,7 @@ def generate(num_rows):

@staticmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
return {
'fit': {'time': 1e-5, 'memory': 400.0},
'transform': {'time': 1e-5, 'memory': 1000.0},
Expand All @@ -141,7 +141,7 @@ def generate(num_rows):

@staticmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
return {
'fit': {'time': 1e-5, 'memory': 400.0},
'transform': {'time': 1e-5, 'memory': 400.0},
Expand Down Expand Up @@ -169,7 +169,7 @@ def generate(num_rows):

@staticmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
return {
'fit': {'time': 1e-5, 'memory': 400.0},
'transform': {'time': 1e-5, 'memory': 1000.0},
Expand Down
28 changes: 14 additions & 14 deletions rdt/performance/datasets/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def generate(num_rows):

@staticmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
return {
'fit': {'time': 2e-05, 'memory': 400.0},
'transform': {'time': 5e-05, 'memory': 400.0},
Expand All @@ -47,7 +47,7 @@ def generate(num_rows):

@staticmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
return {
'fit': {'time': 2e-05, 'memory': 400.0},
'transform': {'time': 5e-05, 'memory': 1000.0},
Expand All @@ -69,7 +69,7 @@ def generate(num_rows):

@staticmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
return {
'fit': {'time': 2e-05, 'memory': 500.0},
'transform': {'time': 1e-05, 'memory': 500.0},
Expand All @@ -90,7 +90,7 @@ def generate(num_rows):

@staticmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
return {
'fit': {'time': 2e-05, 'memory': 400.0},
'transform': {'time': 1e-05, 'memory': 1000.0},
Expand Down Expand Up @@ -126,7 +126,7 @@ def generate(num_rows):

@staticmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
return {
'fit': {'time': 2e-05, 'memory': 400.0},
'transform': {'time': 1e-05, 'memory': 1000.0},
Expand Down Expand Up @@ -158,7 +158,7 @@ def generate(num_rows):

@staticmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
return {
'fit': {'time': 2e-05, 'memory': 400.0},
'transform': {'time': 1e-05, 'memory': 2000.0},
Expand All @@ -180,7 +180,7 @@ def generate(num_rows):

@staticmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
return {
'fit': {'time': 2e-05, 'memory': 400.0},
'transform': {'time': 3e-05, 'memory': 400.0},
Expand All @@ -201,7 +201,7 @@ def generate(num_rows):

@staticmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
return {
'fit': {'time': 2e-05, 'memory': 400.0},
'transform': {'time': 3e-05, 'memory': 400.0},
Expand All @@ -223,7 +223,7 @@ def generate(num_rows):

@staticmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
return {
'fit': {'time': 2e-05, 'memory': 400.0},
'transform': {'time': 4e-05, 'memory': 400.0},
Expand All @@ -244,7 +244,7 @@ def generate(num_rows):

@staticmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
return {
'fit': {'time': 2e-05, 'memory': 400.0},
'transform': {'time': 3e-05, 'memory': 400.0},
Expand All @@ -265,7 +265,7 @@ def generate(num_rows):

@staticmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
return {
'fit': {'time': 0.0004, 'memory': 2000.0},
'transform': {'time': 0.0004, 'memory': 500000.0},
Expand All @@ -286,7 +286,7 @@ def generate(num_rows):

@staticmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
return {
'fit': {'time': 0.0004, 'memory': 1000.0},
'transform': {'time': 0.0004, 'memory': 1000000.0},
Expand All @@ -307,7 +307,7 @@ def generate(num_rows):

@staticmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
return {
'fit': {'time': 0.002, 'memory': 2000.0},
'transform': {'time': 0.0004, 'memory': 500000.0},
Expand All @@ -328,7 +328,7 @@ def generate(num_rows):

@staticmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
return {
'fit': {'time': 0.001, 'memory': 1000.0},
'transform': {'time': 0.0005, 'memory': 1000000.0},
Expand Down
12 changes: 6 additions & 6 deletions rdt/performance/datasets/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def generate(num_rows):

@staticmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
return {
'fit': {'time': 5e-05, 'memory': 500.0},
'transform': {'time': 5e-05, 'memory': 350.0},
Expand All @@ -53,7 +53,7 @@ def generate(num_rows):

@staticmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
return {
'fit': {'time': 5e-05, 'memory': 500.0},
'transform': {'time': 5e-05, 'memory': 350.0},
Expand All @@ -75,7 +75,7 @@ def generate(num_rows):

@staticmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
return {
'fit': {'time': 5e-05, 'memory': 500.0},
'transform': {'time': 5e-05, 'memory': 1000.0},
Expand All @@ -99,7 +99,7 @@ def generate(num_rows):

@staticmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
return {
'fit': {'time': 5e-05, 'memory': 500.0},
'transform': {'time': 5e-05, 'memory': 350.0},
Expand All @@ -126,7 +126,7 @@ def generate(num_rows):

@staticmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
return {
'fit': {'time': 5e-05, 'memory': 500.0},
'transform': {'time': 5e-05, 'memory': 350.0},
Expand All @@ -153,7 +153,7 @@ def generate(num_rows):

@staticmethod
def get_performance_thresholds():
"""Return the expected threseholds."""
"""Return the expected thresholds."""
return {
'fit': {'time': 5e-05, 'memory': 500.0},
'transform': {'time': 5e-05, 'memory': 350.0},
Expand Down
57 changes: 57 additions & 0 deletions rdt/performance/datasets/id.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""Dataset Generators for ID transformers."""

from abc import ABC

import numpy as np

from rdt.performance.datasets.base import BaseDatasetGenerator
from rdt.performance.datasets.utils import add_nans


class RegexGeneratorGenerator(BaseDatasetGenerator, ABC):
    """Abstract parent for dataset generators whose sdtype is ``id``.

    The class only tags its subclasses with the ``id`` sdtype; the data
    generation and threshold methods are provided by the concrete subclasses.
    """

    SDTYPE = 'id'


class RandomStringGenerator(RegexGeneratorGenerator):
    """Dataset generator producing strings sampled from a fixed pool of names."""

    @staticmethod
    def generate(num_rows):
        """Return ``num_rows`` strings drawn at random from a small name pool.

        Args:
            num_rows (int):
                Number of values to sample.

        Returns:
            numpy.ndarray:
                Array of length ``num_rows`` holding the sampled names.
        """
        names = ['Alice', 'Bob', 'Charlie', 'Dave', 'Eve']
        return np.random.choice(names, num_rows)

    @staticmethod
    def get_performance_thresholds():
        """Return the expected time and memory thresholds for each operation.

        Returns:
            dict:
                Maps ``fit``, ``transform`` and ``reverse_transform`` to a dict
                with ``time`` and ``memory`` limits.
        """
        fit = {'time': 1e-05, 'memory': 500.0}
        transform = {'time': 1e-05, 'memory': 500.0}
        reverse_transform = {'time': 2e-05, 'memory': 1000.0}
        return {
            'fit': fit,
            'transform': transform,
            'reverse_transform': reverse_transform,
        }


class RandomStringNaNsGenerator(RegexGeneratorGenerator):
    """Dataset generator producing random strings that also contain nans."""

    @staticmethod
    def generate(num_rows):
        """Return ``num_rows`` random strings after passing them through ``add_nans``.

        Args:
            num_rows (int):
                Number of values to generate.

        Returns:
            The result of ``add_nans`` applied to the object-dtype string array.
        """
        strings = RandomStringGenerator.generate(num_rows)
        # Cast to object dtype before calling add_nans; presumably so that
        # missing values can be stored next to the strings (verify in add_nans).
        as_objects = strings.astype('O')
        return add_nans(as_objects)

    @staticmethod
    def get_performance_thresholds():
        """Return the expected time and memory thresholds for each operation.

        Returns:
            dict:
                Maps ``fit``, ``transform`` and ``reverse_transform`` to a dict
                with ``time`` and ``memory`` limits.
        """
        fit = {'time': 1e-05, 'memory': 400.0}
        transform = {'time': 1e-05, 'memory': 1000.0}
        reverse_transform = {'time': 2e-05, 'memory': 1000.0}
        return {
            'fit': fit,
            'transform': transform,
            'reverse_transform': reverse_transform,
        }
Loading
Loading