Skip to content

Commit

Permalink
Add text generator for sdtype like
Browse files Browse the repository at this point in the history
  • Loading branch information
pvk-developer committed Sep 11, 2024
1 parent a00cbf0 commit 3002025
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 1 deletion.
2 changes: 1 addition & 1 deletion rdt/performance/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
id,
numerical,
pii,
text,
)
from rdt.performance.datasets.base import BaseDatasetGenerator

__all__ = [
Expand Down
57 changes: 57 additions & 0 deletions rdt/performance/datasets/text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""Dataset Generators for 'text' transformers."""

from abc import ABC

import numpy as np

from rdt.performance.datasets.base import BaseDatasetGenerator
from rdt.performance.datasets.utils import add_nans


class RegexGeneratorGenerator(BaseDatasetGenerator, ABC):
    """Abstract base for the dataset generators that produce 'text' data.

    It only pins the sdtype; subclasses supply the actual data generation.
    """

    SDTYPE = 'text'


class RandomStringGenerator(RegexGeneratorGenerator):
    """Dataset generator that returns an array of randomly picked strings."""

    @staticmethod
    def generate(num_rows):
        """Sample ``num_rows`` strings from a fixed pool of five names.

        Args:
            num_rows (int):
                Number of values to draw; forwarded as ``size`` to
                ``np.random.choice``.

        Returns:
            numpy.ndarray:
                The sampled names.
        """
        name_pool = ['Alice', 'Bob', 'Charlie', 'Dave', 'Eve']
        sampled = np.random.choice(name_pool, size=num_rows)
        return sampled

    @staticmethod
    def get_performance_thresholds():
        """Return the expected thresholds.

        Returns:
            dict:
                Maps ``'fit'``, ``'transform'`` and ``'reverse_transform'``
                to a dict holding a ``'time'`` and a ``'memory'`` limit.
                NOTE(review): units are not stated in this module -- confirm
                against the performance test harness.
        """
        fit_limits = {'time': 1e-05, 'memory': 500.0}
        transform_limits = {'time': 1e-05, 'memory': 500.0}
        reverse_limits = {'time': 2e-05, 'memory': 1000.0}
        return {
            'fit': fit_limits,
            'transform': transform_limits,
            'reverse_transform': reverse_limits,
        }


class RandomStringNaNsGenerator(RegexGeneratorGenerator):
    """Dataset generator that returns random strings mixed with nans."""

    @staticmethod
    def generate(num_rows):
        """Build ``num_rows`` random strings and run them through ``add_nans``.

        Args:
            num_rows (int):
                Number of values to generate.

        Returns:
            The result of ``add_nans`` applied to the object-dtype array of
            random strings (presumably the same array with some entries
            replaced by NaN -- verify against ``add_nans``).
        """
        random_strings = RandomStringGenerator.generate(num_rows)
        # Cast to object dtype before handing the array to add_nans.
        as_objects = random_strings.astype('O')
        return add_nans(as_objects)

    @staticmethod
    def get_performance_thresholds():
        """Return the expected thresholds.

        Returns:
            dict:
                Maps ``'fit'``, ``'transform'`` and ``'reverse_transform'``
                to a dict holding a ``'time'`` and a ``'memory'`` limit.
                NOTE(review): units are not stated in this module -- confirm
                against the performance test harness.
        """
        fit_limits = {'time': 1e-05, 'memory': 400.0}
        transform_limits = {'time': 1e-05, 'memory': 1000.0}
        reverse_limits = {'time': 2e-05, 'memory': 1000.0}
        return {
            'fit': fit_limits,
            'transform': transform_limits,
            'reverse_transform': reverse_limits,
        }

0 comments on commit 3002025

Please sign in to comment.