Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

text generator and test #295

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions synthetic_data/distinct_generators/text_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import numpy as np
import string
from numpy.random import Generator
from typing import List, Optional

drahc1R marked this conversation as resolved.
Show resolved Hide resolved

def random_string(
    rng: Generator,
    chars: Optional[List[str]] = None,
    num_rows: int = 1,
    str_len_min: int = 1,
    str_len_max: int = 256,
) -> np.ndarray:
    """
    Randomly generates an array of strings with length between a min and max value

    Both length bounds are inclusive: a generated string may be exactly
    str_len_min or exactly str_len_max characters long.

    :param rng: the np rng object used to generate random values
    :type rng: numpy Generator
    :param chars: a list of values that are allowed in a string or None
        (None selects ASCII letters, digits, space and punctuation)
    :type chars: List[str], None
    :param num_rows: the number of rows in np array generated
    :type num_rows: int, optional
    :param str_len_min: the minimum length a string can be
    :type str_len_min: int, optional
    :param str_len_max: the maximum length a string can be
    :type str_len_max: int, optional

    :raises ValueError: if str_len_min is negative or larger than str_len_max

    :return: numpy array of strings
    """
    if str_len_min < 0 or str_len_min > str_len_max:
        raise ValueError(
            "str_len_min must be >= 0 and <= str_len_max. "
            f"Values provided: str_len_min={str_len_min}, "
            f"str_len_max={str_len_max}."
        )

    if chars is None:
        chars = list(
            string.ascii_uppercase
            + string.ascii_lowercase
            + string.digits
            + " "
            + string.punctuation
        )

    string_list = []
    for _ in range(num_rows):
        # endpoint=True makes the upper bound inclusive, so str_len_max is
        # reachable and str_len_min == str_len_max is a valid request.
        length = rng.integers(str_len_min, str_len_max, endpoint=True)
        string_list.append("".join(rng.choice(chars, (length,))))

    # Explicit dtype keeps the result a string array even when num_rows == 0
    # (np.array([]) would otherwise default to float64).
    return np.array(string_list, dtype=str)

drahc1R marked this conversation as resolved.
Show resolved Hide resolved

# NOTE(review): reviewers discussed collapsing random_string and random_text
# into a single function; both are kept for now and one may be removed later.
def random_text(
    rng: Generator,
    chars: Optional[List[str]] = None,
    num_rows: int = 1,
    str_len_min: int = 256,
    str_len_max: int = 1000,
) -> np.ndarray:
    """
    Randomly generates an array of text with length between a min and max value

    Validates the minimum length and then delegates the actual generation to
    random_string with the same arguments.

    :param rng: the np rng object used to generate random values
    :type rng: numpy Generator
    :param chars: a list of values that are allowed in a string or None
    :type chars: List[str], None
    :param num_rows: the number of rows in np array generated
    :type num_rows: int, optional
    :param str_len_min: the minimum length a string can be (must be larger than 255)
    :type str_len_min: int, optional
    :param str_len_max: the maximum length a string can be
    :type str_len_max: int, optional

    :raises ValueError: if str_len_min is smaller than 256

    :return: numpy array of text
    """
    if str_len_min < 256:
        raise ValueError(
            f"str_len_min must be > 255. Value provided: {str_len_min}."
        )

    return random_string(rng, chars, num_rows, str_len_min, str_len_max)
48 changes: 48 additions & 0 deletions tests/distinct_generators/test_text_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import unittest
from unittest import mock
import pandas as pd
import numpy as np
from synthetic_data.distinct_generators.text_generator import random_string, random_text


class TestTextGeneratorFunctions(unittest.TestCase):
    """Checks element type, length bounds, row count and alphabet of the generators."""

    def setUp(self):
        # Fixed seed so every test method starts from the same generator state.
        self.rng = np.random.default_rng(12345)

    def test_return_type(self):
        """Every generated element is a numpy string scalar."""
        strings = random_string(self.rng)
        texts = random_text(self.rng)
        for entry in [*strings, *texts]:
            self.assertIsInstance(entry, np.str_)

    def test_str_length(self):
        """Generated lengths stay inside the requested bounds; a too-small text minimum is rejected."""
        strings = random_string(self.rng, str_len_min=1, str_len_max=256)
        texts = random_text(self.rng, str_len_min=256, str_len_max=1000)

        # random_text refuses any minimum below 256.
        with self.assertRaises(ValueError):
            random_text(self.rng, str_len_min=255)

        string_length = len(strings[0])
        text_length = len(texts[0])
        self.assertLessEqual(string_length, 256)
        self.assertGreaterEqual(string_length, 1)
        self.assertLessEqual(text_length, 1000)
        self.assertGreaterEqual(text_length, 256)

    def test_num_rows(self):
        """The returned array holds exactly num_rows entries."""
        for expected_rows in (1, 5, 10):
            strings = random_string(self.rng, num_rows=expected_rows)
            texts = random_text(self.rng, num_rows=expected_rows)
            self.assertEqual(strings.size, expected_rows)
            self.assertEqual(texts.size, expected_rows)

    def test_chars(self):
        """Only characters from the supplied alphabet appear in the output."""
        allowed = {"0", "1"}
        strings = random_string(self.rng, chars=["0", "1"])
        texts = random_text(self.rng, chars=["0", "1"])
        for generated in (strings, texts):
            for entry in generated:
                for symbol in entry:
                    self.assertIn(symbol, allowed)