Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Focus on None value in parquet also #187

Merged
merged 6 commits into from
Jun 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion 01-model/model.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"description": "The machine learning meta-model with synthetic data (useful for MLOps/feature store), part of the quality gate concept.",
"kind": "model",
"spec": {
"version": "0.3.4",
"version": "0.3.5",
"CSV_SEPARATOR": ",",
"CSV_DECIMAL": ".",
"NONE_VALUES": true,
Expand Down
Binary file removed 02-data/01-size-100/01-basic-party.csv.gz
Binary file not shown.
Binary file removed 02-data/01-size-100/01-basic-party.parquet
Binary file not shown.
Binary file modified 02-data/01-size-100/01-basic_party.csv.gz
Binary file not shown.
Binary file modified 02-data/01-size-100/01-basic_party.parquet
Binary file not shown.
Binary file removed 02-data/01-size-100/02-basic-contact.csv.gz
Binary file not shown.
Binary file removed 02-data/01-size-100/02-basic-contact.parquet
Binary file not shown.
Binary file modified 02-data/01-size-100/02-basic_contact.csv.gz
Binary file not shown.
Binary file modified 02-data/01-size-100/02-basic_contact.parquet
Binary file not shown.
Binary file removed 02-data/01-size-100/03-basic-relation.csv.gz
Binary file not shown.
Binary file removed 02-data/01-size-100/03-basic-relation.parquet
Binary file not shown.
Binary file modified 02-data/01-size-100/03-basic_relation.csv.gz
Binary file not shown.
Binary file modified 02-data/01-size-100/03-basic_relation.parquet
Binary file not shown.
Binary file removed 02-data/01-size-100/04-basic-account.csv.gz
Binary file not shown.
Binary file removed 02-data/01-size-100/04-basic-account.parquet
Binary file not shown.
Binary file modified 02-data/01-size-100/04-basic_account.csv.gz
Binary file not shown.
Binary file modified 02-data/01-size-100/04-basic_account.parquet
Binary file not shown.
Binary file removed 02-data/01-size-100/05-basic-transaction.csv.gz
Binary file not shown.
Binary file removed 02-data/01-size-100/05-basic-transaction.parquet
Binary file not shown.
Binary file modified 02-data/01-size-100/05-basic_transaction.csv.gz
Binary file not shown.
Binary file modified 02-data/01-size-100/05-basic_transaction.parquet
Binary file not shown.
Binary file removed 02-data/01-size-100/06-basic-event.csv.gz
Binary file not shown.
Binary file removed 02-data/01-size-100/06-basic-event.parquet
Binary file not shown.
Binary file modified 02-data/01-size-100/06-basic_event.csv.gz
Binary file not shown.
Binary file modified 02-data/01-size-100/06-basic_event.parquet
Binary file not shown.
Binary file removed 02-data/01-size-100/07-basic-communication.csv.gz
Binary file not shown.
Binary file removed 02-data/01-size-100/07-basic-communication.parquet
Binary file not shown.
Binary file modified 02-data/01-size-100/07-basic_communication.csv.gz
Binary file not shown.
Binary file modified 02-data/01-size-100/07-basic_communication.parquet
Binary file not shown.
Binary file removed 02-data/02-size-1K/01-basic-party.csv.gz
Binary file not shown.
Binary file removed 02-data/02-size-1K/01-basic-party.parquet
Binary file not shown.
Binary file modified 02-data/02-size-1K/01-basic_party.csv.gz
Binary file not shown.
Binary file modified 02-data/02-size-1K/01-basic_party.parquet
Binary file not shown.
Binary file removed 02-data/02-size-1K/02-basic-contact.csv.gz
Binary file not shown.
Binary file removed 02-data/02-size-1K/02-basic-contact.parquet
Binary file not shown.
Binary file modified 02-data/02-size-1K/02-basic_contact.csv.gz
Binary file not shown.
Binary file modified 02-data/02-size-1K/02-basic_contact.parquet
Binary file not shown.
Binary file removed 02-data/02-size-1K/03-basic-relation.csv.gz
Binary file not shown.
Binary file removed 02-data/02-size-1K/03-basic-relation.parquet
Binary file not shown.
Binary file modified 02-data/02-size-1K/03-basic_relation.csv.gz
Binary file not shown.
Binary file modified 02-data/02-size-1K/03-basic_relation.parquet
Binary file not shown.
Binary file removed 02-data/02-size-1K/04-basic-account.csv.gz
Binary file not shown.
Binary file removed 02-data/02-size-1K/04-basic-account.parquet
Binary file not shown.
Binary file modified 02-data/02-size-1K/04-basic_account.csv.gz
Binary file not shown.
Binary file modified 02-data/02-size-1K/04-basic_account.parquet
Binary file not shown.
Binary file removed 02-data/02-size-1K/05-basic-transaction.csv.gz
Binary file not shown.
Binary file removed 02-data/02-size-1K/05-basic-transaction.parquet
Binary file not shown.
Binary file modified 02-data/02-size-1K/05-basic_transaction.csv.gz
Binary file not shown.
Binary file modified 02-data/02-size-1K/05-basic_transaction.parquet
Binary file not shown.
Binary file removed 02-data/02-size-1K/06-basic-event.csv.gz
Binary file not shown.
Binary file removed 02-data/02-size-1K/06-basic-event.parquet
Binary file not shown.
Binary file modified 02-data/02-size-1K/06-basic_event.csv.gz
Binary file not shown.
Binary file modified 02-data/02-size-1K/06-basic_event.parquet
Binary file not shown.
Binary file removed 02-data/02-size-1K/07-basic-communication.csv.gz
Binary file not shown.
Binary file removed 02-data/02-size-1K/07-basic-communication.parquet
Binary file not shown.
Binary file modified 02-data/02-size-1K/07-basic_communication.csv.gz
Binary file not shown.
Binary file modified 02-data/02-size-1K/07-basic_communication.parquet
Binary file not shown.
586 changes: 301 additions & 285 deletions 03-test/01-size-100.json

Large diffs are not rendered by default.

604 changes: 310 additions & 294 deletions 03-test/02-size-1k.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion generator/base_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def save(self, path, dir: str, compress: bool):
# free memory
del df

def apply_none_value(self, current_collection, property_name, default_value, lower_probability=1, none_value=""):
def apply_none_value(self, current_collection, property_name, default_value, lower_probability=1, none_value=None):
"""Apply None value, in case that current value is default.
It is based on project setting (see setting in file 'model.json',
with config values 'NONE_VALUES' and 'NONE_VALUES_PROBABILITY')
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ click~=8.1

numpy~=1.26
pandas==2.2.2
pyarrow==15.0.2
pyarrow==16.1.0

# generation of synthetic data
faker~=24.2
14 changes: 13 additions & 1 deletion tests/test_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def test_generate(self):
def test_generate_smallbulk_repeat(self):
"""Repeat generation of small files"""

for i in range(10):
for i in range(20):
lbl = f"0-size-iter{i}-8,6"

generator = SyntheticData(os.path.join("..","01-model"),TestGenerator.OUTPUT_ADR, TestGenerator.OUTPUT_ADR)
Expand Down Expand Up @@ -107,6 +107,18 @@ def test_generate_bigbulk(self):
self.assertTrue(os.path.exists(dir))
self.assertTrue(os.path.exists(path.join(dir, f"{basic_party.BasicParty.NAME}.csv")))

def test_generate_bigbulk_repeat(self):
    """Repeat generation of big files.

    Runs the generator five times with ``count=1000`` and ``bulk_max=1000``
    (uncompressed output), each time under a distinct label, and checks
    that the labelled output directory and the party CSV file exist.
    """
    model_dir = os.path.join("..", "01-model")

    for iteration in range(5):
        label = f"0-size-iter{iteration}-1000,1000"

        # A fresh generator per iteration, reading and writing the same
        # output location as the other tests in this class.
        synthetic = SyntheticData(model_dir, TestGenerator.OUTPUT_ADR, TestGenerator.OUTPUT_ADR)
        synthetic.generate(label=label, count=1000, bulk_max=1000, compress=False)

        # Each run must leave its own directory with the party CSV inside.
        output_dir = path.join(TestGenerator.OUTPUT_ADR, label)
        self.assertTrue(os.path.exists(output_dir))
        party_csv = path.join(output_dir, f"{basic_party.BasicParty.NAME}.csv")
        self.assertTrue(os.path.exists(party_csv))


def _check_csv_header(self, filename, key_text):
if os.path.exists(filename):
df = pd.read_csv(filename)
Expand Down