Skip to content

Commit

Permalink
Merge pull request #187 from george0st/change
Browse files Browse the repository at this point in the history
Focus on None value in parquet also
  • Loading branch information
george0st authored Jun 22, 2024
2 parents 295c751 + 67fb250 commit 32e494b
Show file tree
Hide file tree
Showing 62 changed files with 627 additions and 583 deletions.
2 changes: 1 addition & 1 deletion 01-model/model.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"description": "The machine learning meta-model with synthetic data (useful for MLOps/feature store), part of the quality gate concept.",
"kind": "model",
"spec": {
"version": "0.3.4",
"version": "0.3.5",
"CSV_SEPARATOR": ",",
"CSV_DECIMAL": ".",
"NONE_VALUES": true,
Expand Down
Binary file removed 02-data/01-size-100/01-basic-party.csv.gz
Binary file not shown.
Binary file removed 02-data/01-size-100/01-basic-party.parquet
Binary file not shown.
Binary file modified 02-data/01-size-100/01-basic_party.csv.gz
Binary file not shown.
Binary file modified 02-data/01-size-100/01-basic_party.parquet
Binary file not shown.
Binary file removed 02-data/01-size-100/02-basic-contact.csv.gz
Binary file not shown.
Binary file removed 02-data/01-size-100/02-basic-contact.parquet
Binary file not shown.
Binary file modified 02-data/01-size-100/02-basic_contact.csv.gz
Binary file not shown.
Binary file modified 02-data/01-size-100/02-basic_contact.parquet
Binary file not shown.
Binary file removed 02-data/01-size-100/03-basic-relation.csv.gz
Binary file not shown.
Binary file removed 02-data/01-size-100/03-basic-relation.parquet
Binary file not shown.
Binary file modified 02-data/01-size-100/03-basic_relation.csv.gz
Binary file not shown.
Binary file modified 02-data/01-size-100/03-basic_relation.parquet
Binary file not shown.
Binary file removed 02-data/01-size-100/04-basic-account.csv.gz
Binary file not shown.
Binary file removed 02-data/01-size-100/04-basic-account.parquet
Binary file not shown.
Binary file modified 02-data/01-size-100/04-basic_account.csv.gz
Binary file not shown.
Binary file modified 02-data/01-size-100/04-basic_account.parquet
Binary file not shown.
Binary file removed 02-data/01-size-100/05-basic-transaction.csv.gz
Binary file not shown.
Binary file removed 02-data/01-size-100/05-basic-transaction.parquet
Binary file not shown.
Binary file modified 02-data/01-size-100/05-basic_transaction.csv.gz
Binary file not shown.
Binary file modified 02-data/01-size-100/05-basic_transaction.parquet
Binary file not shown.
Binary file removed 02-data/01-size-100/06-basic-event.csv.gz
Binary file not shown.
Binary file removed 02-data/01-size-100/06-basic-event.parquet
Binary file not shown.
Binary file modified 02-data/01-size-100/06-basic_event.csv.gz
Binary file not shown.
Binary file modified 02-data/01-size-100/06-basic_event.parquet
Binary file not shown.
Binary file removed 02-data/01-size-100/07-basic-communication.csv.gz
Binary file not shown.
Binary file removed 02-data/01-size-100/07-basic-communication.parquet
Binary file not shown.
Binary file modified 02-data/01-size-100/07-basic_communication.csv.gz
Binary file not shown.
Binary file modified 02-data/01-size-100/07-basic_communication.parquet
Binary file not shown.
Binary file removed 02-data/02-size-1K/01-basic-party.csv.gz
Binary file not shown.
Binary file removed 02-data/02-size-1K/01-basic-party.parquet
Binary file not shown.
Binary file modified 02-data/02-size-1K/01-basic_party.csv.gz
Binary file not shown.
Binary file modified 02-data/02-size-1K/01-basic_party.parquet
Binary file not shown.
Binary file removed 02-data/02-size-1K/02-basic-contact.csv.gz
Binary file not shown.
Binary file removed 02-data/02-size-1K/02-basic-contact.parquet
Binary file not shown.
Binary file modified 02-data/02-size-1K/02-basic_contact.csv.gz
Binary file not shown.
Binary file modified 02-data/02-size-1K/02-basic_contact.parquet
Binary file not shown.
Binary file removed 02-data/02-size-1K/03-basic-relation.csv.gz
Binary file not shown.
Binary file removed 02-data/02-size-1K/03-basic-relation.parquet
Binary file not shown.
Binary file modified 02-data/02-size-1K/03-basic_relation.csv.gz
Binary file not shown.
Binary file modified 02-data/02-size-1K/03-basic_relation.parquet
Binary file not shown.
Binary file removed 02-data/02-size-1K/04-basic-account.csv.gz
Binary file not shown.
Binary file removed 02-data/02-size-1K/04-basic-account.parquet
Binary file not shown.
Binary file modified 02-data/02-size-1K/04-basic_account.csv.gz
Binary file not shown.
Binary file modified 02-data/02-size-1K/04-basic_account.parquet
Binary file not shown.
Binary file removed 02-data/02-size-1K/05-basic-transaction.csv.gz
Binary file not shown.
Binary file removed 02-data/02-size-1K/05-basic-transaction.parquet
Binary file not shown.
Binary file modified 02-data/02-size-1K/05-basic_transaction.csv.gz
Binary file not shown.
Binary file modified 02-data/02-size-1K/05-basic_transaction.parquet
Binary file not shown.
Binary file removed 02-data/02-size-1K/06-basic-event.csv.gz
Binary file not shown.
Binary file removed 02-data/02-size-1K/06-basic-event.parquet
Binary file not shown.
Binary file modified 02-data/02-size-1K/06-basic_event.csv.gz
Binary file not shown.
Binary file modified 02-data/02-size-1K/06-basic_event.parquet
Binary file not shown.
Binary file removed 02-data/02-size-1K/07-basic-communication.csv.gz
Binary file not shown.
Binary file removed 02-data/02-size-1K/07-basic-communication.parquet
Binary file not shown.
Binary file modified 02-data/02-size-1K/07-basic_communication.csv.gz
Binary file not shown.
Binary file modified 02-data/02-size-1K/07-basic_communication.parquet
Binary file not shown.
586 changes: 301 additions & 285 deletions 03-test/01-size-100.json

Large diffs are not rendered by default.

604 changes: 310 additions & 294 deletions 03-test/02-size-1k.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion generator/base_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def save(self, path, dir: str, compress: bool):
# free memory
del df

def apply_none_value(self, current_collection, property_name, default_value, lower_probability=1, none_value=""):
def apply_none_value(self, current_collection, property_name, default_value, lower_probability=1, none_value=None):
"""Apply None value, in case that current value is default.
It is based on project setting (see setting in file 'model.json',
with config values 'NONE_VALUES' and 'NONE_VALUES_PROBABILITY')
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ click~=8.1

numpy~=1.26
pandas==2.2.2
pyarrow==15.0.2
pyarrow==16.1.0

# generation of synthetic data
faker~=24.2
14 changes: 13 additions & 1 deletion tests/test_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def test_generate(self):
def test_generate_smallbulk_repeat(self):
"""Repeat generation of small files"""

for i in range(10):
for i in range(20):
lbl = f"0-size-iter{i}-8,6"

generator = SyntheticData(os.path.join("..","01-model"),TestGenerator.OUTPUT_ADR, TestGenerator.OUTPUT_ADR)
Expand Down Expand Up @@ -107,6 +107,18 @@ def test_generate_bigbulk(self):
self.assertTrue(os.path.exists(dir))
self.assertTrue(os.path.exists(path.join(dir, f"{basic_party.BasicParty.NAME}.csv")))

def test_generate_bigbulk_repeat(self):
    """Repeat generation with count=1000 and bulk_max=1000 (uncompressed).

    Runs the generator five times, each under its own label, and checks
    that every run produced its output directory and the BasicParty CSV.
    """
    model_dir = os.path.join("..", "01-model")
    expected_csv = f"{basic_party.BasicParty.NAME}.csv"

    for iteration in range(5):
        label = f"0-size-iter{iteration}-1000,1000"

        # a fresh generator per round, writing into the shared test output dir
        synthetic = SyntheticData(model_dir, TestGenerator.OUTPUT_ADR, TestGenerator.OUTPUT_ADR)
        synthetic.generate(label=label, count=1000, bulk_max=1000, compress=False)

        out_dir = path.join(TestGenerator.OUTPUT_ADR, label)
        self.assertTrue(os.path.exists(out_dir))
        self.assertTrue(os.path.exists(path.join(out_dir, expected_csv)))


def _check_csv_header(self, filename, key_text):
if os.path.exists(filename):
df = pd.read_csv(filename)
Expand Down

0 comments on commit 32e494b

Please sign in to comment.