From ccb189c8640e0628b3fe316d835a5ebc51f59120 Mon Sep 17 00:00:00 2001 From: jist <95856749+george0st@users.noreply.github.com> Date: Wed, 14 Aug 2024 16:13:41 +0200 Subject: [PATCH 1/4] Extend check_csv_header --- tests/test_generator.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/tests/test_generator.py b/tests/test_generator.py index 9925d7f..5dcab19 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -128,10 +128,11 @@ def test_generate_bigbulk_repeat(self): self.assertTrue(os.path.exists(path.join(dir, f"{basic_contact.BasicContact.NAME}.csv"))) - def _check_csv_header(self, filename, key_text): + def _check_csv_header(self, filename, key_texts: list): if os.path.exists(filename): - df = pd.read_csv(filename) - self.assertTrue(df.to_string().find(key_text) >= 0) + content = pd.read_csv(filename).to_string() + for key_text in key_texts: + self.assertTrue(content.find(key_text) >= 0) def test_csv_structure(self): """All csv have header""" @@ -142,12 +143,12 @@ def test_csv_structure(self): dir = path.join(TestGenerator.OUTPUT_ADR, lbl) self.assertTrue(os.path.exists(dir)) - self._check_csv_header(path.join(dir, f"{basic_party.BasicParty.NAME}.csv"), "party_id") - self._check_csv_header(path.join(dir, f"{basic_contact.BasicContact.NAME}.csv"), "party_id") - self._check_csv_header(path.join(dir, f"{basic_relation.BasicRelation.NAME}.csv"), "party_id") - self._check_csv_header(path.join(dir, f"{basic_account.BasicAccount.NAME}.csv"), "party_id") - self._check_csv_header(path.join(dir, f"{basic_transaction.BasicTransaction.NAME}.csv"), "account_id") - self._check_csv_header(path.join(dir, f"{basic_event.BasicEvent.NAME}.csv"), "party_id") - self._check_csv_header(path.join(dir, f"{basic_communication.BasicCommunication.NAME}.csv"), "party_id") + self._check_csv_header(path.join(dir, f"{basic_party.BasicParty.NAME}.csv"), ["party_id"]) + self._check_csv_header(path.join(dir, f"{basic_contact.BasicContact.NAME}.csv"), ["party_id"]) + self._check_csv_header(path.join(dir, f"{basic_relation.BasicRelation.NAME}.csv"), ["party_id"]) + self._check_csv_header(path.join(dir, f"{basic_account.BasicAccount.NAME}.csv"), ["party_id"]) + self._check_csv_header(path.join(dir, f"{basic_transaction.BasicTransaction.NAME}.csv"), ["account_id"]) + self._check_csv_header(path.join(dir, f"{basic_event.BasicEvent.NAME}.csv"), ["party_id"]) + self._check_csv_header(path.join(dir, f"{basic_communication.BasicCommunication.NAME}.csv"), ["party_id"]) # TODO: Add batch size under limit, it will generate wrong dataset \ No newline at end of file From 79e3141c6089eb952568e49e024c3f11b9cf6565 Mon Sep 17 00:00:00 2001 From: jist <95856749+george0st@users.noreply.github.com> Date: Wed, 14 Aug 2024 16:14:03 +0200 Subject: [PATCH 2/4] Update todo_list.md --- docs/todo_list.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/todo_list.md b/docs/todo_list.md index 10f309e..8bfaf95 100644 --- a/docs/todo_list.md +++ b/docs/todo_list.md @@ -2,8 +2,6 @@ The list of expected/future improvements: -0. Extend columns in function '_check_csv_header' - 1. Add ability to select type of output format (CSV or parquet) in commandline - current solution generate both formats without ability to choose preferences From 6fc122cbb5c7d2193480ca7efb6305862b74aace Mon Sep 17 00:00:00 2001 From: jist <95856749+george0st@users.noreply.github.com> Date: Wed, 14 Aug 2024 16:20:58 +0200 Subject: [PATCH 3/4] Update test_generator.py --- tests/test_generator.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/tests/test_generator.py b/tests/test_generator.py index 5dcab19..de06251 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -143,12 +143,19 @@ def test_csv_structure(self): dir = path.join(TestGenerator.OUTPUT_ADR, lbl) self.assertTrue(os.path.exists(dir)) - self._check_csv_header(path.join(dir, f"{basic_party.BasicParty.NAME}.csv"), ["party_id"]) - self._check_csv_header(path.join(dir, f"{basic_contact.BasicContact.NAME}.csv"), ["party_id"]) - self._check_csv_header(path.join(dir, f"{basic_relation.BasicRelation.NAME}.csv"), ["party_id"]) - self._check_csv_header(path.join(dir, f"{basic_account.BasicAccount.NAME}.csv"), ["party_id"]) - self._check_csv_header(path.join(dir, f"{basic_transaction.BasicTransaction.NAME}.csv"), ["account_id"]) - self._check_csv_header(path.join(dir, f"{basic_event.BasicEvent.NAME}.csv"), ["party_id"]) - self._check_csv_header(path.join(dir, f"{basic_communication.BasicCommunication.NAME}.csv"), ["party_id"]) + self._check_csv_header(path.join(dir, f"{basic_party.BasicParty.NAME}.csv"), + ["party_id", "party_gender"]) + self._check_csv_header(path.join(dir, f"{basic_contact.BasicContact.NAME}.csv"), + ["party_id", "contact_id", "contact_state"]) + self._check_csv_header(path.join(dir, f"{basic_relation.BasicRelation.NAME}.csv"), + ["party_id", "relation_id", "relation_type"]) + self._check_csv_header(path.join(dir, f"{basic_account.BasicAccount.NAME}.csv"), + ["party_id", "account_id", "account_state"]) + self._check_csv_header(path.join(dir, f"{basic_transaction.BasicTransaction.NAME}.csv"), + ["account_id", "transaction_id", "transaction_direction"]) + self._check_csv_header(path.join(dir, f"{basic_event.BasicEvent.NAME}.csv"), + ["party_id", "event_id", "session_id"]) + self._check_csv_header(path.join(dir, f"{basic_communication.BasicCommunication.NAME}.csv"), + ["party_id", "communication_id", "content"]) # TODO: Add batch size under limit, it will generate wrong dataset \ No newline at end of file From a8294c81b6f96dcad4f687afdd89bcad76a3e6c6 Mon Sep 17 00:00:00 2001 From: jist <95856749+george0st@users.noreply.github.com> Date: Wed, 14 Aug 2024 16:21:49 +0200 Subject: [PATCH 4/4] Update test_generator.py --- tests/test_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_generator.py b/tests/test_generator.py index de06251..bef8915 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -156,6 +156,6 @@ def test_csv_structure(self): self._check_csv_header(path.join(dir, f"{basic_event.BasicEvent.NAME}.csv"), ["party_id", "event_id", "session_id"]) self._check_csv_header(path.join(dir, f"{basic_communication.BasicCommunication.NAME}.csv"), - ["party_id", "communication_id", "content"]) + ["party_id", "communication_id", "content", "content_sentiment"]) # TODO: Add batch size under limit, it will generate wrong dataset \ No newline at end of file