From 335b43e3b06d6b959c137934b87f5bfb575394ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A9lanie=20F?= <73828657+melanie-fressard@users.noreply.github.com> Date: Wed, 13 Dec 2023 20:57:08 +0000 Subject: [PATCH 01/12] issue #67 - addibg tests + code correction --- ailab/db/crawler/__init__.py | 12 ++++--- tests/test_db_crawler.py | 65 ++++++++++++++++++++++++++++++++++++ tests/test_db_data.py | 37 ++++++++++---------- tests/test_db_schema.py | 13 +++++++- 4 files changed, 104 insertions(+), 23 deletions(-) diff --git a/ailab/db/crawler/__init__.py b/ailab/db/crawler/__init__.py index bb9ff45..89964cc 100644 --- a/ailab/db/crawler/__init__.py +++ b/ailab/db/crawler/__init__.py @@ -78,10 +78,12 @@ def store_chunk_item(cursor, item): """, data ) - data['chunk_id'] = cursor.fetchone()['id'] + row = cursor.fetchone() + if row is not None: + data['chunk_id'] = row['id'] cursor.execute( """ - INSERT INTO html_content_to_chunk (html_content_md5hash, chunk_id) + INSERT INTO html_content_to_chunk (md5hash, chunk_id) VALUES(%(md5hash)s, %(chunk_id)s::UUID) ON CONFLICT DO NOTHING """, @@ -103,7 +105,9 @@ def store_chunk_item(cursor, item): """, data ) - data['token_id'] = cursor.fetchone()['id'] + res = cursor.fetchone() + if res is not None: + data['token_id'] = res['id'] return data except psycopg.IntegrityError as e: raise db.DBError("Error storing chunk item for %s" % item['url']) from e @@ -163,7 +167,7 @@ def fetch_crawl_ids_without_chunk(cursor): """ SELECT crawl.id FROM crawl LEFT JOIN html_content_to_chunk - ON crawl.md5hash = html_content_to_chunk.html_content_md5hash + ON crawl.md5hash = html_content_to_chunk.md5hash WHERE chunk_id IS NULL """ ).as_string(cursor) diff --git a/tests/test_db_crawler.py b/tests/test_db_crawler.py index 891b424..19ffa2e 100644 --- a/tests/test_db_crawler.py +++ b/tests/test_db_crawler.py @@ -74,3 +74,68 @@ def test_fetch_chunk_id_without_embedding(self): rows = crawler.fetch_chunk_id_without_embedding(cursor, 'test-model') _entity_id = rows[0] self.connection.rollback() + +# generated tests + + def test_store_chunk_item(self): + """Test storing a chunk item.""" + with db.cursor(self.connection) as cursor: + item = { + "url": "https://inspection.canada.ca/a-propos-de-l-acia/fra/1299008020759/1299008778654", + "title": "À propos de l'ACIA - Agence canadienne d'inspection des aliments", + "text_content": "This is an example content.", + "tokens": [73053,10045,409,326,6,1741,5987,2998,37622,934,6,8629,44618,409,38682,1001,367,3869,348,2328,7330,52760,11,326,6,1741,5987,264,653,348,5642,11837,266,7930,2995,64097,1208,4371,392,1018,978,38450,12267,11,1208,77323,951,4039,12249,11,1208,9313,951,348,19395,978,2629,2249,1880,326,69537,12416,8065,84751,6625,13,17360,535,89,551,5690,6405,13674,33867,14318,3765,91080,1370,2126,22811,11876,459,5979,729,3539,5512,409,326,6,1741,5987,56311,39929,64079,3869,951,90108,35933,46680,969,645,551,2009,85182,40280,3930,7008,90108,1082,3625,459,5979,729,3539,5512,5019,3625,4824,76,1154,25540,5512,1370,514,72601,409,2343,68,2405,10610,953,13,2998,62163,42145,40948,5512,294,6,97675,4149,3462,16848,85046,1880,83229,70,91555,11683,12416,3869,326,6,26125,1880,1208,9313,951,5790,325,625,3808,1732,36527,3459,360,17724,409,326,6,1741,5987,22555,951,24261,288,1880,951,917,2053,3700,34965,11,93084,1880,3057,65,811,57967,1220,294,26248,1088,1759,409,1208,6377,30052,9359,10333,5392,788,95188,4949,11,2126,22811,11,1008,44357,11,95995,409,3729,8471,1880,5790,325,625,3808,65381,10045,409,17317,24789,266,11,9131,11,11376,11,4046,6414,51084,951,97035,13,8245,22139,64829,29696,409,11692,1880,409,85182,77,685,328,5164,409,80080,423,944,59307,80080,11,17889,1354,5860,24985,3946,11,62163,409,5790,325,625,3808,951,35030,3557,1880,3930,586,13,51097,4972,35933,44564,20392,3869,326,58591,73511,7769,3136,13,3744,268,2850,1900,5856,288,9952,3625,447,2053,17317,58673,484,2439,5019,65827,268,404,1880,5201,261,3625,7617,288,409,77463,38450,12267,1880,1097,73511,15171,1208,9313,1880,1208,6225,31617,8082,951,1615,316,18543,1759,13,5856,288,8666,266,22589,1219,13109,8666,50848,294,6,4683,15916,11,1219,13109,8666,1413,3930,49904,265,11,43252,89781,288,1765,3625,13826,25108,4978,409,51304,13,29124,6414,51084,951,97222,1880,951,3600,45629,288,5019,38682,404,15907,22639,9952,3625,2027,38647,11,3625,1615,316,18543,1759,11,326,6,485,592,7379,1880,3625,46106,31957,1821,13,20915,21066,409,1208,26965,13109,44564,3057,1557,409,1208,26965,13109,38682,1001,12267,13,2998,2842,40280,1880,82620,27220,18042,283,35573,514,82620,27220,1880,326,91655,11323,266,2428,13,2998,29033,6672,51097,737,2727,392,645,432,1137,625,3808,1765,3625,737,2727,392,645,1880,46106,21744,10515,5512,409,326,6,1741,5987,13] + } + crawler.store_chunk_item(cursor, item) + self.connection.rollback() + self.assertTrue(item["title"], "This is an example.") + + def test_store_crawl_item(self): + """Test storing a crawl item.""" + with db.cursor(self.connection) as cursor: + item = { + "url": "https://example.com", + "title": "Example", + "html_content": "This is an example.", + "lang": "en", + "last_crawled": "2022-01-01", + "last_updated": "2022-01-01" + } + crawler.store_crawl_item(cursor, item) + self.connection.rollback() + self.assertTrue(item["url"], "https://example.com") + + def test_store_embedding_item(self): + """Test storing an embedding item.""" + with db.cursor(self.connection) as cursor: + item = { + "token_id": "be612259-9b52-42fd-8d0b-d72120efa3b6", + "embedding": [1,409,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,73053,10045,409,326,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,73053,10045,409,326,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,73053,10045,409,326,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,73053,10045,409,326,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,5987,2998,37622,934,6,8629,44618,409,38682,1001,367,3869,348,2328,7330,52760,11,326,6,1741,5987,264,653,348,5642,11837,266,7930,2995,64097,1208,4371,392,1018,978,38450,12267,11,1208,77323,951,4039,12249,11,1208,9313,951,348,19395,978,2629,2249,1880,326,69537,12416,8065,84751,6625,13,17360,535,89,551,5690,6405,13674,33867,14318,3765,91080,1370,2126,22811,11876,459,5979,729,3539,5512,409,326,6,1741,5987,56311,39929,64079,3869,951,90108,35933,46680,969,645,551,2009,85182,40280,3930,7008,90108,1082,3625,459,5979,729,3539,5512,5019,3625,4824,76,1154,25540,5512,1370,514,72601,409,2343,68,2405,10610,953,13,2998,62163,42145,40948,5512,294,6,97675,4149,3462,16848,85046,1880,83229,70,91555,11683,12416,3869,326,6,26125,1880,1208,9313,951,5790,325,625,3808,1732,36527,3459,360,17724,409,326,6,1741,5987,22555,951,24261,288,1880,951,917,2053,3700,34965,11,93084,1880,3057,65,811,57967,1220,294,26248,1088,1759,409,1208,6377,30052,9359,10333,5392,788,95188,4949,11,2126,22811,11,1008,44357,11,95995,409,3729,8471,1880,5790,325,625,3808,65381,10045,409,17317,24789,266,11,9131,11,11376,11,4046,6414,51084,951,97035,13,8245,22139,64829,29696,409,11692,1880,409,85182,77,685,328,5164,409,80080,423,944,59307,80080,11,17889,1354,5860,24985,3946,11,62163,409,5790,325,625,3808,951,35030,3557,1880,3930,586,13,51097,4972,35933,44564,20392,3869,326,58591,73511,7769,3136,13,3744,268,2850,1900,5856,288,9952,3625,447,2053,17317,58673,484,2439,5019,65827,268,404,1880,5201,261,3625,7617,288,409,77463,38450,12267,1880,1097,73511,15171,1208,9313,1880,1208,6225,31617,8082,951,1615,316,18543,1759,13,5856,288,8666,266,22589,1219,13109,8666,50848,294,6,4683,15916,11,1219,13109,8666,1413,3930,49904,265,11,43252,89781,288,1765,3625,13826,25108,4978,409,51304,13,29124,6414,51084,951,97222,1880,951,3600,45629,288,5019,38682,404,15907,22639,9952,3625,2027,38647,11,3625,1615,316,18543,1759,11,326,6,485,592,7379,1880,3625,46106,31957,1821,13,20915,21066,409,1208,26965,13109,44564,3057,1557,409,1208,26965,13109,38682,1001,12267,13,2998,2842,40280,1880,82620,27220,18042,283,35573,514,82620,27220,1880,326,91655,11323,266,2428,13,2998,29033,6672,51097,737,2727,392,645,432,1137,625,3808,1765,3625,737,2727,392,645,1880,46106,21744,10515,5512,409,326,6,1741,5987,13,73053,10045,409,326,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,5987,2998,37622,934,6,8629,44618,409,38682,1001,367,3869,348,2328,7330,52760,11,326,6,1741,5987,264,653,348,5642,11837,266,7930,2995,64097,1208,4371,392,1018,978,38450,12267,11,1208,77323,951,4039,12249,11,1208,9313,951,348,19395,978,2629,2249,1880,326,69537,12416,8065,84751,6625,13,17360,535,89,551,5690,6405,13674,33867,14318,3765,91080,1370,2126,22811,11876,459,5979,729,3539,5512,409,326,6,1741,5987,56311,39929,64079,3869,951,90108,35933,46680,969,645,551,2009,85182,40280,3930,7008,90108,1082,3625,459,5979,729,3539,5512,5019,3625,4824,76,1154,25540,5512,1370,514,72601,409,2343,68,2405,10610,953,13,2998,62163,42145,40948,5512,294,6,97675,4149,3462,16848,85046,1880,83229,70,91555,11683,12416,3869,326,6,26125,1880,1208,9313,951,5790,325,625,3808,1732,36527,3459,360,17724,409,326,6,1741,5987,22555,951,24261,288,1880,951,917,2053,3700,34965,11,93084,1880,3057,65,811,57967,1220,294,26248,1088,1759,409,1208,6377,30052,9359,10333,5392,788,95188,4949,11,2126,22811,11,1008,44357,11,95995,409,3729,8471,1880,5790,325,625,3808,65381,10045,409,17317,24789,266,11,9131,11,11376,11,4046,6414,51084,951,97035,13,8245,22139,64829,29696,409,11692,1880,409,85182,77,685,328,5164,409,80080,423,944,59307,80080,11,17889,1354,5860,24985,3946,11,62163,409,5790,325,625,3808,951,35030,3557,1880,3930,586,13,51097,4972,35933,44564,20392,3869,326,58591,73511,7769,3136,13,3744,268,2850,1900,5856,288,9952,3625,447,2053,17317,58673,484,2439,5019,65827,268,404,1880,5201,261,3625,7617,288,409,77463,38450,12267,1880,1097,73511,15171,1208,9313,1880,1208,6225,31617,8082,951,1615,316,18543,1759,13,5856,288,8666,266,22589,1219,13109,8666,50848,294,6,4683,15916,11,1219,13109,8666,1413,3930,49904,265,11,43252,89781,288,1765,3625,13826,25108,4978,409,51304,13,29124,6414,51084,951,97222,1880,951,3600,45629,288,5019,38682,404,15907,22639,9952,3625,2027,38647,11,3625,1615,316,18543,1759,11,326,6,485,592,7379,1880,3625,46106,31957,1821,13,20915,21066,409,1208,26965,13109,44564,3057,1557,409,1208,26965,13109,38682,1001,12267,13,2998,2842,40280,1880,82620,27220,18042,283,35573,514,82620,27220,1880,326,91655,11323,266,2428,13,2998,29033,6672,51097,737,2727,392,645,432,1137,625,3808,1765,3625,737,2727,392,645,1880,46106,21744,10515,5512,409,326,6,1741,5987,13, 73053,10045,409,326,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,5987,2998,37622,934,6,8629,44618,409,38682,1001,367,3869,348,2328,7330,52760,11,326,6,1741,5987,264,653,348,5642,11837,266,7930,2995,64097,1208,4371,392,1018,978,38450,12267,11,1208,77323,951,4039,12249,11,1208,9313,951,348,19395,978,2629,2249,1880,326,69537,12416,8065,84751,6625,13,17360,535,89,551,5690,6405,13674,33867,14318,3765,91080,1370,2126,22811,11876,459,5979,729,3539,5512,409,326,6,1741,5987,56311,39929,64079,3869,951,90108,35933,46680,969,645,551,2009,85182,40280,3930,7008,90108,1082,3625,459,5979,729,3539,5512,5019,3625,4824,76,1154,25540,5512,1370,514,72601,409,2343,68,2405,10610,953,13,2998,62163,42145,40948,5512,294,6,97675,4149,3462,16848,85046,1880,83229,70,91555,11683,12416,3869,326,6,26125,1880,1208,9313,951,5790,325,625,3808,1732,36527,3459,360,17724,409,326,6,1741,5987,22555,951,24261,288,1880,951,917,2053,3700,34965,11,93084,1880,3057,65,811,57967,1220,294,26248,1088,1759,409,1208,6377,30052,9359,10333,5392,788,95188,4949,11,2126,22811,11,1008,44357,11,95995,409,3729,8471,1880,5790,325,625,3808,65381,10045,409,17317,24789,266,11,9131,11,11376,11,4046,6414,51084,951,97035,13,8245,22139,64829,29696,409,11692,1880,409,85182,77,685,328,5164,409,80080,423,944,59307,80080,11,17889,1354,5860,24985,3946,11,62163,409,5790,325,625,3808,951,35030,3557,1880,3930,586,13,51097,4972,35933,44564,20392,3869,326,58591,73511,7769,3136,13,3744,268,2850,1900,5856,288,9952,3625,447,2053,17317,58673,484,2439,5019,65827,268,404,1880,5201,261,3625,7617,288,409,77463,38450,12267,1880,1097,73511,15171,1208,9313,1880,1208,6225,31617,8082,951,1615,316,18543,1759,13,5856,288,8666,266,22589,1219,13109,8666,50848,294,6,4683,15916,11,1219,13109,8666,1413,3930,49904,265,11,43252,89781,288,1765,3625,13826,25108,4978,409,51304,13,29124,6414,51084,951,97222,1880,951,3600,45629,288,5019,38682,404,15907,22639,9952,3625,2027,38647,11,3625,1615,316,18543,1759,11,326,6,485,592,7379,1880,3625,46106,31957,1821,13,20915,21066,409,1208,26965,13109,44564,3057,1557,409,1208,26965,13109,38682,1001,12267,13,2998,2842,40280,1880,82620,27220,18042,283,35573,514,82620,27220,1880,326,91655,11323,266,2428,13,2998,29033,6672,51097,737,2727,392,645,432,1137,625,3808,1765,3625,737,2727,392,645,1880,46106,21744,10515,5512,409,326,6,1741,5987,13], + "embedding_model": "test-model" + } + crawler.store_embedding_item(cursor, item) + self.connection.rollback() + self.assertTrue(item["token_id"], "1") + + # def test_fetch_crawl_ids_without_chunk(self): + # """Test fetching crawl IDs without a chunk.""" + # with db.cursor(self.connection) as cursor: + # id = crawler.fetch_crawl_ids_without_chunk(cursor) + # self.connection.rollback() + # self.assertTrue() + + # def test_fetch_crawl_row(self): + # """Test fetching a crawl row.""" + # with db.cursor(self.connection) as cursor: + # id = crawler.fetch_crawl_row(cursor, "https://example.com") + # self.connection.rollback() + # self.assertFalse() + + + # def test_fetch_chunk_token_row(self): + # """Test fetching a chunk token row.""" + # with db.cursor(self.connection) as cursor: + # crawler.fetch_chunk_token_row(cursor, "https://example.com") + # self.connection.rollback() + # self.assertFalse() + diff --git a/tests/test_db_data.py b/tests/test_db_data.py index cf04789..82a740c 100644 --- a/tests/test_db_data.py +++ b/tests/test_db_data.py @@ -1,6 +1,7 @@ import unittest import ailab.db as db +import testing_utils as test class TestDBData(unittest.TestCase): @@ -20,14 +21,14 @@ def tearDown(self): def upgrade_schema(self): return - # if test.LOUIS_SCHEMA == 'louis_v005': - # self.execute('sql/2023-07-11-hotfix-xml-not-well-formed.sql') - # self.execute('sql/2023-07-11-populate-link.sql') - # self.execute('sql/2023-07-12-score-current.sql') - # self.execute('sql/2023-07-19-modify-score_type-add-similarity.sql') - # self.execute('sql/2023-07-19-modified-documents.sql') - # self.execute('sql/2023-07-19-weighted_search.sql') - # self.execute('sql/2023-07-21-default_chunk.sql') + if test.LOUIS_SCHEMA == 'louis_v004': + self.execute('sql/2023-07-11-hotfix-xml-not-well-formed.sql') + self.execute('sql/2023-07-11-populate-link.sql') + self.execute('sql/2023-07-12-score-current.sql') + self.execute('sql/2023-07-19-modify-score_type-add-similarity.sql') + self.execute('sql/2023-07-19-modified-documents.sql') + self.execute('sql/2023-07-19-weighted_search.sql') + self.execute('sql/2023-07-21-default_chunk.sql') def test_well_formed_xml(self): self.upgrade_schema() @@ -39,13 +40,13 @@ def test_well_formed_xml(self): result = self.cursor.fetchall() self.assertEqual(result[0]['count'], 0, "All xml should be well formed") - # def test_every_crawl_doc_should_have_at_least_one_chunk(self): - # # self.execute('sql/2023-08-09-issue8-html_content-table.sql') - # self.cursor.execute(""" - # select count(*) - # from crawl left join documents on crawl.id = documents.id - # where documents.id is null""") - # result = self.cursor.fetchall() - # self.assertEqual( - # result[0]['count'], 0, - # "Every crawl doc should have at least one chunk") + def test_every_crawl_doc_should_have_at_least_one_chunk(self): + # self.execute('sql/2023-08-09-issue8-html_content-table.sql') + self.cursor.execute(""" + select count(*) + from crawl left join documents on crawl.id = documents.id + where documents.id is null""") + result = self.cursor.fetchall() + self.assertEqual( + result[0]['count'], 0, + "Every crawl doc should have at least one chunk") diff --git a/tests/test_db_schema.py b/tests/test_db_schema.py index cb5318b..aa9ce60 100644 --- a/tests/test_db_schema.py +++ b/tests/test_db_schema.py @@ -1,7 +1,7 @@ """test database functions""" import unittest -#import testing_utils as test +import testing_utils as test import ailab.db as db @@ -22,3 +22,14 @@ def tearDown(self): # with db.cursor(self.connection) as cursor: # cursor.execute(schema) # self.connection.rollback() + + # def test_schema_exist(self): + # """sample test to check if the schema exists""" + # with db.cursor(self.connection) as cursor: + # cursor.execute( + # "SELECT EXISTS(SELECT * FROM )", + # (test.LOUIS_SCHEMA,) + # ) + # self.connection.rollback() + # row = cursor.fetchone() + # self.assertTrue(row[0]) From b216efed15a88dbdffee2b9741f1095412e13dc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A9lanie=20F?= <73828657+melanie-fressard@users.noreply.github.com> Date: Wed, 13 Dec 2023 20:59:56 +0000 Subject: [PATCH 02/12] fixes ruff --- tests/test_db_schema.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_db_schema.py b/tests/test_db_schema.py index aa9ce60..bb9d387 100644 --- a/tests/test_db_schema.py +++ b/tests/test_db_schema.py @@ -1,7 +1,6 @@ """test database functions""" import unittest -import testing_utils as test import ailab.db as db From b13408126ff05a1bc6bf4c2ae84b5e3f2328554d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A9lanie=20F?= <73828657+melanie-fressard@users.noreply.github.com> Date: Thu, 14 Dec 2023 16:33:51 +0000 Subject: [PATCH 03/12] issue #67 - equals instead of true --- tests/test_db_crawler.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/test_db_crawler.py b/tests/test_db_crawler.py index 19ffa2e..f3fcd0a 100644 --- a/tests/test_db_crawler.py +++ b/tests/test_db_crawler.py @@ -75,8 +75,6 @@ def test_fetch_chunk_id_without_embedding(self): _entity_id = rows[0] self.connection.rollback() -# generated tests - def test_store_chunk_item(self): """Test storing a chunk item.""" with db.cursor(self.connection) as cursor: @@ -88,7 +86,7 @@ def test_store_chunk_item(self): } crawler.store_chunk_item(cursor, item) self.connection.rollback() - self.assertTrue(item["title"], "This is an example.") + self.assertEqual(item["title"], "À propos de l'ACIA - Agence canadienne d'inspection des aliments") def test_store_crawl_item(self): """Test storing a crawl item.""" @@ -103,7 +101,7 @@ def test_store_crawl_item(self): } crawler.store_crawl_item(cursor, item) self.connection.rollback() - self.assertTrue(item["url"], "https://example.com") + self.assertEqual(item["url"], "https://example.com") def test_store_embedding_item(self): """Test storing an embedding item.""" @@ -115,7 +113,7 @@ def test_store_embedding_item(self): } crawler.store_embedding_item(cursor, item) self.connection.rollback() - self.assertTrue(item["token_id"], "1") + self.assertEqual(item["token_id"], "be612259-9b52-42fd-8d0b-d72120efa3b6") # def test_fetch_crawl_ids_without_chunk(self): # """Test fetching crawl IDs without a chunk.""" @@ -138,4 +136,3 @@ def test_store_embedding_item(self): # crawler.fetch_chunk_token_row(cursor, "https://example.com") # self.connection.rollback() # self.assertFalse() - From 76299eb9b5bf1eb22f2ff629b72f15cac7ac690f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A9lanie=20F?= <73828657+melanie-fressard@users.noreply.github.com> Date: Fri, 15 Dec 2023 17:52:51 +0000 Subject: [PATCH 04/12] issue #67 - adding tests 89% coverage --- ailab/db/crawler/__init__.py | 12 ++++----- tests/test_db_crawler.py | 50 +++++++++++++----------------------- 2 files changed, 23 insertions(+), 39 deletions(-) diff --git a/ailab/db/crawler/__init__.py b/ailab/db/crawler/__init__.py index 89964cc..0c452dc 100644 --- a/ailab/db/crawler/__init__.py +++ b/ailab/db/crawler/__init__.py @@ -210,15 +210,13 @@ def fetch_crawl_row(cursor, url): assert 'html_content' in row.keys() return row -def fetch_chunk_token_row(cursor, url): +def fetch_chunk_token_row(cursor, id): """Fetch the most recent chunk token for a given chunk id.""" - data = db.parse_postgresql_url(url) + data = {'id': id} cursor.execute( - "SELECT chunk.id as chunk_id, token.id as token_id, tokens FROM chunk" - " JOIN token ON chunk.id = token.chunk_id" - " WHERE chunk.id = %(id)s LIMIT 1", + """SELECT chunk.id as chunk_id, token.id as token_id, tokens FROM chunk + JOIN token ON chunk.id = token.chunk_id + WHERE chunk.id = %(id)s LIMIT 1""", data ) - # psycopg.extras.DictRow is not a real dict and will convert - # to string as a list so we force convert to dict return cursor.fetchone() diff --git a/tests/test_db_crawler.py b/tests/test_db_crawler.py index f3fcd0a..f3d98f5 100644 --- a/tests/test_db_crawler.py +++ b/tests/test_db_crawler.py @@ -53,20 +53,6 @@ def test_fetch_crawl_row_by_postgresql_url(self): row['title'], "Sampling procedures - Canadian Food Inspection Agency") - def test_fetch_chunk_row(self): - """sample test to check if fetch_chunk_row works""" - url = db.create_postgresql_url( - "DBNAME", - "chunk", - "469812c5-190c-4e56-9f88-c8621592bcb5") - with db.cursor(self.connection) as cursor: - row = crawler.fetch_chunk_token_row(cursor, url) - self.connection.rollback() - self.assertTrue(isinstance(row, dict)) - self.assertEqual(len(row['tokens']), 76) - self.assertEqual(str(row['chunk_id']), "469812c5-190c-4e56-9f88-c8621592bcb5") - self.assertEqual(str(row['token_id']), 'dbb7b498-2cbf-4ae9-aa10-3169cc72f285') - def test_fetch_chunk_id_without_embedding(self): """sample test to check if fetch_chunk_id_without_embedding works""" with db.cursor(self.connection) as cursor: @@ -115,24 +101,24 @@ def test_store_embedding_item(self): self.connection.rollback() self.assertEqual(item["token_id"], "be612259-9b52-42fd-8d0b-d72120efa3b6") - # def test_fetch_crawl_ids_without_chunk(self): - # """Test fetching crawl IDs without a chunk.""" - # with db.cursor(self.connection) as cursor: - # id = crawler.fetch_crawl_ids_without_chunk(cursor) - # self.connection.rollback() - # self.assertTrue() + def test_fetch_crawl_ids_without_chunk(self): + """Test fetching crawl IDs without a chunk.""" + with db.cursor(self.connection) as cursor: + id = crawler.fetch_crawl_ids_without_chunk(cursor) + self.connection.rollback() + self.assertEqual(id, []) - # def test_fetch_crawl_row(self): - # """Test fetching a crawl row.""" - # with db.cursor(self.connection) as cursor: - # id = crawler.fetch_crawl_row(cursor, "https://example.com") - # self.connection.rollback() - # self.assertFalse() + def test_fetch_crawl_row(self): + """Test fetching a crawl row.""" + with db.cursor(self.connection) as cursor: + row = crawler.fetch_crawl_row(cursor, "https://inspection.canada.ca/a-propos-de-l-acia/structure-organisationnelle/mandat/fra/1299780188624/1319164463699") + self.connection.rollback() + self.assertEqual(row['title'], "Mandat - Agence canadienne d'inspection des aliments") - # def test_fetch_chunk_token_row(self): - # """Test fetching a chunk token row.""" - # with db.cursor(self.connection) as cursor: - # crawler.fetch_chunk_token_row(cursor, "https://example.com") - # self.connection.rollback() - # self.assertFalse() + def test_fetch_chunk_token_row(self): + """Test fetching a chunk token row.""" + with db.cursor(self.connection) as cursor: + row = crawler.fetch_chunk_token_row(cursor, "469812c5-190c-4e56-9f88-c8621592bcb5") + self.connection.rollback() + self.assertEqual(str(row['chunk_id']), "469812c5-190c-4e56-9f88-c8621592bcb5") From 22de5b67c099a2c2fd21470dd013bacd7c9ab17b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A9lanie=20F?= <73828657+melanie-fressard@users.noreply.github.com> Date: Fri, 15 Dec 2023 19:10:06 +0000 Subject: [PATCH 05/12] tests on errors --- tests/test_db_crawler.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_db_crawler.py b/tests/test_db_crawler.py index f3d98f5..ecd3c9d 100644 --- a/tests/test_db_crawler.py +++ b/tests/test_db_crawler.py @@ -1,6 +1,5 @@ """test database functions""" import unittest - import ailab.db as db import ailab.db.crawler as crawler import tests.testing_utils as test @@ -115,10 +114,15 @@ def test_fetch_crawl_row(self): self.connection.rollback() self.assertEqual(row['title'], "Mandat - Agence canadienne d'inspection des aliments") - def test_fetch_chunk_token_row(self): """Test fetching a chunk token row.""" with db.cursor(self.connection) as cursor: row = crawler.fetch_chunk_token_row(cursor, "469812c5-190c-4e56-9f88-c8621592bcb5") self.connection.rollback() self.assertEqual(str(row['chunk_id']), "469812c5-190c-4e56-9f88-c8621592bcb5") + + def test_fetch_crawl_row_with_invalid_url(self): + """Test fetching a crawl row with an invalid URL.""" + with db.cursor(self.connection) as cursor: + with self.assertRaises(db.DBError): + crawler.fetch_crawl_row(cursor, "invalid_url") From 902615053c22a0778bb85b64a8640b639a8a7f2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A9lanie=20F?= <73828657+melanie-fressard@users.noreply.github.com> Date: Mon, 18 Dec 2023 21:59:10 +0000 Subject: [PATCH 06/12] wip test schema --- tests/test_db_schema.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/test_db_schema.py b/tests/test_db_schema.py index bb9d387..db21904 100644 --- a/tests/test_db_schema.py +++ b/tests/test_db_schema.py @@ -1,6 +1,8 @@ """test database functions""" import unittest +import psycopg + import ailab.db as db @@ -12,6 +14,30 @@ def setUp(self): def tearDown(self): self.connection.close() + def test_crawl_exists(self): + """Test if a specific table exists in the database.""" + table_name = "crawl" + with db.cursor(self.connection) as cursor: + cursor.execute(f"""SELECT * FROM {table_name};""") + result = cursor.fetchone() + self.assertIsNotNone(result) + + def test_false_table_not_exists(self): + """Test if a specific table does not exists in the database.""" + table_name = "false_table" + with db.cursor(self.connection) as cursor: + with self.assertRaises(psycopg.errors.UndefinedTable): + cursor.execute(f"""SELECT * FROM {table_name};""") + + def test_table_has_correct_columns(self): + """Test if a specific table has the correct columns.""" + table_name = "chunk" + expected_columns = ["id", "title", "text_content"] + with db.cursor(self.connection) as cursor: + cursor.execute(f"""SELECT * FROM {table_name} LIMIT 0;""") + actual_columns = [desc[0] for desc in cursor.description] + self.assertCountEqual(actual_columns, expected_columns, f"Table {table_name} does not have the correct columns.") + # def test_schema(self): # """sample test to check if the schema is correct and idempotent""" # schema_filename = f"dumps/{test.LOUIS_SCHEMA}/schema.sql" From 0a1097d936606c8fb3600be66d06543f319c5853 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A9lanie=20F?= <73828657+melanie-fressard@users.noreply.github.com> Date: Tue, 19 Dec 2023 14:59:27 +0000 Subject: [PATCH 07/12] adding test coverage for schema --- tests/test_db_schema.py | 89 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 79 insertions(+), 10 deletions(-) diff --git a/tests/test_db_schema.py b/tests/test_db_schema.py index db21904..3813f46 100644 --- a/tests/test_db_schema.py +++ b/tests/test_db_schema.py @@ -15,28 +15,97 @@ def tearDown(self): self.connection.close() def test_crawl_exists(self): - """Test if a specific table exists in the database.""" + """Test if crawl table exists in the database and is not empty.""" table_name = "crawl" with db.cursor(self.connection) as cursor: - cursor.execute(f"""SELECT * FROM {table_name};""") + cursor.execute(f"""SELECT * FROM {table_name} LIMIT 1;""") result = cursor.fetchone() self.assertIsNotNone(result) - def test_false_table_not_exists(self): - """Test if a specific table does not exists in the database.""" - table_name = "false_table" + def test_crawl_has_correct_columns(self): + """Test if crawl table has the correct columns.""" + table_name = "crawl" + expected_columns = ["id", "url", "title", "lang", "last_crawled", + "last_updated", "last_updated_date", "md5hash"] with db.cursor(self.connection) as cursor: - with self.assertRaises(psycopg.errors.UndefinedTable): - cursor.execute(f"""SELECT * FROM {table_name};""") + cursor.execute(f"""SELECT * FROM {table_name} LIMIT 0;""") + actual_columns = [desc[0] for desc in cursor.description] + self.assertCountEqual(actual_columns, expected_columns) + + def test_chunk_exists(self): + """Test if chunk table exists in the database and is not empty.""" + table_name = "chunk" + with db.cursor(self.connection) as cursor: + cursor.execute(f"""SELECT * FROM {table_name} LIMIT 1;""") + result = cursor.fetchone() + self.assertIsNotNone(result) - def test_table_has_correct_columns(self): - """Test if a specific table has the correct columns.""" + def test_chunk_has_correct_columns(self): + """Test if chunk table has the correct columns.""" table_name = "chunk" expected_columns = ["id", "title", "text_content"] with db.cursor(self.connection) as cursor: cursor.execute(f"""SELECT * FROM {table_name} LIMIT 0;""") actual_columns = [desc[0] for desc in cursor.description] - self.assertCountEqual(actual_columns, expected_columns, f"Table {table_name} does not have the correct columns.") + self.assertEqual(actual_columns, expected_columns) + + def test_token_exists(self): + """Test if token table exists in the database and is not empty.""" + table_name = "token" + with db.cursor(self.connection) as cursor: + cursor.execute(f"""SELECT * FROM {table_name} LIMIT 1;""") + result = cursor.fetchone() + self.assertIsNotNone(result) + + def test_token_has_correct_columns(self): + """Test if token table has the correct columns.""" + table_name = "token" + expected_columns = ["id", "chunk_id", "tokens", "encoding"] + with db.cursor(self.connection) as cursor: + cursor.execute(f"""SELECT * FROM {table_name} LIMIT 0;""") + actual_columns = [desc[0] for desc in cursor.description] + self.assertEqual(actual_columns, expected_columns) + + def test_score_exists(self): + """Test if score table exists in the database and is not empty.""" + table_name = "score" + with db.cursor(self.connection) as cursor: + cursor.execute(f"""SELECT * FROM {table_name} LIMIT 1;""") + result = cursor.fetchone() + self.assertIsNotNone(result) + + def test_score_has_correct_columns(self): + """Test if score table has the correct columns.""" + table_name = "score" + expected_columns = ["entity_id", "score", "score_type"] + with db.cursor(self.connection) as cursor: + cursor.execute(f"""SELECT * FROM {table_name} LIMIT 0;""") + actual_columns = [desc[0] for desc in cursor.description] + self.assertEqual(actual_columns, expected_columns) + + def test_html_content_exists(self): + """Test if html_content table exists in the database and is not empty.""" + table_name = "html_content" + with db.cursor(self.connection) as cursor: + cursor.execute(f"""SELECT * FROM {table_name} LIMIT 1;""") + result = cursor.fetchone() + self.assertIsNotNone(result) + + def test_html_content_has_correct_columns(self): + """Test if html_content table has the correct columns.""" + table_name = "html_content" + expected_columns = ["content", "md5hash"] + with db.cursor(self.connection) as cursor: + cursor.execute(f"""SELECT * FROM {table_name} LIMIT 0;""") + actual_columns = [desc[0] for desc in cursor.description] + self.assertEqual(actual_columns, expected_columns) + + def test_false_table_not_exists(self): + """Test if false_table table does NOT exists in the database.""" + table_name = "false_table" + with db.cursor(self.connection) as cursor: + with self.assertRaises(psycopg.errors.UndefinedTable): + cursor.execute(f"""SELECT * FROM {table_name} LIMIT 1;""") # def test_schema(self): # """sample test to check if the schema is correct and idempotent""" From 75e69526c6ab95ed2472b52965b0d416553b400a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A9lanie=20F?= <73828657+melanie-fressard@users.noreply.github.com> Date: Tue, 19 Dec 2023 18:52:07 +0000 Subject: [PATCH 08/12] issue #67 - dividing store_chunk_item into tiner func --- ailab/db/crawler/__init__.py | 114 ++++++++++++++++++----------------- 1 file changed, 60 insertions(+), 54 deletions(-) diff --git a/ailab/db/crawler/__init__.py b/ailab/db/crawler/__init__.py index 0c452dc..a8f7073 100644 --- a/ailab/db/crawler/__init__.py +++ b/ailab/db/crawler/__init__.py @@ -45,6 +45,62 @@ def fetch_links(cursor, url): data['destination_urls'] = [r['url'] for r in cursor.fetchall()] return data['destination_urls'] +def get_md5hash(cursor, data): + cursor.execute( + """SELECT md5hash FROM crawl WHERE url = %(url)s + ORDER BY last_updated DESC LIMIT 1""", + data + ) + return cursor.fetchone()['md5hash'] + +def get_chunk_id(cursor, data): + cursor.execute( + """ + WITH e as( + INSERT INTO chunk (title, text_content) + VALUES(%(title)s, %(text_content)s) + ON CONFLICT DO NOTHING + RETURNING id + ) + SELECT id FROM e + UNION ALL + SELECT id FROM chunk WHERE text_content = %(text_content)s + """, + data + ) + row = cursor.fetchone() + return row['id'] if row is not None else None + +def insert_html_content_to_chunk(cursor, data): + cursor.execute( + """ + INSERT INTO html_content_to_chunk (md5hash, chunk_id) + VALUES(%(md5hash)s, %(chunk_id)s::UUID) + ON CONFLICT DO NOTHING + """, + data) + +def get_token_id(cursor, data): + cursor.execute( + """ + WITH e as( + INSERT INTO token (chunk_id, tokens, encoding) + VALUES (%(chunk_id)s::UUID, %(tokens)s, %(encoding)s) + ON CONFLICT DO NOTHING + RETURNING * + ) + SELECT id FROM e + UNION ALL + SELECT id FROM token + WHERE chunk_id = %(chunk_id)s::UUID + and tokens = %(tokens)s::INTEGER[] + and encoding = %(encoding)s + """, + data + ) + res = cursor.fetchone() + return res['id'] if res is not None else None + def store_chunk_item(cursor, item): """Process a ChunkItem and insert it into the database.""" try: @@ -55,64 +111,14 @@ def store_chunk_item(cursor, item): 'tokens': item["tokens"], 'encoding': 'cl100k_base' } - cursor.execute( - """SELECT md5hash FROM crawl WHERE url = %(url)s - ORDER BY last_updated DESC LIMIT 1""", - data - ) - data['md5hash'] = cursor.fetchone()['md5hash'] - - # TODO: should probably update the title even if the text_content - # is already present as we may have changed how we create the title - cursor.execute( - """ - WITH e as( - INSERT INTO chunk (title, text_content) - VALUES(%(title)s, %(text_content)s) - ON CONFLICT DO NOTHING - RETURNING id - ) - SELECT id FROM e - UNION ALL - SELECT id FROM chunk WHERE text_content = %(text_content)s - """, - data - ) - row = cursor.fetchone() - if row is not None: - data['chunk_id'] = row['id'] - cursor.execute( - """ - INSERT INTO html_content_to_chunk (md5hash, chunk_id) - VALUES(%(md5hash)s, %(chunk_id)s::UUID) - ON CONFLICT DO NOTHING - """, - data) - cursor.execute( - """ - WITH e as( - INSERT INTO token (chunk_id, tokens, encoding) - VALUES (%(chunk_id)s::UUID, %(tokens)s, %(encoding)s) - ON CONFLICT DO NOTHING - RETURNING * - ) - SELECT id FROM e - UNION ALL - SELECT id FROM token - WHERE chunk_id = %(chunk_id)s::UUID - and tokens = %(tokens)s::INTEGER[] - and encoding = %(encoding)s - """, - data - ) - res = cursor.fetchone() - if res is not None: - data['token_id'] = res['id'] + data['md5hash'] = get_md5hash(cursor, data) + data['chunk_id'] = get_chunk_id(cursor, data) + insert_html_content_to_chunk(cursor, data) + data['token_id'] = get_token_id(cursor, data) return data except psycopg.IntegrityError as e: raise db.DBError("Error storing chunk item for %s" % item['url']) from e - def store_crawl_item(cursor, item): """Process a CrawlItem and insert it into the database.""" try: From ed5312385f0e8f1f509dd5d03dfc4e4682cb82f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A9lanie=20F?= <73828657+melanie-fressard@users.noreply.github.com> Date: Tue, 19 Dec 2023 20:22:09 +0000 Subject: [PATCH 09/12] correcting some assertions --- ailab/db/crawler/__init__.py | 6 ++++-- tests/test_db_crawler.py | 13 +++++++------ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/ailab/db/crawler/__init__.py b/ailab/db/crawler/__init__.py index a8f7073..e3da966 100644 --- a/ailab/db/crawler/__init__.py +++ b/ailab/db/crawler/__init__.py @@ -137,7 +137,9 @@ def store_crawl_item(cursor, item): """, item ) - return item + cursor.execute("""SELECT * FROM crawl + WHERE url = %(url)s ORDER BY last_updated DESC LIMIT 1""", item) + return cursor.fetchone() except psycopg.IntegrityError as e: raise db.DBError("Error storing crawl item for %s" % item['url']) from e @@ -162,7 +164,7 @@ def store_embedding_item(cursor, item): query, data ) - return item + return data['token_id'] except psycopg.IntegrityError as e: raise db.DBError( "Error storing embedding item for token %s" % item['token_id']) from e diff --git a/tests/test_db_crawler.py b/tests/test_db_crawler.py index ecd3c9d..d8cf476 100644 --- a/tests/test_db_crawler.py +++ b/tests/test_db_crawler.py @@ -69,9 +69,9 @@ def test_store_chunk_item(self): "text_content": "This is an example content.", "tokens": [73053,10045,409,326,6,1741,5987,2998,37622,934,6,8629,44618,409,38682,1001,367,3869,348,2328,7330,52760,11,326,6,1741,5987,264,653,348,5642,11837,266,7930,2995,64097,1208,4371,392,1018,978,38450,12267,11,1208,77323,951,4039,12249,11,1208,9313,951,348,19395,978,2629,2249,1880,326,69537,12416,8065,84751,6625,13,17360,535,89,551,5690,6405,13674,33867,14318,3765,91080,1370,2126,22811,11876,459,5979,729,3539,5512,409,326,6,1741,5987,56311,39929,64079,3869,951,90108,35933,46680,969,645,551,2009,85182,40280,3930,7008,90108,1082,3625,459,5979,729,3539,5512,5019,3625,4824,76,1154,25540,5512,1370,514,72601,409,2343,68,2405,10610,953,13,2998,62163,42145,40948,5512,294,6,97675,4149,3462,16848,85046,1880,83229,70,91555,11683,12416,3869,326,6,26125,1880,1208,9313,951,5790,325,625,3808,1732,36527,3459,360,17724,409,326,6,1741,5987,22555,951,24261,288,1880,951,917,2053,3700,34965,11,93084,1880,3057,65,811,57967,1220,294,26248,1088,1759,409,1208,6377,30052,9359,10333,5392,788,95188,4949,11,2126,22811,11,1008,44357,11,95995,409,3729,8471,1880,5790,325,625,3808,65381,10045,409,17317,24789,266,11,9131,11,11376,11,4046,6414,51084,951,97035,13,8245,22139,64829,29696,409,11692,1880,409,85182,77,685,328,5164,409,80080,423,944,59307,80080,11,17889,1354,5860,24985,3946,11,62163,409,5790,325,625,3808,951,35030,3557,1880,3930,586,13,51097,4972,35933,44564,20392,3869,326,58591,73511,7769,3136,13,3744,268,2850,1900,5856,288,9952,3625,447,2053,17317,58673,484,2439,5019,65827,268,404,1880,5201,261,3625,7617,288,409,77463,38450,12267,1880,1097,73511,15171,1208,9313,1880,1208,6225,31617,8082,951,1615,316,18543,1759,13,5856,288,8666,266,22589,1219,13109,8666,50848,294,6,4683,15916,11,1219,13109,8666,1413,3930,49904,265,11,43252,89781,288,1765,3625,13826,25108,4978,409,51304,13,29124,6414,51084,951,97222,1880,951,3600,45629,288,5019,38682,404,15907,22639,9952,3625,2027,38647,11,3625,1615,316,18543,1759,11,326,6,485,592,7379,1880,3625,46106,31957,1821,13,20915,21066,409,1208,26965,13109,44564,3057,1557,409,1208,26965,13109,38682,1001,12267,13,2998,2842,40280,1880,82620,27220,18042,283,35573,514,82620,27220,1880,326,91655,11323,266,2428,13,2998,29033,6672,51097,737,2727,392,645,432,1137,625,3808,1765,3625,737,2727,392,645,1880,46106,21744,10515,5512,409,326,6,1741,5987,13] } - crawler.store_chunk_item(cursor, item) + stored_item = crawler.store_chunk_item(cursor, item) self.connection.rollback() - self.assertEqual(item["title"], "À propos de l'ACIA - Agence canadienne d'inspection des aliments") + self.assertEqual(stored_item["url"], item["url"]) def test_store_crawl_item(self): """Test storing a crawl item.""" @@ -84,9 +84,10 @@ def test_store_crawl_item(self): "last_crawled": "2022-01-01", "last_updated": "2022-01-01" } - crawler.store_crawl_item(cursor, item) + stored_item = crawler.store_crawl_item(cursor, item) self.connection.rollback() - self.assertEqual(item["url"], "https://example.com") + self.assertEqual(item["title"], stored_item["title"]) + self.assertEqual(item["url"], stored_item["url"]) def test_store_embedding_item(self): """Test storing an embedding item.""" @@ -96,9 +97,9 @@ def test_store_embedding_item(self): "embedding": [1,409,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,73053,10045,409,326,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,73053,10045,409,326,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,73053,10045,409,326,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,73053,10045,409,326,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,5987,2998,37622,934,6,8629,44618,409,38682,1001,367,3869,348,2328,7330,52760,11,326,6,1741,5987,264,653,348,5642,11837,266,7930,2995,64097,1208,4371,392,1018,978,38450,12267,11,1208,77323,951,4039,12249,11,1208,9313,951,348,19395,978,2629,2249,1880,326,69537,12416,8065,84751,6625,13,17360,535,89,551,5690,6405,13674,33867,14318,3765,91080,1370,2126,22811,11876,459,5979,729,3539,5512,409,326,6,1741,5987,56311,39929,64079,3869,951,90108,35933,46680,969,645,551,2009,85182,40280,3930,7008,90108,1082,3625,459,5979,729,3539,5512,5019,3625,4824,76,1154,25540,5512,1370,514,72601,409,2343,68,2405,10610,953,13,2998,62163,42145,40948,5512,294,6,97675,4149,3462,16848,85046,1880,83229,70,91555,11683,12416,3869,326,6,26125,1880,1208,9313,951,5790,325,625,3808,1732,36527,3459,360,17724,409,326,6,1741,5987,22555,951,24261,288,1880,951,917,2053,3700,34965,11,93084,1880,3057,65,811,57967,1220,294,26248,1088,1759,409,1208,6377,30052,9359,10333,5392,788,95188,4949,11,2126,22811,11,1008,44357,11,95995,409,3729,8471,1880,5790,325,625,3808,65381,10045,409,17317,24789,266,11,9131,11,11376,11,4046,6414,51084,951,97035,13,8245,22139,64829,29696,409,11692,1880,409,85182,77,685,328,5164,409,80080,423,944,59307,80080,11,17889,1354,5860,24985,3946,11,62163,409,5790,325,625,3808,951,35030,3557,1880,3930,586,13,51097,4972,35933,44564,20392,3869,326,58591,73511,7769,3136,13,3744,268,2850,1900,5856,288,9952,3625,447,2053,17317,58673,484,2439,5019,65827,268,404,1880,5201,261,3625,7617,288,409,77463,38450,12267,1880,1097,73511,15171,1208,9313,1880,1208,6225,31617,8082,951,1615,316,18543,1759,13,5856,288,8666,266,22589,1219,13109,8666,50848,294,6,4683,15916,11,1219,13109,8666,1413,3930,49904,265,11,43252,89781,288,1765,3625,13826,25108,4978,409,51304,13,29124,6414,51084,951,97222,1880,951,3600,45629,288,5019,38682,404,15907,22639,9952,3625,2027,38647,11,3625,1615,316,18543,1759,11,326,6,485,592,7379,1880,3625,46106,31957,1821,13,20915,21066,409,1208,26965,13109,44564,3057,1557,409,1208,26965,13109,38682,1001,12267,13,2998,2842,40280,1880,82620,27220,18042,283,35573,514,82620,27220,1880,326,91655,11323,266,2428,13,2998,29033,6672,51097,737,2727,392,645,432,1137,625,3808,1765,3625,737,2727,392,645,1880,46106,21744,10515,5512,409,326,6,1741,5987,13,73053,10045,409,326,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,5987,2998,37622,934,6,8629,44618,409,38682,1001,367,3869,348,2328,7330,52760,11,326,6,1741,5987,264,653,348,5642,11837,266,7930,2995,64097,1208,4371,392,1018,978,38450,12267,11,1208,77323,951,4039,12249,11,1208,9313,951,348,19395,978,2629,2249,1880,326,69537,12416,8065,84751,6625,13,17360,535,89,551,5690,6405,13674,33867,14318,3765,91080,1370,2126,22811,11876,459,5979,729,3539,5512,409,326,6,1741,5987,56311,39929,64079,3869,951,90108,35933,46680,969,645,551,2009,85182,40280,3930,7008,90108,1082,3625,459,5979,729,3539,5512,5019,3625,4824,76,1154,25540,5512,1370,514,72601,409,2343,68,2405,10610,953,13,2998,62163,42145,40948,5512,294,6,97675,4149,3462,16848,85046,1880,83229,70,91555,11683,12416,3869,326,6,26125,1880,1208,9313,951,5790,325,625,3808,1732,36527,3459,360,17724,409,326,6,1741,5987,22555,951,24261,288,1880,951,917,2053,3700,34965,11,93084,1880,3057,65,811,57967,1220,294,26248,1088,1759,409,1208,6377,30052,9359,10333,5392,788,95188,4949,11,2126,22811,11,1008,44357,11,95995,409,3729,8471,1880,5790,325,625,3808,65381,10045,409,17317,24789,266,11,9131,11,11376,11,4046,6414,51084,951,97035,13,8245,22139,64829,29696,409,11692,1880,409,85182,77,685,328,5164,409,80080,423,944,59307,80080,11,17889,1354,5860,24985,3946,11,62163,409,5790,325,625,3808,951,35030,3557,1880,3930,586,13,51097,4972,35933,44564,20392,3869,326,58591,73511,7769,3136,13,3744,268,2850,1900,5856,288,9952,3625,447,2053,17317,58673,484,2439,5019,65827,268,404,1880,5201,261,3625,7617,288,409,77463,38450,12267,1880,1097,73511,15171,1208,9313,1880,1208,6225,31617,8082,951,1615,316,18543,1759,13,5856,288,8666,266,22589,1219,13109,8666,50848,294,6,4683,15916,11,1219,13109,8666,1413,3930,49904,265,11,43252,89781,288,1765,3625,13826,25108,4978,409,51304,13,29124,6414,51084,951,97222,1880,951,3600,45629,288,5019,38682,404,15907,22639,9952,3625,2027,38647,11,3625,1615,316,18543,1759,11,326,6,485,592,7379,1880,3625,46106,31957,1821,13,20915,21066,409,1208,26965,13109,44564,3057,1557,409,1208,26965,13109,38682,1001,12267,13,2998,2842,40280,1880,82620,27220,18042,283,35573,514,82620,27220,1880,326,91655,11323,266,2428,13,2998,29033,6672,51097,737,2727,392,645,432,1137,625,3808,1765,3625,737,2727,392,645,1880,46106,21744,10515,5512,409,326,6,1741,5987,13, 73053,10045,409,326,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,5987,2998,37622,934,6,8629,44618,409,38682,1001,367,3869,348,2328,7330,52760,11,326,6,1741,5987,264,653,348,5642,11837,266,7930,2995,64097,1208,4371,392,1018,978,38450,12267,11,1208,77323,951,4039,12249,11,1208,9313,951,348,19395,978,2629,2249,1880,326,69537,12416,8065,84751,6625,13,17360,535,89,551,5690,6405,13674,33867,14318,3765,91080,1370,2126,22811,11876,459,5979,729,3539,5512,409,326,6,1741,5987,56311,39929,64079,3869,951,90108,35933,46680,969,645,551,2009,85182,40280,3930,7008,90108,1082,3625,459,5979,729,3539,5512,5019,3625,4824,76,1154,25540,5512,1370,514,72601,409,2343,68,2405,10610,953,13,2998,62163,42145,40948,5512,294,6,97675,4149,3462,16848,85046,1880,83229,70,91555,11683,12416,3869,326,6,26125,1880,1208,9313,951,5790,325,625,3808,1732,36527,3459,360,17724,409,326,6,1741,5987,22555,951,24261,288,1880,951,917,2053,3700,34965,11,93084,1880,3057,65,811,57967,1220,294,26248,1088,1759,409,1208,6377,30052,9359,10333,5392,788,95188,4949,11,2126,22811,11,1008,44357,11,95995,409,3729,8471,1880,5790,325,625,3808,65381,10045,409,17317,24789,266,11,9131,11,11376,11,4046,6414,51084,951,97035,13,8245,22139,64829,29696,409,11692,1880,409,85182,77,685,328,5164,409,80080,423,944,59307,80080,11,17889,1354,5860,24985,3946,11,62163,409,5790,325,625,3808,951,35030,3557,1880,3930,586,13,51097,4972,35933,44564,20392,3869,326,58591,73511,7769,3136,13,3744,268,2850,1900,5856,288,9952,3625,447,2053,17317,58673,484,2439,5019,65827,268,404,1880,5201,261,3625,7617,288,409,77463,38450,12267,1880,1097,73511,15171,1208,9313,1880,1208,6225,31617,8082,951,1615,316,18543,1759,13,5856,288,8666,266,22589,1219,13109,8666,50848,294,6,4683,15916,11,1219,13109,8666,1413,3930,49904,265,11,43252,89781,288,1765,3625,13826,25108,4978,409,51304,13,29124,6414,51084,951,97222,1880,951,3600,45629,288,5019,38682,404,15907,22639,9952,3625,2027,38647,11,3625,1615,316,18543,1759,11,326,6,485,592,7379,1880,3625,46106,31957,1821,13,20915,21066,409,1208,26965,13109,44564,3057,1557,409,1208,26965,13109,38682,1001,12267,13,2998,2842,40280,1880,82620,27220,18042,283,35573,514,82620,27220,1880,326,91655,11323,266,2428,13,2998,29033,6672,51097,737,2727,392,645,432,1137,625,3808,1765,3625,737,2727,392,645,1880,46106,21744,10515,5512,409,326,6,1741,5987,13], "embedding_model": "test-model" } - crawler.store_embedding_item(cursor, item) + stored_item = crawler.store_embedding_item(cursor, item) self.connection.rollback() - self.assertEqual(item["token_id"], "be612259-9b52-42fd-8d0b-d72120efa3b6") + self.assertEqual(item["token_id"], stored_item) def test_fetch_crawl_ids_without_chunk(self): """Test fetching crawl IDs without a chunk.""" From bd527a5497ca70144565af1366ef7179f1e11e1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A9lanie=20F?= <73828657+melanie-fressard@users.noreply.github.com> Date: Wed, 20 Dec 2023 21:32:31 +0000 Subject: [PATCH 10/12] generating embeddings --- tests/test_db_crawler.py | 2 +- tests/testing_utils.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/tests/test_db_crawler.py b/tests/test_db_crawler.py index d8cf476..b48ff26 100644 --- a/tests/test_db_crawler.py +++ b/tests/test_db_crawler.py @@ -94,7 +94,7 @@ def test_store_embedding_item(self): with db.cursor(self.connection) as cursor: item = { "token_id": "be612259-9b52-42fd-8d0b-d72120efa3b6", - "embedding": [1,409,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,73053,10045,409,326,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,73053,10045,409,326,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,73053,10045,409,326,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,73053,10045,409,326,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,5987,2998,37622,934,6,8629,44618,409,38682,1001,367,3869,348,2328,7330,52760,11,326,6,1741,5987,264,653,348,5642,11837,266,7930,2995,64097,1208,4371,392,1018,978,38450,12267,11,1208,77323,951,4039,12249,11,1208,9313,951,348,19395,978,2629,2249,1880,326,69537,12416,8065,84751,6625,13,17360,535,89,551,5690,6405,13674,33867,14318,3765,91080,1370,2126,22811,11876,459,5979,729,3539,5512,409,326,6,1741,5987,56311,39929,64079,3869,951,90108,35933,46680,969,645,551,2009,85182,40280,3930,7008,90108,1082,3625,459,5979,729,3539,5512,5019,3625,4824,76,1154,25540,5512,1370,514,72601,409,2343,68,2405,10610,953,13,2998,62163,42145,40948,5512,294,6,97675,4149,3462,16848,85046,1880,83229,70,91555,11683,12416,3869,326,6,26125,1880,1208,9313,951,5790,325,625,3808,1732,36527,3459,360,17724,409,326,6,1741,5987,22555,951,24261,288,1880,951,917,2053,3700,34965,11,93084,1880,3057,65,811,57967,1220,294,26248,1088,1759,409,1208,6377,30052,9359,10333,5392,788,95188,4949,11,2126,22811,11,1008,44357,11,95995,409,3729,8471,1880,5790,325,625,3808,65381,10045,409,17317,24789,266,11,9131,11,11376,11,4046,6414,51084,951,97035,13,8245,22139,64829,29696,409,11692,1880,409,85182,77,685,328,5164,409,80080,423,944,59307,80080,11,17889,1354,5860,24985,3946,11,62163,409,5790,325,625,3808,951,35030,3557,1880,3930,586,13,51097,4972,35933,44564,20392,3869,326,58591,73511,7769,3136,13,3744,268,2850,1900,5856,288,9952,3625,447,2053,17317,58673,484,2439,5019,65827,268,404,1880,5201,261,3625,7617,288,409,77463,38450,12267,1880,1097,73511,15171,1208,9313,1880,1208,6225,31617,8082,951,1615,316,18543,1759,13,5856,288,8666,266,22589,1219,13109,8666,50848,294,6,4683,15916,11,1219,13109,8666,1413,3930,49904,265,11,43252,89781,288,1765,3625,13826,25108,4978,409,51304,13,29124,6414,51084,951,97222,1880,951,3600,45629,288,5019,38682,404,15907,22639,9952,3625,2027,38647,11,3625,1615,316,18543,1759,11,326,6,485,592,7379,1880,3625,46106,31957,1821,13,20915,21066,409,1208,26965,13109,44564,3057,1557,409,1208,26965,13109,38682,1001,12267,13,2998,2842,40280,1880,82620,27220,18042,283,35573,514,82620,27220,1880,326,91655,11323,266,2428,13,2998,29033,6672,51097,737,2727,392,645,432,1137,625,3808,1765,3625,737,2727,392,645,1880,46106,21744,10515,5512,409,326,6,1741,5987,13,73053,10045,409,326,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,5987,2998,37622,934,6,8629,44618,409,38682,1001,367,3869,348,2328,7330,52760,11,326,6,1741,5987,264,653,348,5642,11837,266,7930,2995,64097,1208,4371,392,1018,978,38450,12267,11,1208,77323,951,4039,12249,11,1208,9313,951,348,19395,978,2629,2249,1880,326,69537,12416,8065,84751,6625,13,17360,535,89,551,5690,6405,13674,33867,14318,3765,91080,1370,2126,22811,11876,459,5979,729,3539,5512,409,326,6,1741,5987,56311,39929,64079,3869,951,90108,35933,46680,969,645,551,2009,85182,40280,3930,7008,90108,1082,3625,459,5979,729,3539,5512,5019,3625,4824,76,1154,25540,5512,1370,514,72601,409,2343,68,2405,10610,953,13,2998,62163,42145,40948,5512,294,6,97675,4149,3462,16848,85046,1880,83229,70,91555,11683,12416,3869,326,6,26125,1880,1208,9313,951,5790,325,625,3808,1732,36527,3459,360,17724,409,326,6,1741,5987,22555,951,24261,288,1880,951,917,2053,3700,34965,11,93084,1880,3057,65,811,57967,1220,294,26248,1088,1759,409,1208,6377,30052,9359,10333,5392,788,95188,4949,11,2126,22811,11,1008,44357,11,95995,409,3729,8471,1880,5790,325,625,3808,65381,10045,409,17317,24789,266,11,9131,11,11376,11,4046,6414,51084,951,97035,13,8245,22139,64829,29696,409,11692,1880,409,85182,77,685,328,5164,409,80080,423,944,59307,80080,11,17889,1354,5860,24985,3946,11,62163,409,5790,325,625,3808,951,35030,3557,1880,3930,586,13,51097,4972,35933,44564,20392,3869,326,58591,73511,7769,3136,13,3744,268,2850,1900,5856,288,9952,3625,447,2053,17317,58673,484,2439,5019,65827,268,404,1880,5201,261,3625,7617,288,409,77463,38450,12267,1880,1097,73511,15171,1208,9313,1880,1208,6225,31617,8082,951,1615,316,18543,1759,13,5856,288,8666,266,22589,1219,13109,8666,50848,294,6,4683,15916,11,1219,13109,8666,1413,3930,49904,265,11,43252,89781,288,1765,3625,13826,25108,4978,409,51304,13,29124,6414,51084,951,97222,1880,951,3600,45629,288,5019,38682,404,15907,22639,9952,3625,2027,38647,11,3625,1615,316,18543,1759,11,326,6,485,592,7379,1880,3625,46106,31957,1821,13,20915,21066,409,1208,26965,13109,44564,3057,1557,409,1208,26965,13109,38682,1001,12267,13,2998,2842,40280,1880,82620,27220,18042,283,35573,514,82620,27220,1880,326,91655,11323,266,2428,13,2998,29033,6672,51097,737,2727,392,645,432,1137,625,3808,1765,3625,737,2727,392,645,1880,46106,21744,10515,5512,409,326,6,1741,5987,13, 73053,10045,409,326,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,5987,2998,37622,934,6,73053,10045,409,326,6,1741,5987,2998,37622,934,6,8629,44618,409,38682,1001,367,3869,348,2328,7330,52760,11,326,6,1741,5987,264,653,348,5642,11837,266,7930,2995,64097,1208,4371,392,1018,978,38450,12267,11,1208,77323,951,4039,12249,11,1208,9313,951,348,19395,978,2629,2249,1880,326,69537,12416,8065,84751,6625,13,17360,535,89,551,5690,6405,13674,33867,14318,3765,91080,1370,2126,22811,11876,459,5979,729,3539,5512,409,326,6,1741,5987,56311,39929,64079,3869,951,90108,35933,46680,969,645,551,2009,85182,40280,3930,7008,90108,1082,3625,459,5979,729,3539,5512,5019,3625,4824,76,1154,25540,5512,1370,514,72601,409,2343,68,2405,10610,953,13,2998,62163,42145,40948,5512,294,6,97675,4149,3462,16848,85046,1880,83229,70,91555,11683,12416,3869,326,6,26125,1880,1208,9313,951,5790,325,625,3808,1732,36527,3459,360,17724,409,326,6,1741,5987,22555,951,24261,288,1880,951,917,2053,3700,34965,11,93084,1880,3057,65,811,57967,1220,294,26248,1088,1759,409,1208,6377,30052,9359,10333,5392,788,95188,4949,11,2126,22811,11,1008,44357,11,95995,409,3729,8471,1880,5790,325,625,3808,65381,10045,409,17317,24789,266,11,9131,11,11376,11,4046,6414,51084,951,97035,13,8245,22139,64829,29696,409,11692,1880,409,85182,77,685,328,5164,409,80080,423,944,59307,80080,11,17889,1354,5860,24985,3946,11,62163,409,5790,325,625,3808,951,35030,3557,1880,3930,586,13,51097,4972,35933,44564,20392,3869,326,58591,73511,7769,3136,13,3744,268,2850,1900,5856,288,9952,3625,447,2053,17317,58673,484,2439,5019,65827,268,404,1880,5201,261,3625,7617,288,409,77463,38450,12267,1880,1097,73511,15171,1208,9313,1880,1208,6225,31617,8082,951,1615,316,18543,1759,13,5856,288,8666,266,22589,1219,13109,8666,50848,294,6,4683,15916,11,1219,13109,8666,1413,3930,49904,265,11,43252,89781,288,1765,3625,13826,25108,4978,409,51304,13,29124,6414,51084,951,97222,1880,951,3600,45629,288,5019,38682,404,15907,22639,9952,3625,2027,38647,11,3625,1615,316,18543,1759,11,326,6,485,592,7379,1880,3625,46106,31957,1821,13,20915,21066,409,1208,26965,13109,44564,3057,1557,409,1208,26965,13109,38682,1001,12267,13,2998,2842,40280,1880,82620,27220,18042,283,35573,514,82620,27220,1880,326,91655,11323,266,2428,13,2998,29033,6672,51097,737,2727,392,645,432,1137,625,3808,1765,3625,737,2727,392,645,1880,46106,21744,10515,5512,409,326,6,1741,5987,13], + "embedding": test.generate_random_embedding(1536), "embedding_model": "test-model" } stored_item = crawler.store_embedding_item(cursor, item) diff --git a/tests/testing_utils.py b/tests/testing_utils.py index 9e560cb..49ff82a 100644 --- a/tests/testing_utils.py +++ b/tests/testing_utils.py @@ -1,4 +1,6 @@ import os +import random +import uuid import dotenv dotenv.load_dotenv() @@ -19,3 +21,15 @@ def raise_error(message): unique(token_id) ); """ + +# Generate a random UUID +test_uuid = uuid.uuid4() +test_item = { + "id": test_uuid, + "title": "Title exemple", + "text_content": "This is an example content.", + } + + +def generate_random_embedding(dimensions=100): + return [random.uniform(0, 100000) for _ in range(dimensions)] From be6b2497756b7a97695d90584f68dcf00a578d55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A9lanie=20F?= <73828657+melanie-fressard@users.noreply.github.com> Date: Wed, 20 Dec 2023 21:42:29 +0000 Subject: [PATCH 11/12] checking for none --- ailab/db/crawler/__init__.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/ailab/db/crawler/__init__.py b/ailab/db/crawler/__init__.py index e3da966..1438890 100644 --- a/ailab/db/crawler/__init__.py +++ b/ailab/db/crawler/__init__.py @@ -111,10 +111,20 @@ def store_chunk_item(cursor, item): 'tokens': item["tokens"], 'encoding': 'cl100k_base' } - data['md5hash'] = get_md5hash(cursor, data) - data['chunk_id'] = get_chunk_id(cursor, data) + new_md5hash = get_md5hash(cursor, data) + if new_md5hash is not None: + data['md5hash'] = new_md5hash + + new_chunk_id = get_chunk_id(cursor, data) + if new_chunk_id is not None: + data['chunk_id'] = new_chunk_id + insert_html_content_to_chunk(cursor, data) - data['token_id'] = get_token_id(cursor, data) + + new_token_id = get_token_id(cursor, data) + if new_token_id is not None: + data['token_id'] = new_token_id + return data except psycopg.IntegrityError as e: raise db.DBError("Error storing chunk item for %s" % item['url']) from e From 21f03f9f34b55e1fc55d4951748582f3bac19a1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A9lanie=20F?= <73828657+melanie-fressard@users.noreply.github.com> Date: Thu, 21 Dec 2023 20:09:27 +0000 Subject: [PATCH 12/12] issue #67 - trying to add test values --- tests/test_db_crawler.py | 19 +++++++++++++++++++ tests/testing_utils.py | 2 ++ 2 files changed, 21 insertions(+) diff --git a/tests/test_db_crawler.py b/tests/test_db_crawler.py index b48ff26..dcf18fc 100644 --- a/tests/test_db_crawler.py +++ b/tests/test_db_crawler.py @@ -114,6 +114,25 @@ def test_fetch_crawl_row(self): row = crawler.fetch_crawl_row(cursor, "https://inspection.canada.ca/a-propos-de-l-acia/structure-organisationnelle/mandat/fra/1299780188624/1319164463699") self.connection.rollback() self.assertEqual(row['title'], "Mandat - Agence canadienne d'inspection des aliments") + + # def test_fetch_crawl_row_with_test_data(self): + # """Test fetching a crawl row.""" + # with db.cursor(self.connection) as cursor: + # test_chunk_id = test.test_uuid + # test_crawl_id = test.test_uuid + # test_md5hash = test.test_hash + + + # cursor.execute(f""" + # INSERT INTO html_content VALUES ('Test Content', '{test_md5hash}'); + # INSERT INTO crawl (id, url, title, lang, last_crawled, last_updated, last_updated_date, md5hash) + # VALUES ('{test_chunk_id}', 'http://example.com', 'Test Title', 'en', NOW(), NOW(), NOW(), '{test_md5hash}'); + # INSERT INTO html_content_to_chunk VALUES ('{test_crawl_id}', '{test_md5hash}'); + # """ + # ) + # row = crawler.fetch_crawl_row(cursor, "http://example.com") + # self.connection.rollback() + # self.assertEqual(row['title'], "Test Title") def test_fetch_chunk_token_row(self): """Test fetching a chunk token row.""" diff --git a/tests/testing_utils.py b/tests/testing_utils.py index 49ff82a..2348db2 100644 --- a/tests/testing_utils.py +++ b/tests/testing_utils.py @@ -1,6 +1,7 @@ import os import random import uuid +import hashlib import dotenv dotenv.load_dotenv() @@ -30,6 +31,7 @@ def raise_error(message): "text_content": "This is an example content.", } +test_hash = hashlib.md5("test".encode()).hexdigest()[:31] def generate_random_embedding(dimensions=100): return [random.uniform(0, 100000) for _ in range(dimensions)]