Fixes #67: Adding test coverage for louis-crawler #69

Draft · wants to merge 13 commits into main
138 changes: 79 additions & 59 deletions ailab/db/crawler/__init__.py
@@ -45,6 +45,62 @@ def fetch_links(cursor, url):
data['destination_urls'] = [r['url'] for r in cursor.fetchall()]
return data['destination_urls']

def get_md5hash(cursor, data):
"""Return the most recent md5hash for the given URL, or None if the URL has not been crawled."""
cursor.execute(
"""SELECT md5hash FROM crawl WHERE url = %(url)s
ORDER BY last_updated DESC LIMIT 1""",
data
)
row = cursor.fetchone()
return row['md5hash'] if row is not None else None

def get_chunk_id(cursor, data):
"""Insert the chunk if it is new and return its id, falling back to the existing chunk with the same text_content."""
cursor.execute(
"""
WITH e as(
INSERT INTO chunk (title, text_content)
VALUES(%(title)s, %(text_content)s)
ON CONFLICT DO NOTHING
RETURNING id
)
SELECT id FROM e
UNION ALL
SELECT id FROM chunk WHERE text_content = %(text_content)s
""",
data
)
row = cursor.fetchone()
return row['id'] if row is not None else None

def insert_html_content_to_chunk(cursor, data):
"""Link an html_content row (by md5hash) to a chunk; duplicate links are ignored."""
cursor.execute(
"""
INSERT INTO html_content_to_chunk (md5hash, chunk_id)
VALUES(%(md5hash)s, %(chunk_id)s::UUID)
ON CONFLICT DO NOTHING
""",
data)

def get_token_id(cursor, data):
"""Insert the token row if it is new and return its id, falling back to the existing matching token."""
cursor.execute(
"""
WITH e as(
INSERT INTO token (chunk_id, tokens, encoding)
VALUES (%(chunk_id)s::UUID, %(tokens)s, %(encoding)s)
ON CONFLICT DO NOTHING
RETURNING *
)
SELECT id FROM e
UNION ALL
SELECT id FROM token
WHERE chunk_id = %(chunk_id)s::UUID
and tokens = %(tokens)s::INTEGER[]
and encoding = %(encoding)s
""",
data
)
res = cursor.fetchone()
return res['id'] if res is not None else None

def store_chunk_item(cursor, item):
"""Process a ChunkItem and insert it into the database."""
try:
@@ -55,60 +55,24 @@ def store_chunk_item(cursor, item):
'tokens': item["tokens"],
'encoding': 'cl100k_base'
}
cursor.execute(
"""SELECT md5hash FROM crawl WHERE url = %(url)s
ORDER BY last_updated DESC LIMIT 1""",
data
)
data['md5hash'] = cursor.fetchone()['md5hash']
new_md5hash = get_md5hash(cursor, data)
if new_md5hash is not None:
data['md5hash'] = new_md5hash

new_chunk_id = get_chunk_id(cursor, data)
if new_chunk_id is not None:
data['chunk_id'] = new_chunk_id

insert_html_content_to_chunk(cursor, data)

# TODO: should probably update the title even if the text_content
# is already present as we may have changed how we create the title
cursor.execute(
"""
WITH e as(
INSERT INTO chunk (title, text_content)
VALUES(%(title)s, %(text_content)s)
ON CONFLICT DO NOTHING
RETURNING id
)
SELECT id FROM e
UNION ALL
SELECT id FROM chunk WHERE text_content = %(text_content)s
""",
data
)
data['chunk_id'] = cursor.fetchone()['id']
cursor.execute(
"""
INSERT INTO html_content_to_chunk (html_content_md5hash, chunk_id)
VALUES(%(md5hash)s, %(chunk_id)s::UUID)
ON CONFLICT DO NOTHING
""",
data)
cursor.execute(
"""
WITH e as(
INSERT INTO token (chunk_id, tokens, encoding)
VALUES (%(chunk_id)s::UUID, %(tokens)s, %(encoding)s)
ON CONFLICT DO NOTHING
RETURNING *
)
SELECT id FROM e
UNION ALL
SELECT id FROM token
WHERE chunk_id = %(chunk_id)s::UUID
and tokens = %(tokens)s::INTEGER[]
and encoding = %(encoding)s
""",
data
)
data['token_id'] = cursor.fetchone()['id']
new_token_id = get_token_id(cursor, data)
if new_token_id is not None:
data['token_id'] = new_token_id

return data
except psycopg.IntegrityError as e:
raise db.DBError("Error storing chunk item for %s" % item['url']) from e


def store_crawl_item(cursor, item):
"""Process a CrawlItem and insert it into the database."""
try:
@@ -127,7 +147,9 @@ def store_crawl_item(cursor, item):
""",
item
)
return item
cursor.execute("""SELECT * FROM crawl
WHERE url = %(url)s ORDER BY last_updated DESC LIMIT 1""", item)
return cursor.fetchone()
except psycopg.IntegrityError as e:
raise db.DBError("Error storing crawl item for %s" % item['url']) from e

@@ -152,7 +174,7 @@ def store_embedding_item(cursor, item):
query,
data
)
return item
return data['token_id']
except psycopg.IntegrityError as e:
raise db.DBError(
"Error storing embedding item for token %s" % item['token_id']) from e
@@ -163,7 +185,7 @@ def fetch_crawl_ids_without_chunk(cursor):
"""
SELECT crawl.id FROM crawl
LEFT JOIN html_content_to_chunk
ON crawl.md5hash = html_content_to_chunk.html_content_md5hash
ON crawl.md5hash = html_content_to_chunk.md5hash
WHERE chunk_id IS NULL
"""
).as_string(cursor)
@@ -206,15 +228,13 @@ def fetch_crawl_row(cursor, url):
assert 'html_content' in row.keys()
return row

def fetch_chunk_token_row(cursor, url):
def fetch_chunk_token_row(cursor, id):
"""Fetch the most recent chunk token for a given chunk id."""
data = db.parse_postgresql_url(url)
data = {'id': id}
cursor.execute(
"SELECT chunk.id as chunk_id, token.id as token_id, tokens FROM chunk"
" JOIN token ON chunk.id = token.chunk_id"
" WHERE chunk.id = %(id)s LIMIT 1",
"""SELECT chunk.id as chunk_id, token.id as token_id, tokens FROM chunk
JOIN token ON chunk.id = token.chunk_id
WHERE chunk.id = %(id)s LIMIT 1""",
data
)
# psycopg.extras.DictRow is not a real dict and will convert
# to string as a list so we force convert to dict
return cursor.fetchone()
102 changes: 87 additions & 15 deletions tests/test_db_crawler.py
@@ -1,6 +1,5 @@
"""test database functions"""
import unittest

import ailab.db as db
import ailab.db.crawler as crawler
import tests.testing_utils as test
@@ -53,24 +52,97 @@ def test_fetch_crawl_row_by_postgresql_url(self):
row['title'],
"Sampling procedures - Canadian Food Inspection Agency")

def test_fetch_chunk_row(self):
"""sample test to check if fetch_chunk_row works"""
url = db.create_postgresql_url(
"DBNAME",
"chunk",
"469812c5-190c-4e56-9f88-c8621592bcb5")
with db.cursor(self.connection) as cursor:
row = crawler.fetch_chunk_token_row(cursor, url)
self.connection.rollback()
self.assertTrue(isinstance(row, dict))
self.assertEqual(len(row['tokens']), 76)
self.assertEqual(str(row['chunk_id']), "469812c5-190c-4e56-9f88-c8621592bcb5")
self.assertEqual(str(row['token_id']), 'dbb7b498-2cbf-4ae9-aa10-3169cc72f285')

def test_fetch_chunk_id_without_embedding(self):
"""sample test to check if fetch_chunk_id_without_embedding works"""
with db.cursor(self.connection) as cursor:
cursor.execute(test.embedding_table.format(embedding_model='test-model'))
rows = crawler.fetch_chunk_id_without_embedding(cursor, 'test-model')
_entity_id = rows[0]
self.connection.rollback()

def test_store_chunk_item(self):
Review comment (Member):

This test case seems to assert the happy path for store_chunk_item. I suggest also testing the edge case where row is None here:

        row = cursor.fetchone()
        if row is not None:
            data['chunk_id'] = row['id']

in store_chunk_item. This is important because if row can be None, we should know exactly what the end result would be.

To go about this, I suggest breaking store_chunk_item into more atomic functions (tested separately), then, in the test, mocking the one that would normally produce row so that it returns None.
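
A minimal sketch of what that could look like, assuming unittest.mock, the seeded URL used by the happy-path test below, and get_token_id as the mocked helper; the test name and the patch target are illustrative, not part of this PR:

    from unittest.mock import patch

    def test_store_chunk_item_when_token_row_is_none(self):
        """Edge case: the token lookup finds no row and returns None."""
        with db.cursor(self.connection) as cursor:
            item = {
                "url": "https://inspection.canada.ca/a-propos-de-l-acia/fra/1299008020759/1299008778654",
                "title": "À propos de l'ACIA",
                "text_content": "Edge-case content.",
                "tokens": [1, 2, 3],
            }
            # Simulate the no-row path: the helper reports no matching token.
            with patch("ailab.db.crawler.get_token_id", return_value=None):
                stored_item = crawler.store_chunk_item(cursor, item)
            self.connection.rollback()
            # store_chunk_item only sets token_id when the helper returns an id.
            self.assertNotIn("token_id", stored_item)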

Reply (Contributor Author):

broke down store_chunk_item into smaller functions
still need to test the case where the row is None

Reply (Contributor Author):

done

"""Test storing a chunk item."""
with db.cursor(self.connection) as cursor:
item = {
"url": "https://inspection.canada.ca/a-propos-de-l-acia/fra/1299008020759/1299008778654",
"title": "À propos de l'ACIA - Agence canadienne d'inspection des aliments",
"text_content": "This is an example content.",
"tokens": [73053,10045,409,326,6,1741,5987,2998,37622,934,6,8629,44618,409,38682,1001,367,3869,348,2328,7330,52760,11,326,6,1741,5987,264,653,348,5642,11837,266,7930,2995,64097,1208,4371,392,1018,978,38450,12267,11,1208,77323,951,4039,12249,11,1208,9313,951,348,19395,978,2629,2249,1880,326,69537,12416,8065,84751,6625,13,17360,535,89,551,5690,6405,13674,33867,14318,3765,91080,1370,2126,22811,11876,459,5979,729,3539,5512,409,326,6,1741,5987,56311,39929,64079,3869,951,90108,35933,46680,969,645,551,2009,85182,40280,3930,7008,90108,1082,3625,459,5979,729,3539,5512,5019,3625,4824,76,1154,25540,5512,1370,514,72601,409,2343,68,2405,10610,953,13,2998,62163,42145,40948,5512,294,6,97675,4149,3462,16848,85046,1880,83229,70,91555,11683,12416,3869,326,6,26125,1880,1208,9313,951,5790,325,625,3808,1732,36527,3459,360,17724,409,326,6,1741,5987,22555,951,24261,288,1880,951,917,2053,3700,34965,11,93084,1880,3057,65,811,57967,1220,294,26248,1088,1759,409,1208,6377,30052,9359,10333,5392,788,95188,4949,11,2126,22811,11,1008,44357,11,95995,409,3729,8471,1880,5790,325,625,3808,65381,10045,409,17317,24789,266,11,9131,11,11376,11,4046,6414,51084,951,97035,13,8245,22139,64829,29696,409,11692,1880,409,85182,77,685,328,5164,409,80080,423,944,59307,80080,11,17889,1354,5860,24985,3946,11,62163,409,5790,325,625,3808,951,35030,3557,1880,3930,586,13,51097,4972,35933,44564,20392,3869,326,58591,73511,7769,3136,13,3744,268,2850,1900,5856,288,9952,3625,447,2053,17317,58673,484,2439,5019,65827,268,404,1880,5201,261,3625,7617,288,409,77463,38450,12267,1880,1097,73511,15171,1208,9313,1880,1208,6225,31617,8082,951,1615,316,18543,1759,13,5856,288,8666,266,22589,1219,13109,8666,50848,294,6,4683,15916,11,1219,13109,8666,1413,3930,49904,265,11,43252,89781,288,1765,3625,13826,25108,4978,409,51304,13,29124,6414,51084,951,97222,1880,951,3600,45629,288,5019,38682,404,15907,22639,9952,3625,2027,38647,11,3625,1615,316,18543,1759,11,326,6,485,592,7379,1880,3625,46106,31957,1821,13,20915,21066,409,1208,26965,13109,44564,3057,1557,409,1208,26965,13109,38682,1001,12267,13,2998,2842,40280,1880,82620,27220,18042,283,35573,514,82620,27220,1880,326,91655,11323,266,2428,13,2998,29033,6672,51097,737,2727,392,645,432,1137,625,3808,1765,3625,737,2727,392,645,1880,46106,21744,10515,5512,409,326,6,1741,5987,13]
}
stored_item = crawler.store_chunk_item(cursor, item)
self.connection.rollback()
self.assertEqual(stored_item["url"], item["url"])

def test_store_crawl_item(self):
"""Test storing a crawl item."""
with db.cursor(self.connection) as cursor:
item = {
"url": "https://example.com",
"title": "Example",
"html_content": "<html><body>This is an example.</body></html>",
"lang": "en",
"last_crawled": "2022-01-01",
"last_updated": "2022-01-01"
}
stored_item = crawler.store_crawl_item(cursor, item)
self.connection.rollback()
self.assertEqual(item["title"], stored_item["title"])
self.assertEqual(item["url"], stored_item["url"])

def test_store_embedding_item(self):
"""Test storing an embedding item."""
with db.cursor(self.connection) as cursor:
item = {
"token_id": "be612259-9b52-42fd-8d0b-d72120efa3b6",
"embedding": test.generate_random_embedding(1536),
"embedding_model": "test-model"
}
stored_item = crawler.store_embedding_item(cursor, item)
self.connection.rollback()
self.assertEqual(item["token_id"], stored_item)

def test_fetch_crawl_ids_without_chunk(self):
"""Test fetching crawl IDs without a chunk."""
with db.cursor(self.connection) as cursor:
ids = crawler.fetch_crawl_ids_without_chunk(cursor)
self.connection.rollback()
self.assertEqual(ids, [])
Review comment (Member):

This assertion isn't adequately testing fetch_crawl_ids_without_chunk. A better test would be to first populate the table with a mix of crawls, some with chunk IDs and some without. Then, we should verify that the function's results match exactly the crawls that don't have chunk IDs, before performing a rollback.
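
One way to go about it: a sketch that assumes the schema used elsewhere in this PR (html_content, crawl, chunk, html_content_to_chunk) and that the seeded test database contains no chunk-less crawls to begin with, as the assertion above implies. The hashes and UUIDs are placeholders:

    def test_fetch_crawl_ids_without_chunk_with_mixed_data(self):
        """Seed one crawl linked to a chunk and one without, then fetch."""
        with db.cursor(self.connection) as cursor:
            linked_hash, orphan_hash = 'a' * 32, 'b' * 32  # placeholder md5 digests
            chunk_id = '00000000-0000-0000-0000-000000000003'
            cursor.execute(f"""
                INSERT INTO html_content VALUES ('<html>linked</html>', '{linked_hash}');
                INSERT INTO html_content VALUES ('<html>orphan</html>', '{orphan_hash}');
                INSERT INTO crawl (id, url, title, lang, last_crawled, last_updated, last_updated_date, md5hash)
                VALUES ('00000000-0000-0000-0000-000000000001', 'http://example.com/linked',
                        'Linked', 'en', NOW(), NOW(), NOW(), '{linked_hash}'),
                       ('00000000-0000-0000-0000-000000000002', 'http://example.com/orphan',
                        'Orphan', 'en', NOW(), NOW(), NOW(), '{orphan_hash}');
                INSERT INTO chunk (id, title, text_content) VALUES ('{chunk_id}', 'Linked', 'linked text');
                INSERT INTO html_content_to_chunk (md5hash, chunk_id) VALUES ('{linked_hash}', '{chunk_id}');
            """)
            rows = crawler.fetch_crawl_ids_without_chunk(cursor)
            self.connection.rollback()
            # Only the crawl whose content has no chunk should come back.
            self.assertEqual(len(rows), 1)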


def test_fetch_crawl_row(self):
"""Test fetching a crawl row."""
with db.cursor(self.connection) as cursor:
row = crawler.fetch_crawl_row(cursor, "https://inspection.canada.ca/a-propos-de-l-acia/structure-organisationnelle/mandat/fra/1299780188624/1319164463699")
Review comment (Member):

This data might not exist in the future, in which case this test will fail. A more suitable test would be to first populate the relevant tables with test items, perform the fetch, then assert the result matches the test items.

fetch_crawl_row also has multiple paths and edge cases worth testing.

self.connection.rollback()
self.assertEqual(row['title'], "Mandat - Agence canadienne d'inspection des aliments")

# def test_fetch_crawl_row_with_test_data(self):
# """Test fetching a crawl row seeded with known test data."""
# with db.cursor(self.connection) as cursor:
# test_crawl_id = test.test_uuid
# test_chunk_id = test.test_uuid
# test_md5hash = test.test_hash


# cursor.execute(f"""
# INSERT INTO html_content VALUES ('<html><body>Test Content</body></html>', '{test_md5hash}');
# INSERT INTO crawl (id, url, title, lang, last_crawled, last_updated, last_updated_date, md5hash)
# VALUES ('{test_crawl_id}', 'http://example.com', 'Test Title', 'en', NOW(), NOW(), NOW(), '{test_md5hash}');
# INSERT INTO html_content_to_chunk (md5hash, chunk_id) VALUES ('{test_md5hash}', '{test_chunk_id}');
# """
# )
# row = crawler.fetch_crawl_row(cursor, "http://example.com")
# self.connection.rollback()
# self.assertEqual(row['title'], "Test Title")

def test_fetch_chunk_token_row(self):
"""Test fetching a chunk token row."""
with db.cursor(self.connection) as cursor:
row = crawler.fetch_chunk_token_row(cursor, "469812c5-190c-4e56-9f88-c8621592bcb5")
Review comment (Member):

This data might not exist in the future, in which case this test will fail. A more suitable test would be to first populate the relevant tables with test items, perform the fetch, then assert the result matches the test items (see the sketch after this test).

self.connection.rollback()
self.assertEqual(str(row['chunk_id']), "469812c5-190c-4e56-9f88-c8621592bcb5")
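
A sketch of the suggested seeded variant, assuming the chunk and token schema that get_token_id relies on above; the UUID and token values are placeholders:

    def test_fetch_chunk_token_row_with_test_data(self):
        """Seed a chunk and its token, then fetch them back by chunk id."""
        with db.cursor(self.connection) as cursor:
            chunk_id = '00000000-0000-0000-0000-0000000000aa'
            cursor.execute(f"""
                INSERT INTO chunk (id, title, text_content)
                VALUES ('{chunk_id}', 'Seeded', 'seeded text');
                INSERT INTO token (chunk_id, tokens, encoding)
                VALUES ('{chunk_id}', ARRAY[1, 2, 3], 'cl100k_base');
            """)
            row = crawler.fetch_chunk_token_row(cursor, chunk_id)
            self.connection.rollback()
            self.assertEqual(str(row['chunk_id']), chunk_id)
            self.assertEqual(row['tokens'], [1, 2, 3])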

def test_fetch_crawl_row_with_invalid_url(self):
"""Test fetching a crawl row with an invalid URL."""
with db.cursor(self.connection) as cursor:
with self.assertRaises(db.DBError):
crawler.fetch_crawl_row(cursor, "invalid_url")
37 changes: 19 additions & 18 deletions tests/test_db_data.py
@@ -1,6 +1,7 @@
import unittest

import ailab.db as db
import testing_utils as test

class TestDBData(unittest.TestCase):

@@ -20,14 +21,14 @@ def tearDown(self):

def upgrade_schema(self):
return
# if test.LOUIS_SCHEMA == 'louis_v005':
# self.execute('sql/2023-07-11-hotfix-xml-not-well-formed.sql')
# self.execute('sql/2023-07-11-populate-link.sql')
# self.execute('sql/2023-07-12-score-current.sql')
# self.execute('sql/2023-07-19-modify-score_type-add-similarity.sql')
# self.execute('sql/2023-07-19-modified-documents.sql')
# self.execute('sql/2023-07-19-weighted_search.sql')
# self.execute('sql/2023-07-21-default_chunk.sql')
if test.LOUIS_SCHEMA == 'louis_v004':
self.execute('sql/2023-07-11-hotfix-xml-not-well-formed.sql')
self.execute('sql/2023-07-11-populate-link.sql')
self.execute('sql/2023-07-12-score-current.sql')
self.execute('sql/2023-07-19-modify-score_type-add-similarity.sql')
self.execute('sql/2023-07-19-modified-documents.sql')
self.execute('sql/2023-07-19-weighted_search.sql')
self.execute('sql/2023-07-21-default_chunk.sql')

def test_well_formed_xml(self):
self.upgrade_schema()
Expand All @@ -39,13 +40,13 @@ def test_well_formed_xml(self):
result = self.cursor.fetchall()
self.assertEqual(result[0]['count'], 0, "All xml should be well formed")

# def test_every_crawl_doc_should_have_at_least_one_chunk(self):
# # self.execute('sql/2023-08-09-issue8-html_content-table.sql')
# self.cursor.execute("""
# select count(*)
# from crawl left join documents on crawl.id = documents.id
# where documents.id is null""")
# result = self.cursor.fetchall()
# self.assertEqual(
# result[0]['count'], 0,
# "Every crawl doc should have at least one chunk")
def test_every_crawl_doc_should_have_at_least_one_chunk(self):
# self.execute('sql/2023-08-09-issue8-html_content-table.sql')
self.cursor.execute("""
select count(*)
from crawl left join documents on crawl.id = documents.id
where documents.id is null""")
result = self.cursor.fetchall()
self.assertEqual(
result[0]['count'], 0,
"Every crawl doc should have at least one chunk")