Skip to content

Commit

Permalink
Add tests for pdf_features and trainer
Browse files (browse the repository at this point in the history)
  • Loading branch information
gabriel-piles committed Jul 8, 2024
1 parent df0e6e0 commit a4f07bc
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 0 deletions.
24 changes: 24 additions & 0 deletions src/pdf_features/tests/test_pdf_features.py
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,24 @@
from os.path import join
from pathlib import Path
from unittest import TestCase

from configuration import ROOT_PATH
from pdf_features.PdfFeatures import PdfFeatures


class TestPdfFeatures(TestCase):
    """Smoke tests for building ``PdfFeatures`` from PDF files and raw XML content."""

    def test_wrong_pdf(self):
        """A file that is not a valid PDF yields ``None`` instead of features."""
        not_a_pdf_path = join(ROOT_PATH, "test_pdfs", "not_a_pdf.pdf")
        pdf_features = PdfFeatures.from_pdf_path(not_a_pdf_path)
        self.assertIsNone(pdf_features)

    def test_blank_xml(self):
        """Blank XML content still produces an object for both second-argument values."""
        # One subTest per input so a failure reports which case broke.
        # NOTE(review): the meaning of the second argument is not visible here.
        for second_argument in ("", "[]"):
            with self.subTest(second_argument=second_argument):
                pdf_features = PdfFeatures.from_poppler_etree_content("", second_argument)
                # Identity check: cannot be fooled by a custom __eq__.
                self.assertIsNotNone(pdf_features)

    def test_ocr_pdf(self):
        """The OCR fixture PDF exposes at least one token on its first page."""
        pdf_features = PdfFeatures.from_pdf_path(join(ROOT_PATH, "test_pdfs", "ocr_pdf.pdf"))
        # from_pdf_path may return None (see test_wrong_pdf); fail with a clear
        # assertion rather than an AttributeError on ``.pages``.
        self.assertIsNotNone(pdf_features)
        self.assertGreater(len(pdf_features.pages[0].tokens), 0)
34 changes: 34 additions & 0 deletions src/pdf_tokens_type_trainer/tests/test_trainer.py
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,34 @@
from os.path import join, exists
from unittest import TestCase

from pdf_token_type_labels.TokenType import TokenType
from pdf_tokens_type_trainer.TokenTypeTrainer import TokenTypeTrainer

from pdf_features.PdfFeatures import PdfFeatures

from configuration import ROOT_PATH


class TestTrainer(TestCase):
    """Tests for ``TokenTypeTrainer`` training and prediction on fixture PDFs."""

    def test_train_blank_pdf(self):
        """Training on the blank fixture PDF must not write a model file."""
        import tempfile  # local import: only this test needs it

        pdf_features = PdfFeatures.from_pdf_path(join(ROOT_PATH, "test_pdfs", "blank.pdf"))
        self.assertIsNotNone(pdf_features)
        trainer = TokenTypeTrainer([pdf_features])
        # A fresh temporary directory guarantees no stale "blank.model" from a
        # previous run can make the final assertion fail spuriously, and keeps
        # the test from touching the project's real model directory.
        with tempfile.TemporaryDirectory() as model_directory:
            model_path = join(model_directory, "blank.model")
            trainer.train(model_path, [])
            self.assertFalse(exists(model_path))

    def test_predict_blank_pdf(self):
        """Setting token types on the blank fixture PDF leaves its first page without tokens."""
        pdf_features = PdfFeatures.from_pdf_path(join(ROOT_PATH, "test_pdfs", "blank.pdf"))
        self.assertIsNotNone(pdf_features)
        trainer = TokenTypeTrainer([pdf_features])
        trainer.set_token_types()
        self.assertEqual([], pdf_features.pages[0].tokens)

    def test_predict(self):
        """Setting token types on the sample PDF labels its first tokens as expected."""
        pdf_features = PdfFeatures.from_pdf_path(join(ROOT_PATH, "test_pdfs", "test.pdf"))
        self.assertIsNotNone(pdf_features)
        trainer = TokenTypeTrainer([pdf_features])
        trainer.set_token_types()
        tokens = pdf_features.pages[0].tokens
        # Index 10 is read below; report a short token list as an assertion
        # failure instead of an IndexError.
        self.assertGreater(len(tokens), 10)
        self.assertEqual(TokenType.TITLE, tokens[0].token_type)
        self.assertEqual("Document Big Centered Title", tokens[0].content)
        self.assertEqual(TokenType.TEXT, tokens[1].token_type)
        self.assertEqual("List Title", tokens[10].content)
Empty file added test_pdfs/not_a_pdf.pdf
Empty file.
Binary file added test_pdfs/ocr_pdf.pdf
Binary file not shown.
Binary file added test_pdfs/test.pdf
Binary file not shown.

0 comments on commit a4f07bc

Please sign in to comment.