-
Notifications
You must be signed in to change notification settings - Fork 20
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add tests for pdf_features and trainer
- Loading branch information
1 parent
df0e6e0
commit a4f07bc
Showing
5 changed files
with
58 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
from os.path import join | ||
from pathlib import Path | ||
from unittest import TestCase | ||
|
||
from configuration import ROOT_PATH | ||
from pdf_features.PdfFeatures import PdfFeatures | ||
|
||
|
||
class TestPdfFeatures(TestCase):
    """Tests for building PdfFeatures from invalid, empty and OCR inputs.

    All fixture PDFs are looked up under ``ROOT_PATH/test_pdfs``.
    """

    def test_wrong_pdf(self):
        """``from_pdf_path`` must return None for the ``not_a_pdf.pdf`` fixture.

        NOTE(review): presumably the fixture is a non-PDF file with a ``.pdf``
        extension - confirm against the test_pdfs directory.
        """
        not_a_pdf_path = join(ROOT_PATH, "test_pdfs", "not_a_pdf.pdf")
        pdf_features = PdfFeatures.from_pdf_path(not_a_pdf_path)
        self.assertIsNone(pdf_features)

    def test_blank_xml(self):
        """``from_poppler_etree_content`` must still return an object for empty content."""
        pdf_features_empty = PdfFeatures.from_poppler_etree_content("", "")
        pdf_features_empty_list = PdfFeatures.from_poppler_etree_content("", "[]")
        # Identity checks: a custom __eq__/__ne__ on the result cannot mask a None.
        self.assertIsNotNone(pdf_features_empty)
        self.assertIsNotNone(pdf_features_empty_list)

    def test_ocr_pdf(self):
        """The first page of ``ocr_pdf.pdf`` must contain at least one token."""
        pdf_features = PdfFeatures.from_pdf_path(join(ROOT_PATH, "test_pdfs", "ocr_pdf.pdf"))
        # from_pdf_path can return None (see test_wrong_pdf); fail with a clear
        # assertion instead of an AttributeError on ``.pages``.
        self.assertIsNotNone(pdf_features)
        self.assertGreater(len(pdf_features.pages[0].tokens), 0)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
from os.path import join, exists | ||
from unittest import TestCase | ||
|
||
from pdf_token_type_labels.TokenType import TokenType | ||
from pdf_tokens_type_trainer.TokenTypeTrainer import TokenTypeTrainer | ||
|
||
from pdf_features.PdfFeatures import PdfFeatures | ||
|
||
from configuration import ROOT_PATH | ||
|
||
|
||
class TestTrainer(TestCase):
    """Tests TokenTypeTrainer training and prediction on PDFs under ``ROOT_PATH/test_pdfs``."""

    def test_train_blank_pdf(self):
        """After ``train`` on ``blank.pdf`` with an empty list, no model file may exist at the target path."""
        blank_pdf_path = join(ROOT_PATH, "test_pdfs", "blank.pdf")
        blank_features = PdfFeatures.from_pdf_path(blank_pdf_path)
        blank_model_path = join(ROOT_PATH, "model", "blank.model")
        TokenTypeTrainer([blank_features]).train(blank_model_path, [])
        model_was_written = exists(blank_model_path)
        self.assertFalse(model_was_written)

    def test_predict_blank_pdf(self):
        """After ``set_token_types`` on ``blank.pdf``, its first page must have an empty token list."""
        blank_pdf_path = join(ROOT_PATH, "test_pdfs", "blank.pdf")
        blank_features = PdfFeatures.from_pdf_path(blank_pdf_path)
        TokenTypeTrainer([blank_features]).set_token_types()
        self.assertEqual([], blank_features.pages[0].tokens)

    def test_predict(self):
        """After ``set_token_types`` on ``test.pdf``, check types and contents of known first-page tokens."""
        sample_pdf_path = join(ROOT_PATH, "test_pdfs", "test.pdf")
        sample_features = PdfFeatures.from_pdf_path(sample_pdf_path)
        TokenTypeTrainer([sample_features]).set_token_types()
        page_tokens = sample_features.pages[0].tokens
        leading_token = page_tokens[0]
        self.assertEqual(TokenType.TITLE, leading_token.token_type)
        self.assertEqual("Document Big Centered Title", leading_token.content)
        self.assertEqual(TokenType.TEXT, page_tokens[1].token_type)
        self.assertEqual("List Title", page_tokens[10].content)
Empty file.
Binary file not shown.
Binary file not shown.