diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 42aa24a..54a8d6b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,6 +24,9 @@ jobs: - name: Free up space run: make free_up_space + - name: Install pdftohtml + run: sudo apt-get install -y pdftohtml + - name: Install venv run: make install_venv diff --git a/Dockerfile b/Dockerfile index 5082a1c..e7012cd 100755 --- a/Dockerfile +++ b/Dockerfile @@ -19,8 +19,8 @@ RUN pip install --upgrade pip RUN pip --default-timeout=1000 install -r requirements.txt WORKDIR /app -RUN cd src; git clone https://github.com/facebookresearch/detectron2 -RUN cd src/detectron2; python setup.py build develop +RUN cd src; git clone https://github.com/facebookresearch/detectron2; +RUN cd src/detectron2; git checkout 70f454304e1a38378200459dd2dbca0f0f4a5ab4; python setup.py build develop COPY ./src/. ./src COPY ./models/. ./models/ diff --git a/Makefile b/Makefile index 74758a6..919ed3d 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,5 @@ +HAS_GPU := $(shell command -v nvidia-smi > /dev/null && echo 1 || echo 0) + install: . .venv/bin/activate; pip install -Ur requirements.txt @@ -23,7 +25,14 @@ remove_docker_images: start: mkdir -p ./models +ifeq ($(HAS_GPU), 1) + @echo "NVIDIA GPU detected, using docker-compose-gpu.yml" docker compose -f docker-compose-gpu.yml up --build +else + @echo "No NVIDIA GPU detected, using docker-compose.yml" + docker compose -f docker-compose.yml up --build +endif + start_no_gpu: mkdir -p ./models diff --git a/README.md b/README.md index 77a1472..17d6c3c 100644 --- a/README.md +++ b/README.md @@ -28,11 +28,7 @@ pictures, tables and so on. 
Additionally, it determines the correct order of the ## Quick Start Start the service: - # With GPU support make start - - # Without GPU support [if you do not have a GPU on your system] - make start_no_gpu Get the segments from a PDF: @@ -49,7 +45,9 @@ To stop the server: - [Models](#models) - [Data](#data) - [Usage](#usage) -- [Benchmark](#benchmark) +- [Benchmarks](#benchmarks) + - [Performance](#performance) + - [Speed](#speed) - [Related Services](#related-services) ## Dependencies @@ -57,8 +55,8 @@ To stop the server: * For GPU support [install link](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) ## Requirements -* 4 GB RAM memory -* 6 GB GPU memory (if not, it will run with CPU) +* 2 GB RAM memory +* 5 GB GPU memory (if not, it will run on CPU) ## Models @@ -102,10 +100,14 @@ As we mentioned at the [Quick Start](#quick-start), you can use the service simp curl -X POST -F 'file=@/PATH/TO/PDF/pdf_name.pdf' localhost:5060 -This command will run the code on visual model. So you should be prepared that it will use lots of resources. But if you -want to use the not visual models, which are the LightGBM models, you can use this command: +This command will run the visual model. So you should be prepared that it will use lots of resources. Also, please note +that if you do not have GPU in your system, or if you do not have enough free GPU memory, the visual model will run on CPU. +You should be expecting a long response time in that case (See [speed benchmark](#speed) for more details). - curl -X POST -F 'file=@/PATH/TO/PDF/pdf_name.pdf' localhost:5060/fast + +If you want to use the non-visual models, which are the LightGBM models, you can use this command: + + curl -X POST -F 'file=@/PATH/TO/PDF/pdf_name.pdf' -F "fast=true" localhost:5060 The shape of the response will be the same in both of these commands. @@ -146,7 +148,9 @@ we process them after sorting all segments with content. 
To determine their read using distance as a criterion. -## Benchmark +## Benchmarks + +### Performance These are the benchmark results for VGT model on PubLayNet dataset: @@ -171,6 +175,37 @@ These are the benchmark results for VGT model on PubLayNet dataset: You can check this link to see the comparison with the other models: https://paperswithcode.com/sota/document-layout-analysis-on-publaynet-val +### Speed + +For 15 pages academic paper document: + + + + + + + + + + + + + + + + + + + + + + +
| Model | GPU | Speed (seconds per page) |
| --- | --- | --- |
| Fast Model | ✗ [i7-8700 3.2GHz] | 0.42 |
| VGT | ✓ [GTX 1070] | 1.75 |
| VGT | ✗ [i7-8700 3.2GHz] | 13.5 |
+ + + + + ## Related Services Here are some of our other services that is built upon this service: diff --git a/requirements.txt b/requirements.txt index ce5f192..27b2b19 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,4 +17,6 @@ pdf2image==1.17.0 lxml==5.2.2 lightgbm==4.4.0 huggingface_hub==0.23.4 -setuptools==70.2.0 \ No newline at end of file +setuptools==70.2.0 +roman~=4.2 +hydra-core==1.3.2 \ No newline at end of file diff --git a/src/app.py b/src/app.py index cd1d877..02f3a70 100755 --- a/src/app.py +++ b/src/app.py @@ -3,13 +3,14 @@ from os.path import join from pathlib import Path import torch -from fastapi import FastAPI, UploadFile, File +from fastapi import FastAPI, UploadFile, File, Form from fastapi.responses import PlainTextResponse from catch_exceptions import catch_exceptions from configuration import service_logger, XMLS_PATH from pdf_layout_analysis.run_pdf_layout_analysis import analyze_pdf from pdf_layout_analysis.run_pdf_layout_analysis_fast import analyze_pdf_fast +from toc.extract_table_of_contents import extract_table_of_contents service_logger.info(f"Is PyTorch using GPU: {torch.cuda.is_available()}") @@ -23,9 +24,9 @@ async def info(): @app.post("/") @catch_exceptions -async def run(file: UploadFile = File(...)): +async def run(file: UploadFile = File(...), fast: bool = Form(False)): service_logger.info(f"Processing file: {file.filename}") - return analyze_pdf(file.file.read()) + return analyze_pdf_fast(file.file.read()) if fast else analyze_pdf(file.file.read()) @app.post("/save_xml/{xml_file_name}") @@ -45,7 +46,10 @@ async def get_xml(xml_file_name: str): return content -@app.post("/fast") +@app.post("/toc") @catch_exceptions -async def run_fast(file: UploadFile = File(...)): - return analyze_pdf_fast(file.file.read()) +async def get_toc(file: UploadFile = File(...), fast: bool = Form(False)): + file_content = file.file.read() + if fast: + return extract_table_of_contents(file_content, analyze_pdf_fast(file_content)) + 
return extract_table_of_contents(file_content, analyze_pdf(file_content)) diff --git a/src/data_model/SegmentBox.py b/src/data_model/SegmentBox.py index 961b73f..5929768 100644 --- a/src/data_model/SegmentBox.py +++ b/src/data_model/SegmentBox.py @@ -1,11 +1,8 @@ from fast_trainer.PdfSegment import PdfSegment from pdf_features.PdfPage import PdfPage -from pdf_features.Rectangle import Rectangle from pdf_token_type_labels.TokenType import TokenType from pydantic import BaseModel -from configuration import DOCLAYNET_TYPE_BY_ID - class SegmentBox(BaseModel): left: float diff --git a/src/pdf_token_type_labels/TokenType.py b/src/pdf_token_type_labels/TokenType.py index 2ef70f4..e8e324f 100644 --- a/src/pdf_token_type_labels/TokenType.py +++ b/src/pdf_token_type_labels/TokenType.py @@ -28,5 +28,12 @@ def from_index(index: int): except IndexError: return TokenType.TEXT.name.lower() + @staticmethod + def from_value(value: str): + for token_type in TokenType: + if token_type.value == value: + return token_type + return TokenType.TEXT + def get_index(self) -> int: return list(TokenType).index(self) diff --git a/src/test_end_to_end.py b/src/test_end_to_end.py index 30a0be8..df22a0d 100644 --- a/src/test_end_to_end.py +++ b/src/test_end_to_end.py @@ -62,16 +62,18 @@ def test_regular_pdf(self): def test_error_file_fast(self): with open(f"{ROOT_PATH}/test_pdfs/error.pdf", "rb") as stream: files = {"file": stream} + data = {"fast": "True"} - results = requests.post(f"{self.service_url}/fast", files=files) + results = requests.post(f"{self.service_url}", files=files, data=data) self.assertEqual(422, results.status_code) def test_blank_pdf_fast(self): with open(f"{ROOT_PATH}/test_pdfs/blank.pdf", "rb") as stream: files = {"file": stream} + data = {"fast": "True"} - results = requests.post(f"{self.service_url}/fast", files=files) + results = requests.post(f"{self.service_url}", files=files, data=data) self.assertEqual(200, results.status_code) self.assertEqual(0, 
len(results.json())) @@ -79,8 +81,9 @@ def test_blank_pdf_fast(self): def test_segmentation_some_empty_pages_fast(self): with open(f"{ROOT_PATH}/test_pdfs/some_empty_pages.pdf", "rb") as stream: files = {"file": stream} + data = {"fast": "True"} - results = requests.post(f"{self.service_url}/fast", files=files) + results = requests.post(f"{self.service_url}", files=files, data=data) self.assertEqual(200, results.status_code) self.assertEqual(2, len(results.json())) @@ -88,8 +91,9 @@ def test_segmentation_some_empty_pages_fast(self): def test_image_pdfs_fast(self): with open(f"{ROOT_PATH}/test_pdfs/image.pdf", "rb") as stream: files = {"file": stream} + data = {"fast": "True"} - results = requests.post(f"{self.service_url}/fast", files=files) + results = requests.post(f"{self.service_url}", files=files, data=data) self.assertEqual(200, results.status_code) self.assertEqual(0, len(results.json())) @@ -97,7 +101,8 @@ def test_image_pdfs_fast(self): def test_regular_pdf_fast(self): with open(f"{ROOT_PATH}/test_pdfs/regular.pdf", "rb") as stream: files = {"file": stream} - results = requests.post(f"{self.service_url}/fast", files=files) + data = {"fast": "True"} + results = requests.post(f"{self.service_url}", files=files, data=data) results_dict = results.json() expected_content = "RESOLUCIÓN DE LA CORTE INTERAMERICANA DE DERECHOS HUMANOS" self.assertEqual(200, results.status_code) @@ -130,15 +135,17 @@ def test_chinese(self): def test_korean_fast(self): with open(f"{ROOT_PATH}/test_pdfs/korean.pdf", "rb") as stream: files = {"file": stream} + data = {"fast": "True"} - results = requests.post(f"{self.service_url}/fast", files=files) + results = requests.post(f"{self.service_url}", files=files, data=data) self.assertEqual(200, results.status_code) def test_chinese_fast(self): with open(f"{ROOT_PATH}/test_pdfs/chinese.pdf", "rb") as stream: files = {"file": stream} + data = {"fast": "True"} - results = requests.post(f"{self.service_url}/fast", files=files) + results = 
class MergeTwoSegmentsTitles:
    """Fold consecutive title segments that belong to one heading into a single title.

    The titles are read from ``TitleFeatures.from_pdf_segmentation`` and the
    result of the merge is left in ``titles_merged``.
    """

    def __init__(self, pdf_segmentation: PdfSegmentation):
        self.title_features_list: list[TitleFeatures] = TitleFeatures.from_pdf_segmentation(pdf_segmentation)
        self.titles_merged: list[TitleFeatures] = list()
        self.merge()

    def merge(self):
        """Walk the titles in order and fill ``titles_merged``.

        A title that ``should_merge`` with its successor is folded into that
        successor (the slot ``index + 1`` of ``title_features_list`` is
        replaced by the merged title), so a run of several mergeable segments
        collapses into one entry. Every other title is appended unchanged.
        """
        titles = self.title_features_list
        last_index = len(titles) - 1

        for index in range(len(titles)):
            title = titles[index]
            if index < last_index and self.should_merge(title, titles[index + 1]):
                # ``append`` concatenates ``self`` first and the argument second,
                # so the earlier title must be the receiver to keep reading order.
                titles[index + 1] = title.append(titles[index + 1])
            else:
                self.titles_merged.append(title)

    @staticmethod
    def should_merge(title: TitleFeatures, other_title: TitleFeatures):
        """Return True when ``other_title`` looks like a continuation of ``title``.

        All of the following must hold: same page, vertical gap of at most 15
        between ``title.bottom`` and ``other_title.top``, the horizontal check
        below, not both starting with a numbering prefix (types 1, 2, 3), not
        both ending their prefix with a bullet marker, and identical
        ``get_features_to_merge()`` values.
        """
        if other_title.pdf_segment.page_number != title.pdf_segment.page_number:
            return False

        if abs(other_title.top - title.bottom) > 15:
            return False

        # NOTE(review): this compares each title's left edge with the other's
        # right edge, which can only pass for very narrow boxes. It may have
        # been meant as a left/left and right/right alignment check - confirm
        # before changing; behaviour is kept as found.
        if abs(other_title.left - title.right) > 15 or abs(other_title.right - title.left) > 15:
            return False

        numbered_types = (1, 2, 3)
        if title.first_characters_type in numbered_types and other_title.first_characters_type in numbered_types:
            return False

        if title.bullet_points_type and other_title.bullet_points_type:
            return False

        return title.get_features_to_merge() == other_title.get_features_to_merge()
class PdfSegmentation:
    """Assign every PDF token to the segment that overlaps it the most.

    ``tokens_by_segments`` maps a segment to the tokens assigned to it, in the
    order the tokens appear on their page. Segments that received no token are
    absent from the mapping.
    """

    def __init__(self, pdf_features: PdfFeatures, pdf_segments: list[PdfSegment]):
        self.pdf_features: PdfFeatures = pdf_features
        self.pdf_segments: list[PdfSegment] = pdf_segments
        self.tokens_by_segments: dict[PdfSegment, list[PdfToken]] = self.find_tokens_by_segments()

    @staticmethod
    def find_segment_for_token(token: PdfToken, segments: list[PdfSegment], tokens_by_segments):
        """Add ``token`` to ``tokens_by_segments`` under its best matching segment.

        The best match is the segment with the highest value of
        ``token.bounding_box.get_intersection_percentage(segment.bounding_box)``;
        on ties the earliest segment wins. A token whose score is 0 (or lower)
        for every segment is left unassigned.
        """
        best_score: float = 0
        most_probable_segment: PdfSegment | None = None

        for segment in segments:
            intersection_percentage = token.bounding_box.get_intersection_percentage(segment.bounding_box)
            if intersection_percentage > best_score:
                best_score = intersection_percentage
                most_probable_segment = segment
                if best_score >= 99:
                    # Good enough: stop scanning the remaining segments.
                    break

        if most_probable_segment is not None:
            tokens_by_segments.setdefault(most_probable_segment, list()).append(token)

    def find_tokens_by_segments(self):
        """Build the segment -> tokens mapping, matching tokens only against segments of their own page."""
        # Group the segments once instead of re-filtering the full list for every page.
        segments_by_page: dict[int, list[PdfSegment]] = {}
        for segment in self.pdf_segments:
            segments_by_page.setdefault(segment.page_number, []).append(segment)

        tokens_by_segments: dict[PdfSegment, list[PdfToken]] = {}
        for page in self.pdf_features.pages:
            page_segments = segments_by_page.get(page.page_number)
            if not page_segments:
                continue
            for token in page.tokens:
                self.find_segment_for_token(token, page_segments, tokens_by_segments)
        return tokens_by_segments
class TOCExtractor:
    """Turn the (merged) title segments of a PDF into an indented table of contents."""

    def __init__(self, pdf_segmentation: PdfSegmentation):
        self.pdf_segmentation = pdf_segmentation
        self.titles_features_sorted = MergeTwoSegmentsTitles(self.pdf_segmentation).titles_merged
        self.toc: list[TOCItem] = list()
        self.set_toc()

    def set_toc(self):
        """Append one TOC item per title, computing its indentation from the items already added."""
        for index, title_features in enumerate(self.titles_features_sorted):
            indentation = self.get_indentation(index, title_features)
            self.toc.append(title_features.to_toc_item(indentation))

    def __str__(self):
        return "\n".join(f'{" " * x.indentation} * {x.label}' for x in self.toc)

    def get_indentation(self, title_index: int, title_features: TitleFeatures):
        """Return the indentation level for the title at ``title_index``.

        Earlier titles are scanned from the nearest backwards, skipping items
        already marked ``point_closed``. The first one that is at the same
        level (see ``same_indentation``) donates its indentation, and every
        item nested deeper than it is closed. When no earlier title matches,
        the new title is nested one level under the previous item.
        """
        if title_index == 0:
            return 0

        for index in reversed(range(title_index)):
            if self.toc[index].point_closed:
                continue

            if self.same_indentation(self.titles_features_sorted[index], title_features):
                self.close_toc_items(self.toc[index].indentation)
                return self.toc[index].indentation

        return self.toc[title_index - 1].indentation + 1

    def close_toc_items(self, indentation):
        """Mark every item nested deeper than ``indentation`` as closed."""
        for toc in self.toc:
            if toc.indentation > indentation:
                toc.point_closed = True

    @staticmethod
    def same_indentation(previous_title_features: TitleFeatures, title_features: TitleFeatures):
        """Return True when both titles sit at the same TOC level.

        That is the case when the previous title's prefix is one of the
        possible predecessors of this title's prefix, or when both titles have
        identical ``get_features_toc()`` values.
        """
        if previous_title_features.first_characters in title_features.get_possible_previous_point():
            return True

        return previous_title_features.get_features_toc() == title_features.get_features_toc()

    def to_dict(self):
        """Serialize the TOC as a list of plain dictionaries.

        Each entry has ``indentation``, ``label`` and ``bounding_box``; the box
        coordinates are truncated to ``int`` and the page number is emitted as
        a string under the key ``page``.
        """
        toc: list[dict[str, object]] = list()

        for toc_item in self.toc:
            selection = toc_item.selection_rectangle
            toc.append(
                {
                    "indentation": toc_item.indentation,
                    "label": toc_item.label,
                    "bounding_box": {
                        "left": int(selection.left),
                        "top": int(selection.top),
                        "width": int(selection.width),
                        "height": int(selection.height),
                        "page": str(selection.page_number),
                    },
                }
            )

        return toc
class TitleFeatures:
    """Typographic and positional features of one title segment.

    The features are derived from the segment's tokens and bounding box and are
    used to decide which titles to merge and how to indent them in the table of
    contents.
    """

    SPECIAL_MARKERS = [".", "(", ")", "\\", "/", ":", ";", "-", "_", "[", "]", "•", "◦", "*", ","]
    ALPHABET = list(string.ascii_lowercase)
    ALPHABET_UPPERCASE = list(string.ascii_uppercase)
    ROMAN_NUMBERS = [roman.toRoman(i) for i in range(1, 151)]
    ROMAN_NUMBERS_LOWERCASE = [x.lower() for x in ROMAN_NUMBERS]
    BULLET_POINTS = [ALPHABET, ALPHABET_UPPERCASE, ROMAN_NUMBERS, ROMAN_NUMBERS_LOWERCASE]

    def __init__(self, pdf_segment: PdfSegment, segment_tokens: list[PdfToken], pdf_features, modes: Modes):
        self.modes = modes
        self.pdf_segment = pdf_segment
        self.pdf_features = pdf_features

        self.segment_tokens: list[PdfToken] = segment_tokens
        self.first_characters: str = ""
        self.first_characters_special_markers_count: int = 0
        self.font_size: float = 0.0
        self.text_content: str = ""
        self.width: float = 0
        self.font_family: str = ""
        self.font_color: str = ""
        self.line_height: float = 0.0
        self.uppercase: bool = False
        # Share of the segment's tokens that are bold / italic (0.0 .. 1.0).
        self.bold: float = 0.0
        self.italics: float = 0.0
        self.first_characters_type = 0
        self.bullet_points_type = 0
        self.text_centered: int = 0
        self.is_left: bool = False
        self.indentation: int = -1
        self.left: int = self.pdf_segment.bounding_box.left
        self.top: int = self.pdf_segment.bounding_box.top
        self.right: int = self.pdf_segment.bounding_box.right
        self.bottom: int = self.pdf_segment.bounding_box.bottom

        self.initialize_text_properties()
        self.process_first_characters()
        self.process_font_properties()
        self.process_positional_properties()

    def initialize_text_properties(self):
        """Join the token contents with single spaces into ``text_content``."""
        self.text_content = " ".join(token.content for token in self.segment_tokens)

    def process_first_characters(self):
        """Classify the leading word of the title (its numbering / bullet prefix).

        Sets ``first_characters`` (text up to the first space, newline or tab),
        ``first_characters_type`` (1 upper-case roman letters, 2 lower-case
        roman letters, 3 digits, 4 upper-case, 0 otherwise),
        ``bullet_points_type`` (1-based position in ``SPECIAL_MARKERS`` of the
        prefix's last character, 0 when it is not a marker) and
        ``first_characters_special_markers_count`` (markers before the last
        character).
        """
        self.first_characters = self.text_content.split(" ")[0].split("\n")[0].split("\t")[0]
        if not self.first_characters:
            # Empty text or text starting with a space: there is no prefix to
            # classify, keep the neutral defaults instead of indexing "".
            self.first_characters_type = 0
            self.bullet_points_type = 0
            self.first_characters_special_markers_count = 0
            return

        clean_first_characters = [x for x in self.first_characters if x not in self.SPECIAL_MARKERS]
        # NOTE(review): a prefix made only of markers leaves this list empty,
        # so the first checker passes and the type is 1 - kept as found.
        characters_checker = {
            1: lambda x_list: all(letter in "IVXL" for letter in x_list),
            2: lambda x_list: all(letter in "ivxl" for letter in x_list),
            3: lambda x_list: all(letter in "1234567890" for letter in x_list),
            4: lambda x_list: all(letter == letter.upper() for letter in x_list),
        }

        self.first_characters_type = next(
            (index for index, type_checker in characters_checker.items() if type_checker(clean_first_characters)), 0
        )

        last_character = self.first_characters[-1]
        self.bullet_points_type = (
            self.SPECIAL_MARKERS.index(last_character) + 1 if last_character in self.SPECIAL_MARKERS else 0
        )
        self.first_characters_special_markers_count = len(
            [x for x in self.first_characters[:-1] if x in self.SPECIAL_MARKERS]
        )

    def process_font_properties(self):
        """Derive font family, colour, bold/italic ratios, upper-case flag and mean font size."""
        if not self.segment_tokens:
            # No tokens to inspect: keep the defaults set in __init__.
            return

        # NOTE(review): assumes every token carries a font object - confirm
        # against the token source.
        first_font = self.segment_tokens[0].font
        tokens_count = len(self.segment_tokens)

        self.font_family = first_font.font_id
        self.font_color = first_font.color
        self.bold = sum(token.font.bold for token in self.segment_tokens) / tokens_count
        self.italics = sum(token.font.italics for token in self.segment_tokens) / tokens_count
        self.uppercase = self.text_content.upper() == self.text_content
        self.font_size = np.mean([token.font.font_size for token in self.segment_tokens])

    def process_positional_properties(self):
        """Derive line height, centring, left alignment and the indentation step."""
        if self.segment_tokens:
            self.line_height = self.segment_tokens[0].font.font_size

        # Pages are looked up by position, page numbers being 1-based here.
        page_width = self.pdf_features.pages[self.pdf_segment.page_number - 1].page_width
        right_margin = page_width - self.right
        self.text_centered = 1 if abs(self.left - right_margin) < 10 else 0
        self.is_left = self.left < right_margin if not self.text_centered else False
        self.indentation = int((self.left - self.modes.left_space_mode) / 15) if self.is_left else -1

    def get_features_to_merge(self) -> tuple[int, int]:
        """Return the (bold, italics) flags compared when merging two titles."""
        return (
            1 if self.bold else 0,
            1 if self.italics else 0,
        )

    def get_features_toc(self) -> tuple[int, int, int, int, int]:
        """Return the feature tuple compared when deciding whether two titles share a TOC level."""
        return (
            1 if self.bold else 0,
            1 if self.italics else 0,
            self.first_characters_type,
            self.first_characters_special_markers_count,
            self.bullet_points_type,
        )

    def get_possible_previous_point(self) -> list[str]:
        """Return the prefixes that could directly precede this title's prefix.

        The prefix is read from its end: trailing markers are set aside, the
        last run of non-marker characters is the counter, and whatever is left
        in front is kept as is. Each predecessor of the counter is then put
        back between the kept front part and the trailing markers.
        """
        previous_characters = self.first_characters
        final_special_markers = ""
        last_part = ""
        for letter in list(reversed(previous_characters)):
            if not last_part and letter in self.SPECIAL_MARKERS:
                final_special_markers = previous_characters[-1] + final_special_markers
                previous_characters = previous_characters[:-1]
                continue

            if last_part and letter in self.SPECIAL_MARKERS:
                break

            last_part = letter + last_part
            previous_characters = previous_characters[:-1]

        previous_items = self.get_previous_items(last_part)

        if not previous_items and len(self.first_characters) >= 4:
            return [self.first_characters]

        return [previous_characters + x + final_special_markers for x in previous_items]

    def get_previous_items(self, item: str):
        """Return the candidates that come right before ``item`` in each known numbering scheme."""
        previous_items = []

        for bullet_points in self.BULLET_POINTS:
            if item in bullet_points:
                position = bullet_points.index(item)
                if position:
                    previous_items.append(bullet_points[position - 1])

        # isdecimal() rather than isnumeric(): characters such as "²" or "½"
        # are numeric but make int() raise ValueError.
        if item.isdecimal():
            previous_items.append(str(int(item) - 1))

        return previous_items

    @staticmethod
    def from_pdf_segmentation(pdf_segmentation: PdfSegmentation) -> list["TitleFeatures"]:
        """Build the features of every segment of ``pdf_segmentation`` that has tokens."""
        titles_features = list()
        modes = Modes(pdf_features=pdf_segmentation.pdf_features)
        for pdf_segment in pdf_segmentation.pdf_segments:
            # Segments that matched no token are missing from the mapping and
            # have no text to show in a table of contents: skip them.
            segment_tokens = pdf_segmentation.tokens_by_segments.get(pdf_segment)
            if not segment_tokens:
                continue
            titles_features.append(TitleFeatures(pdf_segment, segment_tokens, pdf_segmentation.pdf_features, modes))

        return titles_features

    def to_toc_item(self, indentation):
        """Wrap this title as a ``TOCItem`` at the given indentation."""
        return TOCItem(
            indentation=indentation,
            label=self.text_content,
            selection_rectangle=SegmentBox.from_pdf_segment(self.pdf_segment, self.pdf_features.pages),
        )

    def append(self, other_title_features: "TitleFeatures"):
        """Return a new title made of this one followed by ``other_title_features``.

        The bounding boxes are merged, the text contents and token lists are
        concatenated (this title first); page number and segment type are taken
        from this title.
        """
        other_segment = other_title_features.pdf_segment
        merged_bounding_box = Rectangle.merge_rectangles([self.pdf_segment.bounding_box, other_segment.bounding_box])
        merged_content = self.pdf_segment.text_content + other_segment.text_content
        merged_segment = PdfSegment(
            self.pdf_segment.page_number, merged_bounding_box, merged_content, self.pdf_segment.segment_type
        )
        segment_tokens = self.segment_tokens + other_title_features.segment_tokens
        return TitleFeatures(merged_segment, segment_tokens, pdf_features=self.pdf_features, modes=self.modes)
# Segment types that become table-of-contents entries.
TITLE_TYPES = {TokenType.TITLE, TokenType.SECTION_HEADER}
# Segment types that may appear in the leading block scanned by skip_name_of_the_document.
SKIP_TYPES = {TokenType.TITLE, TokenType.SECTION_HEADER, TokenType.PAGE_HEADER, TokenType.PICTURE}


def get_file_path(file_name, extension):
    """Return ``<system temp dir>/<file_name>.<extension>``."""
    return join(tempfile.gettempdir(), file_name + "." + extension)


def pdf_content_to_pdf_path(file_content):
    """Write ``file_content`` to a uniquely named PDF in the temp directory and return its path.

    The caller owns the file and is responsible for deleting it.
    """
    file_id = str(uuid.uuid1())

    pdf_path = Path(get_file_path(file_id, "pdf"))
    pdf_path.write_bytes(file_content)

    return pdf_path


def skip_name_of_the_document(pdf_segments: list[PdfSegment], title_segments: list[PdfSegment]):
    """Remove from ``title_segments`` the titles that open the document.

    The segments are scanned from the start while their type is in
    ``SKIP_TYPES``; page headers and pictures are stepped over. The scan also
    stops when a segment's right edge lies left of the previous kept segment's
    left edge plus 66% of its width. ``title_segments`` is modified in place.
    """
    segments_to_remove = []
    last_segment = None
    for segment in pdf_segments:
        if segment.segment_type not in SKIP_TYPES:
            break
        if segment.segment_type == TokenType.PAGE_HEADER or segment.segment_type == TokenType.PICTURE:
            continue
        if not last_segment:
            last_segment = segment
        else:
            if segment.bounding_box.right < last_segment.bounding_box.left + last_segment.bounding_box.width * 0.66:
                break
            last_segment = segment
        if segment.segment_type in TITLE_TYPES:
            segments_to_remove.append(segment)
    for segment in segments_to_remove:
        title_segments.remove(segment)


def get_pdf_segments_from_segment_boxes(pdf_features: PdfFeatures, segment_boxes: list[dict]) -> list[PdfSegment]:
    """Convert segment-box dictionaries into ``PdfSegment`` objects.

    Each dictionary must provide ``left``, ``top``, ``width``, ``height``,
    ``type``, ``page_number`` and ``text``.
    """
    pdf_segments: list[PdfSegment] = []
    for segment_box in segment_boxes:
        left, top, width, height = segment_box["left"], segment_box["top"], segment_box["width"], segment_box["height"]
        bounding_box = Rectangle.from_width_height(left, top, width, height)
        segment_type = TokenType.from_value(segment_box["type"])
        pdf_name = pdf_features.file_name
        segment = PdfSegment(segment_box["page_number"], bounding_box, segment_box["text"], segment_type, pdf_name)
        pdf_segments.append(segment)
    return pdf_segments


def extract_table_of_contents(file: AnyStr, segment_boxes: list[dict], skip_document_name=False):
    """Build the table of contents of a PDF from its content and its layout segments.

    Args:
        file: raw content of the PDF.
        segment_boxes: segment dictionaries as accepted by
            ``get_pdf_segments_from_segment_boxes``.
        skip_document_name: when true, the titles that open the document are
            left out of the table of contents.

    Returns:
        The list of dictionaries produced by ``TOCExtractor.to_dict``.
    """
    service_logger.info("Getting TOC")
    pdf_path = pdf_content_to_pdf_path(file)
    try:
        pdf_features: PdfFeatures = PdfFeatures.from_pdf_path(pdf_path)
        pdf_segments: list[PdfSegment] = get_pdf_segments_from_segment_boxes(pdf_features, segment_boxes)
        title_segments = [segment for segment in pdf_segments if segment.segment_type in TITLE_TYPES]
        if skip_document_name:
            skip_name_of_the_document(pdf_segments, title_segments)
        pdf_segmentation: PdfSegmentation = PdfSegmentation(pdf_features, title_segments)
        toc_instance: TOCExtractor = TOCExtractor(pdf_segmentation)
        return toc_instance.to_dict()
    finally:
        # The temporary copy is only needed while the TOC is computed; without
        # this every request leaves a PDF behind in the temp directory.
        pdf_path.unlink(missing_ok=True)
@dataclasses.dataclass
class Modes:
    """Most frequent layout values (margins, token bottoms, font size, font id) of a PDF."""

    lines_space_mode: float
    left_space_mode: float
    right_space_mode: float
    font_size_mode: float
    font_family_name_mode: str
    font_family_mode: int
    font_family_mode_normalized: float
    pdf_features: PdfFeatures

    def __init__(self, pdf_features: PdfFeatures):
        self.pdf_features = pdf_features
        self.set_modes()

    def set_modes(self):
        """Compute every mode in a single pass over ``pdf_features.loop_tokens()``.

        The three spacing lists are seeded with 0 so that ``mode`` has data
        even for a document without tokens. Tokens without a font are ignored
        for the font statistics, which fall back to 0 and "" when no token has
        a font.
        """
        line_spaces, right_spaces, left_spaces = [0], [0], [0]
        font_sizes, font_ids = [], []

        for page, token in self.pdf_features.loop_tokens():
            # Measure the right margin against the token's own page: pages of a
            # document do not necessarily share the first page's width.
            right_spaces.append(page.page_width - token.bounding_box.right)
            left_spaces.append(token.bounding_box.left)
            line_spaces.append(token.bounding_box.bottom)
            if token.font:
                font_sizes.append(token.font.font_size)
                font_ids.append(token.font.font_id)

        self.lines_space_mode = mode(line_spaces)
        self.left_space_mode = mode(left_spaces)
        self.right_space_mode = mode(right_spaces)

        self.font_size_mode = mode(font_sizes) if font_sizes else 0
        self.font_family_name_mode = mode(font_ids) if font_ids else ""

        # Stable numeric id of the font name: first 8 hex digits of its SHA-256.
        font_name_digest = hashlib.sha256(self.font_family_name_mode.encode("utf-8")).hexdigest()
        self.font_family_mode = int(font_name_digest[:8], 16)

        # Same digits with a decimal point after the first one, e.g. 1234 -> 1.234.
        digits = str(self.font_family_mode)
        self.font_family_mode_normalized = float(f"{digits[0]}.{digits[1:]}")