diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 42aa24a..54a8d6b 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -24,6 +24,9 @@ jobs:
- name: Free up space
run: make free_up_space
+ - name: Install pdftohtml
+ run: sudo apt-get install -y pdftohtml
+
- name: Install venv
run: make install_venv
diff --git a/Dockerfile b/Dockerfile
index 5082a1c..e7012cd 100755
--- a/Dockerfile
+++ b/Dockerfile
@@ -19,8 +19,8 @@ RUN pip install --upgrade pip
RUN pip --default-timeout=1000 install -r requirements.txt
WORKDIR /app
-RUN cd src; git clone https://github.com/facebookresearch/detectron2
-RUN cd src/detectron2; python setup.py build develop
+RUN cd src; git clone https://github.com/facebookresearch/detectron2;
+RUN cd src/detectron2; git checkout 70f454304e1a38378200459dd2dbca0f0f4a5ab4; python setup.py build develop
COPY ./src/. ./src
COPY ./models/. ./models/
diff --git a/Makefile b/Makefile
index 74758a6..919ed3d 100644
--- a/Makefile
+++ b/Makefile
@@ -1,3 +1,5 @@
+HAS_GPU := $(shell command -v nvidia-smi > /dev/null && echo 1 || echo 0)
+
install:
. .venv/bin/activate; pip install -Ur requirements.txt
@@ -23,7 +25,14 @@ remove_docker_images:
start:
mkdir -p ./models
+ifeq ($(HAS_GPU), 1)
+ @echo "NVIDIA GPU detected, using docker-compose-gpu.yml"
docker compose -f docker-compose-gpu.yml up --build
+else
+ @echo "No NVIDIA GPU detected, using docker-compose.yml"
+ docker compose -f docker-compose.yml up --build
+endif
+
start_no_gpu:
mkdir -p ./models
diff --git a/README.md b/README.md
index 77a1472..17d6c3c 100644
--- a/README.md
+++ b/README.md
@@ -28,11 +28,7 @@ pictures, tables and so on. Additionally, it determines the correct order of the
## Quick Start
Start the service:
- # With GPU support
make start
-
- # Without GPU support [if you do not have a GPU on your system]
- make start_no_gpu
Get the segments from a PDF:
@@ -49,7 +45,9 @@ To stop the server:
- [Models](#models)
- [Data](#data)
- [Usage](#usage)
-- [Benchmark](#benchmark)
+- [Benchmarks](#benchmarks)
+ - [Performance](#performance)
+ - [Speed](#speed)
- [Related Services](#related-services)
## Dependencies
@@ -57,8 +55,8 @@ To stop the server:
* For GPU support [install link](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
## Requirements
-* 4 GB RAM memory
-* 6 GB GPU memory (if not, it will run with CPU)
+* 2 GB RAM memory
+* 5 GB GPU memory (if not, it will run on CPU)
## Models
@@ -102,10 +100,14 @@ As we mentioned at the [Quick Start](#quick-start), you can use the service simp
curl -X POST -F 'file=@/PATH/TO/PDF/pdf_name.pdf' localhost:5060
-This command will run the code on visual model. So you should be prepared that it will use lots of resources. But if you
-want to use the not visual models, which are the LightGBM models, you can use this command:
+This command runs the visual model, so be prepared for it to use a lot of resources. Also, please note
+that if you do not have a GPU in your system, or if you do not have enough free GPU memory, the visual model will run on the CPU.
+You should expect a long response time in that case (see the [speed benchmark](#speed) for more details).
- curl -X POST -F 'file=@/PATH/TO/PDF/pdf_name.pdf' localhost:5060/fast
+
+If you want to use the non-visual models, which are the LightGBM models, you can use this command:
+
+ curl -X POST -F 'file=@/PATH/TO/PDF/pdf_name.pdf' -F "fast=true" localhost:5060
The shape of the response will be the same in both of these commands.
@@ -146,7 +148,9 @@ we process them after sorting all segments with content. To determine their read
using distance as a criterion.
-## Benchmark
+## Benchmarks
+
+### Performance
These are the benchmark results for VGT model on PubLayNet dataset:
@@ -171,6 +175,37 @@ These are the benchmark results for VGT model on PubLayNet dataset:
You can check this link to see the comparison with the other models: https://paperswithcode.com/sota/document-layout-analysis-on-publaynet-val
+### Speed
+
+For a 15-page academic paper:
+
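+<table>
+  <tr>
+    <th>Model</th>
+    <th>GPU</th>
+    <th>Speed (seconds per page)</th>
+  </tr>
+  <tr>
+    <td>Fast Model</td>
+    <td>✗ [i7-8700 3.2GHz]</td>
+    <td>0.42</td>
+  </tr>
+  <tr>
+    <td>VGT</td>
+    <td>✓ [GTX 1070]</td>
+    <td>1.75</td>
+  </tr>
+  <tr>
+    <td>VGT</td>
+    <td>✗ [i7-8700 3.2GHz]</td>
+    <td>13.5</td>
+  </tr>
+</table>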
+
+
+
+
+
## Related Services
Here are some of our other services that is built upon this service:
diff --git a/requirements.txt b/requirements.txt
index ce5f192..27b2b19 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -17,4 +17,6 @@ pdf2image==1.17.0
lxml==5.2.2
lightgbm==4.4.0
huggingface_hub==0.23.4
-setuptools==70.2.0
\ No newline at end of file
+setuptools==70.2.0
+roman~=4.2
+hydra-core==1.3.2
\ No newline at end of file
diff --git a/src/app.py b/src/app.py
index cd1d877..02f3a70 100755
--- a/src/app.py
+++ b/src/app.py
@@ -3,13 +3,14 @@
from os.path import join
from pathlib import Path
import torch
-from fastapi import FastAPI, UploadFile, File
+from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import PlainTextResponse
from catch_exceptions import catch_exceptions
from configuration import service_logger, XMLS_PATH
from pdf_layout_analysis.run_pdf_layout_analysis import analyze_pdf
from pdf_layout_analysis.run_pdf_layout_analysis_fast import analyze_pdf_fast
+from toc.extract_table_of_contents import extract_table_of_contents
service_logger.info(f"Is PyTorch using GPU: {torch.cuda.is_available()}")
@@ -23,9 +24,9 @@ async def info():
@app.post("/")
@catch_exceptions
-async def run(file: UploadFile = File(...)):
+async def run(file: UploadFile = File(...), fast: bool = Form(False)):
service_logger.info(f"Processing file: {file.filename}")
- return analyze_pdf(file.file.read())
+ return analyze_pdf_fast(file.file.read()) if fast else analyze_pdf(file.file.read())
@app.post("/save_xml/{xml_file_name}")
@@ -45,7 +46,10 @@ async def get_xml(xml_file_name: str):
return content
-@app.post("/fast")
+@app.post("/toc")
@catch_exceptions
-async def run_fast(file: UploadFile = File(...)):
- return analyze_pdf_fast(file.file.read())
+async def get_toc(file: UploadFile = File(...), fast: bool = Form(False)):
+    """Return the table of contents of the uploaded PDF.
+
+    The segments fed to the extractor come from the LightGBM pipeline when
+    `fast` is set, and from the visual model otherwise.
+    """
+    pdf_bytes = file.file.read()
+    analyze = analyze_pdf_fast if fast else analyze_pdf
+    return extract_table_of_contents(pdf_bytes, analyze(pdf_bytes))
diff --git a/src/data_model/SegmentBox.py b/src/data_model/SegmentBox.py
index 961b73f..5929768 100644
--- a/src/data_model/SegmentBox.py
+++ b/src/data_model/SegmentBox.py
@@ -1,11 +1,8 @@
from fast_trainer.PdfSegment import PdfSegment
from pdf_features.PdfPage import PdfPage
-from pdf_features.Rectangle import Rectangle
from pdf_token_type_labels.TokenType import TokenType
from pydantic import BaseModel
-from configuration import DOCLAYNET_TYPE_BY_ID
-
class SegmentBox(BaseModel):
left: float
diff --git a/src/pdf_token_type_labels/TokenType.py b/src/pdf_token_type_labels/TokenType.py
index 2ef70f4..e8e324f 100644
--- a/src/pdf_token_type_labels/TokenType.py
+++ b/src/pdf_token_type_labels/TokenType.py
@@ -28,5 +28,12 @@ def from_index(index: int):
except IndexError:
return TokenType.TEXT.name.lower()
+    @staticmethod
+    def from_value(value: str):
+        """Return the member whose `value` equals *value*; fall back to `TokenType.TEXT` when none does."""
+        matching_members = (member for member in TokenType if member.value == value)
+        return next(matching_members, TokenType.TEXT)
+
def get_index(self) -> int:
return list(TokenType).index(self)
diff --git a/src/test_end_to_end.py b/src/test_end_to_end.py
index 30a0be8..df22a0d 100644
--- a/src/test_end_to_end.py
+++ b/src/test_end_to_end.py
@@ -62,16 +62,18 @@ def test_regular_pdf(self):
def test_error_file_fast(self):
with open(f"{ROOT_PATH}/test_pdfs/error.pdf", "rb") as stream:
files = {"file": stream}
+ data = {"fast": "True"}
- results = requests.post(f"{self.service_url}/fast", files=files)
+ results = requests.post(f"{self.service_url}", files=files, data=data)
self.assertEqual(422, results.status_code)
def test_blank_pdf_fast(self):
with open(f"{ROOT_PATH}/test_pdfs/blank.pdf", "rb") as stream:
files = {"file": stream}
+ data = {"fast": "True"}
- results = requests.post(f"{self.service_url}/fast", files=files)
+ results = requests.post(f"{self.service_url}", files=files, data=data)
self.assertEqual(200, results.status_code)
self.assertEqual(0, len(results.json()))
@@ -79,8 +81,9 @@ def test_blank_pdf_fast(self):
def test_segmentation_some_empty_pages_fast(self):
with open(f"{ROOT_PATH}/test_pdfs/some_empty_pages.pdf", "rb") as stream:
files = {"file": stream}
+ data = {"fast": "True"}
- results = requests.post(f"{self.service_url}/fast", files=files)
+ results = requests.post(f"{self.service_url}", files=files, data=data)
self.assertEqual(200, results.status_code)
self.assertEqual(2, len(results.json()))
@@ -88,8 +91,9 @@ def test_segmentation_some_empty_pages_fast(self):
def test_image_pdfs_fast(self):
with open(f"{ROOT_PATH}/test_pdfs/image.pdf", "rb") as stream:
files = {"file": stream}
+ data = {"fast": "True"}
- results = requests.post(f"{self.service_url}/fast", files=files)
+ results = requests.post(f"{self.service_url}", files=files, data=data)
self.assertEqual(200, results.status_code)
self.assertEqual(0, len(results.json()))
@@ -97,7 +101,8 @@ def test_image_pdfs_fast(self):
def test_regular_pdf_fast(self):
with open(f"{ROOT_PATH}/test_pdfs/regular.pdf", "rb") as stream:
files = {"file": stream}
- results = requests.post(f"{self.service_url}/fast", files=files)
+ data = {"fast": "True"}
+ results = requests.post(f"{self.service_url}", files=files, data=data)
results_dict = results.json()
expected_content = "RESOLUCIÓN DE LA CORTE INTERAMERICANA DE DERECHOS HUMANOS"
self.assertEqual(200, results.status_code)
@@ -130,15 +135,17 @@ def test_chinese(self):
def test_korean_fast(self):
with open(f"{ROOT_PATH}/test_pdfs/korean.pdf", "rb") as stream:
files = {"file": stream}
+ data = {"fast": "True"}
- results = requests.post(f"{self.service_url}/fast", files=files)
+ results = requests.post(f"{self.service_url}", files=files, data=data)
self.assertEqual(200, results.status_code)
def test_chinese_fast(self):
with open(f"{ROOT_PATH}/test_pdfs/chinese.pdf", "rb") as stream:
files = {"file": stream}
+ data = {"fast": "True"}
- results = requests.post(f"{self.service_url}/fast", files=files)
+ results = requests.post(f"{self.service_url}", files=files, data=data)
self.assertEqual(200, results.status_code)
diff --git a/src/toc/MergeTwoSegmentsTitles.py b/src/toc/MergeTwoSegmentsTitles.py
new file mode 100644
index 0000000..750f46e
--- /dev/null
+++ b/src/toc/MergeTwoSegmentsTitles.py
@@ -0,0 +1,48 @@
+from toc.TitleFeatures import TitleFeatures
+from toc.PdfSegmentation import PdfSegmentation
+
+
+class MergeTwoSegmentsTitles:
+    """Merge consecutive title segments that look like fragments of a single title.
+
+    The title features are built from `pdf_segmentation` and merged inside the
+    constructor; the outcome is available in `titles_merged`.
+    """
+
+    def __init__(self, pdf_segmentation: PdfSegmentation):
+        self.title_features_list: list[TitleFeatures] = TitleFeatures.from_pdf_segmentation(pdf_segmentation)
+        self.titles_merged: list[TitleFeatures] = list()
+        self.merge()
+
+    def merge(self):
+        """Fill `titles_merged`, joining each title with its successor when `should_merge` allows it.
+
+        A merged title takes the place of its successor in `title_features_list`,
+        so it can in turn be merged with the title that follows.
+        """
+        titles = self.title_features_list
+        last_index = len(titles) - 1
+        index = 0
+        while index <= last_index:
+            current = titles[index]
+            if index == last_index or not self.should_merge(current, titles[index + 1]):
+                self.titles_merged.append(current)
+            else:
+                # `append` concatenates "self" first and "other" second, so the earlier
+                # title must be the receiver to keep tokens and text in reading order.
+                titles[index + 1] = current.append(titles[index + 1])
+            index += 1
+
+    @staticmethod
+    def should_merge(title: TitleFeatures, other_title: TitleFeatures):
+        """Tell whether `other_title` (the following segment) continues `title`.
+
+        Returns:
+            True only when every check below passes, False otherwise.
+        """
+        if other_title.pdf_segment.page_number != title.pdf_segment.page_number:
+            return False
+
+        # The following segment must start close to where this one ends vertically.
+        if abs(other_title.top - title.bottom) > 15:
+            return False
+
+        # NOTE(review): both horizontal distances must be within 15 for the merge to go on,
+        # which rejects any pair of segments wider than that - confirm this is intended.
+        if abs(other_title.left - title.right) > 15 or abs(other_title.right - title.left) > 15:
+            return False
+
+        # Two segments that both start with a roman/decimal numbering prefix (types 1-3)
+        # are treated as separate entries.
+        numbering_types = [1, 2, 3]
+        if title.first_characters_type in numbering_types and other_title.first_characters_type in numbering_types:
+            return False
+
+        if title.bullet_points_type and other_title.bullet_points_type:
+            return False
+
+        return title.get_features_to_merge() == other_title.get_features_to_merge()
diff --git a/src/toc/PdfSegmentation.py b/src/toc/PdfSegmentation.py
new file mode 100644
index 0000000..0af1268
--- /dev/null
+++ b/src/toc/PdfSegmentation.py
@@ -0,0 +1,32 @@
+from fast_trainer.PdfSegment import PdfSegment
+from pdf_features.PdfFeatures import PdfFeatures
+from pdf_features.PdfToken import PdfToken
+
+
+class PdfSegmentation:
+    """Associate the tokens of a PDF with the segments they fall into."""
+
+    def __init__(self, pdf_features: PdfFeatures, pdf_segments: list[PdfSegment]):
+        self.pdf_features: PdfFeatures = pdf_features
+        self.pdf_segments: list[PdfSegment] = pdf_segments
+        self.tokens_by_segments: dict[PdfSegment, list[PdfToken]] = self.find_tokens_by_segments()
+
+    @staticmethod
+    def find_segment_for_token(token: PdfToken, segments: list[PdfSegment], tokens_by_segments):
+        """Record `token` under the segment whose box it intersects the most.
+
+        Nothing is recorded when no segment has a positive intersection percentage.
+        The search stops early once a segment reaches 99 percent or more.
+        """
+        winner: PdfSegment | None = None
+        highest: float = 0
+        for candidate in segments:
+            overlap = token.bounding_box.get_intersection_percentage(candidate.bounding_box)
+            if overlap <= highest:
+                continue
+            winner, highest = candidate, overlap
+            if highest >= 99:
+                break
+
+        if winner:
+            tokens_by_segments.setdefault(winner, list()).append(token)
+
+    def find_tokens_by_segments(self):
+        """Return a dict mapping each segment to the tokens assigned to it, page by page."""
+        assignment: dict[PdfSegment, list[PdfToken]] = {}
+        for page in self.pdf_features.pages:
+            # Only segments on the token's own page are candidates.
+            candidates = [item for item in self.pdf_segments if item.page_number == page.page_number]
+            for token in page.tokens:
+                self.find_segment_for_token(token, candidates, assignment)
+        return assignment
diff --git a/src/toc/TOCExtractor.py b/src/toc/TOCExtractor.py
new file mode 100644
index 0000000..5e5f9f6
--- /dev/null
+++ b/src/toc/TOCExtractor.py
@@ -0,0 +1,67 @@
+from toc.MergeTwoSegmentsTitles import MergeTwoSegmentsTitles
+from toc.TitleFeatures import TitleFeatures
+from toc.data.TOCItem import TOCItem
+from toc.PdfSegmentation import PdfSegmentation
+
+
+class TOCExtractor:
+    """Turn the (merged) title segments of a `PdfSegmentation` into an indented table of contents."""
+
+    def __init__(self, pdf_segmentation: PdfSegmentation):
+        self.pdf_segmentation = pdf_segmentation
+        self.titles_features_sorted = MergeTwoSegmentsTitles(self.pdf_segmentation).titles_merged
+        self.toc: list[TOCItem] = list()
+        self.set_toc()
+
+    def set_toc(self):
+        """Append one TOC item per title, each with the indentation computed for it."""
+        for position, title in enumerate(self.titles_features_sorted):
+            level = self.get_indentation(position, title)
+            self.toc.append(title.to_toc_item(level))
+
+    def __str__(self):
+        lines = []
+        for item in self.toc:
+            lines.append(f'{" " * item.indentation} * {item.label}')
+        return "\n".join(lines)
+
+    def get_indentation(self, title_index: int, title_features: TitleFeatures):
+        """Return the indentation level for the title at `title_index`.
+
+        Earlier entries are scanned from the nearest to the farthest, skipping
+        closed ones; the first entry judged to be at the same level gives its
+        indentation (and closes every deeper entry). Without such an entry the
+        title is nested one level below the previous one.
+        """
+        if title_index == 0:
+            return 0
+
+        for earlier in range(title_index - 1, -1, -1):
+            candidate = self.toc[earlier]
+            if candidate.point_closed:
+                continue
+            if not self.same_indentation(self.titles_features_sorted[earlier], title_features):
+                continue
+            self.close_toc_items(candidate.indentation)
+            return candidate.indentation
+
+        return self.toc[title_index - 1].indentation + 1
+
+    def close_toc_items(self, indentation):
+        """Mark every entry deeper than `indentation` as closed."""
+        for item in self.toc:
+            if item.indentation > indentation:
+                item.point_closed = True
+
+    @staticmethod
+    def same_indentation(previous_title_features: TitleFeatures, title_features: TitleFeatures):
+        """Tell whether the two titles belong to the same level.
+
+        That is the case when the previous title starts with a numbering that can
+        precede the current one, or when both share the same TOC features.
+        """
+        follows_previous = previous_title_features.first_characters in title_features.get_possible_previous_point()
+        return follows_previous or previous_title_features.get_features_toc() == title_features.get_features_toc()
+
+    def to_dict(self):
+        """Return the TOC as a list of plain dicts with `indentation`, `label` and `bounding_box` keys."""
+        entries = list()
+        for item in self.toc:
+            entries.append(
+                {
+                    "indentation": item.indentation,
+                    "label": item.label,
+                    "bounding_box": {
+                        "left": int(item.selection_rectangle.left),
+                        "top": int(item.selection_rectangle.top),
+                        "width": int(item.selection_rectangle.width),
+                        "height": int(item.selection_rectangle.height),
+                        "page": str(item.selection_rectangle.page_number),
+                    },
+                }
+            )
+        return entries
diff --git a/src/toc/TitleFeatures.py b/src/toc/TitleFeatures.py
new file mode 100755
index 0000000..9769183
--- /dev/null
+++ b/src/toc/TitleFeatures.py
@@ -0,0 +1,171 @@
+import string
+import roman
+import numpy as np
+from fast_trainer.PdfSegment import PdfSegment
+from pdf_features.PdfToken import PdfToken
+from pdf_features.Rectangle import Rectangle
+from data_model.SegmentBox import SegmentBox
+from toc.data.TOCItem import TOCItem
+from toc.methods.two_models_v3_segments_context_2.Modes import Modes
+from toc.PdfSegmentation import PdfSegmentation
+
+
+class TitleFeatures:
+    """Text, font and position features of one title segment.
+
+    `segment_tokens` must contain at least one token: the font features are
+    read from the first token and averaged over all of them.
+    """
+
+    SPECIAL_MARKERS = [".", "(", ")", "\\", "/", ":", ";", "-", "_", "[", "]", "•", "◦", "*", ","]
+    ALPHABET = list(string.ascii_lowercase)
+    ALPHABET_UPPERCASE = list(string.ascii_uppercase)
+    ROMAN_NUMBERS = [roman.toRoman(i) for i in range(1, 151)]
+    ROMAN_NUMBERS_LOWERCASE = [x.lower() for x in ROMAN_NUMBERS]
+    BULLET_POINTS = [ALPHABET, ALPHABET_UPPERCASE, ROMAN_NUMBERS, ROMAN_NUMBERS_LOWERCASE]
+
+    def __init__(self, pdf_segment: PdfSegment, segment_tokens: list[PdfToken], pdf_features, modes: Modes):
+        self.modes = modes
+        self.pdf_segment = pdf_segment
+        self.pdf_features = pdf_features
+
+        self.segment_tokens: list[PdfToken] = segment_tokens
+        self.first_characters: str = ""
+        self.first_characters_special_markers_count: int = 0
+        self.font_size: float = 0.0
+        self.text_content: str = ""
+        self.width: float = 0
+        self.font_family: str = ""
+        self.font_color: str = ""
+        self.line_height: float = 0.0
+        self.uppercase: bool = False
+        # Fraction of the tokens that are bold / italic once the font properties are processed.
+        self.bold: float = 0.0
+        self.italics: float = 0.0
+        self.first_characters_type = 0
+        self.bullet_points_type = 0
+        self.text_centered: int = 0
+        self.is_left: bool = False
+        self.indentation: int = -1
+        self.left: int = self.pdf_segment.bounding_box.left
+        self.top: int = self.pdf_segment.bounding_box.top
+        self.right: int = self.pdf_segment.bounding_box.right
+        self.bottom: int = self.pdf_segment.bounding_box.bottom
+
+        self.initialize_text_properties()
+        self.process_first_characters()
+        self.process_font_properties()
+        self.process_positional_properties()
+
+    def initialize_text_properties(self):
+        """Set `text_content` to the token contents joined by single spaces."""
+        self.text_content = " ".join(token.content for token in self.segment_tokens)
+
+    def process_first_characters(self):
+        """Classify the first word of the title (its numbering/bullet prefix).
+
+        Sets:
+            first_characters: text up to the first space, newline or tab.
+            first_characters_type: 1 upper-case roman letters, 2 lower-case roman
+                letters, 3 digits, 4 upper case, 0 otherwise. The first matching
+                type wins, and a prefix made only of special markers (or an empty
+                one) matches type 1.
+            bullet_points_type: 1-based position of the last character in
+                `SPECIAL_MARKERS`, or 0 when it is not a special marker.
+            first_characters_special_markers_count: special markers found before
+                the last character.
+        """
+        self.first_characters = self.text_content.split(" ")[0].split("\n")[0].split("\t")[0]
+        clean_first_characters = [x for x in self.first_characters if x not in self.SPECIAL_MARKERS]
+        characters_checker = {
+            1: lambda letter: letter in "IVXL",
+            2: lambda letter: letter in "IVXL".lower(),
+            3: lambda letter: letter in "1234567890",
+            4: lambda letter: letter == letter.upper(),
+        }
+
+        self.first_characters_type = next(
+            (
+                index
+                for index, matches in characters_checker.items()
+                if all(matches(letter) for letter in clean_first_characters)
+            ),
+            0,
+        )
+
+        # Slicing (instead of indexing) keeps this safe when the text is empty or
+        # starts with whitespace, in which case `first_characters` is "".
+        last_character = self.first_characters[-1:]
+        self.bullet_points_type = (
+            self.SPECIAL_MARKERS.index(last_character) + 1 if last_character in self.SPECIAL_MARKERS else 0
+        )
+        self.first_characters_special_markers_count = len(
+            [x for x in self.first_characters[:-1] if x in self.SPECIAL_MARKERS]
+        )
+
+    def process_font_properties(self):
+        """Read font family/color from the first token and average bold, italics and size over all tokens."""
+        first_font = self.segment_tokens[0].font
+        tokens_count = len(self.segment_tokens)
+        self.font_family = first_font.font_id
+        self.font_color = first_font.color
+        self.bold = sum(token.font.bold for token in self.segment_tokens) / tokens_count
+        self.italics = sum(token.font.italics for token in self.segment_tokens) / tokens_count
+        self.uppercase = self.text_content.upper() == self.text_content
+        font_sizes = [token.font.font_size for token in self.segment_tokens]
+        self.font_size = np.mean(font_sizes)
+
+    def process_positional_properties(self):
+        """Derive centering, side and indentation from the segment box and the page width."""
+        self.line_height = self.segment_tokens[0].font.font_size
+        page_width = self.pdf_features.pages[self.pdf_segment.page_number - 1].page_width
+        # Centered when the left and right margins differ by less than 10.
+        self.text_centered = 1 if abs(self.left - (page_width - self.right)) < 10 else 0
+        self.is_left = self.left < page_width - self.right if not self.text_centered else False
+        # Indentation is counted in steps of 15 from the most common left position of the document.
+        self.indentation = int((self.left - self.modes.left_space_mode) / 15) if self.is_left else -1
+
+    def get_features_to_merge(self) -> tuple[int, int]:
+        """Return the (bold, italics) flags that two segments must share to be merged."""
+        return (
+            1 if self.bold else 0,
+            1 if self.italics else 0,
+        )
+
+    def get_features_toc(self) -> tuple[int, int, int, int, int]:
+        """Return the features that two titles must share to sit at the same TOC level."""
+        return (
+            1 if self.bold else 0,
+            1 if self.italics else 0,
+            self.first_characters_type,
+            self.first_characters_special_markers_count,
+            self.bullet_points_type,
+        )
+
+    def get_possible_previous_point(self) -> list[str]:
+        """Return the numbering prefixes that could come right before this title's prefix.
+
+        The prefix is split into a head, its last numbering part and the trailing
+        special markers, e.g. "1.2." gives head "1.", last part "2" and trailing ".",
+        so the result is ["1.1."]. When the last part has no predecessor and the
+        prefix has four characters or more, the prefix itself is returned.
+        """
+        previous_characters = self.first_characters
+        final_special_markers = ""
+        last_part = ""
+        for letter in list(reversed(previous_characters)):
+            # Trailing special markers are set aside until the last part starts.
+            if not last_part and letter in self.SPECIAL_MARKERS:
+                final_special_markers = previous_characters[-1] + final_special_markers
+                previous_characters = previous_characters[:-1]
+                continue
+
+            # A special marker after the last part has started closes that part.
+            if last_part and letter in self.SPECIAL_MARKERS:
+                break
+
+            last_part = letter + last_part
+            previous_characters = previous_characters[:-1]
+
+        previous_items = self.get_previous_items(last_part)
+
+        if not previous_items and len(self.first_characters) >= 4:
+            return [self.first_characters]
+
+        return [previous_characters + x + final_special_markers for x in previous_items]
+
+    def get_previous_items(self, item: str):
+        """Return the values that precede `item` in the known bullet sequences and in the integers."""
+        previous_items = []
+
+        for bullet_points in self.BULLET_POINTS:
+            if item not in bullet_points:
+                continue
+            position = bullet_points.index(item)
+            # The first element of a sequence has no predecessor.
+            if position:
+                previous_items.append(bullet_points[position - 1])
+
+        # `isdecimal` (unlike `isnumeric`) only accepts strings that `int` can parse;
+        # "²" or "½" are numeric but would make `int` raise ValueError.
+        if item.isdecimal():
+            previous_items.append(str(int(item) - 1))
+
+        return previous_items
+
+    @staticmethod
+    def from_pdf_segmentation(pdf_segmentation: PdfSegmentation) -> list["TitleFeatures"]:
+        """Build the features of every segment of `pdf_segmentation` that has tokens.
+
+        Segments without any token are skipped: they have no text or font to
+        describe and are absent from `tokens_by_segments`.
+        """
+        titles_features = list()
+        modes = Modes(pdf_features=pdf_segmentation.pdf_features)
+        for pdf_segment in pdf_segmentation.pdf_segments:
+            segment_tokens = pdf_segmentation.tokens_by_segments.get(pdf_segment)
+            if not segment_tokens:
+                continue
+            titles_features.append(TitleFeatures(pdf_segment, segment_tokens, pdf_segmentation.pdf_features, modes))
+
+        return titles_features
+
+    def to_toc_item(self, indentation):
+        """Return the TOC item for this title at the given `indentation`."""
+        return TOCItem(
+            indentation=indentation,
+            label=self.text_content,
+            selection_rectangle=SegmentBox.from_pdf_segment(self.pdf_segment, self.pdf_features.pages),
+        )
+
+    def append(self, other_title_features: "TitleFeatures"):
+        """Return new features for this title followed by `other_title_features`.
+
+        The page number and segment type come from this title; boxes are merged
+        and text/tokens are concatenated with this title first.
+        """
+        other_segment = other_title_features.pdf_segment
+        merged_bounding_box = Rectangle.merge_rectangles([self.pdf_segment.bounding_box, other_segment.bounding_box])
+        merged_content = self.pdf_segment.text_content + other_segment.text_content
+        merged_segment = PdfSegment(
+            self.pdf_segment.page_number, merged_bounding_box, merged_content, self.pdf_segment.segment_type
+        )
+        segment_tokens = self.segment_tokens + other_title_features.segment_tokens
+        return TitleFeatures(merged_segment, segment_tokens, pdf_features=self.pdf_features, modes=self.modes)
diff --git a/src/toc/data/TOCItem.py b/src/toc/data/TOCItem.py
new file mode 100644
index 0000000..9d141d3
--- /dev/null
+++ b/src/toc/data/TOCItem.py
@@ -0,0 +1,10 @@
+from pydantic import BaseModel
+
+from data_model.SegmentBox import SegmentBox
+
+
+class TOCItem(BaseModel):
+    """One entry of the extracted table of contents."""
+
+    # Nesting level of the entry, as computed by TOCExtractor.get_indentation.
+    indentation: int
+    # Text of the title (TitleFeatures.text_content).
+    label: str = ""
+    # Box built from the title's segment with SegmentBox.from_pdf_segment.
+    selection_rectangle: SegmentBox
+    # Set by TOCExtractor.close_toc_items; closed entries are skipped when
+    # looking for an earlier entry at the same level.
+    point_closed: bool = False
diff --git a/src/toc/extract_table_of_contents.py b/src/toc/extract_table_of_contents.py
new file mode 100644
index 0000000..f097d31
--- /dev/null
+++ b/src/toc/extract_table_of_contents.py
@@ -0,0 +1,73 @@
+import tempfile
+import uuid
+from os.path import join
+from pathlib import Path
+from typing import AnyStr
+from fast_trainer.PdfSegment import PdfSegment
+from pdf_features.PdfFeatures import PdfFeatures
+from pdf_features.Rectangle import Rectangle
+from pdf_token_type_labels.TokenType import TokenType
+from toc.TOCExtractor import TOCExtractor
+from configuration import service_logger
+from toc.PdfSegmentation import PdfSegmentation
+
+TITLE_TYPES = {TokenType.TITLE, TokenType.SECTION_HEADER}
+SKIP_TYPES = {TokenType.TITLE, TokenType.SECTION_HEADER, TokenType.PAGE_HEADER, TokenType.PICTURE}
+
+
+def get_file_path(file_name, extension):
+    """Return the path of `<file_name>.<extension>` inside the system temporary directory."""
+    full_name = f"{file_name}.{extension}"
+    return join(tempfile.gettempdir(), full_name)
+
+
+def pdf_content_to_pdf_path(file_content):
+    """Write `file_content` to a uniquely named `.pdf` file in the temporary directory and return its path."""
+    target = Path(get_file_path(str(uuid.uuid1()), "pdf"))
+    target.write_bytes(file_content)
+    return target
+
+
+def skip_name_of_the_document(pdf_segments: list[PdfSegment], title_segments: list[PdfSegment]):
+    """Remove from `title_segments` the leading titles that make up the name of the document.
+
+    Segments are read in order until one has a type outside `SKIP_TYPES`, or
+    until a title ends to the left of the first two thirds of the previous
+    title. Page headers and pictures found on the way are ignored.
+    `title_segments` is modified in place.
+    """
+    ignored_types = (TokenType.PAGE_HEADER, TokenType.PICTURE)
+    document_name_segments = []
+    previous = None
+    for segment in pdf_segments:
+        if segment.segment_type not in SKIP_TYPES:
+            break
+        if segment.segment_type in ignored_types:
+            continue
+
+        if previous:
+            threshold = previous.bounding_box.left + previous.bounding_box.width * 0.66
+            if segment.bounding_box.right < threshold:
+                break
+        previous = segment
+
+        if segment.segment_type in TITLE_TYPES:
+            document_name_segments.append(segment)
+
+    for segment in document_name_segments:
+        title_segments.remove(segment)
+
+
+def get_pdf_segments_from_segment_boxes(pdf_features: PdfFeatures, segment_boxes: list[dict]) -> list[PdfSegment]:
+    """Convert segment-box dicts into `PdfSegment` objects.
+
+    Each dict is read through its `left`, `top`, `width`, `height`, `type`,
+    `page_number` and `text` keys.
+    """
+
+    def to_pdf_segment(box: dict) -> PdfSegment:
+        rectangle = Rectangle.from_width_height(box["left"], box["top"], box["width"], box["height"])
+        token_type = TokenType.from_value(box["type"])
+        document_name = pdf_features.file_name
+        return PdfSegment(box["page_number"], rectangle, box["text"], token_type, document_name)
+
+    return [to_pdf_segment(box) for box in segment_boxes]
+
+
+def extract_table_of_contents(file: AnyStr, segment_boxes: list[dict], skip_document_name=False):
+    """Build the table of contents of a PDF from its already computed segments.
+
+    Args:
+        file: Raw content of the PDF.
+        segment_boxes: Segments of the PDF as dicts (see `get_pdf_segments_from_segment_boxes`).
+        skip_document_name: When true, the leading titles that form the name of
+            the document are left out of the table of contents.
+
+    Returns:
+        The list of dicts produced by `TOCExtractor.to_dict`.
+    """
+    service_logger.info("Getting TOC")
+    pdf_path = pdf_content_to_pdf_path(file)
+    try:
+        pdf_features: PdfFeatures = PdfFeatures.from_pdf_path(pdf_path)
+        pdf_segments: list[PdfSegment] = get_pdf_segments_from_segment_boxes(pdf_features, segment_boxes)
+        title_segments = [segment for segment in pdf_segments if segment.segment_type in TITLE_TYPES]
+        if skip_document_name:
+            skip_name_of_the_document(pdf_segments, title_segments)
+        pdf_segmentation: PdfSegmentation = PdfSegmentation(pdf_features, title_segments)
+        toc_instance: TOCExtractor = TOCExtractor(pdf_segmentation)
+        return toc_instance.to_dict()
+    finally:
+        # The PDF copy only exists for this call; without this, every request
+        # would leave one more file behind in the temporary directory.
+        pdf_path.unlink(missing_ok=True)
diff --git a/src/toc/methods/two_models_v3_segments_context_2/Modes.py b/src/toc/methods/two_models_v3_segments_context_2/Modes.py
new file mode 100644
index 0000000..dc83c4f
--- /dev/null
+++ b/src/toc/methods/two_models_v3_segments_context_2/Modes.py
@@ -0,0 +1,44 @@
+import dataclasses
+import hashlib
+from statistics import mode
+
+from pdf_features.PdfFeatures import PdfFeatures
+
+
+@dataclasses.dataclass
+class Modes:
+    """Most frequent layout and font values among the tokens of a PDF.
+
+    Every value is computed in the constructor from `pdf_features`.
+    """
+
+    # Most common bottom coordinate of the tokens.
+    lines_space_mode: float
+    # Most common left coordinate of the tokens.
+    left_space_mode: float
+    # Most common distance between the right edge of a token and the right edge of its page.
+    right_space_mode: float
+    # Most common font size (0 when no token has a font).
+    font_size_mode: float
+    # Most common font id ("" when no token has a font).
+    font_family_name_mode: str
+    # Integer made of the first 8 hex digits of the SHA-256 of `font_family_name_mode`.
+    font_family_mode: int
+    # `font_family_mode` with a decimal point after its first digit.
+    font_family_mode_normalized: float
+    pdf_features: PdfFeatures
+
+    def __init__(self, pdf_features: PdfFeatures):
+        self.pdf_features = pdf_features
+        self.set_modes()
+
+    def set_modes(self):
+        """Compute every mode in a single pass over the tokens."""
+        # The position lists start with a 0 so that `mode` never receives an empty list.
+        line_spaces, right_spaces, left_spaces = [0], [0], [0]
+        font_sizes, font_ids = [], []
+        for page, token in self.pdf_features.loop_tokens():
+            # Measure the right margin against the page the token is on; pages
+            # of one document may have different widths.
+            right_spaces.append(page.page_width - token.bounding_box.right)
+            left_spaces.append(token.bounding_box.left)
+            line_spaces.append(token.bounding_box.bottom)
+            if token.font:
+                font_sizes.append(token.font.font_size)
+                font_ids.append(token.font.font_id)
+
+        self.lines_space_mode = mode(line_spaces)
+        self.left_space_mode = mode(left_spaces)
+        self.right_space_mode = mode(right_spaces)
+
+        self.font_size_mode = mode(font_sizes) if font_sizes else 0
+        self.font_family_name_mode = mode(font_ids) if font_ids else ""
+
+        name_digest = hashlib.sha256(self.font_family_name_mode.encode("utf-8")).hexdigest()
+        self.font_family_mode = int(name_digest[:8], 16)
+        digits = str(self.font_family_mode)
+        self.font_family_mode_normalized = float(f"{digits[0]}.{digits[1:]}")