Merge pull request #37 from huridocs/toc
Toc
ali6parmak authored Jul 12, 2024
2 parents 971ae2b + 4ee44b2 commit 4624511
Showing 16 changed files with 539 additions and 30 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/test.yml
@@ -24,6 +24,9 @@ jobs:
- name: Free up space
run: make free_up_space

- name: Install pdftohtml
run: sudo apt-get install -y pdftohtml

- name: Install venv
run: make install_venv

4 changes: 2 additions & 2 deletions Dockerfile
@@ -19,8 +19,8 @@ RUN pip install --upgrade pip
RUN pip --default-timeout=1000 install -r requirements.txt

WORKDIR /app
RUN cd src; git clone https://github.com/facebookresearch/detectron2
RUN cd src/detectron2; python setup.py build develop
RUN cd src; git clone https://github.com/facebookresearch/detectron2;
RUN cd src/detectron2; git checkout 70f454304e1a38378200459dd2dbca0f0f4a5ab4; python setup.py build develop

COPY ./src/. ./src
COPY ./models/. ./models/
9 changes: 9 additions & 0 deletions Makefile
@@ -1,3 +1,5 @@
HAS_GPU := $(shell command -v nvidia-smi > /dev/null && echo 1 || echo 0)

install:
. .venv/bin/activate; pip install -Ur requirements.txt

@@ -23,7 +25,14 @@ remove_docker_images:

start:
mkdir -p ./models
ifeq ($(HAS_GPU), 1)
@echo "NVIDIA GPU detected, using docker-compose-gpu.yml"
docker compose -f docker-compose-gpu.yml up --build
else
@echo "No NVIDIA GPU detected, using docker-compose.yml"
docker compose -f docker-compose.yml up --build
endif


start_no_gpu:
mkdir -p ./models
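
The check the Makefile performs is just "is `nvidia-smi` on the PATH"; for reference, a Python equivalent, as an illustrative sketch only (not part of this commit):

```python
import shutil

# Mirrors `command -v nvidia-smi` in the Makefile: if the NVIDIA CLI is on
# PATH, assume a usable GPU and pick the GPU compose file.
has_gpu = shutil.which("nvidia-smi") is not None
compose_file = "docker-compose-gpu.yml" if has_gpu else "docker-compose.yml"
print(f"Using {compose_file}")
```
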
57 changes: 46 additions & 11 deletions README.md
@@ -28,11 +28,7 @@ pictures, tables and so on. Additionally, it determines the correct order of the
## Quick Start
Start the service:

# With GPU support
make start

# Without GPU support [if you do not have a GPU on your system]
make start_no_gpu

Get the segments from a PDF:

@@ -49,16 +45,18 @@ To stop the server:
- [Models](#models)
- [Data](#data)
- [Usage](#usage)
- [Benchmark](#benchmark)
- [Benchmarks](#benchmarks)
- [Performance](#performance)
- [Speed](#speed)
- [Related Services](#related-services)

## Dependencies
* Docker Desktop 4.25.0 [install link](https://www.docker.com/products/docker-desktop/)
* For GPU support [install link](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)

## Requirements
* 4 GB of RAM
* 6 GB of GPU memory (if unavailable, it will run on CPU)
* 2 GB of RAM
* 5 GB of GPU memory (if unavailable, it will run on CPU)

## Models

@@ -102,10 +100,14 @@ As we mentioned at the [Quick Start](#quick-start), you can use the service simp

curl -X POST -F 'file=@/PATH/TO/PDF/pdf_name.pdf' localhost:5060

This command will run the code with the visual model, so be prepared for it to use a lot of resources. But if you
want to use the non-visual models, which are the LightGBM models, you can use this command:
This command will run the visual model, so be prepared for it to use a lot of resources. Also, please note
that if you do not have a GPU in your system, or if you do not have enough free GPU memory, the visual model will run on CPU.
Expect a long response time in that case (see the [speed benchmark](#speed) for more details).

curl -X POST -F 'file=@/PATH/TO/PDF/pdf_name.pdf' localhost:5060/fast

If you want to use the non-visual models, which are the LightGBM models, you can use this command:

curl -X POST -F 'file=@/PATH/TO/PDF/pdf_name.pdf' -F "fast=true" localhost:5060

The shape of the response will be the same in both of these commands.
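
The same calls from Python, as a minimal sketch using the `requests` library (the PDF path is a placeholder):

```python
import requests

pdf_path = "/PATH/TO/PDF/pdf_name.pdf"

# Default: the visual (VGT) model.
with open(pdf_path, "rb") as pdf:
    segments = requests.post("http://localhost:5060", files={"file": pdf}).json()

# Fast: the LightGBM models, selected via the `fast` form field.
with open(pdf_path, "rb") as pdf:
    fast_segments = requests.post(
        "http://localhost:5060", files={"file": pdf}, data={"fast": "true"}
    ).json()
```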

@@ -146,7 +148,9 @@ we process them after sorting all segments with content. To determine their reading order, we are
using distance as a criterion.
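
A minimal sketch of that idea, with (x, y) center points standing in for the real segment objects (an illustration, not the repository's implementation):

```python
import math

def insert_by_distance(ordered_segments: list, remaining_segments: list) -> list:
    """Place each remaining segment after the nearest already-ordered one.

    Segments are (x_center, y_center) tuples here; the real code works on
    full bounding boxes.
    """
    if not ordered_segments:
        return list(remaining_segments)
    for segment in remaining_segments:
        nearest = min(
            range(len(ordered_segments)),
            key=lambda i: math.dist(ordered_segments[i], segment),
        )
        ordered_segments.insert(nearest + 1, segment)
    return ordered_segments
```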


## Benchmark
## Benchmarks

### Performance

These are the benchmark results for the VGT model on the PubLayNet dataset:

@@ -171,6 +175,37 @@ These are the benchmark results for the VGT model on the PubLayNet dataset:

You can check this link to see the comparison with other models: https://paperswithcode.com/sota/document-layout-analysis-on-publaynet-val

### Speed

For a 15-page academic paper:

<table>
<tr>
<th>Model</th>
<th>GPU</th>
<th>Speed (seconds per page)</th>
</tr>
<tr>
<td>Fast Model</td>
<td>✗ [i7-8700 3.2GHz]</td>
<td>0.42</td>
</tr>
<tr>
<td>VGT</td>
<td>✓ [GTX 1070]</td>
<td>1.75</td>
</tr>
<tr>
<td>VGT</td>
<td>✗ [i7-8700 3.2GHz]</td>
<td>13.5</td>
</tr>
</table>

## Related Services
Here are some of our other services that are built on top of this service:

4 changes: 3 additions & 1 deletion requirements.txt
@@ -17,4 +17,6 @@ pdf2image==1.17.0
lxml==5.2.2
lightgbm==4.4.0
huggingface_hub==0.23.4
setuptools==70.2.0
setuptools==70.2.0
roman~=4.2
hydra-core==1.3.2
16 changes: 10 additions & 6 deletions src/app.py
@@ -3,13 +3,14 @@
from os.path import join
from pathlib import Path
import torch
from fastapi import FastAPI, UploadFile, File
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import PlainTextResponse

from catch_exceptions import catch_exceptions
from configuration import service_logger, XMLS_PATH
from pdf_layout_analysis.run_pdf_layout_analysis import analyze_pdf
from pdf_layout_analysis.run_pdf_layout_analysis_fast import analyze_pdf_fast
from toc.extract_table_of_contents import extract_table_of_contents

service_logger.info(f"Is PyTorch using GPU: {torch.cuda.is_available()}")

@@ -23,9 +24,9 @@ async def info():

@app.post("/")
@catch_exceptions
async def run(file: UploadFile = File(...)):
async def run(file: UploadFile = File(...), fast: bool = Form(False)):
service_logger.info(f"Processing file: {file.filename}")
return analyze_pdf(file.file.read())
return analyze_pdf_fast(file.file.read()) if fast else analyze_pdf(file.file.read())


@app.post("/save_xml/{xml_file_name}")
@@ -45,7 +46,10 @@ async def get_xml(xml_file_name: str):
return content


@app.post("/fast")
@app.post("/toc")
@catch_exceptions
async def run_fast(file: UploadFile = File(...)):
return analyze_pdf_fast(file.file.read())
async def get_toc(file: UploadFile = File(...), fast: bool = Form(False)):
file_content = file.file.read()
if fast:
return extract_table_of_contents(file_content, analyze_pdf_fast(file_content))
return extract_table_of_contents(file_content, analyze_pdf(file_content))
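
A usage sketch for the new endpoint, mirroring the form field used in the end-to-end tests (the PDF path is a placeholder):

```python
import requests

with open("/PATH/TO/PDF/pdf_name.pdf", "rb") as pdf:
    toc = requests.post(
        "http://localhost:5060/toc",
        files={"file": pdf},
        data={"fast": "true"},  # omit to build the TOC from the visual model
    ).json()
```
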
3 changes: 0 additions & 3 deletions src/data_model/SegmentBox.py
@@ -1,11 +1,8 @@
from fast_trainer.PdfSegment import PdfSegment
from pdf_features.PdfPage import PdfPage
from pdf_features.Rectangle import Rectangle
from pdf_token_type_labels.TokenType import TokenType
from pydantic import BaseModel

from configuration import DOCLAYNET_TYPE_BY_ID


class SegmentBox(BaseModel):
left: float
Expand Down
7 changes: 7 additions & 0 deletions src/pdf_token_type_labels/TokenType.py
@@ -28,5 +28,12 @@ def from_index(index: int):
except IndexError:
return TokenType.TEXT.name.lower()

@staticmethod
def from_value(value: str):
for token_type in TokenType:
if token_type.value == value:
return token_type
return TokenType.TEXT

def get_index(self) -> int:
return list(TokenType).index(self)
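
A small illustration of the lookup helpers (assuming "title" is one of the enum's values; unknown strings fall back to TEXT):

```python
from pdf_token_type_labels.TokenType import TokenType

token_type = TokenType.from_value("title")    # assumed member value
fallback = TokenType.from_value("no-such")    # unknown -> TokenType.TEXT
index = token_type.get_index()                # position within the enum listing
```
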
21 changes: 14 additions & 7 deletions src/test_end_to_end.py
@@ -62,42 +62,47 @@ def test_regular_pdf(self):
def test_error_file_fast(self):
with open(f"{ROOT_PATH}/test_pdfs/error.pdf", "rb") as stream:
files = {"file": stream}
data = {"fast": "True"}

results = requests.post(f"{self.service_url}/fast", files=files)
results = requests.post(f"{self.service_url}", files=files, data=data)

self.assertEqual(422, results.status_code)

def test_blank_pdf_fast(self):
with open(f"{ROOT_PATH}/test_pdfs/blank.pdf", "rb") as stream:
files = {"file": stream}
data = {"fast": "True"}

results = requests.post(f"{self.service_url}/fast", files=files)
results = requests.post(f"{self.service_url}", files=files, data=data)

self.assertEqual(200, results.status_code)
self.assertEqual(0, len(results.json()))

def test_segmentation_some_empty_pages_fast(self):
with open(f"{ROOT_PATH}/test_pdfs/some_empty_pages.pdf", "rb") as stream:
files = {"file": stream}
data = {"fast": "True"}

results = requests.post(f"{self.service_url}/fast", files=files)
results = requests.post(f"{self.service_url}", files=files, data=data)

self.assertEqual(200, results.status_code)
self.assertEqual(2, len(results.json()))

def test_image_pdfs_fast(self):
with open(f"{ROOT_PATH}/test_pdfs/image.pdf", "rb") as stream:
files = {"file": stream}
data = {"fast": "True"}

results = requests.post(f"{self.service_url}/fast", files=files)
results = requests.post(f"{self.service_url}", files=files, data=data)

self.assertEqual(200, results.status_code)
self.assertEqual(0, len(results.json()))

def test_regular_pdf_fast(self):
with open(f"{ROOT_PATH}/test_pdfs/regular.pdf", "rb") as stream:
files = {"file": stream}
results = requests.post(f"{self.service_url}/fast", files=files)
data = {"fast": "True"}
results = requests.post(f"{self.service_url}", files=files, data=data)
results_dict = results.json()
expected_content = "RESOLUCIÓN DE LA CORTE INTERAMERICANA DE DERECHOS HUMANOS"
self.assertEqual(200, results.status_code)
@@ -130,15 +135,17 @@ def test_chinese(self):
def test_korean_fast(self):
with open(f"{ROOT_PATH}/test_pdfs/korean.pdf", "rb") as stream:
files = {"file": stream}
data = {"fast": "True"}

results = requests.post(f"{self.service_url}/fast", files=files)
results = requests.post(f"{self.service_url}", files=files, data=data)

self.assertEqual(200, results.status_code)

def test_chinese_fast(self):
with open(f"{ROOT_PATH}/test_pdfs/chinese.pdf", "rb") as stream:
files = {"file": stream}
data = {"fast": "True"}

results = requests.post(f"{self.service_url}/fast", files=files)
results = requests.post(f"{self.service_url}", files=files, data=data)

self.assertEqual(200, results.status_code)
48 changes: 48 additions & 0 deletions src/toc/MergeTwoSegmentsTitles.py
@@ -0,0 +1,48 @@
from toc.TitleFeatures import TitleFeatures
from toc.PdfSegmentation import PdfSegmentation


class MergeTwoSegmentsTitles:
def __init__(self, pdf_segmentation: PdfSegmentation):
self.title_features_list: list[TitleFeatures] = TitleFeatures.from_pdf_segmentation(pdf_segmentation)
self.titles_merged: list[TitleFeatures] = list()
self.merge()

def merge(self):
index = 0
while index < len(self.title_features_list):
if index == len(self.title_features_list) - 1:
self.titles_merged.append(self.title_features_list[index])
break

if not self.should_merge(self.title_features_list[index], self.title_features_list[index + 1]):
self.titles_merged.append(self.title_features_list[index])
index += 1
continue

self.title_features_list[index + 1] = self.title_features_list[index + 1].append(self.title_features_list[index])
index += 1

@staticmethod
def should_merge(title: TitleFeatures, other_title: TitleFeatures):
same_page = other_title.pdf_segment.page_number == title.pdf_segment.page_number

if not same_page:
return False

if abs(other_title.top - title.bottom) > 15:
return False

if abs(other_title.left - title.right) > 15 or abs(other_title.right - title.left) > 15:
return False

if title.first_characters_type in [1, 2, 3] and other_title.first_characters_type in [1, 2, 3]:
return False

if title.bullet_points_type and other_title.bullet_points_type:
return False

if title.get_features_to_merge() != other_title.get_features_to_merge():
return False

return True
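
A sketch of how the merger might be driven; `pdf_features` and `title_segments` are assumed inputs produced by the layout-analysis step, not names taken from this diff:

```python
from toc.MergeTwoSegmentsTitles import MergeTwoSegmentsTitles
from toc.PdfSegmentation import PdfSegmentation

# pdf_features: PdfFeatures for the document; title_segments: the PdfSegments
# labeled as titles (both assumed to come from earlier pipeline steps).
segmentation = PdfSegmentation(pdf_features, title_segments)
merged_titles = MergeTwoSegmentsTitles(segmentation).titles_merged
```
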
32 changes: 32 additions & 0 deletions src/toc/PdfSegmentation.py
@@ -0,0 +1,32 @@
from fast_trainer.PdfSegment import PdfSegment
from pdf_features.PdfFeatures import PdfFeatures
from pdf_features.PdfToken import PdfToken


class PdfSegmentation:
def __init__(self, pdf_features: PdfFeatures, pdf_segments: list[PdfSegment]):
self.pdf_features: PdfFeatures = pdf_features
self.pdf_segments: list[PdfSegment] = pdf_segments
self.tokens_by_segments: dict[PdfSegment, list[PdfToken]] = self.find_tokens_by_segments()

@staticmethod
def find_segment_for_token(token: PdfToken, segments: list[PdfSegment], tokens_by_segments):
best_score: float = 0
most_probable_segment: PdfSegment | None = None
for segment in segments:
intersection_percentage = token.bounding_box.get_intersection_percentage(segment.bounding_box)
if intersection_percentage > best_score:
best_score = intersection_percentage
most_probable_segment = segment
if best_score >= 99:
break
if most_probable_segment:
tokens_by_segments.setdefault(most_probable_segment, list()).append(token)

def find_tokens_by_segments(self):
tokens_by_segments: dict[PdfSegment, list[PdfToken]] = {}
for page in self.pdf_features.pages:
page_segments = [segment for segment in self.pdf_segments if segment.page_number == page.page_number]
for token in page.tokens:
self.find_segment_for_token(token, page_segments, tokens_by_segments)
return tokens_by_segments
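
Consuming the token-to-segment mapping might look like this (a sketch; the `.content` attribute on tokens is an assumption, not shown in this diff):

```python
segmentation = PdfSegmentation(pdf_features, pdf_segments)
for segment, tokens in segmentation.tokens_by_segments.items():
    # Each token was attached to the segment whose bounding box overlaps it most.
    segment_text = " ".join(token.content for token in tokens)  # assumed attribute
```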