Skip to content

Commit

Permalink
Async fast endpoint
Browse files (browse the repository at this point in the history)
  • Loading branch information
gabriel-piles committed Jul 19, 2024
1 parent ba16548 commit 1ebb872
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 3 deletions.
3 changes: 2 additions & 1 deletion src/pdf_layout_analysis/run_pdf_layout_analysis.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import tempfile
import uuid
from os.path import join
Expand All @@ -10,7 +11,7 @@
from vgt.get_most_probable_pdf_segments import get_most_probable_pdf_segments
from vgt.get_reading_orders import get_reading_orders
from data_model.PdfImages import PdfImages
from src.configuration import service_logger, JSON_TEST_FILE_PATH, IMAGES_ROOT_PATH
from src.configuration import service_logger, JSON_TEST_FILE_PATH, IMAGES_ROOT_PATH, XMLS_PATH
from vgt.create_word_grid import create_word_grid, remove_word_grids
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.data.datasets import register_coco_instances
Expand Down
12 changes: 10 additions & 2 deletions src/pdf_layout_analysis/run_pdf_layout_analysis_fast.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import os
from os.path import join
from pathlib import Path
from typing import AnyStr

from fast_trainer.ParagraphExtractorTrainer import ParagraphExtractorTrainer
Expand All @@ -8,14 +10,20 @@
from pdf_tokens_type_trainer.TokenTypeTrainer import TokenTypeTrainer
from pdf_tokens_type_trainer.ModelConfiguration import ModelConfiguration

from configuration import ROOT_PATH, service_logger
from configuration import ROOT_PATH, service_logger, XMLS_PATH
from data_model.SegmentBox import SegmentBox


def analyze_pdf_fast(file: AnyStr, xml_file_name: str = "") -> list[dict]:
pdf_path = pdf_content_to_pdf_path(file)
service_logger.info("Creating Paragraph Tokens [fast]")
pdf_features = PdfFeatures.from_pdf_path(pdf_path, xml_file_name)

xml_path = Path(join(XMLS_PATH, xml_file_name)) if xml_file_name else None

if xml_path and not xml_path.parent.exists():
os.makedirs(xml_path.parent, exist_ok=True)

pdf_features = PdfFeatures.from_pdf_path(pdf_path, str(xml_path) if xml_path else None)
token_type_trainer = TokenTypeTrainer([pdf_features], ModelConfiguration())
token_type_trainer.set_token_types(join(ROOT_PATH, "models", "token_type_lightgbm.model"))
trainer = ParagraphExtractorTrainer(pdfs_features=[pdf_features], model_configuration=PARAGRAPH_EXTRACTION_CONFIGURATION)
Expand Down

0 comments on commit 1ebb872

Please sign in to comment.