Async fast endpoint

huridocs · Jul 19, 2024 · 1ebb872
1 parent ba16548
commit 1ebb872
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 3 deletions.
diff --git a/src/pdf_layout_analysis/run_pdf_layout_analysis.py b/src/pdf_layout_analysis/run_pdf_layout_analysis.py
@@ -1,3 +1,4 @@
+import os
 import tempfile
 import uuid
 from os.path import join
@@ -10,7 +11,7 @@
 from vgt.get_most_probable_pdf_segments import get_most_probable_pdf_segments
 from vgt.get_reading_orders import get_reading_orders
 from data_model.PdfImages import PdfImages
-from src.configuration import service_logger, JSON_TEST_FILE_PATH, IMAGES_ROOT_PATH
+from src.configuration import service_logger, JSON_TEST_FILE_PATH, IMAGES_ROOT_PATH, XMLS_PATH
 from vgt.create_word_grid import create_word_grid, remove_word_grids
 from detectron2.checkpoint import DetectionCheckpointer
 from detectron2.data.datasets import register_coco_instances

diff --git a/src/pdf_layout_analysis/run_pdf_layout_analysis_fast.py b/src/pdf_layout_analysis/run_pdf_layout_analysis_fast.py
@@ -1,4 +1,6 @@
+import os
 from os.path import join
+from pathlib import Path
 from typing import AnyStr
 
 from fast_trainer.ParagraphExtractorTrainer import ParagraphExtractorTrainer
@@ -8,14 +10,20 @@
 from pdf_tokens_type_trainer.TokenTypeTrainer import TokenTypeTrainer
 from pdf_tokens_type_trainer.ModelConfiguration import ModelConfiguration
 
-from configuration import ROOT_PATH, service_logger
+from configuration import ROOT_PATH, service_logger, XMLS_PATH
 from data_model.SegmentBox import SegmentBox
 
 
 def analyze_pdf_fast(file: AnyStr, xml_file_name: str = "") -> list[dict]:
  pdf_path = pdf_content_to_pdf_path(file)
  service_logger.info("Creating Paragraph Tokens [fast]")
- pdf_features = PdfFeatures.from_pdf_path(pdf_path, xml_file_name)
+
+ xml_path = Path(join(XMLS_PATH, xml_file_name)) if xml_file_name else None
+
+ if xml_path and not xml_path.parent.exists():
+ os.makedirs(xml_path.parent, exist_ok=True)
+
+ pdf_features = PdfFeatures.from_pdf_path(pdf_path, str(xml_path) if xml_path else None)
  token_type_trainer = TokenTypeTrainer([pdf_features], ModelConfiguration())
  token_type_trainer.set_token_types(join(ROOT_PATH, "models", "token_type_lightgbm.model"))
  trainer = ParagraphExtractorTrainer(pdfs_features=[pdf_features], model_configuration=PARAGRAPH_EXTRACTION_CONFIGURATION)