Layoutlmv2 tesseractconfig (huggingface#17733)

* Added option for users to modify the config parameter used by pytesseract during feature extraction
  - Added optional 'tess_config' kwarg when setting up the LayoutLMv2 processor; it is used by pytesseract during feature extraction
  - E.g. it can be used to modify PSM values by setting tess_config to '--psm 7'
  - Different PSM values significantly influence the output of LayoutLMv2
* Update src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py
  Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
* Update src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py
  Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
* Updated variable names to be more explicit
* Fixed styles
* Added option for users to modify the config parameter when calling pytesseract during feature extraction
  - Added option to set the "tesseract_config" parameter during LayoutLMv3 processor initialization
  - Can be used to modify PSM values, e.g. by setting tesseract_config="--psm 6"
* Removed from function signature

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
xght · Aug 1, 2022 · 24845ae · 24845ae
1 parent 151a2aa
commit 24845ae
Show file tree

Hide file tree

Showing 2 changed files with 27 additions and 10 deletions.
diff --git a/src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py b/src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py
@@ -46,11 +46,11 @@ def normalize_box(box, width, height):
     ]
 
 
-def apply_tesseract(image: Image.Image, lang: Optional[str]):
+def apply_tesseract(image: Image.Image, lang: Optional[str], tesseract_config: Optional[str]):
     """Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes."""
 
     # apply OCR
-    data = pytesseract.image_to_data(image, lang=lang, output_type="dict")
+    data = pytesseract.image_to_data(image, lang=lang, output_type="dict", config=tesseract_config)
     words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]
 
     # filter empty words and corresponding coordinates
@@ -100,9 +100,12 @@ class LayoutLMv2FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM
             if `do_resize` is set to `True`.
         apply_ocr (`bool`, *optional*, defaults to `True`):
             Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes.
-        ocr_lang (`Optional[str]`, *optional*):
+        ocr_lang (`str`, *optional*):
             The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
             used.
+        tesseract_config (`str`, *optional*):
+            Any additional custom configuration flags that are forwarded to the `config` parameter when calling
+            Tesseract. For example: '--psm 6'.
 
             <Tip>
 
@@ -112,13 +115,23 @@ class LayoutLMv2FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM
 
     model_input_names = ["pixel_values"]
 
-    def __init__(self, do_resize=True, size=224, resample=Image.BILINEAR, apply_ocr=True, ocr_lang=None, **kwargs):
+    def __init__(
+        self,
+        do_resize=True,
+        size=224,
+        resample=Image.BILINEAR,
+        apply_ocr=True,
+        ocr_lang=None,
+        tesseract_config="",
+        **kwargs
+    ):
         super().__init__(**kwargs)
         self.do_resize = do_resize
         self.size = size
         self.resample = resample
         self.apply_ocr = apply_ocr
         self.ocr_lang = ocr_lang
+        self.tesseract_config = tesseract_config
 
     def __call__(
         self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs
@@ -201,7 +214,7 @@ def __call__(
             words_batch = []
             boxes_batch = []
             for image in images:
-                words, boxes = apply_tesseract(self.to_pil_image(image), self.ocr_lang)
+                words, boxes = apply_tesseract(self.to_pil_image(image), self.ocr_lang, self.tesseract_config)
                 words_batch.append(words)
                 boxes_batch.append(boxes)
 

diff --git a/src/transformers/models/layoutlmv3/feature_extraction_layoutlmv3.py b/src/transformers/models/layoutlmv3/feature_extraction_layoutlmv3.py
@@ -46,11 +46,10 @@ def normalize_box(box, width, height):
     ]
 
 
-def apply_tesseract(image: Image.Image, lang: Optional[str]):
+def apply_tesseract(image: Image.Image, lang: Optional[str], tesseract_config: Optional[str]):
     """Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes."""
-
     # apply OCR
-    data = pytesseract.image_to_data(image, lang=lang, output_type="dict")
+    data = pytesseract.image_to_data(image, lang=lang, output_type="dict", config=tesseract_config)
     words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]
 
     # filter empty words and corresponding coordinates
@@ -106,9 +105,12 @@ class LayoutLMv3FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM
             The sequence of standard deviations for each channel, to be used when normalizing images.
         apply_ocr (`bool`, *optional*, defaults to `True`):
             Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes.
-        ocr_lang (`Optional[str]`, *optional*):
+        ocr_lang (`str`, *optional*):
             The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
             used.
+        tesseract_config (`str`, *optional*):
+            Any additional custom configuration flags that are forwarded to the `config` parameter when calling
+            Tesseract. For example: '--psm 6'.
 
             <Tip>
 
@@ -128,6 +130,7 @@ def __init__(
         image_std=None,
         apply_ocr=True,
         ocr_lang=None,
+        tesseract_config="",
         **kwargs
     ):
         super().__init__(**kwargs)
@@ -139,6 +142,7 @@ def __init__(
         self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
         self.apply_ocr = apply_ocr
         self.ocr_lang = ocr_lang
+        self.tesseract_config = tesseract_config
 
     def __call__(
         self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs
@@ -221,7 +225,7 @@ def __call__(
             words_batch = []
             boxes_batch = []
             for image in images:
-                words, boxes = apply_tesseract(self.to_pil_image(image), self.ocr_lang)
+                words, boxes = apply_tesseract(self.to_pil_image(image), self.ocr_lang, self.tesseract_config)
                 words_batch.append(words)
                 boxes_batch.append(boxes)