Skip to content

Commit

Permalink
Layoutlmv2 tesseractconfig (huggingface#17733)
Browse files Browse the repository at this point in the history
* Added option for users to modify config parameter used by pytesseract during feature extraction

- Added an optional 'tess_config' kwarg to the LayoutLMV2 processor setup; its value is passed to pytesseract during feature extraction
- E.g., it can be used to modify PSM (page segmentation mode) values by setting tess_config to '--psm 7'
- Different PSM values significantly influence the output of LayoutLMv2

* Update src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Updated variable names to be more explicit

* Fixed styles

* Added option for users to modify config parameter when calling pytesseract during feature extraction

- Added option to set "tesseract_config" parameter during LayoutLMV3 processor initialization
- Can be used to modify PSM values, e.g. by setting tesseract_config="--psm 6"

* Removed  from function signature

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
  • Loading branch information
kelvinAI and NielsRogge authored Aug 1, 2022
1 parent 151a2aa commit 24845ae
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,11 @@ def normalize_box(box, width, height):
]


def apply_tesseract(image: Image.Image, lang: Optional[str]):
def apply_tesseract(image: Image.Image, lang: Optional[str], tesseract_config: Optional[str]):
"""Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes."""

# apply OCR
data = pytesseract.image_to_data(image, lang=lang, output_type="dict")
data = pytesseract.image_to_data(image, lang=lang, output_type="dict", config=tesseract_config)
words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]

# filter empty words and corresponding coordinates
Expand Down Expand Up @@ -100,9 +100,12 @@ class LayoutLMv2FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM
if `do_resize` is set to `True`.
apply_ocr (`bool`, *optional*, defaults to `True`):
Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes.
ocr_lang (`Optional[str]`, *optional*):
ocr_lang (`str`, *optional*):
The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
used.
tesseract_config (`str`, *optional*):
Any additional custom configuration flags that are forwarded to the `config` parameter when calling
Tesseract. For example: '--psm 6'.
<Tip>
Expand All @@ -112,13 +115,23 @@ class LayoutLMv2FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM

model_input_names = ["pixel_values"]

def __init__(self, do_resize=True, size=224, resample=Image.BILINEAR, apply_ocr=True, ocr_lang=None, **kwargs):
def __init__(
self,
do_resize=True,
size=224,
resample=Image.BILINEAR,
apply_ocr=True,
ocr_lang=None,
tesseract_config="",
**kwargs
):
super().__init__(**kwargs)
self.do_resize = do_resize
self.size = size
self.resample = resample
self.apply_ocr = apply_ocr
self.ocr_lang = ocr_lang
self.tesseract_config = tesseract_config

def __call__(
self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs
Expand Down Expand Up @@ -201,7 +214,7 @@ def __call__(
words_batch = []
boxes_batch = []
for image in images:
words, boxes = apply_tesseract(self.to_pil_image(image), self.ocr_lang)
words, boxes = apply_tesseract(self.to_pil_image(image), self.ocr_lang, self.tesseract_config)
words_batch.append(words)
boxes_batch.append(boxes)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,10 @@ def normalize_box(box, width, height):
]


def apply_tesseract(image: Image.Image, lang: Optional[str]):
def apply_tesseract(image: Image.Image, lang: Optional[str], tesseract_config: Optional[str]):
"""Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes."""

# apply OCR
data = pytesseract.image_to_data(image, lang=lang, output_type="dict")
data = pytesseract.image_to_data(image, lang=lang, output_type="dict", config=tesseract_config)
words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]

# filter empty words and corresponding coordinates
Expand Down Expand Up @@ -106,9 +105,12 @@ class LayoutLMv3FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM
The sequence of standard deviations for each channel, to be used when normalizing images.
apply_ocr (`bool`, *optional*, defaults to `True`):
Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes.
ocr_lang (`Optional[str]`, *optional*):
ocr_lang (`str`, *optional*):
The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
used.
tesseract_config (`str`, *optional*):
Any additional custom configuration flags that are forwarded to the `config` parameter when calling
Tesseract. For example: '--psm 6'.
<Tip>
Expand All @@ -128,6 +130,7 @@ def __init__(
image_std=None,
apply_ocr=True,
ocr_lang=None,
tesseract_config="",
**kwargs
):
super().__init__(**kwargs)
Expand All @@ -139,6 +142,7 @@ def __init__(
self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
self.apply_ocr = apply_ocr
self.ocr_lang = ocr_lang
self.tesseract_config = tesseract_config

def __call__(
self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs
Expand Down Expand Up @@ -221,7 +225,7 @@ def __call__(
words_batch = []
boxes_batch = []
for image in images:
words, boxes = apply_tesseract(self.to_pil_image(image), self.ocr_lang)
words, boxes = apply_tesseract(self.to_pil_image(image), self.ocr_lang, self.tesseract_config)
words_batch.append(words)
boxes_batch.append(boxes)

Expand Down

0 comments on commit 24845ae

Please sign in to comment.