From 044ed4fd2dd99ff3a2121c253de1affeb8d587e7 Mon Sep 17 00:00:00 2001 From: Vaishak Kumar Date: Thu, 29 Dec 2022 11:44:59 -0800 Subject: [PATCH 1/6] Add document token classification pipeline (#1) --- docs/source/en/main_classes/pipelines.mdx | 6 + docs/source/en/model_doc/auto.mdx | 8 + src/transformers/__init__.py | 10 + src/transformers/models/auto/__init__.py | 8 + src/transformers/models/auto/modeling_auto.py | 18 ++ .../models/auto/modeling_tf_auto.py | 16 ++ src/transformers/pipelines/__init__.py | 14 + .../document_token_classification.py | 258 ++++++++++++++++++ src/transformers/utils/dummy_pt_objects.py | 9 + src/transformers/utils/dummy_tf_objects.py | 9 + src/transformers/utils/fx.py | 6 + .../layoutlmv2/test_modeling_layoutlmv2.py | 50 ++-- .../layoutlmv3/test_modeling_layoutlmv3.py | 41 +-- tests/pipelines/test_pipelines_common.py | 2 +- ...pipelines_document_token_classification.py | 201 ++++++++++++++ 15 files changed, 614 insertions(+), 42 deletions(-) create mode 100644 src/transformers/pipelines/document_token_classification.py create mode 100644 tests/pipelines/test_pipelines_document_token_classification.py diff --git a/docs/source/en/main_classes/pipelines.mdx b/docs/source/en/main_classes/pipelines.mdx index e5ee3902028e34..acb4e10c80bbd6 100644 --- a/docs/source/en/main_classes/pipelines.mdx +++ b/docs/source/en/main_classes/pipelines.mdx @@ -446,6 +446,12 @@ Pipelines available for multimodal tasks include the following. - __call__ - all +### DocumentTokenClassificationPipeline + +[[autodoc]] DocumentTokenClassificationPipeline + - __call__ + - all + ### FeatureExtractionPipeline [[autodoc]] FeatureExtractionPipeline diff --git a/docs/source/en/model_doc/auto.mdx b/docs/source/en/model_doc/auto.mdx index b39920151db424..80beec71ba23aa 100644 --- a/docs/source/en/model_doc/auto.mdx +++ b/docs/source/en/model_doc/auto.mdx @@ -310,6 +310,14 @@ The following auto classes are available for the following multimodal tasks. 
[[autodoc]] TFAutoModelForDocumentQuestionAnswering +### AutoModelForDocumentTokenClassification + +[[autodoc]] AutoModelForDocumentTokenClassification + +### TFAutoModelForDocumentTokenClassification + +[[autodoc]] TFAutoModelForDocumentTokenClassification + ### AutoModelForVisualQuestionAnswering [[autodoc]] AutoModelForVisualQuestionAnswering diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index bf53737e968992..c0b22afb2ebde0 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -481,6 +481,7 @@ "CsvPipelineDataFormat", "DepthEstimationPipeline", "DocumentQuestionAnsweringPipeline", + "DocumentTokenClassificationPipeline", "FeatureExtractionPipeline", "FillMaskPipeline", "ImageClassificationPipeline", @@ -938,6 +939,7 @@ "MODEL_FOR_CTC_MAPPING", "MODEL_FOR_DEPTH_ESTIMATION_MAPPING", "MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING", + "MODEL_FOR_DOCUMENT_TOKEN_CLASSIFICATION_MAPPING", "MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING", "MODEL_FOR_IMAGE_SEGMENTATION_MAPPING", "MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING", @@ -970,6 +972,7 @@ "AutoModelForCTC", "AutoModelForDepthEstimation", "AutoModelForDocumentQuestionAnswering", + "AutoModelForDocumentTokenClassification", "AutoModelForImageClassification", "AutoModelForImageSegmentation", "AutoModelForInstanceSegmentation", @@ -2531,6 +2534,7 @@ [ "TF_MODEL_FOR_CAUSAL_LM_MAPPING", "TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING", + "TF_MODEL_FOR_DOCUMENT_TOKEN_CLASSIFICATION_MAPPING", "TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING", "TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING", "TF_MODEL_FOR_MASKED_LM_MAPPING", @@ -2550,6 +2554,7 @@ "TFAutoModel", "TFAutoModelForCausalLM", "TFAutoModelForDocumentQuestionAnswering", + "TFAutoModelForDocumentTokenClassification", "TFAutoModelForImageClassification", "TFAutoModelForMaskedLM", "TFAutoModelForMultipleChoice", @@ -3796,6 +3801,7 @@ CsvPipelineDataFormat, DepthEstimationPipeline, DocumentQuestionAnsweringPipeline, + DocumentTokenClassificationPipeline, FeatureExtractionPipeline, FillMaskPipeline, ImageClassificationPipeline, @@ -4183,6 +4189,7 @@ MODEL_FOR_CTC_MAPPING, MODEL_FOR_DEPTH_ESTIMATION_MAPPING, MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING, + MODEL_FOR_DOCUMENT_TOKEN_CLASSIFICATION_MAPPING, MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, MODEL_FOR_IMAGE_SEGMENTATION_MAPPING, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING, @@ -4215,6 +4222,7 @@ AutoModelForCTC, AutoModelForDepthEstimation, AutoModelForDocumentQuestionAnswering, + AutoModelForDocumentTokenClassification, AutoModelForImageClassification, AutoModelForImageSegmentation, AutoModelForInstanceSegmentation, @@ -5497,6 +5505,7 @@ from .models.auto import ( TF_MODEL_FOR_CAUSAL_LM_MAPPING, TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING, + TF_MODEL_FOR_DOCUMENT_TOKEN_CLASSIFICATION_MAPPING, TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING, TF_MODEL_FOR_MASKED_LM_MAPPING, @@ -5516,6 +5525,7 @@ TFAutoModel, TFAutoModelForCausalLM, TFAutoModelForDocumentQuestionAnswering, + TFAutoModelForDocumentTokenClassification, TFAutoModelForImageClassification, TFAutoModelForMaskedLM, TFAutoModelForMultipleChoice, diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py index da8ceb8e7e6258..6683aa2b85e6cc 100644 --- a/src/transformers/models/auto/__init__.py +++ b/src/transformers/models/auto/__init__.py @@ -50,6 +50,7 @@ "MODEL_FOR_CAUSAL_LM_MAPPING", "MODEL_FOR_CTC_MAPPING", "MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING", + 
"MODEL_FOR_DOCUMENT_TOKEN_CLASSIFICATION_MAPPING", "MODEL_FOR_DEPTH_ESTIMATION_MAPPING", "MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING", "MODEL_FOR_IMAGE_SEGMENTATION_MAPPING", @@ -103,6 +104,7 @@ "AutoModelForVision2Seq", "AutoModelForVisualQuestionAnswering", "AutoModelForDocumentQuestionAnswering", + "AutoModelForDocumentTokenClassification", "AutoModelWithLMHead", "AutoModelForZeroShotObjectDetection", ] @@ -123,6 +125,7 @@ "TF_MODEL_FOR_PRETRAINING_MAPPING", "TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING", "TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING", + "TF_MODEL_FOR_DOCUMENT_TOKEN_CLASSIFICATION_MAPPING", "TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING", "TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING", "TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING", @@ -140,6 +143,7 @@ "TFAutoModelForNextSentencePrediction", "TFAutoModelForPreTraining", "TFAutoModelForDocumentQuestionAnswering", + "TFAutoModelForDocumentTokenClassification", "TFAutoModelForQuestionAnswering", "TFAutoModelForSemanticSegmentation", "TFAutoModelForSeq2SeqLM", @@ -208,6 +212,7 @@ MODEL_FOR_CTC_MAPPING, MODEL_FOR_DEPTH_ESTIMATION_MAPPING, MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING, + MODEL_FOR_DOCUMENT_TOKEN_CLASSIFICATION_MAPPING, MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, MODEL_FOR_IMAGE_SEGMENTATION_MAPPING, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING, @@ -240,6 +245,7 @@ AutoModelForCTC, AutoModelForDepthEstimation, AutoModelForDocumentQuestionAnswering, + AutoModelForDocumentTokenClassification, AutoModelForImageClassification, AutoModelForImageSegmentation, AutoModelForInstanceSegmentation, @@ -273,6 +279,7 @@ from .modeling_tf_auto import ( TF_MODEL_FOR_CAUSAL_LM_MAPPING, TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING, + TF_MODEL_FOR_DOCUMENT_TOKEN_CLASSIFICATION_MAPPING, TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING, TF_MODEL_FOR_MASKED_LM_MAPPING, @@ -292,6 +299,7 @@ TFAutoModel, TFAutoModelForCausalLM, TFAutoModelForDocumentQuestionAnswering, + TFAutoModelForDocumentTokenClassification, TFAutoModelForImageClassification, TFAutoModelForMaskedLM, TFAutoModelForMultipleChoice, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 4d61c4c972ed05..c1efb96990962d 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -716,6 +716,12 @@ ] ) +MODEL_FOR_DOCUMENT_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + ("layoutlmv3", "LayoutLMv3ForTokenClassification"), + ] +) + MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict( [ # Model for Token Classification mapping @@ -926,6 +932,9 @@ MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES ) +MODEL_FOR_DOCUMENT_TOKEN_CLASSIFICATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_DOCUMENT_TOKEN_CLASSIFICATION_MAPPING_NAMES +) MODEL_FOR_MASKED_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_MASKED_LM_MAPPING_NAMES) MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES @@ -1060,6 +1069,15 @@ class AutoModelForDocumentQuestionAnswering(_BaseAutoModelClass): checkpoint_for_example='impira/layoutlm-document-qa", revision="52e01b3', ) +class AutoModelForDocumentTokenClassification(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_DOCUMENT_TOKEN_CLASSIFICATION_MAPPING + +AutoModelForDocumentTokenClassification = auto_class_update( + 
AutoModelForDocumentTokenClassification, + head_doc="document token classification", + checkpoint_for_example='microsoft/layoutlmv3-base", revision="07c9b08', +) + class AutoModelForTokenClassification(_BaseAutoModelClass): _model_mapping = MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py index c77fba4f66fac6..b95facb6dc2cf3 100644 --- a/src/transformers/models/auto/modeling_tf_auto.py +++ b/src/transformers/models/auto/modeling_tf_auto.py @@ -344,6 +344,11 @@ ] ) +TF_MODEL_FOR_DOCUMENT_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + ("layoutlmv3", "TFLayoutLMv3ForTokenClassification"), + ] +) TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( [ @@ -442,6 +447,9 @@ TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES ) +TF_MODEL_FOR_DOCUMENT_TOKEN_CLASSIFICATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, TF_MODEL_FOR_DOCUMENT_TOKEN_CLASSIFICATION_MAPPING_NAMES +) TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES ) @@ -561,6 +569,14 @@ class TFAutoModelForDocumentQuestionAnswering(_BaseAutoModelClass): checkpoint_for_example='impira/layoutlm-document-qa", revision="52e01b3', ) +class TFAutoModelForDocumentTokenClassification(_BaseAutoModelClass): + _model_mapping = TF_MODEL_FOR_DOCUMENT_TOKEN_CLASSIFICATION_MAPPING + +TFAutoModelForDocumentTokenClassification = auto_class_update( + TFAutoModelForDocumentTokenClassification, + head_doc="document token classification", + checkpoint_for_example='microsoft/layoutlmv3-base", revision="07c9b08', +) class TFAutoModelForTableQuestionAnswering(_BaseAutoModelClass): _model_mapping = TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index 8b06009a4cd14b..61cada6386397c 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -62,6 +62,7 @@ from .conversational import Conversation, ConversationalPipeline from .depth_estimation import DepthEstimationPipeline from .document_question_answering import DocumentQuestionAnsweringPipeline +from .document_token_classification import DocumentTokenClassificationPipeline from .feature_extraction import FeatureExtractionPipeline from .fill_mask import FillMaskPipeline from .image_classification import ImageClassificationPipeline @@ -123,6 +124,7 @@ AutoModelForCausalLM, AutoModelForCTC, AutoModelForDocumentQuestionAnswering, + AutoModelForDocumentTokenClassification, AutoModelForImageClassification, AutoModelForImageSegmentation, AutoModelForMaskedLM, @@ -240,6 +242,18 @@ }, "type": "multimodal", }, + "document-token-classification": { + "impl": DocumentTokenClassificationPipeline, + "pt": (AutoModelForDocumentTokenClassification,) if is_torch_available() else (), + "tf": (), + "default": { + "model": { + "pt": ("microsoft/layoutlmv3-base", "07c9b08"), + "tf": ("microsoft/layoutlmv3-base", "07c9b08"), + }, + }, + "type": "multimodal", + }, "fill-mask": { "impl": FillMaskPipeline, "tf": (TFAutoModelForMaskedLM,) if is_tf_available() else (), diff --git a/src/transformers/pipelines/document_token_classification.py b/src/transformers/pipelines/document_token_classification.py new file mode 100644 index 00000000000000..fa550f4a181f8e --- /dev/null +++ 
b/src/transformers/pipelines/document_token_classification.py @@ -0,0 +1,258 @@ +# Copyright 2022 The Loop Team and the HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from typing import List, Optional, Tuple, Union, Dict + +import numpy as np + +from ..utils import ( + ExplicitEnum, + add_end_docstrings, + is_pytesseract_available, + is_torch_available, + is_vision_available, + logging, +) +from .base import PIPELINE_INIT_ARGS, Pipeline, ArgumentHandler, Dataset, types + +if is_vision_available(): + from PIL import Image + + from ..image_utils import load_image + +if is_torch_available(): + import torch + + from ..models.auto.modeling_auto import MODEL_FOR_DOCUMENT_TOKEN_CLASSIFICATION_MAPPING + +TESSERACT_LOADED = False +if is_pytesseract_available(): + TESSERACT_LOADED = True + import pytesseract + +logger = logging.get_logger(__name__) + + +class ModelType(ExplicitEnum): + LayoutLMv3 = "layoutlmv3" + LayoutLMv2 = "layoutlmv2" + + +class DocumentTokenClassificationArgumentHandler(ArgumentHandler): + """ + Handles arguments for token classification. + """ + + def __call__(self, inputs: Union[str, List[str]], **kwargs): + + if inputs is not None and isinstance(inputs, (list, tuple)) and len(inputs) > 0: + inputs = list(inputs) + elif isinstance(inputs, str) or isinstance(inputs, Image.Image) or isinstance(inputs, dict): + inputs = [inputs] + elif Dataset is not None and isinstance(inputs, Dataset) or isinstance(inputs, types.GeneratorType): + return inputs + else: + raise ValueError("At least one input is required.") + return inputs + + +@add_end_docstrings(PIPELINE_INIT_ARGS) +class DocumentTokenClassificationPipeline(Pipeline): + # TODO: Update task_summary docs to include an example with document token classification + """ + Document Token Classification pipeline using any `AutoModelForDocumentTokenClassification`. The inputs/outputs are + similar to the Token Classification pipeline; however, the pipeline takes an image (and optional OCR'd + words/boxes) as input instead of text context. + + This Document Token Classification pipeline can currently be loaded from [`pipeline`] using the following task + identifier: `"document-token-classification"`. + + The models that this pipeline can use are models that have been fine-tuned on a Document Token Classification task. + See the up-to-date list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=document-token-classification). 
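+
+    A minimal usage sketch (the checkpoint and invoice image below are the ones exercised in this PR's tests; the
+    labels shown are illustrative, not guaranteed output):
+
+    ```python
+    >>> from transformers import pipeline
+
+    >>> token_classifier = pipeline(
+    ...     "document-token-classification", model="Theivaprakasham/layoutlmv3-finetuned-invoice"
+    ... )
+    >>> outputs = token_classifier(
+    ...     "https://huggingface.co/spaces/impira/docquery/resolve/2f6c96314dc84dfda62d40de9da55f2f5165d403/invoice.png"
+    ... )
+    >>> sorted(outputs.keys())
+    ['boxes', 'word_labels', 'words']
+    >>> # e.g. outputs["word_labels"] might contain "B-TOTAL" for the word carrying the invoice total
+    ```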
+ """ + + def __init__(self, args_parser=DocumentTokenClassificationArgumentHandler(), *args, **kwargs): + super().__init__(*args, **kwargs) + self.check_model_type(MODEL_FOR_DOCUMENT_TOKEN_CLASSIFICATION_MAPPING) + self.image_processor = self.feature_extractor + self.image_processor.apply_ocr = False + self._args_parser = args_parser + if self.model.config.model_type == "layoutlmv3": + self.model_type = ModelType.LayoutLMv3 + elif self.model.config.model_type == "layoutlmv2": + self.model_type = ModelType.LayoutLMv2 + else: + raise ValueError(f"Model type {self.model.config.model_type} is not supported by this pipeline.") + + def _sanitize_parameters( + self, + padding=None, + doc_stride=None, + lang: Optional[str] = None, + tesseract_config: Optional[str] = None, + max_seq_len=None, + **kwargs, + ): + preprocess_params, postprocess_params = {}, {} + if padding is not None: + preprocess_params["padding"] = padding + if doc_stride is not None: + preprocess_params["doc_stride"] = doc_stride + if max_seq_len is not None: + preprocess_params["max_seq_len"] = max_seq_len + if lang is not None: + preprocess_params["lang"] = lang + if tesseract_config is not None: + preprocess_params["tesseract_config"] = tesseract_config + + return preprocess_params, {}, postprocess_params + + def __call__( + self, + inputs: Union["Image.Image", List["Image.Image"], str, Dict, List[dict]], + **kwargs, + ): + """ + Classifies the list of tokens (word_boxes) given a document. A document is defined as an image and an + optional list of (word, box) tuples which represent the text in the document. If the `word_boxes` are not + provided, it will use the Tesseract OCR engine (if available) to extract the words and boxes automatically for + LayoutLM-like models which require them as input. + + You can invoke the pipeline several ways: + + - `pipeline(inputs=image)` + - `pipeline(inputs=[image])` + - `pipeline(inputs={"image": image})` + - `pipeline(inputs={"image": image, "word_boxes": word_boxes})` + - `pipeline(inputs={"image": image, "words": words, "boxes": boxes})` + - `pipeline(inputs=[{"image": image}])` + - `pipeline(inputs=[{"image": image, "word_boxes": word_boxes}])` + - `pipeline(inputs=[{"image": image, "words": words, "boxes": boxes}])` + + Args: + inputs (:obj:`str`, :obj:`List[str]`, :obj:`PIL.Image`, :obj:`List[PIL.Image]`, :obj:`Dict`, :obj:`List[Dict]`): + + Return: + A `dict` or a list of `dict`: Each result comes as a dictionary with the following keys: + + - **words** (:obj:`List[str]`) -- The words in the document. + - **boxes** (:obj:`List[List[int]]`) -- The boxes of the words in the document. + - **word_labels** (:obj:`List[str]`) -- The predicted labels for each word. 
+ """ + inputs = self._args_parser(inputs) + output = super().__call__(inputs, **kwargs) + if isinstance(output, list) and len(output) == 1: + return output[0] + return output + + def preprocess(self, input, lang=None, tesseract_config="", **kwargs): + image = None + if isinstance(input, str) or isinstance(input, Image.Image): + image = load_image(input) + input = {"image": image} + elif input.get("image", None) is not None: + image = load_image(input["image"]) + + words, boxes = None, None + self.image_processor.apply_ocr = False + if "words" in input and "boxes" in input: + words = input["words"] + boxes = input["boxes"] + elif "word_boxes" in input: + words = [x[0] for x in input["word_boxes"]] + boxes = [x[1] for x in input["word_boxes"]] + elif image is not None and not TESSERACT_LOADED: + raise ValueError( + "If you provide an image without word_boxes, then the pipeline will run OCR using Tesseract," + " but pytesseract is not available" + ) + else: + self.image_processor.apply_ocr = True + + # first, apply the image processor + features = self.image_processor( + images=image, + return_tensors=self.framework, + **kwargs, + ) + + encoded_inputs = self.tokenizer( + text=words if words is not None else features["words"], + boxes=boxes if boxes is not None else features["boxes"], + return_tensors=self.framework, + **kwargs, + ) + + if self.model_type == ModelType.LayoutLMv3: + image_field = "pixel_values" + elif self.model_type == ModelType.LayoutLMv2: + image_field = "image" + encoded_inputs[image_field] = features.pop("pixel_values") + + # Fields that help with post-processing + encoded_inputs["word_ids"] = encoded_inputs.word_ids() + encoded_inputs["words"] = words if words is not None else features["words"] + encoded_inputs["boxes"] = boxes if boxes is not None else features["boxes"] + + return encoded_inputs + + def _forward(self, model_inputs): + word_ids = model_inputs.pop("word_ids", None) + words = model_inputs.pop("words", None) + boxes = model_inputs.pop("boxes", None) + + model_outputs = self.model(**model_inputs) + + model_outputs["word_ids"] = word_ids + model_outputs["words"] = words + model_outputs["boxes"] = boxes + return model_outputs + + def postprocess(self, model_outputs, **kwargs): + model_outputs = dict(model_outputs) + logits = np.asarray(model_outputs.pop("logits", None)) + words = model_outputs["words"] + boxes = model_outputs["boxes"] + + # if first dimension is 1, remove it + if logits.shape[0] == 1: + logits = logits[0] + + # if words is a list of list of strings, get the first one + if isinstance(words, list) and len(words) != 0 and isinstance(words[0], list): + words = words[0] + model_outputs["words"] = words + + if isinstance(boxes, list) and len(boxes) != 0 and isinstance(boxes[0], list): + boxes = boxes[0] + model_outputs["boxes"] = boxes + + token_predictions = logits.argmax(-1) + + word_ids = model_outputs.pop("word_ids", None) + + # Map Token predictions to word predictions + word_predictions = [None] * len(words) + for word_id, token_prediction in zip(word_ids, token_predictions): + if word_id is not None and word_predictions[word_id] is None: + word_predictions[word_id] = token_prediction + elif word_id is not None and word_predictions[word_id] != token_prediction: + # If conflict, we take the first prediction + pass + + word_labels = [self.model.config.id2label[prediction] for prediction in word_predictions] + model_outputs["word_labels"] = word_labels + return model_outputs diff --git a/src/transformers/utils/dummy_pt_objects.py 
b/src/transformers/utils/dummy_pt_objects.py index 178a0b5ae6e559..ceaf47706b1220 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -398,6 +398,9 @@ def __init__(self, *args, **kwargs): MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING = None +MODEL_FOR_DOCUMENT_TOKEN_CLASSIFICATION_MAPPING = None + + MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = None @@ -529,6 +532,12 @@ class AutoModelForDocumentQuestionAnswering(metaclass=DummyObject): def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class AutoModelForDocumentTokenClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + class AutoModelForImageClassification(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py index 624e08b88e9e31..c7cd01e764df32 100644 --- a/src/transformers/utils/dummy_tf_objects.py +++ b/src/transformers/utils/dummy_tf_objects.py @@ -275,6 +275,9 @@ def __init__(self, *args, **kwargs): TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING = None +TF_MODEL_FOR_DOCUMENT_TOKEN_CLASSIFICATION_MAPPING = None + + TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = None @@ -343,6 +346,12 @@ class TFAutoModelForDocumentQuestionAnswering(metaclass=DummyObject): def __init__(self, *args, **kwargs): requires_backends(self, ["tf"]) +class TFAutoModelForDocumentTokenClassification(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + class TFAutoModelForImageClassification(metaclass=DummyObject): _backends = ["tf"] diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py index 4a44c15b22150e..24f1aef1ebe7ae 100644 --- a/src/transformers/utils/fx.py +++ b/src/transformers/utils/fx.py @@ -38,6 +38,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, MODEL_FOR_CTC_MAPPING_NAMES, MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES, + MODEL_FOR_DOCUMENT_TOKEN_CLASSIFICATION_MAPPING_NAMES, MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES, MODEL_FOR_MASKED_LM_MAPPING_NAMES, @@ -75,6 +76,7 @@ def _generate_supported_model_class_names( "speech-seq2seq": MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, "multiple-choice": MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES, "document-question-answering": MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES, + "document-token-classification": MODEL_FOR_DOCUMENT_TOKEN_CLASSIFICATION_MAPPING_NAMES, "question-answering": MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, "sequence-classification": MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, "token-classification": MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, @@ -753,6 +755,7 @@ def _generate_dummy_input( elif model_class_name in [ *get_values(MODEL_FOR_PRETRAINING_MAPPING_NAMES), *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES), + *get_values(MODEL_FOR_DOCUMENT_TOKEN_CLASSIFICATION_MAPPING_NAMES), *get_values(MODEL_FOR_CAUSAL_LM_MAPPING_NAMES), *get_values(MODEL_FOR_MASKED_LM_MAPPING_NAMES), *get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES), @@ -774,6 +777,9 @@ def _generate_dummy_input( image_size = model.config.vision_config.image_size elif hasattr(model.config, "encoder"): image_size = model.config.encoder.image_size + elif getattr(model.config, "model_type")=="layoutlmv3": + image_size = getattr(model.config, "input_size") + image_size = (image_size, image_size) else: image_size = 
(_generate_random_int(), _generate_random_int()) diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py index 3c38373163e496..c4aa2548c70a5b 100644 --- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py @@ -105,6 +105,33 @@ def __init__( self.scope = scope self.range_bbox = range_bbox + def get_config(self): + config = LayoutLMv2Config( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + image_feature_pool_shape=self.image_feature_pool_shape, + coordinate_size=self.coordinate_size, + shape_size=self.shape_size, + ) + + # use smaller resnet backbone to make tests faster + config.detectron2_config_args["MODEL.RESNETS.DEPTH"] = 18 + config.detectron2_config_args["MODEL.RESNETS.RES2_OUT_CHANNELS"] = 64 + config.detectron2_config_args["MODEL.RESNETS.NUM_GROUPS"] = 1 + + return config + + def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) @@ -140,28 +167,7 @@ def prepare_config_and_inputs(self): sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - config = LayoutLMv2Config( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - image_feature_pool_shape=self.image_feature_pool_shape, - coordinate_size=self.coordinate_size, - shape_size=self.shape_size, - ) - - # use smaller resnet backbone to make tests faster - config.detectron2_config_args["MODEL.RESNETS.DEPTH"] = 18 - config.detectron2_config_args["MODEL.RESNETS.RES2_OUT_CHANNELS"] = 64 - config.detectron2_config_args["MODEL.RESNETS.NUM_GROUPS"] = 1 + config = self.get_config() return config, input_ids, bbox, image, token_type_ids, input_mask, sequence_labels, token_labels diff --git a/tests/models/layoutlmv3/test_modeling_layoutlmv3.py b/tests/models/layoutlmv3/test_modeling_layoutlmv3.py index d5c8d42d22177a..f0cbcf827d8912 100644 --- a/tests/models/layoutlmv3/test_modeling_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_modeling_layoutlmv3.py @@ -113,6 +113,25 @@ def __init__( self.image_seq_length = (image_size // patch_size) ** 2 + 1 self.seq_length = self.text_seq_length + self.image_seq_length + def get_config(self): + return LayoutLMv3Config( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + 
attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + coordinate_size=self.coordinate_size, + shape_size=self.shape_size, + input_size=self.image_size, + patch_size=self.patch_size, + ) + def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.text_seq_length], self.vocab_size) @@ -144,27 +163,11 @@ def prepare_config_and_inputs(self): if self.use_labels: sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) token_labels = ids_tensor([self.batch_size, self.text_seq_length], self.num_labels) - - config = LayoutLMv3Config( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - coordinate_size=self.coordinate_size, - shape_size=self.shape_size, - input_size=self.image_size, - patch_size=self.patch_size, - ) + + config = self.get_config() return config, input_ids, bbox, pixel_values, token_type_ids, input_mask, sequence_labels, token_labels - + def create_and_check_model( self, config, input_ids, bbox, pixel_values, token_type_ids, input_mask, sequence_labels, token_labels ): diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index c06bd644c6391b..edcc0e425bf22c 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -81,7 +81,7 @@ def get_checkpoint_from_architecture(architecture): try: - module = importlib.import_module(architecture.__module__) + module = importlib.import_module(str(architecture.__module__)) except ImportError: logger.error(f"Ignoring architecture {architecture}") return diff --git a/tests/pipelines/test_pipelines_document_token_classification.py b/tests/pipelines/test_pipelines_document_token_classification.py new file mode 100644 index 00000000000000..dac0a3af1f407d --- /dev/null +++ b/tests/pipelines/test_pipelines_document_token_classification.py @@ -0,0 +1,201 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
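+
+# The tests below exercise the `document-token-classification` pipeline end to end: building it from a tiny random
+# LayoutLMv3 checkpoint, running OCR through pytesseract when only an image is supplied, and passing explicit
+# `words`/`boxes` (or `word_boxes`) to skip OCR entirely.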
+ +import unittest + +from transformers import MODEL_FOR_DOCUMENT_TOKEN_CLASSIFICATION_MAPPING, AutoTokenizer, AutoFeatureExtractor, is_vision_available, AutoConfig, AutoModelForDocumentTokenClassification +from transformers.pipelines import pipeline +from transformers.models.layoutlmv3.image_processing_layoutlmv3 import apply_tesseract as apply_ocr +from transformers.testing_utils import ( + nested_simplify, + require_pytesseract, + require_tf, + require_torch, + require_vision, + require_detectron2, + slow, +) + +from .test_pipelines_common import ANY, PipelineTestCaseMeta + + +if is_vision_available(): + from PIL import Image + + from transformers.image_utils import load_image +else: + + class Image: + @staticmethod + def open(*args, **kwargs): + pass + + def load_image(_): + return None + + +# This is a pinned image from a specific revision of a document question answering space, hosted by HuggingFace, +# so we can expect it to be available. +INVOICE_URL = ( + "https://huggingface.co/spaces/impira/docquery/resolve/2f6c96314dc84dfda62d40de9da55f2f5165d403/invoice.png" +) + + +@require_torch +@require_vision +class DocumentTokenClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): + model_mapping = MODEL_FOR_DOCUMENT_TOKEN_CLASSIFICATION_MAPPING + + + @require_pytesseract + @require_vision + def get_test_pipeline(self, model, tokenizer, feature_extractor): + dtc_pipeline = pipeline( + "document-token-classification", model=model, tokenizer=tokenizer, feature_extractor=feature_extractor + ) + + image = INVOICE_URL + word_boxes = list(zip(*apply_ocr(load_image(image), None, ""))) + examples = [ + { + "image": load_image(image), + }, + { + "image": image, + }, + { + "image": image, + "word_boxes": word_boxes, + }, + ] + return dtc_pipeline, examples + + def run_pipeline_test(self, dtc_pipeline, examples): + outputs = dtc_pipeline(examples) + self.assertEqual( + outputs, + [ + {"words": ANY(list), "word_labels": ANY(list), "boxes": ANY(list)} for _ in examples + ] + ) + + @require_torch + @require_pytesseract + def test_small_model_pt(self): + config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-LayoutLMv3ForTokenClassification") + config_ms= AutoConfig.from_pretrained("microsoft/layoutlmv3-base") + config.update(config_ms.to_dict()) + model = AutoModelForDocumentTokenClassification.from_config(config) + tokenizer = AutoTokenizer.from_pretrained( + "microsoft/layoutlmv3-base", revision="07c9b08", add_prefix_space=True + ) + feature_extractor = AutoFeatureExtractor.from_pretrained( + "microsoft/layoutlmv3-base", revision="07c9b08" + ) + dtc_pipeline = pipeline("document-token-classification", + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + ) + image = INVOICE_URL + outputs = dtc_pipeline(inputs=image) + self.assertEqual(len(outputs["words"]), 95) + self.assertEqual(len(outputs["word_labels"]), 95) + self.assertEqual(len(outputs["boxes"]), 95) + self.assertEqual(set(outputs["word_labels"]), set(['LABEL_0', 'LABEL_1'])) + + outputs = dtc_pipeline({"image": image}) + self.assertEqual(len(outputs["words"]), 95) + self.assertEqual(len(outputs["word_labels"]), 95) + self.assertEqual(len(outputs["boxes"]), 95) + self.assertEqual(set(outputs["word_labels"]), set(['LABEL_0', 'LABEL_1'])) + + # No text detected -> empty list + image = "./tests/fixtures/tests_samples/COCO/000000039769.png" + outputs = dtc_pipeline(inputs=image) + self.assertEqual(outputs["words"], []) + self.assertEqual(outputs["boxes"], []) + 
self.assertEqual(outputs["word_labels"], []) + + # We can pass the words and bounding boxes directly + image = "./tests/fixtures/tests_samples/COCO/000000039769.png" + words = [] + boxes = [] + outputs = dtc_pipeline({"image":image, "words":words, "boxes":boxes}) + self.assertEqual(outputs["words"], []) + self.assertEqual(outputs["boxes"], []) + self.assertEqual(outputs["word_labels"], []) + + +# @slow + @require_torch + @require_pytesseract + @require_vision + def test_large_model_pt_layoutlm(self): + dtc_pipeline = pipeline( + "document-token-classification", + model="Theivaprakasham/layoutlmv3-finetuned-invoice", + ) + image = INVOICE_URL + + outputs = dtc_pipeline(inputs=image) + self.assertEqual(len(outputs["words"]), 95) + self.assertEqual(len(outputs["word_labels"]), 95) + self.assertEqual(len(outputs["boxes"]), 95) + self.assertEqual(set(outputs["word_labels"]), {'B-BILLER_POST_CODE', 'B-BILLER', 'B-GST', 'O', 'B-TOTAL'}) + self.assertEqual(outputs["word_labels"].count("B-BILLER_POST_CODE"), 2) + self.assertEqual(outputs["word_labels"].count("B-BILLER"), 2) + self.assertEqual(outputs["word_labels"].count("B-GST"), 7) + self.assertEqual(outputs["word_labels"].count("O"), 80) + self.assertEqual(outputs["word_labels"].count("B-TOTAL"), 4) + + + outputs = dtc_pipeline({"image": image}) + self.assertEqual(len(outputs["words"]), 95) + self.assertEqual(len(outputs["word_labels"]), 95) + self.assertEqual(len(outputs["boxes"]), 95) + self.assertEqual(set(outputs["word_labels"]), {'B-BILLER_POST_CODE', 'B-BILLER', 'B-GST', 'O', 'B-TOTAL'}) + self.assertEqual(outputs["word_labels"].count("B-BILLER_POST_CODE"), 2) + self.assertEqual(outputs["word_labels"].count("B-BILLER"), 2) + self.assertEqual(outputs["word_labels"].count("B-GST"), 7) + self.assertEqual(outputs["word_labels"].count("O"), 80) + self.assertEqual(outputs["word_labels"].count("B-TOTAL"), 4) + + outputs = dtc_pipeline( + [{"image": image}, {"image": image}] + ) + self.assertEqual(len(outputs[0]["words"]), 95) + self.assertEqual(len(outputs[0]["word_labels"]), 95) + self.assertEqual(len(outputs[0]["boxes"]), 95) + self.assertEqual(set(outputs[0]["word_labels"]), {'B-BILLER_POST_CODE', 'B-BILLER', 'B-GST', 'O', 'B-TOTAL'}) + self.assertEqual(outputs[0]["word_labels"].count("B-BILLER_POST_CODE"), 2) + self.assertEqual(outputs[0]["word_labels"].count("B-BILLER"), 2) + self.assertEqual(outputs[0]["word_labels"].count("B-GST"), 7) + self.assertEqual(outputs[0]["word_labels"].count("O"), 80) + self.assertEqual(outputs[0]["word_labels"].count("B-TOTAL"), 4) + + self.assertEqual(len(outputs[1]["words"]), 95) + self.assertEqual(len(outputs[1]["word_labels"]), 95) + self.assertEqual(len(outputs[1]["boxes"]), 95) + self.assertEqual(set(outputs[1]["word_labels"]), {'B-BILLER_POST_CODE', 'B-BILLER', 'B-GST', 'O', 'B-TOTAL'}) + self.assertEqual(outputs[1]["word_labels"].count("B-BILLER_POST_CODE"), 2) + self.assertEqual(outputs[1]["word_labels"].count("B-BILLER"), 2) + self.assertEqual(outputs[1]["word_labels"].count("B-GST"), 7) + self.assertEqual(outputs[1]["word_labels"].count("O"), 80) + self.assertEqual(outputs[1]["word_labels"].count("B-TOTAL"), 4) + + @require_tf + @unittest.skip("Document Token Classification not implemented in TF") + def test_small_model_tf(self): + pass From d751d0f70513b13015b6a87d175f7f898f3955a5 Mon Sep 17 00:00:00 2001 From: Vaishak Kumar Date: Thu, 29 Dec 2022 11:54:03 -0800 Subject: [PATCH 2/6] remove model test changes --- .../layoutlmv2/test_modeling_layoutlmv2.py | 50 ++++++++----------- 
.../layoutlmv3/test_modeling_layoutlmv3.py | 41 +++++++-------- 2 files changed, 41 insertions(+), 50 deletions(-) diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py index c4aa2548c70a5b..3c38373163e496 100644 --- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py @@ -105,33 +105,6 @@ def __init__( self.scope = scope self.range_bbox = range_bbox - def get_config(self): - config = LayoutLMv2Config( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - image_feature_pool_shape=self.image_feature_pool_shape, - coordinate_size=self.coordinate_size, - shape_size=self.shape_size, - ) - - # use smaller resnet backbone to make tests faster - config.detectron2_config_args["MODEL.RESNETS.DEPTH"] = 18 - config.detectron2_config_args["MODEL.RESNETS.RES2_OUT_CHANNELS"] = 64 - config.detectron2_config_args["MODEL.RESNETS.NUM_GROUPS"] = 1 - - return config - - def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) @@ -167,7 +140,28 @@ def prepare_config_and_inputs(self): sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - config = self.get_config() + config = LayoutLMv2Config( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + image_feature_pool_shape=self.image_feature_pool_shape, + coordinate_size=self.coordinate_size, + shape_size=self.shape_size, + ) + + # use smaller resnet backbone to make tests faster + config.detectron2_config_args["MODEL.RESNETS.DEPTH"] = 18 + config.detectron2_config_args["MODEL.RESNETS.RES2_OUT_CHANNELS"] = 64 + config.detectron2_config_args["MODEL.RESNETS.NUM_GROUPS"] = 1 return config, input_ids, bbox, image, token_type_ids, input_mask, sequence_labels, token_labels diff --git a/tests/models/layoutlmv3/test_modeling_layoutlmv3.py b/tests/models/layoutlmv3/test_modeling_layoutlmv3.py index f0cbcf827d8912..d5c8d42d22177a 100644 --- a/tests/models/layoutlmv3/test_modeling_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_modeling_layoutlmv3.py @@ -113,25 +113,6 @@ def __init__( self.image_seq_length = (image_size // patch_size) ** 2 + 1 self.seq_length = self.text_seq_length + self.image_seq_length - def get_config(self): - return LayoutLMv3Config( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - 
hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - coordinate_size=self.coordinate_size, - shape_size=self.shape_size, - input_size=self.image_size, - patch_size=self.patch_size, - ) - def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.text_seq_length], self.vocab_size) @@ -163,11 +144,27 @@ def prepare_config_and_inputs(self): if self.use_labels: sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) token_labels = ids_tensor([self.batch_size, self.text_seq_length], self.num_labels) - - config = self.get_config() + + config = LayoutLMv3Config( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + coordinate_size=self.coordinate_size, + shape_size=self.shape_size, + input_size=self.image_size, + patch_size=self.patch_size, + ) return config, input_ids, bbox, pixel_values, token_type_ids, input_mask, sequence_labels, token_labels - + def create_and_check_model( self, config, input_ids, bbox, pixel_values, token_type_ids, input_mask, sequence_labels, token_labels ): From a64bc7d2c1b81f1d3ae747721a2e78418f1eb69e Mon Sep 17 00:00:00 2001 From: Vaishak Kumar Date: Thu, 29 Dec 2022 11:57:06 -0800 Subject: [PATCH 3/6] no support for models other than layoutlmv3 --- .../pipelines/document_token_classification.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/transformers/pipelines/document_token_classification.py b/src/transformers/pipelines/document_token_classification.py index fa550f4a181f8e..32375d10c6c9b1 100644 --- a/src/transformers/pipelines/document_token_classification.py +++ b/src/transformers/pipelines/document_token_classification.py @@ -47,7 +47,6 @@ class ModelType(ExplicitEnum): LayoutLMv3 = "layoutlmv3" - LayoutLMv2 = "layoutlmv2" class DocumentTokenClassificationArgumentHandler(ArgumentHandler): @@ -92,8 +91,6 @@ def __init__(self, args_parser=DocumentTokenClassificationArgumentHandler(), *ar self._args_parser = args_parser if self.model.config.model_type == "layoutlmv3": self.model_type = ModelType.LayoutLMv3 - elif self.model.config.model_type == "layoutlmv2": - self.model_type = ModelType.LayoutLMv2 else: raise ValueError(f"Model type {self.model.config.model_type} is not supported by this pipeline.") @@ -198,8 +195,8 @@ def preprocess(self, input, lang=None, tesseract_config="", **kwargs): if self.model_type == ModelType.LayoutLMv3: image_field = "pixel_values" - elif self.model_type == ModelType.LayoutLMv2: - image_field = "image" + else: + raise ValueError(f"Model type {self.model.config.model_type} is not supported by this pipeline.") encoded_inputs[image_field] = features.pop("pixel_values") # Fields that help with post-processing From 0f3ee86cdadad62cc9ff50daa7a43c2dea818608 Mon Sep 17 00:00:00 2001 From: Vaishak Kumar Date: Wed, 4 Jan 2023 10:22:55 -0800 Subject: [PATCH 4/6] Update src/transformers/pipelines/document_token_classification.py Co-authored-by: Evan 
Richards --- src/transformers/pipelines/document_token_classification.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/pipelines/document_token_classification.py b/src/transformers/pipelines/document_token_classification.py index 32375d10c6c9b1..c06e370e3918f4 100644 --- a/src/transformers/pipelines/document_token_classification.py +++ b/src/transformers/pipelines/document_token_classification.py @@ -173,8 +173,7 @@ def preprocess(self, input, lang=None, tesseract_config="", **kwargs): boxes = [x[1] for x in input["word_boxes"]] elif image is not None and not TESSERACT_LOADED: raise ValueError( - "If you provide an image without word_boxes, then the pipeline will run OCR using Tesseract," - " but pytesseract is not available" + "`word_boxes` not supplied and pytesseract not available to run OCR" ) else: self.image_processor.apply_ocr = True From 36b9901e97f839a7c78d1c8497401dee9e82aa36 Mon Sep 17 00:00:00 2001 From: Vaishak Kumar Date: Wed, 4 Jan 2023 10:24:33 -0800 Subject: [PATCH 5/6] Update test_pipelines_document_token_classification.py --- tests/pipelines/test_pipelines_document_token_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/test_pipelines_document_token_classification.py b/tests/pipelines/test_pipelines_document_token_classification.py index dac0a3af1f407d..76e4b3107a5d69 100644 --- a/tests/pipelines/test_pipelines_document_token_classification.py +++ b/tests/pipelines/test_pipelines_document_token_classification.py @@ -138,7 +138,7 @@ def test_small_model_pt(self): self.assertEqual(outputs["word_labels"], []) -# @slow + @slow @require_torch @require_pytesseract @require_vision From 82ccf1357b1a47354e55eb0cd51cc6eae9f14f9b Mon Sep 17 00:00:00 2001 From: Vaishak Kumar Date: Wed, 4 Jan 2023 11:26:28 -0800 Subject: [PATCH 6/6] Update document_token_classification.py --- src/transformers/pipelines/document_token_classification.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/pipelines/document_token_classification.py b/src/transformers/pipelines/document_token_classification.py index c06e370e3918f4..1b06652a7c8f57 100644 --- a/src/transformers/pipelines/document_token_classification.py +++ b/src/transformers/pipelines/document_token_classification.py @@ -57,9 +57,9 @@ class DocumentTokenClassificationArgumentHandler(ArgumentHandler): def __call__(self, inputs: Union[str, List[str]], **kwargs): if inputs is not None and isinstance(inputs, (list, tuple)) and len(inputs) > 0: - inputs = list(inputs) + return list(inputs) elif isinstance(inputs, str) or isinstance(inputs, Image.Image) or isinstance(inputs, dict): - inputs = [inputs] + return [inputs] elif Dataset is not None and isinstance(inputs, Dataset) or isinstance(inputs, types.GeneratorType): return inputs else: