Skip to content

Commit

Permalink
Add DocumentQuestionAnswering pipeline (huggingface#18414)
Browse files Browse the repository at this point in the history
* [WIP] Skeleton of VisualQuestionAnweringPipeline extended to support LayoutLM-like models

* Fixup

* Use the full encoding

* Basic refactoring to DocumentQuestionAnsweringPipeline

* Cleanup

* Improve args, docs, and implement preprocessing

* Integrate OCR

* Refactor question_answering pipeline

* Use refactored QA code in the document qa pipeline

* Fix tests

* Some small cleanups

* Use a string type annotation for Image.Image

* Update encoding with image features

* Wire through the basic docs

* Handle invalid response

* Handle empty word_boxes properly

* Docstring fix

* Integrate Donut model

* Fixup

* Incorporate comments

* Address comments

* Initial incorporation of tests

* Address Comments

* Change assert to ValueError

* Comments

* Wrap `score` in float to make it JSON serializable

* Incorporate AutoModeLForDocumentQuestionAnswering changes

* Fixup

* Rename postprocess function

* Fix auto import

* Applying comments

* Improve docs

* Remove extra assets and add copyright

* Address comments

Co-authored-by: Ankur Goyal <ankur@impira.com>
  • Loading branch information
2 people authored and oneraghavan committed Sep 26, 2022
1 parent 6671700 commit 71a2011
Show file tree
Hide file tree
Showing 18 changed files with 962 additions and 139 deletions.
7 changes: 7 additions & 0 deletions docs/source/en/main_classes/pipelines.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ There are two categories of pipeline abstractions to be aware about:
- [`AudioClassificationPipeline`]
- [`AutomaticSpeechRecognitionPipeline`]
- [`ConversationalPipeline`]
- [`DocumentQuestionAnsweringPipeline`]
- [`FeatureExtractionPipeline`]
- [`FillMaskPipeline`]
- [`ImageClassificationPipeline`]
Expand Down Expand Up @@ -342,6 +343,12 @@ That should enable you to do all the custom code you want.
- __call__
- all

### DocumentQuestionAnsweringPipeline

[[autodoc]] DocumentQuestionAnsweringPipeline
- __call__
- all

### FeatureExtractionPipeline

[[autodoc]] FeatureExtractionPipeline
Expand Down
8 changes: 8 additions & 0 deletions docs/source/en/model_doc/auto.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,10 @@ Likewise, if your `NewModel` is a subclass of [`PreTrainedModel`], make sure its

[[autodoc]] AutoModelForTableQuestionAnswering

## AutoModelForDocumentQuestionAnswering

[[autodoc]] AutoModelForDocumentQuestionAnswering

## AutoModelForImageClassification

[[autodoc]] AutoModelForImageClassification
Expand Down Expand Up @@ -214,6 +218,10 @@ Likewise, if your `NewModel` is a subclass of [`PreTrainedModel`], make sure its

[[autodoc]] TFAutoModelForTableQuestionAnswering

## TFAutoModelForDocumentQuestionAnswering

[[autodoc]] TFAutoModelForDocumentQuestionAnswering

## TFAutoModelForTokenClassification

[[autodoc]] TFAutoModelForTokenClassification
Expand Down
10 changes: 10 additions & 0 deletions src/transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,7 @@
"Conversation",
"ConversationalPipeline",
"CsvPipelineDataFormat",
"DocumentQuestionAnsweringPipeline",
"FeatureExtractionPipeline",
"FillMaskPipeline",
"ImageClassificationPipeline",
Expand Down Expand Up @@ -789,6 +790,7 @@
"MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING",
"MODEL_FOR_CAUSAL_LM_MAPPING",
"MODEL_FOR_CTC_MAPPING",
"MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING",
"MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
"MODEL_FOR_IMAGE_SEGMENTATION_MAPPING",
"MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING",
Expand Down Expand Up @@ -816,6 +818,7 @@
"AutoModelForAudioXVector",
"AutoModelForCausalLM",
"AutoModelForCTC",
"AutoModelForDocumentQuestionAnswering",
"AutoModelForImageClassification",
"AutoModelForImageSegmentation",
"AutoModelForInstanceSegmentation",
Expand Down Expand Up @@ -2107,6 +2110,7 @@
"TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING",
"TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING",
"TF_MODEL_FOR_PRETRAINING_MAPPING",
"TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING",
"TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING",
"TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING",
"TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
Expand All @@ -2124,6 +2128,7 @@
"TFAutoModelForMultipleChoice",
"TFAutoModelForNextSentencePrediction",
"TFAutoModelForPreTraining",
"TFAutoModelForDocumentQuestionAnswering",
"TFAutoModelForQuestionAnswering",
"TFAutoModelForSemanticSegmentation",
"TFAutoModelForSeq2SeqLM",
Expand Down Expand Up @@ -3200,6 +3205,7 @@
Conversation,
ConversationalPipeline,
CsvPipelineDataFormat,
DocumentQuestionAnsweringPipeline,
FeatureExtractionPipeline,
FillMaskPipeline,
ImageClassificationPipeline,
Expand Down Expand Up @@ -3549,6 +3555,7 @@
MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING,
MODEL_FOR_CAUSAL_LM_MAPPING,
MODEL_FOR_CTC_MAPPING,
MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING,
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
Expand Down Expand Up @@ -3576,6 +3583,7 @@
AutoModelForAudioXVector,
AutoModelForCausalLM,
AutoModelForCTC,
AutoModelForDocumentQuestionAnswering,
AutoModelForImageClassification,
AutoModelForImageSegmentation,
AutoModelForInstanceSegmentation,
Expand Down Expand Up @@ -4637,6 +4645,7 @@
)
from .models.auto import (
TF_MODEL_FOR_CAUSAL_LM_MAPPING,
TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING,
TF_MODEL_FOR_MASKED_LM_MAPPING,
Expand All @@ -4655,6 +4664,7 @@
TF_MODEL_WITH_LM_HEAD_MAPPING,
TFAutoModel,
TFAutoModelForCausalLM,
TFAutoModelForDocumentQuestionAnswering,
TFAutoModelForImageClassification,
TFAutoModelForMaskedLM,
TFAutoModelForMultipleChoice,
Expand Down
8 changes: 8 additions & 0 deletions src/transformers/models/auto/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
"MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING",
"MODEL_FOR_CAUSAL_LM_MAPPING",
"MODEL_FOR_CTC_MAPPING",
"MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING",
"MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
"MODEL_FOR_IMAGE_SEGMENTATION_MAPPING",
"MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING",
Expand Down Expand Up @@ -93,6 +94,7 @@
"AutoModelForVideoClassification",
"AutoModelForVision2Seq",
"AutoModelForVisualQuestionAnswering",
"AutoModelForDocumentQuestionAnswering",
"AutoModelWithLMHead",
]

Expand All @@ -111,6 +113,7 @@
"TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING",
"TF_MODEL_FOR_PRETRAINING_MAPPING",
"TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING",
"TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING",
"TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING",
"TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
"TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
Expand All @@ -127,6 +130,7 @@
"TFAutoModelForMultipleChoice",
"TFAutoModelForNextSentencePrediction",
"TFAutoModelForPreTraining",
"TFAutoModelForDocumentQuestionAnswering",
"TFAutoModelForQuestionAnswering",
"TFAutoModelForSemanticSegmentation",
"TFAutoModelForSeq2SeqLM",
Expand Down Expand Up @@ -191,6 +195,7 @@
MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING,
MODEL_FOR_CAUSAL_LM_MAPPING,
MODEL_FOR_CTC_MAPPING,
MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING,
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
Expand Down Expand Up @@ -218,6 +223,7 @@
AutoModelForAudioXVector,
AutoModelForCausalLM,
AutoModelForCTC,
AutoModelForDocumentQuestionAnswering,
AutoModelForImageClassification,
AutoModelForImageSegmentation,
AutoModelForInstanceSegmentation,
Expand Down Expand Up @@ -248,6 +254,7 @@
else:
from .modeling_tf_auto import (
TF_MODEL_FOR_CAUSAL_LM_MAPPING,
TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING,
TF_MODEL_FOR_MASKED_LM_MAPPING,
Expand All @@ -266,6 +273,7 @@
TF_MODEL_WITH_LM_HEAD_MAPPING,
TFAutoModel,
TFAutoModelForCausalLM,
TFAutoModelForDocumentQuestionAnswering,
TFAutoModelForImageClassification,
TFAutoModelForMaskedLM,
TFAutoModelForMultipleChoice,
Expand Down
22 changes: 22 additions & 0 deletions src/transformers/models/auto/modeling_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -603,6 +603,14 @@
]
)

MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
[
("layoutlm", "LayoutLMForQuestionAnswering"),
("layoutlmv2", "LayoutLMv2ForQuestionAnswering"),
("layoutlmv3", "LayoutLMv3ForQuestionAnswering"),
]
)

MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
[
# Model for Token Classification mapping
Expand Down Expand Up @@ -773,6 +781,9 @@
MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES
)
MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES
)
MODEL_FOR_MASKED_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_MASKED_LM_MAPPING_NAMES)
MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES
Expand Down Expand Up @@ -891,6 +902,17 @@ class AutoModelForVisualQuestionAnswering(_BaseAutoModelClass):
)


class AutoModelForDocumentQuestionAnswering(_BaseAutoModelClass):
_model_mapping = MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING


AutoModelForDocumentQuestionAnswering = auto_class_update(
AutoModelForDocumentQuestionAnswering,
head_doc="document question answering",
checkpoint_for_example='impira/layoutlm-document-qa", revision="3dc6de3',
)


class AutoModelForTokenClassification(_BaseAutoModelClass):
_model_mapping = MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING

Expand Down
21 changes: 21 additions & 0 deletions src/transformers/models/auto/modeling_tf_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,13 @@
]
)

TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
[
("layoutlm", "TFLayoutLMForQuestionAnswering"),
]
)


TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
[
# Model for Table Question Answering mapping
Expand Down Expand Up @@ -406,6 +413,9 @@
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES
)
TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES
)
TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES
)
Expand Down Expand Up @@ -515,6 +525,17 @@ class TFAutoModelForQuestionAnswering(_BaseAutoModelClass):
TFAutoModelForQuestionAnswering = auto_class_update(TFAutoModelForQuestionAnswering, head_doc="question answering")


class TFAutoModelForDocumentQuestionAnswering(_BaseAutoModelClass):
_model_mapping = TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING


TFAutoModelForDocumentQuestionAnswering = auto_class_update(
TFAutoModelForDocumentQuestionAnswering,
head_doc="document question answering",
checkpoint_for_example='impira/layoutlm-document-qa", revision="3dc6de3',
)


class TFAutoModelForTableQuestionAnswering(_BaseAutoModelClass):
_model_mapping = TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING

Expand Down
13 changes: 12 additions & 1 deletion src/transformers/pipelines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
infer_framework_load_model,
)
from .conversational import Conversation, ConversationalPipeline
from .document_question_answering import DocumentQuestionAnsweringPipeline
from .feature_extraction import FeatureExtractionPipeline
from .fill_mask import FillMaskPipeline
from .image_classification import ImageClassificationPipeline
Expand Down Expand Up @@ -109,6 +110,7 @@
AutoModelForAudioClassification,
AutoModelForCausalLM,
AutoModelForCTC,
AutoModelForDocumentQuestionAnswering,
AutoModelForImageClassification,
AutoModelForImageSegmentation,
AutoModelForMaskedLM,
Expand Down Expand Up @@ -215,6 +217,15 @@
},
"type": "multimodal",
},
"document-question-answering": {
"impl": DocumentQuestionAnsweringPipeline,
"pt": (AutoModelForDocumentQuestionAnswering,) if is_torch_available() else (),
"tf": (),
"default": {
"model": {"pt": ("impira/layoutlm-document-qa", "3a93017")},
},
"type": "multimodal",
},
"fill-mask": {
"impl": FillMaskPipeline,
"tf": (TFAutoModelForMaskedLM,) if is_tf_available() else (),
Expand Down Expand Up @@ -443,7 +454,7 @@ def pipeline(
trust_remote_code: Optional[bool] = None,
model_kwargs: Dict[str, Any] = None,
pipeline_class: Optional[Any] = None,
**kwargs
**kwargs,
) -> Pipeline:
"""
Utility factory method to build a [`Pipeline`].
Expand Down
4 changes: 2 additions & 2 deletions src/transformers/pipelines/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def infer_framework_load_model(
model_classes: Optional[Dict[str, Tuple[type]]] = None,
task: Optional[str] = None,
framework: Optional[str] = None,
**model_kwargs
**model_kwargs,
):
"""
Select framework (TensorFlow or PyTorch) to use from the `model` passed. Returns a tuple (framework, model).
Expand Down Expand Up @@ -274,7 +274,7 @@ def infer_framework_from_model(
model_classes: Optional[Dict[str, Tuple[type]]] = None,
task: Optional[str] = None,
framework: Optional[str] = None,
**model_kwargs
**model_kwargs,
):
"""
Select framework (TensorFlow or PyTorch) to use from the `model` passed. Returns a tuple (framework, model).
Expand Down
Loading

0 comments on commit 71a2011

Please sign in to comment.