arvindrajan92 · arvindrajan92 · Aug 2, 2024 · Jul 29, 2024 · Jul 29, 2024 · Jul 29, 2024
diff --git a/CITATION.cff b/CITATION.cff
@@ -0,0 +1,10 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it using the metadata below."
+authors:
+- family-names: "Rajan"
+  given-names: "Arvind"
+  orcid: "https://orcid.org/0000-0003-4829-5007"
+title: "A PyTorch implementation of DTrOCR: Decoder-only Transformer for Optical Character Recognition"
+repository-code: 'https://github.com/arvindrajan92/DTrOCR'
+date-released: 2024-07-13
+license: MIT
diff --git a/README.md b/README.md
@@ -1,11 +1,26 @@
 # DTrOCR
+![logo](logo.png)
+[![Python application](https://github.com/arvindrajan92/DTrOCR/actions/workflows/python-app.yml/badge.svg)](https://github.com/arvindrajan92/DTrOCR/actions/workflows/python-app.yml)
+[![CodeQL](https://github.com/arvindrajan92/DTrOCR/actions/workflows/github-code-scanning/codeql/badge.svg)](https://github.com/arvindrajan92/DTrOCR/actions/workflows/github-code-scanning/codeql)
+[![Python Versions](https://img.shields.io/badge/python-3.11-blue)](https://www.python.org/downloads/)
+[![License](https://img.shields.io/github/license/arvindrajan92/DTrOCR.svg)](https://github.com/arvindrajan92/DTrOCR/LICENSE)
+[![GitHub stars](https://img.shields.io/github/stars/arvindrajan92/DTrOCR?style=social)](https://github.com/arvindrajan92/DTrOCR)
+
 A PyTorch implementation of DTrOCR: Decoder-only Transformer for Optical Character Recognition.
 
 > [!NOTE]
 >
 > The author of this repository is not in any way affiliated to the author of the [DTrOCR paper](https://doi.org/10.48550/arXiv.2308.15996). This implementation is purely based on the published details of DTrOCR model architecture and its training.
 > 
-> Pre-trained weight for the model is currently not available as this is a personal project with limited resources.
+> Pre-trained weights for the model are not available at this time, as this is a personal project with limited resources.
+
+Below are the key differences between the original implementation (from the paper) and this implementation.
+
+|                                                              | Original implementation      | This implementation   |
+| ------------------------------------------------------------ | ---------------------------- | --------------------- |
+| Maximum token length<br />(including 128 image patch tokens) | 512                          | 256                   |
+| Language                                                     | English & Chinese            | English               |
+| Pre-training corpus (planned)                                | Scene, printed & handwritten | Printed & handwritten |
 
 ## Installation
 
@@ -28,6 +43,7 @@ config = DTrOCRConfig()
 model = DTrOCRLMHeadModel(config)
 processor = DTrOCRProcessor(DTrOCRConfig())
 
+model.eval()        # set model to evaluation mode for deterministic behaviour
 path_to_image = ""  # path to image file
 
 inputs = processor(
@@ -39,7 +55,8 @@ inputs = processor(
 model_output = model.generate(
     inputs=inputs, 
     processor=processor, 
-    num_beams=3  # defaults to 1 if not specified
+    num_beams=3,    # defaults to 1 if not specified
+    use_cache=True  # defaults to True if not specified
 )
 
 predicted_text = processor.tokeniser.decode(model_output[0], skip_special_tokens=True)

diff --git a/dtrocr/data.py b/dtrocr/data.py
@@ -8,13 +8,15 @@
 @dataclass
 class DTrOCRModelOutput:
     hidden_states: torch.FloatTensor
+    past_key_values: torch.FloatTensor  # NOTE(review): presumably the attention key/value cache reused when use_cache is enabled - confirm; if it is a nested tuple of tensors, this annotation is too narrow
 
 
 @dataclass
 class DTrOCRLMHeadModelOutput:
     logits: torch.FloatTensor
     loss: Optional[torch.FloatTensor] = None
     accuracy: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[torch.FloatTensor] = None  # NOTE(review): presumably the key/value cache forwarded from the base model output; None when absent - confirm against the forward pass
 
 
 @dataclass