DTrOCR-18 - Add support for batching (#19)
arvindrajan92 authored Aug 3, 2024
1 parent 7112177 commit 3d9cb3e
Showing 2 changed files with 35 additions and 39 deletions.
19 changes: 11 additions & 8 deletions README.md
````diff
@@ -10,17 +10,17 @@ A PyTorch implementation of DTrOCR: Decoder-only Transformer for Optical Character Recognition

 > [!NOTE]
 >
-> The author of this repository is not in any way affiliated to the author of the [DTrOCR paper](https://doi.org/10.48550/arXiv.2308.15996). This implementation is purely based on the published details of DTrOCR model architecture and its training.
+> The authors of this repository are not affiliated with the author of the [DTrOCR paper](https://doi.org/10.48550/arXiv.2308.15996). This implementation was developed independently, relying solely on the publicly available description of the DTrOCR model architecture and its training methodology, with a specific focus on printed and handwritten words.
 >
-> Pre-trained weight for the model is not available at this time as this is a personal project with limited resources.
+> Because this project began as a personal endeavor with limited resources, pre-trained weights for the model are not yet available. There is, however, an ongoing commitment to pre-train the model and release the weights publicly. For details on the project's development and future milestones, see the [project roadmap](https://github.com/users/arvindrajan92/projects/1).
-Below are the key differences between the original implementation (from the paper) and this implementation.
+The table below outlines the principal differences between the implementation described in the original paper and this implementation.

-| | Original implementation | This implementation |
-| ------------------------------------------------------------ | ---------------------------- | --------------------- |
-| Maximum token length<br />(including 128 image patch tokens) | 512 | 256 |
-| Language | English & Chinese | English |
-| Pre-training corpus (planned) | Scene, printed & handwritten | Printed & handwritten |
+| | Original implementation | Current implementation |
+| ------------------------------------------------------------ | ---------------------------- | ----------------------- |
+| Maximum token length<br />(including 128 image patch tokens) | 512 | 256 |
+| Supported language(s) | English & Chinese | English |
+| Pre-training corpus (planned) | Scene, printed & handwritten | Printed & handwritten |
 
 ## Installation
 
@@ -61,4 +61,7 @@ model_output = model.generate(
 
 predicted_text = processor.tokeniser.decode(model_output[0], skip_special_tokens=True)
 ```
+## Acknowledgments
+This project builds upon the original work presented in [DTrOCR: Decoder-only Transformer for Optical Character Recognition](https://doi.org/10.48550/arXiv.2308.15996) by Masato Fujitake, and we extend our gratitude for this significant contribution to the field.
 
+Additionally, we leverage the GPT-2 and Vision Transformer (ViT) implementations provided by Hugging Face, which have been instrumental in advancing this project's capabilities. Our sincere thanks go to the Hugging Face team for making such powerful tools accessible to the broader research community.
````
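Since the point of this commit is batching, a minimal end-to-end sketch of batched inference may be helpful alongside the README snippet above. It mirrors the processor and `generate` API exercised in the tests below; the `dtrocr.*` import paths, the blank placeholder images, and the batch size of 4 are illustrative assumptions, not part of the commit:

```python
import torch
from PIL import Image

# Assumed import paths, for illustration only; adjust to the actual package layout.
from dtrocr.config import DTrOCRConfig
from dtrocr.model import DTrOCRLMHeadModel
from dtrocr.processor import DTrOCRProcessor

config = DTrOCRConfig()
model = DTrOCRLMHeadModel(config)
model.eval()  # evaluation mode, so generation is deterministic

processor = DTrOCRProcessor(config)

# A batch of 4 images; blank canvases stand in for real word crops.
images = [Image.new("RGB", config.image_size[::-1]) for _ in range(4)]

inputs = processor(
    images=images,
    texts=[processor.tokeniser.bos_token for _ in images],
    return_tensors="pt",
)

with torch.no_grad():
    model_output = model.generate(
        inputs=inputs,
        processor=processor,
        num_beams=1,
        use_cache=True,
    )

# Decode every sequence in the batch, not just model_output[0].
predicted_texts = [
    processor.tokeniser.decode(sequence, skip_special_tokens=True)
    for sequence in model_output
]
print(predicted_texts)
```

As the caching test below asserts, `use_cache=True` should produce exactly the same tokens as `use_cache=False`, only faster.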
55 changes: 24 additions & 31 deletions tests/test_model.py
````diff
@@ -8,75 +8,68 @@
 from PIL import Image
 from dataclasses import asdict
 
+# global variables
+BATCH_SIZE = random.choice(range(1, 10))
+BEAM_SIZE = random.choice(range(1, 3))
+CONFIG = DTrOCRConfig()
+MODEL = DTrOCRLMHeadModel(CONFIG)
+
+# set model to evaluation mode
+MODEL.eval()
 
-def test_model():
-    batch_size = random.choice(range(1, 10))
-
-    config = DTrOCRConfig()
-    model = DTrOCRLMHeadModel(config)
-    processor = DTrOCRProcessor(config=config, add_bos_token=True, add_eos_token=True)
+def test_model():
+    processor = DTrOCRProcessor(config=CONFIG, add_bos_token=True, add_eos_token=True)
 
     inputs = processor(
-        images=[Image.new("RGB", config.image_size[::-1]) for _ in range(batch_size)],
-        texts=["This is a sentence" for _ in range(batch_size)],
+        images=[Image.new("RGB", CONFIG.image_size[::-1]) for _ in range(BATCH_SIZE)],
+        texts=["This is a sentence" for _ in range(BATCH_SIZE)],
         padding=True,
         return_tensors="pt",
        return_labels=True
     )
 
-    model_output = model(**asdict(inputs))
+    model_output = MODEL(**asdict(inputs))
 
     assert model_output.loss.shape == ()
     assert model_output.accuracy.shape == ()
     assert model_output.logits.shape == (
-        batch_size,
-        int(((config.image_size[0] / config.patch_size[0]) * (config.image_size[1] / config.patch_size[1]))) +
+        BATCH_SIZE,
+        int(((CONFIG.image_size[0] / CONFIG.patch_size[0]) * (CONFIG.image_size[1] / CONFIG.patch_size[1]))) +
         inputs.attention_mask.shape[1],
-        config.vocab_size
+        CONFIG.vocab_size
     )
 
 
 def test_generation_to_be_deterministic():
-    beam_size = random.choice(range(1, 3))
-
-    config = DTrOCRConfig()
-    model = DTrOCRLMHeadModel(config)
     processor = DTrOCRProcessor(DTrOCRConfig())
 
     inputs = processor(
-        images=Image.new("RGB", config.image_size[::-1]),
-        texts=processor.tokeniser.bos_token,
+        images=[Image.new("RGB", CONFIG.image_size[::-1]) for _ in range(BATCH_SIZE)],
+        texts=[processor.tokeniser.bos_token for _ in range(BATCH_SIZE)],
         return_tensors="pt"
     )
 
-    model.eval()
-
-    output_1 = model.generate(inputs=inputs, processor=processor, num_beams=beam_size, use_cache=False)
-    output_2 = model.generate(inputs=inputs, processor=processor, num_beams=beam_size, use_cache=False)
+    output_1 = MODEL.generate(inputs=inputs, processor=processor, num_beams=BEAM_SIZE, use_cache=False)
+    output_2 = MODEL.generate(inputs=inputs, processor=processor, num_beams=BEAM_SIZE, use_cache=False)
     assert torch.equal(output_1, output_2)
 
 
 def test_generation_with_and_without_caching():
-    beam_size = random.choice(range(1, 3))
-
-    config = DTrOCRConfig()
-    model = DTrOCRLMHeadModel(config)
     processor = DTrOCRProcessor(DTrOCRConfig())
 
     inputs = processor(
-        images=Image.new("RGB", config.image_size[::-1]),
-        texts=processor.tokeniser.bos_token,
+        images=[Image.new("RGB", CONFIG.image_size[::-1]) for _ in range(BATCH_SIZE)],
+        texts=[processor.tokeniser.bos_token for _ in range(BATCH_SIZE)],
         return_tensors="pt"
     )
 
-    model.eval()
-
     start_time = time.time()
-    output_without_cache = model.generate(inputs=inputs, processor=processor, num_beams=beam_size, use_cache=False)
+    output_without_cache = MODEL.generate(inputs=inputs, processor=processor, num_beams=BEAM_SIZE, use_cache=False)
     time_without_cache = time.time() - start_time
 
     start_time = time.time()
-    output_with_cache = model.generate(inputs=inputs, processor=processor, num_beams=beam_size, use_cache=True)
+    output_with_cache = MODEL.generate(inputs=inputs, processor=processor, num_beams=BEAM_SIZE, use_cache=True)
     time_with_cache = time.time() - start_time
 
     assert torch.equal(output_without_cache, output_with_cache)
````
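For a quick check of the updated suite, the tests can be invoked programmatically (a minimal sketch, assuming pytest is the test runner, as the bare `assert` style suggests):

```python
import pytest

# Run only the model tests; a zero exit code means every assertion passed.
raise SystemExit(pytest.main(["tests/test_model.py", "-v"]))
```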
