From 0153e2e8e1aef0be09cb5750ca31d47690d9f859 Mon Sep 17 00:00:00 2001
From: Manuel Gabteni <maniga3004@gmail.com>
Date: Wed, 15 May 2024 13:36:23 +0200
Subject: [PATCH] added splitting of sentences and translating of each
 sentence

---
 app/translation/translation.py | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/app/translation/translation.py b/app/translation/translation.py
index b2a74f4..c0dd9ad 100644
--- a/app/translation/translation.py
+++ b/app/translation/translation.py
@@ -1,5 +1,6 @@
 import os
 import json
+import re
 import torch
 import logging
 import sys
@@ -48,25 +49,35 @@ def translate_to_german(text, src_lang=None):
         if src_lang is None:
             return json.dumps({"error": "Could not detect language"}, ensure_ascii=False)
     
-    # Set the target language
-    tokenizer.src_lang = src_lang  # Default to English, model will auto-detect if wrong
+    # Set the source and target languages
+    tokenizer.src_lang = src_lang
     tokenizer.tgt_lang = "de"
 
-    # Encode the text and move tensors to the appropriate device
-    encoded = tokenizer(text, return_tensors="pt").to(device)
+    logger.info(f"Translating from {src_lang} to de")
 
-    # Generate translation
-    generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id("de"))
+    # Split text into sentences
+    sentences = re.split(r'(?<=[.!?]) +', text)
 
-    # Decode and return the translation
-    translation = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
+    # Translate each sentence
+    translations = []
+    for sentence in sentences:
+        encoded = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to(device)
+        generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id("de"))
+        translation = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
+        translations.append(translation)
+    
+    # Combine all translations into a single string
+    full_translation = ' '.join(translations)
+
+    # Log the translation for debugging
+    logger.info(f"Translation: {full_translation}")
 
     # Create a JSON object with the result
     result = {
         "source_language": src_lang,
         "target_language": "de",
         "original_text": text,
-        "translated_text": translation
+        "translated_text": full_translation
     }
 
     return json.dumps(result, ensure_ascii=False)