From e9970df690b668ca6324128a5bce0da141649f79 Mon Sep 17 00:00:00 2001
From: Ahmed Abdelali
Date: Tue, 13 Jun 2023 17:37:15 +0300
Subject: [PATCH 1/5] Add diacritization module

---
 .../datasets/ArabicDiacritization.py          |  47 ++++++
 arabic_llm_benchmark/datasets/__init__.py     |   1 +
 .../tasks/ArabicDiacritization.py             | 136 ++++++++++++++++++
 arabic_llm_benchmark/tasks/__init__.py        |   1 +
 .../diacritization_ChatGPT_ZeroShot.py        |  43 ++++++
 5 files changed, 228 insertions(+)
 create mode 100644 arabic_llm_benchmark/datasets/ArabicDiacritization.py
 create mode 100644 arabic_llm_benchmark/tasks/ArabicDiacritization.py
 create mode 100644 assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py

diff --git a/arabic_llm_benchmark/datasets/ArabicDiacritization.py b/arabic_llm_benchmark/datasets/ArabicDiacritization.py
new file mode 100644
index 00000000..9065d13e
--- /dev/null
+++ b/arabic_llm_benchmark/datasets/ArabicDiacritization.py
@@ -0,0 +1,47 @@
+from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+
+
+class ArabicDiacritizationDataset(DatasetBase):
+    def __init__(self, **kwargs):
+        super(ArabicDiacritizationDataset, self).__init__(**kwargs)
+
+    def citation(self):
+        return """@article{10.1145/3434235,
+            author = {Darwish, Kareem and Abdelali, Ahmed and Mubarak, Hamdy and Eldesouki, Mohamed},
+            title = {Arabic Diacritic Recovery Using a Feature-Rich BiLSTM Model},
+            year = {2021},
+            issue_date = {March 2021},
+            publisher = {Association for Computing Machinery},
+            address = {New York, NY, USA},
+            volume = {20},
+            number = {2},
+            issn = {2375-4699},
+            url = {https://doi.org/10.1145/3434235},
+            doi = {10.1145/3434235},
+            journal = {ACM Trans. Asian Low-Resour. Lang. Inf. Process.},
+            month = {apr},
+            articleno = {33},
+            numpages = {18},
+        }"""
+
+    def get_data_sample(self):
+        return {
+            "input": "Original sentence",
+            "label": "Sentence with segmented words",
+        }
+
+    def load_data(self, data_path, no_labels=False):
+        # TODO: modify to iterator
+        data = []
+
+        with open(data_path, "r") as fp:
+            for line_idx, line in enumerate(fp):
+                data.append(
+                    {
+                        "input": line.split("\t")[0],
+                        "label": line.split("\t")[1],
+                        "line_number": line_idx,
+                    }
+                )
+
+        return data
diff --git a/arabic_llm_benchmark/datasets/__init__.py b/arabic_llm_benchmark/datasets/__init__.py
index cebc7a5b..d5cf8fcd 100644
--- a/arabic_llm_benchmark/datasets/__init__.py
+++ b/arabic_llm_benchmark/datasets/__init__.py
@@ -3,6 +3,7 @@
 from .Aqmar import AqmarDataset
 from .AraBench import AraBenchDataset
 from .ArabGend import ArabGendDataset
+from .ArabicDiacritization import ArabicDiacritizationDataset
 from .ArabicSegmentation import ArabicSegmentationDataset
 from .ArapTweet import ArapTweetDataset
 from .ARCD import ARCDDataset
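Note: load_data() above expects one example per line, with the undiacritized sentence and its diacritized reference separated by a tab. A minimal self-contained sketch of that format and of the tab-split-and-strip parsing this series converges on in PATCH 5 (the sentence pair is illustrative, not taken from the WikiNews data):

import io

# One "undiacritized<TAB>diacritized" pair per line; the sentences are made up.
toy_file = io.StringIO("ذهب الولد\tذَهَبَ الوَلَدُ\n")

data = []
for line_idx, line in enumerate(toy_file):
    text, diacritized_text = line.rstrip("\n").split("\t")
    data.append({"input": text, "label": diacritized_text, "line_number": line_idx})

print(data[0]["input"])  # ذهب الولد
print(data[0]["label"])  # ذَهَبَ الوَلَدُ
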
diff --git a/arabic_llm_benchmark/tasks/ArabicDiacritization.py b/arabic_llm_benchmark/tasks/ArabicDiacritization.py
new file mode 100644
index 00000000..1eed7190
--- /dev/null
+++ b/arabic_llm_benchmark/tasks/ArabicDiacritization.py
@@ -0,0 +1,136 @@
+import re
+
+from sklearn.metrics import f1_score
+
+from arabic_llm_benchmark.tasks.task_base import TaskBase
+
+
+#
+# repo: https://pyzone.dev/word-error-rate-in-python
+#
+def wer(ref, hyp, debug=True):
+    r = ref
+    h = hyp
+    # costs will hold the costs, as in the Levenshtein distance algorithm
+    costs = [[0 for inner in range(len(h) + 1)] for outer in range(len(r) + 1)]
+    # backtrace will hold the operations we've done,
+    # so we can later backtrace, as the WER algorithm requires.
+    backtrace = [[0 for inner in range(len(h) + 1)] for outer in range(len(r) + 1)]
+
+    OP_OK = 0
+    OP_SUB = 1
+    OP_INS = 2
+    OP_DEL = 3
+    DEL_PENALTY = 1
+    INS_PENALTY = 1
+    SUB_PENALTY = 1
+
+    # First column represents the case where we achieve zero
+    # hypothesis words by deleting all reference words.
+    for i in range(1, len(r) + 1):
+        costs[i][0] = DEL_PENALTY * i
+        backtrace[i][0] = OP_DEL
+
+    # First row represents the case where we achieve the hypothesis
+    # by inserting all hypothesis words into a zero-length reference.
+    for j in range(1, len(h) + 1):
+        costs[0][j] = INS_PENALTY * j
+        backtrace[0][j] = OP_INS
+
+    # computation
+    for i in range(1, len(r) + 1):
+        for j in range(1, len(h) + 1):
+            if r[i - 1] == h[j - 1]:
+                costs[i][j] = costs[i - 1][j - 1]
+                backtrace[i][j] = OP_OK
+            else:
+                substitutionCost = (
+                    costs[i - 1][j - 1] + SUB_PENALTY
+                )  # penalty is always 1
+                insertionCost = costs[i][j - 1] + INS_PENALTY  # penalty is always 1
+                deletionCost = costs[i - 1][j] + DEL_PENALTY  # penalty is always 1
+
+                costs[i][j] = min(substitutionCost, insertionCost, deletionCost)
+                if costs[i][j] == substitutionCost:
+                    backtrace[i][j] = OP_SUB
+                elif costs[i][j] == insertionCost:
+                    backtrace[i][j] = OP_INS
+                else:
+                    backtrace[i][j] = OP_DEL
+
+    # back trace through the best route:
+    i = len(r)
+    j = len(h)
+    numSub = 0
+    numDel = 0
+    numIns = 0
+    numCor = 0
+    if debug:
+        print("OP\tREF\tHYP")
+        lines = []
+    while i > 0 or j > 0:
+        if backtrace[i][j] == OP_OK:
+            numCor += 1
+            i -= 1
+            j -= 1
+            if debug:
+                lines.append("OK\t" + r[i] + "\t" + h[j])
+        elif backtrace[i][j] == OP_SUB:
+            numSub += 1
+            i -= 1
+            j -= 1
+            if debug:
+                lines.append("SUB\t" + r[i] + "\t" + h[j])
+        elif backtrace[i][j] == OP_INS:
+            numIns += 1
+            j -= 1
+            if debug:
+                lines.append("INS\t" + "****" + "\t" + h[j])
+        elif backtrace[i][j] == OP_DEL:
+            numDel += 1
+            i -= 1
+            if debug:
+                lines.append("DEL\t" + r[i] + "\t" + "****")
+    if debug:
+        lines = reversed(lines)
+        for line in lines:
+            print(line)
+        print("#cor " + str(numCor))
+        print("#sub " + str(numSub))
+        print("#del " + str(numDel))
+        print("#ins " + str(numIns))
+    # return (numSub + numDel + numIns) / (float) (len(r))
+    wer_result = round((numSub + numDel + numIns) / float(len(r)), 3)
+    if debug:
+        return {
+            "WER": wer_result,
+            "numCor": numCor,
+            "numSub": numSub,
+            "numIns": numIns,
+            "numDel": numDel,
+            "numCount": len(r),
+        }
+    else:
+        return {"WER": wer_result}
+
+
+class ArabicDiacritizationTask(TaskBase):
+    def __init__(self, **kwargs):
+        super(ArabicDiacritizationTask, self).__init__(**kwargs)
+
+    def evaluate(self, true_labels, predicted_labels):
+        # split sentence into words
+        hyp = []
+        ref = []
+        for t, p in zip(true_labels, predicted_labels):
+            t = t.split()
+            if p is None:
+                p = ["UNK"] * len(t)
+            else:
+                p = p.split()
+            if len(p) < len(t):
+                for i in range(len(t) - len(p)):
+                    hyp.append("")
+            hyp += p[: len(t)]
+            ref += t
+        return wer(ref, hyp, False)
diff --git a/arabic_llm_benchmark/tasks/__init__.py b/arabic_llm_benchmark/tasks/__init__.py
index 0010153d..4cabd0d1 100644
--- a/arabic_llm_benchmark/tasks/__init__.py
+++ b/arabic_llm_benchmark/tasks/__init__.py
@@ -1,4 +1,5 @@
 from .Adult import AdultTask
+from .ArabicDiacritization import ArabicDiacritizationTask
 from .ArabicSegmentation import ArabicSegmentationTask
 from .Attentionworthy import AttentionworthyTask
 from .Checkworthiness import CheckworthinessTask
diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py
new file mode 100644
index 00000000..d1ba05d9
--- /dev/null
+++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py
@@ -0,0 +1,43 @@
+import os
+
+from arabic_llm_benchmark.datasets import ArabicDiacritizationDataset
+from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
+from arabic_llm_benchmark.tasks import ArabicDiacritizationTask
+
+
+def config():
+    return {
+        "dataset": ArabicDiacritizationDataset,
+        "dataset_args": {},
+        "task": ArabicDiacritizationTask,
+        "task_args": {},
+        "model": GPTModel,
+        "model_args": {
+            "api_type": "azure",
+            "api_version": "2023-03-15-preview",
+            "api_base": os.environ["AZURE_API_URL"],
+            "api_key": os.environ["AZURE_API_KEY"],
+            "engine_name": "gpt",
+            # "class_labels": ["m", "f"],
+            "max_tries": 3,
+        },
+        "general_args": {
+            "data_path": "data/sequence_tagging_ner_pos_etc/diacritization/WikiNewsTruth.txt"
+        },
+    }
+
+
+def prompt(input_sample):
+    return {
+        "system_message": "You are an AI assistant that helps people find information.",
+        "messages": [
+            {
+                "sender": "user",
+                "text": f"Diacritize fully the following Arabic sentence: {input_sample}",
+            }
+        ],
+    }
+
+
+def post_process(response):
+    return response["choices"][0]["text"]
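Note: the wer() helper above computes word-level Levenshtein alignment counts over the flattened token lists that evaluate() builds. A quick sanity check on toy tokens (a sketch; assumes the repository is on sys.path so the module added above is importable):

from arabic_llm_benchmark.tasks.ArabicDiacritization import wer

ref = ["a", "b", "c", "d"]  # reference tokens
hyp = ["a", "x", "c", "d"]  # hypothesis with one substitution
print(wer(ref, hyp, False))  # {'WER': 0.25}, i.e. 1 error over 4 reference tokens
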
From 825eefc45c7ba3acf36fa2ae010d510d5fd3d424 Mon Sep 17 00:00:00 2001
From: Ahmed Abdelali
Date: Mon, 19 Jun 2023 16:54:45 +0300
Subject: [PATCH 2/5] Update ArabicDiacritization.py

Use undiacritized tokens as fallback for None results.
---
 arabic_llm_benchmark/tasks/ArabicDiacritization.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arabic_llm_benchmark/tasks/ArabicDiacritization.py b/arabic_llm_benchmark/tasks/ArabicDiacritization.py
index 1eed7190..a001ecac 100644
--- a/arabic_llm_benchmark/tasks/ArabicDiacritization.py
+++ b/arabic_llm_benchmark/tasks/ArabicDiacritization.py
@@ -123,11 +123,11 @@ def evaluate(self, true_labels, predicted_labels):
         hyp = []
         ref = []
         for t, p in zip(true_labels, predicted_labels):
-            t = t.split()
             if p is None:
-                p = ["UNK"] * len(t)
+                p = re.sub(r'[ًٌٍَُِّْ]','',t).split()
             else:
                 p = p.split()
+            t = t.split()
             if len(p) < len(t):
                 for i in range(len(t) - len(p)):
                     hyp.append("")

From 05d0629c36f8899f0589fd94f5304b58144c0b1b Mon Sep 17 00:00:00 2001
From: Fahim Imaduddin Dalvi
Date: Wed, 21 Jun 2023 10:07:49 +0300
Subject: [PATCH 3/5] Format code

---
 arabic_llm_benchmark/tasks/ArabicDiacritization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arabic_llm_benchmark/tasks/ArabicDiacritization.py b/arabic_llm_benchmark/tasks/ArabicDiacritization.py
index a001ecac..3c69d4a7 100644
--- a/arabic_llm_benchmark/tasks/ArabicDiacritization.py
+++ b/arabic_llm_benchmark/tasks/ArabicDiacritization.py
@@ -124,7 +124,7 @@ def evaluate(self, true_labels, predicted_labels):
         ref = []
         for t, p in zip(true_labels, predicted_labels):
             if p is None:
-                p = re.sub(r'[ًٌٍَُِّْ]','',t).split()
+                p = re.sub(r"[ًٌٍَُِّْ]", "", t).split()
             else:
                 p = p.split()
             t = t.split()
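Note: the fallback introduced in PATCH 2 scores a failed prediction as if the model had returned the input unchanged, by stripping all diacritics from the gold sentence. The literal character class in the patch covers the eight Arabic diacritic marks (fathatan through sukun), which are contiguous in Unicode, so an equivalent and more readable spelling uses an escape range (a sketch; the example phrase is illustrative):

import re

# U+064B..U+0652 spans the tanween marks, the short vowels, shadda and sukun,
# i.e. the same eight marks as the literal character class in the patch.
DIACRITICS = re.compile(r"[\u064B-\u0652]")

print(DIACRITICS.sub("", "ذَهَبَ الوَلَدُ"))  # ذهب الولد
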
From d1849e76e419058b87196472f8f2c9025b52d685 Mon Sep 17 00:00:00 2001
From: Fahim Imaduddin Dalvi
Date: Wed, 21 Jun 2023 10:12:41 +0300
Subject: [PATCH 4/5] Add comments and minor fixes

---
 arabic_llm_benchmark/datasets/ArabicDiacritization.py | 3 +--
 arabic_llm_benchmark/tasks/ArabicDiacritization.py    | 9 ++++++++-
 .../diacritization_ChatGPT_ZeroShot.py                | 5 ++---
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/arabic_llm_benchmark/datasets/ArabicDiacritization.py b/arabic_llm_benchmark/datasets/ArabicDiacritization.py
index 9065d13e..6f0b05fd 100644
--- a/arabic_llm_benchmark/datasets/ArabicDiacritization.py
+++ b/arabic_llm_benchmark/datasets/ArabicDiacritization.py
@@ -27,11 +27,10 @@ def citation(self):
     def get_data_sample(self):
         return {
             "input": "Original sentence",
-            "label": "Sentence with segmented words",
+            "label": "Sentence with diacritized words",
         }
 
     def load_data(self, data_path, no_labels=False):
-        # TODO: modify to iterator
         data = []
 
         with open(data_path, "r") as fp:
diff --git a/arabic_llm_benchmark/tasks/ArabicDiacritization.py b/arabic_llm_benchmark/tasks/ArabicDiacritization.py
index 3c69d4a7..5ff83594 100644
--- a/arabic_llm_benchmark/tasks/ArabicDiacritization.py
+++ b/arabic_llm_benchmark/tasks/ArabicDiacritization.py
@@ -119,18 +119,25 @@ def __init__(self, **kwargs):
         super(ArabicDiacritizationTask, self).__init__(**kwargs)
 
     def evaluate(self, true_labels, predicted_labels):
-        # split sentence into words
+        # Flatten sentences into a long list of words
        hyp = []
         ref = []
         for t, p in zip(true_labels, predicted_labels):
             if p is None:
+                # Use undiacritized word in case of prediction failure
                 p = re.sub(r"[ًٌٍَُِّْ]", "", t).split()
             else:
                 p = p.split()
+
             t = t.split()
+
+            # If prediction is missing tokens, pad with empty tokens
             if len(p) < len(t):
                 for i in range(len(t) - len(p)):
                     hyp.append("")
+
+            # If prediction has extra tokens, only consider the first
+            # N tokens, where N == number of gold tokens
             hyp += p[: len(t)]
             ref += t
         return wer(ref, hyp, False)
diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py
index d1ba05d9..cf8637a7 100644
--- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py
@@ -1,7 +1,7 @@
 import os
 
 from arabic_llm_benchmark.datasets import ArabicDiacritizationDataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
+from arabic_llm_benchmark.models import GPTModel
 from arabic_llm_benchmark.tasks import ArabicDiacritizationTask
 
 
@@ -17,8 +17,7 @@ def config():
             "api_version": "2023-03-15-preview",
             "api_base": os.environ["AZURE_API_URL"],
             "api_key": os.environ["AZURE_API_KEY"],
-            "engine_name": "gpt",
-            # "class_labels": ["m", "f"],
+            "engine_name": os.environ["ENGINE_NAME"],
             "max_tries": 3,
         },
         "general_args": {

From ffe3c615e16c4b0623ff3174b675f466e50c6696 Mon Sep 17 00:00:00 2001
From: Fahim Imaduddin Dalvi
Date: Wed, 21 Jun 2023 10:51:13 +0300
Subject: [PATCH 5/5] More fixes to dataloader

---
 arabic_llm_benchmark/datasets/ArabicDiacritization.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arabic_llm_benchmark/datasets/ArabicDiacritization.py b/arabic_llm_benchmark/datasets/ArabicDiacritization.py
index 6f0b05fd..2d43b14c 100644
--- a/arabic_llm_benchmark/datasets/ArabicDiacritization.py
+++ b/arabic_llm_benchmark/datasets/ArabicDiacritization.py
@@ -35,10 +35,11 @@ def load_data(self, data_path, no_labels=False):
 
         with open(data_path, "r") as fp:
             for line_idx, line in enumerate(fp):
+                text, diacritized_text = line.split("\t")
                 data.append(
                     {
-                        "input": line.split("\t")[0],
-                        "label": line.split("\t")[1],
+                        "input": text.strip(),
+                        "label": diacritized_text.strip(),
                         "line_number": line_idx,
                     }
                 )
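Note: for reference, this is how the asset's prompt() and post_process() hooks fit together for a single sample. The direct import and the mocked completion-style response below are illustrative assumptions, not the benchmark's actual driver code:

import diacritization_ChatGPT_ZeroShot as asset  # assumes the asset file is on sys.path

# Build the zero-shot request for one undiacritized sentence.
request = asset.prompt("ذهب الولد")
print(request["messages"][0]["text"])
# Diacritize fully the following Arabic sentence: ذهب الولد

# post_process() pulls the completion text out of the response dict;
# the response here is mocked rather than coming from the Azure API.
mock_response = {"choices": [{"text": "ذَهَبَ الوَلَدُ"}]}
print(asset.post_process(mock_response))  # ذَهَبَ الوَلَدُ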