diff --git a/arabic_llm_benchmark/datasets/ArabicDiacritization.py b/arabic_llm_benchmark/datasets/ArabicDiacritization.py
new file mode 100644
index 00000000..2d43b14c
--- /dev/null
+++ b/arabic_llm_benchmark/datasets/ArabicDiacritization.py
@@ -0,0 +1,49 @@
+from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+
+
+class ArabicDiacritizationDataset(DatasetBase):
+    def __init__(self, **kwargs):
+        super(ArabicDiacritizationDataset, self).__init__(**kwargs)
+
+    def citation(self):
+        return """@article{10.1145/3434235,
+            author = {Darwish, Kareem and Abdelali, Ahmed and Mubarak, Hamdy and Eldesouki, Mohamed},
+            title = {Arabic Diacritic Recovery Using a Feature-Rich BiLSTM Model},
+            year = {2021},
+            issue_date = {March 2021},
+            publisher = {Association for Computing Machinery},
+            address = {New York, NY, USA},
+            volume = {20},
+            number = {2},
+            issn = {2375-4699},
+            url = {https://doi.org/10.1145/3434235},
+            doi = {10.1145/3434235},
+            journal = {ACM Trans. Asian Low-Resour. Lang. Inf. Process.},
+            month = {apr},
+            articleno = {33},
+            numpages = {18},
+        }"""
+
+    def get_data_sample(self):
+        return {
+            "input": "Original sentence",
+            "label": "Sentence with diacritized words",
+        }
+
+    def load_data(self, data_path, no_labels=False):
+        data = []
+
+        # Each line holds an undiacritized sentence and its diacritized
+        # counterpart, separated by a tab.
+        with open(data_path, "r", encoding="utf-8") as fp:
+            for line_idx, line in enumerate(fp):
+                text, diacritized_text = line.split("\t")
+                data.append(
+                    {
+                        "input": text.strip(),
+                        "label": diacritized_text.strip(),
+                        "line_number": line_idx,
+                    }
+                )
+
+        return data
diff --git a/arabic_llm_benchmark/datasets/__init__.py b/arabic_llm_benchmark/datasets/__init__.py
index cebc7a5b..d5cf8fcd 100644
--- a/arabic_llm_benchmark/datasets/__init__.py
+++ b/arabic_llm_benchmark/datasets/__init__.py
@@ -3,6 +3,7 @@
 from .Aqmar import AqmarDataset
 from .AraBench import AraBenchDataset
 from .ArabGend import ArabGendDataset
+from .ArabicDiacritization import ArabicDiacritizationDataset
 from .ArabicSegmentation import ArabicSegmentationDataset
 from .ArapTweet import ArapTweetDataset
 from .ARCD import ARCDDataset
diff --git a/arabic_llm_benchmark/tasks/ArabicDiacritization.py b/arabic_llm_benchmark/tasks/ArabicDiacritization.py
new file mode 100644
index 00000000..5ff83594
--- /dev/null
+++ b/arabic_llm_benchmark/tasks/ArabicDiacritization.py
@@ -0,0 +1,140 @@
+import re
+
+from arabic_llm_benchmark.tasks.task_base import TaskBase
+
+
+#
+# repo: https://pyzone.dev/word-error-rate-in-python
+#
+def wer(ref, hyp, debug=True):
+    r = ref
+    h = hyp
+    # costs will hold the costs, as in the Levenshtein distance algorithm
+    costs = [[0 for inner in range(len(h) + 1)] for outer in range(len(r) + 1)]
+    # backtrace will hold the operations we've done,
+    # so we can later backtrace, as the WER algorithm requires.
+    backtrace = [[0 for inner in range(len(h) + 1)] for outer in range(len(r) + 1)]
+
+    OP_OK = 0
+    OP_SUB = 1
+    OP_INS = 2
+    OP_DEL = 3
+    DEL_PENALTY = 1
+    INS_PENALTY = 1
+    SUB_PENALTY = 1
+
+    # First column represents the case where we achieve zero
+    # hypothesis words by deleting all reference words.
+    for i in range(1, len(r) + 1):
+        costs[i][0] = DEL_PENALTY * i
+        backtrace[i][0] = OP_DEL
+
+    # First row represents the case where we achieve the hypothesis
+    # by inserting all hypothesis words into a zero-length reference.
+    for j in range(1, len(h) + 1):
+        costs[0][j] = INS_PENALTY * j
+        backtrace[0][j] = OP_INS
+
+    # computation
+    for i in range(1, len(r) + 1):
+        for j in range(1, len(h) + 1):
+            if r[i - 1] == h[j - 1]:
+                costs[i][j] = costs[i - 1][j - 1]
+                backtrace[i][j] = OP_OK
+            else:
+                substitutionCost = (
+                    costs[i - 1][j - 1] + SUB_PENALTY
+                )  # penalty is always 1
+                insertionCost = costs[i][j - 1] + INS_PENALTY  # penalty is always 1
+                deletionCost = costs[i - 1][j] + DEL_PENALTY  # penalty is always 1
+
+                costs[i][j] = min(substitutionCost, insertionCost, deletionCost)
+                if costs[i][j] == substitutionCost:
+                    backtrace[i][j] = OP_SUB
+                elif costs[i][j] == insertionCost:
+                    backtrace[i][j] = OP_INS
+                else:
+                    backtrace[i][j] = OP_DEL
+
+    # backtrace through the best route:
+    i = len(r)
+    j = len(h)
+    numSub = 0
+    numDel = 0
+    numIns = 0
+    numCor = 0
+    if debug:
+        print("OP\tREF\tHYP")
+        lines = []
+    while i > 0 or j > 0:
+        if backtrace[i][j] == OP_OK:
+            numCor += 1
+            i -= 1
+            j -= 1
+            if debug:
+                lines.append("OK\t" + r[i] + "\t" + h[j])
+        elif backtrace[i][j] == OP_SUB:
+            numSub += 1
+            i -= 1
+            j -= 1
+            if debug:
+                lines.append("SUB\t" + r[i] + "\t" + h[j])
+        elif backtrace[i][j] == OP_INS:
+            numIns += 1
+            j -= 1
+            if debug:
+                lines.append("INS\t" + "****" + "\t" + h[j])
+        elif backtrace[i][j] == OP_DEL:
+            numDel += 1
+            i -= 1
+            if debug:
+                lines.append("DEL\t" + r[i] + "\t" + "****")
+    if debug:
+        lines = reversed(lines)
+        for line in lines:
+            print(line)
+        print("#cor " + str(numCor))
+        print("#sub " + str(numSub))
+        print("#del " + str(numDel))
+        print("#ins " + str(numIns))
+    wer_result = round((numSub + numDel + numIns) / float(len(r)), 3)
+    if debug:
+        return {
+            "WER": wer_result,
+            "numCor": numCor,
+            "numSub": numSub,
+            "numIns": numIns,
+            "numDel": numDel,
+            "numCount": len(r),
+        }
+    else:
+        return {"WER": wer_result}
+
+
+class ArabicDiacritizationTask(TaskBase):
+    def __init__(self, **kwargs):
+        super(ArabicDiacritizationTask, self).__init__(**kwargs)
+
+    def evaluate(self, true_labels, predicted_labels):
+        # Flatten sentences into a long list of words
+        hyp = []
+        ref = []
+        for t, p in zip(true_labels, predicted_labels):
+            if p is None:
+                # Use the undiacritized words in case of prediction failure
+                p = re.sub(r"[ًٌٍَُِّْ]", "", t).split()
+            else:
+                p = p.split()
+
+            t = t.split()
+
+            # If the prediction is missing tokens, pad it with empty tokens
+            if len(p) < len(t):
+                for i in range(len(t) - len(p)):
+                    hyp.append("")
+
+            # If the prediction has extra tokens, only consider the first
+            # N tokens, where N == number of gold tokens
+            hyp += p[: len(t)]
+            ref += t
+        return wer(ref, hyp, False)
diff --git a/arabic_llm_benchmark/tasks/__init__.py b/arabic_llm_benchmark/tasks/__init__.py
index 0010153d..4cabd0d1 100644
--- a/arabic_llm_benchmark/tasks/__init__.py
+++ b/arabic_llm_benchmark/tasks/__init__.py
@@ -1,4 +1,5 @@
 from .Adult import AdultTask
+from .ArabicDiacritization import ArabicDiacritizationTask
 from .ArabicSegmentation import ArabicSegmentationTask
 from .Attentionworthy import AttentionworthyTask
 from .Checkworthiness import CheckworthinessTask
diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py
new file mode 100644
index 00000000..cf8637a7
--- /dev/null
+++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py
@@ -0,0 +1,42 @@
+import os
+
+from arabic_llm_benchmark.datasets import ArabicDiacritizationDataset
+from arabic_llm_benchmark.models import GPTModel
+from arabic_llm_benchmark.tasks import ArabicDiacritizationTask
+
+
+def config():
+    return {
+        "dataset": ArabicDiacritizationDataset,
+        "dataset_args": {},
+        "task": ArabicDiacritizationTask,
+        "task_args": {},
+        "model": GPTModel,
+        "model_args": {
+            "api_type": "azure",
+            "api_version": "2023-03-15-preview",
+            "api_base": os.environ["AZURE_API_URL"],
+            "api_key": os.environ["AZURE_API_KEY"],
+            "engine_name": os.environ["ENGINE_NAME"],
+            "max_tries": 3,
+        },
+        "general_args": {
+            "data_path": "data/sequence_tagging_ner_pos_etc/diacritization/WikiNewsTruth.txt"
+        },
+    }
+
+
+def prompt(input_sample):
+    return {
+        "system_message": "You are an AI assistant that helps people find information.",
+        "messages": [
+            {
+                "sender": "user",
+                "text": f"Diacritize fully the following Arabic sentence: {input_sample}",
+            }
+        ],
+    }
+
+
+def post_process(response):
+    return response["choices"][0]["text"]
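
For reviewers, a minimal sketch (not part of the diff) of the input format `load_data` expects: one tab-separated pair per line, undiacritized text first, diacritized text second. The file path and sentences below are invented, and the no-argument constructor assumes `DatasetBase` has no mandatory kwargs.

```python
from arabic_llm_benchmark.datasets import ArabicDiacritizationDataset

# Hypothetical two-line data file; the benchmark itself points at
# data/sequence_tagging_ner_pos_etc/diacritization/WikiNewsTruth.txt
with open("/tmp/diacritization_sample.txt", "w", encoding="utf-8") as fp:
    fp.write("ذهب الولد\tذَهَبَ الوَلَدُ\n")
    fp.write("كتب\tكَتَبَ\n")

dataset = ArabicDiacritizationDataset()
for sample in dataset.load_data("/tmp/diacritization_sample.txt"):
    print(sample["line_number"], sample["input"], "->", sample["label"])
```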
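
A quick sanity check of the `wer` helper (hypothetical usage): its inputs are token lists, and with one substituted token out of four the non-debug call should report a WER of 0.25.

```python
from arabic_llm_benchmark.tasks.ArabicDiacritization import wer

ref = "الولد ذهب الى المدرسة".split()
hyp = "الولد ذهب إلى المدرسة".split()  # one substitution (الى -> إلى)

print(wer(ref, hyp, False))  # {'WER': 0.25}
# wer(ref, hyp, True) additionally prints the OP/REF/HYP alignment
# and returns the per-operation counts alongside the WER.
```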
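
And a sketch of how `ArabicDiacritizationTask.evaluate` treats a failed generation: a `None` prediction falls back to the undiacritized reference, so each of its words scores as a substitution. The strings are invented, and constructing the task with no arguments assumes `TaskBase` has no mandatory kwargs.

```python
from arabic_llm_benchmark.tasks import ArabicDiacritizationTask

task = ArabicDiacritizationTask()

gold = ["كَتَبَ الوَلَدُ", "ذَهَبَ"]
pred = [None, "ذَهَبَ"]  # first generation failed, second is exact

# Two of the three reference words come from the failed sample and
# mismatch their undiacritized fallback, so WER = round(2/3, 3).
print(task.evaluate(gold, pred))  # {'WER': 0.667}
```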