From e9970df690b668ca6324128a5bce0da141649f79 Mon Sep 17 00:00:00 2001
From: Ahmed Abdelali
Date: Tue, 13 Jun 2023 17:37:15 +0300
Subject: [PATCH 1/5] Add diacritization module

---
 .../datasets/ArabicDiacritization.py          |  47 ++++++
 arabic_llm_benchmark/datasets/__init__.py     |   1 +
 .../tasks/ArabicDiacritization.py             | 136 ++++++++++++++++++
 arabic_llm_benchmark/tasks/__init__.py        |   1 +
 .../diacritization_ChatGPT_ZeroShot.py        |  43 ++++++
 5 files changed, 228 insertions(+)
 create mode 100644 arabic_llm_benchmark/datasets/ArabicDiacritization.py
 create mode 100644 arabic_llm_benchmark/tasks/ArabicDiacritization.py
 create mode 100644 assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py

diff --git a/arabic_llm_benchmark/datasets/ArabicDiacritization.py b/arabic_llm_benchmark/datasets/ArabicDiacritization.py
new file mode 100644
index 00000000..9065d13e
--- /dev/null
+++ b/arabic_llm_benchmark/datasets/ArabicDiacritization.py
@@ -0,0 +1,47 @@
+from arabic_llm_benchmark.datasets.dataset_base import DatasetBase
+
+
+class ArabicDiacritizationDataset(DatasetBase):
+    def __init__(self, **kwargs):
+        super(ArabicDiacritizationDataset, self).__init__(**kwargs)
+
+    def citation(self):
+        return """@article{10.1145/3434235,
+            author = {Darwish, Kareem and Abdelali, Ahmed and Mubarak, Hamdy and Eldesouki, Mohamed},
+            title = {Arabic Diacritic Recovery Using a Feature-Rich BiLSTM Model},
+            year = {2021},
+            issue_date = {March 2021},
+            publisher = {Association for Computing Machinery},
+            address = {New York, NY, USA},
+            volume = {20},
+            number = {2},
+            issn = {2375-4699},
+            url = {https://doi.org/10.1145/3434235},
+            doi = {10.1145/3434235},
+            journal = {ACM Trans. Asian Low-Resour. Lang. Inf. Process.},
+            month = {apr},
+            articleno = {33},
+            numpages = {18},
+        }"""
+
+    def get_data_sample(self):
+        return {
+            "input": "Original sentence",
+            "label": "Sentence with segmented words",
+        }
+
+    def load_data(self, data_path, no_labels=False):
+        # TODO: modify to iterator
+        data = []
+
+        with open(data_path, "r") as fp:
+            for line_idx, line in enumerate(fp):
+                data.append(
+                    {
+                        "input": line.split("\t")[0],
+                        "label": line.split("\t")[1],
+                        "line_number": line_idx,
+                    }
+                )
+
+        return data
diff --git a/arabic_llm_benchmark/datasets/__init__.py b/arabic_llm_benchmark/datasets/__init__.py
index cebc7a5b..d5cf8fcd 100644
--- a/arabic_llm_benchmark/datasets/__init__.py
+++ b/arabic_llm_benchmark/datasets/__init__.py
@@ -3,6 +3,7 @@
 from .Aqmar import AqmarDataset
 from .AraBench import AraBenchDataset
 from .ArabGend import ArabGendDataset
+from .ArabicDiacritization import ArabicDiacritizationDataset
 from .ArabicSegmentation import ArabicSegmentationDataset
 from .ArapTweet import ArapTweetDataset
 from .ARCD import ARCDDataset
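Note: load_data() above expects one example per line, with the undiacritized sentence and its diacritized reference separated by a tab. A minimal self-contained sketch of that format and of the tab-split-and-strip parsing this series converges on in PATCH 5 (the sentence pair is illustrative, not taken from the WikiNews data):

import io

# One "undiacritized<TAB>diacritized" pair per line; the sentences are made up.
toy_file = io.StringIO("ذهب الولد\tذَهَبَ الوَلَدُ\n")

data = []
for line_idx, line in enumerate(toy_file):
    text, diacritized_text = line.rstrip("\n").split("\t")
    data.append({"input": text, "label": diacritized_text, "line_number": line_idx})

print(data[0]["input"])  # ذهب الولد
print(data[0]["label"])  # ذَهَبَ الوَلَدُ
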
diff --git a/arabic_llm_benchmark/tasks/ArabicDiacritization.py b/arabic_llm_benchmark/tasks/ArabicDiacritization.py
new file mode 100644
index 00000000..1eed7190
--- /dev/null
+++ b/arabic_llm_benchmark/tasks/ArabicDiacritization.py
@@ -0,0 +1,136 @@
+import re
+
+from sklearn.metrics import f1_score
+
+from arabic_llm_benchmark.tasks.task_base import TaskBase
+
+
+#
+# repo: https://pyzone.dev/word-error-rate-in-python
+#
+def wer(ref, hyp, debug=True):
+    r = ref
+    h = hyp
+    # costs will hold the costs, as in the Levenshtein distance algorithm
+    costs = [[0 for inner in range(len(h) + 1)] for outer in range(len(r) + 1)]
+    # backtrace will hold the operations we've done,
+    # so we can later backtrace, as the WER algorithm requires.
+    backtrace = [[0 for inner in range(len(h) + 1)] for outer in range(len(r) + 1)]
+
+    OP_OK = 0
+    OP_SUB = 1
+    OP_INS = 2
+    OP_DEL = 3
+    DEL_PENALTY = 1
+    INS_PENALTY = 1
+    SUB_PENALTY = 1
+
+    # First column represents the case where we achieve zero
+    # hypothesis words by deleting all reference words.
+    for i in range(1, len(r) + 1):
+        costs[i][0] = DEL_PENALTY * i
+        backtrace[i][0] = OP_DEL
+
+    # First row represents the case where we achieve the hypothesis
+    # by inserting all hypothesis words into a zero-length reference.
+    for j in range(1, len(h) + 1):
+        costs[0][j] = INS_PENALTY * j
+        backtrace[0][j] = OP_INS
+
+    # computation
+    for i in range(1, len(r) + 1):
+        for j in range(1, len(h) + 1):
+            if r[i - 1] == h[j - 1]:
+                costs[i][j] = costs[i - 1][j - 1]
+                backtrace[i][j] = OP_OK
+            else:
+                substitutionCost = (
+                    costs[i - 1][j - 1] + SUB_PENALTY
+                )  # penalty is always 1
+                insertionCost = costs[i][j - 1] + INS_PENALTY  # penalty is always 1
+                deletionCost = costs[i - 1][j] + DEL_PENALTY  # penalty is always 1
+
+                costs[i][j] = min(substitutionCost, insertionCost, deletionCost)
+                if costs[i][j] == substitutionCost:
+                    backtrace[i][j] = OP_SUB
+                elif costs[i][j] == insertionCost:
+                    backtrace[i][j] = OP_INS
+                else:
+                    backtrace[i][j] = OP_DEL
+
+    # back trace through the best route:
+    i = len(r)
+    j = len(h)
+    numSub = 0
+    numDel = 0
+    numIns = 0
+    numCor = 0
+    if debug:
+        print("OP\tREF\tHYP")
+        lines = []
+    while i > 0 or j > 0:
+        if backtrace[i][j] == OP_OK:
+            numCor += 1
+            i -= 1
+            j -= 1
+            if debug:
+                lines.append("OK\t" + r[i] + "\t" + h[j])
+        elif backtrace[i][j] == OP_SUB:
+            numSub += 1
+            i -= 1
+            j -= 1
+            if debug:
+                lines.append("SUB\t" + r[i] + "\t" + h[j])
+        elif backtrace[i][j] == OP_INS:
+            numIns += 1
+            j -= 1
+            if debug:
+                lines.append("INS\t" + "****" + "\t" + h[j])
+        elif backtrace[i][j] == OP_DEL:
+            numDel += 1
+            i -= 1
+            if debug:
+                lines.append("DEL\t" + r[i] + "\t" + "****")
+    if debug:
+        lines = reversed(lines)
+        for line in lines:
+            print(line)
+        print("#cor " + str(numCor))
+        print("#sub " + str(numSub))
+        print("#del " + str(numDel))
+        print("#ins " + str(numIns))
+    # return (numSub + numDel + numIns) / (float) (len(r))
+    wer_result = round((numSub + numDel + numIns) / float(len(r)), 3)
+    if debug:
+        return {
+            "WER": wer_result,
+            "numCor": numCor,
+            "numSub": numSub,
+            "numIns": numIns,
+            "numDel": numDel,
+            "numCount": len(r),
+        }
+    else:
+        return {"WER": wer_result}
+
+
+class ArabicDiacritizationTask(TaskBase):
+    def __init__(self, **kwargs):
+        super(ArabicDiacritizationTask, self).__init__(**kwargs)
+
+    def evaluate(self, true_labels, predicted_labels):
+        # split sentence into words
+        hyp = []
+        ref = []
+        for t, p in zip(true_labels, predicted_labels):
+            t = t.split()
+            if p is None:
+                p = ["UNK"] * len(t)
+            else:
+                p = p.split()
+            if len(p) < len(t):
+                for i in range(len(t) - len(p)):
+                    hyp.append("")
+            hyp += p[: len(t)]
+            ref += t
+        return wer(ref, hyp, False)
diff --git a/arabic_llm_benchmark/tasks/__init__.py b/arabic_llm_benchmark/tasks/__init__.py
index 0010153d..4cabd0d1 100644
--- a/arabic_llm_benchmark/tasks/__init__.py
+++ b/arabic_llm_benchmark/tasks/__init__.py
@@ -1,4 +1,5 @@
 from .Adult import AdultTask
+from .ArabicDiacritization import ArabicDiacritizationTask
 from .ArabicSegmentation import ArabicSegmentationTask
 from .Attentionworthy import AttentionworthyTask
 from .Checkworthiness import CheckworthinessTask
diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py
new file mode 100644
index 00000000..d1ba05d9
--- /dev/null
+++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py
@@ -0,0 +1,43 @@
+import os
+
+from arabic_llm_benchmark.datasets import ArabicDiacritizationDataset
+from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
+from arabic_llm_benchmark.tasks import ArabicDiacritizationTask
+
+
+def config():
+    return {
+        "dataset": ArabicDiacritizationDataset,
+        "dataset_args": {},
+        "task": ArabicDiacritizationTask,
+        "task_args": {},
+        "model": GPTModel,
+        "model_args": {
+            "api_type": "azure",
+            "api_version": "2023-03-15-preview",
+            "api_base": os.environ["AZURE_API_URL"],
+            "api_key": os.environ["AZURE_API_KEY"],
+            "engine_name": "gpt",
+            # "class_labels": ["m", "f"],
+            "max_tries": 3,
+        },
+        "general_args": {
+            "data_path": "data/sequence_tagging_ner_pos_etc/diacritization/WikiNewsTruth.txt"
+        },
+    }
+
+
+def prompt(input_sample):
+    return {
+        "system_message": "You are an AI assistant that helps people find information.",
+        "messages": [
+            {
+                "sender": "user",
+                "text": f"Diacritize fully the following Arabic sentence: {input_sample}",
+            }
+        ],
+    }
+
+
+def post_process(response):
+    return response["choices"][0]["text"]
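Note: the wer() helper above computes word-level Levenshtein alignment counts over the flattened token lists that evaluate() builds. A quick sanity check on toy tokens (a sketch; assumes the repository is on sys.path so the module added above is importable):

from arabic_llm_benchmark.tasks.ArabicDiacritization import wer

ref = ["a", "b", "c", "d"]  # reference tokens
hyp = ["a", "x", "c", "d"]  # hypothesis with one substitution
print(wer(ref, hyp, False))  # {'WER': 0.25}, i.e. 1 error over 4 reference tokens
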
From 825eefc45c7ba3acf36fa2ae010d510d5fd3d424 Mon Sep 17 00:00:00 2001
From: Ahmed Abdelali
Date: Mon, 19 Jun 2023 16:54:45 +0300
Subject: [PATCH 2/5] Update ArabicDiacritization.py

Use undiacritized tokens as fallback for None results.
---
 arabic_llm_benchmark/tasks/ArabicDiacritization.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arabic_llm_benchmark/tasks/ArabicDiacritization.py b/arabic_llm_benchmark/tasks/ArabicDiacritization.py
index 1eed7190..a001ecac 100644
--- a/arabic_llm_benchmark/tasks/ArabicDiacritization.py
+++ b/arabic_llm_benchmark/tasks/ArabicDiacritization.py
@@ -123,11 +123,11 @@ def evaluate(self, true_labels, predicted_labels):
         hyp = []
         ref = []
         for t, p in zip(true_labels, predicted_labels):
-            t = t.split()
             if p is None:
-                p = ["UNK"] * len(t)
+                p = re.sub(r'[ًٌٍَُِّْ]','',t).split()
             else:
                 p = p.split()
+            t = t.split()
             if len(p) < len(t):
                 for i in range(len(t) - len(p)):
                     hyp.append("")

From 05d0629c36f8899f0589fd94f5304b58144c0b1b Mon Sep 17 00:00:00 2001
From: Fahim Imaduddin Dalvi
Date: Wed, 21 Jun 2023 10:07:49 +0300
Subject: [PATCH 3/5] Format code

---
 arabic_llm_benchmark/tasks/ArabicDiacritization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arabic_llm_benchmark/tasks/ArabicDiacritization.py b/arabic_llm_benchmark/tasks/ArabicDiacritization.py
index a001ecac..3c69d4a7 100644
--- a/arabic_llm_benchmark/tasks/ArabicDiacritization.py
+++ b/arabic_llm_benchmark/tasks/ArabicDiacritization.py
@@ -124,7 +124,7 @@ def evaluate(self, true_labels, predicted_labels):
         ref = []
         for t, p in zip(true_labels, predicted_labels):
             if p is None:
-                p = re.sub(r'[ًٌٍَُِّْ]','',t).split()
+                p = re.sub(r"[ًٌٍَُِّْ]", "", t).split()
             else:
                 p = p.split()
             t = t.split()
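Note: the fallback introduced in PATCH 2 scores a failed prediction as if the model had returned the input unchanged, by stripping all diacritics from the gold sentence. The literal character class in the patch covers the eight Arabic diacritic marks (fathatan through sukun), which are contiguous in Unicode, so an equivalent and more readable spelling uses an escape range (a sketch; the example phrase is illustrative):

import re

# U+064B..U+0652 spans the tanween marks, the short vowels, shadda and sukun,
# i.e. the same eight marks as the literal character class in the patch.
DIACRITICS = re.compile(r"[\u064B-\u0652]")

print(DIACRITICS.sub("", "ذَهَبَ الوَلَدُ"))  # ذهب الولد
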
From d1849e76e419058b87196472f8f2c9025b52d685 Mon Sep 17 00:00:00 2001
From: Fahim Imaduddin Dalvi
Date: Wed, 21 Jun 2023 10:12:41 +0300
Subject: [PATCH 4/5] Add comments and minor fixes

---
 arabic_llm_benchmark/datasets/ArabicDiacritization.py | 3 +--
 arabic_llm_benchmark/tasks/ArabicDiacritization.py    | 9 ++++++++-
 .../diacritization_ChatGPT_ZeroShot.py                | 5 ++---
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/arabic_llm_benchmark/datasets/ArabicDiacritization.py b/arabic_llm_benchmark/datasets/ArabicDiacritization.py
index 9065d13e..6f0b05fd 100644
--- a/arabic_llm_benchmark/datasets/ArabicDiacritization.py
+++ b/arabic_llm_benchmark/datasets/ArabicDiacritization.py
@@ -27,11 +27,10 @@ def citation(self):
     def get_data_sample(self):
         return {
             "input": "Original sentence",
-            "label": "Sentence with segmented words",
+            "label": "Sentence with diacritized words",
         }
 
     def load_data(self, data_path, no_labels=False):
-        # TODO: modify to iterator
         data = []
 
         with open(data_path, "r") as fp:
diff --git a/arabic_llm_benchmark/tasks/ArabicDiacritization.py b/arabic_llm_benchmark/tasks/ArabicDiacritization.py
index 3c69d4a7..5ff83594 100644
--- a/arabic_llm_benchmark/tasks/ArabicDiacritization.py
+++ b/arabic_llm_benchmark/tasks/ArabicDiacritization.py
@@ -119,18 +119,25 @@ def __init__(self, **kwargs):
         super(ArabicDiacritizationTask, self).__init__(**kwargs)
 
     def evaluate(self, true_labels, predicted_labels):
-        # split sentence into words
+        # Flatten sentences into a long list of words
        hyp = []
         ref = []
         for t, p in zip(true_labels, predicted_labels):
             if p is None:
+                # Use undiacritized word in case of prediction failure
                 p = re.sub(r"[ًٌٍَُِّْ]", "", t).split()
             else:
                 p = p.split()
+
             t = t.split()
+
+            # If prediction is missing tokens, pad with empty tokens
             if len(p) < len(t):
                 for i in range(len(t) - len(p)):
                     hyp.append("")
+
+            # If prediction has extra tokens, only consider the first
+            # N tokens, where N == number of gold tokens
             hyp += p[: len(t)]
             ref += t
         return wer(ref, hyp, False)
diff --git a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py
index d1ba05d9..cf8637a7 100644
--- a/assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py
+++ b/assets/benchmark_v1/sequence_tagging_ner_pos_etc/diacritization_ChatGPT_ZeroShot.py
@@ -1,7 +1,7 @@
 import os
 
 from arabic_llm_benchmark.datasets import ArabicDiacritizationDataset
-from arabic_llm_benchmark.models import GPTModel, RandomGPTModel
+from arabic_llm_benchmark.models import GPTModel
 from arabic_llm_benchmark.tasks import ArabicDiacritizationTask
 
 
@@ -17,8 +17,7 @@ def config():
             "api_version": "2023-03-15-preview",
             "api_base": os.environ["AZURE_API_URL"],
             "api_key": os.environ["AZURE_API_KEY"],
-            "engine_name": "gpt",
-            # "class_labels": ["m", "f"],
+            "engine_name": os.environ["ENGINE_NAME"],
             "max_tries": 3,
         },
         "general_args": {

From ffe3c615e16c4b0623ff3174b675f466e50c6696 Mon Sep 17 00:00:00 2001
From: Fahim Imaduddin Dalvi
Date: Wed, 21 Jun 2023 10:51:13 +0300
Subject: [PATCH 5/5] More fixes to dataloader

---
 arabic_llm_benchmark/datasets/ArabicDiacritization.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arabic_llm_benchmark/datasets/ArabicDiacritization.py b/arabic_llm_benchmark/datasets/ArabicDiacritization.py
index 6f0b05fd..2d43b14c 100644
--- a/arabic_llm_benchmark/datasets/ArabicDiacritization.py
+++ b/arabic_llm_benchmark/datasets/ArabicDiacritization.py
@@ -35,10 +35,11 @@ def load_data(self, data_path, no_labels=False):
 
         with open(data_path, "r") as fp:
             for line_idx, line in enumerate(fp):
+                text, diacritized_text = line.split("\t")
                 data.append(
                     {
-                        "input": line.split("\t")[0],
-                        "label": line.split("\t")[1],
+                        "input": text.strip(),
+                        "label": diacritized_text.strip(),
                         "line_number": line_idx,
                     }
                 )
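Note: for reference, this is how the asset's prompt() and post_process() hooks fit together for a single sample. The direct import and the mocked completion-style response below are illustrative assumptions, not the benchmark's actual driver code:

import diacritization_ChatGPT_ZeroShot as asset  # assumes the asset file is on sys.path

# Build the zero-shot request for one undiacritized sentence.
request = asset.prompt("ذهب الولد")
print(request["messages"][0]["text"])
# Diacritize fully the following Arabic sentence: ذهب الولد

# post_process() pulls the completion text out of the response dict;
# the response here is mocked rather than coming from the Azure API.
mock_response = {"choices": [{"text": "ذَهَبَ الوَلَدُ"}]}
print(asset.post_process(mock_response))  # ذَهَبَ الوَلَدُ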