diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ENZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ENZeroShot.py
new file mode 100644
index 00000000..c3add3ff
--- /dev/null
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_BLOOMZ_ENZeroShot.py
@@ -0,0 +1,52 @@
+import os
+
+from arabic_llm_benchmark.datasets import CheckworthinessDataset
+from arabic_llm_benchmark.models import BLOOMPetalModel
+from arabic_llm_benchmark.tasks import CheckworthinessTask
+
+
+def config():
+    return {
+        "dataset": CheckworthinessDataset,
+        "dataset_args": {},
+        "task": CheckworthinessTask,
+        "task_args": {},
+        "model": BLOOMPetalModel,
+        "model_args": {
+            "api_url": os.environ["API_URL"],
+            "class_labels": ["0", "1"],
+            "max_tries": 3,
+        },
+        "general_args": {
+            "data_path": "data/factuality_disinformation_harmful_content/checkworthyness/english/CT22_english_1A_checkworthy_test_gold.tsv"
+        },
+    }
+
+
+def prompt(input_sample):
+    return {
+        "prompt": "Classify the tweet as checkworthy or not checkworthy. Provide only label.\n\n"
+        + "tweet: "
+        + input_sample
+        + "\nlabel: \n"
+    }
+
+
+def post_process(response):
+    label = response["outputs"].strip().lower()
+    # NOTE(review): the original called label.replace("", "") twice — a no-op; the
+    # token strings were likely lost in transit (e.g. "<s>"/"</s>"); confirm and restore.
+
+    label_fixed = None
+
+    if label == "checkworthy":
+        label_fixed = "1"
+    elif (
+        label == "not_checkworthy."
+        or label == "not_checkworthy"
+        or label == "not checkworthy"
+        or label == "no"
+    ):
+        label_fixed = "0"
+
+    return label_fixed
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENFewShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENFewShot.py
new file mode 100644
index 00000000..57c61e5a
--- /dev/null
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENFewShot.py
@@ -0,0 +1,93 @@
+import os
+import re
+
+from arabic_llm_benchmark.datasets import CheckworthinessDataset
+from arabic_llm_benchmark.models import GPTChatCompletionModel
+from arabic_llm_benchmark.tasks import CheckworthinessTask
+
+
+def config():
+    return {
+        "dataset": CheckworthinessDataset,
+        "dataset_args": {},
+        "task": CheckworthinessTask,
+        "task_args": {},
+        "model": GPTChatCompletionModel,
+        "model_args": {
+            "api_type": "azure",
+            "api_version": "2023-03-15-preview",
+            "api_base": os.environ["AZURE_API_URL"],
+            "api_key": os.environ["AZURE_API_KEY"],
+            "engine_name": os.environ["ENGINE_NAME"],
+            "class_labels": ["0", "1"],
+            "max_tries": 30,
+        },
+        "general_args": {
+            "data_path": "data/factuality_disinformation_harmful_content/checkworthyness/english/CT22_english_1A_checkworthy_test_gold.tsv",
+            "fewshot": {
+                "train_data_path": "data/factuality_disinformation_harmful_content/checkworthyness/english/CT22_english_1A_checkworthy_train.tsv",
+            },
+        },
+    }
+
+
+def few_shot_prompt(input_sample, base_prompt, examples):
+    out_prompt = base_prompt + "\n"
+    out_prompt = out_prompt + "Here are some examples:\n\n"
+    for index, example in enumerate(examples):
+        label = "no" if example["label"] == "0" else "yes"
+
+        out_prompt = (
+            out_prompt
+            + "Example "
+            + str(example["input_id"])
+            + ":"
+            + "\n"
+            + "tweet: "
+            + example["input"]
+            + "\nlabel: "
+            + label
+            + "\n\n"
+        )
+
+    # Append the sentence we want the model to predict for but leave the Label blank
+    out_prompt = out_prompt + "tweet: " + input_sample + "\nlabel: \n"
+
+    return out_prompt
+
+
+def prompt(input_sample, examples):
+    base_prompt = 'Annotate the "tweet" into "one" of the following categories: checkworthy or not_checkworthy. Provide only label.'
+    return [
+        {
+            "role": "system",
+            "content": "As an AI system, your role is to analyze tweets and classify them as 'checkworthy' or 'not_checkworthy' based on their potential importance for journalists and fact-checkers.",
+        },
+        {
+            "role": "user",
+            "content": few_shot_prompt(input_sample, base_prompt, examples),
+        },
+    ]
+
+
+def post_process(response):
+    label = response["choices"][0]["message"]["content"]
+
+    label = label.replace("label:", "").strip()
+
+    if "label: " in label:
+        arr = label.split("label: ")
+        label = arr[1].strip()
+
+    if label == "checkworthy" or label == "Checkworthy":
+        label_fixed = "1"
+    elif label == "Not_checkworthy." or label == "not_checkworthy":
+        label_fixed = "0"
+    elif "not_checkworthy" in label or "label: not_checkworthy" in label:
+        label_fixed = "0"
+    elif "checkworthy" in label or "label: checkworthy" in label:
+        label_fixed = "1"
+    else:
+        label_fixed = None
+
+    return label_fixed
diff --git a/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENZeroShot.py b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENZeroShot.py
new file mode 100644
index 00000000..75800b2e
--- /dev/null
+++ b/assets/benchmark_v1/factuality_disinformation_harmful_content/Checkworthiness_GPTChatCompletion_ENZeroShot.py
@@ -0,0 +1,69 @@
+import os
+import re
+
+from arabic_llm_benchmark.datasets import CheckworthinessDataset
+from arabic_llm_benchmark.models import GPTChatCompletionModel
+from arabic_llm_benchmark.tasks import CheckworthinessTask
+
+
+def config():
+    return {
+        "dataset": CheckworthinessDataset,
+        "dataset_args": {},
+        "task": CheckworthinessTask,
+        "task_args": {},
+        "model": GPTChatCompletionModel,
+        "model_args": {
+            "api_type": "azure",
+            "api_version": "2023-03-15-preview",
+            "api_base": os.environ["AZURE_API_URL"],
+            "api_key": os.environ["AZURE_API_KEY"],
+            "engine_name": os.environ["ENGINE_NAME"],
+            "class_labels": ["0", "1"],
+            "max_tries": 30,
+        },
+        "general_args": {
+            "data_path": "data/factuality_disinformation_harmful_content/checkworthyness/english/CT22_english_1A_checkworthy_test_gold.tsv"
+        },
+    }
+
+
+def prompt(input_sample):
+    prompt_string = (
+        f'Annotate the "tweet" into "one" of the following categories: checkworthy or not_checkworthy\n\n'
+        f"tweet: {input_sample}\n"
+        f"label: \n"
+    )
+    return [
+        {
+            "role": "system",
+            "content": "As an AI system, your role is to analyze tweets and classify them as 'checkworthy' or 'not_checkworthy' based on their potential importance for journalists and fact-checkers.",
+        },
+        {
+            "role": "user",
+            "content": prompt_string,
+        },
+    ]
+
+
+def post_process(response):
+    label = response["choices"][0]["message"]["content"]
+
+    label = label.replace("label:", "").strip()
+
+    if "label: " in label:
+        arr = label.split("label: ")
+        label = arr[1].strip()
+
+    if label == "checkworthy" or label == "Checkworthy":
+        label_fixed = "1"
+    elif label == "Not_checkworthy." or label == "not_checkworthy":
+        label_fixed = "0"
+    elif "not_checkworthy" in label or "label: not_checkworthy" in label:
+        label_fixed = "0"
+    elif "checkworthy" in label or "label: checkworthy" in label:
+        label_fixed = "1"
+    else:
+        label_fixed = None
+
+    return label_fixed