From 77fdceced9c5238ad1075eba6cf00590350c5829 Mon Sep 17 00:00:00 2001
From: Dhruv Chawla <43818888+Dominastorm@users.noreply.github.com>
Date: Fri, 12 Apr 2024 17:09:38 +0530
Subject: [PATCH] Improve Language critique operator (#683)

* add coherence, grammar and politeness to language critique
* Add removed few shot prompts
* Improve prompts for language critique
* Reorder log_and_evaluate
---
 uptrain/framework/remote.py                   |   2 +-
 .../operators/language/language_quality.py    | 244 ++++++++++++++++--
 uptrain/operators/language/prompts/classic.py |  98 ++++++-
 .../operators/language/prompts/few_shots.py   | 152 ++++++++++-
 .../language/prompts/output_format.py         |  55 +++-
 5 files changed, 505 insertions(+), 46 deletions(-)

diff --git a/uptrain/framework/remote.py b/uptrain/framework/remote.py
index c3f87a64..6b16976a 100644
--- a/uptrain/framework/remote.py
+++ b/uptrain/framework/remote.py
@@ -551,9 +551,9 @@ def perform_root_cause_analysis(

     def log_and_evaluate(
         self,
+        project_name: str,
         data: t.Union[list[dict], pl.DataFrame, pd.DataFrame],
         checks: list[t.Union[str, Evals, ParametricEval]],
-        project_name: str,
         evaluation_name: t.Optional[str] = None,
         scenario_description: t.Optional[str] = None,
         schema: t.Union[DataSchema, dict[str, str], None] = None,
diff --git a/uptrain/operators/language/language_quality.py b/uptrain/operators/language/language_quality.py
index 9135721c..6568fb65 100644
--- a/uptrain/operators/language/language_quality.py
+++ b/uptrain/operators/language/language_quality.py
@@ -7,7 +7,7 @@
 """

 from __future__ import annotations
-import json
+import json5
 import typing as t

 from loguru import logger
@@ -15,21 +15,36 @@
 from uptrain.operators.language.llm import LLMMulticlient
 from uptrain.operators.language.prompts.classic import (
-    LANGUAGE_FLUENCY_PROMPT_TEMPLATE,
     LANGUAGE_COHERENCE_PROMPT_TEMPLATE,
+    LANGUAGE_CRITIQUE_FLUENCY_PROMPT_TEMPLATE,
+    LANGUAGE_CRITIQUE_COHERENCE_PROMPT_TEMPLATE,
+    LANGUAGE_CRITIQUE_GRAMMAR_PROMPT_TEMPLATE,
+    LANGUAGE_CRITIQUE_POLITENESS_PROMPT_TEMPLATE,
 )
 from uptrain.operators.language.prompts.few_shots import (
-    LANGUAGE_FLUENCY_FEW_SHOT__CLASSIFY,
-    LANGUAGE_FLUENCY_FEW_SHOT__COT,
+    LANGUAGE_CRITIQUE_FLUENCY_FEW_SHOT__CLASSIFY,
+    LANGUAGE_CRITIQUE_FLUENCY_FEW_SHOT__COT,
+    LANGUAGE_CRITIQUE_COHERENCE_FEW_SHOT__CLASSIFY,
+    LANGUAGE_CRITIQUE_COHERENCE_FEW_SHOT__COT,
+    LANGUAGE_CRITIQUE_GRAMMAR_FEW_SHOT__CLASSIFY,
+    LANGUAGE_CRITIQUE_GRAMMAR_FEW_SHOT__COT,
+    LANGUAGE_CRITIQUE_POLITENESS_FEW_SHOT__CLASSIFY,
+    LANGUAGE_CRITIQUE_POLITENESS_FEW_SHOT__COT,
     LANGUAGE_COHERENCE_FEW_SHOT__CLASSIFY,
-    LANGUAGE_COHERENCE_FEW_SHOT__COT,
+    LANGUAGE_COHERENCE_FEW_SHOT__COT
 )
 from uptrain.operators.language.prompts.instructions import CHAIN_OF_THOUGHT, CLASSIFY
 from uptrain.operators.language.prompts.output_format import (
-    LANGUAGE_FLUENCY_OUTPUT_FORMAT__CLASSIFY,
-    LANGUAGE_FLUENCY_OUTPUT_FORMAT__COT,
     LANGUAGE_COHERENCE_OUTPUT_FORMAT__CLASSIFY,
     LANGUAGE_COHERENCE_OUTPUT_FORMAT__COT,
+    LANGUAGE_CRITIQUE_FLUENCY_OUTPUT_FORMAT__CLASSIFY,
+    LANGUAGE_CRITIQUE_FLUENCY_OUTPUT_FORMAT__COT,
+    LANGUAGE_CRITIQUE_COHERENCE_OUTPUT_FORMAT__CLASSIFY,
+    LANGUAGE_CRITIQUE_COHERENCE_OUTPUT_FORMAT__COT,
+    LANGUAGE_CRITIQUE_GRAMMAR_OUTPUT_FORMAT__CLASSIFY,
+    LANGUAGE_CRITIQUE_GRAMMAR_OUTPUT_FORMAT__COT,
+    LANGUAGE_CRITIQUE_POLITENESS_OUTPUT_FORMAT__CLASSIFY,
+    LANGUAGE_CRITIQUE_POLITENESS_OUTPUT_FORMAT__COT,
 )
 from uptrain.utilities.prompt_utils import parse_scenario_description
@@ -65,7 +80,7 @@ class LanguageCritique(ColumnOp):
     col_response: str = "response"
     col_out: str = "score_language_critique"
"score_language_critique" scenario_description: t.Optional[str] = None - score_mapping: dict = {"A": 1.0, "B": 0.5, "C": 0.0} + score_mapping: dict = {1: 0.2, 2: 0.4, 3: 0.6, 4: 0.8, 5: 1.0} def setup(self, settings: t.Optional[Settings] = None): from uptrain.framework.remote import APIClient @@ -107,8 +122,8 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT: def critique_language_classify_validate_func(self, llm_output): is_correct = True - is_correct = is_correct and ("Choice" in llm_output) - is_correct = is_correct and llm_output["Choice"] in ["A", "B", "C"] + is_correct = is_correct and ("Score" in llm_output) + is_correct = is_correct and llm_output["Score"] in self.score_mapping.keys() return is_correct def critique_language_cot_validate_func(self, llm_output): @@ -124,15 +139,17 @@ def evaluate_local(self, data): self.scenario_description, scenario_vars = parse_scenario_description( self.scenario_description ) + + # Fluency input_payloads = [] if self.settings.eval_type == "basic": - few_shot_examples = LANGUAGE_FLUENCY_FEW_SHOT__CLASSIFY - output_format = LANGUAGE_FLUENCY_OUTPUT_FORMAT__CLASSIFY + few_shot_examples = LANGUAGE_CRITIQUE_FLUENCY_FEW_SHOT__CLASSIFY + output_format = LANGUAGE_CRITIQUE_FLUENCY_OUTPUT_FORMAT__CLASSIFY validation_func = self.critique_language_classify_validate_func prompting_instructions = CLASSIFY elif self.settings.eval_type == "cot": - few_shot_examples = LANGUAGE_FLUENCY_FEW_SHOT__COT - output_format = LANGUAGE_FLUENCY_OUTPUT_FORMAT__COT + few_shot_examples = LANGUAGE_CRITIQUE_FLUENCY_FEW_SHOT__COT + output_format = LANGUAGE_CRITIQUE_FLUENCY_OUTPUT_FORMAT__COT validation_func = self.critique_language_cot_validate_func prompting_instructions = CHAIN_OF_THOUGHT else: @@ -150,7 +167,7 @@ def evaluate_local(self, data): } ) try: - grading_prompt_template = LANGUAGE_FLUENCY_PROMPT_TEMPLATE.replace( + grading_prompt_template = LANGUAGE_CRITIQUE_FLUENCY_PROMPT_TEMPLATE.replace( "{scenario_description}", self.scenario_description ).format(**kwargs) except KeyError as e: @@ -168,23 +185,206 @@ def evaluate_local(self, data): for res in output_payloads: idx = res.metadata["index"] output = { - "score_critique_language": None, - "explanation_critique_language": None, + "score_fluency": None, + "explanation_fluency": None, } try: score = self.score_mapping[ - json.loads(res.response.choices[0].message.content)["Choice"] + json5.loads(res.response.choices[0].message.content)["Score"] ] - output["score_critique_language"] = float(score) - output["explanation_critique_language"] = res.response.choices[ + output["score_fluency"] = float(score) + output["explanation_fluency"] = json5.loads(res.response.choices[ 0 - ].message.content + ].message.content)["Reasoning"] except Exception: logger.error( f"Error when processing payload at index {idx}: {res.error}" ) results.append((idx, output)) + # Coherence + input_payloads = [] + if self.settings.eval_type == "basic": + few_shot_examples = LANGUAGE_CRITIQUE_COHERENCE_FEW_SHOT__CLASSIFY + output_format = LANGUAGE_CRITIQUE_COHERENCE_OUTPUT_FORMAT__CLASSIFY + validation_func = self.critique_language_classify_validate_func + prompting_instructions = CLASSIFY + elif self.settings.eval_type == "cot": + few_shot_examples = LANGUAGE_CRITIQUE_COHERENCE_FEW_SHOT__COT + output_format = LANGUAGE_CRITIQUE_COHERENCE_OUTPUT_FORMAT__COT + validation_func = self.critique_language_cot_validate_func + prompting_instructions = CHAIN_OF_THOUGHT + else: + raise ValueError( + f"Invalid eval_type: {self.settings.eval_type}. 
Must be either 'basic' or 'cot'" + ) + + for idx, row in enumerate(data): + kwargs = row + kwargs.update( + { + "output_format": output_format, + "prompting_instructions": prompting_instructions, + "few_shot_examples": few_shot_examples, + } + ) + try: + grading_prompt_template = LANGUAGE_CRITIQUE_COHERENCE_PROMPT_TEMPLATE.replace( + "{scenario_description}", self.scenario_description + ).format(**kwargs) + except KeyError as e: + raise KeyError( + f"Missing required attribute(s) for scenario description: {e}" + ) + input_payloads.append( + self._api_client.make_payload(idx, grading_prompt_template) + ) + output_payloads = self._api_client.fetch_responses( + input_payloads, validation_func + ) + + for res in output_payloads: + idx = res.metadata["index"] + output = { + "score_coherence": None, + "explanation_coherence": None, + } + try: + score = self.score_mapping[ + json5.loads(res.response.choices[0].message.content)["Score"] + ] + output["score_coherence"] = float(score) + output["explanation_coherence"] = json5.loads(res.response.choices[ + 0 + ].message.content)["Reasoning"] + except Exception: + logger.error( + f"Error when processing payload at index {idx}: {res.error}" + ) + results[idx][1].update(output) + + # Grammar + input_payloads = [] + if self.settings.eval_type == "basic": + few_shot_examples = LANGUAGE_CRITIQUE_GRAMMAR_FEW_SHOT__CLASSIFY + output_format = LANGUAGE_CRITIQUE_GRAMMAR_OUTPUT_FORMAT__CLASSIFY + validation_func = self.critique_language_classify_validate_func + prompting_instructions = CLASSIFY + elif self.settings.eval_type == "cot": + few_shot_examples = LANGUAGE_CRITIQUE_GRAMMAR_FEW_SHOT__COT + output_format = LANGUAGE_CRITIQUE_GRAMMAR_OUTPUT_FORMAT__COT + validation_func = self.critique_language_cot_validate_func + prompting_instructions = CHAIN_OF_THOUGHT + else: + raise ValueError( + f"Invalid eval_type: {self.settings.eval_type}. 
Must be either 'basic' or 'cot'" + ) + + for idx, row in enumerate(data): + kwargs = row + kwargs.update( + { + "output_format": output_format, + "prompting_instructions": prompting_instructions, + "few_shot_examples": few_shot_examples, + } + ) + try: + grading_prompt_template = LANGUAGE_CRITIQUE_GRAMMAR_PROMPT_TEMPLATE.replace( + "{scenario_description}", self.scenario_description + ).format(**kwargs) + except KeyError as e: + raise KeyError( + f"Missing required attribute(s) for scenario description: {e}" + ) + input_payloads.append( + self._api_client.make_payload(idx, grading_prompt_template) + ) + output_payloads = self._api_client.fetch_responses( + input_payloads, validation_func + ) + + for res in output_payloads: + idx = res.metadata["index"] + output = { + "score_grammar": None, + "explanation_grammar": None, + } + try: + score = self.score_mapping[ + json5.loads(res.response.choices[0].message.content)["Score"] + ] + output["score_grammar"] = float(score) + output["explanation_grammar"] = json5.loads(res.response.choices[ + 0 + ].message.content)["Reasoning"] + except Exception: + logger.error( + f"Error when processing payload at index {idx}: {res.error}" + ) + results[idx][1].update(output) + + # Politeness + input_payloads = [] + if self.settings.eval_type == "basic": + few_shot_examples = LANGUAGE_CRITIQUE_POLITENESS_FEW_SHOT__CLASSIFY + output_format = LANGUAGE_CRITIQUE_POLITENESS_OUTPUT_FORMAT__CLASSIFY + validation_func = self.critique_language_classify_validate_func + prompting_instructions = CLASSIFY + elif self.settings.eval_type == "cot": + few_shot_examples = LANGUAGE_CRITIQUE_POLITENESS_FEW_SHOT__COT + output_format = LANGUAGE_CRITIQUE_POLITENESS_OUTPUT_FORMAT__COT + validation_func = self.critique_language_cot_validate_func + prompting_instructions = CHAIN_OF_THOUGHT + else: + raise ValueError( + f"Invalid eval_type: {self.settings.eval_type}. 
Must be either 'basic' or 'cot'" + ) + + for idx, row in enumerate(data): + kwargs = row + kwargs.update( + { + "output_format": output_format, + "prompting_instructions": prompting_instructions, + "few_shot_examples": few_shot_examples, + } + ) + try: + grading_prompt_template = LANGUAGE_CRITIQUE_POLITENESS_PROMPT_TEMPLATE.replace( + "{scenario_description}", self.scenario_description + ).format(**kwargs) + except KeyError as e: + raise KeyError( + f"Missing required attribute(s) for scenario description: {e}" + ) + input_payloads.append( + self._api_client.make_payload(idx, grading_prompt_template) + ) + output_payloads = self._api_client.fetch_responses( + input_payloads, validation_func + ) + + for res in output_payloads: + idx = res.metadata["index"] + output = { + "score_politeness": None, + "explanation_politeness": None, + } + try: + score = self.score_mapping[ + json5.loads(res.response.choices[0].message.content)["Score"] + ] + output["score_politeness"] = float(score) + output["explanation_politeness"] = json5.loads(res.response.choices[ + 0 + ].message.content)["Reasoning"] + except Exception: + logger.error( + f"Error when processing payload at index {idx}: {res.error}" + ) + results[idx][1].update(output) + results = [val for _, val in sorted(results, key=lambda x: x[0])] return results @@ -321,7 +521,7 @@ def evaluate_local(self, data): } try: score = self.score_mapping[ - json.loads(res.response.choices[0].message.content)["Choice"] + json5.loads(res.response.choices[0].message.content)["Choice"] ] output["score_response_coherence"] = float(score) output["explanation_response_coherence"] = res.response.choices[ diff --git a/uptrain/operators/language/prompts/classic.py b/uptrain/operators/language/prompts/classic.py index 5e991589..15610c33 100644 --- a/uptrain/operators/language/prompts/classic.py +++ b/uptrain/operators/language/prompts/classic.py @@ -321,7 +321,7 @@ """ -# Critique Language Coherence +# Coherence LANGUAGE_COHERENCE_PROMPT_TEMPLATE = """ Please assess the language quality of the provided machine-generated response, and rate how coherent the response is, i.e. if the multiple parts of the response have conflicting information. @@ -344,26 +344,104 @@ [Response]: {response} """ + +# Critique Language Coherence +LANGUAGE_CRITIQUE_COHERENCE_PROMPT_TEMPLATE = """ +You are a detail-oriented LLM which pays close attention to the details. You are given a text and your job is to evaluate the quality of the provided text, focusing on the coherence aspect. + +Coherence is the quality of the text that makes it logical and consistent. It is important that the text is well-organized and the ideas are connected in a clear and meaningful way. A coherent text is easy to follow and understand. + +Please provide a score on the scale of 1-5, with 1 meaning that the text is completely incoherent and the elements in the text do not stitch together to produce meaningful text, and 5 meaning that the text is completely coherent and the elements in the text stitch together to produce meaningful text. + +{scenario_description} + +Example Data. +{few_shot_examples} + +First, analyze the text and determine how fluent and natural sounding it is. Consider the structure, connectivity of ideas, and overall readability of the text. Write down step-by-step reasoning to make sure that your conclusion is correct. + +{prompting_instructions} + +Return the output only in the corresponding JSON format. Do not output anything other than this JSON object: +{output_format} + +Task data. 
+[Response]: {response}
+"""
+
+
 # Critique Language Fluency
-LANGUAGE_FLUENCY_PROMPT_TEMPLATE = """
-Please assess the language quality of the provided machine-generated response, and rate how fluent the response is.
+LANGUAGE_CRITIQUE_FLUENCY_PROMPT_TEMPLATE = """
+You are a detail-oriented LLM which pays close attention to the details. You are given a text and your job is to evaluate the quality of the provided text, focusing on the fluency aspect.
+
+Fluency is the ability of the text to flow smoothly and naturally. It is important that the text is easy to read and understand. A fluent text is well-structured, coherent, and free of awkward phrasing.
+
+Please provide a score on the scale of 1-5, with 1 meaning that the text is not fluent at all and/or has awkward phrasing, and 5 meaning that the text is completely fluent and natural sounding.

 {scenario_description}

 Example Data.
 {few_shot_examples}

-For the given task data, determine which case applies by selecting one of the following options:
-A. The response is highly fluent.
-B. The response is moderately fluent and can be improved.
-C. The response is not fluent at all.
+First, analyze the text and determine how fluent and natural sounding it is. Consider the structure, connectivity of ideas, and overall readability of the text. Write down step-by-step reasoning to make sure that your conclusion is correct.
+
 {prompting_instructions}

 Return the output only in the corresponding JSON format. Do not output anything other than this JSON object:
 {output_format}

-Task Data.
-[Response]: {response}
+Task data.
+[Response]: {response}
 """
+
+
+# Critique Language Grammar
+LANGUAGE_CRITIQUE_GRAMMAR_PROMPT_TEMPLATE = """
+You are a detail-oriented LLM which pays close attention to the details. You are given a text and your job is to evaluate the quality of the provided text, focusing on the grammar aspect.
+
+Grammar is the correctness of the text in terms of spelling, punctuation, and sentence structure. It is important that the text is free of grammatical errors and follows the rules of the language.
+
+Please provide a score on the scale of 1-5, with 1 meaning that the text has many grammatical errors, and 5 meaning that the text has perfect grammar and word choice.
+
+{scenario_description}
+
+Example Data.
+{few_shot_examples}
+
+First, analyze the text and determine the grammar and word usage in the text.
+
+{prompting_instructions}
+
+Return the output only in the corresponding JSON format. Do not output anything other than this JSON object:
+{output_format}
+
+Task data.
+[Response]: {response}
+"""
+
+
+# Critique Language Politeness
+LANGUAGE_CRITIQUE_POLITENESS_PROMPT_TEMPLATE = """
+You are a detail-oriented LLM which pays close attention to the details. You are given a text and your job is to evaluate the quality of the provided text, focusing on the politeness aspect.
+
+Politeness is the tone of the text and how polite or impolite it is. It is important that the text is written in a respectful and appropriate tone.
+
+Please provide a score on the scale of 1-5, with 1 meaning that the tone of the text is very rude or inappropriate, and 5 meaning that the tone of the text is extremely polite.
+
+{scenario_description}
+
+Example Data.
+{few_shot_examples}
+
+First, analyze the text and determine how polite or impolite the tone of the text is.
+
+{prompting_instructions}
+
+Return the output only in the corresponding JSON format. Do not output anything other than this JSON object:
+{output_format}
+
+Task data.
+[Response]: {response}
+"""
@@ -518,4 +596,4 @@
 Task data.
 [Question]: {question}
 [Conversation]: {conversation}
-"""
\ No newline at end of file
+"""
diff --git a/uptrain/operators/language/prompts/few_shots.py b/uptrain/operators/language/prompts/few_shots.py
index 22960aa1..2e1967d4 100644
--- a/uptrain/operators/language/prompts/few_shots.py
+++ b/uptrain/operators/language/prompts/few_shots.py
@@ -370,40 +370,40 @@

 # Critique Language Fluency
-LANGUAGE_FLUENCY_FEW_SHOT__CLASSIFY = """
+LANGUAGE_CRITIQUE_FLUENCY_FEW_SHOT__CLASSIFY = """
 [Response]: Exercise is good health. It makes body strong and helps the mind too. Many benefits gained.
 [Output]:
 {
-    "Choice": "B"
+    "Score": 3
 }

 [Response]: Exercises are very good for your health as they make the body physically strong as well as promote mental well-being.
 [Output]:
 {
-    "Choice": "A"
+    "Score": 5
 }

 [Response]: Exercise good health your. It maken strong strong body, fit, mind and.
 [Output]:
 {
-    "Choice": "C"
+    "Score": 1
 }
 """

-LANGUAGE_FLUENCY_FEW_SHOT__COT = """
+LANGUAGE_CRITIQUE_FLUENCY_FEW_SHOT__COT = """
 [Response]: Exercise is good health. It makes body strong and helps the mind too. Many benefits gained.
 [Output]:
 {
     "Reasoning": "The text is somewhat fluent but lacks variety in sentence structure and uses repetitive language.",
-    "Choice": "B"
+    "Score": 3
 }

 [Response]: Exercises are very good for your health as they make the body physically strong as well as promote mental well-being.
 [Output]:
 {
     "Reasoning": "The text is completely fluent and natural sounding.",
-    "Choice": "A"
+    "Score": 5
 }

@@ -411,12 +411,148 @@
 [Output]:
 {
     "Reasoning": "The text is not fluent at all and has awkward phrasing, making it difficult to understand.",
-    "Choice": "C"
+    "Score": 1
 }
 """


 # Critique Language Coherence
+LANGUAGE_CRITIQUE_COHERENCE_FEW_SHOT__CLASSIFY = """
+[Response]: Exercise is beneficial for both physical and mental health. It strengthens the body and uplifts the mind.
+[Output]:
+{
+    "Score": 5
+}
+
+[Response]: Regular exercise contributes to overall well-being by enhancing physical strength and mental clarity.
+[Output]:
+{
+    "Score": 4
+}
+
+[Response]: Exercise good. Health. Make body strong. Help mind. Benefits many.
+[Output]:
+{
+    "Score": 2
+}
+"""
+
+
+LANGUAGE_CRITIQUE_COHERENCE_FEW_SHOT__COT = """
+[Response]: Exercise is beneficial for both physical and mental health. It strengthens the body and uplifts the mind.
+[Output]:
+{
+    "Reasoning": "The text is coherent and effectively conveys the message with clear organization of ideas.",
+    "Score": 5
+}
+
+[Response]: Regular exercise contributes to overall well-being by enhancing physical strength and mental clarity.
+[Output]:
+{
+    "Reasoning": "The text maintains coherence by linking ideas logically, providing a clear flow of information.",
+    "Score": 4
+}
+
+[Response]: Exercise good. Health. Make body strong. Help mind. Benefits many.
+[Output]:
+{
+    "Reasoning": "The text lacks coherence, as it presents fragmented ideas without clear connections.",
+    "Score": 2
+}
+"""
+
+
+# Critique Language Grammar
+LANGUAGE_CRITIQUE_GRAMMAR_FEW_SHOT__CLASSIFY = """
+[Response]: Exercise is essential for maintaining good health. It strengthens the body and improves mental well-being.
+[Output]:
+{
+    "Score": 5
+}
+
+[Response]: Exercises is important for healthiness. It makes body strong and helps the mind too.
+[Output]:
+{
+    "Score": 3
+}
+
+[Response]: Exercise good for healthy. It make body strong and help mind.
+[Output]:
+{
+    "Score": 2
+}
+"""
+
+LANGUAGE_CRITIQUE_GRAMMAR_FEW_SHOT__COT = """
+[Response]: Exercise is essential for maintaining good health. It strengthens the body and improves mental well-being.
+[Output]:
+{
+    "Reasoning": "The text demonstrates proper grammar usage and sentence structure.",
+    "Score": 5
+}
+
+[Response]: Exercises is important for healthiness. It makes body strong and helps the mind too.
+[Output]:
+{
+    "Reasoning": "The text contains some grammatical errors, such as subject-verb agreement and pluralization.",
+    "Score": 3
+}
+
+[Response]: Exercise good for healthy. It make body strong and help mind.
+[Output]:
+{
+    "Reasoning": "The text has several grammatical errors, such as missing articles and incorrect verb forms.",
+    "Score": 2
+}
+"""
+
+
+# Critique Language Politeness
+LANGUAGE_CRITIQUE_POLITENESS_FEW_SHOT__CLASSIFY = """
+[Response]: Thank you for considering my application. I appreciate the opportunity to interview for the position.
+[Output]:
+{
+    "Score": 5
+}
+
+[Response]: Thanks for considering my application. I appreciate the opportunity to interview for the position.
+[Output]:
+{
+    "Score": 4
+}
+
+[Response]: Consider my application. Interview for position.
+[Output]:
+{
+    "Score": 1
+}
+"""
+
+LANGUAGE_CRITIQUE_POLITENESS_FEW_SHOT__COT = """
+[Response]: Thank you for considering my application. I appreciate the opportunity to interview for the position.
+[Output]:
+{
+    "Reasoning": "The text is very polite and courteous, expressing gratitude and appreciation.",
+    "Score": 5
+}
+
+[Response]: Thanks for considering my application. I appreciate the opportunity to interview for the position.
+[Output]:
+{
+    "Reasoning": "The text is polite, but could be slightly improved with a more formal expression such as 'thank you'.",
+    "Score": 4
+}
+
+[Response]: Consider my application. Interview for position.
+[Output]:
+{
+    "Reasoning": "The text lacks politeness and appears rather abrupt, lacking in courtesy.",
+    "Score": 1
+}
+"""
+
+
+# Response Coherence
 LANGUAGE_COHERENCE_FEW_SHOT__CLASSIFY = """
 [Response]: Exercise is good health. It makes body strong and helps the mind too. Many benefits gained.
 [Output]:
diff --git a/uptrain/operators/language/prompts/output_format.py b/uptrain/operators/language/prompts/output_format.py
index b5f597ff..6b31d0d3 100644
--- a/uptrain/operators/language/prompts/output_format.py
+++ b/uptrain/operators/language/prompts/output_format.py
@@ -214,21 +214,66 @@

 # Critique Language Fluency
-LANGUAGE_FLUENCY_OUTPUT_FORMAT__CLASSIFY = """
+LANGUAGE_CRITIQUE_FLUENCY_OUTPUT_FORMAT__CLASSIFY = """
 {
-    "Choice": [Selected Choice], # Choice selected for the given task data, one of ("A", "B", "C")
+    "Score": [Score], # Score between 1 and 5, to evaluate the fluency of the response,
 }
 """

-LANGUAGE_FLUENCY_OUTPUT_FORMAT__COT = """
+LANGUAGE_CRITIQUE_FLUENCY_OUTPUT_FORMAT__COT = """
 {
     "Reasoning": [Reasoning], # Reasoning to critique the fluency of the response,
-    "Choice": [Selected Choice], # Choice selected for the given task data, one of ("A", "B", "C")
+    "Score": [Score], # Score between 1 and 5, to evaluate the fluency of the response,
 }
 """


 # Critique Language Coherence
+LANGUAGE_CRITIQUE_COHERENCE_OUTPUT_FORMAT__CLASSIFY = """
+{
+    "Score": [Score], # Score between 1 and 5, to evaluate the coherence of the response,
+}
+"""
+
+LANGUAGE_CRITIQUE_COHERENCE_OUTPUT_FORMAT__COT = """
+{
+    "Reasoning": [Reasoning], # Reasoning to critique the coherence of the response,
+    "Score": [Score], # Score between 1 and 5, to evaluate the coherence of the response,
+}
+"""
+
+
+# Critique Language Grammar
+LANGUAGE_CRITIQUE_GRAMMAR_OUTPUT_FORMAT__CLASSIFY = """
+{
+    "Score": [Score], # Score between 1 and 5, to evaluate the grammar of the response,
+}
+"""
+
+LANGUAGE_CRITIQUE_GRAMMAR_OUTPUT_FORMAT__COT = """
+{
+    "Reasoning": [Reasoning], # Reasoning to critique the grammar of the response,
+    "Score": [Score], # Score between 1 and 5, to evaluate the grammar of the response,
+}
+"""
+
+
+# Critique Language Politeness
+LANGUAGE_CRITIQUE_POLITENESS_OUTPUT_FORMAT__CLASSIFY = """
+{
+    "Score": [Score], # Score between 1 and 5, to evaluate the politeness of the response,
+}
+"""
+
+LANGUAGE_CRITIQUE_POLITENESS_OUTPUT_FORMAT__COT = """
+{
+    "Reasoning": [Reasoning], # Reasoning to critique the politeness of the response,
+    "Score": [Score], # Score between 1 and 5, to evaluate the politeness of the response,
+}
+"""
+
+
+# Coherence
 LANGUAGE_COHERENCE_OUTPUT_FORMAT__CLASSIFY = """
 {
     "Choice": [Selected Choice], # Choice selected for the given task data, one of ("A", "B", "C")
 }
@@ -349,4 +394,4 @@
 {
     "Question": [Rewritten Question],
 }
-"""
\ No newline at end of file
+"""
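
Usage sketch (illustrative only, not part of the patch): after this change, project_name becomes the first positional argument of log_and_evaluate, and the reworked LanguageCritique operator emits per-aspect columns (score_fluency, score_coherence, score_grammar, score_politeness) with matching explanation_* fields. The snippet below relies only on the signature shown in the remote.py hunk above; the Settings/APIClient construction, the access-token argument, and the Evals.CRITIQUE_LANGUAGE check name are assumptions about the surrounding library and may need adjusting.

    # sketch.py -- hypothetical caller, not included in this patch
    from uptrain import Evals, Settings          # assumed top-level exports
    from uptrain.framework.remote import APIClient

    settings = Settings(uptrain_access_token="sk-...")   # assumed constructor argument
    client = APIClient(settings)

    data = [
        {
            "question": "What are the benefits of exercise?",
            "response": "Exercise strengthens the body and improves mental well-being.",
        },
    ]

    # project_name now comes before data and checks (see the remote.py hunk above).
    results = client.log_and_evaluate(
        "language-critique-demo",          # project_name
        data,
        [Evals.CRITIQUE_LANGUAGE],         # assumed check name for this operator
        evaluation_name="critique-smoke-test",
    )

    # Each evaluated row is expected to carry score_fluency, score_coherence,
    # score_grammar and score_politeness plus the matching explanation_* fields.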