From 6d056250e8a5599bf47aa8336ddfc705b5e3c3dd Mon Sep 17 00:00:00 2001
From: Oindrilla Chatterjee
Date: Fri, 26 Jan 2024 00:40:38 +0530
Subject: [PATCH] added custom criteria (#30)

---
 app/app.py           | 69 +++++++++++++++++++++-----------------------
 app/requirements.txt |  3 +-
 app/utils.py         | 47 ++++++++++++++++++++++++++++--
 3 files changed, 79 insertions(+), 40 deletions(-)

diff --git a/app/app.py b/app/app.py
index 3b7225e..d219f88 100644
--- a/app/app.py
+++ b/app/app.py
@@ -5,6 +5,7 @@
     generate_text_using_OpenAI,
     eval_using_model,
     indicate_key_presence,
+    eval_using_langchain,
 )
 from feedback import store_feedback
 import os
@@ -136,7 +137,8 @@ def OPENAI_API_KEY() -> str:
 
 instruction = st.text_area(
     "Instruction",
-    """You are an AI system specialized at generating API documentation for the provided Python code. You will be provided functions, classes, or Python scripts. Your documentation should include:
+    """
+You are an AI system specialized at generating API documentation for the provided Python code. You will be provided functions, classes, or Python scripts. Your documentation should include:
 
 1. Introduction: Briefly describe the purpose of the API and its intended use.
 2. Functions: Document each API function, including:
@@ -222,7 +224,6 @@ def main(prompt_success: bool, prompt_diff: int, actual_doc: str):
             top_p,
             GENAI_KEY(),
         )
-
     col1, col2, col3 = st.columns([1.5, 1.5, 0.5])
 
     with col1:
@@ -243,22 +244,38 @@ def main(prompt_success: bool, prompt_diff: int, actual_doc: str):
     with col3:
         st.subheader("Evaluation Metrics")
 
-        # rouge score addition
-        scorer = rouge_scorer.RougeScorer(
-            ["rouge1", "rouge2", "rougeL"], use_stemmer=True
+        st.markdown(
+            "**GenAI evaluation on Overall Quality:**",
+            help="Use OpenAI GPT 3 to evaluate the result of the generated API doc",
+        )
+
+        score = eval_using_model(result, openai_key=OPENAI_API_KEY())
+        st.write(score)
+
+        st.markdown(
+            "**LangChain evaluation on grammar, descriptiveness and helpfulness:**",
+            help="Use LangChain to evaluate on custom criteria (this list can be updated based on what we are looking to see from the generated docs)"
         )
-        rouge_scores = scorer.score(actual_doc, result)
+
+        lc_score = eval_using_langchain(prompt, result)
 
         st.markdown(
-            f"ROUGE-1 Score:{rouge_scores['rouge1'].fmeasure:.2f}",
-            help="ROUGE-1 refers to the overlap of unigrams (each word) between the system and reference summaries",
+            f"Grammatical: {lc_score[0]['score']}",
+            help="Checks if the output is grammatically correct. Binary integer 0 to 1, where 1 means the output is grammatically accurate and 0 means it is not",
        )
+
         st.markdown(
-            f"ROUGE-2 Score: {rouge_scores['rouge2'].fmeasure:.2f}",
-            help="ROUGE-2 refers to the overlap of bigrams between the system and reference summaries",
+            f"Descriptiveness: {lc_score[1]['score']}",
+            help="Checks if the output is descriptive. Binary integer 0 to 1, where 1 means the output is descriptive and 0 means it is not",
         )
+
+        st.markdown(
+            f"Helpfulness: {lc_score[2]['score']}",
+            help="Checks if the output is helpful for the end user. Binary integer 0 to 1, where 1 means the output is helpful and 0 means it is not"
+        )
+
         st.markdown(
-            f"ROUGE-L Score: {rouge_scores['rougeL'].fmeasure:.2f}",
-            help="Longest common subsequence problem takes into account sentence-level structure similarity naturally and identifies longest co-occurring in sequence n-grams automatically",
+            "**Consistency:**",
+            help="Evaluate how similar or divergent the generated document is to the actual documentation",
         )
 
         # calc cosine similarity
@@ -270,17 +287,12 @@ def main(prompt_success: bool, prompt_diff: int, actual_doc: str):
             help="0 cosine similarity means no similarity between generated and actual API documentation, 1 means they are same",
         )
         st.markdown("###")  # add a line break
-
+
         st.markdown(
-            "**GenAI evaluation scores:**",
-            help="Use OpenAI GPT 3 to evaluate the result of the generated API doc",
+            "**Readability Scores:**",
+            help="Evaluate how readable the generated text is",
         )
-        score = eval_using_model(result, openai_key=OPENAI_API_KEY())
-        st.write(score)
-
-        # Readability Scores
-        st.subheader("Readability Metrics")
-
+
         # Flesch Reading Ease
         flesch_reading_ease = textstat.flesch_reading_ease(result)
         st.markdown(
@@ -288,21 +300,6 @@ def main(prompt_success: bool, prompt_diff: int, actual_doc: str):
             help="Flesch Reading Ease measures how easy a text is to read. Higher scores indicate easier readability. Ranges 0-100 and a negative score indicates a more challenging text.",
         )
 
-        # Dale Chall Readability
-        dale_chall_readability = textstat.dale_chall_readability_score(result)
-        st.markdown(
-            f"Dale Chall Readability: {dale_chall_readability:.2f}",
-            help="The Dale-Chall Formula is a readability formula based on the use of familiar words, rather than syllable or letter counts. Lower scores mean more difficult words. No fixed ranges.",
-        )
-
-        # Automated Readability Index (ARI)
-        ari = textstat.automated_readability_index(result)
-        st.markdown(
-            f"ARI (Automated Readability Index): {ari:.2f}",
-            help="ARI relies on a factor of characters per word, instead of the usual syllables per word. ARI corresponds to a U.S. grade level. Higher scores indicate more advanced reading levels.",
-        )
-
-
     if st.button("Generate API Documentation"):
         if model_id != "OpenAI/gpt3.5":
             prompt_success, prompt_diff = check_prompt_token_limit(
diff --git a/app/requirements.txt b/app/requirements.txt
index 2ad5204..fa997bf 100644
--- a/app/requirements.txt
+++ b/app/requirements.txt
@@ -10,4 +10,5 @@ py-readability-metrics
 openai
 textstat
 scikit-learn
-streamlit-feedback
\ No newline at end of file
+streamlit-feedback
+langchain
\ No newline at end of file
diff --git a/app/utils.py b/app/utils.py
index c8e6411..c5a4efe 100644
--- a/app/utils.py
+++ b/app/utils.py
@@ -1,6 +1,3 @@
-import json
-import os
-from openai import OpenAI
 from genai import Credentials, Client
 from genai.text.generation import TextGenerationParameters
 from genai.text.tokenization import (
@@ -8,6 +5,15 @@
     TextTokenizationReturnOptions,
     TextTokenizationCreateResults,
 )
+from langchain.evaluation import (
+    Criteria,
+    load_evaluator,
+    EvaluatorType
+)
+import os
+import json
+from openai import OpenAI
+from langchain_community.chat_models import ChatOpenAI
 
 
 def generate_prompt(
@@ -261,3 +267,38 @@ def indicate_key_presence(env: str) -> str:
         return "*" * len(key)
     else:
         return ""
+
+def eval_using_langchain(prediction: str, query: str):
+
+    evaluation = []
+    llm = ChatOpenAI(model="gpt-4", temperature=0)
+
+    # Multiple criteria could be specified in one dict here, but that is generally not recommended.
+    custom_criterion_1 = {
+        "grammatical": "Is the output grammatically correct?",
+    }
+
+    eval_chain = load_evaluator(EvaluatorType.CRITERIA, llm=llm, criteria=custom_criterion_1)
+
+    eval_result = eval_chain.evaluate_strings(prediction=prediction, input=query)
+    evaluation.append(eval_result)
+
+    custom_criterion_2 = {
+        "descriptive": "Does the output describe a piece of code and its intended functionality?"
+    }
+
+    eval_chain = load_evaluator(EvaluatorType.CRITERIA, llm=llm, criteria=custom_criterion_2)
+
+    eval_result = eval_chain.evaluate_strings(prediction=prediction, input=query)
+    evaluation.append(eval_result)
+
+    evaluator = load_evaluator("criteria", llm=llm, criteria="helpfulness")
+
+    eval_result = evaluator.evaluate_strings(prediction=prediction, input=query)
+    evaluation.append(eval_result)
+
+    return evaluation
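
For reference, the snippet below is a minimal standalone sketch of the single-criterion evaluation pattern that eval_using_langchain wires up three times (grammatical, descriptive, helpfulness). It assumes OPENAI_API_KEY is exported in the environment and that the langchain, langchain-community, and openai packages from requirements.txt are installed; the prediction and input strings are illustrative placeholders, not values produced by the app.

# Sketch of one criteria evaluator, mirroring the pattern used in eval_using_langchain.
# Assumes OPENAI_API_KEY is set in the environment; the sample strings are placeholders.
from langchain.evaluation import EvaluatorType, load_evaluator
from langchain_community.chat_models import ChatOpenAI

llm = ChatOpenAI(model="gpt-4", temperature=0)

# A custom criterion is a {name: description} mapping; keeping one criterion
# per evaluator keeps each binary score unambiguous.
grammatical = {"grammatical": "Is the output grammatically correct?"}
evaluator = load_evaluator(EvaluatorType.CRITERIA, llm=llm, criteria=grammatical)

result = evaluator.evaluate_strings(
    prediction="add(a, b) returns the sum of its two integer arguments.",
    input="Generate API documentation for the add function.",
)

# The evaluator returns a dict with "score" (0 or 1), "value" ("Y"/"N"), and
# "reasoning"; app.py displays the "score" field for each criterion.
print(result["score"], result["value"])

Each load_evaluator call builds its own chain, so eval_using_langchain returns a three-element list of these result dicts, which app.py indexes positionally as lc_score[0] through lc_score[2].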