diff --git a/app/app.py b/app/app.py
index 9149bb0..494504c 100644
--- a/app/app.py
+++ b/app/app.py
@@ -256,7 +256,7 @@ def main(prompt_success: bool, prompt_diff: int, actual_doc: str):
         help="Use LangChain criteria-based eval to evaluate on custom criteria (this list can be updated based on what we are looking to see from the generated docs). Note this is language-model-based evaluation and not always a true indication of the quality of the generated output."
     )
-    lc_score = eval_using_langchain(prompt, result)
+    lc_score = eval_using_langchain(result, prompt, actual_doc)
     st.markdown(
         f"Logical: {lc_score[0]['score']}",
         help="Checks if the output is logical. Binary integer 0 to 1, where 1 means the output is logical and 0 means it is not",
@@ -288,4 +288,4 @@ def main(prompt_success: bool, prompt_diff: int, actual_doc: str):
     on_submit=store_feedback,
     optional_text_label="Please tell us how we could make this more useful",
     align="flex-start",
-)
\ No newline at end of file
+)
diff --git a/app/utils.py b/app/utils.py
index 6da7bd3..f30b516 100644
--- a/app/utils.py
+++ b/app/utils.py
@@ -241,7 +241,7 @@ def indicate_key_presence(env: str) -> str:
     else:
         return ""
 
-def eval_using_langchain(prediction: str, query: str):
+def eval_using_langchain(prediction: str, query: str, actual_doc: str):
     evaluation = []
     llm = ChatOpenAI(model="gpt-4", temperature=0)
 
@@ -256,7 +256,7 @@ def eval_using_langchain(prediction: str, query: str):
 
     # 2
     evaluator = load_evaluator("labeled_criteria", llm=llm, criteria="correctness")
-    eval_result = evaluator.evaluate_strings(prediction=generated_patch, input=prompt, reference=actual_doc)
+    eval_result = evaluator.evaluate_strings(prediction=prediction, input=query, reference=actual_doc)
    evaluation.append(eval_result)
 
     # 3
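
For context on why the `app/utils.py` change matters: the old body of `eval_using_langchain` referenced `generated_patch` and `prompt`, names that only exist at the call site in `app/app.py`, so the labeled-criteria evaluation would raise a `NameError` at runtime (or silently score the wrong strings if same-named globals happened to exist). It also needs the reference document, hence the new `actual_doc` parameter threaded through from `main`. Below is a minimal sketch of the fixed flow; the `# 1` block and its criterion wording are assumptions, since the diff only shows the `# 2` block:

```python
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import load_evaluator

def eval_using_langchain(prediction: str, query: str, actual_doc: str) -> list[dict]:
    """Score a generated doc with LangChain evaluators.

    prediction -- the generated documentation (`result` at the call site)
    query      -- the prompt that produced it
    actual_doc -- the reference documentation, required by labeled evaluators
    """
    evaluation = []
    llm = ChatOpenAI(model="gpt-4", temperature=0)

    # 1: reference-free custom criterion behind the "Logical" score in app.py
    # (criterion wording is hypothetical -- the real list lives in utils.py)
    evaluator = load_evaluator(
        "criteria", llm=llm, criteria={"logical": "Is the output logical and coherent?"}
    )
    evaluation.append(evaluator.evaluate_strings(prediction=prediction, input=query))

    # 2: reference-based correctness -- the call this diff fixes; it must use
    # the function's own parameters, not the caller's variable names
    evaluator = load_evaluator("labeled_criteria", llm=llm, criteria="correctness")
    evaluation.append(
        evaluator.evaluate_strings(prediction=prediction, input=query, reference=actual_doc)
    )
    return evaluation
```

Each `evaluate_strings` result is a dict with `score`, `value`, and `reasoning` keys, which is why the app reads `lc_score[0]['score']`.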