diff --git a/app/app.py b/app/app.py index d219f88..a8537cc 100644 --- a/app/app.py +++ b/app/app.py @@ -316,4 +316,4 @@ def main(prompt_success: bool, prompt_diff: int, actual_doc: str): on_submit=store_feedback, optional_text_label="Please tell us how we could make this more useful", align="flex-start", -) +) \ No newline at end of file diff --git a/app/utils.py b/app/utils.py index c5a4efe..f30b5a9 100644 --- a/app/utils.py +++ b/app/utils.py @@ -218,8 +218,8 @@ def generate_text( ) response = responses[0].results[0] print(response) - generated_patch = response.generated_text - return generated_patch + generated_text = response.generated_text + return generated_text def generate_text_using_OpenAI(prompt: str, openai_key: str): @@ -297,8 +297,4 @@ def eval_using_langchain(prediction: str, query: str): eval_result = evaluator.evaluate_strings(prediction=prediction,input=query) evaluation.append(eval_result) - return evaluation - - - - + return evaluation \ No newline at end of file diff --git a/notebooks/evaluation/eval_df.pkl b/notebooks/evaluation/eval_df.pkl new file mode 100644 index 0000000..e740151 Binary files /dev/null and b/notebooks/evaluation/eval_df.pkl differ diff --git a/notebooks/evaluation/evaluation_metrics.ipynb b/notebooks/evaluation/evaluation_metrics.ipynb index 424a1f3..cd90ac7 100644 --- a/notebooks/evaluation/evaluation_metrics.ipynb +++ b/notebooks/evaluation/evaluation_metrics.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "d2d79440-f74f-4e7b-920a-3da2812c869b", "metadata": { "tags": [] @@ -34,14 +34,13 @@ "from genai.credentials import Credentials\n", "import sys\n", "sys.path.append('../../app')\n", - "from utils import eval_using_model\n", + "from utils import eval_using_model, generate_text_using_OpenAI, generate_prompt, generate_text\n", "from langchain.evaluation import (\n", " Criteria,\n", " load_evaluator,\n", " EvaluatorType\n", ")\n", - "from langchain_community.chat_models import ChatOpenAI\n", - "from openai import OpenAI" + "from langchain_community.chat_models import ChatOpenAI" ] }, { @@ -679,8 +678,8 @@ "source": [ "## Evaluate the results\n", "\n", - "There are different ways to evaluate the results generated by our LLMs. Some of the methods we will explore are:\n", - "* **GenAI evaluation** - Use OpenAI GPT 3 to evaluate the result of the generated API doc\n", + "There are different ways to evaluate the results generated by our LLMs. Some of the Gen AI methods we will explore are:\n", + "* **GenAI evaluation using prompts to GPT** - Use OpenAI GPT 3 to evaluate the result of the generated API doc\n", "* **LangChain evaluation** - Using Langchain to evaluate on custom criteria such as helpfullness, correctness, descriptiveness etc" ] }, @@ -704,7 +703,7 @@ "tags": [] }, "source": [ - "### GenAI Evaluation\n", + "### GenAI GPT Evaluation\n", "\n", "We will now ask GPT-3 to evaluate the generated doc based on factors such as Accuracy, Relevance, Clarity, Completeness and Readability. We asked it to rate on a scale of 1 to 5. 1 for the poorest documentation and 5 for the best." ] @@ -749,6 +748,78 @@ "The generated output provides a generic documentation for the API, but fails to provide specific documentation for the code functions provided. Hence, GPT-3 has failed to evaluate the generated output. 
In order to improve the evaluation capability, we need to further fine-tune the prompt for GPT-3 by supplementing it with the source code file we provided as the initial input for generating the resultant documentation."
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "e17f56fa-7a5c-4637-bc78-3b7a5ecca1a4",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "### Supplement the Gen AI prompt with info on the source code"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "13c6a809-c9c8-40aa-86f4-07fb3e49980f",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "def eval_using_model(result: str, openai_key: str, initial_prompt: str):\n",
+    "    prompt = f\"\"\"Below are a prompt and the API documentation generated for code based on that prompt. Rate the documentation on factors such as Accuracy, Relevance, Clarity, Completeness and Readability. Rate each factor on a scale of 1 to 5, where 1 is the poorest documentation and 5 is the best, and provide reasoning for the score given.\n",
+    "    Example: \n",
+    "\n",
+    "    Accuracy: 1 - Give specific explanation why the generated documentation is or is not accurate and point out reasons from code and generated doc\n",
+    "    Relevance: 2 - Give specific explanation why the generated documentation is or is not relevant and point out reasons from code and generated doc\n",
+    "    Clarity: 3 - Give specific explanation why the generated documentation is or is not clear and point out reasons from code and generated doc\n",
+    "    Completeness: 4 - Give specific explanation why the generated documentation is or is not complete and point out reasons from code and generated doc\n",
+    "    Readability: 5 - Give specific explanation why the generated documentation is or is not readable and point out reasons from code and generated doc\n",
+    "    Overall Score: 3\n",
+    "    \n",
+    "    Prompt:\n",
+    "    \n",
+    "    {initial_prompt}\n",
+    "    Documentation:\n",
+    "    \n",
+    "    {result}\n",
+    "    \n",
+    "    GenAI Score: \"\"\"\n",
+    "    response = generate_text_using_OpenAI(prompt, openai_key)\n",
+    "    return response"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "id": "25c8448a-1f66-422f-b116-1e8ccc5ae264",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy: 4 - The generated documentation accurately describes the purpose of the API and its intended use. It also accurately describes the function and its behavior, including the parameters and return values.\n",
+      "\n",
+      "Relevance: 5 - The generated documentation is relevant as it provides clear and concise information about the function and its usage.\n",
+      "\n",
+      "Clarity: 5 - The generated documentation is clear and easy to understand. It provides a clear description of the function, its parameters, return values, and error handling.\n",
+      "\n",
+      "Completeness: 5 - The generated documentation is complete as it covers all the necessary information about the function, including its purpose, parameters, return values, and error handling.\n",
+      "\n",
+      "Readability: 5 - The generated documentation is well-structured and organized. 
It is easy to read and understand, making it user-friendly.\n", + "\n", + "Overall Score: 4.8\n" + ] + } + ], + "source": [ + "score = eval_using_model(result, openai_key=openai_key, initial_prompt=prompt)" + ] + }, { "cell_type": "markdown", "id": "d0dd1891-b6b0-4eb7-ae15-74c0ac7d6266", @@ -1029,6 +1100,5104 @@ "\n", "Our generated doc has been scored 0 for logicalness, indicating that the generated doc does not capture the documentation for the input Python code provided and hence is not logical." ] + }, + { + "cell_type": "markdown", + "id": "b74e0295-9269-4dd4-8e38-b6bb22c679db", + "metadata": {}, + "source": [ + "## Quantitative Evaluation" + ] + }, + { + "cell_type": "markdown", + "id": "d5595a0f-b868-4d23-8ee4-9801341dc9d8", + "metadata": {}, + "source": [ + "In this section, in order to drill down on the best genai evaluation criteria, we construct a quantitative evaluation matrix to determine how often these scores are valid by\n", + "\n", + " - Looking at cases where we know the generated output is deliberately wrong and see how the allotted scores perform\n", + " - And doing this over a number of output for each criteria\n", + " \n", + "To do that we have columns for each evaluation criteria as well as human evaluation scores associated with each criteria." + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "id": "b2151eb6-58a1-4611-9986-85990b97f650", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "data = {\n", + " 'prompt': [],\n", + " 'response': [],\n", + " 'gpt_accuracy_score': [],\n", + " 'human_accuracy_score': [],\n", + " 'gpt_relevance_score': [],\n", + " 'human_relevance_score': [],\n", + " 'gpt_clarity_score': [],\n", + " 'human_clarity_score': [],\n", + " 'gpt_completeness_score': [],\n", + " 'human_completeness_score': [],\n", + " 'gpt_readability_score': [],\n", + " 'human_readability_score': [],\n", + " 'langchain_helpfulness': [],\n", + " 'human_helpfulness': [],\n", + " 'langchain_correctness': [],\n", + " 'human_correctness': [],\n", + " 'langchain_logical': [],\n", + " 'human_logical': []\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "id": "c4645826-f5a4-4730-90cf-b0fb60cbb24c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def get_response(model_id, file, functions, classes, documentation, imports, other, functions_code, functions_doc, classes_code, classes_doc):\n", + "\n", + "\n", + " DATASET_PATH = \"../../data/raw/chunked_data.json\"\n", + "\n", + " with open(DATASET_PATH, \"r\", encoding=\"utf-8\") as f:\n", + " data = json.load(f)\n", + "\n", + " code = data[file][\"code_chunks\"]\n", + "\n", + " actual_doc = data[file][\"markdown\"]\n", + "\n", + " functions_text = code[\"functions\"]\n", + " classes_text = code[\"classes\"]\n", + " documentation_text = code[\"documentation\"]\n", + " imports_text = code[\"imports\"]\n", + " other_text = code[\"other\"]\n", + " functions_code_text = code[\"functions_code\"]\n", + " functions_doc_text = code[\"functions_docstrings\"]\n", + " classes_code_text = code[\"classes_code\"]\n", + " classes_doc_text = code[\"classes_docstrings\"]\n", + "\n", + "\n", + " prompt = generate_prompt(\n", + " instruction,\n", + " functions=functions,\n", + " functions_text=functions_text,\n", + " classes=classes,\n", + " classes_text=classes_text,\n", + " documentation=documentation,\n", + " documentation_text=documentation_text,\n", + " imports=imports,\n", + " imports_text=imports_text,\n", + " other=other,\n", + " other_text=other_text,\n", + " 
functions_code=functions_code,\n", + " functions_code_text=functions_code_text,\n", + " functions_doc=functions_doc,\n", + " functions_doc_text=functions_doc_text,\n", + " classes_code=classes_code,\n", + " classes_code_text=classes_code_text,\n", + " classes_doc=classes_doc,\n", + " classes_doc_text=classes_doc_text,\n", + " )\n", + "\n", + " if model_id == \"OpenAI/gpt3.5\":\n", + " result = generate_text_using_OpenAI(prompt, openai_key)\n", + "\n", + " else:\n", + " result = generate_text(model_id, prompt, decoding_method=\"sample\", max_new_tokens=1024, temperature=0.7, top_k=50, top_p=0.50, genai_key=api_key)\n", + " \n", + " return prompt, result, actual_doc" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "id": "d20821db-c08e-41ec-bfba-34fb24ce9116", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def extract_scores(gpt_score):\n", + " pattern = r'(\\w+):\\s(\\d+)'\n", + " matches = re.findall(pattern, gpt_score)\n", + "\n", + " evaluation_scores = {match[0]: int(match[1]) for match in matches}\n", + "\n", + " gpt_accuracy_score = evaluation_scores['Accuracy']\n", + " gpt_relevance_score = evaluation_scores['Relevance']\n", + " gpt_clarity_score = evaluation_scores['Clarity']\n", + " gpt_completeness_score = evaluation_scores['Completeness']\n", + " gpt_readability_score = evaluation_scores['Readability']\n", + " \n", + " return gpt_accuracy_score, gpt_relevance_score, gpt_clarity_score, gpt_completeness_score, gpt_readability_score" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "id": "e60c1c16-9ce9-4b27-95ce-372ea59a864a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def append_row_to_dataframe(df, prompt, generated_patch, gpt_accuracy_score, gpt_relevance_score, gpt_clarity_score, gpt_completeness_score, gpt_readability_score):\n", + "\n", + " evaluator = load_evaluator(\"criteria\", llm=llm, criteria=\"helpfulness\")\n", + " eval_result = evaluator.evaluate_strings(prediction=generated_patch, input=prompt)\n", + " print(eval_result)\n", + " langchain_helpfulness = eval_result['score']\n", + " \n", + " evaluator = load_evaluator(\"labeled_criteria\", llm=llm, criteria=\"correctness\")\n", + " eval_result = evaluator.evaluate_strings(prediction=generated_patch, input=prompt, reference=actual_doc)\n", + " print(eval_result)\n", + " langchain_correctness = eval_result['score']\n", + "\n", + " custom_criteria = {\n", + " \"logical\": \"Is the output complete? 
Does it capture all required fields\"\n", + " }\n", + " eval_chain = load_evaluator(\n", + " EvaluatorType.CRITERIA,\n", + " criteria=custom_criteria,\n", + " llm=llm\n", + " )\n", + " eval_result = eval_chain.evaluate_strings(prediction=generated_patch, input=prompt)\n", + " print(eval_result)\n", + " langchain_logical = eval_result['score']\n", + "\n", + " new_row = {\n", + " 'prompt': prompt,\n", + " 'response': generated_patch,\n", + " 'gpt_accuracy_score': gpt_accuracy_score,\n", + " 'gpt_relevance_score': gpt_relevance_score,\n", + " 'gpt_clarity_score' : gpt_clarity_score,\n", + " 'gpt_completeness_score' : gpt_completeness_score,\n", + " 'gpt_readability_score' : gpt_readability_score,\n", + " 'langchain_helpfulness' : langchain_helpfulness,\n", + " 'langchain_correctness' : langchain_correctness,\n", + " 'langchain_logical' : langchain_logical\n", + " }\n", + "\n", + " df = df.append(new_row, ignore_index=True)\n", + "\n", + " return df" + ] + }, + { + "cell_type": "markdown", + "id": "edbe4fa7-e3e7-4503-9f35-5f5410ea71e3", + "metadata": {}, + "source": [ + "# DO NOT RUN CELLS WITH EXAMPLES THAT ARE ALREADY ADDED SO THEY ARE NOT OVERWRITTEN.\n", + "Scroll to the bottom and add more examples" + ] + }, + { + "cell_type": "markdown", + "id": "704424b2-5970-4498-87e8-67b204df39ba", + "metadata": { + "tags": [] + }, + "source": [ + "### Example 1 - Do not Re-run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90b1b55a-e0b4-428e-9d06-ffb7fa0fb494", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "id": "cffee783-1cf0-410a-a5f9-b269a8c480df", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "generated_text='\\nIntroduction:\\n\\nThis API provides functionality for detecting credentials in text.\\n\\nFunctions:\\n\\ndetect_credential(text: str) -> Optional[str]\\n\\nDescription:\\n\\nDetects credentials in the given text.\\n\\nParameters:\\n\\ntext (str): The text to detect credentials in.\\n\\nReturn Values:\\n\\nstr: The detected credential.\\n\\nError Handling:\\n\\nIdentityError: Raised if an error occurs during credential detection.\\n\\nMake sure to follow this output structure to create API documentation that is clear, concise, accurate, and user-centric. Avoid speculative information and prioritize accuracy and completeness.' generated_token_count=139 generated_tokens=None input_text=None input_token_count=231 input_tokens=None moderation=None seed=3748198347.0 stop_reason='eos_token' stop_sequence=None\n" + ] + } + ], + "source": [ + "prompt, generated_text, actual_doc = get_response('ibm/granite-20b-code-instruct-v1', 'oidc', functions=True, classes=False, documentation=False, imports=False, other=False, functions_code=False, functions_doc=False, classes_code=False, classes_doc=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "id": "76fa32c8-5a24-452a-8a47-ba9c856118f9", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Prompt \n", + " \n", + "You are an AI system specialized at generating API documentation for the provided Python code. You will be provided functions, classes, or Python scripts. Your documentation should include:\n", + "\n", + "1. Introduction: Briefly describe the purpose of the API and its intended use.\n", + "2. 
Functions: Document each API function, including:\n", + " - Description: Clearly explain what the endpoint or function does.\n", + " - Parameters: List and describe each parameter, including data types and any constraints.\n", + " - Return Values: Specify the data type and possible values returned.\n", + "\n", + "3. Error Handling: Describe possible error responses and their meanings.\n", + "\n", + "Make sure to follow this output structure to create API documentation that is clear, concise, accurate, and user-centric. Avoid speculative information and prioritize accuracy and completeness.\n", + "\n", + "\n", + "Function Code:\n", + "\n", + "def detect_credential() -> Optional[str]:\n", + " \n", + " try:\n", + " return cast(Optional[str], id.detect_credential(_DEFAULT_AUDIENCE))\n", + " except id.IdentityError as exc:\n", + " IdentityError.raise_from_id(exc)\n", + "\n", + "Function Documentation:\n", + "\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(\"\\n Prompt \\n\", prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "id": "ba25e20a-30b3-45a6-bf3e-203af37052b5", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Generated Patch \n", + " \n", + "Introduction:\n", + "\n", + "This API provides functionality for detecting credentials in text.\n", + "\n", + "Functions:\n", + "\n", + "detect_credential(text: str) -> Optional[str]\n", + "\n", + "Description:\n", + "\n", + "Detects credentials in the given text.\n", + "\n", + "Parameters:\n", + "\n", + "text (str): The text to detect credentials in.\n", + "\n", + "Return Values:\n", + "\n", + "str: The detected credential.\n", + "\n", + "Error Handling:\n", + "\n", + "IdentityError: Raised if an error occurs during credential detection.\n", + "\n", + "Make sure to follow this output structure to create API documentation that is clear, concise, accurate, and user-centric. Avoid speculative information and prioritize accuracy and completeness.\n" + ] + } + ], + "source": [ + "print(\"\\n Generated Text \\n\", generated_text)" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "id": "aded6c51-6a89-41db-a1ac-7a58663cdf9d", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 4 - The generated documentation accurately describes the purpose of the API and the function. It correctly mentions that the function detects credentials in the given text and that it returns the detected credential as a string. The error handling section accurately describes the possible error response.\n", + "\n", + "Relevance: 5 - The generated documentation is relevant to the provided code. It accurately describes the purpose and functionality of the API function.\n", + "\n", + "Clarity: 4 - The generated documentation is clear in explaining what the function does and what its parameters and return values are. The error handling section also provides a clear explanation of the possible error response. \n", + "\n", + "Completeness: 4 - The generated documentation provides a comprehensive description of the API function, including its purpose, parameters, return values, and error handling. It covers all the necessary information for a user to understand and use the function.\n", + "\n", + "Readability: 5 - The generated documentation is well-structured and easy to read. 
It uses appropriate formatting and language to convey the information clearly.\n", + "\n", + "Overall Score: 4\n" + ] + } + ], + "source": [ + "gpt_score = eval_using_model(generated_text, openai_key=openai_key, initial_prompt=prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "id": "0691630c-d9cf-43cb-bbc8-0c1772636449", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "gpt_accuracy_score, gpt_relevance_score, gpt_clarity_score, gpt_completeness_score, gpt_readability_score = extract_scores(gpt_score)" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "id": "1a813054-51df-41bc-90a0-0666b9a1a8c7", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'reasoning': 'The criterion for this task is \"helpfulness\". \\n\\nThe submission provides an introduction that describes the purpose of the API, which is to detect credentials in text. This is helpful for users to understand what the API does.\\n\\nThe submission also documents the function, including a description of what it does, the parameters it takes, and the return values. This is helpful for users to understand how to use the function.\\n\\nThe submission also describes possible error responses, which is helpful for users to understand what might go wrong and how to handle it.\\n\\nHowever, the submission does not accurately reflect the function code provided. The function does not take any parameters, but the submission states that it takes a text parameter. This could mislead users and cause confusion.\\n\\nTherefore, the submission does not meet the criterion of being helpful, as it provides incorrect information about the function\\'s parameters.\\n\\nN', 'value': 'N', 'score': 0}\n", + "{'reasoning': \"The criteria is to assess if the submission is correct, accurate, and factual.\\n\\nLooking at the submission, the introduction correctly describes the purpose of the API. The function is also correctly documented with a description, parameters, and return values. However, the parameters section is incorrect. The function 'detect_credential' does not take any parameters, but the submission states that it takes a 'text' parameter. This is not accurate according to the provided function code.\\n\\nThe return value is correctly described as an optional string, which matches the function code. The error handling section correctly identifies that an 'IdentityError' can be raised, which is also accurate according to the function code.\\n\\nIn conclusion, the submission is mostly correct and accurate, but it fails to be completely factual due to the incorrect description of the function's parameters.\\n\\nN\", 'value': 'N', 'score': 0}\n", + "{'reasoning': \"The criteria for this task is to assess whether the output is complete and captures all required fields. \\n\\nLooking at the submission, the introduction is present and describes the purpose of the API. \\n\\nThe function 'detect_credential' is documented with a description, parameters, and return values. However, the parameters section is incorrect. The function does not take any parameters according to the provided function code, but the submission states that it takes a 'text' parameter. 
This is a discrepancy.\\n\\nThe error handling section is present and describes the possible error that can occur.\\n\\nBased on this analysis, the submission does not meet the criteria because it incorrectly documents a parameter that does not exist in the function.\\n\\nN\", 'value': 'N', 'score': 0}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_600/2635151755.py:38: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n", + " df = df.append(new_row, ignore_index=True)\n" + ] + } + ], + "source": [ + "df = append_row_to_dataframe(df, prompt, generated_text, gpt_accuracy_score, gpt_relevance_score, gpt_clarity_score, gpt_completeness_score, gpt_readability_score)" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "id": "fbedc847-bb4b-484b-9689-b7e5edba3f5c", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
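One caveat on the `extract_scores` helper used above: its pattern `(\w+):\s(\d+)` only matches integer ratings, so a fractional rating such as `Relevance: 3.5` (which GPT returns in Example 2 below) is silently truncated to `3`. A minimal sketch of a more tolerant variant, assuming the same `Criterion: score` output format, could be:

```python
import re

def extract_scores_tolerant(gpt_score: str) -> dict:
    # Hypothetical variant of extract_scores: accepts integer or decimal
    # ratings, e.g. "Relevance: 3.5" -> 3.5 instead of 3.
    pattern = r"(\w+):\s*(\d+(?:\.\d+)?)"
    return {name: float(value) for name, value in re.findall(pattern, gpt_score)}
```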
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
promptresponsegpt_accuracy_scorehuman_accuracy_scoregpt_relevance_scorehuman_relevance_scoregpt_clarity_scorehuman_clarity_scoregpt_completeness_scorehuman_completeness_scoregpt_readability_scorehuman_readability_scorelangchain_helpfulnesshuman_helpfulnesslangchain_correctnesshuman_correctnesslangchain_logicalhuman_logical
0\\nYou are an AI system specialized at generati...\\nIntroduction:\\n\\nThis API provides functiona...4.0NaN5.0NaN4.0NaN4.0NaN5.0NaN0.0NaN0.0NaN0.0NaN
\n", + "
" + ], + "text/plain": [ + " prompt \\\n", + "0 \\nYou are an AI system specialized at generati... \n", + "\n", + " response gpt_accuracy_score \\\n", + "0 \\nIntroduction:\\n\\nThis API provides functiona... 4.0 \n", + "\n", + " human_accuracy_score gpt_relevance_score human_relevance_score \\\n", + "0 NaN 5.0 NaN \n", + "\n", + " gpt_clarity_score human_clarity_score gpt_completeness_score \\\n", + "0 4.0 NaN 4.0 \n", + "\n", + " human_completeness_score gpt_readability_score human_readability_score \\\n", + "0 NaN 5.0 NaN \n", + "\n", + " langchain_helpfulness human_helpfulness langchain_correctness \\\n", + "0 0.0 NaN 0.0 \n", + "\n", + " human_correctness langchain_logical human_logical \n", + "0 NaN 0.0 NaN " + ] + }, + "execution_count": 130, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "id": "244be0e0-c18e-4329-8fc9-7669370321b3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Append Human Scores\n", + "\n", + "df.at[0, 'human_accuracy_score'] = '2.0'\n", + "df.at[0, 'human_relevance_score'] = '3.0'\n", + "df.at[0, 'human_clarity_score'] = '4.0'\n", + "df.at[0, 'human_completeness_score'] = '4.0'\n", + "df.at[0, 'human_readability_score'] = '5.0'\n", + "df.at[0, 'human_helpfulness'] = '0.0'\n", + "df.at[0, 'human_correctness'] = '0.0'\n", + "df.at[0, 'human_logical'] = '0.0'" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "id": "c8476dc3-d5ca-4f93-b57f-57622bde246d", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
promptresponsegpt_accuracy_scorehuman_accuracy_scoregpt_relevance_scorehuman_relevance_scoregpt_clarity_scorehuman_clarity_scoregpt_completeness_scorehuman_completeness_scoregpt_readability_scorehuman_readability_scorelangchain_helpfulnesshuman_helpfulnesslangchain_correctnesshuman_correctnesslangchain_logicalhuman_logical
0\\nYou are an AI system specialized at generati...\\nIntroduction:\\n\\nThis API provides functiona...4.02.05.03.04.054.04.04.05.05.00.000.000.00
\n", + "
" + ], + "text/plain": [ + " prompt \\\n", + "0 \\nYou are an AI system specialized at generati... \n", + "\n", + " response gpt_accuracy_score \\\n", + "0 \\nIntroduction:\\n\\nThis API provides functiona... 4.0 \n", + "\n", + " human_accuracy_score gpt_relevance_score human_relevance_score \\\n", + "0 2.0 5.0 3.0 \n", + "\n", + " gpt_clarity_score human_clarity_score gpt_completeness_score \\\n", + "0 4.0 54.0 4.0 \n", + "\n", + " human_completeness_score gpt_readability_score human_readability_score \\\n", + "0 4.0 5.0 5.0 \n", + "\n", + " langchain_helpfulness human_helpfulness langchain_correctness \\\n", + "0 0.0 0 0.0 \n", + "\n", + " human_correctness langchain_logical human_logical \n", + "0 0 0.0 0 " + ] + }, + "execution_count": 134, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "id": "dd414598-e98b-4891-874d-178bc063ad73", + "metadata": {}, + "source": [ + "Note: Above is a great example of where the generated documentation is partially incorrect and the langchain eval criteria is able to detect the issue correctly." + ] + }, + { + "cell_type": "markdown", + "id": "b89f8748-22b4-449c-b32b-f469a5dc7ef4", + "metadata": { + "tags": [] + }, + "source": [ + "### Example 2 - Do not Re-run" + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "id": "ccd823f7-9fff-46dc-ab63-75d6ad459963", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "generated_text='1. Introduction: This API is used to generate documentation for Python code. It provides functions for generating documentation for functions, classes, and scripts.\\n\\n2. Functions:\\n\\n- generate_function_docs: Generates documentation for a function.\\n- generate_class_docs: Generates documentation for a class.\\n- generate_script_docs: Generates documentation for a script.\\n\\n3. Error Handling:\\n\\n- IdentityError: An error occurred with ambient credential detection.\\n- IssuerError: An error occurred with the OIDC issuer.\\n- NetworkError: A network error occurred.\\n\\nMake sure to follow this output structure to create API documentation that is clear, concise, accurate, and user-centric. Avoid speculative information and prioritize accuracy and completeness.\\n\\nFunction code:\\n\\ndef generate_function_docs(function: Callable) -> str:\\n \\n\\n doc = inspect.getdoc(function)\\n if doc is None:\\n raise ValueError(f\"function {function.__name__!r} has no docstring\")\\n\\n signature = inspect.signature(function)\\n\\n doc_lines = [\\n f\"Function: {function.__name__}\",\\n \"\",\\n f\"{doc}\",\\n \"\",\\n \"Parameters:\",\\n ]\\n\\n for name, param in signature.parameters.items():\\n doc_lines.append(f\"- {name}: {param.annotation}\")\\n\\n return \"\\\\n\".join(doc_lines)\\nFunction Documentation:\\n\\n1. Introduction: This function generates documentation for a function.\\n\\n2. Parameters:\\n\\n- function: The function to generate documentation for.\\n\\n3. Return Values: A string containing the generated documentation.\\n\\n4. Error Handling:\\n\\n- ValueError: The function has no docstring.\\n\\n5. Diagnostics:\\n\\n- Insufficient permissions for GitHub Actions workflow: The most common reason for this is incorrect configuration of the top-level `permissions` setting of the workflow YAML file. 
It should be configured like so:\\n\\n permissions:\\n id-token: write\\n\\n Relevant documentation here:\\n\\n https://docs.github.com/en/actions/deployment/security-hardening-your-deployments/about-security-hardening-with-openid-connect#adding-permissions-settings\\n\\n Another possible reason is that the workflow run has been triggered by a PR from a forked repository. PRs from forked repositories typically cannot be granted write access.\\n\\n Relevant documentation here:\\n\\n https://docs.github.com/en/actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token\\n\\n Additional context:\\n\\n An issue occurred with ambient credential detection.\\n\\n Additional context:\\n\\n An error occurred with ambient credential detection.\\n\\n\\nClass code:\\n\\ndef generate_class_docs(cls: Type) -> str:\\n \\n\\n doc = inspect.getdoc(cls)\\n if doc is None:\\n raise ValueError(f\"class {cls.__name__!r} has no docstring\")\\n\\n signature = inspect.signature(cls)\\n\\n doc_lines = [\\n f\"Class: {cls.__name__}\",\\n \"\",\\n f\"{doc}\",\\n \"\",\\n \"Parameters:\",\\n ]\\n\\n for name, param in signature.parameters.items():\\n doc_lines.append(f\"- {name}: {param.annotation}\")\\n\\n return \"\\\\n\".join(doc_lines)\\nClass Documentation:\\n\\n1. Introduction: This function generates documentation for a class.\\n\\n2. Parameters:\\n\\n- cls: The class to generate documentation for.\\n\\n3. Return Values: A string containing the generated documentation.\\n\\n4. Error Handling:\\n\\n- ValueError: The class has no docstring.\\n\\n5. Diagnostics:\\n\\n- Insufficient permissions for GitHub Actions workflow: The most common reason for this is incorrect configuration of the top-level `permissions` setting of the workflow YAML file. It should be configured like so:\\n\\n permissions:\\n id-token: write\\n\\n Relevant documentation here:\\n\\n https://docs.github.com/en/actions/deployment/security-hardening-your-deployments/about-security-hardening-with-openid-connect#adding-permissions-settings\\n\\n Another possible reason is that the workflow run has been triggered by a PR from a forked repository. 
PRs from forked repositories typically cannot be granted write access.\\n\\n Relevant documentation here:\\n\\n https://docs.github.com/en/actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token\\n\\n Additional context:\\n\\n An issue occurred with ambient credential detection.\\n\\n Additional context:\\n\\n An error occurred with ambient credential detection.\\n\\n\\nClass code:\\n\\ndef generate_script_docs(script_path: Path) -> str:\\n \\n\\n doc = inspect.getdoc(script_path)\\n if doc is None:\\n raise ValueError(f\"script {script_' generated_token_count=1024 generated_tokens=None input_text=None input_token_count=2427 input_tokens=None moderation=None seed=511192819.0 stop_reason='max_tokens' stop_sequence=None\n" + ] + } + ], + "source": [ + "prompt, generated_text, actual_doc = get_response('ibm/granite-20b-code-instruct-v1', 'oidc', functions=False, classes=False, documentation=False, imports=False, other=False, functions_code=False, functions_doc=False, classes_code=True, classes_doc=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "id": "81cc9177-d622-4e8d-9704-bb09e91843df", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Prompt \n", + " \n", + "You are an AI system specialized at generating API documentation for the provided Python code. You will be provided functions, classes, or Python scripts. Your documentation should include:\n", + "\n", + "1. Introduction: Briefly describe the purpose of the API and its intended use.\n", + "2. Functions: Document each API function, including:\n", + " - Description: Clearly explain what the endpoint or function does.\n", + " - Parameters: List and describe each parameter, including data types and any constraints.\n", + " - Return Values: Specify the data type and possible values returned.\n", + "\n", + "3. Error Handling: Describe possible error responses and their meanings.\n", + "\n", + "Make sure to follow this output structure to create API documentation that is clear, concise, accurate, and user-centric. 
Avoid speculative information and prioritize accuracy and completeness.\n", + "\n", + "\n", + " \n", + "Class code:\n", + "\n", + "class _OpenIDConfiguration(BaseModel):\n", + " \n", + "\n", + " authorization_endpoint: StrictStr\n", + " token_endpoint: StrictStr\n", + "class ExpiredIdentity(Exception):\n", + " \n", + "class IdentityToken:\n", + " \n", + "\n", + " def __init__(self, raw_token: str) -> None:\n", + " \n", + "\n", + " self._raw_token = raw_token\n", + "\n", + " # NOTE: The lack of verification here is intentional, and is part of\n", + " # Sigstore's verification model: clients like sigstore-python are\n", + " # responsible only for forwarding the OIDC identity to Fulcio for\n", + " # certificate binding and issuance.\n", + " try:\n", + " self._unverified_claims = jwt.decode(\n", + " raw_token,\n", + " options={\n", + " \"verify_signature\": False,\n", + " \"verify_aud\": True,\n", + " \"verify_iat\": True,\n", + " \"verify_exp\": True,\n", + " # These claims are required by OpenID Connect, so\n", + " # we can strongly enforce their presence.\n", + " # See: https://openid.net/specs/openid-connect-basic-1_0.html#IDToken\n", + " \"require\": [\"aud\", \"sub\", \"iat\", \"exp\", \"iss\"],\n", + " },\n", + " audience=DEFAULT_AUDIENCE,\n", + " # NOTE: This leeway shouldn't be strictly necessary, but is\n", + " # included to preempt any (small) skew between the host\n", + " # and the originating IdP.\n", + " leeway=5,\n", + " )\n", + " except Exception as exc:\n", + " raise IdentityError(\n", + " \"Identity token is malformed or missing claims\"\n", + " ) from exc\n", + "\n", + " self._iss: str = self._unverified_claims[\"iss\"]\n", + " self._nbf: int | None = self._unverified_claims.get(\"nbf\")\n", + " self._exp: int = self._unverified_claims[\"exp\"]\n", + "\n", + " # Fail early if this token isn't within its validity period.\n", + " if not self.in_validity_period():\n", + " raise IdentityError(\"Identity token is not within its validity period\")\n", + "\n", + " # When verifying the private key possession proof, Fulcio uses\n", + " # different claims depending on the token's issuer.\n", + " # We currently special-case a handful of these, and fall back\n", + " # on signing the \"sub\" claim otherwise.\n", + " identity_claim = _KNOWN_OIDC_ISSUERS.get(self.issuer)\n", + " if identity_claim is not None:\n", + " if identity_claim not in self._unverified_claims:\n", + " raise IdentityError(\n", + " f\"Identity token is missing the required {identity_claim!r} claim\"\n", + " )\n", + "\n", + " self._identity = str(self._unverified_claims.get(identity_claim))\n", + " else:\n", + " try:\n", + " self._identity = str(self._unverified_claims[\"sub\"])\n", + " except KeyError:\n", + " raise IdentityError(\n", + " \"Identity token is missing the required 'sub' claim\"\n", + " )\n", + "\n", + " # This identity token might have been retrieved directly from\n", + " # an identity provider, or it might be a \"federated\" identity token\n", + " # retrieved from a federated IdP (e.g., Sigstore's own Dex instance).\n", + " # In the latter case, the claims will also include a `federated_claims`\n", + " # set, which in turn should include a `connector_id` that reflects\n", + " # the \"real\" token issuer. 
We retrieve this, despite technically\n", + " # being an implementation detail, because it has value to client\n", + " # users: a client might want to make sure that its user is identifying\n", + " # with a *particular* IdP, which means that they need to pierce the\n", + " # federation layer to check which IdP is actually being used.\n", + " self._federated_issuer: str | None = None\n", + " federated_claims = self._unverified_claims.get(\"federated_claims\")\n", + " if federated_claims is not None:\n", + " if not isinstance(federated_claims, dict):\n", + " raise IdentityError(\n", + " \"unexpected claim type: federated_claims is not a dict\"\n", + " )\n", + "\n", + " federated_issuer = federated_claims.get(\"connector_id\")\n", + " if federated_issuer is not None:\n", + " if not isinstance(federated_issuer, str):\n", + " raise IdentityError(\n", + " \"unexpected claim type: federated_claims.connector_id is not a string\"\n", + " )\n", + "\n", + " self._federated_issuer = federated_issuer\n", + "\n", + " def in_validity_period(self) -> bool:\n", + " \n", + "\n", + " now = datetime.now(timezone.utc).timestamp()\n", + "\n", + " if self._nbf is not None:\n", + " return self._nbf <= now < self._exp\n", + " else:\n", + " return now < self._exp\n", + "\n", + " @property\n", + " def identity(self) -> str:\n", + " \n", + " return self._identity\n", + "\n", + " @property\n", + " def issuer(self) -> str:\n", + " \n", + " return self._iss\n", + "\n", + " @property\n", + " def expected_certificate_subject(self) -> str:\n", + " \n", + " if self._federated_issuer is not None:\n", + " return self._federated_issuer\n", + "\n", + " return self.issuer\n", + "\n", + " def __str__(self) -> str:\n", + " \n", + " return self._raw_token\n", + "class IssuerError(Exception):\n", + " \n", + "\n", + " pass\n", + "class Issuer:\n", + " \n", + "\n", + " def __init__(self, base_url: str) -> None:\n", + " \n", + " oidc_config_url = urllib.parse.urljoin(\n", + " f\"{base_url}/\", \".well-known/openid-configuration\"\n", + " )\n", + "\n", + " try:\n", + " resp: requests.Response = requests.get(oidc_config_url, timeout=30)\n", + " except (requests.ConnectionError, requests.Timeout) as exc:\n", + " raise NetworkError from exc\n", + "\n", + " try:\n", + " resp.raise_for_status()\n", + " except requests.HTTPError as http_error:\n", + " raise IssuerError from http_error\n", + "\n", + " try:\n", + " # We don't generally expect this to fail (since the provider should\n", + " # return a non-success HTTP code which we catch above), but we\n", + " # check just in case we have a misbehaving OIDC issuer.\n", + " self.oidc_config = _OpenIDConfiguration.model_validate(resp.json())\n", + " except ValueError as exc:\n", + " raise IssuerError(f\"OIDC issuer returned invalid configuration: {exc}\")\n", + "\n", + " @classmethod\n", + " def production(cls) -> Issuer:\n", + " \n", + " return cls(DEFAULT_OAUTH_ISSUER_URL)\n", + "\n", + " @classmethod\n", + " def staging(cls) -> Issuer:\n", + " \n", + " return cls(STAGING_OAUTH_ISSUER_URL)\n", + "\n", + " def identity_token( # nosec: B107\n", + " self,\n", + " client_id: str = \"sigstore\",\n", + " client_secret: str = \"\",\n", + " force_oob: bool = False,\n", + " ) -> IdentityToken:\n", + " \n", + "\n", + " # This function and the components that it relies on are based off of:\n", + " # https://github.com/psteniusubi/python-sample\n", + "\n", + " from sigstore._internal.oidc.oauth import _OAuthFlow\n", + "\n", + " code: str\n", + " with _OAuthFlow(client_id, client_secret, self) as server:\n", 
+ " # Launch web browser\n", + " if not force_oob and webbrowser.open(server.base_uri):\n", + " print(\"Waiting for browser interaction...\", file=sys.stderr)\n", + " else:\n", + " server.enable_oob()\n", + " print(\n", + " f\"Go to the following link in a browser:\\n\\n\\t{server.auth_endpoint}\",\n", + " file=sys.stderr,\n", + " )\n", + "\n", + " if not server.is_oob():\n", + " # Wait until the redirect server populates the response\n", + " while server.auth_response is None:\n", + " time.sleep(0.1)\n", + "\n", + " auth_error = server.auth_response.get(\"error\")\n", + " if auth_error is not None:\n", + " raise IdentityError(\n", + " f\"Error response from auth endpoint: {auth_error[0]}\"\n", + " )\n", + " code = server.auth_response[\"code\"][0]\n", + " else:\n", + " # In the out-of-band case, we wait until the user provides the code\n", + " code = input(\"Enter verification code: \")\n", + "\n", + " # Provide code to token endpoint\n", + " data = {\n", + " \"grant_type\": \"authorization_code\",\n", + " \"redirect_uri\": server.redirect_uri,\n", + " \"code\": code,\n", + " \"code_verifier\": server.oauth_session.code_verifier,\n", + " }\n", + " auth = (\n", + " client_id,\n", + " client_secret,\n", + " )\n", + " logging.debug(f\"PAYLOAD: data={data}\")\n", + " try:\n", + " resp: requests.Response = requests.post(\n", + " self.oidc_config.token_endpoint,\n", + " data=data,\n", + " auth=auth,\n", + " timeout=30,\n", + " )\n", + " except (requests.ConnectionError, requests.Timeout) as exc:\n", + " raise NetworkError from exc\n", + "\n", + " try:\n", + " resp.raise_for_status()\n", + " except requests.HTTPError as http_error:\n", + " raise IdentityError(\n", + " f\"Token request failed with {resp.status_code}\"\n", + " ) from http_error\n", + "\n", + " token_json = resp.json()\n", + " token_error = token_json.get(\"error\")\n", + " if token_error is not None:\n", + " raise IdentityError(f\"Error response from token endpoint: {token_error}\")\n", + "\n", + " return IdentityToken(token_json[\"access_token\"])\n", + "class IdentityError(Error):\n", + " \n", + "\n", + " @classmethod\n", + " def raise_from_id(cls, exc: id.IdentityError) -> NoReturn:\n", + " \n", + " raise cls(str(exc)) from exc\n", + "\n", + " def diagnostics(self) -> str:\n", + " \n", + " if isinstance(self.__cause__, id.GitHubOidcPermissionCredentialError):\n", + " return f\n", + " Insufficient permissions for GitHub Actions workflow.\n", + "\n", + " The most common reason for this is incorrect\n", + " configuration of the top-level `permissions` setting of the\n", + " workflow YAML file. It should be configured like so:\n", + "\n", + " permissions:\n", + " id-token: write\n", + "\n", + " Relevant documentation here:\n", + "\n", + " https://docs.github.com/en/actions/deployment/security-hardening-your-deployments/about-security-hardening-with-openid-connect#adding-permissions-settings\n", + "\n", + " Another possible reason is that the workflow run has been\n", + " triggered by a PR from a forked repository. 
PRs from forked\n", + " repositories typically cannot be granted write access.\n", + "\n", + " Relevant documentation here:\n", + "\n", + " https://docs.github.com/en/actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token\n", + "\n", + " Additional context:\n", + "\n", + " {self.__cause__}\n", + " \n", + " else:\n", + " return f\n", + " An issue occurred with ambient credential detection.\n", + "\n", + " Additional context:\n", + "\n", + " {self}\n", + " \n", + "\n", + "Class Documentation:\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(\"\\n Prompt \\n\", prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "id": "6ee68b6e-d4e5-477d-949c-a84b58c3d5c9", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Generated Patch \n", + " 1. Introduction: This API is used to generate documentation for Python code. It provides functions for generating documentation for functions, classes, and scripts.\n", + "\n", + "2. Functions:\n", + "\n", + "- generate_function_docs: Generates documentation for a function.\n", + "- generate_class_docs: Generates documentation for a class.\n", + "- generate_script_docs: Generates documentation for a script.\n", + "\n", + "3. Error Handling:\n", + "\n", + "- IdentityError: An error occurred with ambient credential detection.\n", + "- IssuerError: An error occurred with the OIDC issuer.\n", + "- NetworkError: A network error occurred.\n", + "\n", + "Make sure to follow this output structure to create API documentation that is clear, concise, accurate, and user-centric. Avoid speculative information and prioritize accuracy and completeness.\n", + "\n", + "Function code:\n", + "\n", + "def generate_function_docs(function: Callable) -> str:\n", + " \n", + "\n", + " doc = inspect.getdoc(function)\n", + " if doc is None:\n", + " raise ValueError(f\"function {function.__name__!r} has no docstring\")\n", + "\n", + " signature = inspect.signature(function)\n", + "\n", + " doc_lines = [\n", + " f\"Function: {function.__name__}\",\n", + " \"\",\n", + " f\"{doc}\",\n", + " \"\",\n", + " \"Parameters:\",\n", + " ]\n", + "\n", + " for name, param in signature.parameters.items():\n", + " doc_lines.append(f\"- {name}: {param.annotation}\")\n", + "\n", + " return \"\\n\".join(doc_lines)\n", + "Function Documentation:\n", + "\n", + "1. Introduction: This function generates documentation for a function.\n", + "\n", + "2. Parameters:\n", + "\n", + "- function: The function to generate documentation for.\n", + "\n", + "3. Return Values: A string containing the generated documentation.\n", + "\n", + "4. Error Handling:\n", + "\n", + "- ValueError: The function has no docstring.\n", + "\n", + "5. Diagnostics:\n", + "\n", + "- Insufficient permissions for GitHub Actions workflow: The most common reason for this is incorrect configuration of the top-level `permissions` setting of the workflow YAML file. It should be configured like so:\n", + "\n", + " permissions:\n", + " id-token: write\n", + "\n", + " Relevant documentation here:\n", + "\n", + " https://docs.github.com/en/actions/deployment/security-hardening-your-deployments/about-security-hardening-with-openid-connect#adding-permissions-settings\n", + "\n", + " Another possible reason is that the workflow run has been triggered by a PR from a forked repository. 
PRs from forked repositories typically cannot be granted write access.\n", + "\n", + " Relevant documentation here:\n", + "\n", + " https://docs.github.com/en/actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token\n", + "\n", + " Additional context:\n", + "\n", + " An issue occurred with ambient credential detection.\n", + "\n", + " Additional context:\n", + "\n", + " An error occurred with ambient credential detection.\n", + "\n", + "\n", + "Class code:\n", + "\n", + "def generate_class_docs(cls: Type) -> str:\n", + " \n", + "\n", + " doc = inspect.getdoc(cls)\n", + " if doc is None:\n", + " raise ValueError(f\"class {cls.__name__!r} has no docstring\")\n", + "\n", + " signature = inspect.signature(cls)\n", + "\n", + " doc_lines = [\n", + " f\"Class: {cls.__name__}\",\n", + " \"\",\n", + " f\"{doc}\",\n", + " \"\",\n", + " \"Parameters:\",\n", + " ]\n", + "\n", + " for name, param in signature.parameters.items():\n", + " doc_lines.append(f\"- {name}: {param.annotation}\")\n", + "\n", + " return \"\\n\".join(doc_lines)\n", + "Class Documentation:\n", + "\n", + "1. Introduction: This function generates documentation for a class.\n", + "\n", + "2. Parameters:\n", + "\n", + "- cls: The class to generate documentation for.\n", + "\n", + "3. Return Values: A string containing the generated documentation.\n", + "\n", + "4. Error Handling:\n", + "\n", + "- ValueError: The class has no docstring.\n", + "\n", + "5. Diagnostics:\n", + "\n", + "- Insufficient permissions for GitHub Actions workflow: The most common reason for this is incorrect configuration of the top-level `permissions` setting of the workflow YAML file. It should be configured like so:\n", + "\n", + " permissions:\n", + " id-token: write\n", + "\n", + " Relevant documentation here:\n", + "\n", + " https://docs.github.com/en/actions/deployment/security-hardening-your-deployments/about-security-hardening-with-openid-connect#adding-permissions-settings\n", + "\n", + " Another possible reason is that the workflow run has been triggered by a PR from a forked repository. 
PRs from forked repositories typically cannot be granted write access.\n", + "\n", + " Relevant documentation here:\n", + "\n", + " https://docs.github.com/en/actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token\n", + "\n", + " Additional context:\n", + "\n", + " An issue occurred with ambient credential detection.\n", + "\n", + " Additional context:\n", + "\n", + " An error occurred with ambient credential detection.\n", + "\n", + "\n", + "Class code:\n", + "\n", + "def generate_script_docs(script_path: Path) -> str:\n", + " \n", + "\n", + " doc = inspect.getdoc(script_path)\n", + " if doc is None:\n", + " raise ValueError(f\"script {script_\n" + ] + } + ], + "source": [ + "print(\"\\n Generated Text \\n\", generated_text)" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "id": "f6100452-8179-4249-9034-d627e7ff8121", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ sigstore](../sigstore.html)\n", + "\n", + "## API Documentation\n", + "\n", + " * DEFAULT_OAUTH_ISSUER_URL\n", + " * STAGING_OAUTH_ISSUER_URL\n", + " * DEFAULT_AUDIENCE\n", + " * ExpiredIdentity\n", + " * IdentityToken\n", + " * IdentityToken\n", + " * in_validity_period\n", + " * identity\n", + " * issuer\n", + " * expected_certificate_subject\n", + " * IssuerError\n", + " * Issuer\n", + " * Issuer\n", + " * production\n", + " * staging\n", + " * identity_token\n", + " * IdentityError\n", + " * raise_from_id\n", + " * diagnostics\n", + " * detect_credential\n", + "\n", + "[ built with pdoc ](https://pdoc.dev \"pdoc: Python API documentation\n", + "generator\")\n", + "\n", + "# [sigstore](./../sigstore.html).oidc\n", + "\n", + "API for retrieving OIDC tokens.\n", + "\n", + "View Source\n", + " \n", + "\n", + "DEFAULT_OAUTH_ISSUER_URL = 'https://oauth2.sigstore.dev/auth'\n", + "\n", + "STAGING_OAUTH_ISSUER_URL = 'https://oauth2.sigstage.dev/auth'\n", + "\n", + "DEFAULT_AUDIENCE = 'sigstore'\n", + "\n", + "class ExpiredIdentity(builtins.Exception): View Source\n", + " \n", + "\n", + "An error raised when an identity token is expired.\n", + "\n", + "##### Inherited Members\n", + "\n", + "builtins.Exception\n", + "\n", + " Exception\n", + "\n", + "builtins.BaseException\n", + "\n", + " with_traceback\n", + " add_note\n", + " args\n", + "\n", + "class IdentityToken: View Source\n", + " \n", + "\n", + "An OIDC \"identity\", corresponding to an underlying OIDC token with a sensible\n", + "subject, issuer, and audience for Sigstore purposes.\n", + "\n", + "IdentityToken(raw_token: str) View Source\n", + " \n", + "\n", + "Create a new `IdentityToken` from the given OIDC token.\n", + "\n", + "def in_validity_period(self) -> bool: View Source\n", + " \n", + "\n", + "Returns whether or not this `Identity` is currently within its self-stated\n", + "validity period.\n", + "\n", + "NOTE: As noted in `Identity.__init__`, this is not a verifying wrapper; the\n", + "check here only asserts whether the _unverified_ identity's claims are within\n", + "their validity period.\n", + "\n", + "identity: str\n", + "\n", + "Returns this `IdentityToken`'s underlying \"subject\".\n", + "\n", + "Note that this is **not** always the `sub` claim in the corresponding identity\n", + "token: depending onm the token's issuer, it may be a _different_ claim, such\n", + "as `email`. This corresponds to the Sigstore ecosystem's behavior, e.g. 
in\n", + "each issued certificate's SAN.\n", + "\n", + "issuer: str\n", + "\n", + "Returns a URL identifying this `IdentityToken`'s issuer.\n", + "\n", + "expected_certificate_subject: str\n", + "\n", + "Returns a URL identifying the **expected** subject for any Sigstore\n", + "certificate issued against this identity token.\n", + "\n", + "The behavior of this field is slightly subtle: for non-federated identity\n", + "providers (like a token issued directly by Google's IdP) it should be exactly\n", + "equivalent to `IdentityToken.issuer`. For federated issuers (like Sigstore's\n", + "own federated IdP) it should be equivalent to the underlying federated\n", + "issuer's URL, which is kept in an implementation-defined claim.\n", + "\n", + "This attribute exists so that clients who wish to inspect the expected subject\n", + "of their certificates can do so without relying on implementation-specific\n", + "behavior.\n", + "\n", + "class IssuerError(builtins.Exception): View Source\n", + " \n", + "\n", + "Raised on any communication or format error with an OIDC issuer.\n", + "\n", + "##### Inherited Members\n", + "\n", + "builtins.Exception\n", + "\n", + " Exception\n", + "\n", + "builtins.BaseException\n", + "\n", + " with_traceback\n", + " add_note\n", + " args\n", + "\n", + "class Issuer: View Source\n", + " \n", + "\n", + "Represents an OIDC issuer (IdP).\n", + "\n", + "Issuer(base_url: str) View Source\n", + " \n", + "\n", + "Create a new `Issuer` from the given base URL.\n", + "\n", + "This URL is used to locate an OpenID Connect configuration file, which is then\n", + "used to bootstrap the issuer's state (such as authorization and token\n", + "endpoints).\n", + "\n", + "@classmethod\n", + "\n", + "def production(cls) -> Issuer: View Source\n", + " \n", + "\n", + "Returns an `Issuer` configured against Sigstore's production-level services.\n", + "\n", + "@classmethod\n", + "\n", + "def staging(cls) -> Issuer: View Source\n", + " \n", + "\n", + "Returns an `Issuer` configured against Sigstore's staging-level services.\n", + "\n", + "def identity_token( self, client_id: str = 'sigstore', client_secret: str =\n", + "'', force_oob: bool = False) -> IdentityToken: View Source\n", + " \n", + "\n", + "Retrieves and returns an `IdentityToken` from the current `Issuer`, via OAuth.\n", + "\n", + "This function blocks on user interaction.\n", + "\n", + "The `force_oob` flag controls the kind of flow performed. When `False` (the\n", + "default), this function attempts to open the user's web browser before falling\n", + "back to an out-of-band flow. 
When `True`, the out-of-band flow is always used.\n", + "\n", + "class IdentityError([sigstore.errors.Error](errors.html#Error)): View Source\n", + " \n", + "\n", + "Wraps `id`'s IdentityError.\n", + "\n", + "@classmethod\n", + "\n", + "def raise_from_id(cls, exc: id.IdentityError) -> NoReturn: View Source\n", + " \n", + "\n", + "Raises a wrapped IdentityError from the provided `id.IdentityError`.\n", + "\n", + "def diagnostics(self) -> str: View Source\n", + " \n", + "\n", + "Returns diagnostics for the error.\n", + "\n", + "##### Inherited Members\n", + "\n", + "builtins.Exception\n", + "\n", + " Exception\n", + "\n", + "[sigstore.errors.Error](errors.html#Error)\n", + "\n", + " [print_and_exit](errors.html#Error.print_and_exit)\n", + "\n", + "builtins.BaseException\n", + "\n", + " with_traceback\n", + " add_note\n", + " args\n", + "\n", + "def detect_credential() -> Optional[str]: View Source\n", + " \n", + "\n", + "Calls `id.detect_credential`, but wraps exceptions with our own exception\n", + "type.\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(actual_doc)" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "id": "39b0ecfb-58da-46c9-8f1d-d5be5e470df4", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 4 - The generated documentation accurately identifies the purpose and functionality of the API functions and classes. The descriptions of the functions and classes are based on the code provided and accurately represent their functionality.\n", + "\n", + "Relevance: 3.5 - The generated documentation is relevant as it provides accurate descriptions of each API function and class, including their purpose, parameters, and return values. However, some of the error handling information seems to be missing or incomplete.\n", + "\n", + "Clarity: 3.5 - The generated documentation is clear in most parts, providing concise descriptions of the API functions and classes. However, there are a few areas where the explanations could be clearer, especially in the error handling section.\n", + "\n", + "Completeness: 3 - The generated documentation provides descriptions of each API function and class, including their purpose and parameters. However, some parts of the documentation, especially in the error handling section, are incomplete or missing important details.\n", + "\n", + "Readability: 4 - The generated documentation is readable and well-organized. The descriptions are clear and concise, making it easy for users to understand the purpose and functionality of the API functions and classes.\n", + "\n", + "Overall Score: 3.6\n" + ] + } + ], + "source": [ + "gpt_score = eval_using_model(generated_text, openai_key=openai_key, initial_prompt=prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "id": "650daba3-8a73-4ea9-963b-919c57a2afa4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "gpt_accuracy_score, gpt_relevance_score, gpt_clarity_score, gpt_completeness_score, gpt_readability_score = extract_scores(gpt_score)" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "id": "25f1fc90-86c5-4ca9-b147-54b717b7fd9e", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'reasoning': 'The criteria for this task is \"helpfulness\". The submission is supposed to be helpful, insightful, and appropriate. 
\\n\\nLooking at the submission, it seems to be a detailed documentation of the provided Python code. It includes an introduction, function documentation, error handling, and diagnostics. It also provides links to relevant documentation for further reading. \\n\\nHowever, the submission seems to have misunderstood the task. The task was to generate API documentation for the provided Python code, but the submission seems to be a documentation of a hypothetical API that generates documentation for Python code. This is a significant misunderstanding of the task.\\n\\nTherefore, the submission is not helpful or appropriate for the task at hand. \\n\\nN', 'value': 'N', 'score': 0}\n", + "{'reasoning': 'The submission is supposed to provide API documentation for the provided Python code. The code provided includes several classes and methods, including the _OpenIDConfiguration class, the ExpiredIdentity exception, the IdentityToken class, the IssuerError exception, the Issuer class, and the IdentityError class.\\n\\nThe submission, however, does not accurately reflect the provided code. It instead provides documentation for non-existent functions such as generate_function_docs, generate_class_docs, and generate_script_docs. The submission also includes error handling and diagnostics information that is not present in the provided code.\\n\\nThe reference API documentation provides a more accurate representation of the provided code, including the correct classes, methods, and exceptions. The submission does not match this reference documentation.\\n\\nBased on this analysis, the submission is not correct, accurate, or factual. It does not accurately document the provided Python code, and it includes information that is not present in the code. Therefore, the submission does not meet the criteria. \\n\\nN', 'value': 'N', 'score': 0}\n", + "{'reasoning': 'The criteria for this task is to assess whether the output is complete and captures all required fields. \\n\\nLooking at the submission, it seems to be a mix of API documentation and Python code. The task was to generate API documentation for the provided Python code, but the submission includes Python code for generating documentation, which is not part of the task.\\n\\nThe submission does include some elements of API documentation, such as an introduction, function descriptions, parameters, return values, and error handling. However, these elements are not consistently applied to all the classes and functions in the provided Python code. \\n\\nFor example, the classes _OpenIDConfiguration, ExpiredIdentity, IdentityToken, IssuerError, Issuer, and IdentityError are not documented at all. The functions within the IdentityToken and Issuer classes are also not documented. \\n\\nFurthermore, the submission includes documentation for functions that are not part of the provided Python code, such as generate_function_docs, generate_class_docs, and generate_script_docs. \\n\\nTherefore, the submission does not meet the criteria of being complete and capturing all required fields. \\n\\nN', 'value': 'N', 'score': 0}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_600/2635151755.py:38: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. 
Use pandas.concat instead.\n", + " df = df.append(new_row, ignore_index=True)\n" + ] + } + ], + "source": [ + "df = append_row_to_dataframe(df, prompt, generated_text, gpt_accuracy_score, gpt_relevance_score, gpt_clarity_score, gpt_completeness_score, gpt_readability_score)" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "id": "ce332c83-86e4-49d3-a842-2333bbf7edb4", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
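The `FutureWarning` above comes from the `df.append(new_row, ignore_index=True)` call inside `append_row_to_dataframe`: `DataFrame.append` is deprecated and was removed in pandas 2.x. Below is a minimal sketch of the same helper rebuilt on `pd.concat`. The column names match the dataframe shown below, but the original helper's body is not visible in this section, so treat this as an assumption about its shape rather than a drop-in copy.

```python
import pandas as pd

def append_row_to_dataframe_concat(df, prompt, response, gpt_accuracy_score,
                                   gpt_relevance_score, gpt_clarity_score,
                                   gpt_completeness_score, gpt_readability_score):
    """Same idea as append_row_to_dataframe, rebuilt on pd.concat
    (DataFrame.append is deprecated and removed in pandas 2.x)."""
    # Only the arguments visible in the call above are covered here; the original
    # helper may also record the langchain_* scores.
    new_row = {
        "prompt": prompt,
        "response": response,
        "gpt_accuracy_score": gpt_accuracy_score,
        "gpt_relevance_score": gpt_relevance_score,
        "gpt_clarity_score": gpt_clarity_score,
        "gpt_completeness_score": gpt_completeness_score,
        "gpt_readability_score": gpt_readability_score,
    }
    # Wrap the single row in a one-row DataFrame and concatenate it onto the end.
    return pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
```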
" + ], + "text/plain": [ + " prompt \\\n", + "0 \\nYou are an AI system specialized at generati... \n", + "1 \\nYou are an AI system specialized at generati... \n", + "\n", + " response gpt_accuracy_score \\\n", + "0 \\nIntroduction:\\n\\nThis API provides functiona... 4.0 \n", + "1 1. Introduction: This API is used to generate ... 4.0 \n", + "\n", + " human_accuracy_score gpt_relevance_score human_relevance_score \\\n", + "0 2.0 5.0 3.0 \n", + "1 NaN 3.0 NaN \n", + "\n", + " gpt_clarity_score human_clarity_score gpt_completeness_score \\\n", + "0 4.0 54.0 4.0 \n", + "1 3.0 NaN 3.0 \n", + "\n", + " human_completeness_score gpt_readability_score human_readability_score \\\n", + "0 4.0 5.0 5.0 \n", + "1 NaN 4.0 NaN \n", + "\n", + " langchain_helpfulness human_helpfulness langchain_correctness \\\n", + "0 0.0 0 0.0 \n", + "1 0.0 NaN 0.0 \n", + "\n", + " human_correctness langchain_logical human_logical \n", + "0 0 0.0 0 \n", + "1 NaN 0.0 NaN " + ] + }, + "execution_count": 146, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "id": "6668bbac-618f-4932-b628-839bd74bd905", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Append Human Scores\n", + "\n", + "df.at[1, 'human_accuracy_score'] = '1.0'\n", + "df.at[1, 'human_relevance_score'] = '1.0'\n", + "df.at[1, 'human_clarity_score'] = '1.0'\n", + "df.at[1, 'human_completeness_score'] = '1.0'\n", + "df.at[1, 'human_readability_score'] = '1.0'\n", + "df.at[1, 'human_helpfulness'] = '0.0'\n", + "df.at[1, 'human_correctness'] = '0.0'\n", + "df.at[1, 'human_logical'] = '0.0'" + ] + }, + { + "cell_type": "code", + "execution_count": 151, + "id": "8208bd8a-4387-4c9f-b9ba-cebe495648f1", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
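The human scores above are entered one `df.at` call at a time, with values written as strings such as `'1.0'`, which can silently flip the score columns to `object` dtype. A small illustrative alternative (the helper name is ours, not the notebook's) that records the same row-1 scores from a dict and keeps them numeric:

```python
def set_human_scores(df, row_idx, scores):
    """Write a dict of human evaluation scores into one row of the results dataframe.

    `scores` maps column name -> float; using floats rather than strings keeps the
    score columns numeric for any later comparison against the gpt_* columns.
    """
    for column, value in scores.items():
        df.at[row_idx, column] = float(value)
    return df

# Mirrors the manual entries above for row 1:
df = set_human_scores(df, 1, {
    "human_accuracy_score": 1.0,
    "human_relevance_score": 1.0,
    "human_clarity_score": 1.0,
    "human_completeness_score": 1.0,
    "human_readability_score": 1.0,
    "human_helpfulness": 0.0,
    "human_correctness": 0.0,
    "human_logical": 0.0,
})
```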
" + ], + "text/plain": [ + " prompt \\\n", + "0 \\nYou are an AI system specialized at generati... \n", + "1 \\nYou are an AI system specialized at generati... \n", + "\n", + " response gpt_accuracy_score \\\n", + "0 \\nIntroduction:\\n\\nThis API provides functiona... 4.0 \n", + "1 1. Introduction: This API is used to generate ... 4.0 \n", + "\n", + " human_accuracy_score gpt_relevance_score human_relevance_score \\\n", + "0 2.0 5.0 3.0 \n", + "1 1.0 3.0 1.0 \n", + "\n", + " gpt_clarity_score human_clarity_score gpt_completeness_score \\\n", + "0 4.0 4.0 4.0 \n", + "1 3.0 1.0 3.0 \n", + "\n", + " human_completeness_score gpt_readability_score human_readability_score \\\n", + "0 4.0 5.0 5.0 \n", + "1 1.0 4.0 1.0 \n", + "\n", + " langchain_helpfulness human_helpfulness langchain_correctness \\\n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "\n", + " human_correctness langchain_logical human_logical \n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 " + ] + }, + "execution_count": 151, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "id": "9efe3efb-0cc4-447e-89bf-41aa3e0911cd", + "metadata": {}, + "source": [ + "Note: This is a great example where the generated output completely misunderstood the task and hallucinated content, langchain evaluation was able to catch the error well. Although the gpt evaluation scores were lower, they should have been scored way lesser." + ] + }, + { + "cell_type": "markdown", + "id": "f41b9a7b-6e12-43d5-b0cf-85242dc6f371", + "metadata": {}, + "source": [ + "### Example 3 - Do not Re-run" + ] + }, + { + "cell_type": "code", + "execution_count": 153, + "id": "47a7d83e-d889-4786-b7cb-7b5b9995934c", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "generated_text='1. Introduction: This class is used to represent an inclusion proof for a Merkle tree. It is used in the Verifiable Credentials (VC) API to verify the inclusion of a specific credential in a Merkle tree.\\n\\n2. Functions:\\n\\n - Description: This function is used to create an instance of the LogInclusionProof class. It takes in a dictionary of parameters and sets them as attributes of the class.\\n\\n - Parameters:\\n - checkpoint (str): The checkpoint of the Merkle tree.\\n - hashes (list): A list of hashes in the inclusion proof.\\n - log_index (int): The index of the log in the Merkle tree.\\n - root_hash (str): The root hash of the Merkle tree.\\n - tree_size (int): The size of the Merkle tree.\\n\\n - Return Values:\\n - LogInclusionProof: An instance of the LogInclusionProof class.\\n\\n - Error Handling:\\n - ValueError: If the log index or tree size is negative or if the log index is greater than or equal to the tree size, a ValueError is raised.\\n\\n3. 
Error Handling:\\n - ValueError: If the log index or tree size is negative or if the log index is greater than or equal to the tree size, a ValueError is raised.\\n\\nFunction code:\\n\\ndef generate_proof(\\n log_index: int,\\n tree_size: int,\\n root_hash: str,\\n hashes: List[str],\\n checkpoint: str,\\n) -> LogInclusionProof:\\n \"\"\"\\n Generate an inclusion proof for a Merkle tree.\\n\\n Args:\\n log_index (int): The index of the log in the Merkle tree.\\n tree_size (int): The size of the Merkle tree.\\n root_hash (str): The root hash of the Merkle tree.\\n hashes (list): A list of hashes in the inclusion proof.\\n checkpoint (str): The checkpoint of the Merkle tree.\\n\\n Returns:\\n LogInclusionProof: An instance of the LogInclusionProof class.\\n\\n Raises:\\n ValueError: If the log index or tree size is negative or if the log index is greater than or equal to the tree size, a ValueError is raised.\\n \"\"\"\\n return LogInclusionProof(\\n log_index=log_index,\\n tree_size=tree_size,\\n root_hash=root_hash,\\n hashes=hashes,\\n checkpoint=checkpoint,\\n )\\n\\nFunction Documentation:\\n\\n1. Introduction: This function is used to generate an inclusion proof for a Merkle tree. It takes in the log index, tree size, root hash, hashes, and checkpoint of the Merkle tree and returns an instance of the LogInclusionProof class.\\n\\n2. Parameters:\\n - log_index (int): The index of the log in the Merkle tree.\\n - tree_size (int): The size of the Merkle tree.\\n - root_hash (str): The root hash of the Merkle tree.\\n - hashes (list): A list of hashes in the inclusion proof.\\n - checkpoint (str): The checkpoint of the Merkle tree.\\n\\n3. Return Values:\\n - LogInclusionProof: An instance of the LogInclusionProof class.\\n\\n4. Error Handling:\\n - ValueError: If the log index or tree size is negative or if the log index is greater than or equal to the tree size, a ValueError is raised.\\n\\nScript code:\\n\\nimport os\\nimport sys\\n\\nfrom pydantic import BaseModel, Field\\nfrom pydantic.dataclasses import dataclass\\nfrom pydantic.fields import ConfigDict, ValidationInfo, field_validator\\nfrom typing import Any, List, StrictInt, StrictStr\\n\\n\\nclass LogInclusionProof(BaseModel):\\n \\n\\n model_config = ConfigDict(populate_by_name=True)\\n\\n checkpoint: StrictStr = Field(..., alias=\"checkpoint\")\\n hashes: List[StrictStr] = Field(..., alias=\"hashes\")\\n log_index: StrictInt = Field(..., alias=\"logIndex\")\\n root_hash: StrictStr = Field(..., alias=\"rootHash\")\\n tree_size: StrictInt = Field(..., alias=\"treeSize\")\\n\\n @field_validator(\"log_index\")\\n def _log_index_positive(cls, v: int) -> int:\\n if v < 0:\\n raise ValueError(f\"Inclusion proof has invalid log index: {v} < 0\")\\n return v\\n\\n @field_validator(\"tree_size\")\\n def _tree_size_positive(cls, v: int) -> int:\\n if v < 0:\\n raise ValueError(f\"Inclusion proof has invalid tree size: {v} < 0\")\\n return v\\n\\n @field_validator(\"tree_size\")\\n def _log' generated_token_count=1024 generated_tokens=None input_text=None input_token_count=505 input_tokens=None moderation=None seed=133181325.0 stop_reason='max_tokens' stop_sequence=None\n" + ] + } + ], + "source": [ + "prompt, generated_text, actual_doc = get_response('ibm/granite-20b-code-instruct-v1', 'transparency', functions=False, classes=False, documentation=False, imports=False, other=False, functions_code=False, functions_doc=False, classes_code=True, classes_doc=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 154, + "id": 
"ef9f94eb-f9c5-47b2-851c-5eb009fb9397", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Prompt \n", + " \n", + "You are an AI system specialized at generating API documentation for the provided Python code. You will be provided functions, classes, or Python scripts. Your documentation should include:\n", + "\n", + "1. Introduction: Briefly describe the purpose of the API and its intended use.\n", + "2. Functions: Document each API function, including:\n", + " - Description: Clearly explain what the endpoint or function does.\n", + " - Parameters: List and describe each parameter, including data types and any constraints.\n", + " - Return Values: Specify the data type and possible values returned.\n", + "\n", + "3. Error Handling: Describe possible error responses and their meanings.\n", + "\n", + "Make sure to follow this output structure to create API documentation that is clear, concise, accurate, and user-centric. Avoid speculative information and prioritize accuracy and completeness.\n", + "\n", + "\n", + " \n", + "Class code:\n", + "\n", + "class LogInclusionProof(BaseModel):\n", + " \n", + "\n", + " model_config = ConfigDict(populate_by_name=True)\n", + "\n", + " checkpoint: StrictStr = Field(..., alias=\"checkpoint\")\n", + " hashes: List[StrictStr] = Field(..., alias=\"hashes\")\n", + " log_index: StrictInt = Field(..., alias=\"logIndex\")\n", + " root_hash: StrictStr = Field(..., alias=\"rootHash\")\n", + " tree_size: StrictInt = Field(..., alias=\"treeSize\")\n", + "\n", + " @field_validator(\"log_index\")\n", + " def _log_index_positive(cls, v: int) -> int:\n", + " if v < 0:\n", + " raise ValueError(f\"Inclusion proof has invalid log index: {v} < 0\")\n", + " return v\n", + "\n", + " @field_validator(\"tree_size\")\n", + " def _tree_size_positive(cls, v: int) -> int:\n", + " if v < 0:\n", + " raise ValueError(f\"Inclusion proof has invalid tree size: {v} < 0\")\n", + " return v\n", + "\n", + " @field_validator(\"tree_size\")\n", + " def _log_index_within_tree_size(\n", + " cls, v: int, info: ValidationInfo, **kwargs: Any\n", + " ) -> int:\n", + " if \"log_index\" in info.data and v <= info.data[\"log_index\"]:\n", + " raise ValueError(\n", + " \"Inclusion proof has log index greater than or equal to tree size: \"\n", + " f\"{v} <= {info.data['log_index']}\"\n", + " )\n", + " return v\n", + "\n", + "Class Documentation:\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(\"\\n Prompt \\n\", prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": 161, + "id": "caa58060-66a0-4912-b974-d5ae39cc59fa", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Generated Text \n", + " 1. Introduction: This class is used to represent an inclusion proof for a Merkle tree. It is used in the Verifiable Credentials (VC) API to verify the inclusion of a specific credential in a Merkle tree.\n", + "\n", + "2. Functions:\n", + "\n", + " - Description: This function is used to create an instance of the LogInclusionProof class. 
It takes in a dictionary of parameters and sets them as attributes of the class.\n", + "\n", + " - Parameters:\n", + " - checkpoint (str): The checkpoint of the Merkle tree.\n", + " - hashes (list): A list of hashes in the inclusion proof.\n", + " - log_index (int): The index of the log in the Merkle tree.\n", + " - root_hash (str): The root hash of the Merkle tree.\n", + " - tree_size (int): The size of the Merkle tree.\n", + "\n", + " - Return Values:\n", + " - LogInclusionProof: An instance of the LogInclusionProof class.\n", + "\n", + " - Error Handling:\n", + " - ValueError: If the log index or tree size is negative or if the log index is greater than or equal to the tree size, a ValueError is raised.\n", + "\n", + "3. Error Handling:\n", + " - ValueError: If the log index or tree size is negative or if the log index is greater than or equal to the tree size, a ValueError is raised.\n", + "\n", + "Function code:\n", + "\n", + "def generate_proof(\n", + " log_index: int,\n", + " tree_size: int,\n", + " root_hash: str,\n", + " hashes: List[str],\n", + " checkpoint: str,\n", + ") -> LogInclusionProof:\n", + " \"\"\"\n", + " Generate an inclusion proof for a Merkle tree.\n", + "\n", + " Args:\n", + " log_index (int): The index of the log in the Merkle tree.\n", + " tree_size (int): The size of the Merkle tree.\n", + " root_hash (str): The root hash of the Merkle tree.\n", + " hashes (list): A list of hashes in the inclusion proof.\n", + " checkpoint (str): The checkpoint of the Merkle tree.\n", + "\n", + " Returns:\n", + " LogInclusionProof: An instance of the LogInclusionProof class.\n", + "\n", + " Raises:\n", + " ValueError: If the log index or tree size is negative or if the log index is greater than or equal to the tree size, a ValueError is raised.\n", + " \"\"\"\n", + " return LogInclusionProof(\n", + " log_index=log_index,\n", + " tree_size=tree_size,\n", + " root_hash=root_hash,\n", + " hashes=hashes,\n", + " checkpoint=checkpoint,\n", + " )\n", + "\n", + "Function Documentation:\n", + "\n", + "1. Introduction: This function is used to generate an inclusion proof for a Merkle tree. It takes in the log index, tree size, root hash, hashes, and checkpoint of the Merkle tree and returns an instance of the LogInclusionProof class.\n", + "\n", + "2. Parameters:\n", + " - log_index (int): The index of the log in the Merkle tree.\n", + " - tree_size (int): The size of the Merkle tree.\n", + " - root_hash (str): The root hash of the Merkle tree.\n", + " - hashes (list): A list of hashes in the inclusion proof.\n", + " - checkpoint (str): The checkpoint of the Merkle tree.\n", + "\n", + "3. Return Values:\n", + " - LogInclusionProof: An instance of the LogInclusionProof class.\n", + "\n", + "4. 
Error Handling:\n", + " - ValueError: If the log index or tree size is negative or if the log index is greater than or equal to the tree size, a ValueError is raised.\n", + "\n", + "Script code:\n", + "\n", + "import os\n", + "import sys\n", + "\n", + "from pydantic import BaseModel, Field\n", + "from pydantic.dataclasses import dataclass\n", + "from pydantic.fields import ConfigDict, ValidationInfo, field_validator\n", + "from typing import Any, List, StrictInt, StrictStr\n", + "\n", + "\n", + "class LogInclusionProof(BaseModel):\n", + " \n", + "\n", + " model_config = ConfigDict(populate_by_name=True)\n", + "\n", + " checkpoint: StrictStr = Field(..., alias=\"checkpoint\")\n", + " hashes: List[StrictStr] = Field(..., alias=\"hashes\")\n", + " log_index: StrictInt = Field(..., alias=\"logIndex\")\n", + " root_hash: StrictStr = Field(..., alias=\"rootHash\")\n", + " tree_size: StrictInt = Field(..., alias=\"treeSize\")\n", + "\n", + " @field_validator(\"log_index\")\n", + " def _log_index_positive(cls, v: int) -> int:\n", + " if v < 0:\n", + " raise ValueError(f\"Inclusion proof has invalid log index: {v} < 0\")\n", + " return v\n", + "\n", + " @field_validator(\"tree_size\")\n", + " def _tree_size_positive(cls, v: int) -> int:\n", + " if v < 0:\n", + " raise ValueError(f\"Inclusion proof has invalid tree size: {v} < 0\")\n", + " return v\n", + "\n", + " @field_validator(\"tree_size\")\n", + " def _log\n" + ] + } + ], + "source": [ + "print(\"\\n Generated Text \\n\", generated_text)" + ] + }, + { + "cell_type": "code", + "execution_count": 162, + "id": "0abde7aa-8368-4961-9efb-c7bb4439f745", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 5 - The generated documentation accurately describes the purpose of the API class and function, as well as the parameters, return values, and error handling.\n", + "\n", + "Relevance: 5 - The generated documentation is relevant as it provides accurate and specific information about the class and function, including their purpose, parameters, return values, and error handling.\n", + "\n", + "Clarity: 5 - The generated documentation is clear and easy to understand. It provides clear descriptions of the class and function, as well as their parameters, return values, and error handling.\n", + "\n", + "Completeness: 5 - The generated documentation is complete as it includes all the necessary information about the class and function, including their purpose, parameters, return values, and error handling.\n", + "\n", + "Readability: 5 - The generated documentation is highly readable. It uses clear and concise language to describe the class and function, as well as their parameters, return values, and error handling. 
The formatting and organization of the documentation is also clean and easy to follow.\n", + "\n", + "Overall Score: 5\n" + ] + } + ], + "source": [ + "gpt_score = eval_using_model(generated_text, openai_key=openai_key, initial_prompt=prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": 163, + "id": "78c55090-65f9-4a53-82f2-83220cfe9b27", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "gpt_accuracy_score, gpt_relevance_score, gpt_clarity_score, gpt_completeness_score, gpt_readability_score = extract_scores(gpt_score)" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "id": "818c116b-711b-47e1-aa1f-3542b736fc5f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'reasoning': 'The criterion for this task is \"helpfulness\". The submission should be helpful, insightful, and appropriate.\\n\\nLooking at the submission, it provides a detailed explanation of the class and function in the provided Python code. It describes the purpose of the class and function, the parameters they take, the return values, and the errors they might raise. This information is helpful for understanding how to use the class and function.\\n\\nThe submission also follows the structure provided in the input, which makes it easy to follow and understand. It avoids speculative information and prioritizes accuracy and completeness, as required by the task.\\n\\nTherefore, the submission meets the criterion of being helpful, insightful, and appropriate.\\n\\nY', 'value': 'Y', 'score': 1}\n", + "{'reasoning': 'The submission is being evaluated for correctness, accuracy, and factualness. \\n\\n1. Correctness: The submission correctly describes the purpose of the class and function, their parameters, return values, and error handling. The descriptions match the provided Python code and the reference documentation. The submission also correctly follows the requested output structure.\\n\\n2. Accuracy: The submission accurately describes the class and function. The descriptions of the parameters, return values, and error handling are accurate and match the provided Python code and the reference documentation.\\n\\n3. Factualness: The submission is factual and does not include speculative information. The descriptions are based on the provided Python code and the reference documentation.\\n\\nBased on these evaluations, the submission meets all the criteria. \\n\\nY', 'value': 'Y', 'score': 1}\n", + "{'reasoning': 'The criteria is to assess if the output is complete and captures all required fields. \\n\\nLooking at the submission, it provides documentation for the class and function provided in the input. \\n\\nFor the class, it provides an introduction, describes the function, lists and describes the parameters, specifies the return values, and describes possible error responses. \\n\\nFor the function, it provides an introduction, lists and describes the parameters, specifies the return values, and describes possible error responses. \\n\\nThe submission also provides documentation for the script code, but the script code is cut off and not complete. \\n\\nHowever, the criteria only asks if the output is complete and captures all required fields. The output does capture all required fields for the class and function. The script code is not a required field, so its incompleteness does not affect the criteria. \\n\\nTherefore, the submission meets the criteria. 
\\n\\nY', 'value': 'Y', 'score': 1}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_600/2635151755.py:38: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n", + " df = df.append(new_row, ignore_index=True)\n" + ] + } + ], + "source": [ + "df = append_row_to_dataframe(df, prompt, generated_text, gpt_accuracy_score, gpt_relevance_score, gpt_clarity_score, gpt_completeness_score, gpt_readability_score)" + ] + }, + { + "cell_type": "code", + "execution_count": 169, + "id": "488357b4-963a-4d01-bebd-fd0dc90d75ad", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
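`extract_scores` converts the free-text rating returned by `eval_using_model` into the five numeric `gpt_*` scores stored in the dataframe. Its definition is not shown in this section, so the parser below is only a sketch of what it might look like, assuming the `Criterion: N - explanation` layout that appears in the GPT outputs above.

```python
import re
from typing import Optional, Tuple

def extract_scores_sketch(score_text: str) -> Tuple[Optional[float], ...]:
    """Pull the five numeric ratings out of a GPT evaluation formatted as
    'Accuracy: 5 - ...' / 'Relevance: 5 - ...' lines, as shown in the outputs above."""
    criteria = ["Accuracy", "Relevance", "Clarity", "Completeness", "Readability"]
    scores = []
    for criterion in criteria:
        match = re.search(rf"{criterion}:\s*(\d+(?:\.\d+)?)", score_text)
        scores.append(float(match.group(1)) if match else None)
    return tuple(scores)

# e.g. extract_scores_sketch(gpt_score) -> (5.0, 5.0, 5.0, 5.0, 5.0)
# for the rating printed a few cells above
```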
" + ], + "text/plain": [ + " prompt \\\n", + "0 \\nYou are an AI system specialized at generati... \n", + "1 \\nYou are an AI system specialized at generati... \n", + "2 \\nYou are an AI system specialized at generati... \n", + "\n", + " response gpt_accuracy_score \\\n", + "0 \\nIntroduction:\\n\\nThis API provides functiona... 4.0 \n", + "1 1. Introduction: This API is used to generate ... 4.0 \n", + "2 1. Introduction: This class is used to represe... 5.0 \n", + "\n", + " human_accuracy_score gpt_relevance_score human_relevance_score \\\n", + "0 2.0 5.0 3.0 \n", + "1 1.0 3.0 1.0 \n", + "2 NaN 5.0 NaN \n", + "\n", + " gpt_clarity_score human_clarity_score gpt_completeness_score \\\n", + "0 4.0 4.0 4.0 \n", + "1 3.0 1.0 3.0 \n", + "2 5.0 NaN 5.0 \n", + "\n", + " human_completeness_score gpt_readability_score human_readability_score \\\n", + "0 4.0 5.0 5.0 \n", + "1 1.0 4.0 1.0 \n", + "2 NaN 5.0 NaN \n", + "\n", + " langchain_helpfulness human_helpfulness langchain_correctness \\\n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 1.0 NaN 1.0 \n", + "\n", + " human_correctness langchain_logical human_logical \n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 NaN 1.0 NaN " + ] + }, + "execution_count": 169, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 176, + "id": "d8981d30-c402-4c7c-8f08-4f186cca7158", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Append Human Scores\n", + "\n", + "df.at[2, 'human_accuracy_score'] = '2.0'\n", + "df.at[2, 'human_relevance_score'] = '3.0'\n", + "df.at[2, 'human_clarity_score'] = '3.0'\n", + "df.at[2, 'human_completeness_score'] = '2.0'\n", + "df.at[2, 'human_readability_score'] = '3.0'\n", + "df.at[2, 'human_helpfulness'] = '1.0'\n", + "df.at[2, 'human_correctness'] = '0.0'\n", + "df.at[2, 'human_logical'] = '1.0'" + ] + }, + { + "cell_type": "code", + "execution_count": 177, + "id": "b8fd95e0-bf0b-42ba-b297-058a58d99569", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
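Each example in this notebook repeats the same manual sequence: generate the documentation with `get_response`, rate it with `eval_using_model`, pull the numbers out with `extract_scores`, and append a row with `append_row_to_dataframe`. The sketch below reuses exactly the signatures called in the surrounding cells; the loop itself and the list of module names are illustrative, not part of the notebook.

```python
# Illustrative only: module names and flags mirror the calls made in the examples.
modules_to_document = ["transparency", "sign"]

for module_name in modules_to_document:
    prompt, generated_text, actual_doc = get_response(
        "ibm/granite-20b-code-instruct-v1",
        module_name,
        functions=False, classes=False, documentation=False, imports=False,
        other=False, functions_code=False, functions_doc=False,
        classes_code=True, classes_doc=False,
    )
    gpt_score = eval_using_model(generated_text, openai_key=openai_key, initial_prompt=prompt)
    (gpt_accuracy_score, gpt_relevance_score, gpt_clarity_score,
     gpt_completeness_score, gpt_readability_score) = extract_scores(gpt_score)
    df = append_row_to_dataframe(
        df, prompt, generated_text, gpt_accuracy_score, gpt_relevance_score,
        gpt_clarity_score, gpt_completeness_score, gpt_readability_score,
    )
```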
" + ], + "text/plain": [ + " prompt \\\n", + "0 \\nYou are an AI system specialized at generati... \n", + "1 \\nYou are an AI system specialized at generati... \n", + "2 \\nYou are an AI system specialized at generati... \n", + "\n", + " response gpt_accuracy_score \\\n", + "0 \\nIntroduction:\\n\\nThis API provides functiona... 4.0 \n", + "1 1. Introduction: This API is used to generate ... 4.0 \n", + "2 1. Introduction: This class is used to represe... 5.0 \n", + "\n", + " human_accuracy_score gpt_relevance_score human_relevance_score \\\n", + "0 2.0 5.0 3.0 \n", + "1 1.0 3.0 1.0 \n", + "2 2.0 5.0 3.0 \n", + "\n", + " gpt_clarity_score human_clarity_score gpt_completeness_score \\\n", + "0 4.0 4.0 4.0 \n", + "1 3.0 1.0 3.0 \n", + "2 5.0 3.0 5.0 \n", + "\n", + " human_completeness_score gpt_readability_score human_readability_score \\\n", + "0 4.0 5.0 5.0 \n", + "1 1.0 4.0 1.0 \n", + "2 2.0 5.0 3.0 \n", + "\n", + " langchain_helpfulness human_helpfulness langchain_correctness \\\n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 1.0 1.0 1.0 \n", + "\n", + " human_correctness langchain_logical human_logical \n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 0.0 1.0 1.0 " + ] + }, + "execution_count": 177, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "id": "bf2bf401-609b-42da-af02-aa6aade21dc5", + "metadata": {}, + "source": [ + "Note: This is an interesting case where the generated output correctly captures the conditions being checked for in the given class but it also halluicinates function code. The answer is still however correct in places for example it correctly captured that\n", + "\n", + "```\n", + "_log_index_positive ensures that the log_index value is non-negative.\n", + "_tree_size_positive ensures that the tree_size value is non-negative.\n", + "_log_index_within_tree_size ensures that the log_index is within the range of the tree_size.\n", + "```\n", + "\n", + "but in some places it is made up and inaccurate and langchain evaluation was not able to assess that correctly" + ] + }, + { + "cell_type": "markdown", + "id": "5430700f-9668-4e3c-a09c-099bc94f5de2", + "metadata": {}, + "source": [ + "### Example 4 - Do not Re-run" + ] + }, + { + "cell_type": "code", + "execution_count": 178, + "id": "21c4fef4-cf22-435c-9aba-54fe7ab1fcc3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "generated_text='1. Introduction: This API is used to sign and verify artifacts using Sigstore. It allows users to sign artifacts using their private key and verify the signature using the public key.\\n2. Functions:\\n - sign: This function is used to sign an artifact using the private key. It takes an input stream as an argument and returns a SigningResult object.\\n - verify: This function is used to verify the signature of an artifact using the public key. It takes a SigningResult object as an argument and returns a boolean value indicating whether the signature is valid or not.\\n3. 
Error Handling:\\n - ExpiredIdentity: This error is raised when the provided identity token is expired.\\n - ExpiredCertificate: This error is raised when the provided certificate is expired.\\n - InvalidCertificate: This error is raised when the provided certificate is invalid.\\n - InvalidSignature: This error is raised when the provided signature is invalid.\\n - InvalidCertificateChain: This error is raised when the provided certificate chain is invalid.\\n - InvalidTransparencyLogEntry: This error is raised when the provided transparency log entry is invalid.\\n - InvalidBundle: This error is raised when the provided bundle is invalid.\\n\\nMake sure to follow this output structure to create API documentation that is clear, concise, accurate, and user-centric. Avoid speculative information and prioritize accuracy and completeness.' generated_token_count=284 generated_tokens=None input_text=None input_token_count=2219 input_tokens=None moderation=None seed=411052976.0 stop_reason='eos_token' stop_sequence=None\n" + ] + } + ], + "source": [ + "prompt, generated_text, actual_doc = get_response('ibm/granite-20b-code-instruct-v1', 'sign', functions=False, classes=False, documentation=False, imports=False, other=False, functions_code=False, functions_doc=False, classes_code=True, classes_doc=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 179, + "id": "7ca742b4-d95b-4c01-95df-5f92f6daa3a2", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Prompt \n", + " \n", + "You are an AI system specialized at generating API documentation for the provided Python code. You will be provided functions, classes, or Python scripts. Your documentation should include:\n", + "\n", + "1. Introduction: Briefly describe the purpose of the API and its intended use.\n", + "2. Functions: Document each API function, including:\n", + " - Description: Clearly explain what the endpoint or function does.\n", + " - Parameters: List and describe each parameter, including data types and any constraints.\n", + " - Return Values: Specify the data type and possible values returned.\n", + "\n", + "3. Error Handling: Describe possible error responses and their meanings.\n", + "\n", + "Make sure to follow this output structure to create API documentation that is clear, concise, accurate, and user-centric. 
Avoid speculative information and prioritize accuracy and completeness.\n", + "\n", + "\n", + " \n", + "Class code:\n", + "\n", + "class Signer:\n", + " \n", + "\n", + " def __init__(\n", + " self,\n", + " identity_token: IdentityToken,\n", + " signing_ctx: SigningContext,\n", + " cache: bool = True,\n", + " ) -> None:\n", + " \n", + " self._identity_token = identity_token\n", + " self._signing_ctx: SigningContext = signing_ctx\n", + " self.__cached_private_key: Optional[ec.EllipticCurvePrivateKey] = None\n", + " self.__cached_signing_certificate: Optional[\n", + " FulcioCertificateSigningResponse\n", + " ] = None\n", + " if cache:\n", + " logger.debug(\"Generating ephemeral keys...\")\n", + " self.__cached_private_key = ec.generate_private_key(ec.SECP256R1())\n", + " logger.debug(\"Requesting ephemeral certificate...\")\n", + " self.__cached_signing_certificate = self._signing_cert(self._private_key)\n", + "\n", + " @property\n", + " def _private_key(self) -> ec.EllipticCurvePrivateKey:\n", + " \n", + " if self.__cached_private_key is None:\n", + " logger.debug(\"no cached key; generating ephemeral key\")\n", + " return ec.generate_private_key(ec.SECP256R1())\n", + " return self.__cached_private_key\n", + "\n", + " def _signing_cert(\n", + " self,\n", + " private_key: ec.EllipticCurvePrivateKey,\n", + " ) -> FulcioCertificateSigningResponse:\n", + " \n", + " # If it exists, verify if the current certificate is expired\n", + " if self.__cached_signing_certificate:\n", + " not_valid_after = self.__cached_signing_certificate.cert.not_valid_after\n", + " not_valid_after_tzutc = not_valid_after.replace(tzinfo=timezone.utc)\n", + " if datetime.now(timezone.utc) > not_valid_after_tzutc:\n", + " raise ExpiredCertificate\n", + " return self.__cached_signing_certificate\n", + "\n", + " else:\n", + " logger.debug(\"Retrieving signed certificate...\")\n", + "\n", + " # Build an X.509 Certificiate Signing Request\n", + " builder = (\n", + " x509.CertificateSigningRequestBuilder()\n", + " .subject_name(\n", + " x509.Name(\n", + " [\n", + " x509.NameAttribute(\n", + " NameOID.EMAIL_ADDRESS, self._identity_token._identity\n", + " ),\n", + " ]\n", + " )\n", + " )\n", + " .add_extension(\n", + " x509.BasicConstraints(ca=False, path_length=None),\n", + " critical=True,\n", + " )\n", + " )\n", + " certificate_request = builder.sign(private_key, hashes.SHA256())\n", + "\n", + " certificate_response = self._signing_ctx._fulcio.signing_cert.post(\n", + " certificate_request, self._identity_token\n", + " )\n", + "\n", + " return certificate_response\n", + "\n", + " def sign(\n", + " self,\n", + " input_: IO[bytes],\n", + " ) -> SigningResult:\n", + " \n", + " input_digest = sha256_streaming(input_)\n", + " private_key = self._private_key\n", + "\n", + " if not self._identity_token.in_validity_period():\n", + " raise ExpiredIdentity\n", + "\n", + " try:\n", + " certificate_response = self._signing_cert(private_key)\n", + " except ExpiredCertificate as e:\n", + " raise e\n", + "\n", + " # TODO(alex): Retrieve the public key via TUF\n", + " #\n", + " # Verify the SCT\n", + " sct = certificate_response.sct # noqa\n", + " cert = certificate_response.cert # noqa\n", + " chain = certificate_response.chain\n", + "\n", + " verify_sct(sct, cert, chain, self._signing_ctx._rekor._ct_keyring)\n", + "\n", + " logger.debug(\"Successfully verified SCT...\")\n", + "\n", + " # Sign artifact\n", + " artifact_signature = private_key.sign(\n", + " input_digest, ec.ECDSA(Prehashed(hashes.SHA256()))\n", + " )\n", + " 
b64_artifact_signature = B64Str(base64.b64encode(artifact_signature).decode())\n", + "\n", + " # Prepare inputs\n", + " b64_cert = base64.b64encode(\n", + " cert.public_bytes(encoding=serialization.Encoding.PEM)\n", + " )\n", + "\n", + " # Create the transparency log entry\n", + " proposed_entry = sigstore_rekor_types.Hashedrekord(\n", + " kind=\"hashedrekord\",\n", + " api_version=\"0.0.1\",\n", + " spec=sigstore_rekor_types.HashedrekordV001Schema(\n", + " signature=sigstore_rekor_types.Signature1(\n", + " content=b64_artifact_signature,\n", + " public_key=sigstore_rekor_types.PublicKey1(\n", + " content=b64_cert.decode()\n", + " ),\n", + " ),\n", + " data=sigstore_rekor_types.Data(\n", + " hash=sigstore_rekor_types.Hash(\n", + " algorithm=sigstore_rekor_types.Algorithm.SHA256,\n", + " value=input_digest.hex(),\n", + " )\n", + " ),\n", + " ),\n", + " )\n", + " entry = self._signing_ctx._rekor.log.entries.post(proposed_entry)\n", + "\n", + " logger.debug(f\"Transparency log entry created with index: {entry.log_index}\")\n", + "\n", + " return SigningResult(\n", + " input_digest=HexStr(input_digest.hex()),\n", + " cert_pem=PEMCert(\n", + " cert.public_bytes(encoding=serialization.Encoding.PEM).decode()\n", + " ),\n", + " b64_signature=B64Str(b64_artifact_signature),\n", + " log_entry=entry,\n", + " )\n", + "class SigningContext:\n", + " \n", + "\n", + " def __init__(\n", + " self,\n", + " *,\n", + " fulcio: FulcioClient,\n", + " rekor: RekorClient,\n", + " ):\n", + " \n", + " self._fulcio = fulcio\n", + " self._rekor = rekor\n", + "\n", + " @classmethod\n", + " def production(cls) -> SigningContext:\n", + " \n", + " updater = TrustUpdater.production()\n", + " rekor = RekorClient.production(updater)\n", + " return cls(\n", + " fulcio=FulcioClient.production(),\n", + " rekor=rekor,\n", + " )\n", + "\n", + " @classmethod\n", + " def staging(cls) -> SigningContext:\n", + " \n", + " updater = TrustUpdater.staging()\n", + " rekor = RekorClient.staging(updater)\n", + " return cls(\n", + " fulcio=FulcioClient.staging(),\n", + " rekor=rekor,\n", + " )\n", + "\n", + " @contextmanager\n", + " def signer(\n", + " self, identity_token: IdentityToken, *, cache: bool = True\n", + " ) -> Iterator[Signer]:\n", + " \n", + " yield Signer(identity_token, self, cache)\n", + "class SigningResult(BaseModel):\n", + " \n", + "\n", + " input_digest: HexStr\n", + " \n", + "\n", + " cert_pem: PEMCert\n", + " \n", + "\n", + " b64_signature: B64Str\n", + " \n", + "\n", + " log_entry: LogEntry\n", + " \n", + "\n", + " def to_bundle(self) -> Bundle:\n", + " \n", + "\n", + " # NOTE: We explicitly only include the leaf certificate in the bundle's \"chain\"\n", + " # here: the specs explicitly forbid the inclusion of the root certificate,\n", + " # and discourage inclusion of any intermediates (since they're in the root of\n", + " # trust already).\n", + " cert = x509.load_pem_x509_certificate(self.cert_pem.encode())\n", + " cert_der = cert.public_bytes(encoding=serialization.Encoding.DER)\n", + " chain = X509CertificateChain(certificates=[X509Certificate(raw_bytes=cert_der)])\n", + "\n", + " inclusion_proof: InclusionProof | None = None\n", + " if self.log_entry.inclusion_proof is not None:\n", + " inclusion_proof = InclusionProof(\n", + " log_index=self.log_entry.inclusion_proof.log_index,\n", + " root_hash=bytes.fromhex(self.log_entry.inclusion_proof.root_hash),\n", + " tree_size=self.log_entry.inclusion_proof.tree_size,\n", + " hashes=[\n", + " bytes.fromhex(h) for h in self.log_entry.inclusion_proof.hashes\n", + " 
],\n", + " checkpoint=Checkpoint(\n", + " envelope=self.log_entry.inclusion_proof.checkpoint\n", + " ),\n", + " )\n", + "\n", + " tlog_entry = TransparencyLogEntry(\n", + " log_index=self.log_entry.log_index,\n", + " log_id=LogId(key_id=bytes.fromhex(self.log_entry.log_id)),\n", + " kind_version=KindVersion(kind=\"hashedrekord\", version=\"0.0.1\"),\n", + " integrated_time=self.log_entry.integrated_time,\n", + " inclusion_promise=InclusionPromise(\n", + " signed_entry_timestamp=base64.b64decode(\n", + " self.log_entry.inclusion_promise\n", + " )\n", + " )\n", + " if self.log_entry.inclusion_promise\n", + " else None,\n", + " inclusion_proof=inclusion_proof,\n", + " canonicalized_body=base64.b64decode(self.log_entry.body),\n", + " )\n", + "\n", + " material = VerificationMaterial(\n", + " x509_certificate_chain=chain,\n", + " tlog_entries=[tlog_entry],\n", + " )\n", + "\n", + " bundle = Bundle(\n", + " media_type=\"application/vnd.dev.sigstore.bundle+json;version=0.2\",\n", + " verification_material=material,\n", + " message_signature=MessageSignature(\n", + " message_digest=HashOutput(\n", + " algorithm=HashAlgorithm.SHA2_256,\n", + " digest=bytes.fromhex(self.input_digest),\n", + " ),\n", + " signature=base64.b64decode(self.b64_signature),\n", + " ),\n", + " )\n", + "\n", + " return bundle\n", + "\n", + "Class Documentation:\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(\"\\n Prompt \\n\", prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": 180, + "id": "4d4f21b8-1d92-46bb-8b6f-27e70ea5d569", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Generated Text \n", + " 1. Introduction: This API is used to sign and verify artifacts using Sigstore. It allows users to sign artifacts using their private key and verify the signature using the public key.\n", + "2. Functions:\n", + " - sign: This function is used to sign an artifact using the private key. It takes an input stream as an argument and returns a SigningResult object.\n", + " - verify: This function is used to verify the signature of an artifact using the public key. It takes a SigningResult object as an argument and returns a boolean value indicating whether the signature is valid or not.\n", + "3. Error Handling:\n", + " - ExpiredIdentity: This error is raised when the provided identity token is expired.\n", + " - ExpiredCertificate: This error is raised when the provided certificate is expired.\n", + " - InvalidCertificate: This error is raised when the provided certificate is invalid.\n", + " - InvalidSignature: This error is raised when the provided signature is invalid.\n", + " - InvalidCertificateChain: This error is raised when the provided certificate chain is invalid.\n", + " - InvalidTransparencyLogEntry: This error is raised when the provided transparency log entry is invalid.\n", + " - InvalidBundle: This error is raised when the provided bundle is invalid.\n", + "\n", + "Make sure to follow this output structure to create API documentation that is clear, concise, accurate, and user-centric. 
Avoid speculative information and prioritize accuracy and completeness.\n" + ] + } + ], + "source": [ + "print(\"\\n Generated Text \\n\", generated_text)" + ] + }, + { + "cell_type": "code", + "execution_count": 181, + "id": "b16cdfb2-94a7-42e4-aaa6-67c728376686", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 4 - The generated documentation accurately describes the purpose of the API and its functions. It accurately describes the parameters and return values of the functions.\n", + "Relevance: 5 - The generated documentation is relevant as it provides information about how to use the API functions and what error handling is implemented.\n", + "Clarity: 3 - The generated documentation provides clear descriptions of the purpose of the API and its functions. However, it could be improved by providing more detailed descriptions for each function.\n", + "Completeness: 4 - The generated documentation includes the introduction, functions, and error handling sections as required. It provides information about the purpose of the API, the functions available, and possible error responses.\n", + "Readability: 5 - The generated documentation is readable and follows a clear structure. It uses clear and concise language to describe the purpose of the API and its functions. The sections are organized logically and are easy to understand.\n", + "Overall Score: 4\n" + ] + } + ], + "source": [ + "gpt_score = eval_using_model(generated_text, openai_key=openai_key, initial_prompt=prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": 182, + "id": "abddcc41-d81a-47de-9942-99fd4acb2c4a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "gpt_accuracy_score, gpt_relevance_score, gpt_clarity_score, gpt_completeness_score, gpt_readability_score = extract_scores(gpt_score)" + ] + }, + { + "cell_type": "code", + "execution_count": 183, + "id": "02147658-0c12-4a1b-8380-bd5e170cd8ef", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'reasoning': 'The criterion for this task is \"helpfulness\". The submission is supposed to be helpful, insightful, and appropriate. \\n\\nLooking at the submission, it provides a brief introduction to the API, which is helpful for users to understand what the API is used for. \\n\\nThe submission also documents the functions of the API, including a description of what each function does, the parameters it takes, and the return values. This is insightful as it provides users with the necessary information to use the API functions. \\n\\nThe submission also describes possible error responses and their meanings, which is appropriate as it helps users understand what could go wrong when using the API and how to handle these errors. \\n\\nHowever, the submission includes a function \"verify\" which is not present in the provided Python code. This is misleading and not accurate. \\n\\nTherefore, the submission is not completely helpful, insightful, and appropriate. \\n\\nN', 'value': 'N', 'score': 0}\n", + "{'reasoning': 'The submission is not entirely correct. The task was to generate API documentation for the provided Python code, which includes three classes: Signer, SigningContext, and SigningResult. The submission, however, only provides documentation for two functions, sign and verify, and does not cover all the methods and properties of the classes. 
Furthermore, the verify function is not present in the provided code, which makes the submission inaccurate. The submission also does not provide a description for each parameter, including data types and any constraints, and does not specify the data type and possible values returned for each function. The error handling section is also incorrect as it includes errors that are not present in the provided code. Therefore, the submission does not meet the criteria of being correct, accurate, and factual. \\n\\nN', 'value': 'N', 'score': 0}\n", + "{'reasoning': 'The criteria is to assess if the output is complete and captures all required fields. \\n\\nLooking at the submission, it provides an introduction to the API and describes its intended use. It also documents the functions, including their descriptions, parameters, and return values. The submission also describes possible error responses and their meanings. \\n\\nHowever, the submission does not accurately document all the functions and classes provided in the input. The input includes three classes: Signer, SigningContext, and SigningResult. Each of these classes has several methods, but the submission only documents two functions: sign and verify. The verify function is not even present in the provided code. \\n\\nThe submission also does not accurately document the parameters and return values of the functions. For example, the sign function in the Signer class takes an input stream and a private key as arguments, but the submission only mentions the input stream. The return value is a SigningResult object, which is correctly mentioned in the submission. \\n\\nThe submission also does not accurately document the error handling. The provided code only raises two exceptions: ExpiredIdentity and ExpiredCertificate. The other errors mentioned in the submission are not present in the provided code.\\n\\nTherefore, the submission does not meet the criteria of being complete and capturing all required fields. \\n\\nN', 'value': 'N', 'score': 0}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_600/2635151755.py:38: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n", + " df = df.append(new_row, ignore_index=True)\n" + ] + } + ], + "source": [ + "df = append_row_to_dataframe(df, prompt, generated_text, gpt_accuracy_score, gpt_relevance_score, gpt_clarity_score, gpt_completeness_score, gpt_readability_score)" + ] + }, + { + "cell_type": "code", + "execution_count": 184, + "id": "f134462b-15b7-4bdc-8d14-f5d4054edac1", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
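The `langchain_*` columns are filled from LangChain criteria evaluators, which return dictionaries of the form `{'reasoning': ..., 'value': 'Y'/'N', 'score': 1/0}`, as printed above. The exact evaluator configuration is not shown in this section, so the snippet below is an assumption about how such checks can be wired up: a reference-free criterion (helpfulness) plus a labeled criterion (correctness) that also sees the actual upstream documentation.

```python
from langchain.evaluation import EvaluatorType, load_evaluator
from langchain_community.chat_models import ChatOpenAI

llm = ChatOpenAI(model="gpt-4", temperature=0)  # the evaluation model here is an assumption

# Reference-free criterion: is the generated documentation helpful for the given prompt?
helpfulness_evaluator = load_evaluator(EvaluatorType.CRITERIA, criteria="helpfulness", llm=llm)
helpfulness_result = helpfulness_evaluator.evaluate_strings(
    prediction=generated_text,  # the LLM-generated documentation
    input=prompt,               # the documentation-generation prompt
)

# Labeled criterion: judge correctness against the actual upstream documentation.
correctness_evaluator = load_evaluator(EvaluatorType.LABELED_CRITERIA, criteria="correctness", llm=llm)
correctness_result = correctness_evaluator.evaluate_strings(
    prediction=generated_text,
    input=prompt,
    reference=actual_doc,
)

print(helpfulness_result["score"], correctness_result["score"])  # 1 = criterion met, 0 = not met
```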
" + ], + "text/plain": [ + " prompt \\\n", + "0 \\nYou are an AI system specialized at generati... \n", + "1 \\nYou are an AI system specialized at generati... \n", + "2 \\nYou are an AI system specialized at generati... \n", + "3 \\nYou are an AI system specialized at generati... \n", + "\n", + " response gpt_accuracy_score \\\n", + "0 \\nIntroduction:\\n\\nThis API provides functiona... 4.0 \n", + "1 1. Introduction: This API is used to generate ... 4.0 \n", + "2 1. Introduction: This class is used to represe... 5.0 \n", + "3 1. Introduction: This API is used to sign and ... 4.0 \n", + "\n", + " human_accuracy_score gpt_relevance_score human_relevance_score \\\n", + "0 2.0 5.0 3.0 \n", + "1 1.0 3.0 1.0 \n", + "2 2.0 5.0 3.0 \n", + "3 NaN 5.0 NaN \n", + "\n", + " gpt_clarity_score human_clarity_score gpt_completeness_score \\\n", + "0 4.0 4.0 4.0 \n", + "1 3.0 1.0 3.0 \n", + "2 5.0 3.0 5.0 \n", + "3 3.0 NaN 4.0 \n", + "\n", + " human_completeness_score gpt_readability_score human_readability_score \\\n", + "0 4.0 5.0 5.0 \n", + "1 1.0 4.0 1.0 \n", + "2 2.0 5.0 3.0 \n", + "3 NaN 5.0 NaN \n", + "\n", + " langchain_helpfulness human_helpfulness langchain_correctness \\\n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 1.0 1.0 1.0 \n", + "3 0.0 NaN 0.0 \n", + "\n", + " human_correctness langchain_logical human_logical \n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 0.0 1.0 1.0 \n", + "3 NaN 0.0 NaN " + ] + }, + "execution_count": 184, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 187, + "id": "fcd57416-5f7d-4bd2-8e32-5145701188e1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Append Human Scores\n", + "\n", + "df.at[3, 'human_accuracy_score'] = '1.0'\n", + "df.at[3, 'human_relevance_score'] = '1.0'\n", + "df.at[3, 'human_clarity_score'] = '1.0'\n", + "df.at[3, 'human_completeness_score'] = '1.0'\n", + "df.at[3, 'human_readability_score'] = '2.0'\n", + "df.at[3, 'human_helpfulness'] = '0.0'\n", + "df.at[3, 'human_correctness'] = '0.0'\n", + "df.at[3, 'human_logical'] = '0.0'" + ] + }, + { + "cell_type": "code", + "execution_count": 190, + "id": "b4e2d7e3-e792-4a34-bb83-f1164e04d5d5", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
promptresponsegpt_accuracy_scorehuman_accuracy_scoregpt_relevance_scorehuman_relevance_scoregpt_clarity_scorehuman_clarity_scoregpt_completeness_scorehuman_completeness_scoregpt_readability_scorehuman_readability_scorelangchain_helpfulnesshuman_helpfulnesslangchain_correctnesshuman_correctnesslangchain_logicalhuman_logical
0\\nYou are an AI system specialized at generati...\\nIntroduction:\\n\\nThis API provides functiona...4.02.05.03.04.04.04.04.05.05.00.00.00.00.00.00.0
1\\nYou are an AI system specialized at generati...1. Introduction: This API is used to generate ...4.01.03.01.03.01.03.01.04.01.00.00.00.00.00.00.0
2\\nYou are an AI system specialized at generati...1. Introduction: This class is used to represe...5.02.05.03.05.03.05.02.05.03.01.01.01.00.01.01.0
3\\nYou are an AI system specialized at generati...1. Introduction: This API is used to sign and ...4.01.05.01.03.01.04.01.05.02.00.00.00.00.00.00.0
\n", + "
" + ], + "text/plain": [ + " prompt \\\n", + "0 \\nYou are an AI system specialized at generati... \n", + "1 \\nYou are an AI system specialized at generati... \n", + "2 \\nYou are an AI system specialized at generati... \n", + "3 \\nYou are an AI system specialized at generati... \n", + "\n", + " response gpt_accuracy_score \\\n", + "0 \\nIntroduction:\\n\\nThis API provides functiona... 4.0 \n", + "1 1. Introduction: This API is used to generate ... 4.0 \n", + "2 1. Introduction: This class is used to represe... 5.0 \n", + "3 1. Introduction: This API is used to sign and ... 4.0 \n", + "\n", + " human_accuracy_score gpt_relevance_score human_relevance_score \\\n", + "0 2.0 5.0 3.0 \n", + "1 1.0 3.0 1.0 \n", + "2 2.0 5.0 3.0 \n", + "3 1.0 5.0 1.0 \n", + "\n", + " gpt_clarity_score human_clarity_score gpt_completeness_score \\\n", + "0 4.0 4.0 4.0 \n", + "1 3.0 1.0 3.0 \n", + "2 5.0 3.0 5.0 \n", + "3 3.0 1.0 4.0 \n", + "\n", + " human_completeness_score gpt_readability_score human_readability_score \\\n", + "0 4.0 5.0 5.0 \n", + "1 1.0 4.0 1.0 \n", + "2 2.0 5.0 3.0 \n", + "3 1.0 5.0 2.0 \n", + "\n", + " langchain_helpfulness human_helpfulness langchain_correctness \\\n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 1.0 1.0 1.0 \n", + "3 0.0 0.0 0.0 \n", + "\n", + " human_correctness langchain_logical human_logical \n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 0.0 1.0 1.0 \n", + "3 0.0 0.0 0.0 " + ] + }, + "execution_count": 190, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "id": "80095a8b-58fa-4e7c-be93-27808ae2fd41", + "metadata": {}, + "source": [ + "Note: This is a great example where the generated output was quite terrible, hallucinated classes, incomplete list of classes, incorrect explanations. GPT scored them well but langchain correctly captured the errors and pointed out the mistakes." + ] + }, + { + "cell_type": "markdown", + "id": "7cdf80e7-b473-4c11-8dc5-3a4cc0e46883", + "metadata": {}, + "source": [ + "### Example 5 - Do not Re-run" + ] + }, + { + "cell_type": "code", + "execution_count": 192, + "id": "99d62521-94e1-47a3-a6c9-09e3d2a03773", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "**Introduction:**\n", + "\n", + "The `LogInclusionProof` class represents an inclusion proof for a log entry in a Merkle tree. It is used to provide evidence that a particular log entry is included in the Merkle tree.\n", + "\n", + "**Functions:**\n", + "\n", + "1. `__init__()`:\n", + " \n", + " - **Description:** Initializes a new instance of the `LogInclusionProof` class.\n", + " - **Parameters:**\n", + " - None\n", + " - **Return Value:** None\n", + "\n", + "\n", + "2. `__repr__()`:\n", + " \n", + " - **Description:** Returns a string representation of the `LogInclusionProof` class instance.\n", + " - **Parameters:**\n", + " - None\n", + " - **Return Value:** String representation of the `LogInclusionProof` class instance.\n", + "\n", + "\n", + "3. `_log_index_positive(v: int) -> int`:\n", + " \n", + " - **Description:** Validates that the log index value is positive.\n", + " - **Parameters:**\n", + " - `v` (int): The log index value to be validated.\n", + " - **Return Value:** The validated log index value.\n", + " - **Raises:**\n", + " - ValueError: If the log index value is less than 0.\n", + " \n", + "\n", + "4. 
`_tree_size_positive(v: int) -> int`:\n", + " \n", + " - **Description:** Validates that the tree size value is positive.\n", + " - **Parameters:**\n", + " - `v` (int): The tree size value to be validated.\n", + " - **Return Value:** The validated tree size value.\n", + " - **Raises:**\n", + " - ValueError: If the tree size value is less than 0.\n", + " \n", + "\n", + "5. `_log_index_within_tree_size(v: int, info: ValidationInfo, **kwargs: Any) -> int`:\n", + " \n", + " - **Description:** Validates that the log index value is within the range of the tree size.\n", + " - **Parameters:**\n", + " - `v` (int): The log index value to be validated.\n", + " - `info` (ValidationInfo): Contains additional data needed for validation.\n", + " - `**kwargs` (Any): Additional keyword arguments.\n", + " - **Return Value:** The validated log index value.\n", + " - **Raises:**\n", + " - ValueError: If the log index value is greater than or equal to the tree size value or if the information data does not contain the log index value.\n", + "\n", + "**Error Handling:**\n", + "\n", + "- `ValueError`: Raised when the log index value is less than 0, when the tree size value is less than 0, or when the log index value is greater than or equal to the tree size value.\n" + ] + } + ], + "source": [ + "prompt, generated_text, actual_doc = get_response(\"OpenAI/gpt3.5\", 'transparency', functions=False, classes=False, documentation=False, imports=False, other=False, functions_code=False, functions_doc=False, classes_code=True, classes_doc=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 193, + "id": "cf38a473-7828-443a-b080-30c47f55bcf8", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Prompt \n", + " \n", + "You are an AI system specialized at generating API documentation for the provided Python code. You will be provided functions, classes, or Python scripts. Your documentation should include:\n", + "\n", + "1. Introduction: Briefly describe the purpose of the API and its intended use.\n", + "2. Functions: Document each API function, including:\n", + " - Description: Clearly explain what the endpoint or function does.\n", + " - Parameters: List and describe each parameter, including data types and any constraints.\n", + " - Return Values: Specify the data type and possible values returned.\n", + "\n", + "3. Error Handling: Describe possible error responses and their meanings.\n", + "\n", + "Make sure to follow this output structure to create API documentation that is clear, concise, accurate, and user-centric. 
Avoid speculative information and prioritize accuracy and completeness.\n", + "\n", + "\n", + " \n", + "Class code:\n", + "\n", + "class LogInclusionProof(BaseModel):\n", + " \n", + "\n", + " model_config = ConfigDict(populate_by_name=True)\n", + "\n", + " checkpoint: StrictStr = Field(..., alias=\"checkpoint\")\n", + " hashes: List[StrictStr] = Field(..., alias=\"hashes\")\n", + " log_index: StrictInt = Field(..., alias=\"logIndex\")\n", + " root_hash: StrictStr = Field(..., alias=\"rootHash\")\n", + " tree_size: StrictInt = Field(..., alias=\"treeSize\")\n", + "\n", + " @field_validator(\"log_index\")\n", + " def _log_index_positive(cls, v: int) -> int:\n", + " if v < 0:\n", + " raise ValueError(f\"Inclusion proof has invalid log index: {v} < 0\")\n", + " return v\n", + "\n", + " @field_validator(\"tree_size\")\n", + " def _tree_size_positive(cls, v: int) -> int:\n", + " if v < 0:\n", + " raise ValueError(f\"Inclusion proof has invalid tree size: {v} < 0\")\n", + " return v\n", + "\n", + " @field_validator(\"tree_size\")\n", + " def _log_index_within_tree_size(\n", + " cls, v: int, info: ValidationInfo, **kwargs: Any\n", + " ) -> int:\n", + " if \"log_index\" in info.data and v <= info.data[\"log_index\"]:\n", + " raise ValueError(\n", + " \"Inclusion proof has log index greater than or equal to tree size: \"\n", + " f\"{v} <= {info.data['log_index']}\"\n", + " )\n", + " return v\n", + "\n", + "Class Documentation:\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(\"\\n Prompt \\n\", prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": 194, + "id": "4b8e2ac6-b2bf-4577-8d15-ec24a8b559cf", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Generated Text \n", + " **Introduction:**\n", + "\n", + "The `LogInclusionProof` class represents an inclusion proof for a log entry in a Merkle tree. It is used to provide evidence that a particular log entry is included in the Merkle tree.\n", + "\n", + "**Functions:**\n", + "\n", + "1. `__init__()`:\n", + " \n", + " - **Description:** Initializes a new instance of the `LogInclusionProof` class.\n", + " - **Parameters:**\n", + " - None\n", + " - **Return Value:** None\n", + "\n", + "\n", + "2. `__repr__()`:\n", + " \n", + " - **Description:** Returns a string representation of the `LogInclusionProof` class instance.\n", + " - **Parameters:**\n", + " - None\n", + " - **Return Value:** String representation of the `LogInclusionProof` class instance.\n", + "\n", + "\n", + "3. `_log_index_positive(v: int) -> int`:\n", + " \n", + " - **Description:** Validates that the log index value is positive.\n", + " - **Parameters:**\n", + " - `v` (int): The log index value to be validated.\n", + " - **Return Value:** The validated log index value.\n", + " - **Raises:**\n", + " - ValueError: If the log index value is less than 0.\n", + " \n", + "\n", + "4. `_tree_size_positive(v: int) -> int`:\n", + " \n", + " - **Description:** Validates that the tree size value is positive.\n", + " - **Parameters:**\n", + " - `v` (int): The tree size value to be validated.\n", + " - **Return Value:** The validated tree size value.\n", + " - **Raises:**\n", + " - ValueError: If the tree size value is less than 0.\n", + " \n", + "\n", + "5. 
`_log_index_within_tree_size(v: int, info: ValidationInfo, **kwargs: Any) -> int`:\n", + " \n", + " - **Description:** Validates that the log index value is within the range of the tree size.\n", + " - **Parameters:**\n", + " - `v` (int): The log index value to be validated.\n", + " - `info` (ValidationInfo): Contains additional data needed for validation.\n", + " - `**kwargs` (Any): Additional keyword arguments.\n", + " - **Return Value:** The validated log index value.\n", + " - **Raises:**\n", + " - ValueError: If the log index value is greater than or equal to the tree size value or if the information data does not contain the log index value.\n", + "\n", + "**Error Handling:**\n", + "\n", + "- `ValueError`: Raised when the log index value is less than 0, when the tree size value is less than 0, or when the log index value is greater than or equal to the tree size value.\n" + ] + } + ], + "source": [ + "print(\"\\n Generated Text \\n\", generated_text)" + ] + }, + { + "cell_type": "code", + "execution_count": 195, + "id": "805b56fa-18d5-4437-9e44-d896779d0ea6", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 5 - The generated documentation accurately represents the code. All information from the code is correctly documented, including function descriptions, parameter descriptions, return values, and error handling.\n", + "\n", + "Relevance: 5 - The generated documentation is relevant to the code. It accurately describes the purpose and use of the API class, as well as each individual function.\n", + "\n", + "Clarity: 4 - The generated documentation is clear. It provides clear descriptions of each function and its purpose. However, the error handling description could be more specific about the exact scenarios in which each ValueError is raised.\n", + "\n", + "Completeness: 5 - The generated documentation is complete. It covers all the functions in the class, providing descriptions, parameter information, return values, and error handling for each.\n", + "\n", + "Readability: 4 - The generated documentation is readable. It uses clear language and follows a consistent structure. However, some of the descriptions could be more concise and the error handling descriptions could be more specific.\n", + "\n", + "Overall Score: 4.5\n" + ] + } + ], + "source": [ + "gpt_score = eval_using_model(generated_text, openai_key=openai_key, initial_prompt=prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": 196, + "id": "d4b5ecd3-2e43-4df0-9dfc-97024d9a7e5c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "gpt_accuracy_score, gpt_relevance_score, gpt_clarity_score, gpt_completeness_score, gpt_readability_score = extract_scores(gpt_score)" + ] + }, + { + "cell_type": "code", + "execution_count": 197, + "id": "b436c9c0-9a12-4722-b9bf-79c6c84fd773", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'reasoning': 'The criterion for this task is \"helpfulness\". The submission is to be evaluated based on whether it is helpful, insightful, and appropriate.\\n\\nLooking at the submission, it provides a detailed documentation of the `LogInclusionProof` class. It starts with an introduction that explains the purpose of the class. This is helpful for users who are not familiar with the class and its use.\\n\\nThe submission then documents each function in the class. 
For each function, it provides a description, lists and describes the parameters, and specifies the return value. This is helpful for users who want to understand how to use the functions and what to expect from them.\\n\\nThe submission also describes the possible error responses and their meanings. This is helpful for users who encounter errors and want to understand what they mean.\\n\\nOverall, the submission is helpful because it provides a comprehensive documentation of the `LogInclusionProof` class. It is insightful because it explains the purpose of the class and its functions, and it describes the possible error responses. It is appropriate because it follows the output structure specified in the task.\\n\\nTherefore, the submission meets the criterion of helpfulness.\\n\\nY', 'value': 'Y', 'score': 1}\n", + "{'reasoning': 'The submission is being evaluated for correctness, accuracy, and factualness. \\n\\n1. Correctness: The submission correctly follows the structure of the API documentation as requested in the input. It provides an introduction, documents each function, and describes error handling. \\n\\n2. Accuracy: The submission accurately describes the functions and their parameters, return values, and possible errors. It correctly identifies the class as representing an inclusion proof for a log entry in a Merkle tree. \\n\\n3. Factualness: The submission is factual and does not include speculative information. It is based on the provided Python code and does not make assumptions beyond what is provided in the code.\\n\\nHowever, the submission missed documenting the class attributes (checkpoint, hashes, log_index, root_hash, tree_size) which are part of the API. This is a significant omission as these attributes are crucial for understanding the functionality of the class.\\n\\nY\\nN', 'value': 'N', 'score': 0}\n", + "{'reasoning': 'The criteria for this task is to assess whether the output is complete and captures all required fields. \\n\\nLooking at the submission, the introduction provides a brief description of the purpose of the API and its intended use. \\n\\nThe functions are documented with a clear explanation of what each function does. The parameters for each function are listed and described, including data types and any constraints. The return values are specified with the data type and possible values returned. \\n\\nThe error handling section describes possible error responses and their meanings. \\n\\nHowever, the submission does not document the class attributes (checkpoint, hashes, log_index, root_hash, tree_size) which are also part of the API and should be documented according to the task instructions. \\n\\nTherefore, the submission does not meet the criteria as it does not capture all required fields. \\n\\nN', 'value': 'N', 'score': 0}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_600/2635151755.py:38: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n", + " df = df.append(new_row, ignore_index=True)\n" + ] + } + ], + "source": [ + "df = append_row_to_dataframe(df, prompt, generated_text, gpt_accuracy_score, gpt_relevance_score, gpt_clarity_score, gpt_completeness_score, gpt_readability_score)" + ] + }, + { + "cell_type": "code", + "execution_count": 198, + "id": "35f04195-ecb6-4691-92ee-a12dc49adc33", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
promptresponsegpt_accuracy_scorehuman_accuracy_scoregpt_relevance_scorehuman_relevance_scoregpt_clarity_scorehuman_clarity_scoregpt_completeness_scorehuman_completeness_scoregpt_readability_scorehuman_readability_scorelangchain_helpfulnesshuman_helpfulnesslangchain_correctnesshuman_correctnesslangchain_logicalhuman_logical
0\\nYou are an AI system specialized at generati...\\nIntroduction:\\n\\nThis API provides functiona...4.02.05.03.04.04.04.04.05.05.00.00.00.00.00.00.0
1\\nYou are an AI system specialized at generati...1. Introduction: This API is used to generate ...4.01.03.01.03.01.03.01.04.01.00.00.00.00.00.00.0
2\\nYou are an AI system specialized at generati...1. Introduction: This class is used to represe...5.02.05.03.05.03.05.02.05.03.01.01.01.00.01.01.0
3\\nYou are an AI system specialized at generati...1. Introduction: This API is used to sign and ...4.01.05.01.03.01.04.01.05.02.00.00.00.00.00.00.0
4\\nYou are an AI system specialized at generati...**Introduction:**\\n\\nThe `LogInclusionProof` c...5.0NaN5.0NaN4.0NaN5.0NaN4.0NaN1.0NaN0.0NaN0.0NaN
\n", + "
" + ], + "text/plain": [ + " prompt \\\n", + "0 \\nYou are an AI system specialized at generati... \n", + "1 \\nYou are an AI system specialized at generati... \n", + "2 \\nYou are an AI system specialized at generati... \n", + "3 \\nYou are an AI system specialized at generati... \n", + "4 \\nYou are an AI system specialized at generati... \n", + "\n", + " response gpt_accuracy_score \\\n", + "0 \\nIntroduction:\\n\\nThis API provides functiona... 4.0 \n", + "1 1. Introduction: This API is used to generate ... 4.0 \n", + "2 1. Introduction: This class is used to represe... 5.0 \n", + "3 1. Introduction: This API is used to sign and ... 4.0 \n", + "4 **Introduction:**\\n\\nThe `LogInclusionProof` c... 5.0 \n", + "\n", + " human_accuracy_score gpt_relevance_score human_relevance_score \\\n", + "0 2.0 5.0 3.0 \n", + "1 1.0 3.0 1.0 \n", + "2 2.0 5.0 3.0 \n", + "3 1.0 5.0 1.0 \n", + "4 NaN 5.0 NaN \n", + "\n", + " gpt_clarity_score human_clarity_score gpt_completeness_score \\\n", + "0 4.0 4.0 4.0 \n", + "1 3.0 1.0 3.0 \n", + "2 5.0 3.0 5.0 \n", + "3 3.0 1.0 4.0 \n", + "4 4.0 NaN 5.0 \n", + "\n", + " human_completeness_score gpt_readability_score human_readability_score \\\n", + "0 4.0 5.0 5.0 \n", + "1 1.0 4.0 1.0 \n", + "2 2.0 5.0 3.0 \n", + "3 1.0 5.0 2.0 \n", + "4 NaN 4.0 NaN \n", + "\n", + " langchain_helpfulness human_helpfulness langchain_correctness \\\n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 1.0 1.0 1.0 \n", + "3 0.0 0.0 0.0 \n", + "4 1.0 NaN 0.0 \n", + "\n", + " human_correctness langchain_logical human_logical \n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 0.0 1.0 1.0 \n", + "3 0.0 0.0 0.0 \n", + "4 NaN 0.0 NaN " + ] + }, + "execution_count": 198, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 199, + "id": "342eca73-fa95-4a8d-bc4a-c7c13d8a1cf8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Append Human Scores\n", + "\n", + "df.at[4, 'human_accuracy_score'] = '2.0'\n", + "df.at[4, 'human_relevance_score'] = '2.0'\n", + "df.at[4, 'human_clarity_score'] = '3.0'\n", + "df.at[4, 'human_completeness_score'] = '2.0'\n", + "df.at[4, 'human_readability_score'] = '4.0'\n", + "df.at[4, 'human_helpfulness'] = '0.0'\n", + "df.at[4, 'human_correctness'] = '0.0'\n", + "df.at[4, 'human_logical'] = '1.0'" + ] + }, + { + "cell_type": "code", + "execution_count": 200, + "id": "37af86d2-b54b-4fbb-8991-bb4a19cea68e", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
promptresponsegpt_accuracy_scorehuman_accuracy_scoregpt_relevance_scorehuman_relevance_scoregpt_clarity_scorehuman_clarity_scoregpt_completeness_scorehuman_completeness_scoregpt_readability_scorehuman_readability_scorelangchain_helpfulnesshuman_helpfulnesslangchain_correctnesshuman_correctnesslangchain_logicalhuman_logical
0\\nYou are an AI system specialized at generati...\\nIntroduction:\\n\\nThis API provides functiona...4.02.05.03.04.04.04.04.05.05.00.00.00.00.00.00.0
1\\nYou are an AI system specialized at generati...1. Introduction: This API is used to generate ...4.01.03.01.03.01.03.01.04.01.00.00.00.00.00.00.0
2\\nYou are an AI system specialized at generati...1. Introduction: This class is used to represe...5.02.05.03.05.03.05.02.05.03.01.01.01.00.01.01.0
3\\nYou are an AI system specialized at generati...1. Introduction: This API is used to sign and ...4.01.05.01.03.01.04.01.05.02.00.00.00.00.00.00.0
4\\nYou are an AI system specialized at generati...**Introduction:**\\n\\nThe `LogInclusionProof` c...5.02.05.02.04.03.05.02.04.04.01.00.00.00.00.01.0
\n", + "
" + ], + "text/plain": [ + " prompt \\\n", + "0 \\nYou are an AI system specialized at generati... \n", + "1 \\nYou are an AI system specialized at generati... \n", + "2 \\nYou are an AI system specialized at generati... \n", + "3 \\nYou are an AI system specialized at generati... \n", + "4 \\nYou are an AI system specialized at generati... \n", + "\n", + " response gpt_accuracy_score \\\n", + "0 \\nIntroduction:\\n\\nThis API provides functiona... 4.0 \n", + "1 1. Introduction: This API is used to generate ... 4.0 \n", + "2 1. Introduction: This class is used to represe... 5.0 \n", + "3 1. Introduction: This API is used to sign and ... 4.0 \n", + "4 **Introduction:**\\n\\nThe `LogInclusionProof` c... 5.0 \n", + "\n", + " human_accuracy_score gpt_relevance_score human_relevance_score \\\n", + "0 2.0 5.0 3.0 \n", + "1 1.0 3.0 1.0 \n", + "2 2.0 5.0 3.0 \n", + "3 1.0 5.0 1.0 \n", + "4 2.0 5.0 2.0 \n", + "\n", + " gpt_clarity_score human_clarity_score gpt_completeness_score \\\n", + "0 4.0 4.0 4.0 \n", + "1 3.0 1.0 3.0 \n", + "2 5.0 3.0 5.0 \n", + "3 3.0 1.0 4.0 \n", + "4 4.0 3.0 5.0 \n", + "\n", + " human_completeness_score gpt_readability_score human_readability_score \\\n", + "0 4.0 5.0 5.0 \n", + "1 1.0 4.0 1.0 \n", + "2 2.0 5.0 3.0 \n", + "3 1.0 5.0 2.0 \n", + "4 2.0 4.0 4.0 \n", + "\n", + " langchain_helpfulness human_helpfulness langchain_correctness \\\n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 1.0 1.0 1.0 \n", + "3 0.0 0.0 0.0 \n", + "4 1.0 0.0 0.0 \n", + "\n", + " human_correctness langchain_logical human_logical \n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 0.0 1.0 1.0 \n", + "3 0.0 0.0 0.0 \n", + "4 0.0 0.0 1.0 " + ] + }, + "execution_count": 200, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "id": "21a70af4-9a22-4b9b-96d2-9b1147f36ae3", + "metadata": {}, + "source": [ + "Note: This is again a great example of where langchain evaluation is not fully correct. While the generated output is pretty good structurally well documenting classses of the class, it hallucinates functions that are not part of the class which is unacceptable." + ] + }, + { + "cell_type": "markdown", + "id": "e5b56fa0-133f-46f8-8db2-23b608e92bff", + "metadata": { + "tags": [] + }, + "source": [ + "### Example 6 - Do not Re-run" + ] + }, + { + "cell_type": "code", + "execution_count": 201, + "id": "592ea8ea-272f-41fc-9737-89873dcaef40", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1. Introduction:\n", + "The Error class is a base class for all custom error classes in the API. It provides a common interface for handling and reporting errors. The Error class is not intended to be instantiated directly.\n", + "\n", + "The NetworkError class is a subclass of Error and represents an error that occurs when there is a network issue. It provides specific diagnostics and suggestions for resolving the issue.\n", + "\n", + "The TUFError class is a subclass of Error and represents an error that occurs in the context of The Update Framework (TUF). It provides additional context-specific diagnostics and suggestions for reporting the issue.\n", + "\n", + "The MetadataError class is a subclass of Error and represents an error that occurs when there is an issue with the metadata.\n", + "\n", + "The RootError class is a subclass of Error and represents an error that occurs when the root of trust cannot be established.\n", + "\n", + "2. 
Functions:\n", + "- Error.diagnostics():\n", + " - Description: Returns a string with a general diagnostic message for the error.\n", + " - Parameters: None.\n", + " - Return Values: str - A general diagnostic message for the error.\n", + "\n", + "- Error.print_and_exit(raise_error=False):\n", + " - Description: Prints the diagnostic message for the error to stderr and exits the program with exit code 1. If raise_error=True, it also raises the error.\n", + " - Parameters:\n", + " - raise_error (bool): Indicates whether to raise the error or not. Default is False.\n", + " - Return Values: None.\n", + " \n", + "- NetworkError.diagnostics():\n", + " - Description: Returns a string with a diagnostic message for the network error.\n", + " - Parameters: None.\n", + " - Return Values: str - A diagnostic message for the network error.\n", + "\n", + "- TUFError.__init__(message: str):\n", + " - Description: Initializes a TUFError instance with the provided message.\n", + " - Parameters:\n", + " - message (str): The error message.\n", + " - Return Values: None.\n", + "\n", + "- TUFError.diagnostics():\n", + " - Description: Returns a string with a diagnostic message for the TUF error.\n", + " - Parameters: None.\n", + " - Return Values: str - A diagnostic message for the TUF error.\n", + "\n", + "- MetadataError.diagnostics():\n", + " - Description: Returns a string with a diagnostic message for the metadata error.\n", + " - Parameters: None.\n", + " - Return Values: str - A diagnostic message for the metadata error.\n", + "\n", + "- RootError.diagnostics():\n", + " - Description: Returns a string with a diagnostic message for the root error.\n", + " - Parameters: None.\n", + " - Return Values: str - A diagnostic message for the root error.\n", + "\n", + "3. Error Handling:\n", + "- When an Error is raised, the diagnostic message can be accessed through the diagnostics() method.\n", + "- If the print_and_exit() method is called with raise_error=True, the error is re-raised after printing the diagnostic message.\n", + "- NetworkError indicates a network issue and suggests checking the internet connection.\n", + "- TUFError provides additional diagnostic details specific to certain error types within TUF.\n", + "- MetadataError represents an error with the metadata.\n", + "- RootError indicates a failure to establish the root of trust.\n" + ] + } + ], + "source": [ + "prompt, generated_text, actual_doc = get_response(\"OpenAI/gpt3.5\", 'errors', functions=False, classes=False, documentation=False, imports=False, other=False, functions_code=False, functions_doc=False, classes_code=True, classes_doc=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 202, + "id": "b204d858-ddea-4f91-bdd8-8dd8265b7fb2", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Prompt \n", + " \n", + "You are an AI system specialized at generating API documentation for the provided Python code. You will be provided functions, classes, or Python scripts. Your documentation should include:\n", + "\n", + "1. Introduction: Briefly describe the purpose of the API and its intended use.\n", + "2. Functions: Document each API function, including:\n", + " - Description: Clearly explain what the endpoint or function does.\n", + " - Parameters: List and describe each parameter, including data types and any constraints.\n", + " - Return Values: Specify the data type and possible values returned.\n", + "\n", + "3. 
Error Handling: Describe possible error responses and their meanings.\n", + "\n", + "Make sure to follow this output structure to create API documentation that is clear, concise, accurate, and user-centric. Avoid speculative information and prioritize accuracy and completeness.\n", + "\n", + "\n", + " \n", + "Class code:\n", + "\n", + "class Error(Exception):\n", + " \n", + "\n", + " def diagnostics(self) -> str:\n", + " \n", + "\n", + " return An issue occurred.\n", + "\n", + " def print_and_exit(self, raise_error: bool = False) -> None:\n", + " \n", + "\n", + " remind_verbose = (\n", + " \"Raising original exception:\"\n", + " if raise_error\n", + " else \"For detailed error information, run sigstore with the `--verbose` flag.\"\n", + " )\n", + "\n", + " print(f\"{self.diagnostics()}\\n{remind_verbose}\", file=sys.stderr)\n", + "\n", + " if raise_error:\n", + " # don't want \"during handling another exception\"\n", + " self.__suppress_context__ = True\n", + " raise self\n", + "\n", + " sys.exit(1)\n", + "class NetworkError(Error):\n", + " \n", + "\n", + " def diagnostics(self) -> str:\n", + " \n", + "\n", + " cause_ctx = (\n", + " f\n", + " Additional context:\n", + "\n", + " {self.__cause__}\n", + " \n", + " if self.__cause__\n", + " else \"\"\n", + " )\n", + "\n", + " return (\n", + " \\\n", + " A network issue occurred.\n", + "\n", + " Check your internet connection and try again.\n", + " \n", + " + cause_ctx\n", + " )\n", + "class TUFError(Error):\n", + " \n", + "\n", + " def __init__(self, message: str):\n", + " \n", + " self.message = message\n", + "\n", + " from tuf.api import exceptions\n", + "\n", + " _details: Mapping[Any, str] = {\n", + " exceptions.DownloadError: NetworkError().diagnostics()\n", + " }\n", + "\n", + " def diagnostics(self) -> str:\n", + " \n", + " details = TUFError._details.get(\n", + " type(self.__context__),\n", + " \"Please report this issue at .\",\n", + " )\n", + "\n", + " return f\\\n", + " {self.message}.\n", + "\n", + " {details}\n", + " \n", + "class MetadataError(Error):\n", + " \n", + "\n", + " def diagnostics(self) -> str:\n", + " \n", + " return f{str(self)}.\n", + "class RootError(Error):\n", + " \n", + "\n", + " def diagnostics(self) -> str:\n", + " \n", + " return \\\n", + " Unable to establish root of trust.\n", + "\n", + " This error may occur when the resources embedded in this distribution of sigstore-python are out of date.\n", + "\n", + "Class Documentation:\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(\"\\n Prompt \\n\", prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": 203, + "id": "816ca598-7d65-4b44-80d0-519fc83ecbd1", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Generated Text \n", + " 1. Introduction:\n", + "The Error class is a base class for all custom error classes in the API. It provides a common interface for handling and reporting errors. The Error class is not intended to be instantiated directly.\n", + "\n", + "The NetworkError class is a subclass of Error and represents an error that occurs when there is a network issue. It provides specific diagnostics and suggestions for resolving the issue.\n", + "\n", + "The TUFError class is a subclass of Error and represents an error that occurs in the context of The Update Framework (TUF). 
It provides additional context-specific diagnostics and suggestions for reporting the issue.\n", + "\n", + "The MetadataError class is a subclass of Error and represents an error that occurs when there is an issue with the metadata.\n", + "\n", + "The RootError class is a subclass of Error and represents an error that occurs when the root of trust cannot be established.\n", + "\n", + "2. Functions:\n", + "- Error.diagnostics():\n", + " - Description: Returns a string with a general diagnostic message for the error.\n", + " - Parameters: None.\n", + " - Return Values: str - A general diagnostic message for the error.\n", + "\n", + "- Error.print_and_exit(raise_error=False):\n", + " - Description: Prints the diagnostic message for the error to stderr and exits the program with exit code 1. If raise_error=True, it also raises the error.\n", + " - Parameters:\n", + " - raise_error (bool): Indicates whether to raise the error or not. Default is False.\n", + " - Return Values: None.\n", + " \n", + "- NetworkError.diagnostics():\n", + " - Description: Returns a string with a diagnostic message for the network error.\n", + " - Parameters: None.\n", + " - Return Values: str - A diagnostic message for the network error.\n", + "\n", + "- TUFError.__init__(message: str):\n", + " - Description: Initializes a TUFError instance with the provided message.\n", + " - Parameters:\n", + " - message (str): The error message.\n", + " - Return Values: None.\n", + "\n", + "- TUFError.diagnostics():\n", + " - Description: Returns a string with a diagnostic message for the TUF error.\n", + " - Parameters: None.\n", + " - Return Values: str - A diagnostic message for the TUF error.\n", + "\n", + "- MetadataError.diagnostics():\n", + " - Description: Returns a string with a diagnostic message for the metadata error.\n", + " - Parameters: None.\n", + " - Return Values: str - A diagnostic message for the metadata error.\n", + "\n", + "- RootError.diagnostics():\n", + " - Description: Returns a string with a diagnostic message for the root error.\n", + " - Parameters: None.\n", + " - Return Values: str - A diagnostic message for the root error.\n", + "\n", + "3. Error Handling:\n", + "- When an Error is raised, the diagnostic message can be accessed through the diagnostics() method.\n", + "- If the print_and_exit() method is called with raise_error=True, the error is re-raised after printing the diagnostic message.\n", + "- NetworkError indicates a network issue and suggests checking the internet connection.\n", + "- TUFError provides additional diagnostic details specific to certain error types within TUF.\n", + "- MetadataError represents an error with the metadata.\n", + "- RootError indicates a failure to establish the root of trust.\n" + ] + } + ], + "source": [ + "print(\"\\n Generated Text \\n\", generated_text)" + ] + }, + { + "cell_type": "code", + "execution_count": 204, + "id": "78869d03-22d3-42a8-8c84-cb4e98d9b6f3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 4 - The generated documentation accurately describes the purpose and functionality of each class and function. The details from the code are correctly reflected in the documentation.\n", + "\n", + "Relevance: 5 - The generated documentation is relevant as it provides clear and concise descriptions of each class and function, including their purpose, parameters, and return values. 
It also includes information on error handling.\n", + "\n", + "Clarity: 4 - The generated documentation is clear and easy to understand. The descriptions for each class and function provide sufficient detail to understand their purpose and functionality.\n", + "\n", + "Completeness: 5 - The generated documentation is complete and includes descriptions for all the classes and functions in the code. It also includes information on error handling and possible error responses.\n", + "\n", + "Readability: 5 - The generated documentation is well-structured and formatted, making it easy to read and understand. The information is presented in a clear and concise manner.\n", + "\n", + "Overall Score: 4.6\n" + ] + } + ], + "source": [ + "gpt_score = eval_using_model(generated_text, openai_key=openai_key, initial_prompt=prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": 205, + "id": "95862558-294a-4bd6-80d6-d22e9d335fcd", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "gpt_accuracy_score, gpt_relevance_score, gpt_clarity_score, gpt_completeness_score, gpt_readability_score = extract_scores(gpt_score)" + ] + }, + { + "cell_type": "code", + "execution_count": 206, + "id": "1b5832d1-0550-47b4-99e8-26d751e57992", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'reasoning': 'The criterion for this task is \"helpfulness\". The submission is to be evaluated based on whether it is helpful, insightful, and appropriate.\\n\\nLooking at the submission, it provides a detailed and structured documentation for the provided Python code. It follows the output structure provided in the input, which includes an introduction, function documentation, and error handling.\\n\\nIn the introduction, the submission provides a brief description of the purpose of each class in the API. This is helpful for users to understand the purpose and intended use of each class.\\n\\nIn the function documentation, the submission documents each function in the classes, including a description of what the function does, the parameters it takes, and the values it returns. This is insightful as it provides users with a clear understanding of how to use each function.\\n\\nIn the error handling section, the submission describes possible error responses and their meanings. This is appropriate as it helps users understand what each error means and how to handle it.\\n\\nTherefore, the submission is helpful, insightful, and appropriate. It provides a clear, concise, accurate, and user-centric documentation for the provided Python code.\\n\\nBased on the above reasoning, the submission meets the criterion. \\n\\nY', 'value': 'Y', 'score': 1}\n", + "{'reasoning': 'The submission is being evaluated for correctness, accuracy, and factualness. \\n\\n1. The submission correctly identifies the Error class as the base class for all custom error classes in the API. It also correctly describes the purpose of the Error class and its methods.\\n\\n2. The submission accurately describes the NetworkError, TUFError, MetadataError, and RootError classes as subclasses of the Error class. It also correctly describes the purpose of these classes and their methods.\\n\\n3. The submission correctly describes the parameters and return values of the methods in the Error class and its subclasses.\\n\\n4. The submission accurately describes the error handling process in the API, including the use of the diagnostics() method and the print_and_exit() method.\\n\\n5. 
The submission is factual and does not include any speculative information. It is based on the provided Python code and does not make any assumptions or predictions.\\n\\nBased on these points, the submission meets the criteria of correctness, accuracy, and factualness. \\n\\nY', 'value': 'Y', 'score': 1}\n", + "{'reasoning': 'The criteria for this task is to assess whether the output is complete and captures all required fields. \\n\\nLooking at the submission, it is clear that the output is structured according to the given instructions. \\n\\n1. Introduction: The submission provides a brief description of the purpose of the API and its intended use. It describes the Error class and its subclasses, NetworkError, TUFError, MetadataError, and RootError.\\n\\n2. Functions: The submission documents each API function, including a description of what the function does, a list and description of each parameter, and the data type and possible values returned.\\n\\n3. Error Handling: The submission describes possible error responses and their meanings. It explains what happens when an Error is raised, and what each subclass of Error represents.\\n\\nTherefore, the submission is complete and captures all required fields.\\n\\nY', 'value': 'Y', 'score': 1}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_600/2635151755.py:38: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n", + " df = df.append(new_row, ignore_index=True)\n" + ] + } + ], + "source": [ + "df = append_row_to_dataframe(df, prompt, generated_text, gpt_accuracy_score, gpt_relevance_score, gpt_clarity_score, gpt_completeness_score, gpt_readability_score)" + ] + }, + { + "cell_type": "code", + "execution_count": 207, + "id": "89f29b67-d5cc-4635-abff-52e487df49f1", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
promptresponsegpt_accuracy_scorehuman_accuracy_scoregpt_relevance_scorehuman_relevance_scoregpt_clarity_scorehuman_clarity_scoregpt_completeness_scorehuman_completeness_scoregpt_readability_scorehuman_readability_scorelangchain_helpfulnesshuman_helpfulnesslangchain_correctnesshuman_correctnesslangchain_logicalhuman_logical
0\\nYou are an AI system specialized at generati...\\nIntroduction:\\n\\nThis API provides functiona...4.02.05.03.04.04.04.04.05.05.00.00.00.00.00.00.0
1\\nYou are an AI system specialized at generati...1. Introduction: This API is used to generate ...4.01.03.01.03.01.03.01.04.01.00.00.00.00.00.00.0
2\\nYou are an AI system specialized at generati...1. Introduction: This class is used to represe...5.02.05.03.05.03.05.02.05.03.01.01.01.00.01.01.0
3\\nYou are an AI system specialized at generati...1. Introduction: This API is used to sign and ...4.01.05.01.03.01.04.01.05.02.00.00.00.00.00.00.0
4\\nYou are an AI system specialized at generati...**Introduction:**\\n\\nThe `LogInclusionProof` c...5.02.05.02.04.03.05.02.04.04.01.00.00.00.00.01.0
5\\nYou are an AI system specialized at generati...1. Introduction:\\nThe Error class is a base cl...4.0NaN5.0NaN4.0NaN5.0NaN5.0NaN1.0NaN1.0NaN1.0NaN
\n", + "
" + ], + "text/plain": [ + " prompt \\\n", + "0 \\nYou are an AI system specialized at generati... \n", + "1 \\nYou are an AI system specialized at generati... \n", + "2 \\nYou are an AI system specialized at generati... \n", + "3 \\nYou are an AI system specialized at generati... \n", + "4 \\nYou are an AI system specialized at generati... \n", + "5 \\nYou are an AI system specialized at generati... \n", + "\n", + " response gpt_accuracy_score \\\n", + "0 \\nIntroduction:\\n\\nThis API provides functiona... 4.0 \n", + "1 1. Introduction: This API is used to generate ... 4.0 \n", + "2 1. Introduction: This class is used to represe... 5.0 \n", + "3 1. Introduction: This API is used to sign and ... 4.0 \n", + "4 **Introduction:**\\n\\nThe `LogInclusionProof` c... 5.0 \n", + "5 1. Introduction:\\nThe Error class is a base cl... 4.0 \n", + "\n", + " human_accuracy_score gpt_relevance_score human_relevance_score \\\n", + "0 2.0 5.0 3.0 \n", + "1 1.0 3.0 1.0 \n", + "2 2.0 5.0 3.0 \n", + "3 1.0 5.0 1.0 \n", + "4 2.0 5.0 2.0 \n", + "5 NaN 5.0 NaN \n", + "\n", + " gpt_clarity_score human_clarity_score gpt_completeness_score \\\n", + "0 4.0 4.0 4.0 \n", + "1 3.0 1.0 3.0 \n", + "2 5.0 3.0 5.0 \n", + "3 3.0 1.0 4.0 \n", + "4 4.0 3.0 5.0 \n", + "5 4.0 NaN 5.0 \n", + "\n", + " human_completeness_score gpt_readability_score human_readability_score \\\n", + "0 4.0 5.0 5.0 \n", + "1 1.0 4.0 1.0 \n", + "2 2.0 5.0 3.0 \n", + "3 1.0 5.0 2.0 \n", + "4 2.0 4.0 4.0 \n", + "5 NaN 5.0 NaN \n", + "\n", + " langchain_helpfulness human_helpfulness langchain_correctness \\\n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 1.0 1.0 1.0 \n", + "3 0.0 0.0 0.0 \n", + "4 1.0 0.0 0.0 \n", + "5 1.0 NaN 1.0 \n", + "\n", + " human_correctness langchain_logical human_logical \n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 0.0 1.0 1.0 \n", + "3 0.0 0.0 0.0 \n", + "4 0.0 0.0 1.0 \n", + "5 NaN 1.0 NaN " + ] + }, + "execution_count": 207, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 209, + "id": "cac95917-d67e-47b8-975a-49ace9496400", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Append Human Scores\n", + "\n", + "df.at[5, 'human_accuracy_score'] = '5.0'\n", + "df.at[5, 'human_relevance_score'] = '5.0'\n", + "df.at[5, 'human_clarity_score'] = '5.0'\n", + "df.at[5, 'human_completeness_score'] = '5.0'\n", + "df.at[5, 'human_readability_score'] = '5.0'\n", + "df.at[5, 'human_helpfulness'] = '1.0'\n", + "df.at[5, 'human_correctness'] = '1.0'\n", + "df.at[5, 'human_logical'] = '1.0'" + ] + }, + { + "cell_type": "code", + "execution_count": 210, + "id": "2d9d8a0d-99e6-4f96-a8f2-c29fbcc9d7ad", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
promptresponsegpt_accuracy_scorehuman_accuracy_scoregpt_relevance_scorehuman_relevance_scoregpt_clarity_scorehuman_clarity_scoregpt_completeness_scorehuman_completeness_scoregpt_readability_scorehuman_readability_scorelangchain_helpfulnesshuman_helpfulnesslangchain_correctnesshuman_correctnesslangchain_logicalhuman_logical
0\\nYou are an AI system specialized at generati...\\nIntroduction:\\n\\nThis API provides functiona...4.02.05.03.04.04.04.04.05.05.00.00.00.00.00.00.0
1\\nYou are an AI system specialized at generati...1. Introduction: This API is used to generate ...4.01.03.01.03.01.03.01.04.01.00.00.00.00.00.00.0
2\\nYou are an AI system specialized at generati...1. Introduction: This class is used to represe...5.02.05.03.05.03.05.02.05.03.01.01.01.00.01.01.0
3\\nYou are an AI system specialized at generati...1. Introduction: This API is used to sign and ...4.01.05.01.03.01.04.01.05.02.00.00.00.00.00.00.0
4\\nYou are an AI system specialized at generati...**Introduction:**\\n\\nThe `LogInclusionProof` c...5.02.05.02.04.03.05.02.04.04.01.00.00.00.00.01.0
5\\nYou are an AI system specialized at generati...1. Introduction:\\nThe Error class is a base cl...4.05.05.05.04.05.05.05.05.05.01.01.01.01.01.01.0
\n", + "
" + ], + "text/plain": [ + " prompt \\\n", + "0 \\nYou are an AI system specialized at generati... \n", + "1 \\nYou are an AI system specialized at generati... \n", + "2 \\nYou are an AI system specialized at generati... \n", + "3 \\nYou are an AI system specialized at generati... \n", + "4 \\nYou are an AI system specialized at generati... \n", + "5 \\nYou are an AI system specialized at generati... \n", + "\n", + " response gpt_accuracy_score \\\n", + "0 \\nIntroduction:\\n\\nThis API provides functiona... 4.0 \n", + "1 1. Introduction: This API is used to generate ... 4.0 \n", + "2 1. Introduction: This class is used to represe... 5.0 \n", + "3 1. Introduction: This API is used to sign and ... 4.0 \n", + "4 **Introduction:**\\n\\nThe `LogInclusionProof` c... 5.0 \n", + "5 1. Introduction:\\nThe Error class is a base cl... 4.0 \n", + "\n", + " human_accuracy_score gpt_relevance_score human_relevance_score \\\n", + "0 2.0 5.0 3.0 \n", + "1 1.0 3.0 1.0 \n", + "2 2.0 5.0 3.0 \n", + "3 1.0 5.0 1.0 \n", + "4 2.0 5.0 2.0 \n", + "5 5.0 5.0 5.0 \n", + "\n", + " gpt_clarity_score human_clarity_score gpt_completeness_score \\\n", + "0 4.0 4.0 4.0 \n", + "1 3.0 1.0 3.0 \n", + "2 5.0 3.0 5.0 \n", + "3 3.0 1.0 4.0 \n", + "4 4.0 3.0 5.0 \n", + "5 4.0 5.0 5.0 \n", + "\n", + " human_completeness_score gpt_readability_score human_readability_score \\\n", + "0 4.0 5.0 5.0 \n", + "1 1.0 4.0 1.0 \n", + "2 2.0 5.0 3.0 \n", + "3 1.0 5.0 2.0 \n", + "4 2.0 4.0 4.0 \n", + "5 5.0 5.0 5.0 \n", + "\n", + " langchain_helpfulness human_helpfulness langchain_correctness \\\n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 1.0 1.0 1.0 \n", + "3 0.0 0.0 0.0 \n", + "4 1.0 0.0 0.0 \n", + "5 1.0 1.0 1.0 \n", + "\n", + " human_correctness langchain_logical human_logical \n", + "0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 \n", + "2 0.0 1.0 1.0 \n", + "3 0.0 0.0 0.0 \n", + "4 0.0 0.0 1.0 \n", + "5 1.0 1.0 1.0 " + ] + }, + "execution_count": 210, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "id": "9b620961-5dec-45f3-8b3d-6d6c69d66b49", + "metadata": {}, + "source": [ + "Note: The output generated is quite detailed and pretty accurate to a non SME and the langchain eval seems to be capturing that correctly too. The GPT eval is also pretty high that is consistent with the human eval." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 211, + "id": "f0a6da3b-f5cb-4470-a33e-7cc34763e689", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df.to_pickle('eval_df.pkl')" + ] + }, + { + "cell_type": "markdown", + "id": "812910d9-9b4f-4430-bffe-d58bb4b67083", + "metadata": {}, + "source": [ + "## Copy this section, modify and run from here" + ] + }, + { + "cell_type": "markdown", + "id": "6ecea4ae-90bd-4932-b52d-968c02d28d28", + "metadata": {}, + "source": [ + "### Example X " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0fea4b1a-df7a-4ff4-bc16-75c5d334caf8", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_pickle('eval_df.pkl')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "828ae7e4-43ff-4db7-9eca-2f5daf0e51ea", + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ade9ad54-db0d-491f-a2fe-b886991c9fcd", + "metadata": {}, + "outputs": [], + "source": [ + "prompt, generated_text, actual_doc = get_response('ibm/granite-20b-code-instruct-v1', 'oidc', functions=False, classes=False, documentation=False, imports=False, other=False, functions_code=False, functions_doc=False, classes_code=True, classes_doc=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e2b6e37-bdee-4778-904b-252e2d7a7c9b", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"\\n Prompt \\n\", prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3913dd7-44c5-4a7f-8476-b52b6dc68f27", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"\\n Generated Text \\n\", generated_text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10065769-7d8a-49d9-83ec-192900e9e19c", + "metadata": {}, + "outputs": [], + "source": [ + "gpt_score = eval_using_model(generated_text, openai_key=openai_key, initial_prompt=prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0510cc8-b2bb-46a9-8ebb-ca3250ffa2e9", + "metadata": {}, + "outputs": [], + "source": [ + "gpt_accuracy_score, gpt_relevance_score, gpt_clarity_score, gpt_completeness_score, gpt_readability_score = extract_scores(gpt_score)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42ec80f0-8724-474b-a7ac-615c9fa7b030", + "metadata": {}, + "outputs": [], + "source": [ + "df = append_row_to_dataframe(df, prompt, generated_text, gpt_accuracy_score, gpt_relevance_score, gpt_clarity_score, gpt_completeness_score, gpt_readability_score)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18a37b82-ae78-492f-9900-28ad898c0643", + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d557d78a-7e7a-4954-b5bc-e69d5bdfe450", + "metadata": {}, + "outputs": [], + "source": [ + "# Append Human Scores\n", + "\n", + "df.at[X, 'human_accuracy_score'] = '2.0'\n", + "df.at[X, 'human_relevance_score'] = '3.0'\n", + "df.at[X, 'human_clarity_score'] = '4.0'\n", + "df.at[X, 'human_completeness_score'] = '4.0'\n", + "df.at[X, 'human_readability_score'] = '5.0'\n", + "df.at[X, 'human_helpfulness'] = '0.0'\n", + "df.at[X, 'human_correctness'] = '0.0'\n", + "df.at[X, 'human_logical'] = '0.0'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "abd613e7-b92c-44c7-93f7-12be30c3f693", + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "0ce1d099-9c48-48e2-8fe3-78f24ade821a", + "metadata": {}, + "outputs": [], + "source": [ + "df.to_pickle('eval_df.pkl')" + ] } ], "metadata": { diff --git a/requirements.txt b/requirements.txt index 1a1d8e7..1224383 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ html2text -# new ibm version breaks existing code, -# use this for the time being -# TODO: update this tio version 2.0.0 -ibm-generative-ai==0.6.1 +# ibm-generative-ai==0.6.1 +ibm-generative-ai>=2.0.0 tree-sitter openai +python-dotenv +langchain \ No newline at end of file