diff --git a/python/packages/agbench/src/agbench/linter/_base.py b/python/packages/agbench/src/agbench/linter/_base.py
index c59e826d201b..ea0a893e4e7a 100644
--- a/python/packages/agbench/src/agbench/linter/_base.py
+++ b/python/packages/agbench/src/agbench/linter/_base.py
@@ -8,6 +8,7 @@ class Document(BaseModel):
     text: str = Field(..., description="Text content of the document.")
+    lines: List[str] = Field(..., description="List of lines in the document.")
     name: Optional[str] = Field(None, description="Optional name of the document.")

     def __hash__(self) -> int:
diff --git a/python/packages/agbench/src/agbench/linter/cli.py b/python/packages/agbench/src/agbench/linter/cli.py
index 14f428929b17..2a2c28aa9bab 100644
--- a/python/packages/agbench/src/agbench/linter/cli.py
+++ b/python/packages/agbench/src/agbench/linter/cli.py
@@ -25,7 +25,7 @@ def load_log_file(path: str, prepend_numbers: bool = False) -> Document:
         lines = prepend_line_numbers(lines)

     text = "".join(lines)
-    return Document(text=text, name=os.path.abspath(path))
+    return Document(text=text, lines=lines, name=os.path.abspath(path))


 def code_log(path: str) -> Optional[CodedDocument]:
diff --git a/python/packages/agbench/src/agbench/linter/coders/_prompt.py b/python/packages/agbench/src/agbench/linter/coders/_prompt.py
new file mode 100644
index 000000000000..7402c4100a99
--- /dev/null
+++ b/python/packages/agbench/src/agbench/linter/coders/_prompt.py
@@ -0,0 +1,73 @@
+MAIN_PROMPT = """You are an expert qualitative researcher.
+
+Given a document containing errors below, generate a list of (error) codes.
+The document shows a log of an interaction between multiple agents
+collaborating to solve a complex task.
+
+For example, a code name could be of the format "lack-of-word2",
+"failed-to-bar", "excessive-use-of-magenta". Names should adhere to
+Joseph M. Williams' writing principles of clarity, conciseness, and coherence.
+
+Ensure each code name is lower-case, hyphenated, and directly reflects the
+concept it represents. Avoid ambiguous or overly complex terms, and prioritize
+simplicity, precision, and readability in the naming.
+
+The code names should pass the 'clarity and grace' test by being easy to
+understand, descriptive, and reflective of the content they categorize.
+- Suggest codes that are similar to the good code names below; avoid codes
+that resemble the bad code names.
+- The definition should be simply worded and practical: at least 2 sentences,
+at most 3. It should be written in past tense.
+
+It should convey how a labeller could apply this code to future logs, without
+mentioning the word "labeller". The definition should be specific enough to be
+useful in debugging. It should be very concrete, well thought out, and should
+make sense. Bullshitting will not earn you any points.
+
+- The examples should be a list. Each example should be descriptive, between
+2-3 sentences. Examples should be concrete, informative, and not vague. Provide
+at most 20 salient examples. Examples should contain a lot of detail about what
+happened and should refer to incidents in the log.
+
+- The list of codes must be mutually exclusive.
+
+# GOOD EXAMPLES OF FINAL CODE NAMES/CLUSTERS
+* looped-without-progress
+* repeated-unsuccessful-actions
+* repeated-syntax-errors
+* exceeded-context-window-limits
+* encountered-security-risks
+* failure-to-switch-strategy
+* exceeded-resource-limits
+* attempted-to-handle-excessive-data
+* no-errors-detected
+These names are high-level but also concrete. They exactly mention the type of
+error, issue, or gap that has been identified.
+
+## BAD EXAMPLES OF FINAL CODE NAMES/CLUSTERS
+* mismanaged-data-utilization -- too high-level
+* incomplete-or-misguided-execution -- too high-level
+* misaligned-agent-interactions -- too high-level
+* mismanaged-task-strategies -- too high-level
+* resource-inefficiencies -- vague
+* communication-issues -- vague
+* coordination-issues -- too high-level and vague
+* operational-failures
+* execution-errors -- too high-level
+* navigation-issues -- too concise
+* adaptive-failures -- too concise
+* successful-processes -- I don't like the word "processes"
+* system-constraints
+* configuration-issues
+* information-inaccuracies -- too high-level
+* process-improvements -- vague, and not an error
+* inadequate-error-response -- too high-level, unclear what kind of errors
+* specific-access-issues -- makes no sense
+* strategy-inefficiency -- "strategy" is too high-level
+* error-management-gaps -- unclear what error management means
+* error-handling-deficiency -- unclear what kind of errors
+* coordination-breakdown -- unclear what coordination means
+* muddled-task-execution -- unclear what kind of tasks were muddled
+* task-completion-gaps -- too high-level
+The above names are too high-level and unclear. Please DO NOT use such names.
+"""
diff --git a/python/packages/agbench/src/agbench/linter/coders/oai_coder.py b/python/packages/agbench/src/agbench/linter/coders/oai_coder.py
index 01322e0c5ccc..451abdfed4e2 100644
--- a/python/packages/agbench/src/agbench/linter/coders/oai_coder.py
+++ b/python/packages/agbench/src/agbench/linter/coders/oai_coder.py
@@ -5,7 +5,8 @@
 from openai import OpenAI
 from pydantic import BaseModel

-from .._base import BaseQualitativeCoder, Code, CodedDocument, Document
+from .._base import BaseQualitativeCoder, Code, CodedDocument, CodeExample, Document
+from ._prompt import MAIN_PROMPT


 class CodeList(BaseModel):
@@ -21,6 +22,7 @@ def remove_control_characters(text: str) -> str:

 class OAIQualitativeCoder(BaseQualitativeCoder):
     DEFAULT_MODEL = "gpt-4o"
+    MAIN_PROMPT = MAIN_PROMPT

     def __init__(self, cache_dir: str = ".cache", model: str = DEFAULT_MODEL, cache_enabled: bool = False) -> None:
         self.client = OpenAI()
@@ -28,11 +30,72 @@ def __init__(self, cache_dir: str = ".cache", model: str = DEFAULT_MODEL, cache_
         self.model = model
         self.cache_enabled = cache_enabled

-    def code_document(
-        self,
-        doc: Document,
-        code_set: Optional[Set[Code]] = None,
-    ) -> Optional[CodedDocument]:
+    def code_document(self, doc: Document, code_set: Optional[Set[Code]] = None) -> Optional[CodedDocument]:
+        coded_doc = self._code_document(doc)
+        if coded_doc is None:
+            raise ValueError("Error in coding document with OpenAI")
+
+        # First reflect-and-revise round.
+        feedback = self._reflect_on_codes(coded_doc)
+        coded_doc = self._code_document_with_feedback(coded_doc, feedback)
+        if coded_doc is None:
+            raise ValueError("Error in coding document with OpenAI")
+
+        # Second reflect-and-revise round.
+        feedback = self._reflect_on_codes(coded_doc)
+        coded_doc = self._code_document_with_feedback(coded_doc, feedback)
+
+        return coded_doc
+
+    def _code_document_with_feedback(self, coded_doc: CodedDocument, feedback: str) -> Optional[CodedDocument]:
+        """
+        Given a coded document and feedback, update the codes in the document.
+
+        Makes another completion call to generate a new code list
+        based on the document, the original codes, and the feedback.
+        """
+
+        prompt = self.MAIN_PROMPT
+
+        prompt += "\nDocument:\n"
+        for line in coded_doc.doc.lines:
+            prompt += f"{line}"
+        prompt += "Notice that the document contains the following number of lines: "
+        prompt += str(len(coded_doc.doc.lines))
+
+        prompt += "\n\n"
+
+        prompt += "A previous attempt to code the document resulted in the following codes:\n"
+        for code in coded_doc.codes:
+            prompt += code.model_dump_json(indent=4)
+            prompt += "\n"
+        prompt += "\n\n"
+
+        prompt += "A human expert has provided the following feedback on the codes:\n"
+        prompt += f"{feedback}\n\n"
+
+        prompt += "Now revise the codes based on the feedback. "
+
+        # Optionally save the coding-with-feedback prompt to a file for debugging:
+        # with open("coding_with_feedback_prompt.txt", "w") as f:
+        #     f.write(prompt)
+
+        completion = self.client.beta.chat.completions.parse(
+            model=self.model,
+            messages=[{"role": "user", "content": prompt}],
+            response_format=CodeList,
+        )
+        message = completion.choices[0].message
+        if message.parsed and len(message.parsed.code_list) > 0:
+            coded_doc.codes = set(message.parsed.code_list)
+        else:
+            print(message.refusal)
+            raise ValueError("Error in coding document with OpenAI")
+
+        return coded_doc
+
+    def _code_document(self, doc: Document) -> Optional[CodedDocument]:
         # get hash of the document
         doc_hash = hash(doc)
         cache_file = os.path.join(self.cache_dir, f"{doc_hash}.json") if self.cache_enabled else None
@@ -50,15 +113,12 @@ def code_document(
         coded_document: Optional[CodedDocument] = None

-        if code_set is None:
-            completion = self.client.beta.chat.completions.parse(
-                model=self.model,
-                messages=[
-                    {
-                        "role": "system",
-                        "content": """You are an expert qualitative researcher.
+        prompt = """You are an expert qualitative researcher.
+
+Given a document containing errors below, generate a list of (error) codes.
+The document shows a log of an interaction between multiple agents
+collaborating to solve a complex task.

-Given a list of dcocuments containing errors below, generate a list of (error) codes.
 Each code should contains:

 - at least 3 words, max 4 word, hyphenated.
@@ -75,7 +135,7 @@ def code_document(
 - suggest codes that are similar to good code names. avoid code names that are
 similar to bad code names.
 - The definition should be simple worded and practical. At least 2 sentences,
-  max 3. It should be written in past tense.
+max 3. It should be written in past tense.

 It should convey how a labeller could apply this code to future logs, without
 mentioning the word "labeller". The definition should be specific enough to be
@@ -128,81 +188,134 @@ def code_document(
 * muddled-task-execution -- unclear what kind of tasks were muddled
 * task-completion-gaps -- too high level
 The above names are too high level and unclear. Please DO NOT use such names.
-                        """,
-                    },
-                    {
-                        "role": "user",
-                        "content": doc.text,
-                    },
-                ],
-                response_format=CodeList,
-            )
-
-            message = completion.choices[0].message
-            if message.parsed and len(message.parsed.code_list) > 0:
-                coded_document = CodedDocument(doc=doc, codes=set(message.parsed.code_list))
-            else:
-                print(message.refusal)
-                raise ValueError("Error in coding document with OpenAI")
+
+Document:
+
+"""
+
+        for line in doc.lines:
+            prompt += f"{line}"
+        prompt += "\n\n"
+        prompt += "Notice that the document contains the following number of lines: "
+        prompt += str(len(doc.lines))
+        prompt += "\n\n"
+
+        prompt += (
+            "Now generate a list of codes for the document,"
+            " especially codes that detect errors/inefficiencies in the document."
+        )
+
+        # Optionally save the coding prompt to a file for debugging:
+        # with open("coding_prompt.txt", "w") as f:
+        #     f.write(prompt)
+
+        completion = self.client.beta.chat.completions.parse(
+            model=self.model,
+            messages=[
+                {
+                    "role": "user",
+                    "content": prompt,
+                },
+            ],
+            response_format=CodeList,
+        )
+
+        message = completion.choices[0].message
+        if message.parsed and len(message.parsed.code_list) > 0:
+            coded_document = CodedDocument(doc=doc, codes=set(message.parsed.code_list))
         else:
-            code_to_str = "\n".join(
-                [
-                    (
-                        f"\n---\nCode Name: {code.name}\n"
-                        f"Definition: {code.definition}\n"
-                        f"Examples: {code.examples}\n---\n"
-                    )
-                    for code in code_set
-                ]
-            )
-
-            completion = self.client.beta.chat.completions.parse(
-                model=self.model,
-                messages=[
-                    {
-                        "role": "system",
-                        "content": """You are an expert qualitative researcher.
-                        You can answer any questions about coding logs.""",
-                    },
-                    {
-                        "role": "user",
-                        "content": f"""
-## Context
-The text below shows a log containing errors. Your task is to code the log with
-the following codes. Generate a list of codes for the log below.
-
-Only use the codes from the list below. Do not create new codes.
-Modify the examples of the codes to fit the context of the log.
-
-Your example should be informative to narrow down the details of the error in
-the context of the example.
-
-## Codes
-
-{code_to_str}
-
-## Log
-
-{doc.text}
-""",
-                    },
-                ],
-                response_format=CodeList,
-            )
-
-            message = completion.choices[0].message
-            if message.parsed and len(message.parsed.code_list) > 0:
-                code_list = message.parsed.code_list
-                # filter out codes whose names are not in the code_set
-                code_set_names = {code.name for code in code_set}
-                code_list = [code for code in code_list if code.name in code_set_names]
-
-                coded_document = CodedDocument(doc=doc, codes=set(code_list))
-
-        if coded_document is None:
+            print(message.refusal)
             raise ValueError("Error in coding document with OpenAI")

         if self.cache_enabled and cache_file:
             with open(cache_file, "w") as f:
                 f.write(coded_document.model_dump_json(indent=4))
+
         return coded_document
+
+    def _codes_to_string(self, codes: Set[Code]) -> str:
+        """
+        Convert a set of codes to a string representation.
+        Includes name, definition, examples, line numbers, and severity.
+        """
+        code_list: List[str] = []
+        for code in codes:
+            code_list.append(f"[{code.severity}]: {code.name}: {code.definition}")
+            for example in code.examples:
+                code_list.append(f"\t{example.line}:{example.line_end}\t{example.reason}")
+        return "\n".join(code_list)
+
+    def _extract_lines(self, doc: Document, start: int, end: int, buffer: int = 1) -> str:
+        """
+        Extract the lines from `start` to `end` from the document,
+        padded with `buffer` surrounding lines on each side.
+        """
+        start_line = max(0, start - buffer)
+        end_line = min(len(doc.lines), end + buffer)
+        lines = doc.lines[start_line:end_line]
+        return "".join(lines)
+
+    def _extract_code_lines(self, doc: Document, example: CodeExample) -> str:
+        """
+        Extract the lines of the document that a code example refers to.
+        """
+        start = example.line
+        end = example.line_end
+        lines = self._extract_lines(doc, start, end)
+        return lines
+
+    def _reflect_on_codes(self, coded_doc: CodedDocument) -> str:
+        """
+        Given a coded document, generate feedback,
+        e.g., whether the codes used seem appropriate or not.
+        """
+
+        prompt = (
+            "You are an expert qualitative researcher. "
+            "You are given a list of codes. "
+            "Pay attention to the codes and the lines mentioned in the examples of the codes. "
+            "Which examples fail to spot meaningful errors? "
+            "Be direct and critical. "
+            "If a code identifies a bogus error, say so. "
" + "There is no need to figure out how to fix the actual error. " + "The goal is to double check the validity of detected errors.\n\n" + ) + + # for line in coded_doc.doc.lines: + # prompt += f"{line}" + # prompt += "\n\n" + + # prompt += "Notice that the document contains the following number of lines: " + # prompt += str(len(coded_doc.doc.lines)) + + # prompt += "\n\n" + prompt += "A qualitative coding of a document claims to spot the following errors:\n\n" + for code in coded_doc.codes: + prompt += f"Code: {code.name}\n" + prompt += f"Definition: {code.definition}\n" + prompt += "Examples:\n" + for example in code.examples: + extracted_lines = self._extract_code_lines(coded_doc.doc, example) + prompt += f"- Does the text in the lines {example.line}:{example.line_end} shown below have enough information to justify the {code.name} error? " + prompt += f"Especially does the line {example.line} contain the error?\n\n" + prompt += f"{extracted_lines}\n" + prompt += "\n\n" + prompt += "\n" + + prompt += ( + "Now carefully analyze the examples. And provide feedback on the codes." + "If the examples lines do not align with the code name or definition, provide feedback." + ) + + # save the reflection_prompt to a file + # with open("reflection_prompt.txt", "w") as f: + # f.write(prompt) + + completion = self.client.chat.completions.create( + model=self.model, + messages=[{"role": "user", "content": prompt}], + ) + + feedback = completion.choices[0].message.content + if feedback is None: + raise ValueError("Error in generating feedback with OpenAI") + return feedback