diff --git a/bugbug/tools/code_review/__init__.py b/bugbug/tools/code_review/__init__.py
index 0642c3f5b0..23f5ff7704 100644
--- a/bugbug/tools/code_review/__init__.py
+++ b/bugbug/tools/code_review/__init__.py
@@ -15,7 +15,7 @@
 """
 
 # Agent
-from bugbug.tools.code_review.agent import TARGET_SOFTWARE, CodeReviewTool
+from bugbug.tools.code_review.agent import CodeReviewTool
 
 # Databases
 from bugbug.tools.code_review.database import (
@@ -54,7 +54,6 @@
 __all__ = [
     # Agent
     "CodeReviewTool",
-    "TARGET_SOFTWARE",
     # Databases
     "EvaluationAction",
     "ReviewCommentsDB",
diff --git a/bugbug/tools/code_review/agent.py b/bugbug/tools/code_review/agent.py
index afe636f258..203d95e2ba 100644
--- a/bugbug/tools/code_review/agent.py
+++ b/bugbug/tools/code_review/agent.py
@@ -9,14 +9,16 @@
 import os
 from datetime import datetime
 from logging import getLogger
-from typing import Iterable, Literal, Optional
+from typing import Iterable, Optional
 
 from langchain.agents import create_agent
+from langchain.agents.structured_output import ProviderStrategy
 from langchain.chat_models import BaseChatModel
 from langchain.messages import HumanMessage
 from langchain_classic.chains import LLMChain
 from langchain_classic.prompts import PromptTemplate
 from langgraph.errors import GraphRecursionError
+from pydantic import BaseModel, Field
 from unidiff import PatchSet
 
 from bugbug.code_search.function_search import FunctionSearch
@@ -29,19 +31,17 @@
 )
 from bugbug.tools.code_review.prompts import (
     DEFAULT_REJECTED_EXAMPLES,
-    OUTPUT_FORMAT_JSON,
-    OUTPUT_FORMAT_TEXT,
+    FIRST_MESSAGE_TEMPLATE,
     PROMPT_TEMPLATE_FILTERING_ANALYSIS,
-    PROMPT_TEMPLATE_REVIEW,
     PROMPT_TEMPLATE_SUMMARIZATION,
     STATIC_COMMENT_EXAMPLES,
+    SYSTEM_PROMPT_TEMPLATE,
     TEMPLATE_COMMENT_EXAMPLE,
     TEMPLATE_PATCH_FROM_HUNK,
 )
 from bugbug.tools.code_review.utils import (
     format_patch_set,
     generate_processed_output,
-    parse_model_output,
 )
 from bugbug.tools.core.data_types import InlineComment
 from bugbug.tools.core.exceptions import LargeDiffError, ModelResultError
@@ -50,8 +50,27 @@
 
 logger = getLogger(__name__)
 
-# Global variable for target software
-TARGET_SOFTWARE: str | None = None
+
+class GeneratedReviewComment(BaseModel):
+    """A review comment generated by the code review agent."""
+
+    file: str = Field(description="The path to the file the comment applies to.")
+    code_line: int = Field(description="The line number that the comment refers to.")
+    comment: str = Field(description="The review comment.")
+    explanation: str = Field(
+        description="A brief rationale for the comment, including how confident you are and why."
+    )
+    order: int = Field(
+        description="An integer representing the priority of the comment, with 1 being the highest confidence/importance."
+    )
+
+
+class AgentResponse(BaseModel):
+    """The response from the code review agent."""
+
+    comments: list[GeneratedReviewComment] = Field(
+        description="A list of generated review comments."
+    )
 
 
 class CodeReviewTool(GenerativeModelTool):
@@ -60,16 +79,18 @@ class CodeReviewTool(GenerativeModelTool):
     def __init__(
         self,
         llm: BaseChatModel,
+        summarization_llm: BaseChatModel,
+        filtering_llm: BaseChatModel,
         function_search: Optional[FunctionSearch] = None,
         review_comments_db: Optional["ReviewCommentsDB"] = None,
         show_patch_example: bool = False,
         verbose: bool = True,
         suggestions_feedback_db: Optional["SuggestionsFeedbackDB"] = None,
-        target_software: Optional[str] = None,
+        target_software: str = "Mozilla Firefox",
     ) -> None:
         super().__init__()
 
-        self.target_software = target_software or TARGET_SOFTWARE
+        self.target_software = target_software
 
         self._tokenizer = get_tokenizer(
             llm.model_name if hasattr(llm, "model_name") else ""
@@ -87,28 +108,22 @@ def __init__(
             "----------------------------------------------------"
         )
 
-        experience_scope = (
-            f"the {self.target_software} source code"
-            if self.target_software
-            else "a software project"
-        )
-
         self.summarization_chain = LLMChain(
             prompt=PromptTemplate.from_template(
                 PROMPT_TEMPLATE_SUMMARIZATION,
-                partial_variables={"experience_scope": experience_scope},
+                partial_variables={
+                    "experience_scope": f"the {self.target_software} source code"
+                },
             ),
-            llm=llm,
+            llm=summarization_llm,
             verbose=verbose,
         )
 
         self.filtering_chain = LLMChain(
             prompt=PromptTemplate.from_template(
                 PROMPT_TEMPLATE_FILTERING_ANALYSIS,
-                partial_variables={
-                    "target_code_consistency": self.target_software or "rest of the"
-                },
+                partial_variables={"target_code_consistency": self.target_software},
            ),
-            llm=llm,
+            llm=filtering_llm,
             verbose=verbose,
         )
 
@@ -119,7 +134,10 @@ def __init__(
         self.agent = create_agent(
             llm,
             tools,
-            system_prompt=f"You are an expert reviewer for {experience_scope}, with experience on source code reviews.",
+            system_prompt=SYSTEM_PROMPT_TEMPLATE.format(
+                target_software=self.target_software,
+            ),
+            response_format=ProviderStrategy(AgentResponse),
         )
 
         self.review_comments_db = review_comments_db
@@ -130,12 +148,29 @@ def __init__(
 
         self.suggestions_feedback_db = suggestions_feedback_db
 
+    @staticmethod
+    def create(
+        llm=None, summarization_llm=None, filtering_llm=None, **kwargs
+    ) -> "CodeReviewTool":
+        from bugbug.tools.core.llms import create_anthropic_llm
+
+        return CodeReviewTool(
+            llm=llm
+            or create_anthropic_llm(
+                model_name="claude-opus-4-5-20251101",
+                max_tokens=40_000,
+                temperature=None,
+                thinking={"type": "enabled", "budget_tokens": 10_000},
+            ),
+            summarization_llm=summarization_llm or create_anthropic_llm(),
+            filtering_llm=filtering_llm or create_anthropic_llm(),
+            **kwargs,
+        )
+
     def count_tokens(self, text):
         return len(self._tokenizer.encode(text))
 
-    def generate_initial_prompt(
-        self, patch: Patch, output_format: Literal["JSON", "TEXT"] = "JSON"
-    ) -> str:
+    def generate_initial_prompt(self, patch: Patch) -> str:
         formatted_patch = format_patch_set(patch.patch_set)
 
         output_summarization = self.summarization_chain.invoke(
@@ -143,6 +178,7 @@ def generate_initial_prompt(
                 "patch": formatted_patch,
                 "bug_title": patch.bug_title,
                 "patch_title": patch.patch_title,
+                "patch_description": patch.patch_description,
             },
             return_only_outputs=True,
         )["text"]
@@ -150,30 +186,15 @@ def generate_initial_prompt(
         if self.verbose:
             GenerativeModelTool._print_answer(output_summarization)
 
-        if output_format == "JSON":
-            output_instructions = OUTPUT_FORMAT_JSON
-        elif output_format == "TEXT":
-            output_instructions = OUTPUT_FORMAT_TEXT
-        else:
-            raise ValueError(
-                f"Unsupported output format: {output_format}, choose JSON or TEXT"
-            )
-
         created_before = patch.date_created if self.is_experiment_env else None
-        return PROMPT_TEMPLATE_REVIEW.format(
+        return FIRST_MESSAGE_TEMPLATE.format(
             patch=formatted_patch,
             patch_summarization=output_summarization,
             comment_examples=self._get_comment_examples(patch, created_before),
             approved_examples=self._get_generated_examples(patch, created_before),
-            target_code_consistency=self.target_software or "rest of the",
-            output_instructions=output_instructions,
-            bug_title=patch.bug_title,
-            patch_title=patch.patch_title,
-            patch_url=patch.patch_url,
-            target_software=self.target_software,
         )
 
-    def _generate_suggestions(self, patch: Patch):
+    def _generate_suggestions(self, patch: Patch) -> list[GeneratedReviewComment]:
         try:
             for chunk in self.agent.stream(
                 {
@@ -189,15 +210,13 @@ def _generate_suggestions(self, patch: Patch):
         except GraphRecursionError as e:
             raise ModelResultError("The model could not complete the review") from e
 
-        return result["messages"][-1].content
+        return result["structured_response"].comments
 
     def run(self, patch: Patch) -> list[InlineComment] | None:
         if self.count_tokens(patch.raw_diff) > 21000:
             raise LargeDiffError("The diff is too large")
 
-        output = self._generate_suggestions(patch)
-
-        unfiltered_suggestions = parse_model_output(output)
+        unfiltered_suggestions = self._generate_suggestions(patch)
         if not unfiltered_suggestions:
             logger.info("No suggestions were generated")
             return []
@@ -210,7 +229,9 @@ def run(self, patch: Patch) -> list[InlineComment] | None:
 
         raw_output = self.filtering_chain.invoke(
             {
-                "comments": output,
+                "comments": str(
+                    [comment.model_dump() for comment in unfiltered_suggestions]
+                ),
                 "rejected_examples": rejected_examples,
             },
             return_only_outputs=True,
@@ -300,7 +321,9 @@ def generate_formatted_patch_from_raw_hunk(raw_hunk, filename):
             for num, example in enumerate(comment_examples)
         )
 
-    def get_similar_rejected_comments(self, suggestions) -> Iterable[str]:
+    def get_similar_rejected_comments(
+        self, suggestions: list[GeneratedReviewComment]
+    ) -> Iterable[str]:
         if not self.suggestions_feedback_db:
             raise Exception("Suggestions feedback database is not available")
 
@@ -310,7 +333,7 @@ def get_similar_rejected_comments(self, suggestions) -> Iterable[str]:
         for suggestion in suggestions:
             similar_rejected_suggestions = (
                 self.suggestions_feedback_db.find_similar_rejected_suggestions(
-                    suggestion["comment"],
+                    suggestion.comment,
                     limit=num_examples_per_suggestion,
                     excluded_ids=seen_ids,
                 )
diff --git a/bugbug/tools/code_review/prompts.py b/bugbug/tools/code_review/prompts.py
index 5716f102d5..9b253c026b 100644
--- a/bugbug/tools/code_review/prompts.py
+++ b/bugbug/tools/code_review/prompts.py
@@ -13,122 +13,94 @@
 The summarization should have two parts:
     1. **Intent**: Describe the intent of the changes, what they are trying to achieve, and how they relate to the bug or feature request.
-    2. **Structure**: Describe the structure of the changes, including any new functions, classes, or modules introduced, and how they fit into the existing codebase.
+    2. **Solution**: Describe the solution implemented in the code changes, focusing on how the changes address the intent.
 
 Do not include any code in the summarization, only a description of the changes.
 
 **Bug title**:
+
 {bug_title}
+
 **Commit message**:
+
 {patch_title}
+{patch_description}
+
 **Diff**:
-{patch}"""
-
-PROMPT_TEMPLATE_REVIEW = """
-Generate high-quality code review comments for the patch provided below.
-
-
-
-
-**Analyze the Changes**:
-* Understand the intent and structure of the changes in the patch.
-* Use the provided summarization for context, but prioritize what's visible in the diff.
-
-
-
-**Identify Issues**:
-* Detect bugs, logical errors, performance concerns, security issues, or violations of the `{target_code_consistency}` coding standards.
-* Focus only on **new or changed lines** (lines beginning with `+`).
-* **Prioritize**: Security vulnerabilities > Functional bugs > Performance issues > Style/readability concerns.
-
-
-
-**Assess Confidence and Order**:
-* **Only include comments where you are at least 80% confident the issue is valid**.
-* **Sort the comments by descending confidence and importance**:
-    * Start with issues you are **certain are valid**.
-    * Also, prioritize important issues that you are **confident about**.
-    * Follow with issues that are **plausible but uncertain** (possible false positives).
-* **When uncertain, use available tools to verify before commenting**.
-* Assign each comment a numeric `order`, starting at 1.
-
-
-
-**Write Clear, Constructive Comments**:
-* Use **direct, declarative language**. State the problem definitively, then suggest the fix.
-* Keep comments **short and specific**.
-* Focus strictly on code-related concerns.
-* **Banned phrases**: "maybe", "might want to", "consider", "possibly", "could be", "you may want to".
-* **Use directive language**: "Fix", "Remove", "Change", "Add", "Validate", "Check" (not "Consider checking").
-* Avoid repeating what the code is doing unless it supports your critique.
-
-
-
-**Use available tools to verify concerns**:
-* Use tools to gather context when you suspect an issue but need verification.
-* Use `find_function_definition` to check if error handling or validation exists elsewhere.
-* Use `expand_context` to see if edge cases are handled in surrounding code.
-* **Do not suggest issues you cannot verify with available context and tools**.
-
-
-
-**Avoid Comments That**:
-* Refer to unmodified code (lines without a `+` prefix).
-* Ask for verification or confirmation (e.g., "Check if…", "Ensure that…").
-* Provide praise or restate obvious facts.
-* Focus on testing.
-* Point out issues that are already handled in the visible code.
-* Suggest problems based on assumptions without verifying the context.
-* Flag style preferences without clear `{target_code_consistency}` standard violations.
-
-
-
-
-{output_instructions}
-
+
+{patch}
+"""
+
+SYSTEM_PROMPT_TEMPLATE = """You are an expert {target_software} engineer tasked with analyzing a pull request and providing high-quality review comments. You will examine a code patch and generate constructive feedback focusing on potential issues in the changed code.
+
+## Instructions
+
+Follow this systematic approach to review the patch:
+
+**Step 1: Analyze the Changes**
+- Understand what the patch is trying to accomplish
+- Use the patch summary for context, but focus primarily on what you can see in the actual diff
+- Identify the intent and structure of the changes
+
+**Step 2: Identify Issues**
+- Look for bugs, logical errors, performance problems, security vulnerabilities, or violations of the coding standards
+- Focus ONLY on new or changed lines (lines that begin with `+`)
+- Never comment on unmodified code
+- Prioritize issues in this order: Security vulnerabilities > Functional bugs > Performance issues > Style/readability concerns
+
+**Step 3: Verify and Assess Confidence**
+- Use available tools when you need to verify concerns or gather additional context
+- Only include comments where you are at least 80% confident the issue is valid
+- When uncertain about an issue, use tools like `find_function_definition` or `expand_context` to verify before commenting
+- Do not suggest issues you cannot verify with available context
+
+**Step 4: Sort and Order Comments**
+- Sort comments by descending confidence and importance
+- Start with issues you are certain are valid and that are most critical
+- Assign each comment a numeric order starting at 1
+
+**Step 5: Write Clear, Constructive Comments**
+- Use direct, declarative language - state the problem definitively, then suggest the fix
+- Keep comments short and specific
+- Use directive language: "Fix", "Remove", "Change", "Add"
+- NEVER use these banned phrases: "maybe", "might want to", "consider", "possibly", "could be", "you may want to"
+- Focus strictly on code-related concerns
+
+## What NOT to Include
+
+Do not write comments that:
+- Refer to unmodified code (lines without a `+` prefix)
+- Ask for verification or confirmation (e.g., "Check if...", "Ensure that...")
+- Provide praise or restate obvious facts
+- Focus on testing concerns
+- Point out issues that are already handled in the visible code
+- Suggest problems based on assumptions without verifying the context
+- Flag style preferences without clear coding standard violations
+"""
-
-{comment_examples}
-{approved_examples}
-
-
-**Review Context**:
-Target Software: {target_software}
-Bug Title: {bug_title}
-Patch Title: {patch_title}
-Source URL: {patch_url}
-
+FIRST_MESSAGE_TEMPLATE = """Here is a summary of the patch:
 
 {patch_summarization}
 
-
-{patch}
-
-"""
-OUTPUT_FORMAT_JSON = """
-Respond only with a **JSON list**. Each object must contain the following fields:
+Here are examples of good code review comments to guide your style and approach:
 
-* `"file"`: The relative path to the file the comment applies to.
-* `"code_line"`: The number of the specific changed line of code that the comment refers to.
-* `"comment"`: A concise review comment.
-* `"explanation"`: A brief rationale for the comment, including how confident you are and why.
-* `"order"`: An integer representing the priority of the comment, with 1 being the highest confidence/importance.
-"""
+
+{comment_examples}
+{approved_examples}
+
-OUTPUT_FORMAT_TEXT = """
-Respond only with a **plain text list** with the following details:
-* `"filename"`: The relative path to the file the comment applies to.
-* `"line_number"`: The number of the specific changed line of code that the comment refers to.
-* `"comment"`: A concise review comment.
+Here is the patch you need to review:
-The format should be: filename:line_number "comment"
+
+{patch}
+
 """
diff --git a/bugbug/tools/core/llms.py b/bugbug/tools/core/llms.py
index 2c90360d12..d40aeb1999 100644
--- a/bugbug/tools/core/llms.py
+++ b/bugbug/tools/core/llms.py
@@ -44,7 +44,7 @@ def create_azureopenai_llm(temperature=0.2, top_p=None):
 
 
 def create_anthropic_llm(
-    temperature=0.2, top_p=None, model_name="claude-sonnet-4-5-20250929"
+    temperature=0.2, top_p=None, model_name="claude-sonnet-4-5-20250929", **kwargs
 ):
     from langchain_anthropic import ChatAnthropic
 
@@ -53,6 +53,7 @@ def create_anthropic_llm(
         api_key=get_secret("ANTHROPIC_API_KEY"),
         temperature=temperature,
         top_p=top_p,
+        **kwargs,
     )
 
diff --git a/bugbug/tools/core/platforms/base.py b/bugbug/tools/core/platforms/base.py
index 3e6b5142a8..c63102d780 100644
--- a/bugbug/tools/core/platforms/base.py
+++ b/bugbug/tools/core/platforms/base.py
@@ -51,6 +51,12 @@ def patch_title(self) -> str:
         """Return the title of the patch."""
         ...
 
+    @property
+    @abstractmethod
+    def patch_description(self) -> str:
+        """Return the description of the patch."""
+        ...
+
     @property
     @abstractmethod
     def patch_url(self) -> str:
diff --git a/bugbug/tools/core/platforms/phabricator.py b/bugbug/tools/core/platforms/phabricator.py
index 775cf294e1..8f0defdd57 100644
--- a/bugbug/tools/core/platforms/phabricator.py
+++ b/bugbug/tools/core/platforms/phabricator.py
@@ -291,6 +291,10 @@ def bug_title(self) -> str:
     def patch_title(self) -> str:
         return self._revision_metadata["fields"]["title"]
 
+    @property
+    def patch_description(self) -> str:
+        return self._revision_metadata["fields"].get("summary", "")
+
     @property
     def revision_id(self) -> int:
         return self._revision_metadata["id"]
diff --git a/bugbug/tools/core/platforms/swarm.py b/bugbug/tools/core/platforms/swarm.py
index d22517b89a..0fbf234d96 100644
--- a/bugbug/tools/core/platforms/swarm.py
+++ b/bugbug/tools/core/platforms/swarm.py
@@ -53,6 +53,10 @@ def date_created(self) -> datetime:
     def patch_title(self) -> str:
         raise NotImplementedError
 
+    @property
+    def patch_description(self) -> str:
+        raise NotImplementedError
+
     @cached_property
     def bug_title(self) -> str:
         raise NotImplementedError
diff --git a/mcp/src/bugbug_mcp/server.py b/mcp/src/bugbug_mcp/server.py
index 0ea6620eb2..c8223d85fc 100644
--- a/mcp/src/bugbug_mcp/server.py
+++ b/mcp/src/bugbug_mcp/server.py
@@ -55,7 +55,8 @@ async def patch_review(
     else:
         raise ValueError(f"Unsupported patch URL: {patch_url}")
 
-    return get_code_review_tool().generate_initial_prompt(patch, "TEXT")
+    # FIXME: add the system prompt as well
+    return get_code_review_tool().generate_initial_prompt(patch)
 
 
 def get_file(commit_hash, path):
diff --git a/scripts/code_review_tool_evaluator.py b/scripts/code_review_tool_evaluator.py
index 7057ccbdb1..085035d834 100644
--- a/scripts/code_review_tool_evaluator.py
+++ b/scripts/code_review_tool_evaluator.py
@@ -35,7 +35,6 @@
 from bugbug.tools.core.exceptions import ModelResultError
 from bugbug.vectordb import QdrantVectorDB
 
-code_review.TARGET_SOFTWARE = "Mozilla Firefox"
 VERBOSE_CODE_REVIEW = False
 
 
@@ -261,8 +260,7 @@ def get_file(commit_hash, path):
     tool_variants.append(
         (
             "Claude",
-            code_review.CodeReviewTool(
-                llm=llms.create_anthropic_llm(),
+            code_review.CodeReviewTool.create(
                 function_search=function_search,
                 review_comments_db=review_comments_db,
                 suggestions_feedback_db=suggestions_feedback_db,
@@ -272,11 +270,14 @@ def get_file(commit_hash, path):
     )
 
 if "gpt" in variants:
+    llm = llms.create_openai_llm()
     tool_variants.append(
         (
             "GPT",
-            code_review.CodeReviewTool(
-                llm=llms.create_openai_llm(),
+            code_review.CodeReviewTool.create(
+                llm=llm,
+                summarization_llm=llm,
+                filtering_llm=llm,
                 function_search=function_search,
                 review_comments_db=review_comments_db,
                 suggestions_feedback_db=suggestions_feedback_db,
diff --git a/scripts/code_review_tool_runner.py b/scripts/code_review_tool_runner.py
deleted file mode 100644
index 80fb7d2659..0000000000
--- a/scripts/code_review_tool_runner.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# -*- coding: utf-8 -*-
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this file,
-# You can obtain one at http://mozilla.org/MPL/2.0/.
-
-import argparse
-import sys
-
-from bugbug.code_search.function_search import function_search_classes
-from bugbug.tools import code_review
-from bugbug.tools.core import llms
-from bugbug.vectordb import QdrantVectorDB
-
-
-def run(args) -> None:
-    llm = llms.create_llm_from_args(args)
-
-    function_search = (
-        function_search_classes[args.function_search_type]()
-        if args.function_search_type is not None
-        else None
-    )
-    vector_db = QdrantVectorDB("diff_comments")
-    review_comments_db = code_review.ReviewCommentsDB(vector_db)
-    code_review_tool = code_review.CodeReviewTool(
-        llm,
-        function_search=function_search,
-        review_comments_db=review_comments_db,
-        show_patch_example=False,
-    )
-
-    review_data = code_review.review_data_classes[args.review_platform]()
-
-    revision = review_data.get_review_request_by_id(args.review_request_id)
-    patch = review_data.get_patch_by_id(revision.patch_id)
-
-    print(patch)
-    print(code_review_tool.run(patch))
-    input()
-
-
-def parse_args(args):
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument(
-        "--review_platform",
-        help="Review platform",
-        choices=list(code_review.review_data_classes.keys()),
-    )
-    parser.add_argument(
-        "--review_request_id",
-        help="Review request ID",
-    )
-    llms.create_llm_to_args(parser)
-    parser.add_argument(
-        "--function_search_type",
-        help="Function search tool",
-        choices=list(function_search_classes.keys()),
-    )
-    return parser.parse_args(args)
-
-
-if __name__ == "__main__":
-    args = parse_args(sys.argv[1:])
-    run(args)
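Usage sketch (illustrative, not part of the patch): with the runner script above deleted, the equivalent flow goes through the new `CodeReviewTool.create()` factory. This sketch reuses the data-access calls from the deleted `scripts/code_review_tool_runner.py` (`review_data_classes`, `get_review_request_by_id`, `get_patch_by_id`); the `"phabricator"` key and the request ID are assumed values for illustration only.

```python
# Illustrative sketch only: the platform key and request ID are assumptions.
from bugbug.tools import code_review

# create() now wires up the three LLMs itself: Claude Opus (with extended
# thinking enabled) for the review agent, and the default Claude Sonnet model
# for the summarization and filtering chains.
tool = code_review.CodeReviewTool.create()

# Fetch a patch the same way the deleted runner script did.
review_data = code_review.review_data_classes["phabricator"]()
revision = review_data.get_review_request_by_id("123456")
patch = review_data.get_patch_by_id(revision.patch_id)

# run() now yields comments parsed from the agent's structured AgentResponse
# (via ProviderStrategy), so the old parse_model_output() step is gone.
print(tool.run(patch))
```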