diff --git a/bugbug/tools/code_review/__init__.py b/bugbug/tools/code_review/__init__.py
index 0642c3f5b0..23f5ff7704 100644
--- a/bugbug/tools/code_review/__init__.py
+++ b/bugbug/tools/code_review/__init__.py
@@ -15,7 +15,7 @@
"""
# Agent
-from bugbug.tools.code_review.agent import TARGET_SOFTWARE, CodeReviewTool
+from bugbug.tools.code_review.agent import CodeReviewTool
# Databases
from bugbug.tools.code_review.database import (
@@ -54,7 +54,6 @@
__all__ = [
# Agent
"CodeReviewTool",
- "TARGET_SOFTWARE",
# Databases
"EvaluationAction",
"ReviewCommentsDB",
diff --git a/bugbug/tools/code_review/agent.py b/bugbug/tools/code_review/agent.py
index afe636f258..203d95e2ba 100644
--- a/bugbug/tools/code_review/agent.py
+++ b/bugbug/tools/code_review/agent.py
@@ -9,14 +9,16 @@
import os
from datetime import datetime
from logging import getLogger
-from typing import Iterable, Literal, Optional
+from typing import Iterable, Optional
from langchain.agents import create_agent
+from langchain.agents.structured_output import ProviderStrategy
from langchain.chat_models import BaseChatModel
from langchain.messages import HumanMessage
from langchain_classic.chains import LLMChain
from langchain_classic.prompts import PromptTemplate
from langgraph.errors import GraphRecursionError
+from pydantic import BaseModel, Field
from unidiff import PatchSet
from bugbug.code_search.function_search import FunctionSearch
@@ -29,19 +31,17 @@
)
from bugbug.tools.code_review.prompts import (
DEFAULT_REJECTED_EXAMPLES,
- OUTPUT_FORMAT_JSON,
- OUTPUT_FORMAT_TEXT,
+ FIRST_MESSAGE_TEMPLATE,
PROMPT_TEMPLATE_FILTERING_ANALYSIS,
- PROMPT_TEMPLATE_REVIEW,
PROMPT_TEMPLATE_SUMMARIZATION,
STATIC_COMMENT_EXAMPLES,
+ SYSTEM_PROMPT_TEMPLATE,
TEMPLATE_COMMENT_EXAMPLE,
TEMPLATE_PATCH_FROM_HUNK,
)
from bugbug.tools.code_review.utils import (
format_patch_set,
generate_processed_output,
- parse_model_output,
)
from bugbug.tools.core.data_types import InlineComment
from bugbug.tools.core.exceptions import LargeDiffError, ModelResultError
@@ -50,8 +50,27 @@
logger = getLogger(__name__)
-# Global variable for target software
-TARGET_SOFTWARE: str | None = None
+
+class GeneratedReviewComment(BaseModel):
+ """A review comment generated by the code review agent."""
+
+    file: str = Field(
+        description="The relative path to the file the comment applies to."
+    )
+    code_line: int = Field(
+        description="The number of the changed line of code that the comment refers to."
+    )
+ comment: str = Field(description="The review comment.")
+ explanation: str = Field(
+ description="A brief rationale for the comment, including how confident you are and why."
+ )
+ order: int = Field(
+ description="An integer representing the priority of the comment, with 1 being the highest confidence/importance."
+ )
+
+
+class AgentResponse(BaseModel):
+ """The response from the code review agent."""
+
+ comments: list[GeneratedReviewComment] = Field(
+ description="A list of generated review comments."
+ )
class CodeReviewTool(GenerativeModelTool):
@@ -60,16 +79,18 @@ class CodeReviewTool(GenerativeModelTool):
def __init__(
self,
llm: BaseChatModel,
+ summarization_llm: BaseChatModel,
+ filtering_llm: BaseChatModel,
function_search: Optional[FunctionSearch] = None,
review_comments_db: Optional["ReviewCommentsDB"] = None,
show_patch_example: bool = False,
verbose: bool = True,
suggestions_feedback_db: Optional["SuggestionsFeedbackDB"] = None,
- target_software: Optional[str] = None,
+ target_software: str = "Mozilla Firefox",
) -> None:
super().__init__()
- self.target_software = target_software or TARGET_SOFTWARE
+ self.target_software = target_software
self._tokenizer = get_tokenizer(
llm.model_name if hasattr(llm, "model_name") else ""
@@ -87,28 +108,22 @@ def __init__(
"----------------------------------------------------"
)
- experience_scope = (
- f"the {self.target_software} source code"
- if self.target_software
- else "a software project"
- )
-
self.summarization_chain = LLMChain(
prompt=PromptTemplate.from_template(
PROMPT_TEMPLATE_SUMMARIZATION,
- partial_variables={"experience_scope": experience_scope},
+ partial_variables={
+ "experience_scope": f"the {self.target_software} source code"
+ },
),
- llm=llm,
+ llm=summarization_llm,
verbose=verbose,
)
self.filtering_chain = LLMChain(
prompt=PromptTemplate.from_template(
PROMPT_TEMPLATE_FILTERING_ANALYSIS,
- partial_variables={
- "target_code_consistency": self.target_software or "rest of the"
- },
+ partial_variables={"target_code_consistency": self.target_software},
),
- llm=llm,
+ llm=filtering_llm,
verbose=verbose,
)
@@ -119,7 +134,10 @@ def __init__(
self.agent = create_agent(
llm,
tools,
- system_prompt=f"You are an expert reviewer for {experience_scope}, with experience on source code reviews.",
+ system_prompt=SYSTEM_PROMPT_TEMPLATE.format(
+ target_software=self.target_software,
+ ),
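+            # ProviderStrategy uses the provider's native structured output to return an AgentResponse.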
+ response_format=ProviderStrategy(AgentResponse),
)
self.review_comments_db = review_comments_db
@@ -130,12 +148,29 @@ def __init__(
self.suggestions_feedback_db = suggestions_feedback_db
+ @staticmethod
+    def create(
+        llm: Optional[BaseChatModel] = None,
+        summarization_llm: Optional[BaseChatModel] = None,
+        filtering_llm: Optional[BaseChatModel] = None,
+        **kwargs,
+    ) -> "CodeReviewTool":
+ from bugbug.tools.core.llms import create_anthropic_llm
+
+ return CodeReviewTool(
+ llm=llm
+ or create_anthropic_llm(
+ model_name="claude-opus-4-5-20251101",
+ max_tokens=40_000,
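+                # Extended thinking is incompatible with a custom temperature, so leave it unset.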
+ temperature=None,
+ thinking={"type": "enabled", "budget_tokens": 10_000},
+ ),
+ summarization_llm=summarization_llm or create_anthropic_llm(),
+ filtering_llm=filtering_llm or create_anthropic_llm(),
+ **kwargs,
+ )
+
def count_tokens(self, text):
return len(self._tokenizer.encode(text))
- def generate_initial_prompt(
- self, patch: Patch, output_format: Literal["JSON", "TEXT"] = "JSON"
- ) -> str:
+ def generate_initial_prompt(self, patch: Patch) -> str:
formatted_patch = format_patch_set(patch.patch_set)
output_summarization = self.summarization_chain.invoke(
@@ -143,6 +178,7 @@ def generate_initial_prompt(
"patch": formatted_patch,
"bug_title": patch.bug_title,
"patch_title": patch.patch_title,
+ "patch_description": patch.patch_description,
},
return_only_outputs=True,
)["text"]
@@ -150,30 +186,15 @@ def generate_initial_prompt(
if self.verbose:
GenerativeModelTool._print_answer(output_summarization)
- if output_format == "JSON":
- output_instructions = OUTPUT_FORMAT_JSON
- elif output_format == "TEXT":
- output_instructions = OUTPUT_FORMAT_TEXT
- else:
- raise ValueError(
- f"Unsupported output format: {output_format}, choose JSON or TEXT"
- )
-
created_before = patch.date_created if self.is_experiment_env else None
- return PROMPT_TEMPLATE_REVIEW.format(
+ return FIRST_MESSAGE_TEMPLATE.format(
patch=formatted_patch,
patch_summarization=output_summarization,
comment_examples=self._get_comment_examples(patch, created_before),
approved_examples=self._get_generated_examples(patch, created_before),
- target_code_consistency=self.target_software or "rest of the",
- output_instructions=output_instructions,
- bug_title=patch.bug_title,
- patch_title=patch.patch_title,
- patch_url=patch.patch_url,
- target_software=self.target_software,
)
- def _generate_suggestions(self, patch: Patch):
+ def _generate_suggestions(self, patch: Patch) -> list[GeneratedReviewComment]:
try:
for chunk in self.agent.stream(
{
@@ -189,15 +210,13 @@ def _generate_suggestions(self, patch: Patch):
except GraphRecursionError as e:
raise ModelResultError("The model could not complete the review") from e
- return result["messages"][-1].content
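+        # With response_format set, the agent exposes the parsed AgentResponse under "structured_response".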
+ return result["structured_response"].comments
def run(self, patch: Patch) -> list[InlineComment] | None:
if self.count_tokens(patch.raw_diff) > 21000:
raise LargeDiffError("The diff is too large")
- output = self._generate_suggestions(patch)
-
- unfiltered_suggestions = parse_model_output(output)
+ unfiltered_suggestions = self._generate_suggestions(patch)
if not unfiltered_suggestions:
logger.info("No suggestions were generated")
return []
@@ -210,7 +229,9 @@ def run(self, patch: Patch) -> list[InlineComment] | None:
raw_output = self.filtering_chain.invoke(
{
- "comments": output,
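+                # Render the structured comments as text for the prompt-based filtering chain.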
+ "comments": str(
+ [comment.model_dump() for comment in unfiltered_suggestions]
+ ),
"rejected_examples": rejected_examples,
},
return_only_outputs=True,
@@ -300,7 +321,9 @@ def generate_formatted_patch_from_raw_hunk(raw_hunk, filename):
for num, example in enumerate(comment_examples)
)
- def get_similar_rejected_comments(self, suggestions) -> Iterable[str]:
+ def get_similar_rejected_comments(
+ self, suggestions: list[GeneratedReviewComment]
+ ) -> Iterable[str]:
if not self.suggestions_feedback_db:
raise Exception("Suggestions feedback database is not available")
@@ -310,7 +333,7 @@ def get_similar_rejected_comments(self, suggestions) -> Iterable[str]:
for suggestion in suggestions:
similar_rejected_suggestions = (
self.suggestions_feedback_db.find_similar_rejected_suggestions(
- suggestion["comment"],
+ suggestion.comment,
limit=num_examples_per_suggestion,
excluded_ids=seen_ids,
)
diff --git a/bugbug/tools/code_review/prompts.py b/bugbug/tools/code_review/prompts.py
index 5716f102d5..9b253c026b 100644
--- a/bugbug/tools/code_review/prompts.py
+++ b/bugbug/tools/code_review/prompts.py
@@ -13,122 +13,94 @@
The summarization should have two parts:
1. **Intent**: Describe the intent of the changes, what they are trying to achieve, and how they relate to the bug or feature request.
- 2. **Structure**: Describe the structure of the changes, including any new functions, classes, or modules introduced, and how they fit into the existing codebase.
+ 2. **Solution**: Describe the solution implemented in the code changes, focusing on how the changes address the intent.
Do not include any code in the summarization, only a description of the changes.
**Bug title**:
+
{bug_title}
+
**Commit message**:
+
{patch_title}
+{patch_description}
+
**Diff**:
-{patch}"""
-
-PROMPT_TEMPLATE_REVIEW = """
-Generate high-quality code review comments for the patch provided below.
-
-
-
-
-**Analyze the Changes**:
-* Understand the intent and structure of the changes in the patch.
-* Use the provided summarization for context, but prioritize what's visible in the diff.
-
-
-
-**Identify Issues**:
-* Detect bugs, logical errors, performance concerns, security issues, or violations of the `{target_code_consistency}` coding standards.
-* Focus only on **new or changed lines** (lines beginning with `+`).
-* **Prioritize**: Security vulnerabilities > Functional bugs > Performance issues > Style/readability concerns.
-
-
-
-**Assess Confidence and Order**:
-* **Only include comments where you are at least 80% confident the issue is valid**.
-* **Sort the comments by descending confidence and importance**:
- * Start with issues you are **certain are valid**.
- * Also, prioritize important issues that you are **confident about**.
- * Follow with issues that are **plausible but uncertain** (possible false positives).
-* **When uncertain, use available tools to verify before commenting**.
-* Assign each comment a numeric `order`, starting at 1.
-
-
-
-**Write Clear, Constructive Comments**:
-* Use **direct, declarative language**. State the problem definitively, then suggest the fix.
-* Keep comments **short and specific**.
-* Focus strictly on code-related concerns.
-* **Banned phrases**: "maybe", "might want to", "consider", "possibly", "could be", "you may want to".
-* **Use directive language**: "Fix", "Remove", "Change", "Add", "Validate", "Check" (not "Consider checking").
-* Avoid repeating what the code is doing unless it supports your critique.
-
-
-
-**Use available tools to verify concerns**:
-* Use tools to gather context when you suspect an issue but need verification.
-* Use `find_function_definition` to check if error handling or validation exists elsewhere.
-* Use `expand_context` to see if edge cases are handled in surrounding code.
-* **Do not suggest issues you cannot verify with available context and tools**.
-
-
-
-**Avoid Comments That**:
-* Refer to unmodified code (lines without a `+` prefix).
-* Ask for verification or confirmation (e.g., "Check if…", "Ensure that…").
-* Provide praise or restate obvious facts.
-* Focus on testing.
-* Point out issues that are already handled in the visible code.
-* Suggest problems based on assumptions without verifying the context.
-* Flag style preferences without clear `{target_code_consistency}` standard violations.
-
-
-
-
-{output_instructions}
-
+
+{patch}
+"""
+
+SYSTEM_PROMPT_TEMPLATE = """You are an expert {target_software} engineer tasked with analyzing a pull request and providing high-quality review comments. You will examine a code patch and generate constructive feedback focusing on potential issues in the changed code.
+
+## Instructions
+
+Follow this systematic approach to review the patch:
+
+**Step 1: Analyze the Changes**
+- Understand what the patch is trying to accomplish
+- Use the patch summary for context, but focus primarily on what you can see in the actual diff
+- Identify the intent and structure of the changes
+
+**Step 2: Identify Issues**
+- Look for bugs, logical errors, performance problems, security vulnerabilities, or violations of coding standards
+- Focus ONLY on new or changed lines (lines that begin with `+`)
+- Never comment on unmodified code
+- Prioritize issues in this order: Security vulnerabilities > Functional bugs > Performance issues > Style/readability concerns
+
+**Step 3: Verify and Assess Confidence**
+- Use available tools when you need to verify concerns or gather additional context
+- Only include comments where you are at least 80% confident the issue is valid
+- When uncertain about an issue, use tools like `find_function_definition` or `expand_context` to verify before commenting
+- Do not suggest issues you cannot verify with available context
+
+**Step 4: Sort and Order Comments**
+- Sort comments by descending confidence and importance
+- Start with issues you are certain are valid and that are most critical
+- Assign each comment a numeric order starting at 1
+
+**Step 5: Write Clear, Constructive Comments**
+- Use direct, declarative language - state the problem definitively, then suggest the fix
+- Keep comments short and specific
+- Use directive language: "Fix", "Remove", "Change", "Add"
+- NEVER use these banned phrases: "maybe", "might want to", "consider", "possibly", "could be", "you may want to"
+- Focus strictly on code-related concerns
+
+## What NOT to Include
+
+Do not write comments that:
+- Refer to unmodified code (lines without a `+` prefix)
+- Ask for verification or confirmation (e.g., "Check if...", "Ensure that...")
+- Provide praise or restate obvious facts
+- Focus on testing concerns
+- Point out issues that are already handled in the visible code
+- Suggest problems based on assumptions without verifying the context
+- Flag style preferences without clear coding standard violations
+"""
-
-{comment_examples}
-{approved_examples}
-
-
-**Review Context**:
-Target Software: {target_software}
-Bug Title: {bug_title}
-Patch Title: {patch_title}
-Source URL: {patch_url}
-
+FIRST_MESSAGE_TEMPLATE = """Here is a summary of the patch:
{patch_summarization}
-
-{patch}
-
-"""
-OUTPUT_FORMAT_JSON = """
-Respond only with a **JSON list**. Each object must contain the following fields:
-* `"file"`: The relative path to the file the comment applies to.
-* `"code_line"`: The number of the specific changed line of code that the comment refers to.
-* `"comment"`: A concise review comment.
-* `"explanation"`: A brief rationale for the comment, including how confident you are and why.
-* `"order"`: An integer representing the priority of the comment, with 1 being the highest confidence/importance.
-"""
-OUTPUT_FORMAT_TEXT = """
-Respond only with a **plain text list** with the following details:
-* `"filename"`: The relative path to the file the comment applies to.
-* `"line_number"`: The number of the specific changed line of code that the comment refers to.
-* `"comment"`: A concise review comment.
-The format should be: filename:line_number "comment"
+
+Here are examples of good code review comments to guide your style and approach:
+
+{comment_examples}
+{approved_examples}
+
+Here is the patch you need to review:
+
+{patch}
+
"""
diff --git a/bugbug/tools/core/llms.py b/bugbug/tools/core/llms.py
index 2c90360d12..d40aeb1999 100644
--- a/bugbug/tools/core/llms.py
+++ b/bugbug/tools/core/llms.py
@@ -44,7 +44,7 @@ def create_azureopenai_llm(temperature=0.2, top_p=None):
def create_anthropic_llm(
- temperature=0.2, top_p=None, model_name="claude-sonnet-4-5-20250929"
+ temperature=0.2, top_p=None, model_name="claude-sonnet-4-5-20250929", **kwargs
):
from langchain_anthropic import ChatAnthropic
@@ -53,6 +53,7 @@ def create_anthropic_llm(
api_key=get_secret("ANTHROPIC_API_KEY"),
temperature=temperature,
top_p=top_p,
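+        # Pass through any extra ChatAnthropic options (e.g., max_tokens, thinking).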
+ **kwargs,
)
diff --git a/bugbug/tools/core/platforms/base.py b/bugbug/tools/core/platforms/base.py
index 3e6b5142a8..c63102d780 100644
--- a/bugbug/tools/core/platforms/base.py
+++ b/bugbug/tools/core/platforms/base.py
@@ -51,6 +51,12 @@ def patch_title(self) -> str:
"""Return the title of the patch."""
...
+ @property
+ @abstractmethod
+ def patch_description(self) -> str:
+ """Return the description of the patch."""
+ ...
+
@property
@abstractmethod
def patch_url(self) -> str:
diff --git a/bugbug/tools/core/platforms/phabricator.py b/bugbug/tools/core/platforms/phabricator.py
index 775cf294e1..8f0defdd57 100644
--- a/bugbug/tools/core/platforms/phabricator.py
+++ b/bugbug/tools/core/platforms/phabricator.py
@@ -291,6 +291,10 @@ def bug_title(self) -> str:
def patch_title(self) -> str:
return self._revision_metadata["fields"]["title"]
+ @property
+ def patch_description(self) -> str:
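+        # Some revisions have no summary, so fall back to an empty string.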
+ return self._revision_metadata["fields"].get("summary", "")
+
@property
def revision_id(self) -> int:
return self._revision_metadata["id"]
diff --git a/bugbug/tools/core/platforms/swarm.py b/bugbug/tools/core/platforms/swarm.py
index d22517b89a..0fbf234d96 100644
--- a/bugbug/tools/core/platforms/swarm.py
+++ b/bugbug/tools/core/platforms/swarm.py
@@ -53,6 +53,10 @@ def date_created(self) -> datetime:
def patch_title(self) -> str:
raise NotImplementedError
+ @property
+ def patch_description(self) -> str:
+ raise NotImplementedError
+
@cached_property
def bug_title(self) -> str:
raise NotImplementedError
diff --git a/mcp/src/bugbug_mcp/server.py b/mcp/src/bugbug_mcp/server.py
index 0ea6620eb2..c8223d85fc 100644
--- a/mcp/src/bugbug_mcp/server.py
+++ b/mcp/src/bugbug_mcp/server.py
@@ -55,7 +55,8 @@ async def patch_review(
else:
raise ValueError(f"Unsupported patch URL: {patch_url}")
- return get_code_review_tool().generate_initial_prompt(patch, "TEXT")
+ # FIXME: add the system prompt as well
+ return get_code_review_tool().generate_initial_prompt(patch)
def get_file(commit_hash, path):
diff --git a/scripts/code_review_tool_evaluator.py b/scripts/code_review_tool_evaluator.py
index 7057ccbdb1..085035d834 100644
--- a/scripts/code_review_tool_evaluator.py
+++ b/scripts/code_review_tool_evaluator.py
@@ -35,7 +35,6 @@
from bugbug.tools.core.exceptions import ModelResultError
from bugbug.vectordb import QdrantVectorDB
-code_review.TARGET_SOFTWARE = "Mozilla Firefox"
VERBOSE_CODE_REVIEW = False
@@ -261,8 +260,7 @@ def get_file(commit_hash, path):
tool_variants.append(
(
"Claude",
- code_review.CodeReviewTool(
- llm=llms.create_anthropic_llm(),
+ code_review.CodeReviewTool.create(
function_search=function_search,
review_comments_db=review_comments_db,
suggestions_feedback_db=suggestions_feedback_db,
@@ -272,11 +270,14 @@ def get_file(commit_hash, path):
)
if "gpt" in variants:
+ llm = llms.create_openai_llm()
tool_variants.append(
(
"GPT",
- code_review.CodeReviewTool(
- llm=llms.create_openai_llm(),
+ code_review.CodeReviewTool.create(
+ llm=llm,
+ summarization_llm=llm,
+ filtering_llm=llm,
function_search=function_search,
review_comments_db=review_comments_db,
suggestions_feedback_db=suggestions_feedback_db,
diff --git a/scripts/code_review_tool_runner.py b/scripts/code_review_tool_runner.py
deleted file mode 100644
index 80fb7d2659..0000000000
--- a/scripts/code_review_tool_runner.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# -*- coding: utf-8 -*-
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this file,
-# You can obtain one at http://mozilla.org/MPL/2.0/.
-
-import argparse
-import sys
-
-from bugbug.code_search.function_search import function_search_classes
-from bugbug.tools import code_review
-from bugbug.tools.core import llms
-from bugbug.vectordb import QdrantVectorDB
-
-
-def run(args) -> None:
- llm = llms.create_llm_from_args(args)
-
- function_search = (
- function_search_classes[args.function_search_type]()
- if args.function_search_type is not None
- else None
- )
- vector_db = QdrantVectorDB("diff_comments")
- review_comments_db = code_review.ReviewCommentsDB(vector_db)
- code_review_tool = code_review.CodeReviewTool(
- llm,
- function_search=function_search,
- review_comments_db=review_comments_db,
- show_patch_example=False,
- )
-
- review_data = code_review.review_data_classes[args.review_platform]()
-
- revision = review_data.get_review_request_by_id(args.review_request_id)
- patch = review_data.get_patch_by_id(revision.patch_id)
-
- print(patch)
- print(code_review_tool.run(patch))
- input()
-
-
-def parse_args(args):
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter
- )
- parser.add_argument(
- "--review_platform",
- help="Review platform",
- choices=list(code_review.review_data_classes.keys()),
- )
- parser.add_argument(
- "--review_request_id",
- help="Review request ID",
- )
- llms.create_llm_to_args(parser)
- parser.add_argument(
- "--function_search_type",
- help="Function search tool",
- choices=list(function_search_classes.keys()),
- )
- return parser.parse_args(args)
-
-
-if __name__ == "__main__":
- args = parse_args(sys.argv[1:])
- run(args)