3 changes: 1 addition & 2 deletions bugbug/tools/code_review/__init__.py
@@ -15,7 +15,7 @@
"""

# Agent
from bugbug.tools.code_review.agent import TARGET_SOFTWARE, CodeReviewTool
from bugbug.tools.code_review.agent import CodeReviewTool

# Databases
from bugbug.tools.code_review.database import (
@@ -54,7 +54,6 @@
__all__ = [
# Agent
"CodeReviewTool",
"TARGET_SOFTWARE",
# Databases
"EvaluationAction",
"ReviewCommentsDB",
121 changes: 72 additions & 49 deletions bugbug/tools/code_review/agent.py
@@ -9,14 +9,16 @@
import os
from datetime import datetime
from logging import getLogger
from typing import Iterable, Literal, Optional
from typing import Iterable, Optional

from langchain.agents import create_agent
from langchain.agents.structured_output import ProviderStrategy
from langchain.chat_models import BaseChatModel
from langchain.messages import HumanMessage
from langchain_classic.chains import LLMChain
from langchain_classic.prompts import PromptTemplate
from langgraph.errors import GraphRecursionError
from pydantic import BaseModel, Field
from unidiff import PatchSet

from bugbug.code_search.function_search import FunctionSearch
@@ -29,19 +31,17 @@
)
from bugbug.tools.code_review.prompts import (
DEFAULT_REJECTED_EXAMPLES,
OUTPUT_FORMAT_JSON,
OUTPUT_FORMAT_TEXT,
FIRST_MESSAGE_TEMPLATE,
PROMPT_TEMPLATE_FILTERING_ANALYSIS,
PROMPT_TEMPLATE_REVIEW,
PROMPT_TEMPLATE_SUMMARIZATION,
STATIC_COMMENT_EXAMPLES,
SYSTEM_PROMPT_TEMPLATE,
TEMPLATE_COMMENT_EXAMPLE,
TEMPLATE_PATCH_FROM_HUNK,
)
from bugbug.tools.code_review.utils import (
format_patch_set,
generate_processed_output,
parse_model_output,
)
from bugbug.tools.core.data_types import InlineComment
from bugbug.tools.core.exceptions import LargeDiffError, ModelResultError
@@ -50,8 +50,27 @@

logger = getLogger(__name__)

# Global variable for target software
TARGET_SOFTWARE: str | None = None

class GeneratedReviewComment(BaseModel):
"""A review comment generated by the code review agent."""

file: str = Field(description="The path to the file the comment applies to.")
code_line: int = Field(description="The line number that the comment refers to.")
comment: str = Field(description="The review comment.")
explanation: str = Field(
description="A brief rationale for the comment, including how confident you are and why."
)
order: int = Field(
description="An integer representing the priority of the comment, with 1 being the highest confidence/importance."
)


class AgentResponse(BaseModel):
"""The response from the code review agent."""

comments: list[GeneratedReviewComment] = Field(
description="A list of generated review comments."
)


class CodeReviewTool(GenerativeModelTool):
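Note: the two Pydantic models above are the contract for the agent's structured output. A minimal sketch of validating a payload against them; the payload itself is hypothetical, not from the PR:

from bugbug.tools.code_review.agent import AgentResponse

# Hypothetical provider output shaped like the schema above.
raw = {
    "comments": [
        {
            "file": "dom/base/Document.cpp",
            "code_line": 128,
            "comment": "The pointer is dereferenced before the null check.",
            "explanation": "High confidence: the guard below is unreachable.",
            "order": 1,
        }
    ]
}

response = AgentResponse.model_validate(raw)  # pydantic v2 checks types and required fields
assert response.comments[0].order == 1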
@@ -60,16 +79,18 @@ class CodeReviewTool(GenerativeModelTool):
def __init__(
self,
llm: BaseChatModel,
summarization_llm: BaseChatModel,
filtering_llm: BaseChatModel,
function_search: Optional[FunctionSearch] = None,
review_comments_db: Optional["ReviewCommentsDB"] = None,
show_patch_example: bool = False,
verbose: bool = True,
suggestions_feedback_db: Optional["SuggestionsFeedbackDB"] = None,
target_software: Optional[str] = None,
target_software: str = "Mozilla Firefox",
) -> None:
super().__init__()

self.target_software = target_software or TARGET_SOFTWARE
self.target_software = target_software

self._tokenizer = get_tokenizer(
llm.model_name if hasattr(llm, "model_name") else ""
@@ -87,28 +108,22 @@ def __init__(
"----------------------------------------------------"
)

experience_scope = (
f"the {self.target_software} source code"
if self.target_software
else "a software project"
)

self.summarization_chain = LLMChain(
prompt=PromptTemplate.from_template(
PROMPT_TEMPLATE_SUMMARIZATION,
partial_variables={"experience_scope": experience_scope},
partial_variables={
"experience_scope": f"the {self.target_software} source code"
},
),
llm=llm,
llm=summarization_llm,
verbose=verbose,
)
self.filtering_chain = LLMChain(
prompt=PromptTemplate.from_template(
PROMPT_TEMPLATE_FILTERING_ANALYSIS,
partial_variables={
"target_code_consistency": self.target_software or "rest of the"
},
partial_variables={"target_code_consistency": self.target_software},
),
llm=llm,
llm=filtering_llm,
verbose=verbose,
)

@@ -119,7 +134,10 @@ def __init__(
self.agent = create_agent(
llm,
tools,
system_prompt=f"You are an expert reviewer for {experience_scope}, with experience on source code reviews.",
system_prompt=SYSTEM_PROMPT_TEMPLATE.format(
target_software=self.target_software,
),
response_format=ProviderStrategy(AgentResponse),
)

self.review_comments_db = review_comments_db
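Note: with response_format=ProviderStrategy(AgentResponse), the provider enforces the schema natively, so the agent's final state carries a parsed AgentResponse instead of free text. A sketch of consuming it, assuming the LangChain v1 agent API used above; first_message is illustrative:

result = self.agent.invoke({"messages": [HumanMessage(content=first_message)]})
structured: AgentResponse = result["structured_response"]  # already validated
for c in sorted(structured.comments, key=lambda c: c.order):
    print(f"{c.file}:{c.code_line} -> {c.comment}")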
@@ -130,50 +148,53 @@ def __init__(

self.suggestions_feedback_db = suggestions_feedback_db

@staticmethod
def create(
llm=None, summarization_llm=None, filtering_llm=None, **kwargs
) -> "CodeReviewTool":
from bugbug.tools.core.llms import create_anthropic_llm

return CodeReviewTool(
llm=llm
or create_anthropic_llm(
model_name="claude-opus-4-5-20251101",
max_tokens=40_000,
temperature=None,
thinking={"type": "enabled", "budget_tokens": 10_000},
),
summarization_llm=summarization_llm or create_anthropic_llm(),
filtering_llm=filtering_llm or create_anthropic_llm(),
**kwargs,
)
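
Note: a hypothetical call site for the new factory; Anthropic credentials are assumed to be configured and `patch` is an existing Patch instance:

from bugbug.tools.code_review.agent import CodeReviewTool
from bugbug.tools.core.exceptions import LargeDiffError

tool = CodeReviewTool.create()  # Opus agent plus default Anthropic summarization/filtering LLMs
try:
    comments = tool.run(patch)  # may return [] when nothing survives filtering
except LargeDiffError:
    comments = None  # run() rejects diffs above its 21,000-token guard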

def count_tokens(self, text):
return len(self._tokenizer.encode(text))

def generate_initial_prompt(
self, patch: Patch, output_format: Literal["JSON", "TEXT"] = "JSON"
) -> str:
def generate_initial_prompt(self, patch: Patch) -> str:
formatted_patch = format_patch_set(patch.patch_set)

output_summarization = self.summarization_chain.invoke(
{
"patch": formatted_patch,
"bug_title": patch.bug_title,
"patch_title": patch.patch_title,
"patch_description": patch.patch_description,
},
return_only_outputs=True,
)["text"]

if self.verbose:
GenerativeModelTool._print_answer(output_summarization)

if output_format == "JSON":
output_instructions = OUTPUT_FORMAT_JSON
elif output_format == "TEXT":
output_instructions = OUTPUT_FORMAT_TEXT
else:
raise ValueError(
f"Unsupported output format: {output_format}, choose JSON or TEXT"
)

created_before = patch.date_created if self.is_experiment_env else None
return PROMPT_TEMPLATE_REVIEW.format(
return FIRST_MESSAGE_TEMPLATE.format(
patch=formatted_patch,
patch_summarization=output_summarization,
comment_examples=self._get_comment_examples(patch, created_before),
approved_examples=self._get_generated_examples(patch, created_before),
target_code_consistency=self.target_software or "rest of the",
output_instructions=output_instructions,
bug_title=patch.bug_title,
patch_title=patch.patch_title,
patch_url=patch.patch_url,
target_software=self.target_software,
)
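
Note: generate_initial_prompt() now only fills named fields; the old JSON/TEXT output_instructions branch is gone because the schema is enforced via response_format. A sketch of what a compatible template could look like, assuming only the placeholder names from the .format() call above; the wording is hypothetical, and the real FIRST_MESSAGE_TEMPLATE lives in prompts.py:

FIRST_MESSAGE_TEMPLATE_SKETCH = (
    "Review this {target_software} patch: {patch_title} ({patch_url})\n"
    "Bug: {bug_title}\n\n"
    "Summary of the patch:\n{patch_summarization}\n\n"
    "Diff:\n{patch}\n\n"
    "Example comments:\n{comment_examples}\n\n"
    "Previously approved comments:\n{approved_examples}\n"
)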

def _generate_suggestions(self, patch: Patch):
def _generate_suggestions(self, patch: Patch) -> list[GeneratedReviewComment]:
try:
for chunk in self.agent.stream(
{
Expand All @@ -189,15 +210,13 @@ def _generate_suggestions(self, patch: Patch):
except GraphRecursionError as e:
raise ModelResultError("The model could not complete the review") from e

return result["messages"][-1].content
return result["structured_response"].comments
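
Note: run() then serializes these models back to text for the filtering chain, as shown in the invoke call below. A sketch of that round-trip with a hypothetical comment:

from bugbug.tools.code_review.agent import GeneratedReviewComment

c = GeneratedReviewComment(
    file="js/src/jit/Ion.cpp",  # hypothetical values throughout
    code_line=7,
    comment="Consider reserving capacity before this loop.",
    explanation="Medium confidence: repeated appends may reallocate.",
    order=2,
)
comments_text = str([c.model_dump()])  # what filtering_chain.invoke receives as "comments"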

def run(self, patch: Patch) -> list[InlineComment] | None:
if self.count_tokens(patch.raw_diff) > 21000:
raise LargeDiffError("The diff is too large")

output = self._generate_suggestions(patch)

unfiltered_suggestions = parse_model_output(output)
unfiltered_suggestions = self._generate_suggestions(patch)
if not unfiltered_suggestions:
logger.info("No suggestions were generated")
return []
Expand All @@ -210,7 +229,9 @@ def run(self, patch: Patch) -> list[InlineComment] | None:

raw_output = self.filtering_chain.invoke(
{
"comments": output,
"comments": str(
[comment.model_dump() for comment in unfiltered_suggestions]
),
"rejected_examples": rejected_examples,
},
return_only_outputs=True,
@@ -300,7 +321,9 @@ def generate_formatted_patch_from_raw_hunk(raw_hunk, filename):
for num, example in enumerate(comment_examples)
)

def get_similar_rejected_comments(self, suggestions) -> Iterable[str]:
def get_similar_rejected_comments(
self, suggestions: list[GeneratedReviewComment]
) -> Iterable[str]:
if not self.suggestions_feedback_db:
raise Exception("Suggestions feedback database is not available")

@@ -310,7 +333,7 @@ def get_similar_rejected_comments(self, suggestions) -> Iterable[str]:
for suggestion in suggestions:
similar_rejected_suggestions = (
self.suggestions_feedback_db.find_similar_rejected_suggestions(
suggestion["comment"],
suggestion.comment,
limit=num_examples_per_suggestion,
excluded_ids=seen_ids,
)
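
Note: a knock-on effect of the structured output, visible in this hunk: suggestions are GeneratedReviewComment models rather than parsed-JSON dicts, so field lookups move from subscripting to attribute access, and misspelled fields fail at validation time instead of deep inside this loop:

# old: suggestion["comment"]   (dict produced by parse_model_output)
# new: suggestion.comment      (typed field on GeneratedReviewComment)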