diff --git a/bugbug/tools/code_review/__init__.py b/bugbug/tools/code_review/__init__.py
index 0642c3f5b0..23f5ff7704 100644
--- a/bugbug/tools/code_review/__init__.py
+++ b/bugbug/tools/code_review/__init__.py
@@ -15,7 +15,7 @@
 """
 
 # Agent
-from bugbug.tools.code_review.agent import TARGET_SOFTWARE, CodeReviewTool
+from bugbug.tools.code_review.agent import CodeReviewTool
 
 # Databases
 from bugbug.tools.code_review.database import (
@@ -54,7 +54,6 @@
 __all__ = [
     # Agent
     "CodeReviewTool",
-    "TARGET_SOFTWARE",
     # Databases
     "EvaluationAction",
     "ReviewCommentsDB",
diff --git a/bugbug/tools/code_review/agent.py b/bugbug/tools/code_review/agent.py
index afe636f258..203d95e2ba 100644
--- a/bugbug/tools/code_review/agent.py
+++ b/bugbug/tools/code_review/agent.py
@@ -9,14 +9,16 @@
 import os
 from datetime import datetime
 from logging import getLogger
-from typing import Iterable, Literal, Optional
+from typing import Iterable, Optional
 
 from langchain.agents import create_agent
+from langchain.agents.structured_output import ProviderStrategy
 from langchain.chat_models import BaseChatModel
 from langchain.messages import HumanMessage
 from langchain_classic.chains import LLMChain
 from langchain_classic.prompts import PromptTemplate
 from langgraph.errors import GraphRecursionError
+from pydantic import BaseModel, Field
 from unidiff import PatchSet
 
 from bugbug.code_search.function_search import FunctionSearch
@@ -29,19 +31,17 @@
 )
 from bugbug.tools.code_review.prompts import (
     DEFAULT_REJECTED_EXAMPLES,
-    OUTPUT_FORMAT_JSON,
-    OUTPUT_FORMAT_TEXT,
+    FIRST_MESSAGE_TEMPLATE,
     PROMPT_TEMPLATE_FILTERING_ANALYSIS,
-    PROMPT_TEMPLATE_REVIEW,
     PROMPT_TEMPLATE_SUMMARIZATION,
     STATIC_COMMENT_EXAMPLES,
+    SYSTEM_PROMPT_TEMPLATE,
     TEMPLATE_COMMENT_EXAMPLE,
     TEMPLATE_PATCH_FROM_HUNK,
 )
 from bugbug.tools.code_review.utils import (
     format_patch_set,
     generate_processed_output,
-    parse_model_output,
 )
 from bugbug.tools.core.data_types import InlineComment
 from bugbug.tools.core.exceptions import LargeDiffError, ModelResultError
@@ -50,8 +50,27 @@
 
 logger = getLogger(__name__)
 
-# Global variable for target software
-TARGET_SOFTWARE: str | None = None
+
+class GeneratedReviewComment(BaseModel):
+    """A review comment generated by the code review agent."""
+
+    file: str = Field(description="The path to the file the comment applies to.")
+    code_line: int = Field(description="The line number that the comment refers to.")
+    comment: str = Field(description="The review comment.")
+    explanation: str = Field(
+        description="A brief rationale for the comment, including how confident you are and why."
+    )
+    order: int = Field(
+        description="An integer representing the priority of the comment, with 1 being the highest confidence/importance."
+    )
+
+
+class AgentResponse(BaseModel):
+    """The response from the code review agent."""
+
+    comments: list[GeneratedReviewComment] = Field(
+        description="A list of generated review comments."
+    )
 
 
 class CodeReviewTool(GenerativeModelTool):
@@ -60,16 +79,18 @@ class CodeReviewTool(GenerativeModelTool):
     def __init__(
         self,
         llm: BaseChatModel,
+        summarization_llm: BaseChatModel,
+        filtering_llm: BaseChatModel,
         function_search: Optional[FunctionSearch] = None,
         review_comments_db: Optional["ReviewCommentsDB"] = None,
         show_patch_example: bool = False,
         verbose: bool = True,
         suggestions_feedback_db: Optional["SuggestionsFeedbackDB"] = None,
-        target_software: Optional[str] = None,
+        target_software: str = "Mozilla Firefox",
     ) -> None:
         super().__init__()
 
-        self.target_software = target_software or TARGET_SOFTWARE
+        self.target_software = target_software
 
         self._tokenizer = get_tokenizer(
             llm.model_name if hasattr(llm, "model_name") else ""
@@ -87,28 +108,22 @@ def __init__(
             "----------------------------------------------------"
         )
 
-        experience_scope = (
-            f"the {self.target_software} source code"
-            if self.target_software
-            else "a software project"
-        )
-
         self.summarization_chain = LLMChain(
             prompt=PromptTemplate.from_template(
                 PROMPT_TEMPLATE_SUMMARIZATION,
-                partial_variables={"experience_scope": experience_scope},
+                partial_variables={
+                    "experience_scope": f"the {self.target_software} source code"
+                },
             ),
-            llm=llm,
+            llm=summarization_llm,
             verbose=verbose,
         )
 
         self.filtering_chain = LLMChain(
             prompt=PromptTemplate.from_template(
                 PROMPT_TEMPLATE_FILTERING_ANALYSIS,
-                partial_variables={
-                    "target_code_consistency": self.target_software or "rest of the"
-                },
+                partial_variables={"target_code_consistency": self.target_software},
            ),
-            llm=llm,
+            llm=filtering_llm,
             verbose=verbose,
         )
 
@@ -119,7 +134,10 @@ def __init__(
         self.agent = create_agent(
             llm,
             tools,
-            system_prompt=f"You are an expert reviewer for {experience_scope}, with experience on source code reviews.",
+            system_prompt=SYSTEM_PROMPT_TEMPLATE.format(
+                target_software=self.target_software,
+            ),
+            response_format=ProviderStrategy(AgentResponse),
         )
 
         self.review_comments_db = review_comments_db
@@ -130,12 +148,29 @@ def __init__(
 
         self.suggestions_feedback_db = suggestions_feedback_db
 
+    @staticmethod
+    def create(
+        llm=None, summarization_llm=None, filtering_llm=None, **kwargs
+    ) -> "CodeReviewTool":
+        from bugbug.tools.core.llms import create_anthropic_llm
+
+        return CodeReviewTool(
+            llm=llm
+            or create_anthropic_llm(
+                model_name="claude-opus-4-5-20251101",
+                max_tokens=40_000,
+                temperature=None,
+                thinking={"type": "enabled", "budget_tokens": 10_000},
+            ),
+            summarization_llm=summarization_llm or create_anthropic_llm(),
+            filtering_llm=filtering_llm or create_anthropic_llm(),
+            **kwargs,
+        )
+
     def count_tokens(self, text):
         return len(self._tokenizer.encode(text))
 
-    def generate_initial_prompt(
-        self, patch: Patch, output_format: Literal["JSON", "TEXT"] = "JSON"
-    ) -> str:
+    def generate_initial_prompt(self, patch: Patch) -> str:
         formatted_patch = format_patch_set(patch.patch_set)
 
         output_summarization = self.summarization_chain.invoke(
@@ -143,6 +178,7 @@ def generate_initial_prompt(
                 "patch": formatted_patch,
                 "bug_title": patch.bug_title,
                 "patch_title": patch.patch_title,
+                "patch_description": patch.patch_description,
             },
             return_only_outputs=True,
         )["text"]
@@ -150,30 +186,15 @@ def generate_initial_prompt(
         if self.verbose:
             GenerativeModelTool._print_answer(output_summarization)
 
-        if output_format == "JSON":
-            output_instructions = OUTPUT_FORMAT_JSON
-        elif output_format == "TEXT":
-            output_instructions = OUTPUT_FORMAT_TEXT
-        else:
-            raise ValueError(
-                f"Unsupported output format: {output_format}, choose JSON or TEXT"
-            )
-
         created_before = patch.date_created if self.is_experiment_env else None
-        return PROMPT_TEMPLATE_REVIEW.format(
+        return FIRST_MESSAGE_TEMPLATE.format(
             patch=formatted_patch,
             patch_summarization=output_summarization,
             comment_examples=self._get_comment_examples(patch, created_before),
             approved_examples=self._get_generated_examples(patch, created_before),
-            target_code_consistency=self.target_software or "rest of the",
-            output_instructions=output_instructions,
-            bug_title=patch.bug_title,
-            patch_title=patch.patch_title,
-            patch_url=patch.patch_url,
-            target_software=self.target_software,
         )
 
-    def _generate_suggestions(self, patch: Patch):
+    def _generate_suggestions(self, patch: Patch) -> list[GeneratedReviewComment]:
         try:
             for chunk in self.agent.stream(
                 {
@@ -189,15 +210,13 @@ def _generate_suggestions(self, patch: Patch):
         except GraphRecursionError as e:
             raise ModelResultError("The model could not complete the review") from e
 
-        return result["messages"][-1].content
+        return result["structured_response"].comments
 
     def run(self, patch: Patch) -> list[InlineComment] | None:
         if self.count_tokens(patch.raw_diff) > 21000:
             raise LargeDiffError("The diff is too large")
 
-        output = self._generate_suggestions(patch)
-
-        unfiltered_suggestions = parse_model_output(output)
+        unfiltered_suggestions = self._generate_suggestions(patch)
         if not unfiltered_suggestions:
             logger.info("No suggestions were generated")
             return []
@@ -210,7 +229,9 @@ def run(self, patch: Patch) -> list[InlineComment] | None:
 
         raw_output = self.filtering_chain.invoke(
             {
-                "comments": output,
+                "comments": str(
+                    [comment.model_dump() for comment in unfiltered_suggestions]
+                ),
                 "rejected_examples": rejected_examples,
             },
             return_only_outputs=True,
@@ -300,7 +321,9 @@ def generate_formatted_patch_from_raw_hunk(raw_hunk, filename):
             for num, example in enumerate(comment_examples)
         )
 
-    def get_similar_rejected_comments(self, suggestions) -> Iterable[str]:
+    def get_similar_rejected_comments(
+        self, suggestions: list[GeneratedReviewComment]
+    ) -> Iterable[str]:
         if not self.suggestions_feedback_db:
             raise Exception("Suggestions feedback database is not available")
 
@@ -310,7 +333,7 @@ def get_similar_rejected_comments(self, suggestions) -> Iterable[str]:
         for suggestion in suggestions:
             similar_rejected_suggestions = (
                 self.suggestions_feedback_db.find_similar_rejected_suggestions(
-                    suggestion["comment"],
+                    suggestion.comment,
                     limit=num_examples_per_suggestion,
                     excluded_ids=seen_ids,
                 )
diff --git a/bugbug/tools/code_review/prompts.py b/bugbug/tools/code_review/prompts.py
index 5716f102d5..9b253c026b 100644
--- a/bugbug/tools/code_review/prompts.py
+++ b/bugbug/tools/code_review/prompts.py
@@ -13,122 +13,94 @@
 The summarization should have two parts:
     1. **Intent**: Describe the intent of the changes, what they are trying to achieve, and how they relate to the bug or feature request.
-    2. **Structure**: Describe the structure of the changes, including any new functions, classes, or modules introduced, and how they fit into the existing codebase.
+    2. **Solution**: Describe the solution implemented in the code changes, focusing on how the changes address the intent.
 
 Do not include any code in the summarization, only a description of the changes.
 
 **Bug title**:
+
 {bug_title}
+
 **Commit message**:
+
 {patch_title}
+{patch_description}
+
 **Diff**:
-{patch}"""
-
-PROMPT_TEMPLATE_REVIEW = """
-Generate high-quality code review comments for the patch provided below.
-
-
-
-
-**Analyze the Changes**:
-* Understand the intent and structure of the changes in the patch.
-* Use the provided summarization for context, but prioritize what's visible in the diff.
-
-
-
-**Identify Issues**:
-* Detect bugs, logical errors, performance concerns, security issues, or violations of the `{target_code_consistency}` coding standards.
-* Focus only on **new or changed lines** (lines beginning with `+`).
-* **Prioritize**: Security vulnerabilities > Functional bugs > Performance issues > Style/readability concerns.
-
-
-
-**Assess Confidence and Order**:
-* **Only include comments where you are at least 80% confident the issue is valid**.
-* **Sort the comments by descending confidence and importance**:
-    * Start with issues you are **certain are valid**.
-    * Also, prioritize important issues that you are **confident about**.
-    * Follow with issues that are **plausible but uncertain** (possible false positives).
-* **When uncertain, use available tools to verify before commenting**.
-* Assign each comment a numeric `order`, starting at 1.
-
-
-
-**Write Clear, Constructive Comments**:
-* Use **direct, declarative language**. State the problem definitively, then suggest the fix.
-* Keep comments **short and specific**.
-* Focus strictly on code-related concerns.
-* **Banned phrases**: "maybe", "might want to", "consider", "possibly", "could be", "you may want to".
-* **Use directive language**: "Fix", "Remove", "Change", "Add", "Validate", "Check" (not "Consider checking").
-* Avoid repeating what the code is doing unless it supports your critique.
-
-
-
-**Use available tools to verify concerns**:
-* Use tools to gather context when you suspect an issue but need verification.
-* Use `find_function_definition` to check if error handling or validation exists elsewhere.
-* Use `expand_context` to see if edge cases are handled in surrounding code.
-* **Do not suggest issues you cannot verify with available context and tools**.
-
-
-
-**Avoid Comments That**:
-* Refer to unmodified code (lines without a `+` prefix).
-* Ask for verification or confirmation (e.g., "Check if…", "Ensure that…").
-* Provide praise or restate obvious facts.
-* Focus on testing.
-* Point out issues that are already handled in the visible code.
-* Suggest problems based on assumptions without verifying the context.
-* Flag style preferences without clear `{target_code_consistency}` standard violations.
-
-
-
-
-{output_instructions}
-
+
+{patch}
+"""
+
+SYSTEM_PROMPT_TEMPLATE = """You are an expert {target_software} engineer tasked with analyzing a pull request and providing high-quality review comments. You will examine a code patch and generate constructive feedback focusing on potential issues in the changed code.
+
+## Instructions
+
+Follow this systematic approach to review the patch:
+
+**Step 1: Analyze the Changes**
+- Understand what the patch is trying to accomplish
+- Use the patch summary for context, but focus primarily on what you can see in the actual diff
+- Identify the intent and structure of the changes
+
+**Step 2: Identify Issues**
+- Look for bugs, logical errors, performance problems, security vulnerabilities, or violations of the coding standards
+- Focus ONLY on new or changed lines (lines that begin with `+`)
+- Never comment on unmodified code
+- Prioritize issues in this order: Security vulnerabilities > Functional bugs > Performance issues > Style/readability concerns
+
+**Step 3: Verify and Assess Confidence**
+- Use available tools when you need to verify concerns or gather additional context
+- Only include comments where you are at least 80% confident the issue is valid
+- When uncertain about an issue, use tools like `find_function_definition` or `expand_context` to verify before commenting
+- Do not suggest issues you cannot verify with available context
+
+**Step 4: Sort and Order Comments**
+- Sort comments by descending confidence and importance
+- Start with issues you are certain are valid and that are most critical
+- Assign each comment a numeric order starting at 1
+
+**Step 5: Write Clear, Constructive Comments**
+- Use direct, declarative language - state the problem definitively, then suggest the fix
+- Keep comments short and specific
+- Use directive language: "Fix", "Remove", "Change", "Add"
+- NEVER use these banned phrases: "maybe", "might want to", "consider", "possibly", "could be", "you may want to"
+- Focus strictly on code-related concerns
+
+## What NOT to Include
+
+Do not write comments that:
+- Refer to unmodified code (lines without a `+` prefix)
+- Ask for verification or confirmation (e.g., "Check if...", "Ensure that...")
+- Provide praise or restate obvious facts
+- Focus on testing concerns
+- Point out issues that are already handled in the visible code
+- Suggest problems based on assumptions without verifying the context
+- Flag style preferences without clear coding standard violations
+"""
-
-{comment_examples}
-{approved_examples}
-
-
-**Review Context**:
-Target Software: {target_software}
-Bug Title: {bug_title}
-Patch Title: {patch_title}
-Source URL: {patch_url}
-
+FIRST_MESSAGE_TEMPLATE = """Here is a summary of the patch:
 
 {patch_summarization}
 
-
-{patch}
-
-"""
-OUTPUT_FORMAT_JSON = """
-Respond only with a **JSON list**. Each object must contain the following fields:
+Here are examples of good code review comments to guide your style and approach:
 
-* `"file"`: The relative path to the file the comment applies to.
-* `"code_line"`: The number of the specific changed line of code that the comment refers to.
-* `"comment"`: A concise review comment.
-* `"explanation"`: A brief rationale for the comment, including how confident you are and why.
-* `"order"`: An integer representing the priority of the comment, with 1 being the highest confidence/importance.
-"""
+
+{comment_examples}
+{approved_examples}
+
-OUTPUT_FORMAT_TEXT = """
-Respond only with a **plain text list** with the following details:
-* `"filename"`: The relative path to the file the comment applies to.
-* `"line_number"`: The number of the specific changed line of code that the comment refers to.
-* `"comment"`: A concise review comment.
+Here is the patch you need to review:
-The format should be: filename:line_number "comment"
+
+{patch}
+
 """
diff --git a/bugbug/tools/core/llms.py b/bugbug/tools/core/llms.py
index 2c90360d12..d40aeb1999 100644
--- a/bugbug/tools/core/llms.py
+++ b/bugbug/tools/core/llms.py
@@ -44,7 +44,7 @@ def create_azureopenai_llm(temperature=0.2, top_p=None):
 
 
 def create_anthropic_llm(
-    temperature=0.2, top_p=None, model_name="claude-sonnet-4-5-20250929"
+    temperature=0.2, top_p=None, model_name="claude-sonnet-4-5-20250929", **kwargs
 ):
     from langchain_anthropic import ChatAnthropic
 
@@ -53,6 +53,7 @@ def create_anthropic_llm(
         api_key=get_secret("ANTHROPIC_API_KEY"),
         temperature=temperature,
         top_p=top_p,
+        **kwargs,
     )
 
diff --git a/bugbug/tools/core/platforms/base.py b/bugbug/tools/core/platforms/base.py
index 3e6b5142a8..c63102d780 100644
--- a/bugbug/tools/core/platforms/base.py
+++ b/bugbug/tools/core/platforms/base.py
@@ -51,6 +51,12 @@ def patch_title(self) -> str:
         """Return the title of the patch."""
         ...
 
+    @property
+    @abstractmethod
+    def patch_description(self) -> str:
+        """Return the description of the patch."""
+        ...
+
     @property
     @abstractmethod
     def patch_url(self) -> str:
diff --git a/bugbug/tools/core/platforms/phabricator.py b/bugbug/tools/core/platforms/phabricator.py
index 775cf294e1..8f0defdd57 100644
--- a/bugbug/tools/core/platforms/phabricator.py
+++ b/bugbug/tools/core/platforms/phabricator.py
@@ -291,6 +291,10 @@ def bug_title(self) -> str:
     def patch_title(self) -> str:
         return self._revision_metadata["fields"]["title"]
 
+    @property
+    def patch_description(self) -> str:
+        return self._revision_metadata["fields"].get("summary", "")
+
     @property
     def revision_id(self) -> int:
         return self._revision_metadata["id"]
diff --git a/bugbug/tools/core/platforms/swarm.py b/bugbug/tools/core/platforms/swarm.py
index d22517b89a..0fbf234d96 100644
--- a/bugbug/tools/core/platforms/swarm.py
+++ b/bugbug/tools/core/platforms/swarm.py
@@ -53,6 +53,10 @@ def date_created(self) -> datetime:
     def patch_title(self) -> str:
         raise NotImplementedError
 
+    @property
+    def patch_description(self) -> str:
+        raise NotImplementedError
+
     @cached_property
     def bug_title(self) -> str:
         raise NotImplementedError
diff --git a/mcp/src/bugbug_mcp/server.py b/mcp/src/bugbug_mcp/server.py
index 0ea6620eb2..c8223d85fc 100644
--- a/mcp/src/bugbug_mcp/server.py
+++ b/mcp/src/bugbug_mcp/server.py
@@ -55,7 +55,8 @@ async def patch_review(
     else:
         raise ValueError(f"Unsupported patch URL: {patch_url}")
 
-    return get_code_review_tool().generate_initial_prompt(patch, "TEXT")
+    # FIXME: add the system prompt as well
+    return get_code_review_tool().generate_initial_prompt(patch)
 
 
 def get_file(commit_hash, path):
diff --git a/scripts/code_review_tool_evaluator.py b/scripts/code_review_tool_evaluator.py
index 7057ccbdb1..085035d834 100644
--- a/scripts/code_review_tool_evaluator.py
+++ b/scripts/code_review_tool_evaluator.py
@@ -35,7 +35,6 @@
 from bugbug.tools.core.exceptions import ModelResultError
 from bugbug.vectordb import QdrantVectorDB
 
-code_review.TARGET_SOFTWARE = "Mozilla Firefox"
 VERBOSE_CODE_REVIEW = False
 
 
@@ -261,8 +260,7 @@ def get_file(commit_hash, path):
     tool_variants.append(
         (
             "Claude",
-            code_review.CodeReviewTool(
-                llm=llms.create_anthropic_llm(),
+            code_review.CodeReviewTool.create(
                 function_search=function_search,
                 review_comments_db=review_comments_db,
                 suggestions_feedback_db=suggestions_feedback_db,
@@ -272,11 +270,14 @@ def get_file(commit_hash, path):
     )
 
 if "gpt" in variants:
+    llm = llms.create_openai_llm()
     tool_variants.append(
         (
             "GPT",
-            code_review.CodeReviewTool(
-                llm=llms.create_openai_llm(),
+            code_review.CodeReviewTool.create(
+                llm=llm,
+                summarization_llm=llm,
+                filtering_llm=llm,
                 function_search=function_search,
                 review_comments_db=review_comments_db,
                 suggestions_feedback_db=suggestions_feedback_db,
diff --git a/scripts/code_review_tool_runner.py b/scripts/code_review_tool_runner.py
deleted file mode 100644
index 80fb7d2659..0000000000
--- a/scripts/code_review_tool_runner.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# -*- coding: utf-8 -*-
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this file,
-# You can obtain one at http://mozilla.org/MPL/2.0/.
-
-import argparse
-import sys
-
-from bugbug.code_search.function_search import function_search_classes
-from bugbug.tools import code_review
-from bugbug.tools.core import llms
-from bugbug.vectordb import QdrantVectorDB
-
-
-def run(args) -> None:
-    llm = llms.create_llm_from_args(args)
-
-    function_search = (
-        function_search_classes[args.function_search_type]()
-        if args.function_search_type is not None
-        else None
-    )
-    vector_db = QdrantVectorDB("diff_comments")
-    review_comments_db = code_review.ReviewCommentsDB(vector_db)
-    code_review_tool = code_review.CodeReviewTool(
-        llm,
-        function_search=function_search,
-        review_comments_db=review_comments_db,
-        show_patch_example=False,
-    )
-
-    review_data = code_review.review_data_classes[args.review_platform]()
-
-    revision = review_data.get_review_request_by_id(args.review_request_id)
-    patch = review_data.get_patch_by_id(revision.patch_id)
-
-    print(patch)
-    print(code_review_tool.run(patch))
-    input()
-
-
-def parse_args(args):
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument(
-        "--review_platform",
-        help="Review platform",
-        choices=list(code_review.review_data_classes.keys()),
-    )
-    parser.add_argument(
-        "--review_request_id",
-        help="Review request ID",
-    )
-    llms.create_llm_to_args(parser)
-    parser.add_argument(
-        "--function_search_type",
-        help="Function search tool",
-        choices=list(function_search_classes.keys()),
-    )
-    return parser.parse_args(args)
-
-
-if __name__ == "__main__":
-    args = parse_args(sys.argv[1:])
-    run(args)
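Usage sketch (illustrative, not part of the patch): with the runner script above deleted, the equivalent flow goes through the new `CodeReviewTool.create()` factory. This sketch reuses the data-access calls from the deleted `scripts/code_review_tool_runner.py` (`review_data_classes`, `get_review_request_by_id`, `get_patch_by_id`); the `"phabricator"` key and the request ID are assumed values for illustration only.

```python
# Illustrative sketch only: the platform key and request ID are assumptions.
from bugbug.tools import code_review

# create() now wires up the three LLMs itself: Claude Opus (with extended
# thinking enabled) for the review agent, and the default Claude Sonnet model
# for the summarization and filtering chains.
tool = code_review.CodeReviewTool.create()

# Fetch a patch the same way the deleted runner script did.
review_data = code_review.review_data_classes["phabricator"]()
revision = review_data.get_review_request_by_id("123456")
patch = review_data.get_patch_by_id(revision.patch_id)

# run() now yields comments parsed from the agent's structured AgentResponse
# (via ProviderStrategy), so the old parse_model_output() step is gone.
print(tool.run(patch))
```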