From 57f284966b9374b92fb29e7bc94458cb7e497f5a Mon Sep 17 00:00:00 2001 From: jessegrabowski Date: Fri, 31 Jan 2025 23:54:14 +0800 Subject: [PATCH 01/22] Implement `optimize.minimize` --- pytensor/tensor/optimize.py | 159 ++++++++++++++++++++++++++++++++++ tests/tensor/test_optimize.py | 26 ++++++ 2 files changed, 185 insertions(+) create mode 100644 pytensor/tensor/optimize.py create mode 100644 tests/tensor/test_optimize.py diff --git a/pytensor/tensor/optimize.py b/pytensor/tensor/optimize.py new file mode 100644 index 0000000000..44a4800fc3 --- /dev/null +++ b/pytensor/tensor/optimize.py @@ -0,0 +1,159 @@ +from copy import copy + +from scipy.optimize import minimize as scipy_minimize + +from pytensor import function +from pytensor.gradient import grad +from pytensor.graph import Apply, Constant, FunctionGraph, clone_replace +from pytensor.graph.basic import truncated_graph_inputs +from pytensor.graph.op import HasInnerGraph, Op +from pytensor.scalar import bool as scalar_bool + + +class MinimizeOp(Op, HasInnerGraph): + def __init__( + self, + x, + *args, + output, + method="BFGS", + jac=False, + options: dict | None = None, + debug: bool = False, + ): + self.fgraph = FunctionGraph([x, *args], [output]) + + if jac: + grad_wrt_x = grad(self.fgraph.outputs[0], self.fgraph.inputs[0]) + self.fgraph.add_output(grad_wrt_x) + + self.jac = jac + # self.hess = hess + self.method = method + self.options = options if options is not None else {} + self.debug = debug + self._fn = None + self._fn_wrapped = None + + def build_fn(self): + outputs = self.inner_outputs + if len(outputs) == 1: + outputs = outputs[0] + self._fn = fn = function(self.inner_inputs, outputs) + self.fgraph = ( + fn.maker.fgraph + ) # So we see the compiled graph ater the first call + + if self.inner_inputs[0].type.shape == (): + # Work-around for scipy changing the type of x + def fn_wrapper(x, *args): + return fn(x.squeeze(), *args) + + self._fn_wrapped = fn_wrapper + else: + self._fn_wrapped = fn + + @property + def fn(self): + if self._fn is None: + self.build_fn() + return self._fn + + @property + def fn_wrapped(self): + if self._fn_wrapped is None: + self.build_fn() + return self._fn_wrapped + + @property + def inner_inputs(self): + return self.fgraph.inputs + + @property + def inner_outputs(self): + return self.fgraph.outputs + + def clone(self): + copy_op = copy(self) + copy_op.fgraph = self.fgraph.clone() + return copy_op + + # def prepare_node(): + # # ... trigger the compilation of the inner fgraph so it shows in the dprint before the first call + # ... + + def make_node(self, *inputs): + # print(inputs) + assert len(inputs) == len(self.inner_inputs) + # Assert type is correct. + return Apply( + self, inputs, [self.inner_outputs[0].type(), scalar_bool("success")] + ) + + def perform(self, node, inputs, outputs): + f = self.fn_wrapped + x0, *args = inputs + + # print(f(*inputs)) + + res = scipy_minimize( + fun=f, + jac=self.jac, + x0=x0, + args=tuple(args), + method=self.method, + **self.options, + ) + if self.debug: + print(res) + outputs[0][0] = res.x + outputs[1][0] = res.success + + def L_op(self, inputs, outputs, output_grads): + x, *args = inputs + x_star, success = outputs + output_grad, _ = output_grads + + # x_root, stats = root(func, x0, args=[arg], tol=1e-8) + + inner_x, *inner_args = self.fgraph.inputs + inner_fx = self.fgraph.outputs[0] + + # f_x_star = clone_replace(inner_fx, replace={inner_x: x_star}) + + inner_grads = grad(inner_fx, [inner_x, *inner_args]) + + # TODO: Does clone replace do what we want? 
It might need a merge optimization pass afterwards + replace = dict(zip(self.fgraph.inputs, (x_star, *args), strict=True)) + grad_f_wrt_x_star, *grad_f_wrt_args = clone_replace( + inner_grads, replace=replace + ) + + # # TODO: If scipy optimizer uses hessian (or hessp), just store it from the inner function + # inner_hess = jacobian(inner_fx, inner_args) + # hess_f_x = clone_replace(inner_hess, replace=replace) + + grad_wrt_args = [ + -grad_f_wrt_arg / grad_f_wrt_x_star * output_grad + for grad_f_wrt_arg in grad_f_wrt_args + ] + + return [x.zeros_like(), *grad_wrt_args] + + +def minimize( + objective, x, jac: bool = True, debug: bool = False, options: dict | None = None +): + args = [ + arg + for arg in truncated_graph_inputs([objective], [x]) + if (arg is not x and not isinstance(arg, Constant)) + ] + # print(args) + minimize_op = MinimizeOp( + x, *args, output=objective, jac=jac, debug=debug, options=options + ) + return minimize_op(x, *args) + + +__all__ = ["minimize"] diff --git a/tests/tensor/test_optimize.py b/tests/tensor/test_optimize.py new file mode 100644 index 0000000000..09140a0fde --- /dev/null +++ b/tests/tensor/test_optimize.py @@ -0,0 +1,26 @@ +import pytensor.tensor as pt +from pytensor.tensor.optimize import minimize + + +def test_minimize(): + x = pt.scalar("x") + a = pt.scalar("a") + c = pt.scalar("c") + + b = a * 2 + b.name = "b" + out = (x - b * c) ** 2 + + minimized_x, success = minimize(out, x, debug=False) + + a_val = 2 + c_val = 3 + + assert success + assert minimized_x.eval({a: a_val, c: c_val, x: 0.0}) == (2 * a_val * c_val) + + x_grad, a_grad, c_grad = pt.grad(minimized_x, [x, a, c]) + + assert x_grad.eval({x: 0.0}) == 0.0 + assert a_grad.eval({a: a_val, c: c_val, x: 0.0}) == 2 * c_val + assert c_grad.eval({a: a_val, c: c_val, x: 0.0}) == 2 * a_val From b4ea4c0f5abfc977f0942d85f84fcdba16da3215 Mon Sep 17 00:00:00 2001 From: jessegrabowski Date: Sat, 1 Feb 2025 18:49:30 +0800 Subject: [PATCH 02/22] Add more tests --- tests/tensor/test_optimize.py | 49 +++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/tests/tensor/test_optimize.py b/tests/tensor/test_optimize.py index 09140a0fde..9082ad084b 100644 --- a/tests/tensor/test_optimize.py +++ b/tests/tensor/test_optimize.py @@ -1,8 +1,15 @@ +import numpy as np + import pytensor.tensor as pt +from pytensor import config from pytensor.tensor.optimize import minimize +from tests import unittest_tools as utt + +floatX = config.floatX -def test_minimize(): + +def test_simple_minimize(): x = pt.scalar("x") a = pt.scalar("a") c = pt.scalar("c") @@ -11,16 +18,42 @@ def test_minimize(): b.name = "b" out = (x - b * c) ** 2 - minimized_x, success = minimize(out, x, debug=False) + minimized_x, success = minimize(out, x) - a_val = 2 - c_val = 3 + a_val = 2.0 + c_val = 3.0 assert success assert minimized_x.eval({a: a_val, c: c_val, x: 0.0}) == (2 * a_val * c_val) - x_grad, a_grad, c_grad = pt.grad(minimized_x, [x, a, c]) + def f(x, a, b): + objective = (x - a * b) ** 2 + out = minimize(objective, x)[0] + return out + + utt.verify_grad(f, [0.0, a_val, c_val], eps=1e-6) + + +def test_minimize_vector_x(): + def rosenbrock_shifted_scaled(x, a, b): + return (a * (x[1:] - x[:-1] ** 2) ** 2 + (1 - x[:-1]) ** 2).sum() + b + + x = pt.dvector("x") + a = pt.scalar("a") + b = pt.scalar("b") + + objective = rosenbrock_shifted_scaled(x, a, b) + + minimized_x, success = minimize(objective, x, method="BFGS") + + a_val = 0.5 + b_val = 1.0 + x0 = np.zeros(5).astype(floatX) + x_star_val = 
minimized_x.eval({a: a_val, b: b_val, x: x0}) + + assert success + np.testing.assert_allclose( + x_star_val, np.ones_like(x_star_val), atol=1e-6, rtol=1e-6 + ) - assert x_grad.eval({x: 0.0}) == 0.0 - assert a_grad.eval({a: a_val, c: c_val, x: 0.0}) == 2 * c_val - assert c_grad.eval({a: a_val, c: c_val, x: 0.0}) == 2 * a_val + utt.verify_grad(rosenbrock_shifted_scaled, [x0, a_val, b_val], eps=1e-6) From 19100d0dde3dae958e38fe6192eede93dd2d65d5 Mon Sep 17 00:00:00 2001 From: jessegrabowski Date: Sat, 1 Feb 2025 18:54:15 +0800 Subject: [PATCH 03/22] Code cleanup --- pytensor/tensor/optimize.py | 77 ++++++++++++++++++++++++++++--------- 1 file changed, 59 insertions(+), 18 deletions(-) diff --git a/pytensor/tensor/optimize.py b/pytensor/tensor/optimize.py index 44a4800fc3..739d65613b 100644 --- a/pytensor/tensor/optimize.py +++ b/pytensor/tensor/optimize.py @@ -2,11 +2,11 @@ from scipy.optimize import minimize as scipy_minimize -from pytensor import function +from pytensor import function, graph_replace from pytensor.gradient import grad -from pytensor.graph import Apply, Constant, FunctionGraph, clone_replace +from pytensor.graph import Apply, Constant, FunctionGraph from pytensor.graph.basic import truncated_graph_inputs -from pytensor.graph.op import HasInnerGraph, Op +from pytensor.graph.op import ComputeMapType, HasInnerGraph, Op, StorageMapType from pytensor.scalar import bool as scalar_bool @@ -17,7 +17,9 @@ def __init__( *args, output, method="BFGS", - jac=False, + jac=True, + hess=False, + hessp=False, options: dict | None = None, debug: bool = False, ): @@ -28,7 +30,9 @@ def __init__( self.fgraph.add_output(grad_wrt_x) self.jac = jac - # self.hess = hess + self.hess = hess + self.hessp = hessp + self.method = method self.options = options if options is not None else {} self.debug = debug @@ -78,14 +82,19 @@ def clone(self): copy_op.fgraph = self.fgraph.clone() return copy_op - # def prepare_node(): - # # ... trigger the compilation of the inner fgraph so it shows in the dprint before the first call - # ... + def prepare_node( + self, + node: Apply, + storage_map: StorageMapType | None, + compute_map: ComputeMapType | None, + impl: str | None, + ): + """Trigger the compilation of the inner fgraph so it shows in the dprint before the first call""" + # TODO: Implemet this method def make_node(self, *inputs): - # print(inputs) assert len(inputs) == len(self.inner_inputs) - # Assert type is correct. + return Apply( self, inputs, [self.inner_outputs[0].type(), scalar_bool("success")] ) @@ -114,18 +123,14 @@ def L_op(self, inputs, outputs, output_grads): x_star, success = outputs output_grad, _ = output_grads - # x_root, stats = root(func, x0, args=[arg], tol=1e-8) - inner_x, *inner_args = self.fgraph.inputs inner_fx = self.fgraph.outputs[0] - # f_x_star = clone_replace(inner_fx, replace={inner_x: x_star}) - inner_grads = grad(inner_fx, [inner_x, *inner_args]) # TODO: Does clone replace do what we want? 
It might need a merge optimization pass afterwards replace = dict(zip(self.fgraph.inputs, (x_star, *args), strict=True)) - grad_f_wrt_x_star, *grad_f_wrt_args = clone_replace( + grad_f_wrt_x_star, *grad_f_wrt_args = graph_replace( inner_grads, replace=replace ) @@ -142,16 +147,52 @@ def L_op(self, inputs, outputs, output_grads): def minimize( - objective, x, jac: bool = True, debug: bool = False, options: dict | None = None + objective, + x, + method: str = "BFGS", + jac: bool = True, + debug: bool = False, + options: dict | None = None, ): + """ + Minimize a scalar objective function using scipy.optimize.minimize. + + Parameters + ---------- + objective : TensorVariable + The objective function to minimize. This should be a pytensor variable representing a scalar value. + + x : TensorVariable + The variable with respect to which the objective function is minimized. It must be an input to the + computational graph of `objective`. + + method : str, optional + The optimization method to use. Default is "BFGS". See scipy.optimize.minimize for other options. + + jac : bool, optional + Whether to compute and use the gradient of teh objective function with respect to x for optimization. + Default is True. + + debug : bool, optional + If True, prints raw scipy result after optimization. Default is False. + + **optimizer_kwargs + Additional keyword arguments to pass to scipy.optimize.minimize + + Returns + ------- + TensorVariable + The optimized value of x that minimizes the objective function. + + """ args = [ arg for arg in truncated_graph_inputs([objective], [x]) if (arg is not x and not isinstance(arg, Constant)) ] - # print(args) + minimize_op = MinimizeOp( - x, *args, output=objective, jac=jac, debug=debug, options=options + x, *args, output=objective, method=method, jac=jac, debug=debug, options=options ) return minimize_op(x, *args) From ff664db2046ee84b5629d024dcdbaf4ec7f7f761 Mon Sep 17 00:00:00 2001 From: jessegrabowski Date: Sun, 2 Feb 2025 00:09:38 +0800 Subject: [PATCH 04/22] Add RootOp, fix gradient tests (they are failing) --- pytensor/tensor/optimize.py | 203 ++++++++++++++++++++++++++-------- tests/tensor/test_optimize.py | 67 ++++++++++- 2 files changed, 223 insertions(+), 47 deletions(-) diff --git a/pytensor/tensor/optimize.py b/pytensor/tensor/optimize.py index 739d65613b..3982fee9db 100644 --- a/pytensor/tensor/optimize.py +++ b/pytensor/tensor/optimize.py @@ -1,55 +1,43 @@ +from collections.abc import Sequence from copy import copy from scipy.optimize import minimize as scipy_minimize +from scipy.optimize import root as scipy_root -from pytensor import function, graph_replace -from pytensor.gradient import grad +from pytensor import Variable, function, graph_replace +from pytensor.gradient import grad, jacobian from pytensor.graph import Apply, Constant, FunctionGraph from pytensor.graph.basic import truncated_graph_inputs from pytensor.graph.op import ComputeMapType, HasInnerGraph, Op, StorageMapType from pytensor.scalar import bool as scalar_bool +from pytensor.tensor.basic import atleast_2d +from pytensor.tensor.slinalg import solve +from pytensor.tensor.variable import TensorVariable -class MinimizeOp(Op, HasInnerGraph): - def __init__( - self, - x, - *args, - output, - method="BFGS", - jac=True, - hess=False, - hessp=False, - options: dict | None = None, - debug: bool = False, - ): - self.fgraph = FunctionGraph([x, *args], [output]) +class ScipyWrapperOp(Op, HasInnerGraph): + """Shared logic for scipy optimization ops""" - if jac: - grad_wrt_x = 
grad(self.fgraph.outputs[0], self.fgraph.inputs[0]) - self.fgraph.add_output(grad_wrt_x) - - self.jac = jac - self.hess = hess - self.hessp = hessp - - self.method = method - self.options = options if options is not None else {} - self.debug = debug - self._fn = None - self._fn_wrapped = None + __props__ = ("method", "debug") def build_fn(self): + """ + This is overloaded because scipy converts scalar inputs to lists, changing the return type. The + wrapper function logic is there to handle this. + """ + # TODO: Introduce rewrites to change MinimizeOp to MinimizeScalarOp and RootOp to RootScalarOp + # when x is scalar. That will remove the need for the wrapper. + outputs = self.inner_outputs if len(outputs) == 1: outputs = outputs[0] self._fn = fn = function(self.inner_inputs, outputs) - self.fgraph = ( - fn.maker.fgraph - ) # So we see the compiled graph ater the first call + + # Do this reassignment to see the compiled graph in the dprint + self.fgraph = fn.maker.fgraph if self.inner_inputs[0].type.shape == (): - # Work-around for scipy changing the type of x + def fn_wrapper(x, *args): return fn(x.squeeze(), *args) @@ -90,21 +78,51 @@ def prepare_node( impl: str | None, ): """Trigger the compilation of the inner fgraph so it shows in the dprint before the first call""" - # TODO: Implemet this method + self.build_fn() def make_node(self, *inputs): assert len(inputs) == len(self.inner_inputs) return Apply( - self, inputs, [self.inner_outputs[0].type(), scalar_bool("success")] + self, inputs, [self.inner_inputs[0].type(), scalar_bool("success")] ) + +class MinimizeOp(ScipyWrapperOp): + __props__ = ("method", "jac", "hess", "hessp", "debug") + + def __init__( + self, + x, + *args, + objective, + method="BFGS", + jac=True, + hess=False, + hessp=False, + options: dict | None = None, + debug: bool = False, + ): + self.fgraph = FunctionGraph([x, *args], [objective]) + + if jac: + grad_wrt_x = grad(self.fgraph.outputs[0], self.fgraph.inputs[0]) + self.fgraph.add_output(grad_wrt_x) + + self.jac = jac + self.hess = hess + self.hessp = hessp + + self.method = method + self.options = options if options is not None else {} + self.debug = debug + self._fn = None + self._fn_wrapped = None + def perform(self, node, inputs, outputs): f = self.fn_wrapped x0, *args = inputs - # print(f(*inputs)) - res = scipy_minimize( fun=f, jac=self.jac, @@ -113,8 +131,10 @@ def perform(self, node, inputs, outputs): method=self.method, **self.options, ) + if self.debug: print(res) + outputs[0][0] = res.x outputs[1][0] = res.success @@ -128,16 +148,12 @@ def L_op(self, inputs, outputs, output_grads): inner_grads = grad(inner_fx, [inner_x, *inner_args]) - # TODO: Does clone replace do what we want? 
It might need a merge optimization pass afterwards replace = dict(zip(self.fgraph.inputs, (x_star, *args), strict=True)) + grad_f_wrt_x_star, *grad_f_wrt_args = graph_replace( inner_grads, replace=replace ) - # # TODO: If scipy optimizer uses hessian (or hessp), just store it from the inner function - # inner_hess = jacobian(inner_fx, inner_args) - # hess_f_x = clone_replace(inner_hess, replace=replace) - grad_wrt_args = [ -grad_f_wrt_arg / grad_f_wrt_x_star * output_grad for grad_f_wrt_arg in grad_f_wrt_args @@ -192,9 +208,108 @@ def minimize( ] minimize_op = MinimizeOp( - x, *args, output=objective, method=method, jac=jac, debug=debug, options=options + x, + *args, + objective=objective, + method=method, + jac=jac, + debug=debug, + options=options, ) + return minimize_op(x, *args) -__all__ = ["minimize"] +class RootOp(ScipyWrapperOp): + __props__ = ("method", "jac", "debug") + + def __init__( + self, + variables, + *args, + equations, + method="hybr", + jac=True, + options: dict | None = None, + debug: bool = False, + ): + self.fgraph = FunctionGraph([variables, *args], [equations]) + + if jac: + jac_wrt_x = jacobian(self.fgraph.outputs[0], self.fgraph.inputs[0]) + self.fgraph.add_output(atleast_2d(jac_wrt_x)) + + self.jac = jac + + self.method = method + self.options = options if options is not None else {} + self.debug = debug + self._fn = None + self._fn_wrapped = None + + def perform(self, node, inputs, outputs): + f = self.fn_wrapped + variables, *args = inputs + + res = scipy_root( + fun=f, + jac=self.jac, + x0=variables, + args=tuple(args), + method=self.method, + **self.options, + ) + + if self.debug: + print(res) + + outputs[0][0] = res.x + outputs[1][0] = res.success + + def L_op( + self, + inputs: Sequence[Variable], + outputs: Sequence[Variable], + output_grads: Sequence[Variable], + ) -> list[Variable]: + # TODO: Broken + x, *args = inputs + x_star, success = outputs + output_grad, _ = output_grads + + inner_x, *inner_args = self.fgraph.inputs + inner_fx = self.fgraph.outputs[0] + + inner_jac = jacobian(inner_fx, [inner_x, *inner_args]) + + replace = dict(zip(self.fgraph.inputs, (x_star, *args), strict=True)) + jac_f_wrt_x_star, *jac_f_wrt_args = graph_replace(inner_jac, replace=replace) + + jac_wrt_args = solve(-jac_f_wrt_x_star, output_grad) + + return [x.zeros_like(), jac_wrt_args] + + +def root( + equations: TensorVariable, + variables: TensorVariable, + method: str = "hybr", + jac: bool = True, + debug: bool = False, +): + """Find roots of a system of equations using scipy.optimize.root.""" + + args = [ + arg + for arg in truncated_graph_inputs([equations], [variables]) + if (arg is not variables and not isinstance(arg, Constant)) + ] + + root_op = RootOp( + variables, *args, equations=equations, method=method, jac=jac, debug=debug + ) + + return root_op(variables, *args) + + +__all__ = ["minimize", "root"] diff --git a/tests/tensor/test_optimize.py b/tests/tensor/test_optimize.py index 9082ad084b..58e6693098 100644 --- a/tests/tensor/test_optimize.py +++ b/tests/tensor/test_optimize.py @@ -1,8 +1,9 @@ import numpy as np +import pytensor import pytensor.tensor as pt from pytensor import config -from pytensor.tensor.optimize import minimize +from pytensor.tensor.optimize import minimize, root from tests import unittest_tools as utt @@ -19,6 +20,7 @@ def test_simple_minimize(): out = (x - b * c) ** 2 minimized_x, success = minimize(out, x) + minimized_x.dprint() a_val = 2.0 c_val = 3.0 @@ -43,7 +45,6 @@ def rosenbrock_shifted_scaled(x, a, b): b = pt.scalar("b") objective 
= rosenbrock_shifted_scaled(x, a, b) - minimized_x, success = minimize(objective, x, method="BFGS") a_val = 0.5 @@ -56,4 +57,64 @@ def rosenbrock_shifted_scaled(x, a, b): x_star_val, np.ones_like(x_star_val), atol=1e-6, rtol=1e-6 ) - utt.verify_grad(rosenbrock_shifted_scaled, [x0, a_val, b_val], eps=1e-6) + def f(x, a, b): + objective = rosenbrock_shifted_scaled(x, a, b) + out = minimize(objective, x)[0] + return out + + utt.verify_grad(f, [x0, a_val, b_val], eps=1e-6) + + +def test_root_simple(): + x = pt.scalar("x") + a = pt.scalar("a") + + def fn(x, a): + return x + 2 * a * pt.cos(x) + + f = fn(x, a) + root_f, success = root(f, x) + func = pytensor.function([x, a], [root_f, success]) + + x0 = 0.0 + a_val = 1.0 + solution, success = func(x0, a_val) + + assert success + np.testing.assert_allclose(solution, -1.02986653, atol=1e-6, rtol=1e-6) + + def root_fn(x, a): + f = fn(x, a) + return root(f, x)[0] + + utt.verify_grad(root_fn, [x0, a_val], eps=1e-6) + + +def test_root_system_of_equations(): + x = pt.dvector("x") + a = pt.dvector("a") + b = pt.dvector("b") + + f = pt.stack([a[0] * x[0] * pt.cos(x[1]) - b[0], x[0] * x[1] - a[1] * x[1] - b[1]]) + + root_f, success = root(f, x, debug=True) + func = pytensor.function([x, a, b], [root_f, success]) + + x0 = np.array([1.0, 1.0]) + a_val = np.array([1.0, 1.0]) + b_val = np.array([4.0, 5.0]) + solution, success = func(x0, a_val, b_val) + + assert success + + np.testing.assert_allclose( + solution, np.array([6.50409711, 0.90841421]), atol=1e-6, rtol=1e-6 + ) + + def root_fn(x, a, b): + f = pt.stack( + [a[0] * x[0] * pt.cos(x[1]) - b[0], x[0] * x[1] - a[1] * x[1] - b[1]] + ) + return root(f, x)[0] + + utt.verify_grad(root_fn, [x0, a_val, b_val], eps=1e-6) From bd2065dd08ac60fcf5ece8b0481978b822b91e42 Mon Sep 17 00:00:00 2001 From: jessegrabowski Date: Sun, 2 Feb 2025 11:38:03 +0800 Subject: [PATCH 05/22] Correct gradients for `minimize` --- pytensor/tensor/optimize.py | 42 ++++++++++++++++++++++++++--------- tests/tensor/test_optimize.py | 1 - 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/pytensor/tensor/optimize.py b/pytensor/tensor/optimize.py index 3982fee9db..05132bb38a 100644 --- a/pytensor/tensor/optimize.py +++ b/pytensor/tensor/optimize.py @@ -5,12 +5,12 @@ from scipy.optimize import root as scipy_root from pytensor import Variable, function, graph_replace -from pytensor.gradient import grad, jacobian +from pytensor.gradient import DisconnectedType, grad, jacobian from pytensor.graph import Apply, Constant, FunctionGraph from pytensor.graph.basic import truncated_graph_inputs from pytensor.graph.op import ComputeMapType, HasInnerGraph, Op, StorageMapType from pytensor.scalar import bool as scalar_bool -from pytensor.tensor.basic import atleast_2d +from pytensor.tensor.basic import atleast_2d, concatenate from pytensor.tensor.slinalg import solve from pytensor.tensor.variable import TensorVariable @@ -146,18 +146,40 @@ def L_op(self, inputs, outputs, output_grads): inner_x, *inner_args = self.fgraph.inputs inner_fx = self.fgraph.outputs[0] - inner_grads = grad(inner_fx, [inner_x, *inner_args]) + implicit_f = grad(inner_fx, inner_x) - replace = dict(zip(self.fgraph.inputs, (x_star, *args), strict=True)) + df_dx = atleast_2d(concatenate(jacobian(implicit_f, [inner_x]), axis=-1)) - grad_f_wrt_x_star, *grad_f_wrt_args = graph_replace( - inner_grads, replace=replace + df_dtheta = concatenate( + [ + atleast_2d(x, left=False) + for x in jacobian(implicit_f, inner_args, disconnected_inputs="ignore") + ], + axis=-1, ) - 
grad_wrt_args = [ - -grad_f_wrt_arg / grad_f_wrt_x_star * output_grad - for grad_f_wrt_arg in grad_f_wrt_args - ] + replace = dict(zip(self.fgraph.inputs, (x_star, *args), strict=True)) + + df_dx_star, df_dtheta_star = graph_replace([df_dx, df_dtheta], replace=replace) + + grad_wrt_args_vector = solve(-df_dtheta_star, df_dx_star) + + cursor = 0 + grad_wrt_args = [] + + for output_grad, arg in zip(output_grads, args, strict=True): + arg_shape = arg.shape + arg_size = arg_shape.prod() + arg_grad = grad_wrt_args_vector[cursor : cursor + arg_size].reshape( + arg_shape + ) + + grad_wrt_args.append( + arg_grad * output_grad + if not isinstance(output_grad.type, DisconnectedType) + else DisconnectedType() + ) + cursor += arg_size return [x.zeros_like(), *grad_wrt_args] diff --git a/tests/tensor/test_optimize.py b/tests/tensor/test_optimize.py index 58e6693098..201aa664b6 100644 --- a/tests/tensor/test_optimize.py +++ b/tests/tensor/test_optimize.py @@ -20,7 +20,6 @@ def test_simple_minimize(): out = (x - b * c) ** 2 minimized_x, success = minimize(out, x) - minimized_x.dprint() a_val = 2.0 c_val = 3.0 From 9a1e29d9391cd4cf443905946e9bd1a51e5b0139 Mon Sep 17 00:00:00 2001 From: Jesse Grabowski Date: Fri, 6 Jun 2025 18:11:46 +0800 Subject: [PATCH 06/22] Mypy --- pytensor/tensor/optimize.py | 61 +++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 33 deletions(-) diff --git a/pytensor/tensor/optimize.py b/pytensor/tensor/optimize.py index 05132bb38a..b0bb413e96 100644 --- a/pytensor/tensor/optimize.py +++ b/pytensor/tensor/optimize.py @@ -1,5 +1,6 @@ from collections.abc import Sequence from copy import copy +from typing import cast from scipy.optimize import minimize as scipy_minimize from scipy.optimize import root as scipy_root @@ -10,7 +11,7 @@ from pytensor.graph.basic import truncated_graph_inputs from pytensor.graph.op import ComputeMapType, HasInnerGraph, Op, StorageMapType from pytensor.scalar import bool as scalar_bool -from pytensor.tensor.basic import atleast_2d, concatenate +from pytensor.tensor.basic import atleast_2d, concatenate, zeros_like from pytensor.tensor.slinalg import solve from pytensor.tensor.variable import TensorVariable @@ -18,8 +19,6 @@ class ScipyWrapperOp(Op, HasInnerGraph): """Shared logic for scipy optimization ops""" - __props__ = ("method", "debug") - def build_fn(self): """ This is overloaded because scipy converts scalar inputs to lists, changing the return type. 
The @@ -93,20 +92,22 @@ class MinimizeOp(ScipyWrapperOp): def __init__( self, - x, - *args, - objective, - method="BFGS", - jac=True, - hess=False, - hessp=False, - options: dict | None = None, + x: Variable, + *args: Variable, + objective: Variable, + method: str = "BFGS", + jac: bool = True, + hess: bool = False, + hessp: bool = False, + optimizer_kwargs: dict | None = None, debug: bool = False, ): self.fgraph = FunctionGraph([x, *args], [objective]) if jac: - grad_wrt_x = grad(self.fgraph.outputs[0], self.fgraph.inputs[0]) + grad_wrt_x = cast( + Variable, grad(self.fgraph.outputs[0], self.fgraph.inputs[0]) + ) self.fgraph.add_output(grad_wrt_x) self.jac = jac @@ -114,7 +115,7 @@ def __init__( self.hessp = hessp self.method = method - self.options = options if options is not None else {} + self.optimizer_kwargs = optimizer_kwargs if optimizer_kwargs is not None else {} self.debug = debug self._fn = None self._fn_wrapped = None @@ -132,9 +133,6 @@ def perform(self, node, inputs, outputs): **self.options, ) - if self.debug: - print(res) - outputs[0][0] = res.x outputs[1][0] = res.success @@ -185,12 +183,12 @@ def L_op(self, inputs, outputs, output_grads): def minimize( - objective, - x, + objective: TensorVariable, + x: TensorVariable, method: str = "BFGS", jac: bool = True, debug: bool = False, - options: dict | None = None, + optimizer_kwargs: dict | None = None, ): """ Minimize a scalar objective function using scipy.optimize.minimize. @@ -214,7 +212,7 @@ def minimize( debug : bool, optional If True, prints raw scipy result after optimization. Default is False. - **optimizer_kwargs + optimizer_kwargs Additional keyword arguments to pass to scipy.optimize.minimize Returns @@ -236,7 +234,7 @@ def minimize( method=method, jac=jac, debug=debug, - options=options, + optimizer_kwargs=optimizer_kwargs, ) return minimize_op(x, *args) @@ -247,12 +245,12 @@ class RootOp(ScipyWrapperOp): def __init__( self, - variables, - *args, - equations, - method="hybr", - jac=True, - options: dict | None = None, + variables: Variable, + *args: Variable, + equations: Variable, + method: str = "hybr", + jac: bool = True, + optimizer_kwargs: dict | None = None, debug: bool = False, ): self.fgraph = FunctionGraph([variables, *args], [equations]) @@ -264,7 +262,7 @@ def __init__( self.jac = jac self.method = method - self.options = options if options is not None else {} + self.optimizer_kwargs = optimizer_kwargs if optimizer_kwargs is not None else {} self.debug = debug self._fn = None self._fn_wrapped = None @@ -279,12 +277,9 @@ def perform(self, node, inputs, outputs): x0=variables, args=tuple(args), method=self.method, - **self.options, + **self.optimizer_kwargs, ) - if self.debug: - print(res) - outputs[0][0] = res.x outputs[1][0] = res.success @@ -309,7 +304,7 @@ def L_op( jac_wrt_args = solve(-jac_f_wrt_x_star, output_grad) - return [x.zeros_like(), jac_wrt_args] + return [zeros_like(x), jac_wrt_args] def root( From d6b654f715667919ff6cb356994188e0af80683c Mon Sep 17 00:00:00 2001 From: Jesse Grabowski Date: Fri, 6 Jun 2025 18:12:43 +0800 Subject: [PATCH 07/22] remove debug flag --- pytensor/tensor/optimize.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/pytensor/tensor/optimize.py b/pytensor/tensor/optimize.py index b0bb413e96..b56bd8c327 100644 --- a/pytensor/tensor/optimize.py +++ b/pytensor/tensor/optimize.py @@ -88,7 +88,7 @@ def make_node(self, *inputs): class MinimizeOp(ScipyWrapperOp): - __props__ = ("method", "jac", "hess", "hessp", "debug") + __props__ = 
("method", "jac", "hess", "hessp") def __init__( self, @@ -100,7 +100,6 @@ def __init__( hess: bool = False, hessp: bool = False, optimizer_kwargs: dict | None = None, - debug: bool = False, ): self.fgraph = FunctionGraph([x, *args], [objective]) @@ -116,7 +115,6 @@ def __init__( self.method = method self.optimizer_kwargs = optimizer_kwargs if optimizer_kwargs is not None else {} - self.debug = debug self._fn = None self._fn_wrapped = None @@ -187,7 +185,6 @@ def minimize( x: TensorVariable, method: str = "BFGS", jac: bool = True, - debug: bool = False, optimizer_kwargs: dict | None = None, ): """ @@ -209,9 +206,6 @@ def minimize( Whether to compute and use the gradient of teh objective function with respect to x for optimization. Default is True. - debug : bool, optional - If True, prints raw scipy result after optimization. Default is False. - optimizer_kwargs Additional keyword arguments to pass to scipy.optimize.minimize @@ -233,7 +227,6 @@ def minimize( objective=objective, method=method, jac=jac, - debug=debug, optimizer_kwargs=optimizer_kwargs, ) @@ -241,7 +234,7 @@ def minimize( class RootOp(ScipyWrapperOp): - __props__ = ("method", "jac", "debug") + __props__ = ("method", "jac") def __init__( self, @@ -251,7 +244,6 @@ def __init__( method: str = "hybr", jac: bool = True, optimizer_kwargs: dict | None = None, - debug: bool = False, ): self.fgraph = FunctionGraph([variables, *args], [equations]) @@ -263,7 +255,6 @@ def __init__( self.method = method self.optimizer_kwargs = optimizer_kwargs if optimizer_kwargs is not None else {} - self.debug = debug self._fn = None self._fn_wrapped = None @@ -312,7 +303,6 @@ def root( variables: TensorVariable, method: str = "hybr", jac: bool = True, - debug: bool = False, ): """Find roots of a system of equations using scipy.optimize.root.""" @@ -322,9 +312,7 @@ def root( if (arg is not variables and not isinstance(arg, Constant)) ] - root_op = RootOp( - variables, *args, equations=equations, method=method, jac=jac, debug=debug - ) + root_op = RootOp(variables, *args, equations=equations, method=method, jac=jac) return root_op(variables, *args) From 6f32356a8007fa8e0b7f430954ebb083430571b3 Mon Sep 17 00:00:00 2001 From: Jesse Grabowski Date: Fri, 6 Jun 2025 23:24:16 +0800 Subject: [PATCH 08/22] minimize works --- pytensor/tensor/optimize.py | 44 +++++++++++++++++++++-------------- tests/tensor/test_optimize.py | 13 +++++++---- 2 files changed, 36 insertions(+), 21 deletions(-) diff --git a/pytensor/tensor/optimize.py b/pytensor/tensor/optimize.py index b56bd8c327..be768964dd 100644 --- a/pytensor/tensor/optimize.py +++ b/pytensor/tensor/optimize.py @@ -2,16 +2,19 @@ from copy import copy from typing import cast +import numpy as np from scipy.optimize import minimize as scipy_minimize from scipy.optimize import root as scipy_root from pytensor import Variable, function, graph_replace -from pytensor.gradient import DisconnectedType, grad, jacobian +from pytensor.gradient import grad, jacobian from pytensor.graph import Apply, Constant, FunctionGraph -from pytensor.graph.basic import truncated_graph_inputs +from pytensor.graph.basic import graph_inputs, truncated_graph_inputs from pytensor.graph.op import ComputeMapType, HasInnerGraph, Op, StorageMapType from pytensor.scalar import bool as scalar_bool +from pytensor.tensor import dot from pytensor.tensor.basic import atleast_2d, concatenate, zeros_like +from pytensor.tensor.blockwise import Blockwise from pytensor.tensor.slinalg import solve from pytensor.tensor.variable import TensorVariable @@ -33,7 
+36,7 @@ def build_fn(self): self._fn = fn = function(self.inner_inputs, outputs) # Do this reassignment to see the compiled graph in the dprint - self.fgraph = fn.maker.fgraph + # self.fgraph = fn.maker.fgraph if self.inner_inputs[0].type.shape == (): @@ -128,11 +131,11 @@ def perform(self, node, inputs, outputs): x0=x0, args=tuple(args), method=self.method, - **self.options, + **self.optimizer_kwargs, ) outputs[0][0] = res.x - outputs[1][0] = res.success + outputs[1][0] = np.bool_(res.success) def L_op(self, inputs, outputs, output_grads): x, *args = inputs @@ -158,26 +161,22 @@ def L_op(self, inputs, outputs, output_grads): df_dx_star, df_dtheta_star = graph_replace([df_dx, df_dtheta], replace=replace) - grad_wrt_args_vector = solve(-df_dtheta_star, df_dx_star) + grad_wrt_args_vector = solve(-df_dx_star, df_dtheta_star) cursor = 0 grad_wrt_args = [] - for output_grad, arg in zip(output_grads, args, strict=True): + for arg in args: arg_shape = arg.shape arg_size = arg_shape.prod() - arg_grad = grad_wrt_args_vector[cursor : cursor + arg_size].reshape( - arg_shape + arg_grad = grad_wrt_args_vector[:, cursor : cursor + arg_size].reshape( + (*x_star.shape, *arg_shape) ) - grad_wrt_args.append( - arg_grad * output_grad - if not isinstance(output_grad.type, DisconnectedType) - else DisconnectedType() - ) + grad_wrt_args.append(dot(output_grad, arg_grad)) cursor += arg_size - return [x.zeros_like(), *grad_wrt_args] + return [zeros_like(x), *grad_wrt_args] def minimize( @@ -217,7 +216,7 @@ def minimize( """ args = [ arg - for arg in truncated_graph_inputs([objective], [x]) + for arg in graph_inputs([objective], [x]) if (arg is not x and not isinstance(arg, Constant)) ] @@ -230,7 +229,18 @@ def minimize( optimizer_kwargs=optimizer_kwargs, ) - return minimize_op(x, *args) + input_core_ndim = [var.ndim for var in minimize_op.inner_inputs] + input_signatures = [ + f'({",".join(f"i{i}{n}" for n in range(ndim))})' + for i, ndim in enumerate(input_core_ndim) + ] + + # Output dimensions are always the same as the first input (the initial values for the optimizer), + # then a scalar for the success flag + output_signatures = [input_signatures[0], "()"] + + signature = f"{','.join(input_signatures)}->{','.join(output_signatures)}" + return Blockwise(minimize_op, signature=signature)(x, *args) class RootOp(ScipyWrapperOp): diff --git a/tests/tensor/test_optimize.py b/tests/tensor/test_optimize.py index 201aa664b6..f8ac50e0e6 100644 --- a/tests/tensor/test_optimize.py +++ b/tests/tensor/test_optimize.py @@ -2,7 +2,7 @@ import pytensor import pytensor.tensor as pt -from pytensor import config +from pytensor import config, function from pytensor.tensor.optimize import minimize, root from tests import unittest_tools as utt @@ -24,8 +24,12 @@ def test_simple_minimize(): a_val = 2.0 c_val = 3.0 - assert success - assert minimized_x.eval({a: a_val, c: c_val, x: 0.0}) == (2 * a_val * c_val) + f = function([a, c, x], [minimized_x, success]) + + minimized_x_val, success_val = f(a_val, c_val, 0.0) + + assert success_val + assert minimized_x_val == (2 * a_val * c_val) def f(x, a, b): objective = (x - a * b) ** 2 @@ -51,7 +55,8 @@ def rosenbrock_shifted_scaled(x, a, b): x0 = np.zeros(5).astype(floatX) x_star_val = minimized_x.eval({a: a_val, b: b_val, x: x0}) - assert success + assert success.eval({a: a_val, b: b_val, x: x0}) + np.testing.assert_allclose( x_star_val, np.ones_like(x_star_val), atol=1e-6, rtol=1e-6 ) From 67f0c86ba0fc0b86643a174f89ecc9cd11f97a00 Mon Sep 17 00:00:00 2001 From: Jesse Grabowski Date: Fri, 6 
Jun 2025 23:42:09 +0800 Subject: [PATCH 09/22] Implement minimize_scalar --- pytensor/tensor/optimize.py | 99 +++++++++++++++++++++++++++++++++++ tests/tensor/test_optimize.py | 31 ++++++++++- 2 files changed, 129 insertions(+), 1 deletion(-) diff --git a/pytensor/tensor/optimize.py b/pytensor/tensor/optimize.py index be768964dd..6662495c7d 100644 --- a/pytensor/tensor/optimize.py +++ b/pytensor/tensor/optimize.py @@ -4,6 +4,7 @@ import numpy as np from scipy.optimize import minimize as scipy_minimize +from scipy.optimize import minimize_scalar as scipy_minimize_scalar from scipy.optimize import root as scipy_root from pytensor import Variable, function, graph_replace @@ -90,6 +91,104 @@ def make_node(self, *inputs): ) +class MinimizeScalarOp(ScipyWrapperOp): + __props__ = ("method",) + + def __init__( + self, + x: Variable, + *args: Variable, + objective: Variable, + method: str = "brent", + optimizer_kwargs: dict | None = None, + ): + self.fgraph = FunctionGraph([x, *args], [objective]) + + self.method = method + self.optimizer_kwargs = optimizer_kwargs if optimizer_kwargs is not None else {} + self._fn = None + self._fn_wrapped = None + + def perform(self, node, inputs, outputs): + f = self.fn_wrapped + x0, *args = inputs + + res = scipy_minimize_scalar( + fun=f, + args=tuple(args), + method=self.method, + **self.optimizer_kwargs, + ) + + outputs[0][0] = np.array(res.x) + outputs[1][0] = np.bool_(res.success) + + def L_op(self, inputs, outputs, output_grads): + x, *args = inputs + x_star, _ = outputs + output_grad, _ = output_grads + + inner_x, *inner_args = self.fgraph.inputs + inner_fx = self.fgraph.outputs[0] + + implicit_f = grad(inner_fx, inner_x) + df_dx = grad(implicit_f, inner_x) + + df_dthetas = [ + grad(implicit_f, arg, disconnected_inputs="ignore") for arg in inner_args + ] + + replace = dict(zip(self.fgraph.inputs, (x_star, *args), strict=True)) + df_dx_star, *df_dthetas_stars = graph_replace( + [df_dx, *df_dthetas], replace=replace + ) + + grad_wrt_args = [ + (-df_dtheta_star / df_dx_star) * output_grad + for df_dtheta_star in df_dthetas_stars + ] + + return [zeros_like(x), *grad_wrt_args] + + +def minimize_scalar( + objective: TensorVariable, + x: TensorVariable, + method: str = "brent", + optimizer_kwargs: dict | None = None, +): + """ + Minimize a scalar objective function using scipy.optimize.minimize_scalar. 
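+
+    Parameters
+    ----------
+    objective : TensorVariable
+        Scalar objective to be minimized. It should be a symbolic expression that depends on `x`.
+    x : TensorVariable
+        Scalar variable with respect to which the objective is minimized. It must be an input to the
+        computational graph of `objective`.
+    method : str, optional
+        The minimization method to use. Default is "brent". See scipy.optimize.minimize_scalar for other options.
+    optimizer_kwargs : dict, optional
+        Additional keyword arguments to pass to scipy.optimize.minimize_scalar.
+
+    Returns
+    -------
+    solution : TensorVariable
+        The value of `x` that minimizes the objective function.
+    success : TensorVariable
+        Boolean flag indicating whether the optimizer reported success.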
+ """ + + args = [ + arg + for arg in graph_inputs([objective], [x]) + if (arg is not x and not isinstance(arg, Constant)) + ] + + minimize_scalar_op = MinimizeScalarOp( + x, + *args, + objective=objective, + method=method, + optimizer_kwargs=optimizer_kwargs, + ) + + input_core_ndim = [var.ndim for var in minimize_scalar_op.inner_inputs] + input_signatures = [ + f'({",".join(f"i{i}{n}" for n in range(ndim))})' + for i, ndim in enumerate(input_core_ndim) + ] + + # Output dimensions are always the same as the first input (the initial values for the optimizer), + # then a scalar for the success flag + output_signatures = [input_signatures[0], "()"] + + signature = f"{','.join(input_signatures)}->{','.join(output_signatures)}" + return Blockwise(minimize_scalar_op, signature=signature)(x, *args) + + class MinimizeOp(ScipyWrapperOp): __props__ = ("method", "jac", "hess", "hessp") diff --git a/tests/tensor/test_optimize.py b/tests/tensor/test_optimize.py index f8ac50e0e6..ae3a5caeab 100644 --- a/tests/tensor/test_optimize.py +++ b/tests/tensor/test_optimize.py @@ -3,13 +3,42 @@ import pytensor import pytensor.tensor as pt from pytensor import config, function -from pytensor.tensor.optimize import minimize, root +from pytensor.tensor.optimize import minimize, minimize_scalar, root from tests import unittest_tools as utt floatX = config.floatX +def test_minimize_scalar(): + x = pt.scalar("x") + a = pt.scalar("a") + c = pt.scalar("c") + + b = a * 2 + b.name = "b" + out = (x - b * c) ** 2 + + minimized_x, success = minimize_scalar(out, x) + + a_val = 2.0 + c_val = 3.0 + + f = function([a, c, x], [minimized_x, success]) + + minimized_x_val, success_val = f(a_val, c_val, 0.0) + + assert success_val + np.testing.assert_allclose(minimized_x_val, (2 * a_val * c_val)) + + def f(x, a, b): + objective = (x - a * b) ** 2 + out = minimize_scalar(objective, x)[0] + return out + + utt.verify_grad(f, [0.0, a_val, c_val], eps=1e-6) + + def test_simple_minimize(): x = pt.scalar("x") a = pt.scalar("a") From 7e9ef50b72cb0f1f1919dff63413835a59dff4b4 Mon Sep 17 00:00:00 2001 From: Jesse Grabowski Date: Sat, 7 Jun 2025 21:38:35 +0800 Subject: [PATCH 10/22] Use LRU cache wrapper for hessian option --- pytensor/tensor/optimize.py | 102 ++++++++++++++++++++++++++++++++-- tests/tensor/test_optimize.py | 16 +++++- 2 files changed, 112 insertions(+), 6 deletions(-) diff --git a/pytensor/tensor/optimize.py b/pytensor/tensor/optimize.py index 6662495c7d..29801448eb 100644 --- a/pytensor/tensor/optimize.py +++ b/pytensor/tensor/optimize.py @@ -1,3 +1,4 @@ +import logging from collections.abc import Sequence from copy import copy from typing import cast @@ -8,7 +9,7 @@ from scipy.optimize import root as scipy_root from pytensor import Variable, function, graph_replace -from pytensor.gradient import grad, jacobian +from pytensor.gradient import grad, hessian, jacobian from pytensor.graph import Apply, Constant, FunctionGraph from pytensor.graph.basic import graph_inputs, truncated_graph_inputs from pytensor.graph.op import ComputeMapType, HasInnerGraph, Op, StorageMapType @@ -20,6 +21,87 @@ from pytensor.tensor.variable import TensorVariable +_log = logging.getLogger(__name__) + + +class LRUCache1: + """ + Simple LRU cache with a memory size of 1. + + This cache is only usable for a function that takes a single input `x` and returns a single output. The + function can also take any number of additional arguments `*args`, but these are assumed to be constant + between function calls. 
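+    Only `x` is compared against the cached value; the extra arguments are never checked for changes.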
+ + The purpose of this cache is to allow for Hessian computation to be reused when calling scipy.optimize functions. + It is very often the case that some sub-computations are repeated between the objective, gradient, and hessian + functions, but by default scipy only allows for the objective and gradient to be fused. + + By using this cache, all 3 functions can be fused, which can significantly speed up the optimization process for + expensive functions. + """ + + def __init__(self, fn): + self.fn = fn + self.last_x = None + self.last_result = None + + self.cache_hits = 0 + self.cache_misses = 0 + + self.value_and_grad_calls = 0 + self.hess_calls = 0 + + def __call__(self, x, *args): + """ + Call the cached function with the given input `x` and additional arguments `*args`. + + If the input `x` is the same as the last input, return the cached result. Otherwise update the cache with the + new input and result. + """ + cache_hit = np.all(x == self.last_x) + + if self.last_x is None or not cache_hit: + self.cache_misses += 1 + result = self.fn(x, *args) + self.last_x = x + self.last_result = result + return result + + else: + self.cache_hits += 1 + return self.last_result + + def value(self, x, *args): + self.value_and_grad_calls += 1 + res = self(x, *args) + if isinstance(res, tuple): + return res[0] + else: + return res + + def value_and_grad(self, x, *args): + self.value_and_grad_calls += 1 + return self(x, *args)[:2] + + def hess(self, x, *args): + self.hess_calls += 1 + return self(x, *args)[-1] + + def report(self): + _log.info(f"Value and Grad calls: {self.value_and_grad_calls}") + _log.info(f"Hess Calls: {self.hess_calls}") + _log.info(f"Hits: {self.cache_hits}") + _log.info(f"Misses: {self.cache_misses}") + + def clear_cache(self): + self.last_x = None + self.last_result = None + self.cache_hits = 0 + self.cache_misses = 0 + self.value_and_grad_calls = 0 + self.hess_calls = 0 + + class ScipyWrapperOp(Op, HasInnerGraph): """Shared logic for scipy optimization ops""" @@ -44,9 +126,9 @@ def build_fn(self): def fn_wrapper(x, *args): return fn(x.squeeze(), *args) - self._fn_wrapped = fn_wrapper + self._fn_wrapped = LRUCache1(fn_wrapper) else: - self._fn_wrapped = fn + self._fn_wrapped = LRUCache1(fn) @property def fn(self): @@ -120,6 +202,7 @@ def perform(self, node, inputs, outputs): **self.optimizer_kwargs, ) + f.clear_cache() outputs[0][0] = np.array(res.x) outputs[1][0] = np.bool_(res.success) @@ -211,6 +294,12 @@ def __init__( ) self.fgraph.add_output(grad_wrt_x) + if hess: + hess_wrt_x = cast( + Variable, hessian(self.fgraph.outputs[0], self.fgraph.inputs[0]) + ) + self.fgraph.add_output(hess_wrt_x) + self.jac = jac self.hess = hess self.hessp = hessp @@ -225,14 +314,17 @@ def perform(self, node, inputs, outputs): x0, *args = inputs res = scipy_minimize( - fun=f, + fun=f.value_and_grad if self.jac else f.value, jac=self.jac, x0=x0, args=tuple(args), + hess=f.hess if self.hess else None, method=self.method, **self.optimizer_kwargs, ) + f.clear_cache() + outputs[0][0] = res.x outputs[1][0] = np.bool_(res.success) @@ -283,6 +375,7 @@ def minimize( x: TensorVariable, method: str = "BFGS", jac: bool = True, + hess: bool = False, optimizer_kwargs: dict | None = None, ): """ @@ -325,6 +418,7 @@ def minimize( objective=objective, method=method, jac=jac, + hess=hess, optimizer_kwargs=optimizer_kwargs, ) diff --git a/tests/tensor/test_optimize.py b/tests/tensor/test_optimize.py index ae3a5caeab..ea03fe7475 100644 --- a/tests/tensor/test_optimize.py +++ b/tests/tensor/test_optimize.py @@ 
-1,4 +1,5 @@ import numpy as np +import pytest import pytensor import pytensor.tensor as pt @@ -68,7 +69,16 @@ def f(x, a, b): utt.verify_grad(f, [0.0, a_val, c_val], eps=1e-6) -def test_minimize_vector_x(): +@pytest.mark.parametrize( + "method, jac, hess", + [ + ("Newton-CG", True, True), + ("L-BFGS-B", True, False), + ("powell", False, False), + ], + ids=["Newton-CG", "L-BFGS-B", "powell"], +) +def test_minimize_vector_x(method, jac, hess): def rosenbrock_shifted_scaled(x, a, b): return (a * (x[1:] - x[:-1] ** 2) ** 2 + (1 - x[:-1]) ** 2).sum() + b @@ -77,7 +87,9 @@ def rosenbrock_shifted_scaled(x, a, b): b = pt.scalar("b") objective = rosenbrock_shifted_scaled(x, a, b) - minimized_x, success = minimize(objective, x, method="BFGS") + minimized_x, success = minimize( + objective, x, method=method, jac=jac, hess=hess, optimizer_kwargs={"tol": 1e-16} + ) a_val = 0.5 b_val = 1.0 From 16a701f32a9065a79e6b397886a6966f63142b38 Mon Sep 17 00:00:00 2001 From: Jesse Grabowski Date: Sun, 8 Jun 2025 23:53:14 +0800 Subject: [PATCH 11/22] Remove useless Blockwise --- pytensor/tensor/optimize.py | 27 ++------------------------- 1 file changed, 2 insertions(+), 25 deletions(-) diff --git a/pytensor/tensor/optimize.py b/pytensor/tensor/optimize.py index 29801448eb..7ce8191e9d 100644 --- a/pytensor/tensor/optimize.py +++ b/pytensor/tensor/optimize.py @@ -16,7 +16,6 @@ from pytensor.scalar import bool as scalar_bool from pytensor.tensor import dot from pytensor.tensor.basic import atleast_2d, concatenate, zeros_like -from pytensor.tensor.blockwise import Blockwise from pytensor.tensor.slinalg import solve from pytensor.tensor.variable import TensorVariable @@ -258,18 +257,7 @@ def minimize_scalar( optimizer_kwargs=optimizer_kwargs, ) - input_core_ndim = [var.ndim for var in minimize_scalar_op.inner_inputs] - input_signatures = [ - f'({",".join(f"i{i}{n}" for n in range(ndim))})' - for i, ndim in enumerate(input_core_ndim) - ] - - # Output dimensions are always the same as the first input (the initial values for the optimizer), - # then a scalar for the success flag - output_signatures = [input_signatures[0], "()"] - - signature = f"{','.join(input_signatures)}->{','.join(output_signatures)}" - return Blockwise(minimize_scalar_op, signature=signature)(x, *args) + return minimize_scalar_op(x, *args) class MinimizeOp(ScipyWrapperOp): @@ -422,18 +410,7 @@ def minimize( optimizer_kwargs=optimizer_kwargs, ) - input_core_ndim = [var.ndim for var in minimize_op.inner_inputs] - input_signatures = [ - f'({",".join(f"i{i}{n}" for n in range(ndim))})' - for i, ndim in enumerate(input_core_ndim) - ] - - # Output dimensions are always the same as the first input (the initial values for the optimizer), - # then a scalar for the success flag - output_signatures = [input_signatures[0], "()"] - - signature = f"{','.join(input_signatures)}->{','.join(output_signatures)}" - return Blockwise(minimize_op, signature=signature)(x, *args) + return minimize_op(x, *args) class RootOp(ScipyWrapperOp): From 1d0b43c55ac7916411810e5453f9e9e4f61acd01 Mon Sep 17 00:00:00 2001 From: Jesse Grabowski Date: Mon, 9 Jun 2025 00:24:07 +0800 Subject: [PATCH 12/22] Feedback --- pytensor/tensor/optimize.py | 48 +++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/pytensor/tensor/optimize.py b/pytensor/tensor/optimize.py index 7ce8191e9d..3bbdd3d425 100644 --- a/pytensor/tensor/optimize.py +++ b/pytensor/tensor/optimize.py @@ -47,6 +47,8 @@ def __init__(self, fn): self.cache_hits = 0 self.cache_misses 
= 0 + self.value_calls = 0 + self.grad_calls = 0 self.value_and_grad_calls = 0 self.hess_calls = 0 @@ -57,13 +59,14 @@ def __call__(self, x, *args): If the input `x` is the same as the last input, return the cached result. Otherwise update the cache with the new input and result. """ - cache_hit = np.all(x == self.last_x) - if self.last_x is None or not cache_hit: + if self.last_result is None or not (x == self.last_x).all(): self.cache_misses += 1 - result = self.fn(x, *args) self.last_x = x + + result = self.fn(x, *args) self.last_result = result + return result else: @@ -71,12 +74,12 @@ def __call__(self, x, *args): return self.last_result def value(self, x, *args): - self.value_and_grad_calls += 1 - res = self(x, *args) - if isinstance(res, tuple): - return res[0] - else: - return res + self.value_calls += 1 + return self(x, *args)[0] + + def grad(self, x, *args): + self.grad_calls += 1 + return self(x, *args)[1] def value_and_grad(self, x, *args): self.value_and_grad_calls += 1 @@ -97,6 +100,8 @@ def clear_cache(self): self.last_result = None self.cache_hits = 0 self.cache_misses = 0 + self.value_calls = 0 + self.grad_calls = 0 self.value_and_grad_calls = 0 self.hess_calls = 0 @@ -109,14 +114,8 @@ def build_fn(self): This is overloaded because scipy converts scalar inputs to lists, changing the return type. The wrapper function logic is there to handle this. """ - # TODO: Introduce rewrites to change MinimizeOp to MinimizeScalarOp and RootOp to RootScalarOp - # when x is scalar. That will remove the need for the wrapper. - outputs = self.inner_outputs - if len(outputs) == 1: - outputs = outputs[0] - self._fn = fn = function(self.inner_inputs, outputs) - + self._fn = fn = function(self.inner_inputs, outputs, trust_input=True) # Do this reassignment to see the compiled graph in the dprint # self.fgraph = fn.maker.fgraph @@ -166,6 +165,10 @@ def prepare_node( def make_node(self, *inputs): assert len(inputs) == len(self.inner_inputs) + for input, inner_input in zip(inputs, self.inner_inputs): + assert ( + input.type == inner_input.type + ), f"Input {input} does not match expected type {inner_input.type}" return Apply( self, inputs, [self.inner_inputs[0].type(), scalar_bool("success")] @@ -192,16 +195,17 @@ def __init__( def perform(self, node, inputs, outputs): f = self.fn_wrapped + f.clear_cache() + x0, *args = inputs res = scipy_minimize_scalar( - fun=f, + fun=f.value, args=tuple(args), method=self.method, **self.optimizer_kwargs, ) - f.clear_cache() outputs[0][0] = np.array(res.x) outputs[1][0] = np.bool_(res.success) @@ -214,11 +218,9 @@ def L_op(self, inputs, outputs, output_grads): inner_fx = self.fgraph.outputs[0] implicit_f = grad(inner_fx, inner_x) - df_dx = grad(implicit_f, inner_x) - - df_dthetas = [ - grad(implicit_f, arg, disconnected_inputs="ignore") for arg in inner_args - ] + df_dx, *df_dthetas = grad( + implicit_f, [inner_x, *inner_args], disconnect_inputs="ignore" + ) replace = dict(zip(self.fgraph.inputs, (x_star, *args), strict=True)) df_dx_star, *df_dthetas_stars = graph_replace( From 7a7c74789f70f661bbf37b43b381953c6a4cd18e Mon Sep 17 00:00:00 2001 From: Jesse Grabowski Date: Mon, 9 Jun 2025 01:10:16 +0800 Subject: [PATCH 13/22] use truncated_graph_inputs and refactor --- pytensor/tensor/optimize.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/pytensor/tensor/optimize.py b/pytensor/tensor/optimize.py index 3bbdd3d425..326bb2931d 100644 --- a/pytensor/tensor/optimize.py +++ b/pytensor/tensor/optimize.py @@ -11,7 
+11,7 @@ from pytensor import Variable, function, graph_replace from pytensor.gradient import grad, hessian, jacobian from pytensor.graph import Apply, Constant, FunctionGraph -from pytensor.graph.basic import graph_inputs, truncated_graph_inputs +from pytensor.graph.basic import truncated_graph_inputs from pytensor.graph.op import ComputeMapType, HasInnerGraph, Op, StorageMapType from pytensor.scalar import bool as scalar_bool from pytensor.tensor import dot @@ -106,6 +106,19 @@ def clear_cache(self): self.hess_calls = 0 +def _find_optimization_parameters(objective: TensorVariable, x: TensorVariable): + """ + Find the parameters of the optimization problem that are not the variable `x`. + + This is used to determine the additional arguments that need to be passed to the objective function. + """ + return [ + arg + for arg in truncated_graph_inputs([objective], [x]) + if (arg is not x and not isinstance(arg, Constant)) + ] + + class ScipyWrapperOp(Op, HasInnerGraph): """Shared logic for scipy optimization ops""" @@ -197,7 +210,9 @@ def perform(self, node, inputs, outputs): f = self.fn_wrapped f.clear_cache() - x0, *args = inputs + # minimize_scalar doesn't take x0 as an argument. The Op still needs this input (to symbolically determine + # the args of the objective function), but it is not used in the optimization. + _, *args = inputs res = scipy_minimize_scalar( fun=f.value, @@ -219,7 +234,7 @@ def L_op(self, inputs, outputs, output_grads): implicit_f = grad(inner_fx, inner_x) df_dx, *df_dthetas = grad( - implicit_f, [inner_x, *inner_args], disconnect_inputs="ignore" + implicit_f, [inner_x, *inner_args], disconnected_inputs="ignore" ) replace = dict(zip(self.fgraph.inputs, (x_star, *args), strict=True)) @@ -245,11 +260,7 @@ def minimize_scalar( Minimize a scalar objective function using scipy.optimize.minimize_scalar. """ - args = [ - arg - for arg in graph_inputs([objective], [x]) - if (arg is not x and not isinstance(arg, Constant)) - ] + args = _find_optimization_parameters(objective, x) minimize_scalar_op = MinimizeScalarOp( x, @@ -396,11 +407,7 @@ def minimize( The optimized value of x that minimizes the objective function. """ - args = [ - arg - for arg in graph_inputs([objective], [x]) - if (arg is not x and not isinstance(arg, Constant)) - ] + args = _find_optimization_parameters(objective, x) minimize_op = MinimizeOp( x, From 7d97cdc008b7505145c8a79398f8d1b6f8adf8bb Mon Sep 17 00:00:00 2001 From: jessegrabowski Date: Mon, 9 Jun 2025 17:24:47 +0800 Subject: [PATCH 14/22] Implement Root Op --- pytensor/tensor/optimize.py | 52 +++++++++++++++++++++++++++++------ tests/tensor/test_optimize.py | 2 +- 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/pytensor/tensor/optimize.py b/pytensor/tensor/optimize.py index 326bb2931d..c1fc3904e9 100644 --- a/pytensor/tensor/optimize.py +++ b/pytensor/tensor/optimize.py @@ -39,10 +39,11 @@ class LRUCache1: expensive functions. """ - def __init__(self, fn): + def __init__(self, fn, copy_x: bool = False): self.fn = fn self.last_x = None self.last_result = None + self.copy_x = copy_x self.cache_hits = 0 self.cache_misses = 0 @@ -59,9 +60,17 @@ def __call__(self, x, *args): If the input `x` is the same as the last input, return the cached result. Otherwise update the cache with the new input and result. """ + # scipy.optimize.scalar_minimize and scalar_root don't take initial values as an argument, so we can't control + # the first input to the inner function. Of course, they use a scalar, but we need a 0d numpy array. 
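+        # Casting to a 0d array keeps the `(x == self.last_x).all()` comparison and the optional copy below well-defined.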
+ x = np.asarray(x) if self.last_result is None or not (x == self.last_x).all(): self.cache_misses += 1 + + # scipy.optimize.root changes x in place, so the cache has to copy it, otherwise we get false + # cache hits and optimization always fails. + if self.copy_x: + x = x.copy() self.last_x = x result = self.fn(x, *args) @@ -449,6 +458,9 @@ def __init__( def perform(self, node, inputs, outputs): f = self.fn_wrapped + f.clear_cache() + f.copy_x = True + variables, *args = inputs res = scipy_root( @@ -460,8 +472,8 @@ def perform(self, node, inputs, outputs): **self.optimizer_kwargs, ) - outputs[0][0] = res.x - outputs[1][0] = res.success + outputs[0][0] = res.x.reshape(variables.shape) + outputs[1][0] = np.bool_(res.success) def L_op( self, @@ -469,22 +481,44 @@ def L_op( outputs: Sequence[Variable], output_grads: Sequence[Variable], ) -> list[Variable]: - # TODO: Broken x, *args = inputs - x_star, success = outputs + x_star, _ = outputs output_grad, _ = output_grads inner_x, *inner_args = self.fgraph.inputs inner_fx = self.fgraph.outputs[0] - inner_jac = jacobian(inner_fx, [inner_x, *inner_args]) + df_dx = jacobian(inner_fx, inner_x) if not self.jac else self.fgraph.outputs[1] + + df_dtheta = concatenate( + [ + atleast_2d(jac_column, left=False) + for jac_column in jacobian( + inner_fx, inner_args, disconnected_inputs="ignore" + ) + ], + axis=-1, + ) replace = dict(zip(self.fgraph.inputs, (x_star, *args), strict=True)) - jac_f_wrt_x_star, *jac_f_wrt_args = graph_replace(inner_jac, replace=replace) + df_dx_star, df_dtheta_star = graph_replace([df_dx, df_dtheta], replace=replace) - jac_wrt_args = solve(-jac_f_wrt_x_star, output_grad) + grad_wrt_args_vector = solve(-df_dx_star, df_dtheta_star) - return [zeros_like(x), jac_wrt_args] + cursor = 0 + grad_wrt_args = [] + + for arg in args: + arg_shape = arg.shape + arg_size = arg_shape.prod() + arg_grad = grad_wrt_args_vector[:, cursor : cursor + arg_size].reshape( + (*x_star.shape, *arg_shape) + ) + + grad_wrt_args.append(dot(output_grad, arg_grad)) + cursor += arg_size + + return [zeros_like(x), *grad_wrt_args] def root( diff --git a/tests/tensor/test_optimize.py b/tests/tensor/test_optimize.py index ea03fe7475..4dcfe08d0f 100644 --- a/tests/tensor/test_optimize.py +++ b/tests/tensor/test_optimize.py @@ -142,7 +142,7 @@ def test_root_system_of_equations(): f = pt.stack([a[0] * x[0] * pt.cos(x[1]) - b[0], x[0] * x[1] - a[1] * x[1] - b[1]]) - root_f, success = root(f, x, debug=True) + root_f, success = root(f, x) func = pytensor.function([x, a, b], [root_f, success]) x0 = np.array([1.0, 1.0]) From f074a858bbcfc5b6b4282088293e8f83b987daf6 Mon Sep 17 00:00:00 2001 From: jessegrabowski Date: Mon, 9 Jun 2025 17:28:34 +0800 Subject: [PATCH 15/22] Factor out shared functions --- pytensor/tensor/optimize.py | 218 ++++++++++++++++++++++++------------ 1 file changed, 147 insertions(+), 71 deletions(-) diff --git a/pytensor/tensor/optimize.py b/pytensor/tensor/optimize.py index c1fc3904e9..b61fb960a3 100644 --- a/pytensor/tensor/optimize.py +++ b/pytensor/tensor/optimize.py @@ -128,6 +128,32 @@ def _find_optimization_parameters(objective: TensorVariable, x: TensorVariable): ] +def _get_parameter_grads_from_vector( + grad_wrt_args_vector: Variable, + x_star: Variable, + args: Sequence[Variable], + output_grad: Variable, +): + """ + Given a single concatenated vector of objective function gradients with respect to raveled optimization parameters, + returns the contribution of each parameter to the total loss function, with the unraveled shape of the 
parameter.
+    """
+    cursor = 0
+    grad_wrt_args = []
+
+    for arg in args:
+        arg_shape = arg.shape
+        arg_size = arg_shape.prod()
+        arg_grad = grad_wrt_args_vector[:, cursor : cursor + arg_size].reshape(
+            (*x_star.shape, *arg_shape)
+        )
+
+        grad_wrt_args.append(dot(output_grad, arg_grad))
+        cursor += arg_size
+
+    return grad_wrt_args
+
+
 class ScipyWrapperOp(Op, HasInnerGraph):
     """Shared logic for scipy optimization ops"""

@@ -197,6 +223,98 @@ def make_node(self, *inputs):
         )


+def scalar_implict_optimization_grads(
+    inner_fx: Variable,
+    inner_x: Variable,
+    inner_args: Sequence[Variable],
+    args: Sequence[Variable],
+    x_star: Variable,
+    output_grad: Variable,
+    fgraph: FunctionGraph,
+) -> list[Variable]:
+    df_dx, *df_dthetas = grad(
+        inner_fx, [inner_x, *inner_args], disconnected_inputs="ignore"
+    )
+
+    replace = dict(zip(fgraph.inputs, (x_star, *args), strict=True))
+    df_dx_star, *df_dthetas_stars = graph_replace([df_dx, *df_dthetas], replace=replace)
+
+    grad_wrt_args = [
+        (-df_dtheta_star / df_dx_star) * output_grad
+        for df_dtheta_star in df_dthetas_stars
+    ]
+
+    return grad_wrt_args
+
+
+def implict_optimization_grads(
+    df_dx: Variable,
+    df_dtheta_columns: Sequence[Variable],
+    args: Sequence[Variable],
+    x_star: Variable,
+    output_grad: Variable,
+    fgraph: FunctionGraph,
+):
+    r"""
+    Compute gradients of an optimization problem with respect to its parameters.
+
+    The gradients are computed using the implicit function theorem. Given a function `f(x, theta)` and a function
+    `x_star(theta)` that, given input parameters theta, returns `x` such that `f(x_star(theta), theta) = 0`, we can
+    compute the gradients of `x_star` with respect to `theta` as follows:
+
+    .. math::
+
+        \underbrace{\frac{\partial f}{\partial x}\left(x^*(\theta), \theta\right)}_{\text{Jacobian wrt } x}
+        \frac{d x^*(\theta)}{d \theta}
+        +
+        \underbrace{\frac{\partial f}{\partial \theta}\left(x^*(\theta), \theta\right)}_{\text{Jacobian wrt } \theta}
+        = 0
+
+    Which, after rearranging, gives us:
+
+    .. math::
+
+        \frac{d x^*(\theta)}{d \theta} = - \left(\frac{\partial f}{\partial x}\left(x^*(\theta), \theta\right)\right)^{-1} \frac{\partial f}{\partial \theta}\left(x^*(\theta), \theta\right)
+
+    Note that this method assumes `f(x_star(theta), theta) = 0`, so it is not immediately applicable to a minimization
+    problem, where `f` is the objective function. In this case, we instead take `f` to be the gradient of the objective
+    function, which *is* indeed zero at the minimum.
+
+    Parameters
+    ----------
+    df_dx : Variable
+        The Jacobian of the objective function with respect to the variable `x`.
+    df_dtheta_columns : Sequence[Variable]
+        The Jacobians of the objective function with respect to the optimization parameters `theta`.
+        Each column (or columns) corresponds to a different parameter; these should be produced by pytensor.gradient.jacobian.
+    args : Sequence[Variable]
+        The optimization parameters `theta`.
+    x_star : Variable
+        Symbolic graph representing the value of the variable `x` such that `f(x_star, theta) = 0`.
+    output_grad : Variable
+        The gradient of the downstream cost with respect to the output `x_star`.
+    fgraph : FunctionGraph
+        The function graph that contains the inputs and outputs of the optimization problem.
+ """ + df_dtheta = concatenate( + [atleast_2d(jac_col, left=False) for jac_col in df_dtheta_columns], + axis=-1, + ) + + replace = dict(zip(fgraph.inputs, (x_star, *args), strict=True)) + + df_dx_star, df_dtheta_star = graph_replace( + [atleast_2d(df_dx), df_dtheta], replace=replace + ) + + grad_wrt_args_vector = solve(-df_dx_star, df_dtheta_star) + grad_wrt_args = _get_parameter_grads_from_vector( + grad_wrt_args_vector, x_star, args, output_grad + ) + + return grad_wrt_args + + class MinimizeScalarOp(ScipyWrapperOp): __props__ = ("method",) @@ -242,20 +360,17 @@ def L_op(self, inputs, outputs, output_grads): inner_fx = self.fgraph.outputs[0] implicit_f = grad(inner_fx, inner_x) - df_dx, *df_dthetas = grad( - implicit_f, [inner_x, *inner_args], disconnected_inputs="ignore" - ) - replace = dict(zip(self.fgraph.inputs, (x_star, *args), strict=True)) - df_dx_star, *df_dthetas_stars = graph_replace( - [df_dx, *df_dthetas], replace=replace + grad_wrt_args = scalar_implict_optimization_grads( + inner_fx=implicit_f, + inner_x=inner_x, + inner_args=inner_args, + args=args, + x_star=x_star, + output_grad=output_grad, + fgraph=self.fgraph, ) - grad_wrt_args = [ - (-df_dtheta_star / df_dx_star) * output_grad - for df_dtheta_star in df_dthetas_stars - ] - return [zeros_like(x), *grad_wrt_args] @@ -348,34 +463,17 @@ def L_op(self, inputs, outputs, output_grads): implicit_f = grad(inner_fx, inner_x) - df_dx = atleast_2d(concatenate(jacobian(implicit_f, [inner_x]), axis=-1)) - - df_dtheta = concatenate( - [ - atleast_2d(x, left=False) - for x in jacobian(implicit_f, inner_args, disconnected_inputs="ignore") - ], - axis=-1, + df_dx, *df_dtheta_columns = jacobian( + implicit_f, [inner_x, *inner_args], disconnected_inputs="ignore" + ) + grad_wrt_args = implict_optimization_grads( + df_dx=df_dx, + df_dtheta_columns=df_dtheta_columns, + args=args, + x_star=x_star, + output_grad=output_grad, + fgraph=self.fgraph, ) - - replace = dict(zip(self.fgraph.inputs, (x_star, *args), strict=True)) - - df_dx_star, df_dtheta_star = graph_replace([df_dx, df_dtheta], replace=replace) - - grad_wrt_args_vector = solve(-df_dx_star, df_dtheta_star) - - cursor = 0 - grad_wrt_args = [] - - for arg in args: - arg_shape = arg.shape - arg_size = arg_shape.prod() - arg_grad = grad_wrt_args_vector[:, cursor : cursor + arg_size].reshape( - (*x_star.shape, *arg_shape) - ) - - grad_wrt_args.append(dot(output_grad, arg_grad)) - cursor += arg_size return [zeros_like(x), *grad_wrt_args] @@ -432,7 +530,7 @@ def minimize( class RootOp(ScipyWrapperOp): - __props__ = ("method", "jac") + __props__ = ("method", "jac", "optimizer_kwargs") def __init__( self, @@ -489,35 +587,17 @@ def L_op( inner_fx = self.fgraph.outputs[0] df_dx = jacobian(inner_fx, inner_x) if not self.jac else self.fgraph.outputs[1] - - df_dtheta = concatenate( - [ - atleast_2d(jac_column, left=False) - for jac_column in jacobian( - inner_fx, inner_args, disconnected_inputs="ignore" - ) - ], - axis=-1, + df_dtheta_columns = jacobian(inner_fx, inner_args, disconnected_inputs="ignore") + + grad_wrt_args = implict_optimization_grads( + df_dx=df_dx, + df_dtheta_columns=df_dtheta_columns, + args=args, + x_star=x_star, + output_grad=output_grad, + fgraph=self.fgraph, ) - replace = dict(zip(self.fgraph.inputs, (x_star, *args), strict=True)) - df_dx_star, df_dtheta_star = graph_replace([df_dx, df_dtheta], replace=replace) - - grad_wrt_args_vector = solve(-df_dx_star, df_dtheta_star) - - cursor = 0 - grad_wrt_args = [] - - for arg in args: - arg_shape = arg.shape - arg_size = 
arg_shape.prod() - arg_grad = grad_wrt_args_vector[:, cursor : cursor + arg_size].reshape( - (*x_star.shape, *arg_shape) - ) - - grad_wrt_args.append(dot(output_grad, arg_grad)) - cursor += arg_size - return [zeros_like(x), *grad_wrt_args] @@ -529,11 +609,7 @@ def root( ): """Find roots of a system of equations using scipy.optimize.root.""" - args = [ - arg - for arg in truncated_graph_inputs([equations], [variables]) - if (arg is not variables and not isinstance(arg, Constant)) - ] + args = _find_optimization_parameters(equations, variables) root_op = RootOp(variables, *args, equations=equations, method=method, jac=jac) From e1154a2a5fddeb99768f24bec56be12f45ec2d1c Mon Sep 17 00:00:00 2001 From: jessegrabowski Date: Mon, 9 Jun 2025 18:18:47 +0800 Subject: [PATCH 16/22] Implement `root_scalar` --- pytensor/tensor/optimize.py | 108 +++++++++++++++++++++++++++++++++- tests/tensor/test_optimize.py | 31 +++++++++- 2 files changed, 136 insertions(+), 3 deletions(-) diff --git a/pytensor/tensor/optimize.py b/pytensor/tensor/optimize.py index b61fb960a3..6fc58e9317 100644 --- a/pytensor/tensor/optimize.py +++ b/pytensor/tensor/optimize.py @@ -7,6 +7,7 @@ from scipy.optimize import minimize as scipy_minimize from scipy.optimize import minimize_scalar as scipy_minimize_scalar from scipy.optimize import root as scipy_root +from scipy.optimize import root_scalar as scipy_root_scalar from pytensor import Variable, function, graph_replace from pytensor.gradient import grad, hessian, jacobian @@ -529,8 +530,111 @@ def minimize( return minimize_op(x, *args) +class RootScalarOp(ScipyWrapperOp): + __props__ = ("method", "jac", "hess") + + def __init__( + self, + variables, + *args, + equation, + method, + jac: bool = False, + hess: bool = False, + optimizer_kwargs=None, + ): + self.fgraph = FunctionGraph([variables, *args], [equation]) + + if jac: + f_prime = grad(self.fgraph.outputs[0], self.fgraph.inputs[0]) + self.fgraph.add_output(f_prime) + + if hess: + if not jac: + raise ValueError( + "Cannot set `hess=True` without `jac=True`. No methods use second derivatives without also" + " using first derivatives." 
+ ) + f_double_prime = grad(self.fgraph.outputs[-1], self.fgraph.inputs[0]) + self.fgraph.add_output(f_double_prime) + + self.method = method + self.optimizer_kwargs = optimizer_kwargs if optimizer_kwargs is not None else {} + self.jac = jac + self.hess = hess + + self._fn = None + self._fn_wrapped = None + + def perform(self, node, inputs, outputs): + f = self.fn_wrapped + f.clear_cache() + # f.copy_x = True + + variables, *args = inputs + + res = scipy_root_scalar( + f=f.value, + fprime=f.grad if self.jac else None, + fprime2=f.hess if self.hess else None, + x0=variables, + args=tuple(args), + method=self.method, + **self.optimizer_kwargs, + ) + + outputs[0][0] = np.array(res.root) + outputs[1][0] = np.bool_(res.converged) + + def L_op(self, inputs, outputs, output_grads): + x, *args = inputs + x_star, _ = outputs + output_grad, _ = output_grads + + inner_x, *inner_args = self.fgraph.inputs + inner_fx = self.fgraph.outputs[0] + + grad_wrt_args = scalar_implict_optimization_grads( + inner_fx=inner_fx, + inner_x=inner_x, + inner_args=inner_args, + args=args, + x_star=x_star, + output_grad=output_grad, + fgraph=self.fgraph, + ) + + return [zeros_like(x), *grad_wrt_args] + + +def root_scalar( + equation: TensorVariable, + variables: TensorVariable, + method: str = "secant", + jac: bool = False, + hess: bool = False, + optimizer_kwargs: dict | None = None, +): + """ + Find roots of a scalar equation using scipy.optimize.root_scalar. + """ + args = _find_optimization_parameters(equation, variables) + + root_scalar_op = RootScalarOp( + variables, + *args, + equation=equation, + method=method, + jac=jac, + hess=hess, + optimizer_kwargs=optimizer_kwargs, + ) + + return root_scalar_op(variables, *args) + + class RootOp(ScipyWrapperOp): - __props__ = ("method", "jac", "optimizer_kwargs") + __props__ = ("method", "jac") def __init__( self, @@ -616,4 +720,4 @@ def root( return root_op(variables, *args) -__all__ = ["minimize", "root"] +__all__ = ["minimize_scalar", "minimize", "root_scalar", "root"] diff --git a/tests/tensor/test_optimize.py b/tests/tensor/test_optimize.py index 4dcfe08d0f..2ff80d2367 100644 --- a/tests/tensor/test_optimize.py +++ b/tests/tensor/test_optimize.py @@ -4,7 +4,7 @@ import pytensor import pytensor.tensor as pt from pytensor import config, function -from pytensor.tensor.optimize import minimize, minimize_scalar, root +from pytensor.tensor.optimize import minimize, minimize_scalar, root, root_scalar from tests import unittest_tools as utt @@ -110,6 +110,35 @@ def f(x, a, b): utt.verify_grad(f, [x0, a_val, b_val], eps=1e-6) +@pytest.mark.parametrize( + "method, jac, hess", + [("secant", False, False), ("newton", True, False), ("halley", True, True)], +) +def test_root_scalar(method, jac, hess): + x = pt.scalar("x") + a = pt.scalar("a") + + def fn(x, a): + return x + 2 * a * pt.cos(x) + + f = fn(x, a) + root_f, success = root_scalar(f, x, method=method, jac=jac, hess=hess) + func = pytensor.function([x, a], [root_f, success]) + + x0 = 0.0 + a_val = 1.0 + solution, success = func(x0, a_val) + + assert success + np.testing.assert_allclose(solution, -1.02986653, atol=1e-6, rtol=1e-6) + + def root_fn(x, a): + f = fn(x, a) + return root_scalar(f, x, method=method, jac=jac, hess=hess)[0] + + utt.verify_grad(root_fn, [x0, a_val], eps=1e-6) + + def test_root_simple(): x = pt.scalar("x") a = pt.scalar("a") From f06526e6c0bab1d29220bc51d943775ffbbef41f Mon Sep 17 00:00:00 2001 From: jessegrabowski Date: Mon, 9 Jun 2025 19:07:25 +0800 Subject: [PATCH 17/22] 
=?UTF-8?q?mypy=20=F0=9F=98=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pytensor/tensor/optimize.py | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/pytensor/tensor/optimize.py b/pytensor/tensor/optimize.py index 6fc58e9317..e731174bdd 100644 --- a/pytensor/tensor/optimize.py +++ b/pytensor/tensor/optimize.py @@ -139,10 +139,14 @@ def _get_parameter_grads_from_vector( Given a single concatenated vector of objective function gradients with respect to raveled optimization parameters, returns the contribution of each parameter to the total loss function, with the unraveled shape of the parameter. """ + grad_wrt_args_vector = cast(TensorVariable, grad_wrt_args_vector) + x_star = cast(TensorVariable, x_star) + cursor = 0 grad_wrt_args = [] for arg in args: + arg = cast(TensorVariable, arg) arg_shape = arg.shape arg_size = arg_shape.prod() arg_grad = grad_wrt_args_vector[:, cursor : cursor + arg_size].reshape( @@ -233,8 +237,9 @@ def scalar_implict_optimization_grads( output_grad: Variable, fgraph: FunctionGraph, ) -> list[Variable]: - df_dx, *df_dthetas = grad( - inner_fx, [inner_x, *inner_args], disconnected_inputs="ignore" + df_dx, *df_dthetas = cast( + list[Variable], + grad(inner_fx, [inner_x, *inner_args], disconnected_inputs="ignore"), ) replace = dict(zip(fgraph.inputs, (x_star, *args), strict=True)) @@ -242,7 +247,7 @@ def scalar_implict_optimization_grads( grad_wrt_args = [ (-df_dtheta_star / df_dx_star) * output_grad - for df_dtheta_star in df_dthetas_stars + for df_dtheta_star in cast(list[TensorVariable], df_dthetas_stars) ] return grad_wrt_args @@ -297,15 +302,21 @@ def implict_optimization_grads( fgraph : FunctionGraph The function graph that contains the inputs and outputs of the optimization problem. """ + df_dx = cast(TensorVariable, df_dx) + df_dtheta = concatenate( - [atleast_2d(jac_col, left=False) for jac_col in df_dtheta_columns], + [ + atleast_2d(jac_col, left=False) + for jac_col in cast(list[TensorVariable], df_dtheta_columns) + ], axis=-1, ) replace = dict(zip(fgraph.inputs, (x_star, *args), strict=True)) - df_dx_star, df_dtheta_star = graph_replace( - [atleast_2d(df_dx), df_dtheta], replace=replace + df_dx_star, df_dtheta_star = cast( + list[TensorVariable], + graph_replace([atleast_2d(df_dx), df_dtheta], replace=replace), ) grad_wrt_args_vector = solve(-df_dx_star, df_dtheta_star) @@ -546,7 +557,9 @@ def __init__( self.fgraph = FunctionGraph([variables, *args], [equation]) if jac: - f_prime = grad(self.fgraph.outputs[0], self.fgraph.inputs[0]) + f_prime = cast( + Variable, grad(self.fgraph.outputs[0], self.fgraph.inputs[0]) + ) self.fgraph.add_output(f_prime) if hess: @@ -555,7 +568,9 @@ def __init__( "Cannot set `hess=True` without `jac=True`. No methods use second derivatives without also" " using first derivatives." 
) - f_double_prime = grad(self.fgraph.outputs[-1], self.fgraph.inputs[0]) + f_double_prime = cast( + Variable, grad(self.fgraph.outputs[-1], self.fgraph.inputs[0]) + ) self.fgraph.add_output(f_double_prime) self.method = method From 33889f0fbe64ab96e6356a3bf9b80338531d700a Mon Sep 17 00:00:00 2001 From: jessegrabowski Date: Tue, 10 Jun 2025 18:34:57 +0800 Subject: [PATCH 18/22] Add specialized `build_fn` to each Op to handle scipy quirks --- pytensor/tensor/optimize.py | 106 ++++++++++++++++++++++++++++++++---- 1 file changed, 94 insertions(+), 12 deletions(-) diff --git a/pytensor/tensor/optimize.py b/pytensor/tensor/optimize.py index e731174bdd..d3f8137f87 100644 --- a/pytensor/tensor/optimize.py +++ b/pytensor/tensor/optimize.py @@ -9,14 +9,20 @@ from scipy.optimize import root as scipy_root from scipy.optimize import root_scalar as scipy_root_scalar +import pytensor.scalar as ps from pytensor import Variable, function, graph_replace from pytensor.gradient import grad, hessian, jacobian from pytensor.graph import Apply, Constant, FunctionGraph -from pytensor.graph.basic import truncated_graph_inputs +from pytensor.graph.basic import ancestors, truncated_graph_inputs from pytensor.graph.op import ComputeMapType, HasInnerGraph, Op, StorageMapType -from pytensor.scalar import bool as scalar_bool -from pytensor.tensor import dot -from pytensor.tensor.basic import atleast_2d, concatenate, zeros_like +from pytensor.tensor.basic import ( + atleast_2d, + concatenate, + tensor, + tensor_from_scalar, + zeros_like, +) +from pytensor.tensor.math import dot from pytensor.tensor.slinalg import solve from pytensor.tensor.variable import TensorVariable @@ -223,9 +229,31 @@ def make_node(self, *inputs): input.type == inner_input.type ), f"Input {input} does not match expected type {inner_input.type}" - return Apply( - self, inputs, [self.inner_inputs[0].type(), scalar_bool("success")] - ) + return Apply(self, inputs, [self.inner_inputs[0].type(), ps.bool("success")]) + + +class ScipyScalarWrapperOp(ScipyWrapperOp): + def build_fn(self): + """ + This is overloaded because scipy converts scalar inputs to lists, changing the return type. The + wrapper function logic is there to handle this. + """ + + # We have no control over the inputs to the scipy inner function for scalar_minimize. As a result, + # we need to adjust the graph to work with what scipy will be passing into the inner function -- + # always scalar, and always float64 + x, *args = self.inner_inputs + new_root_x = ps.float64(name="x_scalar") + new_x = tensor_from_scalar(new_root_x.astype(x.type.dtype)) + + new_outputs = graph_replace(self.inner_outputs, {x: new_x}) + + self._fn = fn = function([new_root_x, *args], new_outputs, trust_input=True) + + # Do this reassignment to see the compiled graph in the dprint + # self.fgraph = fn.maker.fgraph + + self._fn_wrapped = LRUCache1(fn) def scalar_implict_optimization_grads( @@ -327,7 +355,7 @@ def implict_optimization_grads( return grad_wrt_args -class MinimizeScalarOp(ScipyWrapperOp): +class MinimizeScalarOp(ScipyScalarWrapperOp): __props__ = ("method",) def __init__( @@ -338,6 +366,14 @@ def __init__( method: str = "brent", optimizer_kwargs: dict | None = None, ): + if not x.ndim == 0: + raise ValueError( + "The variable `x` must be a scalar (0-dimensional) tensor for minimize_scalar." + ) + if not objective.ndim == 0: + raise ValueError( + "The objective function must be a scalar (0-dimensional) tensor for minimize_scalar." 
+ ) self.fgraph = FunctionGraph([x, *args], [objective]) self.method = method @@ -351,7 +387,7 @@ def perform(self, node, inputs, outputs): # minimize_scalar doesn't take x0 as an argument. The Op still needs this input (to symbolically determine # the args of the objective function), but it is not used in the optimization. - _, *args = inputs + x0, *args = inputs res = scipy_minimize_scalar( fun=f.value, @@ -360,7 +396,7 @@ def perform(self, node, inputs, outputs): **self.optimizer_kwargs, ) - outputs[0][0] = np.array(res.x) + outputs[0][0] = np.array(res.x, dtype=x0.dtype) outputs[1][0] = np.bool_(res.success) def L_op(self, inputs, outputs, output_grads): @@ -423,6 +459,15 @@ def __init__( hessp: bool = False, optimizer_kwargs: dict | None = None, ): + if not objective.ndim == 0: + raise ValueError( + "The objective function must be a scalar (0-dimensional) tensor for minimize." + ) + if not isinstance(x, Variable) and x not in ancestors([objective]): + raise ValueError( + "The variable `x` must be an input to the computational graph of the objective function." + ) + self.fgraph = FunctionGraph([x, *args], [objective]) if jac: @@ -462,7 +507,7 @@ def perform(self, node, inputs, outputs): f.clear_cache() - outputs[0][0] = res.x + outputs[0][0] = res.x.astype(x0.dtype) outputs[1][0] = np.bool_(res.success) def L_op(self, inputs, outputs, output_grads): @@ -541,7 +586,7 @@ def minimize( return minimize_op(x, *args) -class RootScalarOp(ScipyWrapperOp): +class RootScalarOp(ScipyScalarWrapperOp): __props__ = ("method", "jac", "hess") def __init__( @@ -554,6 +599,17 @@ def __init__( hess: bool = False, optimizer_kwargs=None, ): + if not equation.ndim == 0: + raise ValueError( + "The equation must be a scalar (0-dimensional) tensor for root_scalar." + ) + if not isinstance(variables, Variable) or variables not in ancestors( + [equation] + ): + raise ValueError( + "The variable `variables` must be an input to the computational graph of the equation." + ) + self.fgraph = FunctionGraph([variables, *args], [equation]) if jac: @@ -673,6 +729,32 @@ def __init__( self._fn = None self._fn_wrapped = None + def build_fn(self): + outputs = self.inner_outputs + variables, *args = self.inner_inputs + + if variables.ndim > 0: + new_root_variables = variables + new_outputs = outputs + else: + # If the user passes a scalar optimization problem to root, scipy will automatically upcast it to + # a 1d array. The inner function needs to be adjusted to handle this. 
+ new_root_variables = tensor( + name="variables_vector", shape=(1,), dtype=variables.type.dtype + ) + new_variables = new_root_variables.squeeze() + + new_outputs = graph_replace(outputs, {variables: new_variables}) + + self._fn = fn = function( + [new_root_variables, *args], new_outputs, trust_input=True + ) + + # Do this reassignment to see the compiled graph in the dprint + # self.fgraph = fn.maker.fgraph + + self._fn_wrapped = LRUCache1(fn) + def perform(self, node, inputs, outputs): f = self.fn_wrapped f.clear_cache() From cb3d7e739ef2e4e24d352615706f55689ffcbf56 Mon Sep 17 00:00:00 2001 From: jessegrabowski Date: Tue, 10 Jun 2025 18:35:38 +0800 Subject: [PATCH 19/22] Changes to support float32 inputs --- pytensor/tensor/optimize.py | 32 +++++++++------- tests/tensor/test_optimize.py | 69 ++++++++++++++++++++++++----------- 2 files changed, 67 insertions(+), 34 deletions(-) diff --git a/pytensor/tensor/optimize.py b/pytensor/tensor/optimize.py index d3f8137f87..898ad49f2e 100644 --- a/pytensor/tensor/optimize.py +++ b/pytensor/tensor/optimize.py @@ -52,6 +52,9 @@ def __init__(self, fn, copy_x: bool = False): self.last_result = None self.copy_x = copy_x + # Scipy does not respect dtypes *at all*, so we have to force it ourselves. + self.dtype = fn.maker.fgraph.inputs[0].type.dtype + self.cache_hits = 0 self.cache_misses = 0 @@ -67,9 +70,7 @@ def __call__(self, x, *args): If the input `x` is the same as the last input, return the cached result. Otherwise update the cache with the new input and result. """ - # scipy.optimize.scalar_minimize and scalar_root don't take initial values as an argument, so we can't control - # the first input to the inner function. Of course, they use a scalar, but we need a 0d numpy array. - x = np.asarray(x) + x = x.astype(self.dtype) if self.last_result is None or not (x == self.last_x).all(): self.cache_misses += 1 @@ -160,6 +161,7 @@ def _get_parameter_grads_from_vector( ) grad_wrt_args.append(dot(output_grad, arg_grad)) + cursor += arg_size return grad_wrt_args @@ -175,17 +177,11 @@ def build_fn(self): """ outputs = self.inner_outputs self._fn = fn = function(self.inner_inputs, outputs, trust_input=True) + # Do this reassignment to see the compiled graph in the dprint # self.fgraph = fn.maker.fgraph - if self.inner_inputs[0].type.shape == (): - - def fn_wrapper(x, *args): - return fn(x.squeeze(), *args) - - self._fn_wrapped = LRUCache1(fn_wrapper) - else: - self._fn_wrapped = LRUCache1(fn) + self._fn_wrapped = LRUCache1(fn) @property def fn(self): @@ -771,7 +767,9 @@ def perform(self, node, inputs, outputs): **self.optimizer_kwargs, ) - outputs[0][0] = res.x.reshape(variables.shape) + # There's a reshape here to cover the case where variables is a scalar. Scipy will still return a + # (1, 1) matrix in in this case, which causes errors downstream (since pytensor expects a scalar). 
+ outputs[0][0] = res.x.reshape(variables.shape).astype(variables.dtype) outputs[1][0] = np.bool_(res.success) def L_op( @@ -807,12 +805,20 @@ def root( variables: TensorVariable, method: str = "hybr", jac: bool = True, + optimizer_kwargs: dict | None = None, ): """Find roots of a system of equations using scipy.optimize.root.""" args = _find_optimization_parameters(equations, variables) - root_op = RootOp(variables, *args, equations=equations, method=method, jac=jac) + root_op = RootOp( + variables, + *args, + equations=equations, + method=method, + jac=jac, + optimizer_kwargs=optimizer_kwargs, + ) return root_op(variables, *args) diff --git a/tests/tensor/test_optimize.py b/tests/tensor/test_optimize.py index 2ff80d2367..6c2fdaa6ee 100644 --- a/tests/tensor/test_optimize.py +++ b/tests/tensor/test_optimize.py @@ -59,7 +59,12 @@ def test_simple_minimize(): minimized_x_val, success_val = f(a_val, c_val, 0.0) assert success_val - assert minimized_x_val == (2 * a_val * c_val) + np.testing.assert_allclose( + minimized_x_val, + 2 * a_val * c_val, + atol=1e-8 if config.floatX == "float64" else 1e-6, + rtol=1e-8 if config.floatX == "float64" else 1e-6, + ) def f(x, a, b): objective = (x - a * b) ** 2 @@ -82,7 +87,7 @@ def test_minimize_vector_x(method, jac, hess): def rosenbrock_shifted_scaled(x, a, b): return (a * (x[1:] - x[:-1] ** 2) ** 2 + (1 - x[:-1]) ** 2).sum() + b - x = pt.dvector("x") + x = pt.tensor("x", shape=(None,)) a = pt.scalar("a") b = pt.scalar("b") @@ -91,23 +96,30 @@ def rosenbrock_shifted_scaled(x, a, b): objective, x, method=method, jac=jac, hess=hess, optimizer_kwargs={"tol": 1e-16} ) - a_val = 0.5 - b_val = 1.0 - x0 = np.zeros(5).astype(floatX) - x_star_val = minimized_x.eval({a: a_val, b: b_val, x: x0}) + fn = pytensor.function([x, a, b], [minimized_x, success]) - assert success.eval({a: a_val, b: b_val, x: x0}) + a_val = np.array(0.5, dtype=floatX) + b_val = np.array(1.0, dtype=floatX) + x0 = np.zeros((5,)).astype(floatX) + x_star_val, success = fn(x0, a_val, b_val) + + assert success np.testing.assert_allclose( - x_star_val, np.ones_like(x_star_val), atol=1e-6, rtol=1e-6 + x_star_val, + np.ones_like(x_star_val), + atol=1e-8 if config.floatX == "float64" else 1e-3, + rtol=1e-8 if config.floatX == "float64" else 1e-3, ) + assert x_star_val.dtype == floatX + def f(x, a, b): objective = rosenbrock_shifted_scaled(x, a, b) out = minimize(objective, x)[0] return out - utt.verify_grad(f, [x0, a_val, b_val], eps=1e-6) + utt.verify_grad(f, [x0, a_val, b_val], eps=1e-3 if floatX == "float32" else 1e-6) @pytest.mark.parametrize( @@ -130,7 +142,12 @@ def fn(x, a): solution, success = func(x0, a_val) assert success - np.testing.assert_allclose(solution, -1.02986653, atol=1e-6, rtol=1e-6) + np.testing.assert_allclose( + solution, + -1.02986653, + atol=1e-8 if config.floatX == "float64" else 1e-6, + rtol=1e-8 if config.floatX == "float64" else 1e-6, + ) def root_fn(x, a): f = fn(x, a) @@ -147,7 +164,7 @@ def fn(x, a): return x + 2 * a * pt.cos(x) f = fn(x, a) - root_f, success = root(f, x) + root_f, success = root(f, x, method="lm", optimizer_kwargs={"tol": 1e-8}) func = pytensor.function([x, a], [root_f, success]) x0 = 0.0 @@ -155,7 +172,12 @@ def fn(x, a): solution, success = func(x0, a_val) assert success - np.testing.assert_allclose(solution, -1.02986653, atol=1e-6, rtol=1e-6) + np.testing.assert_allclose( + solution, + -1.02986653, + atol=1e-8 if config.floatX == "float64" else 1e-6, + rtol=1e-8 if config.floatX == "float64" else 1e-6, + ) def root_fn(x, a): f = fn(x, a) @@ 
-165,24 +187,27 @@ def root_fn(x, a): def test_root_system_of_equations(): - x = pt.dvector("x") - a = pt.dvector("a") - b = pt.dvector("b") + x = pt.tensor("x", shape=(None,)) + a = pt.tensor("a", shape=(None,)) + b = pt.tensor("b", shape=(None,)) f = pt.stack([a[0] * x[0] * pt.cos(x[1]) - b[0], x[0] * x[1] - a[1] * x[1] - b[1]]) - root_f, success = root(f, x) + root_f, success = root(f, x, method="lm", optimizer_kwargs={"tol": 1e-8}) func = pytensor.function([x, a, b], [root_f, success]) - x0 = np.array([1.0, 1.0]) - a_val = np.array([1.0, 1.0]) - b_val = np.array([4.0, 5.0]) + x0 = np.array([1.0, 1.0], dtype=floatX) + a_val = np.array([1.0, 1.0], dtype=floatX) + b_val = np.array([4.0, 5.0], dtype=floatX) solution, success = func(x0, a_val, b_val) assert success np.testing.assert_allclose( - solution, np.array([6.50409711, 0.90841421]), atol=1e-6, rtol=1e-6 + solution, + np.array([6.50409711, 0.90841421]), + atol=1e-8 if config.floatX == "float64" else 1e-6, + rtol=1e-8 if config.floatX == "float64" else 1e-6, ) def root_fn(x, a, b): @@ -191,4 +216,6 @@ def root_fn(x, a, b): ) return root(f, x)[0] - utt.verify_grad(root_fn, [x0, a_val, b_val], eps=1e-6) + utt.verify_grad( + root_fn, [x0, a_val, b_val], eps=1e-6 if floatX == "float64" else 1e-3 + ) From e2e2c84bbc3019b565bcaa85c1639e3cbbd53c1d Mon Sep 17 00:00:00 2001 From: jessegrabowski Date: Tue, 10 Jun 2025 18:36:37 +0800 Subject: [PATCH 20/22] Check inputs to RootOp --- pytensor/tensor/optimize.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pytensor/tensor/optimize.py b/pytensor/tensor/optimize.py index 898ad49f2e..9bb5d97d71 100644 --- a/pytensor/tensor/optimize.py +++ b/pytensor/tensor/optimize.py @@ -712,6 +712,15 @@ def __init__( jac: bool = True, optimizer_kwargs: dict | None = None, ): + if variables.ndim != equations.ndim: + raise ValueError( + "The variable `variables` must have the same number of dimensions as the equations." + ) + if variables not in ancestors([equations]): + raise ValueError( + "The variable `variables` must be an input to the computational graph of the equations." + ) + self.fgraph = FunctionGraph([variables, *args], [equations]) if jac: From 0cd959ebcfb704b4b2681599b4b3d7b574866364 Mon Sep 17 00:00:00 2001 From: jessegrabowski Date: Tue, 10 Jun 2025 18:42:29 +0800 Subject: [PATCH 21/22] More mypy --- pytensor/tensor/optimize.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pytensor/tensor/optimize.py b/pytensor/tensor/optimize.py index 9bb5d97d71..66ade0e66f 100644 --- a/pytensor/tensor/optimize.py +++ b/pytensor/tensor/optimize.py @@ -362,11 +362,11 @@ def __init__( method: str = "brent", optimizer_kwargs: dict | None = None, ): - if not x.ndim == 0: + if not cast(TensorVariable, x).ndim == 0: raise ValueError( "The variable `x` must be a scalar (0-dimensional) tensor for minimize_scalar." ) - if not objective.ndim == 0: + if not cast(TensorVariable, objective).ndim == 0: raise ValueError( "The objective function must be a scalar (0-dimensional) tensor for minimize_scalar." ) @@ -455,11 +455,11 @@ def __init__( hessp: bool = False, optimizer_kwargs: dict | None = None, ): - if not objective.ndim == 0: + if not cast(TensorVariable, objective).ndim == 0: raise ValueError( "The objective function must be a scalar (0-dimensional) tensor for minimize." 
) - if not isinstance(x, Variable) and x not in ancestors([objective]): + if x not in ancestors([objective]): raise ValueError( "The variable `x` must be an input to the computational graph of the objective function." ) @@ -712,7 +712,7 @@ def __init__( jac: bool = True, optimizer_kwargs: dict | None = None, ): - if variables.ndim != equations.ndim: + if cast(TensorVariable, variables).ndim != cast(TensorVariable, equations).ndim: raise ValueError( "The variable `variables` must have the same number of dimensions as the equations." ) From bfa63f6a450acaf61d244c712db37776dc971e48 Mon Sep 17 00:00:00 2001 From: jessegrabowski Date: Tue, 10 Jun 2025 19:05:44 +0800 Subject: [PATCH 22/22] Fix bug when minimize gets scalar input --- pytensor/tensor/optimize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytensor/tensor/optimize.py b/pytensor/tensor/optimize.py index 66ade0e66f..09a11563bb 100644 --- a/pytensor/tensor/optimize.py +++ b/pytensor/tensor/optimize.py @@ -503,7 +503,7 @@ def perform(self, node, inputs, outputs): f.clear_cache() - outputs[0][0] = res.x.astype(x0.dtype) + outputs[0][0] = res.x.reshape(x0.shape).astype(x0.dtype) outputs[1][0] = np.bool_(res.success) def L_op(self, inputs, outputs, output_grads):
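
The following is not part of the patch series; it is a minimal usage sketch of the interface the patches build up, based on the signatures and tests shown in the diffs (`minimize(objective, x, ...)` returning a `(solution, success)` pair, with gradients of the solution supplied through the implicit function theorem). The specific objective, method, and tolerance below are illustrative assumptions rather than values taken from the test suite.

import pytensor
import pytensor.tensor as pt
from pytensor.tensor.optimize import minimize

# Symbolic minimization of (x - a*b)**2, whose solution is x* = a*b.
x = pt.scalar("x")
a = pt.scalar("a")
b = pt.scalar("b")

objective = (x - a * b) ** 2
x_star, success = minimize(objective, x, method="BFGS", optimizer_kwargs={"tol": 1e-12})

fn = pytensor.function([x, a, b], [x_star, success])
print(fn(0.0, 2.0, 3.0))  # expect x* close to 6.0 and success == True

# Because MinimizeOp.L_op applies the implicit function theorem, the solution is
# differentiable with respect to the parameters a and b (the gradient with respect
# to the initial value x is zero by construction).
dx_star_da = pytensor.grad(x_star, wrt=a)
print(pytensor.function([x, a, b], dx_star_da)(0.0, 2.0, 3.0))  # expect a value close to b = 3.0

`minimize_scalar`, `root`, and `root_scalar` follow the same pattern: each returns the solution together with a boolean success flag, and they differ only in which scipy routine they wrap and which derivative flags (`jac`, `hess`) they accept.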