12 changes: 6 additions & 6 deletions docs/articles/2023-06-20-introducing-onnx-script/index.md
@@ -326,12 +326,12 @@ Finally, a _huge thank you_ to the wonderful engineering team at Microsoft that
[onnx-expand-operator]: https://onnx.ai/onnx/operators/onnx__Expand.html#expand-13
[onnxscript-pypi]: https://pypi.org/project/onnxscript
[onnxscript-github]: https://github.com/microsoft/onnxscript
[torch-onnx]: https://pytorch.org/docs/stable/onnx.html
[torch-ir]: https://pytorch.org/docs/stable/ir.html
[torch-dynamo]: https://pytorch.org/docs/stable/dynamo/index.html
[torch-onnx-dynamoexport]: https://pytorch.org/docs/main/onnx.html#preview-torch-onnx-torchdynamo-exporter
[torch-onnx-customops]: https://pytorch.org/docs/stable/onnx.html#onnx-script-functions
[torch-chunk]: https://pytorch.org/docs/stable/generated/torch.chunk.html
[torch-onnx]: https://docs.pytorch.org/docs/stable/onnx.html
[torch-ir]: https://docs.pytorch.org/docs/stable/ir.html
[torch-dynamo]: https://docs.pytorch.org/docs/stable/dynamo/index.html
[torch-onnx-dynamoexport]: https://docs.pytorch.org/docs/stable/onnx.html#preview-torch-onnx-torchdynamo-exporter
[torch-onnx-customops]: https://docs.pytorch.org/docs/stable/onnx.html#onnx-script-functions
[torch-chunk]: https://docs.pytorch.org/docs/stable/generated/torch.chunk.html
[netron]: https://netron.app
[numpy]: https://numpy.org
[pdb]: https://docs.python.org/3/library/pdb.html
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -92,7 +92,7 @@
"onnx_ir": ("https://onnx.ai/ir-py/", None),
"onnxruntime": ("https://onnxruntime.ai/docs/api/python/", None),
"scipy": ("https://docs.scipy.org/doc/scipy/", None),
"torch": ("https://pytorch.org/docs/main/", None),
"torch": ("https://docs.pytorch.org/docs/stable/", None),
}

# -- Options for Sphinx Gallery ----------------------------------------------
8 changes: 4 additions & 4 deletions onnxscript/function_libs/torch_lib/ops/core.py
Collaborator comment: "We can still keep the changes here and in the sphinx docs"

@@ -243,7 +243,7 @@ def aten_addmm(
alpha = float(alpha)
beta = float(beta)

# addmm only accepts 2d tensors: https://pytorch.org/docs/stable/generated/torch.addmm.html
# addmm only accepts 2d tensors: https://docs.pytorch.org/docs/stable/generated/torch.addmm.html
return op.Gemm(mat1, mat2, self, alpha=alpha, beta=beta)
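As a hedged sanity check of this Gemm mapping (illustrative shapes and values, not part of this change), torch.addmm computes beta * input + alpha * (mat1 @ mat2), which is what Gemm with A=mat1, B=mat2, C=self expresses:

```python
# Hedged check: torch.addmm(self, mat1, mat2, beta=beta, alpha=alpha)
# equals beta * self + alpha * (mat1 @ mat2), i.e. Gemm(A=mat1, B=mat2, C=self).
import torch

self_t = torch.randn(3, 4)
mat1 = torch.randn(3, 5)
mat2 = torch.randn(5, 4)
alpha, beta = 2.0, 0.5

expected = beta * self_t + alpha * (mat1 @ mat2)
actual = torch.addmm(self_t, mat1, mat2, beta=beta, alpha=alpha)
assert torch.allclose(actual, expected, atol=1e-6)
```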


@@ -3710,7 +3710,7 @@ def aten_frac(self: TFloat) -> TFloat:
Computes the fractional portion of each element in input.
"""

# https://pytorch.org/docs/stable/generated/torch.frac.html
# https://docs.pytorch.org/docs/stable/generated/torch.frac.html
return op.Sub(self, op.Mul(op.Floor(op.Abs(self)), op.Sign(self)))
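A hedged check of the decomposition used here, frac(x) = x - floor(|x|) * sign(x), against torch.frac (illustrative values only):

```python
# Hedged check of frac(x) = x - floor(|x|) * sign(x).
import torch

x = torch.tensor([-2.5, -0.75, 0.0, 0.75, 2.5])
decomposed = x - torch.floor(torch.abs(x)) * torch.sign(x)
assert torch.allclose(decomposed, torch.frac(x))  # [-0.5, -0.75, 0.0, 0.75, 0.5]
```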


@@ -6311,7 +6311,7 @@ def aten_native_layer_norm(
) -> Tuple[TReal, TReal, TReal]:
"""native_layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? bias, float eps) -> (Tensor, Tensor, Tensor)"""

# https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html#torch.nn.LayerNorm
# https://docs.pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html#torch.nn.LayerNorm
# The mean and standard-deviation are calculated over the last D dimensions,
# where D is the dimension of normalized_shape. For example, if normalized_shape is
# (3, 5) (a 2-dimensional shape), the mean and standard-deviation are computed
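For context, a hedged restatement of the statistics described in that comment, checked against torch.nn.functional.layer_norm (this is not the exporter's decomposition; shapes are illustrative):

```python
# Hedged sketch: mean/variance are taken over the trailing dims in normalized_shape.
import torch

x = torch.randn(2, 4, 3, 5)
normalized_shape = (3, 5)                       # D = 2 trailing dimensions
dims = tuple(range(-len(normalized_shape), 0))  # (-2, -1)

mean = x.mean(dim=dims, keepdim=True)
var = x.var(dim=dims, unbiased=False, keepdim=True)
manual = (x - mean) / torch.sqrt(var + 1e-5)

reference = torch.nn.functional.layer_norm(x, normalized_shape, eps=1e-5)
assert torch.allclose(manual, reference, atol=1e-5)
```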
@@ -7901,7 +7901,7 @@ def aten_slice_scatter(
# And, 'end' also must be specified, and end-start must be equal to the size of 'src'
# Assert(end-start == shape(src) > 0)
# Try torch sample to get more information:
# https://pytorch.org/docs/master/generated/torch.slice_scatter.html?highlight=slice_scatter#torch.slice_scatter
    # https://docs.pytorch.org/docs/stable/generated/torch.slice_scatter.html?highlight=slice_scatter#torch.slice_scatter
# Take (torch.zeros(8, 8), torch.ones(2, 8), 0, 6, 64, 1) as example:
# Step 1: get 1D tensor from 0 to dim_size-1, then Slice it using start, end and step.
# We cannot use Range(start, end, step) directly as start or end may out of range.
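For reference, a hedged illustration of the torch.slice_scatter semantics this decomposition targets (values adapted from the comment above, with end kept in range):

```python
# Hedged illustration: scatter `src` into rows start:end of `base` along dim 0.
import torch

base = torch.zeros(8, 8)
src = torch.ones(2, 8)
out = torch.slice_scatter(base, src, dim=0, start=6, end=8, step=1)
assert torch.equal(out[6:8], src) and torch.equal(out[:6], base[:6])
```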
8 changes: 4 additions & 4 deletions onnxscript/function_libs/torch_lib/ops/nn.py
@@ -1754,7 +1754,7 @@ def aten_scaled_dot_product_attention(
) -> TFloat:
"""scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None, bool enable_gqa=False) -> Tensor

Reference: https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
Reference: https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html

Equivalent to the PyTorch code::
scale_factor = 1 / math.sqrt(Q.size(-1)) if scale is None else scale
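A hedged sketch of that reference computation, simplified to ignore masking, dropout, and GQA (the helper name sdpa_reference is illustrative, not part of the exporter):

```python
# Hedged reference: softmax(Q @ K^T * scale) @ V, with the default scale 1/sqrt(d).
import math
import torch

def sdpa_reference(query, key, value, scale=None):
    scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
    attn_weight = torch.softmax(query @ key.transpose(-2, -1) * scale_factor, dim=-1)
    return attn_weight @ value

q, k, v = (torch.randn(2, 4, 8, 16) for _ in range(3))
expected = torch.nn.functional.scaled_dot_product_attention(q, k, v)
assert torch.allclose(sdpa_reference(q, k, v), expected, atol=1e-5)
```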
@@ -1776,7 +1776,7 @@ def aten_scaled_dot_product_attention(
"conversion of scaled_dot_product_attention not implemented if enable_gqa is True"
)

# Reference: https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
# Reference: https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
if scale is None:
scale = _attention_scale(query)
scale = op.CastLike(scale, query)
@@ -1825,7 +1825,7 @@ def aten__scaled_dot_product_flash_attention(
"""_scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)

One of the implementations of scaled_dot_product_attention.
Reference: https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
Reference: https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html

NOTE: Currently, there are three implementations of nn.scaled_dot_product_attention in PyTorch due to optimization.
However, it's the same implementation from ONNX perspective.
@@ -1964,7 +1964,7 @@ def aten_scaled_dot_product_attention_bool_mask(
) -> TFloat:
"""scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None, bool enable_gqa=False) -> Tensor

Reference: https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
Reference: https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html

Equivalent to the PyTorch code::
scale_factor = 1 / math.sqrt(Q.size(-1)) if scale is None else scale
4 changes: 2 additions & 2 deletions onnxscript/function_libs/torch_lib/ops/special.py
@@ -343,7 +343,7 @@ def aten_special_sinc(self: TFloat) -> TFloat:
"""special_sinc(Tensor self) -> Tensor"""

# This computes the normalized sinc, where the input is multiplied by pi.
# https://pytorch.org/docs/stable/special.html#torch.special.sinc
# https://docs.pytorch.org/docs/stable/special.html#torch.special.sinc
pi_self = self * _MATH_PI

return op.Where(self == 0.0, op.CastLike(1, self), op.Sin(pi_self) / pi_self)
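As a hedged check of the normalized-sinc identity used here, sinc(x) = sin(pi * x) / (pi * x) with sinc(0) = 1 (illustrative values only):

```python
# Hedged check of the normalized sinc identity against torch.special.sinc.
import math
import torch

x = torch.tensor([-1.5, -0.5, 0.0, 0.5, 1.5])
pi_x = x * math.pi
manual = torch.where(x == 0.0, torch.ones_like(x), torch.sin(pi_x) / pi_x)
assert torch.allclose(manual, torch.special.sinc(x))
```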
@@ -365,7 +365,7 @@ def aten_special_xlog1py(self: TensorType, other: TensorType) -> TensorType:
def aten_special_xlogy(self: TFloat, other: TFloat) -> TFloat:
"""special_xlogy(Tensor self, Tensor other) -> Tensor"""

# https://pytorch.org/docs/stable/special.html#torch.special.xlogy
# https://docs.pytorch.org/docs/stable/special.html#torch.special.xlogy
# out := {
# NaN if other == NaN
# 0 if self == 0
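For context (the remaining cases of this comment are elided in the diff), a hedged illustration of the piecewise behavior it describes:

```python
# Hedged illustration: self == 0 yields 0 even though log(0) = -inf;
# otherwise the result is self * log(other).
import torch

self_t = torch.tensor([0.0, 2.0, 3.0])
other = torch.tensor([0.0, 4.0, 0.5])
print(torch.special.xlogy(self_t, other))  # tensor([ 0.0000,  2.7726, -2.0794])
```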
2 changes: 1 addition & 1 deletion onnxscript/function_libs/torch_lib/tensor_typing.py
@@ -26,7 +26,7 @@
)

# NOTE: We do not care about unsigned types beyond UINT8 because PyTorch does not use them.
# More detail can be found: https://pytorch.org/docs/stable/tensors.html
# More detail can be found: https://docs.pytorch.org/docs/stable/tensors.html

_TensorType = Union[
BFLOAT16,
7 changes: 3 additions & 4 deletions onnxscript/onnx_opset/_impl/opset12.py
@@ -7,8 +7,7 @@
# --------------------------------------------------------------------------
# pylint: disable=W0221,W0222,R0901,W0237
# mypy: disable-error-code=override
# ruff: noqa: N801,E741
# ruff: noqa: D214,D402,D405,D411,D412,D416,D417
# ruff: noqa: D402
# --------------------------------------------------------------------------

from __future__ import annotations
@@ -666,7 +665,7 @@ def MaxPool(
subset of the input tensor according to the kernel size and downsampling the
data into the output tensor Y for further processing. The output spatial shape is calculated differently
depending on whether explicit padding is used, where pads is employed, or auto padding is used, where auto_pad is utilized.
With explicit padding (https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html?highlight=maxpool#torch.nn.MaxPool2d):
With explicit padding (https://docs.pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html?highlight=maxpool#torch.nn.MaxPool2d):
```
output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1)
```
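A hedged worked example of the explicit-padding formula for a single spatial axis (all values are illustrative):

```python
# Hedged worked example of the explicit-padding output-shape formula.
import math

input_spatial_shape = 32
pad_shape = 2       # total padding on this axis (begin + end)
kernel_shape = 3
dilation = 1
stride = 2

output_spatial_shape = math.floor(
    (input_spatial_shape + pad_shape - dilation * (kernel_shape - 1) - 1) / stride + 1
)
print(output_spatial_shape)  # 16
```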
@@ -820,7 +819,7 @@ def NegativeLogLikelihoodLoss(
sum(loss) / sum(weight[target[n][d_1][d_2]...[d_k]]]), for all samples.
If "reduction" attribute is set to "sum", the output is a scalar:
sum(loss).
See also https://pytorch.org/docs/stable/nn.html#torch.nn.NLLLoss.
See also https://docs.pytorch.org/docs/stable/nn.html#torch.nn.NLLLoss.
Example 1:
// negative log likelihood loss, "none" reduction
N, C, d1 = 2, 3, 2
5 changes: 2 additions & 3 deletions onnxscript/onnx_opset/_impl/opset13.py
@@ -7,8 +7,7 @@
# --------------------------------------------------------------------------
# pylint: disable=W0221,W0222,R0901,W0237
# mypy: disable-error-code=override
# ruff: noqa: N801,E741
# ruff: noqa: D214,D402,D405,D411,D412,D416,D417
# ruff: noqa: D214, D402, D405, D411, D416, D417
# --------------------------------------------------------------------------

from __future__ import annotations
@@ -2058,7 +2057,7 @@ def NegativeLogLikelihoodLoss(

If "reduction" attribute is set to "sum", the output is a scalar: `sum(loss)`.

See also https://pytorch.org/docs/stable/nn.html#torch.nn.NLLLoss.
See also https://docs.pytorch.org/docs/stable/nn.html#torch.nn.NLLLoss.

Example 1:

38 changes: 3 additions & 35 deletions onnxscript/onnx_opset/_impl/opset16.py
@@ -7,8 +7,7 @@
# --------------------------------------------------------------------------
# pylint: disable=W0221,W0222,R0901,W0237
# mypy: disable-error-code=override
# ruff: noqa: N801,E741
# ruff: noqa: D214,D402,D405,D411,D412,D416,D417
# ruff: noqa: D214, D402, D405, D411, D416
# --------------------------------------------------------------------------

from __future__ import annotations
@@ -128,7 +127,7 @@
They are used to interpolate output values of `Y[N, C, H_out, W_out]`.

The GridSample operator is often used in doing grid generator and sampler in the [Spatial Transformer Networks](https://arxiv.org/abs/1506.02025).
See also in [torch.nn.functional.grid_sample](https://pytorch.org/docs/master/generated/torch.nn.functional.grid_sample.html#torch-nn-functional-grid-sample).
    See also in [torch.nn.functional.grid_sample](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.grid_sample.html#torch-nn-functional-grid-sample).


Args:
@@ -253,38 +252,7 @@
B_If: TypeAlias = BOOL

V_If: TypeAlias = Union[
Optional[Sequence[BFLOAT16]],
Optional[Sequence[BOOL]],
Optional[Sequence[COMPLEX128]],
Optional[Sequence[COMPLEX64]],
Optional[Sequence[DOUBLE]],
Optional[Sequence[FLOAT]],
Optional[Sequence[FLOAT16]],
Optional[Sequence[INT16]],
Optional[Sequence[INT32]],
Optional[Sequence[INT64]],
Optional[Sequence[INT8]],
Optional[Sequence[STRING]],
Optional[Sequence[UINT16]],
Optional[Sequence[UINT32]],
Optional[Sequence[UINT64]],
Optional[Sequence[UINT8]],
Optional[BFLOAT16],
Optional[BOOL],
Optional[COMPLEX128],
Optional[COMPLEX64],
Optional[DOUBLE],
Optional[FLOAT],
Optional[FLOAT16],
Optional[INT16],
Optional[INT32],
Optional[INT64],
Optional[INT8],
Optional[STRING],
Optional[UINT16],
Optional[UINT32],
Optional[UINT64],
Optional[UINT8],
None,

Code scanning (lintrunner) warning RUFF/RUF036: None not at the end of the type annotation. See https://docs.astral.sh/ruff/rules/none-not-at-end-of-union
Sequence[BFLOAT16],
Sequence[BOOL],
Sequence[COMPLEX128],
5 changes: 2 additions & 3 deletions onnxscript/onnx_opset/_impl/opset18.py
Collaborator comment: "onnxscript/onnx_opset/_impl is auto generated - we cannot create manual changes to them. I would suggest reverting"

@@ -7,8 +7,7 @@
# --------------------------------------------------------------------------
# pylint: disable=W0221,W0222,R0901,W0237
# mypy: disable-error-code=override
# ruff: noqa: N801,E741
# ruff: noqa: D214,D402,D405,D411,D412,D416,D417
# ruff: noqa: D402, D405
# --------------------------------------------------------------------------

from __future__ import annotations
@@ -235,7 +234,7 @@ def Col2Im(

The operator rearranges column blocks back into a multidimensional image

Col2Im behaves similarly to PyTorch's fold https://pytorch.org/docs/stable/generated/torch.nn.Fold.html,
Col2Im behaves similarly to PyTorch's fold https://docs.pytorch.org/docs/stable/generated/torch.nn.Fold.html,
but it only supports *batched* multi-dimensional image tensors.
Another implementation in Python with N-dimension support can be found at https://github.com/f-dangel/unfoldNd/.
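For context, a hedged example of the batched torch.nn.Fold behavior that Col2Im is compared to (shapes chosen for illustration):

```python
# Hedged example: fold (N, C * prod(kernel_size), L) columns back into an image.
import torch

fold = torch.nn.Fold(output_size=(4, 5), kernel_size=(2, 2))
columns = torch.randn(1, 3 * 2 * 2, 12)  # L = (4 - 2 + 1) * (5 - 2 + 1) = 12
image = fold(columns)
print(image.shape)  # torch.Size([1, 3, 4, 5])
```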

50 changes: 7 additions & 43 deletions onnxscript/onnx_opset/_impl/opset19.py
@@ -7,8 +7,7 @@
# --------------------------------------------------------------------------
# pylint: disable=W0221,W0222,R0901,W0237
# mypy: disable-error-code=override
# ruff: noqa: N801,E741
# ruff: noqa: D214,D402,D405,D411,D412,D416,D417
# ruff: noqa: D214, D402, D405, D411, D412, D416
# --------------------------------------------------------------------------

from __future__ import annotations
@@ -72,7 +71,7 @@
subset of the input tensor according to the kernel size and downsampling the
data into the output tensor Y for further processing. The output spatial shape is calculated differently
depending on whether explicit padding is used, where pads is employed, or auto padding is used, where auto_pad is utilized.
With explicit padding (https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html?highlight=maxpool#torch.nn.MaxPool2d):
With explicit padding (https://docs.pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html?highlight=maxpool#torch.nn.MaxPool2d):
```
output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1)
```
@@ -701,53 +700,14 @@
B_If: TypeAlias = BOOL

V_If: TypeAlias = Union[
Optional[Sequence[BFLOAT16]],
Optional[Sequence[BOOL]],
Optional[Sequence[COMPLEX128]],
Optional[Sequence[COMPLEX64]],
Optional[Sequence[DOUBLE]],
Optional[Sequence[FLOAT]],
Optional[Sequence[FLOAT16]],
Optional[Sequence[INT16]],
Optional[Sequence[INT32]],
Optional[Sequence[INT64]],
Optional[Sequence[INT8]],
Optional[Sequence[STRING]],
Optional[Sequence[UINT16]],
Optional[Sequence[UINT32]],
Optional[Sequence[UINT64]],
Optional[Sequence[UINT8]],
Optional[BFLOAT16],
Optional[BOOL],
Optional[COMPLEX128],
Optional[COMPLEX64],
Optional[DOUBLE],
Optional[FLOAT],
Optional[FLOAT16],
Optional[FLOAT8E4M3FN],
Optional[FLOAT8E4M3FNUZ],
Optional[FLOAT8E5M2],
Optional[FLOAT8E5M2FNUZ],
Optional[INT16],
Optional[INT32],
Optional[INT64],
Optional[INT8],
Optional[STRING],
Optional[UINT16],
Optional[UINT32],
Optional[UINT64],
Optional[UINT8],
None,

Code scanning (lintrunner) warning RUFF/RUF036: None not at the end of the type annotation. See https://docs.astral.sh/ruff/rules/none-not-at-end-of-union
Sequence[BFLOAT16],
Sequence[BOOL],
Sequence[COMPLEX128],
Sequence[COMPLEX64],
Sequence[DOUBLE],
Sequence[FLOAT],
Sequence[FLOAT16],
Sequence[FLOAT8E4M3FN],
Sequence[FLOAT8E4M3FNUZ],
Sequence[FLOAT8E5M2],
Sequence[FLOAT8E5M2FNUZ],
Sequence[INT16],
Sequence[INT32],
Sequence[INT64],
@@ -777,6 +737,10 @@
UINT32,
UINT64,
UINT8,
Sequence[FLOAT8E4M3FN],
Sequence[FLOAT8E4M3FNUZ],
Sequence[FLOAT8E5M2],
Sequence[FLOAT8E5M2FNUZ],
]

def If(self, cond: B_If, *, else_branch: GraphProto, then_branch: GraphProto) -> V_If:
7 changes: 3 additions & 4 deletions onnxscript/onnx_opset/_impl/opset20.py
@@ -7,8 +7,7 @@
# --------------------------------------------------------------------------
# pylint: disable=W0221,W0222,R0901,W0237
# mypy: disable-error-code=override
# ruff: noqa: N801,E741
# ruff: noqa: D214,D402,D405,D411,D412,D416,D417
# ruff: noqa: D402
# --------------------------------------------------------------------------

from __future__ import annotations
@@ -60,7 +59,7 @@ def AffineGrid(


Generates a 2D or 3D flow field (sampling grid), given a batch of affine matrices theta
(https://pytorch.org/docs/stable/generated/torch.nn.functional.affine_grid.html).
(https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.affine_grid.html).
An affine matrix `theta` is applied to a position tensor represented in its homogeneous expression. Here is an example in 3D:
::

@@ -304,7 +303,7 @@ def GridSample(

The GridSample operator is often used in doing grid generator and sampler in the
[Spatial Transformer Networks](https://arxiv.org/abs/1506.02025).
See also in [torch.nn.functional.grid_sample](https://pytorch.org/docs/stable/generated/torch.nn.functional.grid_sample.html).
See also in [torch.nn.functional.grid_sample](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.grid_sample.html).


Args: