From 10c80a5d769515eaee70aa93657b9f09d2a0c91f Mon Sep 17 00:00:00 2001
From: Taylor Robie
Date: Wed, 22 Oct 2025 09:24:00 -0700
Subject: [PATCH] Use fastmath in GeLU (TritonBench) (#506)

Summary:
Use fastmath intrinsics in `tanh_approx_fp32` when we don't have a native fast tanh instruction.

We need to use the sigmoid formulation rather than dividing two exponentials, for numerical stability.

Reviewed By: xuzhao9

Differential Revision: D83082730
---
 tritonbench/operators/gdpa/math.py | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/tritonbench/operators/gdpa/math.py b/tritonbench/operators/gdpa/math.py
index 7fedaf290..5f948a6fa 100644
--- a/tritonbench/operators/gdpa/math.py
+++ b/tritonbench/operators/gdpa/math.py
@@ -28,6 +28,13 @@
 from triton.language.math import fast_dividef, fast_expf
 
 
+HAS_FAST_TANH_INSTRUCTION = (
+    torch.version.cuda is not None
+    and torch.cuda.is_available()
+    and torch.cuda.get_device_capability()[0] >= 9  # >= H100
+)
+
+
 # Don't change the order of the enum values, as they are used to index
 # Only add new activation functions at the end of the enum
 class Activation(str, Enum):
@@ -50,17 +57,6 @@ def activation_string_to_int(s: str):
     return activation_to_int.get(enum_val)
 
 
-def is_hip_or_a100():
-    try:
-        if triton.runtime.driver.active.get_current_target().backend == "hip":
-            return True
-        elif torch.cuda.get_device_capability()[0] < 9:  # A100
-            return True
-        return False
-    except Exception:
-        return False
-
-
 @triton.jit
 def tanh(x):
     # Tanh is just a scaled sigmoid
@@ -79,11 +75,11 @@ def gelu_grad(x):
     return cdf + x * pdf
 
 
-if is_hip_or_a100():
-    # For AMD or A100, use tanh as a fallback
+if not HAS_FAST_TANH_INSTRUCTION:
+
     @triton.jit
     def tanh_approx_fp32(x):
-        return tanh(x)
+        return 2 * fast_dividef(1.0, 1.0 + fast_expf(-2.0 * x)) - 1.0
 
 else:
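
Note on the numerical-stability point in the summary: the ratio-of-exponentials form of tanh, (exp(2x) - 1) / (exp(2x) + 1), overflows in fp32 once exp(2x) exceeds the float32 range (x above roughly 44), producing inf / inf = nan, while the sigmoid form 2 / (1 + exp(-2x)) - 1 only ever underflows or overflows in the denominator and saturates cleanly at +/-1. The standalone sketch below (not part of the patch; NumPy float32 is used here as a stand-in for the fp32 fastmath path, whereas the patch itself uses Triton's fast_expf / fast_dividef inside a @triton.jit kernel) illustrates the difference.

# Standalone illustration of the two tanh formulations in float32.
import numpy as np


def tanh_ratio_of_exponentials(x):
    # tanh(x) = (exp(2x) - 1) / (exp(2x) + 1)
    # exp(2x) overflows float32 for x > ~44, giving inf / inf = nan.
    e2x = np.exp(np.float32(2.0) * np.float32(x))
    return (e2x - np.float32(1.0)) / (e2x + np.float32(1.0))


def tanh_sigmoid_form(x):
    # tanh(x) = 2 * sigmoid(2x) - 1 = 2 / (1 + exp(-2x)) - 1
    # exp(-2x) underflows to 0 for large positive x and overflows to inf
    # for large negative x; either way the result saturates at +/-1
    # instead of becoming nan.
    e_neg_2x = np.exp(np.float32(-2.0) * np.float32(x))
    return np.float32(2.0) / (np.float32(1.0) + e_neg_2x) - np.float32(1.0)


with np.errstate(all="ignore"):  # suppress the expected overflow warnings
    print(tanh_ratio_of_exponentials(100.0))  # nan
    print(tanh_sigmoid_form(100.0))           # 1.0
    print(tanh_sigmoid_form(-100.0))          # -1.0

This is why the new fallback computes 2 * fast_dividef(1.0, 1.0 + fast_expf(-2.0 * x)) - 1.0 rather than forming a ratio of two fast_expf results.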