From 67a54cd8a04a32670e4a3e5dadcb28b590eed307 Mon Sep 17 00:00:00 2001
From: "chengfan.jcf"
Date: Thu, 27 May 2021 22:52:04 +0800
Subject: [PATCH 1/4] [to #34725944] Add cuda & x86 schedules for fast_softmax

---
 python/tvm/relay/op/strategy/cuda.py    | 12 ++++++++++++
 python/tvm/relay/op/strategy/x86.py     | 12 ++++++++++++
 python/tvm/topi/cuda/softmax.py         | 21 ++++++++++++++++++---
 python/tvm/topi/x86/nn.py               | 11 +++++++++++
 tests/python/relay/test_op_fast_math.py | 11 +++++++----
 5 files changed, 60 insertions(+), 7 deletions(-)

diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index 6c5b1e0cdead..c4b3863707bd 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -89,6 +89,18 @@ def softmax_strategy_cuda(attrs, inputs, out_type, target):
     return strategy
 
 
+@fast_softmax_strategy.register(["cuda", "gpu"])
+def fast_softmax_strategy_cuda(attrs, inputs, out_type, target):
+    """fast softmax cuda strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_softmax(topi.nn.fast_softmax),
+        wrap_topi_schedule(topi.cuda.schedule_softmax),
+        name="fast_softmax.cuda",
+    )
+    return strategy
+
+
 @schedule_log_softmax.register(["cuda", "gpu"])
 def schedule_log_softmax_cuda(attrs, outs, target):
     """scheudle log_softmax for cuda"""
diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index 60bd92ef63d1..22df9d431b01 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -79,6 +79,18 @@ def softmax_strategy_cpu(attrs, inputs, out_type, target):
     return strategy
 
 
+@fast_softmax_strategy.register("cpu")
+def softmax_strategy_cpu(attrs, inputs, out_type, target):
+    """fast_softmax x86 strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_softmax(topi.nn.fast_softmax),
+        wrap_topi_schedule(topi.x86.schedule_softmax),
+        name="fast_softmax.x86",
+    )
+    return strategy
+
+
 @schedule_log_softmax.register("cpu")
 def schedule_log_softmax_cpu(attrs, outs, target):
     """schedule log_softmax op for x86"""
diff --git a/python/tvm/topi/cuda/softmax.py b/python/tvm/topi/cuda/softmax.py
index 99fbdd0367db..b743aefc50d5 100644
--- a/python/tvm/topi/cuda/softmax.py
+++ b/python/tvm/topi/cuda/softmax.py
@@ -47,8 +47,15 @@ def schedule_softmax(outs):
         expsum = softmax.op.input_tensors[1]
         exp = softmax.op.input_tensors[0]
         max_elem = s[exp].op.input_tensors[1]
+        delta = None
+    elif op_tag == "fast_softmax_output":
+        expsum = softmax.op.input_tensors[1]
+        exp = softmax.op.input_tensors[0]
+        delta = s[exp].op.input_tensors[0]
+        max_elem = s[delta].op.input_tensors[1]
     elif op_tag == "log_softmax_output":
         exp = None
+        delta = None
         max_elem = softmax.op.input_tensors[1]
         expsum = softmax.op.input_tensors[2]
     else:
@@ -73,6 +80,8 @@ def sched_warp_softmax():
 
     if len(softmax.shape) > 2:
         ops = [max_elem.op, expsum.op, softmax.op]
+        if delta is not None:
+            ops.append(delta.op)
         if exp is not None:
             ops.append(exp.op)
 
@@ -99,7 +108,10 @@ def sched_warp_softmax():
         s[expsum].compute_at(s[softmax], xo)
 
         # (2) exp
-        if exp is not None:
+        if delta is not None:
+            s[exp].compute_inline()
+            s[delta].compute_inline()
+        elif exp is not None:
             xo, xi = s[exp].split(exp.op.axis[1], nparts=num_thread)
             _, xii = s[exp].split(xi, factor=4)
             s[exp].vectorize(xii)
@@ -112,7 +124,7 @@ def sched_warp_softmax():
         k = max_elem.op.reduce_axis[0]
         ko, _ = s[max_elem].split(k, nparts=num_thread)
         s[max_elem].bind(ko, thread_x)
-        if exp is not None:
+        if exp is not None and delta is None:
             s[max_elem].compute_at(s[exp], xo)
         else:
             s[max_elem].bind(ko, thread_x)
@@ -123,7 +135,10 @@ def sched_warp_softmax():
         block_x = te.thread_axis("blockIdx.x")
         thread_x = te.thread_axis((0, num_thread), "threadIdx.x")
 
-        if exp is not None:
+        if delta is not None:
+            s[exp].compute_inline()
+            s[delta].compute_inline()
+        elif exp is not None:
             s[exp].bind(exp.op.axis[0], block_x)
 
         s[max_elem].bind(max_elem.op.axis[0], block_x)
diff --git a/python/tvm/topi/x86/nn.py b/python/tvm/topi/x86/nn.py
index 0994700fe98c..be89ecc7988f 100644
--- a/python/tvm/topi/x86/nn.py
+++ b/python/tvm/topi/x86/nn.py
@@ -42,9 +42,17 @@ def schedule_softmax(outs):
         exp = softmax.op.input_tensors[0]
         expsum = softmax.op.input_tensors[1]
         max_elem = s[exp].op.input_tensors[1]
+        delta = None
+        axis = int(softmax.op.attrs["axis"])
+    if op_tag == "fast_softmax_output":
+        exp = softmax.op.input_tensors[0]
+        expsum = softmax.op.input_tensors[1]
+        delta = s[exp].op.input_tensors[0]
+        max_elem = s[delta].op.input_tensors[1]
         axis = int(softmax.op.attrs["axis"])
     elif op_tag == "log_softmax_output":
         exp = None
+        delta = None
         max_elem = softmax.op.input_tensors[1]
         expsum = softmax.op.input_tensors[2]
         axis = 1
@@ -65,6 +73,9 @@ def schedule_softmax(outs):
     s[max_elem].compute_at(s[softmax], fused_outer_axes)
     s[expsum].compute_at(s[softmax], fused_outer_axes)
 
+    if delta is not None:
+        s[exp].compute_inline()
+        s[delta].compute_inline()
     if exp is not None:
         s[exp].compute_at(s[softmax], fused_outer_axes)
 
diff --git a/tests/python/relay/test_op_fast_math.py b/tests/python/relay/test_op_fast_math.py
index c9314fae37ac..f968dbedddfe 100644
--- a/tests/python/relay/test_op_fast_math.py
+++ b/tests/python/relay/test_op_fast_math.py
@@ -23,9 +23,11 @@
 from tvm import topi
 from tvm import te
 from tvm.contrib import graph_executor
+from tvm.topi import testing
 
 
-def test_fastmath():
+@tvm.testing.parametrize_targets("llvm", "cuda")
+def test_fastmath(target, dev):
     def test_apply(relay_op, name, f_numpy, low, high, step, dtype="float32"):
         a_np = np.arange(low, high, step).astype(dtype).reshape((1, -1))
         b_np = f_numpy(a_np)
@@ -36,13 +38,14 @@ def test_apply(relay_op, name, f_numpy, low, high, step, dtype="float32"):
         mod = tvm.IRModule.from_expr(func)
 
         with tvm.transform.PassContext(opt_level=3, required_pass=["FastMath"]):
-            graph, lib, params = relay.build(mod, target="llvm", params=None)
+            graph, lib, params = relay.build(mod, target=target, params=None)
 
         # Check that the op related to fast math have been convered to function in lib
         func_name = "fused_" + name
-        assert lib.get_function(func_name)
+        # When there're multiple targets in tvm.testing.parametrize_targets, the function
+        # built will have a "_1" in function name
+        assert func_name in graph
 
-        dev = tvm.cpu(0)
         m = graph_executor.create(graph, lib, dev)
         # Set inputs
         m.set_input("x", tvm.nd.array(a_np, dev))

From fd899590bd029f2110ac25af3c6a45d971287d53 Mon Sep 17 00:00:00 2001
From: "chengfan.jcf"
Date: Fri, 28 May 2021 09:41:46 +0800
Subject: [PATCH 2/4] Update

---
 python/tvm/relay/op/strategy/x86.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index 22df9d431b01..c21ec4d13906 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -80,7 +80,7 @@ def softmax_strategy_cpu(attrs, inputs, out_type, target):
 
 
 @fast_softmax_strategy.register("cpu")
-def softmax_strategy_cpu(attrs, inputs, out_type, target):
+def fast_softmax_strategy_cpu(attrs, inputs, out_type, target):
     """fast_softmax x86 strategy"""
     strategy = _op.OpStrategy()
     strategy.add_implementation(

From 47151f132d273013fc165d028f04e2cb0013d423 Mon Sep 17 00:00:00 2001
From: "chengfan.jcf"
Date: Fri, 28 May 2021 10:53:58 +0800
Subject: [PATCH 3/4] Bug fix

---
 python/tvm/topi/x86/nn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/topi/x86/nn.py b/python/tvm/topi/x86/nn.py
index be89ecc7988f..4c39f2ad7382 100644
--- a/python/tvm/topi/x86/nn.py
+++ b/python/tvm/topi/x86/nn.py
@@ -44,7 +44,7 @@ def schedule_softmax(outs):
         max_elem = s[exp].op.input_tensors[1]
         delta = None
         axis = int(softmax.op.attrs["axis"])
-    if op_tag == "fast_softmax_output":
+    elif op_tag == "fast_softmax_output":
         exp = softmax.op.input_tensors[0]
         expsum = softmax.op.input_tensors[1]
         delta = s[exp].op.input_tensors[0]

From 20318ba0b8ea7c112d16200d1604667039648e39 Mon Sep 17 00:00:00 2001
From: "chengfan.jcf"
Date: Mon, 31 May 2021 10:36:03 +0800
Subject: [PATCH 4/4] Re-trigger CI

---
 python/tvm/relay/op/strategy/cuda.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index c4b3863707bd..dfdde2b3506d 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -91,7 +91,7 @@ def softmax_strategy_cuda(attrs, inputs, out_type, target):
 
 @fast_softmax_strategy.register(["cuda", "gpu"])
 def fast_softmax_strategy_cuda(attrs, inputs, out_type, target):
-    """fast softmax cuda strategy"""
+    """fast_softmax cuda strategy"""
     strategy = _op.OpStrategy()
     strategy.add_implementation(
         wrap_compute_softmax(topi.nn.fast_softmax),
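
Usage sketch (illustrative only, not part of the patches above): the updated test exercises the new strategies end to end, and the same flow can be reproduced by hand. The snippet below builds a plain Relay softmax with the FastMath pass required, so the op is rewritten to nn.fast_softmax and picks up the fast_softmax.x86 / fast_softmax.cuda implementations registered in this series. The (1, 128) shape and the "llvm" target are arbitrary choices for the example; swap in "cuda" to hit the GPU schedule.

    import numpy as np
    import tvm
    from tvm import relay
    from tvm.contrib import graph_executor

    # Build a plain softmax; the FastMath pass rewrites it to nn.fast_softmax.
    x = relay.var("x", shape=(1, 128), dtype="float32")
    func = relay.Function([x], relay.nn.softmax(x))
    mod = tvm.IRModule.from_expr(func)

    # Requiring FastMath makes the build go through the fast_softmax strategy
    # added here (use target="cuda" for the CUDA schedule instead of "llvm").
    with tvm.transform.PassContext(opt_level=3, required_pass=["FastMath"]):
        graph, lib, params = relay.build(mod, target="llvm", params=None)

    dev = tvm.cpu(0)
    m = graph_executor.create(graph, lib, dev)
    m.set_input("x", tvm.nd.array(np.random.uniform(size=(1, 128)).astype("float32"), dev))
    m.run()
    out = m.get_output(0).numpy()  # should match reference softmax within fast-math tolerance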