Add type promotion for static T+T logic. #60638

Merged · 13 commits · merged Jan 19, 2024
Changes from 12 commits
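
Before the per-file diffs, a minimal sketch of the behaviour this change enables: automatic dtype promotion for tensor-tensor (T+T) operations in static graphs. This example is not part of the PR; it assumes a CUDA build of Paddle that includes this change, and the warning behaviour and place choice are assumptions.

```python
import numpy as np
import paddle

paddle.enable_static()

main_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog):
    x = paddle.static.data(name='x', shape=[2, 2], dtype='float16')
    y = paddle.static.data(name='y', shape=[2, 2], dtype='float32')
    # Graph construction only emits a promotion warning; the actual cast is
    # inserted later, when the Executor runs process_type_promotion.
    out = x + y

# float16 kernels generally need a GPU place (an assumption of this sketch).
exe = paddle.static.Executor(paddle.CUDAPlace(0))
res = exe.run(
    main_prog,
    feed={
        'x': np.ones([2, 2], dtype='float16'),
        'y': np.ones([2, 2], dtype='float32'),
    },
    fetch_list=[out],
)
print(res[0].dtype)  # expected: float32, the promoted common dtype
```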
1 change: 1 addition & 0 deletions python/paddle/base/__init__.py
@@ -107,6 +107,7 @@
is_compiled_with_rocm,
is_compiled_with_xpu,
name_scope,
process_type_promotion,
program_guard,
require_version,
set_flags,
3 changes: 3 additions & 0 deletions python/paddle/base/executor.py
@@ -41,6 +41,7 @@
get_flags,
in_pir_mode,
paddle_type_to_proto_type,
process_type_promotion,
set_flags,
)
from .incubate.checkpoint import auto_checkpoint as acp
@@ -1770,6 +1771,8 @@ def run(
return_numpy=return_numpy,
)
else:
# do type promotion if necessary
program = process_type_promotion(program)
res = self._run_impl(
program=program,
feed=feed,
106 changes: 106 additions & 0 deletions python/paddle/base/framework.py
@@ -56,6 +56,16 @@
CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName()
_global_flags_ = core.globals()

SUPPORT_PROMOTION_OPS_AND_INPUTNAME = {
"elementwise_add": ['X', 'Y'],
"elementwise_add_grad": ['X', 'Y'],
"elementwise_sub": ['X', 'Y'],
"elementwise_sub_grad": ['X', 'Y'],
"elementwise_mul": ['X', 'Y'],
"elementwise_mul_grad": ['X', 'Y'],
"where": ['X', 'Y'],
}


def _global_flags():
return _global_flags_
@@ -8144,3 +8154,99 @@ def _get_paddle_place_list(places):
ret.append(p)

return ret


def dtype_to_str(in_dtype):
if in_dtype == core.VarDesc.VarType.FP16:
return "fp16"
elif in_dtype == core.VarDesc.VarType.BF16:
return "bf16"
elif in_dtype == core.VarDesc.VarType.FP32:
return "fp32"
elif in_dtype == core.VarDesc.VarType.FP64:
return "fp64"
else:
return None


def add_cast_for_type_promotion(op, block, idx, var_name, out_dtype):
op_device = op.attr('op_device')
cast_name = var_name.name + '.cast_' + dtype_to_str(out_dtype)
out_var = block.create_var(
name=cast_name,
dtype=out_dtype,
persistable=False,
stop_gradient=var_name.stop_gradient,
)
op_role = (
int(core.op_proto_and_checker_maker.OpRole.Forward)
if not op.has_attr('op_role')
else op.attr('op_role')
)
block._insert_op_without_sync(
idx,
type="cast",
inputs={"X": var_name},
outputs={"Out": out_var},
attrs={
"in_dtype": var_name.dtype,
"out_dtype": out_var.dtype,
"op_device": op_device,
"op_role": op_role,
},
)
op.desc._rename_input(var_name.name, out_var.name)


def process_type_promotion(program):
org_program = program
if program is None:
program = default_main_program()
# PIR programs are not supported for now
if not isinstance(program, Program):
return org_program
global_block = program.global_block()
all_params = global_block.all_parameters()
for block in program.blocks:
ops = block.ops
idx = 0
while idx < len(ops):
op = ops[idx]
var_name = None
all_dtypes = []
all_input_name_need_cast = []

need_transed_var_names = SUPPORT_PROMOTION_OPS_AND_INPUTNAME.get(
op.type, None
)
# type promotion only supports some dyadic APIs
if need_transed_var_names is None:
idx += 1
continue

# get all dtype and input_name
for input_idx in range(len(op.input_arg_names)):
if op.input_names[input_idx] in need_transed_var_names:
input_arg_name = op.input_arg_names[input_idx]
all_dtypes.append(
op.block._var_recursive(input_arg_name).dtype
)
all_input_name_need_cast.append(input_arg_name)

# only promotion between floating-point dtypes is supported
if core.need_type_promotion(*all_dtypes):
common_dtype = core.get_promote_dtype(op.type, *all_dtypes)
for input_name_need_cast in all_input_name_need_cast:
var_name = op.block._var_recursive(input_name_need_cast)
if var_name.dtype != common_dtype:
# add cast op for different dtype
add_cast_for_type_promotion(
op,
block,
idx,
var_name,
common_dtype,
)
idx += 1
idx += 1
return program
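
For illustration, a short sketch of what this pass does to a program containing a mixed-dtype elementwise_add. This is not part of the PR; it assumes a build that includes this change, and the printed op lists are the expected outcome rather than captured output.

```python
import paddle
from paddle.base.framework import process_type_promotion

paddle.enable_static()

prog = paddle.static.Program()
with paddle.static.program_guard(prog):
    a = paddle.static.data(name='a', shape=[4], dtype='float16')
    b = paddle.static.data(name='b', shape=[4], dtype='float32')
    c = paddle.add(a, b)  # elementwise_add with mismatched input dtypes

print([op.type for op in prog.global_block().ops])
# before the pass: ['elementwise_add']

prog = process_type_promotion(prog)

print([op.type for op in prog.global_block().ops])
# after the pass: ['cast', 'elementwise_add']; the float16 input is cast up
# to the common dtype (float32) right before the add consumes it.
```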
10 changes: 2 additions & 8 deletions python/paddle/base/layers/math_op_patch.py
@@ -534,19 +534,13 @@ def __impl__(self, other_var):
if lhs_dtype != rhs_dtype:
if method_name in SUPPORT_PROMOTION_OPS:
if core.need_type_promotion(lhs_dtype, rhs_dtype):
common_dtype = core.get_promote_dtype(
op_type, lhs_dtype, rhs_dtype
)
# only report a warning here; the real promotion is handled in the Executor
warnings.warn(
f"The input dtypes of OP {op_type} are {lhs_dtype} and {rhs_dtype}, the output will be auto-promoted to {common_dtype}"
f"The input dtypes of OP {op_type} are {lhs_dtype} and {rhs_dtype}, the output will be auto-promoted"
)
warnings.filterwarnings(
"ignore", message="The input dtypes of OP"
)
if rhs_dtype != common_dtype:
other_var = astype(other_var, common_dtype)
if lhs_dtype != common_dtype:
self = astype(self, common_dtype)
else:
# NOTE(zoooo0820): Currently, we still keep the old illogical \
# logic for compatibility reasons
17 changes: 16 additions & 1 deletion python/paddle/static/io.py
@@ -33,7 +33,12 @@
unique_name,
)
from paddle.base.executor import Executor, global_scope
from paddle.base.framework import Parameter, dygraph_not_support, static_only
from paddle.base.framework import (
Parameter,
dygraph_not_support,
process_type_promotion,
static_only,
)
from paddle.base.log_helper import get_logger
from paddle.framework.io_utils import (
_clone_var_in_block_,
@@ -587,6 +592,10 @@ def save_inference_model(
_check_vars('fetch_vars', fetch_vars)

program = _get_valid_program(kwargs.get('program', None))

# do type promotion
program = process_type_promotion(program)

clip_extra = kwargs.get('clip_extra', True)
program = normalize_program(
program,
@@ -903,6 +912,9 @@ def load_inference_model(path_prefix, executor, **kwargs):
# deserialize bytes to program
program = deserialize_program(program_bytes)

# do type promotion
program = process_type_promotion(program)

vars = list(filter(is_persistable, program.list_vars()))
if len(vars) > 0:
load_vars(
@@ -958,6 +970,9 @@ def load_inference_model(path_prefix, executor, **kwargs):
# deserialize bytes to program
program = deserialize_program(program_bytes)

# do type promotion
program = process_type_promotion(program)

vars = list(filter(is_persistable, program.list_vars()))
if len(vars) > 0:
load_dirname = os.path.dirname(params_path)
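To illustrate the two call sites above, a hedged end-to-end sketch of the inference save/load path. The file prefix and variable names are made up for this example, and it assumes a build that includes this change.

```python
import numpy as np
import paddle

paddle.enable_static()
exe = paddle.static.Executor(paddle.CPUPlace())

main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name='x', shape=[None, 4], dtype='float32')
    w = paddle.create_parameter(shape=[4], dtype='float64', name='w')
    out = x + w  # mixed fp32/fp64 add; the pass promotes it to fp64
exe.run(startup_prog)

# save_inference_model now runs process_type_promotion before serializing.
paddle.static.save_inference_model('/tmp/promotion_demo', [x], [out], exe)

# load_inference_model runs the pass as well, so programs saved by older
# Paddle versions (without the inserted cast) are promoted on load too.
prog, feed_names, fetch_targets = paddle.static.load_inference_model(
    '/tmp/promotion_demo', exe
)
res = exe.run(
    prog,
    feed={feed_names[0]: np.ones([2, 4], dtype='float32')},
    fetch_list=fetch_targets,
)
print(res[0].dtype)  # expected: float64
```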
85 changes: 50 additions & 35 deletions test/legacy_test/test_tensor_type_promotion.py
@@ -119,6 +119,31 @@ def test_dtype_is_expected(self):
)


class TestAPIAddInStatic(TestOperatorOverloadAddInStatic):
def run_api(self):
prog = paddle.static.Program()
with paddle.static.program_guard(prog):
self.generate_test_value()

out = paddle.add(self.l_value, self.r_value)
out_reverse = paddle.add(self.r_value, self.l_value)

res = self.exe.run(prog, fetch_list=[out, out_reverse])
return res


create_test_case(TestAPIAddInStatic, 'float16', 'float32', 'float32')
create_test_case(TestAPIAddInStatic, 'float16', 'float64', 'float64')

create_test_case(TestAPIAddInStatic, 'float32', 'float64', 'float64')


if paddle.is_compiled_with_cuda() and paddle.base.core.supports_bfloat16():
create_test_case(TestAPIAddInStatic, 'bfloat16', 'float16', 'float32')
create_test_case(TestAPIAddInStatic, 'bfloat16', 'float32', 'float32')
create_test_case(TestAPIAddInStatic, 'bfloat16', 'float64', 'float64')


class TestOperatorOverloadSubInStatic(TestOperatorOverloadAddInStatic):
def run_api(self):
prog = paddle.static.Program()
@@ -156,74 +181,64 @@ def run_api(self):
)


class TestOperatorOverloadMulInStatic(TestOperatorOverloadAddInStatic):
class TestAPISubInStatic(TestOperatorOverloadAddInStatic):
def run_api(self):
prog = paddle.static.Program()
with paddle.static.program_guard(prog):
self.generate_test_value()

out = self.l_value * self.r_value
out_reverse = self.r_value * self.l_value
out = paddle.subtract(self.l_value, self.r_value)
out_reverse = paddle.subtract(self.r_value, self.l_value)

res = self.exe.run(prog, fetch_list=[out, out_reverse])
return res


create_test_case(
TestOperatorOverloadMulInStatic, 'float16', 'float32', 'float32'
)
create_test_case(
TestOperatorOverloadMulInStatic, 'float16', 'float64', 'float64'
)
create_test_case(TestAPISubInStatic, 'float16', 'float32', 'float32')
create_test_case(TestAPISubInStatic, 'float16', 'float64', 'float64')

create_test_case(
TestOperatorOverloadMulInStatic, 'float32', 'float64', 'float64'
)
create_test_case(TestAPISubInStatic, 'float32', 'float64', 'float64')

if paddle.is_compiled_with_cuda() and paddle.base.core.supports_bfloat16():
create_test_case(
TestOperatorOverloadMulInStatic, 'bfloat16', 'float16', 'float32'
)
create_test_case(
TestOperatorOverloadMulInStatic, 'bfloat16', 'float32', 'float32'
)
create_test_case(
TestOperatorOverloadMulInStatic, 'bfloat16', 'float64', 'float64'
)

if paddle.is_compiled_with_cuda() and paddle.base.core.supports_bfloat16():
create_test_case(TestAPISubInStatic, 'bfloat16', 'float16', 'float32')
create_test_case(TestAPISubInStatic, 'bfloat16', 'float32', 'float32')
create_test_case(TestAPISubInStatic, 'bfloat16', 'float64', 'float64')

class TestOperatorOverloadGTInStatic(TestOperatorOverloadAddInStatic):
def set_dtype(self):
self.ldtype = 'float32'
self.rdtype = 'float64'
self.expected_out_dtype = 'bool'

class TestOperatorOverloadMulInStatic(TestOperatorOverloadAddInStatic):
def run_api(self):
prog = paddle.static.Program()
with paddle.static.program_guard(prog):
self.generate_test_value()

out = self.l_value > self.r_value
out_reverse = self.r_value > self.l_value
out = self.l_value * self.r_value
out_reverse = self.r_value * self.l_value

res = self.exe.run(prog, fetch_list=[out, out_reverse])
return res


create_test_case(TestOperatorOverloadGTInStatic, 'float16', 'float32', 'bool')
create_test_case(TestOperatorOverloadGTInStatic, 'float16', 'float64', 'bool')
create_test_case(
TestOperatorOverloadMulInStatic, 'float16', 'float32', 'float32'
)
create_test_case(
TestOperatorOverloadMulInStatic, 'float16', 'float64', 'float64'
)

create_test_case(TestOperatorOverloadGTInStatic, 'float32', 'float64', 'bool')
create_test_case(
TestOperatorOverloadMulInStatic, 'float32', 'float64', 'float64'
)

if paddle.is_compiled_with_cuda() and paddle.base.core.supports_bfloat16():
create_test_case(
TestOperatorOverloadGTInStatic, 'bfloat16', 'float16', 'bool'
TestOperatorOverloadMulInStatic, 'bfloat16', 'float16', 'float32'
)
create_test_case(
TestOperatorOverloadGTInStatic, 'bfloat16', 'float32', 'bool'
TestOperatorOverloadMulInStatic, 'bfloat16', 'float32', 'float32'
)
create_test_case(
TestOperatorOverloadGTInStatic, 'bfloat16', 'float64', 'bool'
TestOperatorOverloadMulInStatic, 'bfloat16', 'float64', 'float64'
)
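
create_test_case is a helper defined earlier in this test file and not shown in the diff. For readers of this excerpt, a rough sketch of what such a parameterizing helper typically looks like in Paddle's test suite; the implementation below is an assumption, not the PR's code.

```python
def create_test_case(baseclass, ldtype, rdtype, expected_out_dtype=None):
    # Subclass the given test case and pin the left/right dtypes plus the
    # dtype the promoted output is expected to have.
    class TestPromotion(baseclass):
        def set_dtype(self):
            self.ldtype = ldtype
            self.rdtype = rdtype
            self.expected_out_dtype = expected_out_dtype

    cls_name = f"{baseclass.__name__}Between{ldtype}And{rdtype}"
    TestPromotion.__name__ = cls_name
    # Register the generated class so unittest discovery picks it up.
    globals()[cls_name] = TestPromotion
```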

