[Operator] Adding CPU support for matrix multiplication #250

Closed
wants to merge 105 commits
Changes from all commits

Commits (105)
8d0aae4
now remember to backup...
BolinSNLHM Apr 25, 2023
8f8bcca
...
BolinSNLHM Apr 25, 2023
3b8f9c1
change 4x4 kernel to avx intrinsics
BolinSNLHM Apr 26, 2023
7536d30
added some type info
BolinSNLHM Apr 26, 2023
a4ef3e9
commit before changing the compilation command
BolinSNLHM Apr 26, 2023
088970c
now can compile with avx intrinsics
BolinSNLHM Apr 26, 2023
0d406a2
added 32x8 primitives for CPU
BolinSNLHM Apr 29, 2023
9916989
added O3 compiler option
BolinSNLHM Apr 29, 2023
edc5e67
...
BolinSNLHM Apr 29, 2023
9a0aa6a
added more primitives
BolinSNLHM Apr 29, 2023
db2c683
...
BolinSNLHM Apr 29, 2023
33b4451
slight modification of opt88 file
BolinSNLHM Apr 29, 2023
1ad9e7d
added 32x8 imports where necessary
BolinSNLHM Apr 29, 2023
46f1d63
modified two scratch files
BolinSNLHM Apr 29, 2023
3d69a5f
five2: quite some speedup compared to how little has been down in add…
BolinSNLHM Apr 29, 2023
1302698
..... fixed dumb error
BolinSNLHM Apr 29, 2023
f799531
..
BolinSNLHM Apr 29, 2023
b93b408
8x8 kernel: efficiency improved again
BolinSNLHM Apr 29, 2023
3bbc4bd
reordering: some improvements
BolinSNLHM Apr 30, 2023
b053dc5
reordering loop gets a slight boost
BolinSNLHM Apr 30, 2023
9ffa73f
working on packing: back up midway
BolinSNLHM Apr 30, 2023
b29d61a
commented out redundant codes
BolinSNLHM Apr 30, 2023
613e3e2
a version of packing that does not yield much benefit...
BolinSNLHM Apr 30, 2023
a1a6c5e
...
BolinSNLHM Apr 30, 2023
035dca8
fix conflicts
BolinSNLHM Apr 30, 2023
3be1845
resolved conflict
BolinSNLHM Apr 30, 2023
0372647
......
BolinSNLHM Apr 30, 2023
6c67af0
working on packing B: some bugs for now:
BolinSNLHM Apr 30, 2023
fb3ca73
still hasn't figured out packing of B... move to using pointer?
BolinSNLHM May 1, 2023
4982ddf
first version of packing works?
BolinSNLHM May 1, 2023
894ee8a
really strange behavior regarding those definitions...
BolinSNLHM May 1, 2023
47980ce
seems like there's benefit in setting MC large
BolinSNLHM May 2, 2023
578b925
seems like aligning didn't do much
BolinSNLHM May 2, 2023
d0ba954
performance still not satisfactory yet; try to handle general case fo…
BolinSNLHM May 2, 2023
01a33e3
working on general: now at least in the work-in-progress the nice siz…
BolinSNLHM May 2, 2023
9d441a6
finally support for arbitrary size...
BolinSNLHM May 3, 2023
79c1c09
...
BolinSNLHM May 3, 2023
4da2612
working on refactoring; backup
BolinSNLHM May 3, 2023
e9132ff
first version of refactoring macrokernel
BolinSNLHM May 3, 2023
4d487be
what... segfault for only one case after refactoring
BolinSNLHM May 3, 2023
d7ba0f1
finished refactoring macro-kernel
BolinSNLHM May 3, 2023
c8faf37
refactored macro-kernel
BolinSNLHM May 3, 2023
273c0fd
why is it slower after refactoring??
BolinSNLHM May 3, 2023
5fe11a3
finished refactoring out the micro-kernel
BolinSNLHM May 3, 2023
a28d300
little details
BolinSNLHM May 3, 2023
12faa70
change MC to 2048
BolinSNLHM May 3, 2023
aec8b5f
...
BolinSNLHM May 3, 2023
c950603
10x8 does not work so well
BolinSNLHM May 3, 2023
a461637
6x16 really makes a difference
BolinSNLHM May 3, 2023
4596028
Merge branch 'hidet-org:main' into main
BolinSNLHM May 4, 2023
5a5f8f9
Merge branch 'main' of github.com:BolinSNLHM/hidet into main
BolinSNLHM May 4, 2023
d25195a
start working on parallel
BolinSNLHM May 4, 2023
e416720
start workng on parallel
BolinSNLHM May 4, 2023
b9fc9d4
so far the best got...
BolinSNLHM May 4, 2023
e2b34c7
first try... need to experiment more
BolinSNLHM May 4, 2023
b74cbc6
... play with nthreads, go to paper
BolinSNLHM May 4, 2023
66cb61b
nthreads=24 currently promising
BolinSNLHM May 4, 2023
35821ff
stop playing with block sizes for now...
BolinSNLHM May 4, 2023
db3fb2a
exploring parallelizing the third loop
BolinSNLHM May 4, 2023
61a3a44
Merge branch 'hidet-org:main' into main
BolinSNLHM May 6, 2023
8243937
...
BolinSNLHM May 6, 2023
bd872ff
Merge branch 'main' of github.com:BolinSNLHM/hidet into main
BolinSNLHM May 6, 2023
58f6edc
Merge branch 'main' into bolin
BolinSNLHM May 6, 2023
99828d9
eliminate for loops
BolinSNLHM May 6, 2023
fc926f5
removed that parallelizing 3rd loop: seems like a bad idea for some r…
BolinSNLHM May 6, 2023
58b23ce
strange error; push for backup
BolinSNLHM May 8, 2023
afda10a
Merge branch 'hidet-org:main' into main
BolinSNLHM May 8, 2023
a4f2ca8
Merge branch 'hidet-org:main' into bolin
BolinSNLHM May 8, 2023
cc47d1b
finished debugging; seems like they ran slower than before?
BolinSNLHM May 9, 2023
64ac8a3
worked out the first version of the schedule template; the issue w/ o…
BolinSNLHM May 10, 2023
618b0c1
first benchmark...
BolinSNLHM May 10, 2023
4b33400
trying tvm
BolinSNLHM May 15, 2023
5b3e1e3
moving to the server
BolinSNLHM May 15, 2023
e141cf2
...
BolinSNLHM May 15, 2023
03c5ea2
some more trying files...
BolinSNLHM May 17, 2023
9340268
Merge branch 'hidet-org:main' into main
BolinSNLHM May 17, 2023
8e95190
commit before checking out to main...
BolinSNLHM May 21, 2023
bb03b68
Merge branch 'main' of github.com:BolinSNLHM/hidet into main
BolinSNLHM May 21, 2023
5b0f01f
Merge branch 'main' into bolin
BolinSNLHM May 21, 2023
61cd1c7
...
BolinSNLHM May 21, 2023
3e4a16c
working on replicating the oneDNN ref impl in hidet script
BolinSNLHM May 21, 2023
7912abd
Merge branch 'hidet-org:main' into main
BolinSNLHM May 22, 2023
18278cd
Merge branch 'hidet-org:main' into main
BolinSNLHM May 23, 2023
aa2cc45
commit b4 pulling for pointer arithmetic
BolinSNLHM May 23, 2023
ef115e5
solving merge conflict
BolinSNLHM May 23, 2023
7d73e8e
.
BolinSNLHM May 23, 2023
529c07a
..
BolinSNLHM May 23, 2023
1cb9bd6
Merge branch 'hidet-org:main' into main
BolinSNLHM May 23, 2023
6fd08a0
Merge branch 'main' of github.com:BolinSNLHM/hidet into main
BolinSNLHM May 23, 2023
435a401
.
BolinSNLHM May 23, 2023
771c15b
Merge branch 'hidet-org:main' into main
BolinSNLHM May 23, 2023
2d8f8bd
..
BolinSNLHM May 23, 2023
1be5c12
Merge branch 'main' of github.com:BolinSNLHM/hidet into main
BolinSNLHM May 23, 2023
888a285
Merge branch 'main' into bolin
BolinSNLHM May 23, 2023
0b3e45a
.
BolinSNLHM May 24, 2023
54cd1b6
changed codegen to use dynamic
BolinSNLHM May 25, 2023
4424c7d
I should try smaller blocks?
BolinSNLHM May 25, 2023
087eae1
still something wrong with packing with pointer arithmetics...
BolinSNLHM May 26, 2023
ef36d60
.
BolinSNLHM May 26, 2023
322a082
.
BolinSNLHM May 26, 2023
9b46a2d
.
BolinSNLHM May 26, 2023
7a94c2d
deleting
BolinSNLHM May 26, 2023
e3210ab
deleting
BolinSNLHM May 26, 2023
2af5bbf
cleanup
BolinSNLHM May 27, 2023
df0158f
lint
BolinSNLHM May 27, 2023
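
Taken together, the commit history above traces the usual GotoBLAS/BLIS progression for a CPU matmul: AVX micro-kernels (4x4, then 32x8, 8x8, and finally a 6x16 register tile), loop reordering, packing of A and B, cache-level blocking with MC/KC/NC, and OpenMP parallelization of an outer loop. The NumPy sketch below is illustrative only; it is not the PR's Hidet Script implementation, and the block sizes are placeholders rather than the values tuned in these commits.

import numpy as np

MC, KC, NC = 256, 256, 512   # placeholder cache-level block sizes
MR, NR = 6, 16               # micro-kernel tile (the "6x16" the commits found fastest)

def matmul_blocked(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    m, k = a.shape
    k2, n = b.shape
    assert k == k2
    c = np.zeros((m, n), dtype=a.dtype)
    for jc in range(0, n, NC):
        nc = min(NC, n - jc)
        for pc in range(0, k, KC):
            kc = min(KC, k - pc)
            b_panel = np.ascontiguousarray(b[pc:pc + kc, jc:jc + nc])       # "packing B"
            for ic in range(0, m, MC):
                mc = min(MC, m - ic)
                a_block = np.ascontiguousarray(a[ic:ic + mc, pc:pc + kc])   # "packing A"
                # macro-kernel: sweep MR x NR micro-tiles of this block of C
                for jr in range(0, nc, NR):
                    nr = min(NR, nc - jr)
                    for ir in range(0, mc, MR):
                        mr = min(MR, mc - ir)
                        # micro-kernel: in the PR this is AVX fmadd intrinsics;
                        # here it is a plain dot product for clarity
                        c[ic + ir:ic + ir + mr, jc + jr:jc + jr + nr] += (
                            a_block[ir:ir + mr, :] @ b_panel[:, jr:jr + nr]
                        )
    return c

# quick check against NumPy's reference matmul, with non-multiple sizes
a = np.random.rand(333, 257).astype(np.float32)
b = np.random.rand(257, 411).astype(np.float32)
assert np.allclose(matmul_blocked(a, b), a @ b, atol=1e-3)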
Files changed
1 change: 1 addition & 0 deletions .gitignore
@@ -204,3 +204,4 @@ build-release

# intermediate files
/gallery/**/*.json
/python/opt9.py
8 changes: 6 additions & 2 deletions python/hidet/backend/build.py
@@ -121,13 +121,13 @@ def compile(self, src_path: str, out_lib_path: str, options: Optional[Dict[str,
# optimize host side code via -O3
'-O3',
# enable openmp support for cpu kernels
'-Xcompiler -fopenmp',
'-Xcompiler -fopenmp,-fPIC,-m64,-mavx2,-march=native,-O3,-funroll-loops,-ffast-math',
# the target PTX and SASS version.
'-gencode arch=compute_{cc},code=sm_{cc}'.format(cc=cc_code),
# allow ptxas (PTX assembler) to output information like register/smem usage.
'--ptxas-options=-v',
# compile into position independent code.
'--compiler-options -fPIC',
# '--compiler-options -fPIC,-m64,-mavx2,-march=native, -O3',
# embed the line information into the binary, allow Nsight Compute to get the source code for profiling.
'-lineinfo',
# ftz=true and prec-div=false for fast math
@@ -184,6 +184,10 @@ def compile(self, src_path: str, out_lib_path: str, options: Optional[Dict[str,
*['-L{}'.format(library_dir) for library_dir in self.library_dirs],
# apply -O3 optimization.
'-O3',
# support avx intrinsics
'-mavx2',
'-m64',
'-march=native',
# compile into position independent code.
'-fPIC',
# enable OpenMP.
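
The host-compiler flags added here ('-mavx2', '-m64', '-march=native') assume the build machine actually supports AVX2. A minimal, hypothetical guard (Linux-only, and not part of this PR) could gate the flags on the CPU feature list:

def host_has_avx2() -> bool:
    # Hypothetical helper, not in the PR: report whether the build host advertises AVX2.
    try:
        with open('/proc/cpuinfo') as f:
            return 'avx2' in f.read()
    except OSError:
        return False

avx_flags = ['-mavx2', '-m64', '-march=native'] if host_has_avx2() else []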
68 changes: 67 additions & 1 deletion python/hidet/backend/codegen.py
@@ -441,7 +441,9 @@ def visit_ForStmt(self, stmt: ForStmt):
doc += NewLine() + '#pragma unroll'
elif stmt.attr.parallel:
if stmt.attr.parallel_threads:
doc += NewLine() + '#pragma omp parallel for num_threads({})'.format(stmt.attr.parallel_threads)
doc += NewLine() + '#pragma omp parallel for schedule(dynamic) num_threads({})'.format(
stmt.attr.parallel_threads
)
else:
doc += NewLine() + '#pragma omp parallel for'
doc += NewLine() + Text('for (') + init_doc + '; ' + cond_doc + '; ' + update_doc + ') '
@@ -555,6 +557,8 @@ def visit_DataType(self, t: DataType):
'tfloat32': 'tfloat32_t',
'complex64': 'complex64_t',
'complex128': 'complex128_t',
'float32x4': '__m128',
'float32x8': '__m256',
}
return Text(scalar_type_map[t.name])

@@ -613,6 +617,8 @@ def require_headers(self) -> Doc:
doc += Text('#include <hidet/runtime/cuda/complex.h>') + NewLine()
doc += Text('#include <hidet/runtime/cuda/context.h>') + NewLine()

doc += Text('#include <immintrin.h>') + NewLine()

# nvcc use float to 'store' tfloat32 data
doc += Text('typedef float tfloat32_t;') + NewLine()
doc += Text('typedef __nv_bfloat16 bfloat16_t;') + NewLine()
@@ -684,9 +690,69 @@ def require_headers(self) -> Doc:
doc += Text('#include <hidet/runtime/cpu/float16.h>') + NewLine()
doc += Text('#include <hidet/runtime/cpu/bfloat16.h>') + NewLine()
doc += Text('#include <hidet/runtime/cpu/complex.h>') + NewLine()
doc += Text('#include <immintrin.h>')
doc += NewLine()
return doc

def visit_ScalarType(self, t: DataType):
# float16, bfloat16 and tfloat32 are not supported on CPU yet
# https://moocaholic.medium.com/fp64-fp32-fp16-bfloat16-tf32-and-other-members-of-the-zoo-a1ca7897d407
scalar_type_map = {
'bool': 'bool',
'uint8': 'uint8_t',
'uint16': 'uint16_t',
'uint32': 'uint32_t',
'uint64': 'uint64_t',
'int8': 'int8_t',
'int16': 'int16_t',
'int32': 'int32_t',
'int64': 'int64_t',
'float16': 'half',
'float32': 'float',
'float64': 'double',
'bfloat16': 'bfloat16_t',
'tfloat32': 'float',
'float32x4': '__m128',
'float32x8': '__m256',
}
return Text(scalar_type_map[t.name])

def visit_IRModule(self, module: IRModule) -> Doc:
self.ir_module = module
doc = Doc()
# todo: only add necessary headers
doc += Text('#include <stdint.h>') + NewLine()
doc += Text('#include <hidet/runtime/cpu_context.h>') + NewLine()
doc += Text('#include <math.h>') + NewLine()
# float16 and bfloat16 emulation
doc += Text('#include <hidet/cpu/float16.h>') + NewLine()
doc += Text('#include <hidet/cpu/bfloat16.h>') + NewLine()

# Headers for avx intrinsics
doc += Text('#include <immintrin.h>') + NewLine()

if module.task is not None:
doc += '/*' + NewLine()
doc += str(module.task) + NewLine()
doc += '*/' + NewLine()

doc += Text('extern "C" {') + NewLine()

# add namespace to activate data type and function
doc += Text('using float16::Half;') + NewLine()
doc += Text('using bfloat16::BFloat16;') + NewLine()

# use typedef to map half and bfloat16 type
doc += Text('typedef Half half;') + NewLine()
doc += Text('typedef BFloat16 bfloat16_t;') + NewLine()

call_graph = CallGraph(module)
for node in call_graph.reversed_order:
doc += self(node.func) + NewLine()

doc += NewLine() + '}'
return doc

def visit_Function(self, func: Function) -> Doc:
self.namer.clear()

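For reference, a trivial formatting check of the pragma string that the parallel-for branch earlier in this file's diff builds when a loop carries an explicit thread count; 24 matches the nthreads value experimented with in the commit history, and this is not output captured from the codegen itself.

parallel_threads = 24
pragma = '#pragma omp parallel for schedule(dynamic) num_threads({})'.format(parallel_threads)
assert pragma == '#pragma omp parallel for schedule(dynamic) num_threads(24)'
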
2 changes: 1 addition & 1 deletion python/hidet/ffi/runtime_api.py
@@ -21,7 +21,7 @@ class RuntimeAPI:
_register_callback = get_func('register_callback', [c_char_p, c_void_p], None)
_allocate_cuda_storage = get_func('allocate_cuda_storage', [c_uint64], c_uint64)
_free_cuda_storage = get_func('free_cuda_storage', [c_uint64], None)
_reset_symbol_table = get_func('reset_symbol_table', [], None)
# _reset_symbol_table = get_func('reset_symbol_table', [], None)
_get_symbol_value = get_func('get_symbol_value', [c_char_p], c_int32)
_set_symbol_value = get_func('set_symbol_value', [c_char_p, c_int32], None)

3 changes: 3 additions & 0 deletions python/hidet/graph/ops/__init__.py
@@ -46,6 +46,9 @@
from .definitions.fusion import fused_operator
from .definitions.special import barrier

from .definitions.matmul import matmul_x86
from .definitions.matmul import matmul_x86_onednn

from .definitions import utils

from . import schedules
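
With these re-exports, the new CPU operator becomes reachable as hidet.ops.matmul_x86. The sketch below is a hedged usage example, not taken from the PR's tests; it assumes hidet's public hidet.randn API for creating CPU tensors.

import hidet

a = hidet.randn([1024, 1024], dtype='float32', device='cpu')
b = hidet.randn([1024, 1024], dtype='float32', device='cpu')
c = hidet.ops.matmul_x86(a, b)   # CPU matmul operator exposed by this PR
print(c.shape)                   # expected: [1024, 1024]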
2 changes: 1 addition & 1 deletion python/hidet/graph/ops/definitions/__init__.py
@@ -34,7 +34,7 @@
from .conv3d_transpose import conv3d_transpose
from .matmul import batch_matmul, matmul

from .matmul import BatchMatmulOp, MatmulOp
from .matmul import BatchMatmulOp, MatmulOp, Matmulx86Op
from .conv2d import Conv2dOp
from .arithmetic import ErfOp, PowOp, AddOp, SubtractOp, MultiplyOp, DivideOp, WhereOp
from .compare import EqualOp
6 changes: 6 additions & 0 deletions python/hidet/graph/ops/definitions/matmul/__init__.py
@@ -12,3 +12,9 @@
from .matmul import matmul, MatmulOp, MatmulTask
from .batch_matmul import batch_matmul, BatchMatmulOp, BatchMatmulTask
from . import resolve

from .matmul_f32_x86 import matmul_x86
from .matmul_f32_x86_v2 import matmul_x86_onednn

from .matmul_f32_x86 import MatmulF32Taskx86, Matmulx86Op
from .matmul_f32_x86_v2 import MatmulF32Taskx86OneDNN, MatmulX86OneDNNOp
411 changes: 411 additions & 0 deletions python/hidet/graph/ops/definitions/matmul/matmul_f32_x86.py

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions python/hidet/ir/dtypes/__init__.py
@@ -15,9 +15,9 @@
from .floats import float16, float32, float64, bfloat16, tfloat32
from .floats import f16, f32, f64, bf16, tf32
from .boolean import boolean
from .vector import float16x2, float32x4, float32x8
from .complex import complex64, complex128
from .vector import float16x2, float32x4
from .vector import f16x2, f32x4
from .vector import f16x2, f32x4, f32x8
from .promotion import promote_type
from .utils import dtype_to_numpy, finfo, iinfo

@@ -39,6 +39,7 @@
'complex64': complex64,
'complex128': complex128,
'float32x4': float32x4,
'float32x8': float32x8,
'float16x2': float16x2,
}

@@ -60,6 +61,7 @@
'c64': complex64,
'c128': complex128,
'f32x4': f32x4,
'f32x8': f32x8,
'f16x2': f16x2,
}

2 changes: 2 additions & 0 deletions python/hidet/ir/dtypes/vector.py
@@ -72,7 +72,9 @@ def max_value(self):


float32x4 = VectorType(float32, 4)
float32x8 = VectorType(float32, 8)
float16x2 = VectorType(float16, 2)

f32x4 = float32x4
f32x8 = float32x8
f16x2 = float16x2
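
A small sanity check of the new dtype registration, assuming the name and alias tables shown in the dtypes/__init__.py hunk above are in effect:

from hidet.ir.dtypes import float32x8, f32x8

assert f32x8 is float32x8              # short alias points at the same dtype object
assert float32x8.name == 'float32x8'   # the name codegen.py maps to __m256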
2 changes: 2 additions & 0 deletions python/hidet/ir/primitives/__init__.py
@@ -24,6 +24,8 @@

# cpu primitive functions
from . import cpu
from .cpu import avx_f32x4_store, avx_f32x4_broadcast, avx_f32x4_fmadd, avx_f32x4_load, avx_f32x4_setzero
from .cpu import avx_free, avx_malloc

# cuda primitive functions and variables
from . import cuda
4 changes: 4 additions & 0 deletions python/hidet/ir/primitives/cpu/__init__.py
@@ -10,3 +10,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from . import math

from .avx import avx_f32x4_broadcast, avx_f32x4_fmadd, avx_f32x4_load, avx_f32x4_store, avx_f32x4_setzero
from .avx import avx_f32x8_broadcast, avx_f32x8_fmadd, avx_f32x8_load, avx_f32x8_store, avx_f32x8_setzero
from .avx import avx_free, avx_malloc, x86_memcpy, x86_memset, aligned_alloc
104 changes: 104 additions & 0 deletions python/hidet/ir/primitives/cpu/avx.py
@@ -0,0 +1,104 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Union

from hidet.ir.expr import Expr, Call
from hidet.ir.type import FuncType, VoidType, PointerType
from hidet.ir.primitives.func import register_primitive_function
from hidet.utils import initialize
from hidet.ir.primitives.func import call_primitive_func


@initialize()
def register_primitive_functions():
functions = [
('avx_x86_float32x4_broadcast', '_mm_broadcast_ss', FuncType([PointerType('float32')], 'float32x4')),
('avx_x86_float32x4_fmadd', '_mm_fmadd_ps', FuncType(['float32x4', 'float32x4', 'float32x4'], 'float32x4')),
('avx_x86_float32x4_load', '_mm_loadu_ps', FuncType([PointerType('float32')], 'float32x4')),
('avx_x86_float32x4_store', '_mm_storeu_ps', FuncType([PointerType('float32'), 'float32x4'], VoidType())),
('avx_x86_float32x4_setzero', '_mm_setzero_ps', FuncType([], 'float32x4')),
('avx_x86_float32x8_broadcast', '_mm256_broadcast_ss', FuncType([PointerType('float32')], 'float32x8')),
('avx_x86_float32x8_fmadd', '_mm256_fmadd_ps', FuncType(['float32x8', 'float32x8', 'float32x8'], 'float32x8')),
('avx_x86_float32x8_load', '_mm256_loadu_ps', FuncType([PointerType('float32')], 'float32x8')),
('avx_x86_float32x8_store', '_mm256_storeu_ps', FuncType([PointerType('float32'), 'float32x8'], VoidType())),
('avx_x86_float32x8_setzero', '_mm256_setzero_ps', FuncType([], 'float32x8')),
('avx_x86_malloc', '_mm_malloc', FuncType(['uint64', 'uint64'], PointerType(VoidType()))),
('avx_x86_free', '_mm_free', FuncType([PointerType(VoidType())], VoidType())),
('x86_memset', 'memset', FuncType([PointerType(VoidType()), 'int32', 'uint64'], PointerType(VoidType()))),
(
'x86_memcpy',
'memcpy',
FuncType([PointerType(VoidType()), PointerType(VoidType()), 'uint64'], PointerType(VoidType())),
),
]
for name, codegen_name, func_type in functions:
register_primitive_function(name=name, func_or_type=func_type, codegen_name=codegen_name)


def aligned_alloc(alignment: Union[int, Expr], size: Union[int, Expr]):
return call_primitive_func('aligned_alloc', [alignment, size])


def x86_memcpy(dst: Expr, src: Expr, num: Union[Expr, int]) -> Call:
return call_primitive_func('x86_memcpy', [dst, src, num])


def x86_memset(dst: Expr, val: Union[int, Expr], num: Union[Expr, int]) -> Call:
return call_primitive_func('x86_memset', [dst, val, num])


def avx_malloc(size: Union[Expr, int], align: Union[Expr, int]) -> Call:
return call_primitive_func('avx_x86_malloc', [size, align])


def avx_free(p: Expr) -> Call:
return call_primitive_func('avx_x86_free', [p])


def avx_f32x4_setzero() -> Call:
return call_primitive_func('avx_x86_float32x4_setzero', [])


def avx_f32x8_setzero() -> Call:
return call_primitive_func('avx_x86_float32x8_setzero', [])


def avx_f32x4_broadcast(addr: Expr) -> Call:
return call_primitive_func('avx_x86_float32x4_broadcast', [addr])


def avx_f32x8_broadcast(addr: Expr) -> Call:
return call_primitive_func('avx_x86_float32x8_broadcast', [addr])


def avx_f32x4_fmadd(a: Expr, b: Expr, c: Expr) -> Call:
return call_primitive_func('avx_x86_float32x4_fmadd', [a, b, c])


def avx_f32x8_fmadd(a: Expr, b: Expr, c: Expr) -> Call:
return call_primitive_func('avx_x86_float32x8_fmadd', [a, b, c])


def avx_f32x4_load(addr: Expr) -> Call:
return call_primitive_func('avx_x86_float32x4_load', [addr])


def avx_f32x8_load(addr: Expr) -> Call:
return call_primitive_func('avx_x86_float32x8_load', [addr])


def avx_f32x4_store(addr: Expr, src: Expr) -> Call:
return call_primitive_func('avx_x86_float32x4_store', [addr, src])


def avx_f32x8_store(addr: Expr, src: Expr) -> Call:
return call_primitive_func('avx_x86_float32x8_store', [addr, src])
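
As a hedged illustration of how these wrappers compose, the sketch below expresses one 8-wide fused multiply-add step. The names a_ptr, b_ptr and c_ptr are hypothetical float32 pointer expressions that would exist inside a Hidet script body; the function only builds IR call expressions and is not the micro-kernel from this PR.

from hidet.ir.primitives.cpu import (
    avx_f32x8_broadcast, avx_f32x8_fmadd, avx_f32x8_load, avx_f32x8_store,
)

def fma_lane(a_ptr, b_ptr, c_ptr):
    acc = avx_f32x8_load(c_ptr)        # load 8 accumulators from a row of C
    a = avx_f32x8_broadcast(a_ptr)     # broadcast one element of A to all 8 lanes
    b = avx_f32x8_load(b_ptr)          # load 8 contiguous elements of B
    acc = avx_f32x8_fmadd(a, b, acc)   # acc = a * b + acc
    avx_f32x8_store(c_ptr, acc)        # write the updated accumulators back to C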
3 changes: 3 additions & 0 deletions python/hidet/lang/__init__.py
@@ -38,6 +38,9 @@
spatial = row_spatial
repeat = row_repeat

ConstExpr = Union[Expr, int]


# def var_of_function(func: Function) -> Var:
# # pylint: disable=import-outside-toplevel
# from hidet.lang.script import ScriptModuleContext
9 changes: 9 additions & 0 deletions python/hidet/lang/avx.py
@@ -0,0 +1,9 @@
from typing import Union, Optional, Sequence
from hidet.ir.type import DataType, tensor_type
from hidet.ir.expr import Expr
from hidet.ir.stmt import DeclareScope
from hidet.ir.layout import DataLayout

from hidet.ir.primitives.cpu import avx_f32x4_broadcast, avx_f32x4_fmadd, avx_f32x4_load, avx_f32x4_store, avx_f32x4_setzero
from hidet.ir.primitives.cpu import avx_f32x8_broadcast, avx_f32x8_fmadd, avx_f32x8_load, avx_f32x8_store, avx_f32x8_setzero
from hidet.ir.primitives.cpu import avx_free, avx_malloc, x86_memcpy, x86_memset, aligned_alloc