From 67a27443f5ae8c469b8304c76f2a922e316b8703 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Tue, 22 Apr 2025 03:38:55 -0700 Subject: [PATCH 01/28] Add python packaging files --- pyproject.toml | 31 ++++++++++++++++++ setup.py | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 pyproject.toml create mode 100644 setup.py diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000000..46b76e28f22c --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,31 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel", "numba>=0.57, <0.58", "cmake>=3.20"] +build-backend = "setuptools.build_meta" + +[project] +name = "pyomp" +version = "0.2.0" +description = "Python OpenMP library based on Numba" +readme = "README.md" +requires-python = ">=3.8, <=3.12" +license = { text = "BSD 2-Clause License" } +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Topic :: Software Development :: Compilers", +] +dependencies = ["numba>=0.57, <0.58", "lark", "cffi"] + +[project.urls] +Homepage = "https://github.com/Python-for-HPC/PyOMP" +Issues = "https://github.com/Python-for-HPC/PyOMP/issues" + +[tool.setuptools.packages.find] +where = ["."] +include = ["numba.openmp"] + +[tool.setuptools.package-data] +"numba.openmp" = ["libs/*"] diff --git a/setup.py b/setup.py new file mode 100644 index 000000000000..487343fcc112 --- /dev/null +++ b/setup.py @@ -0,0 +1,85 @@ +# setup.py +import os +import numba +import sysconfig +import numpy as np +import subprocess +from pathlib import Path +from setuptools import setup, Extension +from setuptools.command.build_ext import build_ext +from setuptools.command.build_clib import build_clib + + +numba_dir = os.path.dirname(numba.__file__) +bundle_lib = ( + "bundle", + { + "sources": [ + "numba/openmp/nrt/init.c", + f"{numba_dir}/_helpermod.c", + f"{numba_dir}/cext/utils.c", + f"{numba_dir}/cext/dictobject.c", + f"{numba_dir}/cext/listobject.c", + f"{numba_dir}/core/runtime/_nrt_pythonmod.c", + f"{numba_dir}/core/runtime/nrt.cpp", + ], + "include_dirs": [ + sysconfig.get_paths()["include"], + np.get_include(), + ], + }, +) + + +class BuildStaticBundle(build_clib): + def finalize_options(self): + super().finalize_options() + self.build_temp = (Path("numba/openmp/nrt") / self.build_temp).absolute() + self.build_temp.mkdir(parents=True, exist_ok=True) + self.build_temp = str(self.build_temp) + self.build_clib = str(Path("numba/openmp/libs").absolute()) + + +class CMakeExtension(Extension): + def __init__(self, name, sourcedir): + # don't invoke the original build_ext for this special extension + super().__init__(name, sources=[]) + self.sourcedir = sourcedir + + +class BuildPass(build_ext): + def run(self): + for ext in self.extensions: + if isinstance(ext, CMakeExtension): + self.build_cmake(ext) + return + super().run() + + def build_cmake(self, ext): + build_dir = (Path(ext.sourcedir) / self.build_temp).absolute() + subprocess.run( + [ + "cmake", + "-S", + ext.sourcedir, + "-B", + build_dir, + "--install-prefix", + Path("numba/openmp/libs").absolute(), + "-DCMAKE_BUILD_TYPE=Release", + ], + check=True, + ) + + subprocess.run(["cmake", "--build", build_dir, "-j"], check=True) + subprocess.run( + ["cmake", "--install", build_dir], + check=True, + ) + + +setup( + libraries=[bundle_lib], + ext_modules=[CMakeExtension("libIntrinsicsOpenMP", 
"numba/openmp/pass")], + cmdclass={"build_clib": BuildStaticBundle, "build_ext": BuildPass}, +) From 45e1f358eee7068ff99396411e344c23c11271e8 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Tue, 22 Apr 2025 03:40:57 -0700 Subject: [PATCH 02/28] Add pyomp numba extension files --- numba/openmp/__init__.py | 7778 +++++++++++++++++++++ numba/openmp/nrt/init.c | 3 + numba/openmp/pass/CGIntrinsicsOpenMP.cpp | 3102 ++++++++ numba/openmp/pass/CGIntrinsicsOpenMP.h | 527 ++ numba/openmp/pass/CMakeLists.txt | 40 + numba/openmp/pass/DebugOpenMP.cpp | 16 + numba/openmp/pass/DebugOpenMP.h | 28 + numba/openmp/pass/IntrinsicsOpenMP.cpp | 732 ++ numba/openmp/pass/IntrinsicsOpenMP.h | 14 + numba/openmp/pass/IntrinsicsOpenMP_CAPI.h | 23 + numba/openmp/tests/test_openmp.py | 4959 +++++++++++++ 11 files changed, 17222 insertions(+) create mode 100644 numba/openmp/__init__.py create mode 100644 numba/openmp/nrt/init.c create mode 100644 numba/openmp/pass/CGIntrinsicsOpenMP.cpp create mode 100644 numba/openmp/pass/CGIntrinsicsOpenMP.h create mode 100644 numba/openmp/pass/CMakeLists.txt create mode 100644 numba/openmp/pass/DebugOpenMP.cpp create mode 100644 numba/openmp/pass/DebugOpenMP.h create mode 100644 numba/openmp/pass/IntrinsicsOpenMP.cpp create mode 100644 numba/openmp/pass/IntrinsicsOpenMP.h create mode 100644 numba/openmp/pass/IntrinsicsOpenMP_CAPI.h create mode 100644 numba/openmp/tests/test_openmp.py diff --git a/numba/openmp/__init__.py b/numba/openmp/__init__.py new file mode 100644 index 000000000000..52e31d4a45c4 --- /dev/null +++ b/numba/openmp/__init__.py @@ -0,0 +1,7778 @@ +import warnings +from numba.core.withcontexts import WithContext, _CallContextType +from lark import Lark, Transformer +from lark.exceptions import VisitError +from numba.core.ir_utils import ( + get_call_table, + dump_blocks, + dprint_func_ir, + replace_vars, + apply_copy_propagate_extensions, + visit_vars_extensions, + remove_dels, + visit_vars_inner, + visit_vars, + get_name_var_table, + replace_var_names, + get_definition, + build_definitions, + dead_code_elimination, + mk_unique_var, + find_topo_order, + flatten_labels, +) +from numba.core.analysis import ( + compute_cfg_from_blocks, + compute_use_defs, + compute_live_map, + _fix_loop_exit, +) +from numba.core import ( + ir, + config, + types, + typeinfer, + cgutils, + compiler, + transforms, + bytecode, + typed_passes, + imputils, + typing, + cpu, + compiler_machinery, +) +from numba.core.compiler_machinery import PassManager +from numba.core.compiler import DefaultPassBuilder +from numba.core.untyped_passes import ( + TranslateByteCode, + FixupArgs, + IRProcessing, + InlineClosureLikes, + RewriteSemanticConstants, + DeadBranchPrune, + GenericRewrites, + RewriteDynamicRaises, + MakeFunctionToJitFunction, + InlineInlinables, + FindLiterallyCalls, + LiteralUnroll, + LiteralPropagationSubPipelinePass, + WithLifting, +) +from numba import np as numba_np +from numba import cuda as numba_cuda +from numba.core.controlflow import CFGraph +from numba.core.ssa import _run_ssa +from numba.extending import overload, intrinsic +from numba.core.callconv import ( + BaseCallConv, + MinimalCallConv, + errcode_t, + RETCODE_OK, + Status, + excinfo_t, + CPUCallConv, +) +from functools import cached_property +from numba.core.datamodel.registry import register_default as model_register +from numba.core.datamodel.registry import default_manager as model_manager +from numba.core.datamodel.models import OpaqueModel +from numba.core.types.functions import Dispatcher, ExternalFunction 
+from numba.core.dispatcher import _FunctionCompiler +from numba.np.ufunc import array_exprs +from cffi import FFI +import llvmlite.binding as ll +import llvmlite.ir as lir +import operator +import sys +import copy +import os +import numpy as np +from numba.core.analysis import ir_extension_usedefs, _use_defs_result +from numba.core.lowering import Lower +from numba.core.codegen import AOTCodeLibrary, JITCodeLibrary +from numba.cuda import descriptor as cuda_descriptor, compiler as cuda_compiler +from numba.cuda.target import CUDACallConv +import subprocess +import tempfile +import types as python_types +import numba +from pathlib import Path + +llvm_binpath = None +llvm_libpath = None +libpath = Path(__file__).parent / "libs" + + +###### START OF NUMBA EXTENSIONS ###### + + +### ir_utils.py +def dump_block(label, block): + print(label, ":") + for stmt in block.body: + print(" ", stmt) + + +### + + +### analysis.py +def filter_nested_loops(cfg, loops): + blocks_in_loop = set() + # get loop bodies + for loop in loops.values(): + insiders = set(loop.body) | set(loop.entries) | set(loop.exits) + insiders.discard(loop.header) + blocks_in_loop |= insiders + # find loop that is not part of other loops + for loop in loops.values(): + if loop.header not in blocks_in_loop: + yield _fix_loop_exit(cfg, loop) + + +### + + +### config.py +def safe_readenv(name, ctor, default): + value = os.environ.get(name, default) + try: + return ctor(value) + except Exception: + warnings.warn( + "environ %s defined but failed to parse '%s'" % (name, value), + RuntimeWarning, + ) + return default + + +DEBUG_OPENMP = safe_readenv("NUMBA_DEBUG_OPENMP", int, 0) +if DEBUG_OPENMP > 0 and config.DEBUG_ARRAY_OPT == 0: + config.DEBUG_ARRAY_OPT = 1 +DEBUG_OPENMP_LLVM_PASS = safe_readenv("NUMBA_DEBUG_OPENMP_LLVM_PASS", int, 0) +OPENMP_DISABLED = safe_readenv("NUMBA_OPENMP_DISABLED", int, 0) +OPENMP_DEVICE_TOOLCHAIN = safe_readenv("NUMBA_OPENMP_DEVICE_TOOLCHAIN", int, 0) +### + + +class LowerNoSROA(Lower): + @property + def _disable_sroa_like_opt(self): + # Always return True for this instance + return True + + def lower_assign_inst(self, orig, inst): + # This fixes assignments for Arg instructions when the target is a + # CPointer. It sets the backing storage to the pointer of the argument + # itself. + if isinstance(self.context, OpenmpCPUTargetContext) or isinstance( + self.context, OpenmpCUDATargetContext + ): + value = inst.value + if isinstance(value, ir.Arg): + argname = value.name + argty = self.typeof("arg." + argname) + if isinstance(argty, types.CPointer): + llty = self.context.get_value_type(argty) + ptr = lir.values.Argument(self.module, llty, "arg." + argname) + self.varmap[value.name] = ptr + return + + return orig(self, inst) + + def lower_return_inst(self, orig, inst): + if isinstance(self.context, OpenmpCUDATargetContext): + # This fixes Return instructions for CUDA device functions in an + # OpenMP target region. It avoids setting a value to the return + # value pointer argument, which otherwise breaks OpenMP code + # generation (looks like an upstream miscompilation) by DCE any + # memory effects (e.g., to other pointer arguments from a tofrom + # mapping.) 
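+            # Only the outlined device entry function (device_func_name) gets
+            # this special-cased return; any other function lowered with this
+            # context falls through to the default return handling below.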
+ if self.fndesc.qualname == self.context.device_func_name: + self.call_conv._return_errcode_raw(self.builder, RETCODE_OK) + return + return orig(self, inst) + + +def run_intrinsics_openmp_pass(ll_module): + libpass = ( + libpath / f"libIntrinsicsOpenMP.{'dylib' if sys.platform == 'darwin' else 'so'}" + ) + + try: + r = subprocess.run( + [ + llvm_binpath + "/opt", + "-f", + f"-load-pass-plugin={libpass}", + "-passes=intrinsics-openmp", + ], + input=ll_module.as_bitcode(), + check=True, + capture_output=True, + ) + except subprocess.CalledProcessError as e: + print("Error running LLVM pass:", e, file=sys.stderr) + print("Command:", e.cmd, file=sys.stderr) + print("Return code:", e.returncode, file=sys.stderr) + print("Output:", e.output.decode(), file=sys.stderr) + print("Error output:", e.stderr.decode(), file=sys.stderr) + raise + + if DEBUG_OPENMP_LLVM_PASS >= 1: + print(r.stderr.decode(), file=sys.stderr) + + bitcode_output = r.stdout + lowered_module = ll.parse_bitcode(bitcode_output) + + return lowered_module + + +class CustomCPUCodeLibrary(JITCodeLibrary): + def add_llvm_module(self, ll_module): + lowered_module = run_intrinsics_openmp_pass(ll_module) + super().add_llvm_module(lowered_module) + + def _finalize_specific(self): + super()._finalize_specific() + ll.ExecutionEngine.run_static_constructors(self._codegen._engine._ee) + + +class CustomAOTCPUCodeLibrary(AOTCodeLibrary): + def add_llvm_module(self, ll_module): + lowered_module = run_intrinsics_openmp_pass(ll_module) + super().add_llvm_module(lowered_module) + + +class CustomFunctionCompiler(_FunctionCompiler): + def _customize_flags(self, flags): + # We need to disable SSA form for OpenMP analysis to detect variables + # used within regions. + flags.enable_ssa = False + return flags + + +class CustomCompiler(compiler.CompilerBase): + @staticmethod + def custom_untyped_pipeline(state, name="untyped-openmp"): + """Returns an untyped part of the nopython OpenMP pipeline""" + pm = PassManager(name) + if state.func_ir is None: + pm.add_pass(TranslateByteCode, "analyzing bytecode") + pm.add_pass(FixupArgs, "fix up args") + pm.add_pass(IRProcessing, "processing IR") + + # inline closures early in case they are using nonlocal's + # see issue #6585. + pm.add_pass(InlineClosureLikes, "inline calls to locally defined closures") + + # pre typing + if not state.flags.no_rewrites: + pm.add_pass(RewriteSemanticConstants, "rewrite semantic constants") + pm.add_pass(DeadBranchPrune, "dead branch pruning") + pm.add_pass(GenericRewrites, "nopython rewrites") + + pm.add_pass(RewriteDynamicRaises, "rewrite dynamic raises") + + # convert any remaining closures into functions + pm.add_pass( + MakeFunctionToJitFunction, "convert make_function into JIT functions" + ) + # inline functions that have been determined as inlinable and rerun + # branch pruning, this needs to be run after closures are inlined as + # the IR repr of a closure masks call sites if an inlinable is called + # inside a closure + pm.add_pass(InlineInlinables, "inline inlinable functions") + if not state.flags.no_rewrites: + pm.add_pass(DeadBranchPrune, "dead branch pruning") + + pm.add_pass(FindLiterallyCalls, "find literally calls") + pm.add_pass(LiteralUnroll, "handles literal_unroll") + + if state.flags.enable_ssa: + assert False, "SSA form is not supported in OpenMP" + + pm.add_pass(LiteralPropagationSubPipelinePass, "Literal propagation") + # Run WithLifting late to for make_implicit_explicit to work. 
TODO: We + # should create a pass that does this instead of replicating and hacking + # the untyped pipeline. This handling may also negatively affect + # optimizations. + pm.add_pass(WithLifting, "Handle with contexts") + + pm.finalize() + return pm + + def define_pipelines(self): + # compose pipeline from untyped, typed and lowering parts + dpb = DefaultPassBuilder + pm = PassManager("omp") + untyped_passes = self.custom_untyped_pipeline(self.state) + pm.passes.extend(untyped_passes.passes) + + typed_passes = dpb.define_typed_pipeline(self.state) + pm.passes.extend(typed_passes.passes) + + lowering_passes = dpb.define_nopython_lowering_pipeline(self.state) + pm.passes.extend(lowering_passes.passes) + + pm.finalize() + return [pm] + + +class CustomContext(cpu.CPUContext): + def post_lowering(self, mod, library): + if hasattr(library, "openmp") and library.openmp: + post_lowering_openmp(mod) + super().post_lowering(mod, library) + + +### decorators + + +def jit(*args, **kws): + """ + Equivalent to jit(nopython=True, nogil=True) + """ + if "nopython" in kws: + warnings.warn("nopython is set for njit and is ignored", RuntimeWarning) + if "forceobj" in kws: + warnings.warn("forceobj is set for njit and is ignored", RuntimeWarning) + del kws["forceobj"] + kws.update({"nopython": True, "nogil": True}) + dispatcher = numba.jit(*args, **kws) + dispatcher._compiler.__class__ = CustomFunctionCompiler + dispatcher._compiler.pipeline_class = CustomCompiler + return dispatcher + + +def njit(*args, **kws): + return jit(*args, **kws) + + +class OpenmpCUDATargetContext(cuda_descriptor.CUDATargetContext): + def __init__(self, name, typingctx, target="cuda"): + super().__init__(typingctx, target) + self.device_func_name = name + + def post_lowering(self, mod, library): + if hasattr(library, "openmp") and library.openmp: + post_lowering_openmp(mod) + super().post_lowering(mod, library) + + @cached_property + def call_conv(self): + return CUDACallConv(self) + + +class OpenmpCPUTargetContext(CustomContext): + def __init__(self, name, typingctx, target="cpu"): + super().__init__(typingctx, target) + self.device_func_name = name + + +##### END OF NUMBA EXTENSIONS ###### + + +###### START OF LLVMLITE EXTENSIONS ###### +def get_decl(alloca): + if not isinstance(alloca, lir.instructions.AllocaInstr): + raise TypeError("Expected AllocaInstr, got %s" % type(alloca)) + return '{0} %"{1}"'.format(alloca.type, alloca._get_name()) + + +# TODO: Upstream to llvmlite, it's part of the langref. +class TokenType(lir.Type): + """ + The type for tokens. From the LLVM Language Reference. + + 'The token type is used when a value is associated with an + instruction but all uses of the value must not attempt to + introspect or obscure it. As such, it is not appropriate + to have a phi or select of type token.' + """ + + def _to_string(self): + return "token" + + def __eq__(self, other): + return isinstance(other, TokenType) + + def __hash__(self): + return hash(TokenType) + + +class CallInstrWithOperandBundle(lir.instructions.CallInstr): + def set_tags(self, tags): + self.tags = tags + + # TODO: This is ugly duplication, we should upstream to llvmlite. 
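+    # self.tags holds the OpenMP clause string produced by
+    # openmp_tag_list_to_str() further down, e.g.
+    # '[ "DIR.OMP.PARALLEL"(), "QUAL.OMP.PRIVATE"(i64* %"x") ]'; descr() below
+    # appends it right after the attribute list so the printed call carries
+    # the clauses in operand-bundle-like syntax.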
+ def descr(self, buf, add_metadata=True): + def descr_arg(i, a): + if i in self.arg_attributes: + attrs = " ".join(self.arg_attributes[i]._to_list()) + " " + else: + attrs = "" + return "{0} {1}{2}".format(a.type, attrs, a.get_reference()) + + args = ", ".join([descr_arg(i, a) for i, a in enumerate(self.args)]) + + fnty = self.callee.function_type + # Only print function type if variable-argument + if fnty.var_arg: + ty = fnty + # Otherwise, just print the return type. + else: + # Fastmath flag work only in this case + ty = fnty.return_type + callee_ref = "{0} {1}".format(ty, self.callee.get_reference()) + if self.cconv: + callee_ref = "{0} {1}".format(self.cconv, callee_ref) + + tail_marker = "" + if self.tail: + tail_marker = "{0} ".format(self.tail) + + buf.append( + "{tail}{op}{fastmath} {callee}({args}){attr}{tags}{meta}\n".format( + tail=tail_marker, + op=self.opname, + fastmath="".join([" " + attr for attr in self.fastmath]), + callee=callee_ref, + args=args, + attr="".join([" " + attr for attr in self.attributes]), + tags=(" " + self.tags if self.tags is not None else ""), + meta=( + self._stringify_metadata(leading_comma=True) if add_metadata else "" + ), + ) + ) + + +###### END OF LLVMLITE EXTENSIONS ###### + + +def _init(): + global llvm_binpath + global llvm_libpath + + sys_platform = sys.platform + + llvm_version = ( + subprocess.check_output(["llvm-config", "--version"]).decode().strip() + ) + if llvm_version != "14.0.6": + raise RuntimeError( + f"Incompatible LLVM version {llvm_version}, PyOMP expects LLVM 14.0.6" + ) + + llvm_binpath = subprocess.check_output(["llvm-config", "--bindir"]).decode().strip() + llvm_libpath = subprocess.check_output(["llvm-config", "--libdir"]).decode().strip() + iomplib = ( + llvm_libpath + "/libomp" + (".dylib" if sys_platform == "darwin" else ".so") + ) + if DEBUG_OPENMP >= 1: + print("Found OpenMP runtime library at", iomplib) + ll.load_library_permanently(iomplib) + + # libomptarget is unavailable on apple, windows, so return. 
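+    # Only the host runtime (libomp) loaded above is usable on those
+    # platforms; device offloading needs libomptarget, which is loaded below
+    # on other platforms.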
+ if sys_platform.startswith("darwin") or sys_platform.startswith("win32"): + return + + omptargetlib = llvm_libpath + "/libomptarget.so" + if DEBUG_OPENMP >= 1: + print("Found OpenMP target runtime library at", omptargetlib) + ll.load_library_permanently(omptargetlib) + + +_init() + + +# ---------------------------------------------------------------------------------------------- + + +class NameSlice: + def __init__(self, name, the_slice): + self.name = name + self.the_slice = the_slice + + def __str__(self): + return "NameSlice(" + str(self.name) + "," + str(self.the_slice) + ")" + + +class StringLiteral: + def __init__(self, x): + self.x = x + + +@intrinsic +def get_itercount(typingctx, it): + if isinstance(it, types.RangeIteratorType): + sig = typing.signature(it.yield_type, it) + + def codegen(context, builder, signature, args): + assert len(args) == 1 + val = args[0] + pair = context.make_helper(builder, it, val) + return builder.load(pair.count) + + return sig, codegen + + +def remove_privatized(x): + if isinstance(x, ir.Var): + x = x.name + + if isinstance(x, str) and x.endswith("%privatized"): + return x[: len(x) - len("%privatized")] + else: + return x + + +def remove_all_privatized(x): + new_x = None + while new_x != x: + new_x = x + x = remove_privatized(new_x) + + return new_x + + +def typemap_lookup(typemap, x): + orig_x = x + if isinstance(x, ir.Var): + x = x.name + + while True: + if x in typemap: + return typemap[x] + new_x = remove_privatized(x) + if new_x == x: + break + else: + x = new_x + + tkeys = typemap.keys() + + # Get basename (without privatized) + x = remove_all_privatized(x) + + potential_keys = list(filter(lambda y: y.startswith(x), tkeys)) + + for pkey in potential_keys: + pkey_base = remove_all_privatized(pkey) + if pkey_base == x: + return typemap[pkey] + + raise KeyError(f"{orig_x} and all of its non-privatized names not found in typemap") + + +class openmp_tag(object): + def __init__(self, name, arg=None, load=False, non_arg=False, omp_slice=None): + self.name = name + self.arg = arg + self.load = load + self.loaded_arg = None + self.xarginfo = [] + self.non_arg = non_arg + self.omp_slice = omp_slice + + def __getstate__(self): + state = self.__dict__.copy() + if isinstance(self.arg, lir.instructions.AllocaInstr): + del state["arg"] + return state + + def __setstate__(self, state): + self.__dict__.update(state) + if not hasattr(self, "arg"): + self.arg = None + + def var_in(self, var): + assert isinstance(var, str) + + if isinstance(self.arg, ir.Var): + return self.arg.name == var + + if isinstance(self.arg, str): + return self.arg == var + + return False + + def arg_size(self, x, lowerer): + if DEBUG_OPENMP >= 2: + print("arg_size:", x, type(x)) + if isinstance(x, NameSlice): + x = x.name + if isinstance(x, ir.Var): + # Make sure the var referred to has been alloc'ed already. 
+ lowerer._alloca_var(x.name, lowerer.fndesc.typemap[x.name]) + if self.load: + assert False + else: + arg_str = lowerer.getvar(x.name) + return lowerer.context.get_abi_sizeof(arg_str.type.pointee) + elif isinstance(x, lir.instructions.AllocaInstr): + return lowerer.context.get_abi_sizeof(x.type.pointee) + elif isinstance(x, str): + xtyp = lowerer.fndesc.typemap[x] + if DEBUG_OPENMP >= 1: + print("xtyp:", xtyp, type(xtyp)) + lowerer._alloca_var(x, xtyp) + if self.load: + assert False + else: + arg_str = lowerer.getvar(x) + return lowerer.context.get_abi_sizeof(arg_str.type.pointee) + elif isinstance(x, int): + assert False + else: + print("unknown arg type:", x, type(x)) + assert False + + def arg_to_str( + self, x, lowerer, struct_lower=False, var_table=None, gen_copy=False + ): + if DEBUG_OPENMP >= 1: + print("arg_to_str:", x, type(x), self.load, type(self.load)) + if struct_lower: + assert isinstance(x, str) + assert var_table is not None + + typemap = lowerer.fndesc.typemap + + if isinstance(x, NameSlice): + if DEBUG_OPENMP >= 2: + print("nameslice found:", x) + x = x.name + if isinstance(x, ir.Var): + # Make sure the var referred to has been alloc'ed already. + lowerer._alloca_var(x.name, typemap_lookup(typemap, x)) + if self.load: + if not self.loaded_arg: + self.loaded_arg = lowerer.loadvar(x.name) + lop = self.loaded_arg.operands[0] + loptype = lop.type + pointee = loptype.pointee + ref = self.loaded_arg._get_reference() + decl = str(pointee) + " " + ref + else: + arg_str = lowerer.getvar(x.name) + if isinstance(arg_str, lir.values.Argument): + decl = str(arg_str) + else: + decl = get_decl(arg_str) + elif isinstance(x, lir.instructions.AllocaInstr): + decl = get_decl(x) + elif isinstance(x, str): + if "*" in x: + xsplit = x.split("*") + assert len(xsplit) == 2 + # xtyp = get_dotted_type(x, typemap, lowerer) + xtyp = typemap_lookup(typemap, xsplit[0]) + if DEBUG_OPENMP >= 1: + print("xtyp:", xtyp, type(xtyp)) + lowerer._alloca_var(x, xtyp) + if self.load: + if not self.loaded_arg: + self.loaded_arg = lowerer.loadvar(x) + lop = self.loaded_arg.operands[0] + loptype = lop.type + pointee = loptype.pointee + ref = self.loaded_arg._get_reference() + decl = str(pointee) + " " + ref + assert len(xsplit) == 1 + else: + arg_str = lowerer.getvar(xsplit[0]) + # arg_str = lowerer.getvar(x) + if isinstance(arg_str, lir.Argument): + decl = str(arg_str) + else: + decl = get_decl(arg_str) + if len(xsplit) > 1: + cur_typ = xtyp + field_indices = [] + for field in xsplit[1:]: + dm = lowerer.context.data_model_manager.lookup(cur_typ) + findex = dm._fields.index(field) + field_indices.append("i32 " + str(findex)) + cur_typ = dm._members[findex] + fi_str = ",".join(field_indices) + decl += f", {fi_str}" + # decl = f"SCOPE({decl}, {fi_str})" + else: + xtyp = typemap_lookup(typemap, x) + if DEBUG_OPENMP >= 1: + print("xtyp:", xtyp, type(xtyp)) + lowerer._alloca_var(x, xtyp) + if self.load: + if not self.loaded_arg: + self.loaded_arg = lowerer.loadvar(x) + lop = self.loaded_arg.operands[0] + loptype = lop.type + pointee = loptype.pointee + ref = self.loaded_arg._get_reference() + decl = str(pointee) + " " + ref + else: + arg_str = lowerer.getvar(x) + if isinstance(arg_str, lir.values.Argument): + decl = str(arg_str) + elif isinstance(arg_str, lir.instructions.AllocaInstr): + decl = get_decl(arg_str) + else: + assert False, ( + f"Don't know how to get decl string for variable {arg_str} of type {type(arg_str)}" + ) + + if struct_lower and isinstance(xtyp, types.npytypes.Array): + dm = 
lowerer.context.data_model_manager.lookup(xtyp) + cur_tag_ndim = xtyp.ndim + stride_typ = lowerer.context.get_value_type( + types.intp + ) # lir.Type.int(64) + stride_abi_size = lowerer.context.get_abi_sizeof(stride_typ) + array_var = var_table[self.arg] + if DEBUG_OPENMP >= 1: + print( + "Found array mapped:", + self.name, + self.arg, + xtyp, + type(xtyp), + stride_typ, + type(stride_typ), + stride_abi_size, + array_var, + type(array_var), + ) + size_var = ir.Var(None, self.arg + "_size_var", array_var.loc) + # size_var = array_var.scope.redefine("size_var", array_var.loc) + size_getattr = ir.Expr.getattr(array_var, "size", array_var.loc) + size_assign = ir.Assign(size_getattr, size_var, array_var.loc) + typemap[size_var.name] = types.int64 + lowerer._alloca_var(size_var.name, typemap[size_var.name]) + lowerer.lower_inst(size_assign) + data_field = dm._fields.index("data") + shape_field = dm._fields.index("shape") + strides_field = dm._fields.index("strides") + size_lowered = get_decl(lowerer.getvar(size_var.name)) + fixed_size = cur_tag_ndim + # fixed_size = stride_abi_size * cur_tag_ndim + decl += f", i32 {data_field}, i64 0, {size_lowered}" + decl += f", i32 {shape_field}, i64 0, i64 {fixed_size}" + decl += f", i32 {strides_field}, i64 0, i64 {fixed_size}" + + # see core/datamodel/models.py + # struct_tags.append(openmp_tag(cur_tag.name, cur_tag.arg + "*data", non_arg=True, omp_slice=(0,lowerer.loadvar(size_var.name)))) + # struct_tags.append(openmp_tag(cur_tag.name, cur_tag.arg + "*shape", non_arg=True, omp_slice=(0,stride_abi_size * cur_tag_ndim))) + # struct_tags.append(openmp_tag(cur_tag.name, cur_tag.arg + "*strides", non_arg=True, omp_slice=(0,stride_abi_size * cur_tag_ndim))) + + if gen_copy and isinstance(xtyp, types.npytypes.Array): + native_np_copy, copy_cres = create_native_np_copy(xtyp) + lowerer.library.add_llvm_module(copy_cres.library._final_module) + nnclen = len(native_np_copy) + decl += f', [{nnclen} x i8] c"{native_np_copy}"' + elif isinstance(x, StringLiteral): + decl = str(cgutils.make_bytearray(x.x)) + elif isinstance(x, int): + decl = "i32 " + str(x) + else: + print("unknown arg type:", x, type(x)) + + if self.omp_slice is not None: + + def handle_var(x): + if isinstance(x, ir.Var): + loaded_size = lowerer.loadvar(x.name) + loaded_op = loaded_size.operands[0] + loaded_pointee = loaded_op.type.pointee + ret = str(loaded_pointee) + " " + loaded_size._get_reference() + else: + ret = "i64 " + str(x) + return ret + + start_slice = handle_var(self.omp_slice[0]) + end_slice = handle_var(self.omp_slice[1]) + decl += f", {start_slice}, {end_slice}" + # decl = f"SLICE({decl}, {self.omp_slice[0]}, {self.omp_slice[1]})" + + return decl + + def post_entry(self, lowerer): + for xarginfo, xarginfo_args, x, alloca_tuple_list in self.xarginfo: + loaded_args = [ + lowerer.builder.load(alloca_tuple[2]) + for alloca_tuple in alloca_tuple_list + ] + fa_res = xarginfo.from_arguments(lowerer.builder, tuple(loaded_args)) + # fa_res = xarginfo.from_arguments(lowerer.builder,tuple([xarg for xarg in xarginfo_args])) + assert len(fa_res) == 1 + lowerer.storevar(fa_res[0], x) + + def add_length_firstprivate(self, x, lowerer): + if self.name == "QUAL.OMP.FIRSTPRIVATE": + return [x] + # return [x, self.arg_size(x, lowerer)] + # return [x, lowerer.context.get_constant(types.uintp, self.arg_size(x, lowerer))] + else: + return [x] + + def unpack_arg(self, x, lowerer, xarginfo_list): + if isinstance(x, ir.Var): + return self.add_length_firstprivate(x, lowerer), None + elif isinstance(x, 
lir.instructions.AllocaInstr): + return self.add_length_firstprivate(x, lowerer), None + elif isinstance(x, str): + xtyp = lowerer.fndesc.typemap[x] + if DEBUG_OPENMP >= 2: + print("xtyp:", xtyp, type(xtyp)) + if self.load: + return self.add_length_firstprivate(x, lowerer), None + else: + names_to_unpack = [] + # names_to_unpack = ["QUAL.OMP.FIRSTPRIVATE"] + # names_to_unpack = ["QUAL.OMP.PRIVATE", "QUAL.OMP.FIRSTPRIVATE"] + if ( + isinstance(xtyp, types.npytypes.Array) + and self.name in names_to_unpack + ): + # from core/datamodel/packer.py + xarginfo = lowerer.context.get_arg_packer((xtyp,)) + xloaded = lowerer.loadvar(x) + xarginfo_args = list( + xarginfo.as_arguments(lowerer.builder, [xloaded]) + ) + xarg_alloca_vars = [] + for xarg in xarginfo_args: + if DEBUG_OPENMP >= 2: + print( + "xarg:", + type(xarg), + xarg, + "agg:", + xarg.aggregate, + type(xarg.aggregate), + "ind:", + xarg.indices, + ) + print(xarg.aggregate.type.elements[xarg.indices[0]]) + alloca_name = "$alloca_" + xarg.name + alloca_typ = xarg.aggregate.type.elements[xarg.indices[0]] + alloca_res = lowerer.alloca_lltype(alloca_name, alloca_typ) + if DEBUG_OPENMP >= 2: + print( + "alloca:", + alloca_name, + alloca_typ, + alloca_res, + alloca_res.get_reference(), + ) + xarg_alloca_vars.append((alloca_name, alloca_typ, alloca_res)) + lowerer.builder.store(xarg, alloca_res) + xarginfo_list.append((xarginfo, xarginfo_args, x, xarg_alloca_vars)) + rets = [] + for i, xarg in enumerate(xarg_alloca_vars): + rets.append(xarg[2]) + if i == 4: + alloca_name = "$alloca_total_size_" + str(x) + if DEBUG_OPENMP >= 2: + print("alloca_name:", alloca_name) + alloca_typ = lowerer.context.get_value_type( + types.intp + ) # lir.Type.int(64) + alloca_res = lowerer.alloca_lltype(alloca_name, alloca_typ) + if DEBUG_OPENMP >= 2: + print( + "alloca:", + alloca_name, + alloca_typ, + alloca_res, + alloca_res.get_reference(), + ) + mul_res = lowerer.builder.mul( + lowerer.builder.load(xarg_alloca_vars[2][2]), + lowerer.builder.load(xarg_alloca_vars[3][2]), + ) + lowerer.builder.store(mul_res, alloca_res) + rets.append(alloca_res) + else: + rets.append(self.arg_size(xarg[2], lowerer)) + return rets, [x] + else: + return self.add_length_firstprivate(x, lowerer), None + elif isinstance(x, int): + return self.add_length_firstprivate(x, lowerer), None + else: + print("unknown arg type:", x, type(x)) + + return self.add_length_firstprivate(x, lowerer), None + + def unpack_arrays(self, lowerer): + if isinstance(self.arg, list): + arg_list = self.arg + elif self.arg is not None: + arg_list = [self.arg] + else: + return [self] + new_xarginfo = [] + unpack_res = [self.unpack_arg(arg, lowerer, new_xarginfo) for arg in arg_list] + new_args = [x[0] for x in unpack_res] + arrays_to_private = [] + for x in unpack_res: + if x[1]: + arrays_to_private.append(x[1]) + ot_res = openmp_tag(self.name, sum(new_args, []), self.load) + ot_res.xarginfo = new_xarginfo + return [ot_res] + ( + [] + if len(arrays_to_private) == 0 + else [openmp_tag("QUAL.OMP.PRIVATE", sum(arrays_to_private, []), self.load)] + ) + + def lower(self, lowerer, debug): + decl = "" + if debug and DEBUG_OPENMP >= 1: + print("openmp_tag::lower", self.name, self.arg, type(self.arg)) + + if isinstance(self.arg, list): + arg_list = self.arg + elif self.arg is not None: + arg_list = [self.arg] + else: + arg_list = [] + typemap = lowerer.fndesc.typemap + assert len(arg_list) <= 1 + + if self.name == "QUAL.OMP.TARGET.IMPLICIT": + assert False # shouldn't get here anymore + + name_to_use = self.name + + is_array 
= self.arg in typemap and isinstance( + typemap[self.arg], types.npytypes.Array + ) + + gen_copy = name_to_use in ["QUAL.OMP.FIRSTPRIVATE", "QUAL.OMP.LASTPRIVATE"] + + if ( + name_to_use + in [ + "QUAL.OMP.MAP.TOFROM", + "QUAL.OMP.MAP.TO", + "QUAL.OMP.MAP.FROM", + "QUAL.OMP.MAP.ALLOC", + ] + and is_array + ): + # name_to_use += ".STRUCT" + # var_table = get_name_var_table(lowerer.func_ir.blocks) + # decl = ",".join([self.arg_to_str(x, lowerer, struct_lower=True, var_table=var_table) for x in arg_list]) + decl = ",".join( + [ + self.arg_to_str(x, lowerer, struct_lower=False, gen_copy=gen_copy) + for x in arg_list + ] + ) + else: + decl = ",".join( + [ + self.arg_to_str(x, lowerer, struct_lower=False, gen_copy=gen_copy) + for x in arg_list + ] + ) + + return '"' + name_to_use + '"(' + decl + ")" + + def replace_vars_inner(self, var_dict): + if isinstance(self.arg, ir.Var): + self.arg = replace_vars_inner(self.arg, var_dict) + + def add_to_usedef_set(self, use_set, def_set, start): + assert start == True or start == False + if DEBUG_OPENMP >= 3: + print("add_to_usedef_set", start, self.name, "is_dsa=", is_dsa(self.name)) + + def add_arg(arg, the_set): + if isinstance(self.arg, ir.Var): + the_set.add(self.arg.name) + elif isinstance(self.arg, str): + the_set.add(self.arg) + elif isinstance(self.arg, NameSlice): + assert isinstance(self.arg.name, str), "Expected str in NameSlice arg" + the_set.add(self.arg.name) + # TODO: Create a good error check mechanism. + # else: ? + + if self.name.startswith("DIR.OMP"): + assert not isinstance(self.arg, (ir.Var, str)) + return + + if self.name in [ + "QUAL.OMP.MAP.TO", + "QUAL.OMP.IF", + "QUAL.OMP.NUM_THREADS", + "QUAL.OMP.NUM_TEAMS", + "QUAL.OMP.THREAD_LIMIT", + "QUAL.OMP.SCHEDULE.STATIC", + "QUAL.OMP.SCHEDULE.RUNTIME", + "QUAL.OMP.SCHEDULE.GUIDED", + "QUAL.OMP.SCHEDULE_DYNAMIC", + "QUAL.OMP.FIRSTPRIVATE", + "QUAL.OMP.COPYIN", + "QUAL.OMP.COPYPRIVATE", + "QUAL.OMP.NORMALIZED.LB", + "QUAL.OMP.NORMALIZED.START", + "QUAL.OMP.NORMALIZED.UB", + "QUAL.OMP.MAP.TO.STRUCT", + ]: + if start: + add_arg(self.arg, use_set) + elif self.name in [ + "QUAL.OMP.PRIVATE", + "QUAL.OMP.LINEAR", + "QUAL.OMP.NORMALIZED.IV", + "QUAL.OMP.MAP.ALLOC", + "QUAL.OMP.MAP.ALLOC.STRUCT", + ]: + # Intentionally do nothing. + pass + elif self.name in ["QUAL.OMP.SHARED"]: + add_arg(self.arg, use_set) + elif self.name in [ + "QUAL.OMP.MAP.TOFROM", + "QUAL.OMP.TARGET.IMPLICIT", + "QUAL.OMP.MAP.TOFROM.STRUCT", + ]: + if start: + add_arg(self.arg, use_set) + else: + add_arg(self.arg, use_set) + add_arg(self.arg, def_set) + elif self.name in [ + "QUAL.OMP.MAP.FROM", + "QUAL.OMP.LASTPRIVATE", + "QUAL.OMP.MAP.FROM.STRUCT", + ] or self.name.startswith("QUAL.OMP.REDUCTION"): + if not start: + add_arg(self.arg, use_set) + add_arg(self.arg, def_set) + else: + # All other clauses should not have a variable argument. 
+ if isinstance(self.arg, (ir.Var, str)): + print("Bad usedef tag:", self.name, self.arg) + assert not isinstance(self.arg, (ir.Var, str)) + + def __str__(self): + return ( + "openmp_tag(" + + str(self.name) + + "," + + str(self.arg) + + ( + "" + if self.omp_slice is None + else f", omp_slice({self.omp_slice[0]},{self.omp_slice[1]})" + ) + + ")" + ) + + def __repr__(self): + return self.__str__() + + +def openmp_tag_list_to_str(tag_list, lowerer, debug): + tag_strs = [x.lower(lowerer, debug) for x in tag_list] + return "[ " + ", ".join(tag_strs) + " ]" + + +def list_vars_from_tags(tags): + used_vars = [] + for t in tags: + if isinstance(t.arg, ir.Var): + used_vars.append(t.arg) + return used_vars + + +def openmp_region_alloca(obj, alloca_instr, typ): + obj.alloca(alloca_instr, typ) + + +def push_alloca_callback(lowerer, callback, data, builder): + # cgutils.push_alloca_callbacks(callback, data) + if not hasattr(builder, "_lowerer_push_alloca_callbacks"): + builder._lowerer_push_alloca_callbacks = 0 + builder._lowerer_push_alloca_callbacks += 1 + + +def pop_alloca_callback(lowerer, builder): + # cgutils.pop_alloca_callbacks() + builder._lowerer_push_alloca_callbacks -= 1 + + +def in_openmp_region(builder): + if hasattr(builder, "_lowerer_push_alloca_callbacks"): + return builder._lowerer_push_alloca_callbacks > 0 + else: + return False + + +def find_target_start_end(func_ir, target_num): + start_block = None + end_block = None + + for label, block in func_ir.blocks.items(): + if isinstance(block.body[0], openmp_region_start): + block_target_num = block.body[0].has_target() + if target_num == block_target_num: + start_block = label + if start_block is not None and end_block is not None: + return start_block, end_block + elif isinstance(block.body[0], openmp_region_end): + block_target_num = block.body[0].start_region.has_target() + if target_num == block_target_num: + end_block = label + if start_block is not None and end_block is not None: + return start_block, end_block + + dprint_func_ir(func_ir, "find_target_start_end") + print("target_num:", target_num) + assert False + + +def get_tags_of_type(clauses, ctype): + ret = [] + for c in clauses: + if c.name == ctype: + ret.append(c) + return ret + + +def copy_one(x, calltypes): + if DEBUG_OPENMP >= 2: + print("copy_one:", x, type(x)) + if isinstance(x, ir.Loc): + return copy.copy(x) + elif isinstance(x, ir.Expr): + if x in calltypes: + ctyp = calltypes[x] + else: + ctyp = None + ret = ir.Expr( + copy_one(x.op, calltypes), + copy_one(x.loc, calltypes), + **copy_one(x._kws, calltypes), + ) + if ctyp and ret not in calltypes: + calltypes[ret] = ctyp + return ret + elif isinstance(x, dict): + return {k: copy_one(v, calltypes) for k, v in x.items()} + elif isinstance(x, list): + return [copy_one(v, calltypes) for v in x] + elif isinstance(x, tuple): + return tuple([copy_one(v, calltypes) for v in x]) + elif isinstance(x, ir.Const): + return ir.Const( + copy_one(x.value, calltypes), copy_one(x.loc, calltypes), x.use_literal_type + ) + elif isinstance( + x, + ( + int, + float, + str, + ir.Global, + python_types.BuiltinFunctionType, + ir.UndefinedType, + type(None), + types.functions.ExternalFunction, + ), + ): + return x + elif isinstance(x, ir.Var): + return ir.Var(x.scope, copy_one(x.name, calltypes), copy_one(x.loc, calltypes)) + elif isinstance(x, ir.Del): + return ir.Del(copy_one(x.value, calltypes), copy_one(x.loc, calltypes)) + elif isinstance(x, ir.Jump): + return ir.Jump(copy_one(x.target, calltypes), copy_one(x.loc, calltypes)) + elif 
isinstance(x, ir.Return): + return ir.Return(copy_one(x.value, calltypes), copy_one(x.loc, calltypes)) + elif isinstance(x, ir.Branch): + return ir.Branch( + copy_one(x.cond, calltypes), + copy_one(x.truebr, calltypes), + copy_one(x.falsebr, calltypes), + copy_one(x.loc, calltypes), + ) + elif isinstance(x, ir.Print): + ctyp = calltypes[x] + ret = copy.copy(x) + calltypes[ret] = ctyp + return ret + elif isinstance(x, ir.Assign): + return ir.Assign( + copy_one(x.value, calltypes), + copy_one(x.target, calltypes), + copy_one(x.loc, calltypes), + ) + elif isinstance(x, ir.Arg): + return ir.Arg( + copy_one(x.name, calltypes), + copy_one(x.index, calltypes), + copy_one(x.loc, calltypes), + ) + elif isinstance(x, ir.SetItem): + ctyp = calltypes[x] + ret = ir.SetItem( + copy_one(x.target, calltypes), + copy_one(x.index, calltypes), + copy_one(x.value, calltypes), + copy_one(x.loc, calltypes), + ) + calltypes[ret] = ctyp + return ret + elif isinstance(x, ir.StaticSetItem): + ctyp = calltypes[x] + ret = ir.StaticSetItem( + copy_one(x.target, calltypes), + copy_one(x.index, calltypes), + copy_one(x.index_var, calltypes), + copy_one(x.value, calltypes), + copy_one(x.loc, calltypes), + ) + calltypes[ret] = ctyp + return ret + elif isinstance(x, ir.FreeVar): + return ir.FreeVar( + copy_one(x.index, calltypes), + copy_one(x.name, calltypes), + copy_one(x.value, calltypes), + copy_one(x.loc, calltypes), + ) + elif isinstance(x, slice): + return slice( + copy_one(x.start, calltypes), + copy_one(x.stop, calltypes), + copy_one(x.step, calltypes), + ) + elif isinstance(x, ir.PopBlock): + return ir.PopBlock(copy_one(x.loc, calltypes)) + elif isinstance(x, ir.SetAttr): + ctyp = calltypes[x] + ret = ir.SetAttr( + copy_one(x.target, calltypes), + copy_one(x.attr, calltypes), + copy_one(x.value, calltypes), + copy_one(x.loc, calltypes), + ) + calltypes[ret] = ctyp + return ret + elif isinstance(x, ir.DelAttr): + return ir.DelAttr( + copy_one(x.target, calltypes), + copy_one(x.attr, calltypes), + copy_one(x.loc, calltypes), + ) + elif isinstance(x, types.Type): + return x # Don't copy types. + print("Failed to handle the following type when copying target IR.", type(x), x) + assert False + + +def copy_ir(input_ir, calltypes, depth=1): + assert depth >= 0 and depth <= 1 + + # This is a depth 0 copy. + cur_ir = input_ir.copy() + if depth == 1: + for blk in cur_ir.blocks.values(): + for i in range(len(blk.body)): + if not isinstance( + blk.body[i], (openmp_region_start, openmp_region_end) + ): + blk.body[i] = copy_one(blk.body[i], calltypes) + + return cur_ir + + +def is_target_tag(x): + ret = x.startswith("DIR.OMP.TARGET") and x not in [ + "DIR.OMP.TARGET.DATA", + "DIR.OMP.TARGET.ENTER.DATA", + "DIR.OMP.TARGET.EXIT.DATA", + ] + return ret + + +def replace_np_empty_with_cuda_shared( + outlined_ir, typemap, calltypes, prefix, typingctx +): + if DEBUG_OPENMP >= 2: + print("starting replace_np_empty_with_cuda_shared") + outlined_ir = outlined_ir.blocks + converted_arrays = [] + consts = {} + topo_order = find_topo_order(outlined_ir) + mode = 0 # 0 = non-target region, 1 = target region, 2 = teams region, 3 = teams parallel region + # For each block in topological order... + for label in topo_order: + block = outlined_ir[label] + new_block_body = [] + blen = len(block.body) + index = 0 + # For each statement in the block. + while index < blen: + stmt = block.body[index] + # Adjust mode based on the start of an openmp region. 
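+            # The mode also picks the replacement allocator used further down:
+            # np.empty becomes numba.cuda.shared.array inside a teams region
+            # (mode 2) and numba.cuda.local.array inside a teams+parallel
+            # region (mode 3).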
+ if isinstance(stmt, openmp_region_start): + if "TARGET" in stmt.tags[0].name: + assert mode == 0 + mode = 1 + if "TEAMS" in stmt.tags[0].name and mode == 1: + mode = 2 + if "PARALLEL" in stmt.tags[0].name and mode == 2: + mode = 3 + new_block_body.append(stmt) + # Adjust mode based on the end of an openmp region. + elif isinstance(stmt, openmp_region_end): + if mode == 3 and "PARALLEL" in stmt.tags[0].name: + mode = 2 + if mode == 2 and "TEAMS" in stmt.tags[0].name: + mode = 1 + if mode == 1 and "TARGET" in stmt.tags[0].name: + mode = 0 + new_block_body.append(stmt) + # Fix calltype for the np.empty call to have literal as first + # arg and include explicit dtype. + elif ( + isinstance(stmt, ir.Assign) + and isinstance(stmt.value, ir.Expr) + and stmt.value.op == "call" + and stmt.value.func in converted_arrays + ): + size = consts[stmt.value.args[0].name] + # The 1D case where the dimension size is directly a const. + if isinstance(size, ir.Const): + size = size.value + signature = calltypes[stmt.value] + signature_args = ( + types.scalars.IntegerLiteral(size), + types.functions.NumberClass(signature.return_type.dtype), + ) + del calltypes[stmt.value] + calltypes[stmt.value] = typing.templates.Signature( + signature.return_type, signature_args, signature.recvr + ) + # The 2D+ case where the dimension sizes are in a tuple. + elif isinstance(size, ir.Expr): + signature = calltypes[stmt.value] + signature_args = ( + types.Tuple( + [ + types.scalars.IntegerLiteral(consts[x.name].value) + for x in size.items + ] + ), + types.functions.NumberClass(signature.return_type.dtype), + ) + del calltypes[stmt.value] + calltypes[stmt.value] = typing.templates.Signature( + signature.return_type, signature_args, signature.recvr + ) + + # These lines will force the function to be in the data structures that lowering uses. + afnty = typemap[stmt.value.func.name] + afnty.get_call_type(typingctx, signature_args, {}) + if len(stmt.value.args) == 1: + dtype_to_use = signature.return_type.dtype + # If dtype in kwargs then remove it. + if len(stmt.value.kws) > 0: + for kwarg in stmt.value.kws: + if kwarg[0] == "dtype": + stmt.value.kws = list( + filter(lambda x: x[0] != "dtype", stmt.value.kws) + ) + break + new_block_body.append( + ir.Assign( + ir.Global("np", np, lhs.loc), + ir.Var(lhs.scope, mk_unique_var(".np_global"), lhs.loc), + lhs.loc, + ) + ) + typemap[new_block_body[-1].target.name] = types.Module(np) + new_block_body.append( + ir.Assign( + ir.Expr.getattr( + new_block_body[-1].target, str(dtype_to_use), lhs.loc + ), + ir.Var(lhs.scope, mk_unique_var(".np_dtype"), lhs.loc), + lhs.loc, + ) + ) + typemap[new_block_body[-1].target.name] = ( + types.functions.NumberClass(signature.return_type.dtype) + ) + stmt.value.args.append(new_block_body[-1].target) + else: + raise NotImplementedError( + "np.empty having more than shape and dtype arguments not yet supported." + ) + new_block_body.append(stmt) + # Keep track of variables assigned from consts or from build_tuples make up exclusively of + # variables assigned from consts. + elif isinstance(stmt, ir.Assign) and ( + isinstance(stmt.value, ir.Const) + or ( + isinstance(stmt.value, ir.Expr) + and stmt.value.op == "build_tuple" + and all([x.name in consts for x in stmt.value.items]) + ) + ): + consts[stmt.target.name] = stmt.value + new_block_body.append(stmt) + # If we see a global for the numpy module. 
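+            # The lookahead below additionally requires the next statement to
+            # be a getattr of "empty" on that module and that we are inside a
+            # target region (mode > 0); otherwise both statements are kept
+            # unchanged.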
+ elif ( + isinstance(stmt, ir.Assign) + and isinstance(stmt.value, ir.Global) + and isinstance(stmt.value.value, python_types.ModuleType) + and stmt.value.value.__name__ == "numpy" + ): + lhs = stmt.target + index += 1 + next_stmt = block.body[index] + # And the next statement is a getattr for the name "empty" on the numpy module + # and we are in a target region. + if ( + isinstance(next_stmt, ir.Assign) + and isinstance(next_stmt.value, ir.Expr) + and next_stmt.value.value == lhs + and next_stmt.value.op == "getattr" + and next_stmt.value.attr == "empty" + and mode > 0 + ): + # Remember that we are converting this np.empty into a CUDA call. + converted_arrays.append(next_stmt.target) + + # Create numba.cuda module variable. + new_block_body.append( + ir.Assign( + ir.Global("numba", numba, lhs.loc), + ir.Var( + lhs.scope, mk_unique_var(".cuda_shared_global"), lhs.loc + ), + lhs.loc, + ) + ) + typemap[new_block_body[-1].target.name] = types.Module(numba) + new_block_body.append( + ir.Assign( + ir.Expr.getattr(new_block_body[-1].target, "cuda", lhs.loc), + ir.Var( + lhs.scope, + mk_unique_var(".cuda_shared_getattr"), + lhs.loc, + ), + lhs.loc, + ) + ) + typemap[new_block_body[-1].target.name] = types.Module(numba.cuda) + + if mode == 1: + raise NotImplementedError( + "np.empty used in non-teams or parallel target region" + ) + pass + elif mode == 2: + # Create numba.cuda.shared module variable. + new_block_body.append( + ir.Assign( + ir.Expr.getattr( + new_block_body[-1].target, "shared", lhs.loc + ), + ir.Var( + lhs.scope, + mk_unique_var(".cuda_shared_getattr"), + lhs.loc, + ), + lhs.loc, + ) + ) + typemap[new_block_body[-1].target.name] = types.Module( + numba.cuda.stubs.shared + ) + elif mode == 3: + # Create numba.cuda.local module variable. + new_block_body.append( + ir.Assign( + ir.Expr.getattr( + new_block_body[-1].target, "local", lhs.loc + ), + ir.Var( + lhs.scope, + mk_unique_var(".cuda_local_getattr"), + lhs.loc, + ), + lhs.loc, + ) + ) + typemap[new_block_body[-1].target.name] = types.Module( + numba.cuda.stubs.local + ) + + # Change the typemap for the original function variable for np.empty. + afnty = typingctx.resolve_getattr( + typemap[new_block_body[-1].target.name], "array" + ) + del typemap[next_stmt.target.name] + typemap[next_stmt.target.name] = afnty + # Change the variable that previously was assigned np.empty to now be one of + # the CUDA array allocators. 
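+                    # The getattr below rebinds that variable to
+                    # numba.cuda.{shared,local}.array; the literal-shape call
+                    # signature installed earlier in this function matches
+                    # these allocators, whose shape argument must be a
+                    # compile-time constant.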
+ new_block_body.append( + ir.Assign( + ir.Expr.getattr( + new_block_body[-1].target, "array", lhs.loc + ), + next_stmt.target, + lhs.loc, + ) + ) + else: + new_block_body.append(stmt) + new_block_body.append(next_stmt) + else: + new_block_body.append(stmt) + index += 1 + block.body = new_block_body + + +class openmp_region_start(ir.Stmt): + def __init__(self, tags, region_number, loc, firstprivate_dead_after=None): + if DEBUG_OPENMP >= 2: + print("region ids openmp_region_start::__init__", id(self)) + self.tags = tags + self.region_number = region_number + self.loc = loc + self.omp_region_var = None + self.omp_metadata = None + self.tag_vars = set() + self.normal_iv = None + self.target_copy = False + self.firstprivate_dead_after = ( + [] if firstprivate_dead_after is None else firstprivate_dead_after + ) + for tag in self.tags: + if isinstance(tag.arg, ir.Var): + self.tag_vars.add(tag.arg.name) + elif isinstance(tag.arg, str): + self.tag_vars.add(tag.arg) + elif isinstance(tag.arg, NameSlice): + self.tag_vars.add(tag.arg.name) + + if tag.name == "QUAL.OMP.NORMALIZED.IV": + self.normal_iv = tag.arg + if DEBUG_OPENMP >= 1: + print("tags:", self.tags) + print("tag_vars:", sorted(self.tag_vars)) + self.acq_res = False + self.acq_rel = False + self.alloca_queue = [] + self.end_region = None + + def __getstate__(self): + state = self.__dict__.copy() + return state + + def __setstate__(self, state): + self.__dict__.update(state) + + def replace_var_names(self, namedict): + for i in range(len(self.tags)): + if isinstance(self.tags[i].arg, ir.Var): + if self.tags[i].arg.name in namedict: + var = self.tags[i].arg + self.tags[i].arg = ir.Var(var.scope, namedict[var.name], var.log) + elif isinstance(self.tags[i].arg, str): + if "*" in self.tags[i].arg: + xsplit = self.tags[i].arg.split("*") + assert len(xsplit) == 2 + if xsplit[0] in namedict: + self.tags[i].arg = namedict[xsplit[0]] + "*" + xsplit[1] + else: + if self.tags[i].arg in namedict: + self.tags[i].arg = namedict[self.tags[i].arg] + + def add_tag(self, tag): + tag_arg_str = None + if isinstance(tag.arg, ir.Var): + tag_arg_str = tag.arg.name + elif isinstance(tag.arg, str): + tag_arg_str = tag.arg + elif isinstance(tag.arg, lir.instructions.AllocaInstr): + tag_arg_str = tag.arg._get_name() + else: + assert False + if isinstance(tag_arg_str, str): + self.tag_vars.add(tag_arg_str) + self.tags.append(tag) + + def get_var_dsa(self, var): + assert isinstance(var, str) + for tag in self.tags: + if is_dsa(tag.name) and tag.var_in(var): + return tag.name + return None + + def requires_acquire_release(self): + pass + # self.acq_res = True + + def requires_combined_acquire_release(self): + pass + # self.acq_rel = True + + def has_target(self): + for t in self.tags: + if is_target_tag(t.name): + return t.arg + return None + + def list_vars(self): + return list_vars_from_tags(self.tags) + + def update_tags(self): + with self.builder.goto_block(self.block): + cur_instr = -1 + + while True: + last_instr = self.builder.block.instructions[cur_instr] + if ( + isinstance(last_instr, lir.instructions.CallInstr) + and last_instr.tags is not None + and len(last_instr.tags) > 0 + ): + break + cur_instr -= 1 + + last_instr.tags = openmp_tag_list_to_str(self.tags, self.lowerer, False) + if DEBUG_OPENMP >= 1: + print("last_tags:", last_instr.tags, type(last_instr.tags)) + + def alloca(self, alloca_instr, typ): + # We can't process these right away since the processing required can + # lead to infinite recursion. 
So, we just accumulate them in a queue + # and then process them later at the end_region marker so that the + # variables are guaranteed to exist in their full form so that when we + # process them then they won't lead to infinite recursion. + self.alloca_queue.append((alloca_instr, typ)) + + def process_alloca_queue(self): + # This should be old code...making sure with the assertion. + assert len(self.alloca_queue) == 0 + has_update = False + for alloca_instr, typ in self.alloca_queue: + has_update = self.process_one_alloca(alloca_instr, typ) or has_update + if has_update: + self.update_tags() + self.alloca_queue = [] + + def post_lowering_process_alloca_queue(self, enter_directive): + has_update = False + if DEBUG_OPENMP >= 1: + print("starting post_lowering_process_alloca_queue") + for alloca_instr, typ in self.alloca_queue: + has_update = self.process_one_alloca(alloca_instr, typ) or has_update + if has_update: + if DEBUG_OPENMP >= 1: + print( + "post_lowering_process_alloca_queue has update:", + enter_directive.tags, + ) + enter_directive.tags = openmp_tag_list_to_str( + self.tags, self.lowerer, False + ) + # LLVM IR is doing some string caching and the following line is necessary to + # reset that caching so that the original tag text can be overwritten above. + enter_directive._clear_string_cache() + if DEBUG_OPENMP >= 1: + print( + "post_lowering_process_alloca_queue updated tags:", + enter_directive.tags, + ) + self.alloca_queue = [] + + def process_one_alloca(self, alloca_instr, typ): + avar = alloca_instr.name + if DEBUG_OPENMP >= 1: + print( + "openmp_region_start process_one_alloca:", + id(self), + alloca_instr, + avar, + typ, + type(alloca_instr), + self.tag_vars, + ) + + has_update = False + if ( + self.normal_iv is not None + and avar != self.normal_iv + and avar.startswith(self.normal_iv) + ): + for i in range(len(self.tags)): + if DEBUG_OPENMP >= 1: + print("Replacing normalized iv with", avar) + self.tags[i].arg = avar + has_update = True + break + + if not self.needs_implicit_vars(): + return has_update + if avar not in self.tag_vars: + if DEBUG_OPENMP >= 1: + print( + f"LLVM variable {avar} didn't previously exist in the list of vars so adding as private." + ) + self.add_tag( + openmp_tag("QUAL.OMP.PRIVATE", alloca_instr) + ) # is FIRSTPRIVATE right here? + has_update = True + return has_update + + def needs_implicit_vars(self): + first_tag = self.tags[0] + if ( + first_tag.name == "DIR.OMP.PARALLEL" + or first_tag.name == "DIR.OMP.PARALLEL.LOOP" + or first_tag.name == "DIR.OMP.TASK" + ): + return True + return False + + def update_context(self, context, builder): + cctyp = type(context.call_conv) + # print("start update_context id(context)", id(context), "id(const.call_conv)", id(context.call_conv), "cctyp", cctyp, "id(cctyp)", id(cctyp)) + + if ( + not hasattr(cctyp, "pyomp_patch_installed") + or cctyp.pyomp_patch_installed == False + ): + cctyp.pyomp_patch_installed = True + # print("update_context", "id(cctyp.return_user_exec)", id(cctyp.return_user_exc), "id(context)", id(context)) + setattr(cctyp, "orig_return_user_exc", cctyp.return_user_exc) + + def pyomp_return_user_exc(self, builder, *args, **kwargs): + # print("pyomp_return_user_exc") + # Handle exceptions in OpenMP regions by emitting a trap and an + # unreachable terminator. 
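+                # This is presumably because the normal error-status return
+                # path is not available inside an outlined OpenMP region, so
+                # the region aborts instead of propagating the exception.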
+ if in_openmp_region(builder): + fnty = lir.types.FunctionType(lir.types.VoidType(), []) + fn = builder.module.declare_intrinsic("llvm.trap", (), fnty) + builder.call(fn, []) + builder.unreachable() + return + self.orig_return_user_exc(builder, *args, **kwargs) + + setattr(cctyp, "return_user_exc", pyomp_return_user_exc) + # print("after", id(pyomp_return_user_exc), id(cctyp.return_user_exc)) + + setattr( + cctyp, "orig_return_status_propagate", cctyp.return_status_propagate + ) + + def pyomp_return_status_propagate(self, builder, *args, **kwargs): + if in_openmp_region(builder): + return + self.orig_return_status_propagate(builder, *args, **kwargs) + + setattr(cctyp, "return_status_propagate", pyomp_return_status_propagate) + + cemtyp = type(context.error_model) + # print("start update_context id(context)", id(context), "id(const.error_model)", id(context.error_model), "cemtyp", cemtyp, "id(cemtyp)", id(cemtyp)) + + if ( + not hasattr(cemtyp, "pyomp_patch_installed") + or cemtyp.pyomp_patch_installed == False + ): + cemtyp.pyomp_patch_installed = True + # print("update_context", "id(cemtyp.return_user_exec)", id(cemtyp.fp_zero_division), "id(context)", id(context)) + setattr(cemtyp, "orig_fp_zero_division", cemtyp.fp_zero_division) + + def pyomp_fp_zero_division(self, builder, *args, **kwargs): + # print("pyomp_fp_zero_division") + if in_openmp_region(builder): + return False + return self.orig_fp_zero_division(builder, *args, **kwargs) + + setattr(cemtyp, "fp_zero_division", pyomp_fp_zero_division) + # print("after", id(pyomp_fp_zero_division), id(cemtyp.fp_zero_division)) + + pyapi = context.get_python_api(builder) + ptyp = type(pyapi) + + if ( + not hasattr(ptyp, "pyomp_patch_installed") + or ptyp.pyomp_patch_installed == False + ): + ptyp.pyomp_patch_installed = True + # print("update_context", "id(ptyp.emit_environment_sentry)", id(ptyp.emit_environment_sentry), "id(context)", id(context)) + setattr(ptyp, "orig_emit_environment_sentry", ptyp.emit_environment_sentry) + + def pyomp_emit_environment_sentry(self, *args, **kwargs): + builder = self.builder + # print("pyomp_emit_environment_sentry") + if in_openmp_region(builder): + return False + return self.orig_emit_environment_sentry(*args, **kwargs) + + setattr(ptyp, "emit_environment_sentry", pyomp_emit_environment_sentry) + # print("after", id(pyomp_emit_environment_sentry), id(ptyp.emit_environment_sentry)) + + def fix_dispatchers(self, typemap, typingctx, cuda_target): + fixup_dict = {} + for k, v in typemap.items(): + if isinstance(v, Dispatcher) and not isinstance( + v, numba_cuda.types.CUDADispatcher + ): + # targetoptions = v.targetoptions.copy() + # targetoptions['device'] = True + # targetoptions['debug'] = targetoptions.get('debug', False) + # targetoptions['opt'] = targetoptions.get('opt', True) + vdispatcher = v.dispatcher + vdispatcher.targetoptions.pop("nopython", None) + vdispatcher.targetoptions.pop("boundscheck", None) + disp = typingctx.resolve_value_type(vdispatcher) + fixup_dict[k] = disp + for sig in vdispatcher.overloads.keys(): + disp.dispatcher.compile_device(sig, cuda_target=cuda_target) + + for k, v in fixup_dict.items(): + del typemap[k] + typemap[k] = v + + def lower(self, lowerer): + typingctx = lowerer.context.typing_context + targetctx = lowerer.context + typemap = lowerer.fndesc.typemap + calltypes = lowerer.fndesc.calltypes + context = lowerer.context + builder = lowerer.builder + mod = builder.module + library = lowerer.library + library.openmp = True + self.block = builder.block + self.builder = 
builder + self.lowerer = lowerer + self.update_context(context, builder) + if DEBUG_OPENMP >= 1: + print( + "region ids lower:block", + id(self), + self, + id(self.block), + self.block, + type(self.block), + self.tags, + len(self.tags), + "builder_id:", + id(self.builder), + "block_id:", + id(self.block), + ) + for k, v in lowerer.func_ir.blocks.items(): + print("block post copy:", k, id(v), id(v.body)) + + # Convert implicit tags to explicit form now that we have typing info. + for i in range(len(self.tags)): + cur_tag = self.tags[i] + if cur_tag.name == "QUAL.OMP.TARGET.IMPLICIT": + if isinstance( + typemap_lookup(typemap, cur_tag.arg), types.npytypes.Array + ): + cur_tag.name = "QUAL.OMP.MAP.TOFROM" + else: + cur_tag.name = "QUAL.OMP.FIRSTPRIVATE" + + if DEBUG_OPENMP >= 1: + for otag in self.tags: + print("otag:", otag, type(otag.arg)) + + # Remove LLVM vars that might have been added if this is an OpenMP + # region inside a target region. + count_alloca_instr = len( + list( + filter( + lambda x: isinstance(x.arg, lir.instructions.AllocaInstr), self.tags + ) + ) + ) + assert count_alloca_instr == 0 + # self.tags = list(filter(lambda x: not isinstance(x.arg, lir.instructions.AllocaInstr), self.tags)) + if DEBUG_OPENMP >= 1: + print("after LLVM tag filter", self.tags, len(self.tags)) + for otag in self.tags: + print("otag:", otag, type(otag.arg)) + + host_side_target_tags = [] + target_num = self.has_target() + + def add_struct_tags(self, var_table): + extras_before = [] + struct_tags = [] + for i in range(len(self.tags)): + cur_tag = self.tags[i] + if cur_tag.name in [ + "QUAL.OMP.MAP.TOFROM", + "QUAL.OMP.MAP.TO", + "QUAL.OMP.MAP.FROM", + "QUAL.OMP.MAP.ALLOC", + ]: + cur_tag_var = cur_tag.arg + if isinstance(cur_tag_var, NameSlice): + cur_tag_var = cur_tag_var.name + assert isinstance(cur_tag_var, str) + cur_tag_typ = typemap_lookup(typemap, cur_tag_var) + if isinstance(cur_tag_typ, types.npytypes.Array): + cur_tag_ndim = cur_tag_typ.ndim + stride_typ = lowerer.context.get_value_type( + types.intp + ) # lir.Type.int(64) + stride_abi_size = context.get_abi_sizeof(stride_typ) + array_var = var_table[cur_tag_var] + if DEBUG_OPENMP >= 1: + print( + "Found array mapped:", + cur_tag.name, + cur_tag.arg, + cur_tag_typ, + type(cur_tag_typ), + stride_typ, + type(stride_typ), + stride_abi_size, + array_var, + type(array_var), + ) + uniqueness = get_unique() + if isinstance(cur_tag.arg, NameSlice): + the_slice = cur_tag.arg.the_slice[0][0] + assert the_slice.step is None + if isinstance(the_slice.start, int): + start_index_var = ir.Var( + None, + f"{cur_tag_var}_start_index_var{target_num}{uniqueness}", + array_var.loc, + ) + start_assign = ir.Assign( + ir.Const(the_slice.start, array_var.loc), + start_index_var, + array_var.loc, + ) + + typemap[start_index_var.name] = types.int64 + lowerer.lower_inst(start_assign) + extras_before.append(start_assign) + lowerer._alloca_var( + start_index_var.name, typemap[start_index_var.name] + ) + lowerer.loadvar(start_index_var.name) + else: + start_index_var = the_slice.start + assert isinstance(start_index_var, str) + start_index_var = ir.Var( + None, start_index_var, array_var.loc + ) + if isinstance(the_slice.stop, int): + end_index_var = ir.Var( + None, + f"{cur_tag_var}_end_index_var{target_num}{uniqueness}", + array_var.loc, + ) + end_assign = ir.Assign( + ir.Const(the_slice.stop, array_var.loc), + end_index_var, + array_var.loc, + ) + typemap[end_index_var.name] = types.int64 + lowerer.lower_inst(end_assign) + extras_before.append(end_assign) + 
lowerer._alloca_var( + end_index_var.name, typemap[end_index_var.name] + ) + lowerer.loadvar(end_index_var.name) + else: + end_index_var = the_slice.stop + assert isinstance(end_index_var, str) + end_index_var = ir.Var( + None, end_index_var, array_var.loc + ) + + num_elements_var = ir.Var( + None, + f"{cur_tag_var}_num_elements_var{target_num}{uniqueness}", + array_var.loc, + ) + size_binop = ir.Expr.binop( + operator.sub, + end_index_var, + start_index_var, + array_var.loc, + ) + size_assign = ir.Assign( + size_binop, num_elements_var, array_var.loc + ) + calltypes[size_binop] = typing.signature( + types.int64, types.int64, types.int64 + ) + else: + start_index_var = 0 + num_elements_var = ir.Var( + None, + f"{cur_tag_var}_num_elements_var{target_num}{uniqueness}", + array_var.loc, + ) + size_getattr = ir.Expr.getattr( + array_var, "size", array_var.loc + ) + size_assign = ir.Assign( + size_getattr, num_elements_var, array_var.loc + ) + + typemap[num_elements_var.name] = types.int64 + lowerer.lower_inst(size_assign) + extras_before.append(size_assign) + lowerer._alloca_var( + num_elements_var.name, typemap[num_elements_var.name] + ) + + # see core/datamodel/models.py + lowerer.loadvar(num_elements_var.name) # alloca the var + + # see core/datamodel/models.py + if isinstance(start_index_var, ir.Var): + lowerer.loadvar(start_index_var.name) # alloca the var + if isinstance(num_elements_var, ir.Var): + lowerer.loadvar(num_elements_var.name) # alloca the var + struct_tags.append( + openmp_tag( + cur_tag.name + ".STRUCT", + cur_tag_var + "*data", + non_arg=True, + omp_slice=(start_index_var, num_elements_var), + ) + ) + struct_tags.append( + openmp_tag( + "QUAL.OMP.MAP.TO.STRUCT", + cur_tag_var + "*shape", + non_arg=True, + omp_slice=(0, 1), + ) + ) + struct_tags.append( + openmp_tag( + "QUAL.OMP.MAP.TO.STRUCT", + cur_tag_var + "*strides", + non_arg=True, + omp_slice=(0, 1), + ) + ) + # Peel off NameSlice, it served its purpose and is not + # needed by the rest of compilation. + if isinstance(cur_tag.arg, NameSlice): + cur_tag.arg = cur_tag.arg.name + + return struct_tags, extras_before + + if self.tags[0].name in [ + "DIR.OMP.TARGET.DATA", + "DIR.OMP.TARGET.ENTER.DATA", + "DIR.OMP.TARGET.EXIT.DATA", + "DIR.OMP.TARGET.UPDATE", + ]: + var_table = get_name_var_table(lowerer.func_ir.blocks) + struct_tags, extras_before = add_struct_tags(self, var_table) + self.tags.extend(struct_tags) + for extra in extras_before: + lowerer.lower_inst(extra) + + elif target_num is not None and self.target_copy != True: + var_table = get_name_var_table(lowerer.func_ir.blocks) + + ompx_attrs = list( + filter(lambda x: x.name == "QUAL.OMP.OMPX_ATTRIBUTE", self.tags) + ) + self.tags = list( + filter(lambda x: x.name != "QUAL.OMP.OMPX_ATTRIBUTE", self.tags) + ) + selected_device = 0 + device_tags = get_tags_of_type(self.tags, "QUAL.OMP.DEVICE") + if len(device_tags) > 0: + device_tag = device_tags[-1] + if isinstance(device_tag.arg, int): + selected_device = device_tag.arg + else: + assert False + if DEBUG_OPENMP >= 1: + print("new selected device:", selected_device) + + struct_tags, extras_before = add_struct_tags(self, var_table) + self.tags.extend(struct_tags) + if DEBUG_OPENMP >= 1: + for otag in self.tags: + print("tag in target:", otag, type(otag.arg)) + + from numba.core.compiler import Compiler, Flags + + if DEBUG_OPENMP >= 1: + print("openmp start region lower has target", type(lowerer.func_ir)) + # Make a copy of the host IR being lowered. 
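+            # The copy is what gets outlined and compiled for the device below;
+            # working on a copy keeps the host IR, which this lowerer is still
+            # in the middle of lowering, untouched.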
+ dprint_func_ir(lowerer.func_ir, "original func_ir") + func_ir = copy_ir(lowerer.func_ir, calltypes) + dprint_func_ir(func_ir, "copied func_ir") + if DEBUG_OPENMP >= 1: + for k, v in lowerer.func_ir.blocks.items(): + print( + "region ids block post copy:", + k, + id(v), + id(func_ir.blocks[k]), + id(v.body), + id(func_ir.blocks[k].body), + ) + + remove_dels(func_ir.blocks) + + dprint_func_ir(func_ir, "func_ir after remove_dels") + + def fixup_openmp_pairs(blocks): + """The Numba IR nodes for the start and end of an OpenMP region + contain references to each other. When a target region is + outlined that contains these pairs of IR nodes then if we + simply shallow copy them then they'll point to their original + matching pair in the original IR. In this function, we go + through and find what should be matching pairs in the + outlined (target) IR and make those copies point to each + other. + """ + start_dict = {} + end_dict = {} + + # Go through the blocks in the original IR and create a mapping + # between the id of the start nodes with their block label and + # position in the block. Likewise, do the same for end nodes. + for label, block in func_ir.blocks.items(): + for bindex, bstmt in enumerate(block.body): + if isinstance(bstmt, openmp_region_start): + if DEBUG_OPENMP >= 2: + print("region ids found region start", id(bstmt)) + start_dict[id(bstmt)] = (label, bindex) + elif isinstance(bstmt, openmp_region_end): + if DEBUG_OPENMP >= 2: + print( + "region ids found region end", + id(bstmt.start_region), + id(bstmt), + ) + end_dict[id(bstmt.start_region)] = (label, bindex) + assert len(start_dict) == len(end_dict) + + # For each start node that we found above, create a copy in the target IR + # and fixup the references of the copies to point at each other. + for start_id, blockindex in start_dict.items(): + start_block, sbindex = blockindex + + end_block_index = end_dict[start_id] + end_block, ebindex = end_block_index + + if DEBUG_OPENMP >= 2: + start_pre_copy = blocks[start_block].body[sbindex] + end_pre_copy = blocks[end_block].body[ebindex] + + # Create copy of the OpenMP start and end nodes in the target outlined IR. + blocks[start_block].body[sbindex] = copy.copy( + blocks[start_block].body[sbindex] + ) + blocks[end_block].body[ebindex] = copy.copy( + blocks[end_block].body[ebindex] + ) + # Reset some fields in the start OpenMP region because the target IR + # has not been lowered yet. + start_region = blocks[start_block].body[sbindex] + start_region.builder = None + start_region.block = None + start_region.lowerer = None + start_region.target_copy = True + start_region.tags = copy.deepcopy(start_region.tags) + # Remove unnecessary num_teams, thread_limit tags when + # emitting a target directive within a kernel to avoid + # extraneous arguments in the kernel function. + if start_region.has_target() == target_num: + start_region.tags.append(openmp_tag("OMP.DEVICE")) + end_region = blocks[end_block].body[ebindex] + # assert(start_region.omp_region_var is None) + assert len(start_region.alloca_queue) == 0 + # Make start and end copies point at each other. 
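+                    # Illustrative example (block labels made up): if the original
+                    # IR paired a start node at (block 10, index 0) with an end node
+                    # at (block 24, index 3), the shallow copies created above at
+                    # those same coordinates are re-linked here so the target IR no
+                    # longer points back at the host-side pair.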
+ end_region.start_region = start_region + start_region.end_region = end_region + if DEBUG_OPENMP >= 2: + print( + f"region ids fixup start: {id(start_pre_copy)}->{id(start_region)} end: {id(end_pre_copy)}->{id(end_region)}" + ) + + fixup_openmp_pairs(func_ir.blocks) + state = compiler.StateDict() + fndesc = lowerer.fndesc + state.typemap = fndesc.typemap + state.calltypes = fndesc.calltypes + state.argtypes = fndesc.argtypes + state.return_type = fndesc.restype + if DEBUG_OPENMP >= 1: + print("context:", context, type(context)) + print("targetctx:", targetctx, type(targetctx)) + print("state:", state, dir(state)) + print("fndesc:", fndesc, type(fndesc)) + print("func_ir type:", type(func_ir)) + dprint_func_ir(func_ir, "target func_ir") + internal_codegen = targetctx._internal_codegen + + # Find the start and end IR blocks for this offloaded region. + start_block, end_block = find_target_start_end(func_ir, target_num) + end_target_node = func_ir.blocks[end_block].body[0] + + if DEBUG_OPENMP >= 1: + print("start_block:", start_block) + print("end_block:", end_block) + + blocks_in_region = get_blocks_between_start_end( + func_ir.blocks, start_block, end_block + ) + if DEBUG_OPENMP >= 1: + print("lower blocks_in_region:", blocks_in_region) + + # Find the variables that cross the boundary between the target + # region and the non-target host-side code. + ins, outs = transforms.find_region_inout_vars( + blocks=func_ir.blocks, + livemap=func_ir.variable_lifetime.livemap, + callfrom=start_block, + returnto=end_block, + body_block_ids=blocks_in_region, + ) + + def add_mapped_to_ins(ins, tags): + for tag in tags: + if tag.arg in ins: + continue + + if tag.name in ["QUAL.OMP.FIRSTPRIVATE", "QUAL.OMP.MAP.FROM"]: + ins.append(tag.arg) + + add_mapped_to_ins(ins, self.tags) + + normalized_ivs = get_tags_of_type(self.tags, "QUAL.OMP.NORMALIZED.IV") + if DEBUG_OPENMP >= 1: + print("ivs ins", normalized_ivs, ins, outs) + for niv in normalized_ivs: + if DEBUG_OPENMP >= 1: + print("Removing normalized iv from ins", niv.arg) + if niv.arg in ins: + ins.remove(niv.arg) + # Get the types of the variables live-in to the target region. + target_args_unordered = ins + list(set(outs) - set(ins)) + if DEBUG_OPENMP >= 1: + print("ins:", ins, type(ins)) + print("outs:", outs, type(outs)) + print("args:", state.args) + print("rettype:", state.return_type, type(state.return_type)) + print("target_args_unordered:", target_args_unordered) + # Re-use Numba loop lifting code to extract the target region as + # its own function. 
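+            # _loop_lift_prepare_loop_func below rewrites the selected blocks into
+            # function form: live-in variables become arguments and the region ends
+            # in a plain return, which is the shape needed to derive a standalone
+            # device function from these blocks.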
+ region_info = transforms._loop_lift_info( + loop=None, + inputs=ins, + # outputs=outs, + outputs=(), + callfrom=start_block, + returnto=end_block, + ) + + region_blocks = dict((k, func_ir.blocks[k]) for k in blocks_in_region) + + if DEBUG_OPENMP >= 1: + print("region_info:", region_info) + transforms._loop_lift_prepare_loop_func(region_info, region_blocks) + # exit_block_label = max(region_blocks.keys()) + # region_blocks[exit_block_label].body = [] + # exit_scope = region_blocks[exit_block_label].scope + # tmp = exit_scope.make_temp(loc=func_ir.loc) + # region_blocks[exit_block_label].append(ir.Assign(value=ir.Const(0, func_ir.loc), target=tmp, loc=func_ir.loc)) + # region_blocks[exit_block_label].append(ir.Return(value=tmp, loc=func_ir.loc)) + + target_args = [] + outline_arg_typs = [] + # outline_arg_typs = [None] * len(target_args_unordered) + for tag in self.tags: + if DEBUG_OPENMP >= 1: + print(1, "target_arg?", tag, tag.non_arg, is_target_arg(tag.name)) + if ( + tag.arg in target_args_unordered + and not tag.non_arg + and is_target_arg(tag.name) + ): + target_args.append(tag.arg) + # target_arg_index = target_args.index(tag.arg) + atyp = get_dotted_type(tag.arg, typemap, lowerer) + if is_pointer_target_arg(tag.name, atyp): + # outline_arg_typs[target_arg_index] = types.CPointer(atyp) + outline_arg_typs.append(types.CPointer(atyp)) + if DEBUG_OPENMP >= 1: + print(1, "found cpointer target_arg", tag, atyp, id(atyp)) + else: + # outline_arg_typs[target_arg_index] = atyp + outline_arg_typs.append(atyp) + if DEBUG_OPENMP >= 1: + print(1, "found target_arg", tag, atyp, id(atyp)) + + if DEBUG_OPENMP >= 1: + print("target_args:", target_args) + print("target_args_unordered:", target_args_unordered) + print("outline_arg_typs:", outline_arg_typs) + print("extras_before:", extras_before, start_block) + for eb in extras_before: + print(eb) + + assert len(target_args) == len(target_args_unordered) + assert len(target_args) == len(outline_arg_typs) + + # Create the outlined IR from the blocks in the region, making the + # variables crossing into the regions argument. + outlined_ir = func_ir.derive( + blocks=region_blocks, + arg_names=tuple(target_args), + arg_count=len(target_args), + force_non_generator=True, + ) + outlined_ir.blocks[start_block].body = ( + extras_before + outlined_ir.blocks[start_block].body + ) + for stmt in outlined_ir.blocks[min(outlined_ir.blocks.keys())].body: + if isinstance(stmt, ir.Assign) and isinstance(stmt.value, ir.Arg): + stmt.value.index = target_args.index(stmt.value.name) + + def prepend_device_to_func_name(outlined_ir): + # Change the name of the outlined function to prepend the + # word "device" to the function name. + fparts = outlined_ir.func_id.func_qualname.split(".") + fparts[-1] = "device" + str(target_num) + fparts[-1] + outlined_ir.func_id.func_qualname = ".".join(fparts) + outlined_ir.func_id.func_name = fparts[-1] + uid = next(bytecode.FunctionIdentity._unique_ids) + outlined_ir.func_id.unique_name = "{}${}".format( + outlined_ir.func_id.func_qualname, uid + ) + + prepend_device_to_func_name(outlined_ir) + device_func_name = outlined_ir.func_id.func_qualname + if DEBUG_OPENMP >= 1: + print( + "outlined_ir:", + type(outlined_ir), + type(outlined_ir.func_id), + outlined_ir.arg_names, + device_func_name, + ) + dprint_func_ir(outlined_ir, "outlined_ir") + + # Create a copy of the state and the typemap inside of it so that changes + # for compiling the outlined IR don't effect the original compilation state + # of the host. 
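+            # Only the typemap gets its own copy; the device-specific entries added
+            # below (e.g. CPointer types for the outlined arguments) must not leak
+            # back into the host typemap.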
+ state_copy = copy.copy(state) + state_copy.typemap = copy.copy(typemap) + + entry_block_num = min(outlined_ir.blocks.keys()) + entry_block = outlined_ir.blocks[entry_block_num] + if DEBUG_OPENMP >= 1: + print("entry_block:", entry_block) + for x in entry_block.body: + print(x) + rev_arg_assigns = [] + # Add entries in the copied typemap for the arguments to the outlined IR. + for idx, zipvar in enumerate(zip(target_args, outline_arg_typs)): + var_in, vtyp = zipvar + arg_name = "arg." + var_in + state_copy.typemap.pop(arg_name, None) + state_copy.typemap[arg_name] = vtyp + + last_block = outlined_ir.blocks[end_block] + last_block.body = ( + [end_target_node] + + last_block.body[:-1] + + rev_arg_assigns + + last_block.body[-1:] + ) + + assert isinstance(last_block.body[-1], ir.Return) + # Add typemap entry for the empty tuple return type. + state_copy.typemap[last_block.body[-1].value.name] = types.none + # end test + + if DEBUG_OPENMP >= 1: + print("selected_device:", selected_device) + + if selected_device == 1: + flags = Flags() + flags.enable_ssa = False + device_lowerer_pipeline = OnlyLower + + subtarget = OpenmpCPUTargetContext( + device_func_name, targetctx.typing_context + ) + # Copy everything (like registries) from cpu context into the new OpenMPCPUTargetContext subtarget + # except call_conv which is the whole point of that class so that the minimal call convention is used. + subtarget.__dict__.update( + { + k: targetctx.__dict__[k] + for k in targetctx.__dict__.keys() - {"call_conv"} + } + ) + # subtarget.install_registry(imputils.builtin_registry) + # Turn off the Numba runtime (incref and decref mostly) for the target compilation. + subtarget.enable_nrt = False + typingctx_outlined = targetctx.typing_context + + import numba.core.codegen as codegen + + subtarget._internal_codegen = codegen.AOTCPUCodegen( + mod.name + f"$device{selected_device}" + ) + subtarget._internal_codegen._library_class = CustomAOTCPUCodeLibrary + subtarget._internal_codegen._engine.set_object_cache(None, None) + device_target = subtarget + elif selected_device == 0: + from numba.core import target_extension + + orig_target = getattr( + target_extension._active_context, + "target", + target_extension._active_context_default, + ) + target_extension._active_context.target = "cuda" + + flags = cuda_compiler.CUDAFlags() + + typingctx_outlined = cuda_descriptor.cuda_target.typing_context + device_target = OpenmpCUDATargetContext( + device_func_name, typingctx_outlined + ) + device_target.fndesc = fndesc + # device_target = cuda_descriptor.cuda_target.target_context + + device_lowerer_pipeline = OnlyLowerCUDA + openmp_cuda_target = numba_cuda.descriptor.CUDATarget("openmp_cuda") + openmp_cuda_target._typingctx = typingctx_outlined + openmp_cuda_target._targetctx = device_target + self.fix_dispatchers( + state_copy.typemap, typingctx_outlined, openmp_cuda_target + ) + + typingctx_outlined.refresh() + device_target.refresh() + dprint_func_ir(outlined_ir, "outlined_ir before replace np.empty") + replace_np_empty_with_cuda_shared( + outlined_ir, + state_copy.typemap, + calltypes, + device_func_name, + typingctx_outlined, + ) + dprint_func_ir(outlined_ir, "outlined_ir after replace np.empty") + else: + raise NotImplementedError("Unsupported OpenMP device number") + + device_target.state_copy = state_copy + # Do not compile (generate native code), just lower (to LLVM) + flags.no_compile = True + flags.no_cpython_wrapper = True + flags.no_cfunc_wrapper = True + # What to do here? 
+ flags.forceinline = True + # Propagate fastmath flag on the outer function to the inner outlined compile. + # TODO: find a good way to handle fastmath. Clang has + # fp-contractions on by default for GPU code. + # flags.fastmath = True#state_copy.flags.fastmath + flags.release_gil = True + flags.nogil = True + flags.inline = "always" + # Create a pipeline that only lowers the outlined target code. No need to + # compile because it has already gone through those passes. + if DEBUG_OPENMP >= 1: + print( + "outlined_ir:", + outlined_ir, + type(outlined_ir), + outlined_ir.arg_names, + ) + dprint_func_ir(outlined_ir, "outlined_ir") + dprint_func_ir(func_ir, "target after outline func_ir") + dprint_func_ir(lowerer.func_ir, "original func_ir") + print("state_copy.typemap:", state_copy.typemap) + print("region ids before compile_ir") + print( + "===================================================================================" + ) + print( + "===================================================================================" + ) + print( + "===================================================================================" + ) + print( + "===================================================================================" + ) + print( + "===================================================================================" + ) + print( + "===================================================================================" + ) + print( + "===================================================================================" + ) + + cres = compiler.compile_ir( + typingctx_outlined, + device_target, + outlined_ir, + outline_arg_typs, + types.none, + flags, + {}, + pipeline_class=device_lowerer_pipeline, + is_lifted_loop=False, + ) # tried this as True since code derived from loop lifting code but it goes through the pipeline twice and messes things up + + if DEBUG_OPENMP >= 2: + print("cres:", type(cres)) + print("fndesc:", cres.fndesc, cres.fndesc.mangled_name) + print("metadata:", cres.metadata) + cres_library = cres.library + if DEBUG_OPENMP >= 2: + print("cres_library:", type(cres_library)) + sys.stdout.flush() + cres_library._ensure_finalized() + if DEBUG_OPENMP >= 2: + print("ensure_finalized:") + sys.stdout.flush() + + if DEBUG_OPENMP >= 1: + print("region ids compile_ir") + print( + "===================================================================================" + ) + print( + "===================================================================================" + ) + print( + "===================================================================================" + ) + print( + "===================================================================================" + ) + print( + "===================================================================================" + ) + print( + "===================================================================================" + ) + print( + "===================================================================================" + ) + + for k, v in lowerer.func_ir.blocks.items(): + print( + "block post copy:", + k, + id(v), + id(func_ir.blocks[k]), + id(v.body), + id(func_ir.blocks[k].body), + ) + + shared_ext = ".so" + if sys.platform.startswith("win"): + shared_ext = ".dll" + + # TODO: move device pipelines in numba proper. 
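+            # Two offload paths follow: device 1 compiles the outlined IR for the
+            # host CPU and links it with "ld -shared" into a shared library for the
+            # libomptarget host plugin, while device 0 goes through the CUDA path
+            # (link against libdevice and the libomptarget NVPTX bitcode, then
+            # generate PTX and a cubin).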
+ if selected_device == 1: + if DEBUG_OPENMP >= 1: + with open(cres_library.name + ".ll", "w") as f: + f.write(cres_library.get_llvm_str()) + + fd_o, filename_o = tempfile.mkstemp(".o") + fd_so, filename_so = tempfile.mkstemp(shared_ext) + + target_elf = cres_library.emit_native_object() + with open(filename_o, "wb") as f: + f.write(target_elf) + + # Create shared library as required by the libomptarget host + # plugin. + + subprocess.run( + [ + "ld", + "-shared", + filename_o, + # Do whole archive to include all symbols, esp. for the + # PyOMP_NRT_Init constructor. + "--whole-archive", + libpath / "libbundle.a", + "--no-whole-archive", + "-o", + filename_so, + ], + check=True, + ) + + with open(filename_so, "rb") as f: + target_elf = f.read() + if DEBUG_OPENMP >= 1: + print("filename_o", filename_o, "filename_so", filename_so) + + os.close(fd_o) + os.remove(filename_o) + os.close(fd_so) + os.remove(filename_so) + + if DEBUG_OPENMP >= 1: + print("target_elf:", type(target_elf), len(target_elf)) + sys.stdout.flush() + elif selected_device == 0: + import numba.cuda.api as cudaapi + import numba.cuda.cudadrv.libs as cudalibs + from numba.cuda.cudadrv import driver + from numba.core.llvm_bindings import create_pass_manager_builder + from numba.cuda.codegen import CUDA_TRIPLE + + class OpenMPCUDACodegen: + def __init__(self): + self.cc = cudaapi.get_current_device().compute_capability + self.sm = "sm_" + str(self.cc[0]) + str(self.cc[1]) + self.libdevice_path = cudalibs.get_libdevice() + with open(self.libdevice_path, "rb") as f: + self.libs_mod = ll.parse_bitcode(f.read()) + self.libomptarget_arch = ( + llvm_libpath + "/libomptarget-new-nvptx-" + self.sm + ".bc" + ) + with open(self.libomptarget_arch, "rb") as f: + libomptarget_mod = ll.parse_bitcode(f.read()) + # Link in device, openmp libraries. + self.libs_mod.link_in(libomptarget_mod) + # Initialize asm printers to codegen ptx. + ll.initialize_all_targets() + ll.initialize_all_asmprinters() + target = ll.Target.from_triple(CUDA_TRIPLE) + self.tm = target.create_target_machine(cpu=self.sm, opt=3) + + def _get_target_image_in_memory(self, mod, filename_prefix): + if DEBUG_OPENMP_LLVM_PASS >= 1: + with open(filename_prefix + ".ll", "w") as f: + f.write(str(mod)) + + # Lower openmp intrinsics. + mod = run_intrinsics_openmp_pass(mod) + with ll.create_module_pass_manager() as pm: + pm.add_cfg_simplification_pass() + pm.run(mod) + + if DEBUG_OPENMP_LLVM_PASS >= 1: + with open(filename_prefix + "-intrinsics_omp.ll", "w") as f: + f.write(str(mod)) + + mod.link_in(self.libs_mod, preserve=True) + # Internalize non-kernel function definitions. + for func in mod.functions: + if func.is_declaration: + continue + if func.linkage != ll.Linkage.external: + continue + if "__omp_offload_numba" in func.name: + continue + func.linkage = "internal" + + with ll.create_module_pass_manager() as pm: + self.tm.add_analysis_passes(pm) + pm.add_global_dce_pass() + pm.run(mod) + + if DEBUG_OPENMP_LLVM_PASS >= 1: + with open( + filename_prefix + "-intrinsics_omp-linked.ll", "w" + ) as f: + f.write(str(mod)) + + # Run passes for optimization, including target-specific passes. + # Run function passes. + with ll.create_function_pass_manager(mod) as pm: + self.tm.add_analysis_passes(pm) + with create_pass_manager_builder( + opt=3, slp_vectorize=True, loop_vectorize=True + ) as pmb: + # TODO: upstream adjust_pass_manager to llvmlite? 
+ # self.tm.adjust_pass_manager(pmb) + pmb.populate(pm) + for func in mod.functions: + pm.initialize() + pm.run(func) + pm.finalize() + + # Run module passes. + with ll.create_module_pass_manager() as pm: + self.tm.add_analysis_passes(pm) + with create_pass_manager_builder( + opt=3, slp_vectorize=True, loop_vectorize=True + ) as pmb: + # TODO: upstream adjust_pass_manager to llvmlite? + # self.tm.adjust_pass_manager(pmb) + pmb.populate(pm) + pm.run(mod) + + if DEBUG_OPENMP_LLVM_PASS >= 1: + mod.verify() + with open( + filename_prefix + "-intrinsics_omp-linked-opt.ll", "w" + ) as f: + f.write(str(mod)) + + # Generate ptx assemlby. + ptx = self.tm.emit_assembly(mod) + + if DEBUG_OPENMP_LLVM_PASS >= 1: + with open( + filename_prefix + "-intrinsics_omp-linked-opt.s", "w" + ) as f: + f.write(ptx) + + linker_kwargs = {} + for x in ompx_attrs: + linker_kwargs[x.arg[0]] = ( + tuple(x.arg[1]) if len(x.arg[1]) > 1 else x.arg[1][0] + ) + # NOTE: DO NOT set cc, since the linker will always + # compile for the existing GPU context and it is + # incompatible with the launch_bounds ompx_attribute. + linker = driver.Linker.new(**linker_kwargs) + linker.add_ptx(ptx.encode()) + cubin = linker.complete() + + if DEBUG_OPENMP_LLVM_PASS >= 1: + with open( + filename_prefix + "-intrinsics_omp-linked-opt.o", "wb" + ) as f: + f.write(cubin) + + return cubin + + def _get_target_image_toolchain(self, mod, filename_prefix): + with open(filename_prefix + ".ll", "w") as f: + f.write(str(mod)) + + # Lower openmp intrinsics. + mod = run_intrinsics_openmp_pass(mod) + with ll.create_module_pass_manager() as pm: + pm.add_cfg_simplification_pass() + pm.run(mod) + + with open(filename_prefix + "-intrinsics_omp.ll", "w") as f: + f.write(str(mod)) + + if DEBUG_OPENMP >= 1: + print("libomptarget_arch", self.libomptarget_arch) + subprocess.run( + [ + llvm_binpath + "/llvm-link", + "--suppress-warnings", + "--internalize", + "-S", + filename_prefix + "-intrinsics_omp.ll", + self.libomptarget_arch, + self.libdevice_path, + "-o", + filename_prefix + "-intrinsics_omp-linked.ll", + ], + check=True, + ) + subprocess.run( + [ + llvm_binpath + "/opt", + "-S", + "-O3", + filename_prefix + "-intrinsics_omp-linked.ll", + "-o", + filename_prefix + "-intrinsics_omp-linked-opt.ll", + ], + check=True, + ) + + subprocess.run( + [ + llvm_binpath + "/llc", + "-O3", + "-march=nvptx64", + f"-mcpu={self.sm}", + f"-mattr=+ptx64,+{self.sm}", + filename_prefix + "-intrinsics_omp-linked-opt.ll", + "-o", + filename_prefix + "-intrinsics_omp-linked-opt.s", + ], + check=True, + ) + + subprocess.run( + [ + "ptxas", + "-m64", + "--gpu-name", + self.sm, + filename_prefix + "-intrinsics_omp-linked-opt.s", + "-o", + filename_prefix + "-intrinsics_omp-linked-opt.o", + ], + check=True, + ) + with open( + filename_prefix + "-intrinsics_omp-linked-opt.o", "rb" + ) as f: + target_elf = f.read() + return target_elf + + def get_target_image(self, cres): + filename_prefix = cres_library.name + allmods = cres_library.modules + linked_mod = ll.parse_assembly(str(allmods[0])) + for mod in allmods[1:]: + linked_mod.link_in(ll.parse_assembly(str(mod))) + if OPENMP_DEVICE_TOOLCHAIN >= 1: + return self._get_target_image_toolchain( + linked_mod, filename_prefix + ) + else: + return self._get_target_image_in_memory( + linked_mod, filename_prefix + ) + + target_extension._active_context.target = orig_target + omp_cuda_cg = OpenMPCUDACodegen() + target_elf = omp_cuda_cg.get_target_image(cres) + else: + raise NotImplementedError("Unsupported OpenMP device number") + + # if cuda 
then run ptxas on the cres and pass that + + # bytes_array_typ = lir.ArrayType(cgutils.voidptr_t, len(target_elf)) + # bytes_array_typ = lir.ArrayType(cgutils.int8_t, len(target_elf)) + # dev_image = cgutils.add_global_variable(mod, bytes_array_typ, ".omp_offloading.device_image") + # dev_image.initializer = lir.Constant.array(cgutils.int8_t, target_elf) + # dev_image.initializer = lir.Constant.array(cgutils.int8_t, target_elf) + add_target_globals_in_numba = int( + os.environ.get("NUMBA_OPENMP_ADD_TARGET_GLOBALS", 0) + ) + if add_target_globals_in_numba != 0: + elftext = cgutils.make_bytearray(target_elf) + dev_image = targetctx.insert_unique_const( + mod, ".omp_offloading.device_image", elftext + ) + mangled_name = cgutils.make_bytearray( + cres.fndesc.mangled_name.encode("utf-8") + b"\x00" + ) + mangled_var = targetctx.insert_unique_const( + mod, ".omp_offloading.entry_name", mangled_name + ) + + llvmused_typ = lir.ArrayType(cgutils.voidptr_t, 2) + llvmused_gv = cgutils.add_global_variable( + mod, llvmused_typ, "llvm.used" + ) + llvmused_syms = [ + lir.Constant.bitcast(dev_image, cgutils.voidptr_t), + lir.Constant.bitcast(mangled_var, cgutils.voidptr_t), + ] + llvmused_gv.initializer = lir.Constant.array( + cgutils.voidptr_t, llvmused_syms + ) + llvmused_gv.linkage = "appending" + else: + host_side_target_tags.append( + openmp_tag( + "QUAL.OMP.TARGET.DEV_FUNC", + StringLiteral(cres.fndesc.mangled_name.encode("utf-8")), + ) + ) + host_side_target_tags.append( + openmp_tag("QUAL.OMP.TARGET.ELF", StringLiteral(target_elf)) + ) + + if DEBUG_OPENMP >= 1: + dprint_func_ir(func_ir, "target after outline compiled func_ir") + + llvm_token_t = TokenType() + fnty = lir.FunctionType(llvm_token_t, []) + tags_to_include = self.tags + host_side_target_tags + # tags_to_include = list(filter(lambda x: x.name != "DIR.OMP.TARGET", tags_to_include)) + self.filtered_tag_length = len(tags_to_include) + if DEBUG_OPENMP >= 1: + print("filtered_tag_length:", self.filtered_tag_length) + + if len(tags_to_include) > 0: + if DEBUG_OPENMP >= 1: + print("push_alloca_callbacks") + + push_alloca_callback(lowerer, openmp_region_alloca, self, builder) + tag_str = openmp_tag_list_to_str(tags_to_include, lowerer, True) + pre_fn = builder.module.declare_intrinsic( + "llvm.directive.region.entry", (), fnty + ) + assert self.omp_region_var is None + self.omp_region_var = builder.call(pre_fn, [], tail=False) + self.omp_region_var.__class__ = CallInstrWithOperandBundle + self.omp_region_var.set_tags(tag_str) + # This is used by the post-lowering pass over LLVM to add LLVM alloca + # vars to the Numba IR openmp node and then when the exit of the region + # is detected then the tags in the enter directive are updated. + self.omp_region_var.save_orig_numba_openmp = self + if DEBUG_OPENMP >= 2: + print("setting omp_region_var", self.omp_region_var._get_name()) + if self.acq_res: + builder.fence("acquire") + if self.acq_rel: + builder.fence("acq_rel") + + for otag in self.tags: # should be tags_to_include? 
+ otag.post_entry(lowerer) + + if DEBUG_OPENMP >= 1: + sys.stdout.flush() + + def __str__(self): + return ( + "openmp_region_start " + + ", ".join([str(x) for x in self.tags]) + + " target=" + + str(self.target_copy) + ) + + +class OnlyLower(compiler.CompilerBase): + def __init__(self, typingctx, targetctx, library, args, restype, flags, locals): + super().__init__(typingctx, targetctx, library, args, restype, flags, locals) + self.state.typemap = targetctx.state_copy.typemap + self.state.calltypes = targetctx.state_copy.calltypes + + def define_pipelines(self): + pms = [] + if not self.state.flags.force_pyobject: + pms.append( + compiler.DefaultPassBuilder.define_nopython_lowering_pipeline( + self.state + ) + ) + return pms + + +class OnlyLowerCUDA(numba_cuda.compiler.CUDACompiler): + def __init__(self, typingctx, targetctx, library, args, restype, flags, locals): + super().__init__(typingctx, targetctx, library, args, restype, flags, locals) + self.state.typemap = targetctx.state_copy.typemap + self.state.calltypes = targetctx.state_copy.calltypes + + def define_pipelines(self): + pm = compiler_machinery.PassManager("cuda") + pm.add_pass(numba_cuda.compiler.CUDALegalization, "CUDA legalization") + lowering_passes = self.define_cuda_lowering_pipeline(self.state) + pm.passes.extend(lowering_passes.passes) + pm.finalize() + return [pm] + + +class openmp_region_end(ir.Stmt): + def __init__(self, start_region, tags, loc): + if DEBUG_OPENMP >= 1: + print("region ids openmp_region_end::__init__", id(self), id(start_region)) + self.start_region = start_region + self.tags = tags + self.loc = loc + self.start_region.end_region = self + + def __new__(cls, *args, **kwargs): + instance = super(openmp_region_end, cls).__new__(cls) + # print("openmp_region_end::__new__", id(instance)) + return instance + + def list_vars(self): + return list_vars_from_tags(self.tags) + + def lower(self, lowerer): + typingctx = lowerer.context.typing_context + targetctx = lowerer.context + typemap = lowerer.fndesc.typemap + context = lowerer.context + builder = lowerer.builder + library = lowerer.library + + if DEBUG_OPENMP >= 2: + print("openmp_region_end::lower", id(self), id(self.start_region)) + sys.stdout.flush() + + if self.start_region.acq_res: + builder.fence("release") + + if DEBUG_OPENMP >= 1: + print("pop_alloca_callbacks") + + if DEBUG_OPENMP >= 2: + print("start_region tag length:", self.start_region.filtered_tag_length) + + if self.start_region.filtered_tag_length > 0: + llvm_token_t = TokenType() + fnty = lir.FunctionType(lir.VoidType(), [llvm_token_t]) + # The callback is only needed if llvm.directive.region.entry was added + # which only happens if tag length > 0. + pop_alloca_callback(lowerer, builder) + + # Process the accumulated allocas in the start region. 
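+            # The queue was filled by the alloca callback while the region body was
+            # lowered.  process_alloca_queue asserts it is empty here because the
+            # real work has moved to the post-lowering LLVM pass, which calls
+            # post_lowering_process_alloca_queue at the matching region exit.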
+ self.start_region.process_alloca_queue() + + assert self.start_region.omp_region_var != None + if DEBUG_OPENMP >= 2: + print( + "before adding exit", self.start_region.omp_region_var._get_name() + ) + + for fp in filter( + lambda x: x.name == "QUAL.OMP.FIRSTPRIVATE", self.start_region.tags + ): + new_del = ir.Del(fp.arg, self.loc) + lowerer.lower_inst(new_del) + + pre_fn = builder.module.declare_intrinsic( + "llvm.directive.region.exit", (), fnty + ) + or_end_call = builder.call( + pre_fn, [self.start_region.omp_region_var], tail=True + ) + or_end_call.__class__ = CallInstrWithOperandBundle + or_end_call.set_tags(openmp_tag_list_to_str(self.tags, lowerer, True)) + + if DEBUG_OPENMP >= 1: + print( + "OpenMP end lowering firstprivate_dead_after len:", + len(self.start_region.firstprivate_dead_after), + ) + + for fp in self.start_region.firstprivate_dead_after: + new_del = ir.Del(fp.arg, self.loc) + lowerer.lower_inst(new_del) + + def __str__(self): + return "openmp_region_end " + ", ".join([str(x) for x in self.tags]) + + def has_target(self): + for t in self.tags: + if is_target_tag(t.name): + return t.arg + return None + + +def compute_cfg_from_llvm_blocks(blocks): + cfg = CFGraph() + name_to_index = {} + for b in blocks: + # print("b:", b.name, type(b.name)) + cfg.add_node(b.name) + + for bindex, b in enumerate(blocks): + term = b.terminator + # print("term:", b.name, term, type(term)) + if isinstance(term, lir.instructions.Branch): + cfg.add_edge(b.name, term.operands[0].name) + name_to_index[b.name] = (bindex, [term.operands[0].name]) + elif isinstance(term, lir.instructions.ConditionalBranch): + cfg.add_edge(b.name, term.operands[1].name) + cfg.add_edge(b.name, term.operands[2].name) + name_to_index[b.name] = ( + bindex, + [term.operands[1].name, term.operands[2].name], + ) + elif isinstance(term, lir.instructions.Ret): + name_to_index[b.name] = (bindex, []) + elif isinstance(term, lir.instructions.SwitchInstr): + cfg.add_edge(b.name, term.default.name) + for _, blk in term.cases: + cfg.add_edge(b.name, blk.name) + out_blks = [x[1].name for x in term.cases] + out_blks.append(term.default.name) + name_to_index[b.name] = (bindex, out_blks) + elif isinstance(term, lir.instructions.Unreachable): + pass + else: + print("Unknown term:", term, type(term)) + assert False # Should never get here. + + cfg.set_entry_point("entry") + cfg.process() + return cfg, name_to_index + + +def compute_llvm_topo_order(blocks): + cfg, name_to_index = compute_cfg_from_llvm_blocks(blocks) + post_order = [] + seen = set() + + def _dfs_rec(node): + if node not in seen: + seen.add(node) + succs = cfg._succs[node] + + # If there are no successors then we are done. + # This is the case for an unreachable. + if not succs: + return + + # This is needed so that the inside of loops are + # handled first before their exits. 
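+            # The reversed post-order produced by this DFS is a reverse post-order
+            # of the CFG with back edges skipped, i.e. an approximate topological
+            # order of the LLVM blocks.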
+ nexts = name_to_index[node][1] + if len(nexts) == 2: + succs = [nexts[1], nexts[0]] + + for dest in succs: + if (node, dest) not in cfg._back_edges: + _dfs_rec(dest) + post_order.append(node) + + _dfs_rec(cfg.entry_point()) + post_order.reverse() + return post_order, name_to_index + + +class CollectUnknownLLVMVarsPrivate(lir.transforms.Visitor): + def __init__(self): + self.active_openmp_directives = [] + self.start_num = 0 + + # Override the default function visitor to go in topo order + def visit_Function(self, func): + self._function = func + if len(func.blocks) == 0: + return None + if DEBUG_OPENMP >= 1: + print("Collect visit_Function:", func.blocks, type(func.blocks)) + topo_order, name_to_index = compute_llvm_topo_order(func.blocks) + topo_order = list(topo_order) + if DEBUG_OPENMP >= 1: + print("topo_order:", topo_order) + + for bbname in topo_order: + if DEBUG_OPENMP >= 1: + print("Visiting block:", bbname) + self.visit_BasicBlock(func.blocks[name_to_index[bbname][0]]) + + if DEBUG_OPENMP >= 1: + print("Collect visit_Function done") + + def visit_Instruction(self, instr): + if len(self.active_openmp_directives) > 0: + if DEBUG_OPENMP >= 1: + print("Collect instr:", instr, type(instr)) + for op in instr.operands: + if isinstance(op, lir.AllocaInstr): + if DEBUG_OPENMP >= 1: + print("Collect AllocaInstr operand:", op, op.name) + for directive in self.active_openmp_directives: + directive.save_orig_numba_openmp.alloca(op, None) + else: + if DEBUG_OPENMP >= 2: + print("non-alloca:", op, type(op)) + pass + + if isinstance(instr, lir.CallInstr): + if instr.callee.name == "llvm.directive.region.entry": + if DEBUG_OPENMP >= 1: + print( + "Collect Found openmp region entry:", + instr, + type(instr), + "\n", + instr.tags, + type(instr.tags), + id(self), + len(self.active_openmp_directives), + ) + self.active_openmp_directives.append(instr) + if DEBUG_OPENMP >= 1: + print("post append:", len(self.active_openmp_directives)) + assert hasattr(instr, "save_orig_numba_openmp") + if instr.callee.name == "llvm.directive.region.exit": + if DEBUG_OPENMP >= 1: + print( + "Collect Found openmp region exit:", + instr, + type(instr), + "\n", + instr.tags, + type(instr.tags), + id(self), + len(self.active_openmp_directives), + ) + enter_directive = self.active_openmp_directives.pop() + enter_directive.save_orig_numba_openmp.post_lowering_process_alloca_queue( + enter_directive + ) + + +def post_lowering_openmp(mod): + if DEBUG_OPENMP >= 1: + print("post_lowering_openmp") + + # This will gather the information. + collect_fixup = CollectUnknownLLVMVarsPrivate() + collect_fixup.visit(mod) + + if DEBUG_OPENMP >= 1: + print("post_lowering_openmp done") + + +# Callback for ir_extension_usedefs +def openmp_region_start_defs(region, use_set=None, def_set=None): + assert isinstance(region, openmp_region_start) + if use_set is None: + use_set = set() + if def_set is None: + def_set = set() + for tag in region.tags: + tag.add_to_usedef_set(use_set, def_set, start=True) + return _use_defs_result(usemap=use_set, defmap=def_set) + + +def openmp_region_end_defs(region, use_set=None, def_set=None): + assert isinstance(region, openmp_region_end) + if use_set is None: + use_set = set() + if def_set is None: + def_set = set() + # We refer to the clauses from the corresponding start of the region. 
+ start_region = region.start_region + for tag in start_region.tags: + tag.add_to_usedef_set(use_set, def_set, start=False) + return _use_defs_result(usemap=use_set, defmap=def_set) + + +# Extend usedef analysis to support openmp_region_start/end nodes. +ir_extension_usedefs[openmp_region_start] = openmp_region_start_defs +ir_extension_usedefs[openmp_region_end] = openmp_region_end_defs + + +def openmp_region_start_infer(prs, typeinferer): + pass + + +def openmp_region_end_infer(pre, typeinferer): + pass + + +typeinfer.typeinfer_extensions[openmp_region_start] = openmp_region_start_infer +typeinfer.typeinfer_extensions[openmp_region_end] = openmp_region_end_infer + + +def _lower_openmp_region_start(lowerer, prs): + # TODO: if we set it always in numba_fixups we can remove from here + if isinstance(lowerer.context, OpenmpCPUTargetContext) or isinstance( + lowerer.context, OpenmpCUDATargetContext + ): + pass + else: + lowerer.library.__class__ = CustomCPUCodeLibrary + lowerer.context.__class__ = CustomContext + prs.lower(lowerer) + + +def _lower_openmp_region_end(lowerer, pre): + # TODO: if we set it always in numba_fixups we can remove from here + if isinstance(lowerer.context, OpenmpCPUTargetContext) or isinstance( + lowerer.context, OpenmpCUDATargetContext + ): + pass + else: + lowerer.library.__class__ = CustomCPUCodeLibrary + lowerer.context.__class__ = CustomContext + pre.lower(lowerer) + + +def apply_copies_openmp_region( + region, var_dict, name_var_table, typemap, calltypes, save_copies +): + for i in range(len(region.tags)): + region.tags[i].replace_vars_inner(var_dict) + + +apply_copy_propagate_extensions[openmp_region_start] = apply_copies_openmp_region +apply_copy_propagate_extensions[openmp_region_end] = apply_copies_openmp_region + + +def visit_vars_openmp_region(region, callback, cbdata): + for i in range(len(region.tags)): + if DEBUG_OPENMP >= 1: + print("visit_vars before", region.tags[i], type(region.tags[i].arg)) + region.tags[i].arg = visit_vars_inner(region.tags[i].arg, callback, cbdata) + if DEBUG_OPENMP >= 1: + print("visit_vars after", region.tags[i]) + + +visit_vars_extensions[openmp_region_start] = visit_vars_openmp_region +visit_vars_extensions[openmp_region_end] = visit_vars_openmp_region + +# ---------------------------------------------------------------------------------------------- + + +class PythonOpenmp: + def __init__(self, *args): + self.args = args + + def __enter__(self): + pass + + def __exit__(self, typ, val, tb): + pass + + +def iscall(x): + if isinstance(x, ir.Assign): + return isinstance(x.value, ir.Expr) and x.value.op == "call" + elif isinstance(x, ir.Expr): + return x.op == "call" + else: + return False + + +def extract_args_from_openmp(func_ir): + """Find all the openmp context calls in the function and then + use the VarCollector transformer to find all the Python variables + referenced in the openmp clauses. We then add those variables as + regular arguments to the openmp context call just so Numba's + usedef analysis is able to keep variables alive that are only + referenced in openmp clauses. 
+ """ + func_ir._definitions = build_definitions(func_ir.blocks) + var_table = get_name_var_table(func_ir.blocks) + for block in func_ir.blocks.values(): + for inst in block.body: + if iscall(inst): + func_def = get_definition(func_ir, inst.value.func) + if isinstance(func_def, ir.Global) and isinstance( + func_def.value, _OpenmpContextType + ): + str_def = get_definition(func_ir, inst.value.args[0]) + if not isinstance(str_def, ir.Const) or not isinstance( + str_def.value, str + ): + # The non-const openmp string error is handled later. + continue + assert isinstance(str_def, ir.Const) and isinstance( + str_def.value, str + ) + parse_res = var_collector_parser.parse(str_def.value) + visitor = VarCollector() + try: + visit_res = visitor.transform(parse_res) + inst.value.args.extend([var_table[x] for x in visit_res]) + except Exception as f: + print("generic transform exception") + exc_type, exc_obj, exc_tb = sys.exc_info() + fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] + print(exc_type, fname, exc_tb.tb_lineno) + # print("Internal error for OpenMp pragma '{}'".format(arg.value)) + sys.exit(-2) + except: + print("fallthrough exception") + # print("Internal error for OpenMp pragma '{}'".format(arg.value)) + sys.exit(-3) + + +def remove_empty_blocks(blocks): + found = True + while found: + found = False + empty_block = None + for label, block in blocks.items(): + if len(block.body) == 1: + assert isinstance(block.body[-1], ir.Jump) + empty_block = label + next_block = block.body[-1].target + break + + if empty_block is not None: + del blocks[empty_block] + + found = True + for block in blocks.values(): + last_stmt = block.body[-1] + if isinstance(last_stmt, ir.Jump): + if last_stmt.target == empty_block: + block.body[-1] = ir.Jump(next_block, last_stmt.loc) + elif isinstance(last_stmt, ir.Branch): + if last_stmt.truebr == empty_block: + block.body[-1] = ir.Branch( + last_stmt.cond, next_block, last_stmt.falsebr, last_stmt.loc + ) + elif block.body[-1].falsebr == empty_block: + block.body[-1] = ir.Branch( + last_stmt.cond, last_stmt.truebr, next_block, last_stmt.loc + ) + elif isinstance(last_stmt, ir.Return): + # Intentionally do nothing. + pass + else: + print(type(last_stmt)) + assert False + + +class _OpenmpContextType(WithContext): + is_callable = True + first_time = True + blk_end_live_map = set() + + def do_numba_fixups(self): + from numba import core + + orig_lower_inst = core.lowering.Lower.lower_inst + core.lowering.Lower.orig_lower_inst = orig_lower_inst + + orig_lower = core.lowering.Lower.lower + core.lowering.Lower.orig_lower = orig_lower + + # Use method to retrieve the outside region live map, which is updated + # during the with-context mutation. + def get_blk_end_live_map(): + return self.blk_end_live_map + + def new_lower(self, inst): + if not isinstance(self, LowerNoSROA): + self.__class__ = LowerNoSROA + if isinstance(inst, openmp_region_start): + return _lower_openmp_region_start(self, inst) + elif isinstance(inst, openmp_region_end): + return _lower_openmp_region_end(self, inst) + # TODO: instead of monkey patching for Del instructions outside the + # openmp region do: (1) either outline to create a function scope + # that will decouple the lifetime of variables inside the OpenMP + # region, (2) or subclass the PostProcessor to extend use-def + # analysis with OpenMP lifetime information. + elif isinstance(inst, ir.Del): + # Lower Del normally in the openmp region. 
+ if in_openmp_region(self.builder): + return self.orig_lower_inst(inst) + + # Lower the Del instruction ONLY if the variable is not live + # after the openmp region. + if inst.value not in get_blk_end_live_map(): + return self.orig_lower_inst(inst) + elif isinstance(inst, ir.Assign): + return self.lower_assign_inst(orig_lower_inst, inst) + elif isinstance(inst, ir.Return): + return self.lower_return_inst(orig_lower_inst, inst) + else: + return self.orig_lower_inst(inst) + + core.lowering.Lower.lower_inst = new_lower + + def mutate_with_body( + self, + func_ir, + blocks, + blk_start, + blk_end, + body_blocks, + dispatcher_factory, + extra, + ): + if _OpenmpContextType.first_time == True: + _OpenmpContextType.first_time = False + self.do_numba_fixups() + + if DEBUG_OPENMP >= 1: + print("pre-dead-code") + dump_blocks(blocks) + if not OPENMP_DISABLED and not hasattr(func_ir, "has_openmp_region"): + # We can't do dead code elimination at this point because if an argument + # is used only in an openmp clause then it is detected as dead and is + # eliminated. We'd have to run through the IR and find all the + # openmp regions and extract the vars used there and then modify the + # IR with something fake just to take the var alive. The other approach + # would be to modify dead code elimination to find the vars referenced + # in openmp context strings. + extract_args_from_openmp(func_ir) + # dead_code_elimination(func_ir) + remove_ssa_from_func_ir(func_ir) + # remove_empty_blocks(blocks) + func_ir.has_openmp_region = True + if DEBUG_OPENMP >= 1: + print("pre-with-removal") + dump_blocks(blocks) + if OPENMP_DISABLED: + # If OpenMP disabled, do nothing except remove the enter_with marker. + sblk = blocks[blk_start] + sblk.body = sblk.body[1:] + else: + if DEBUG_OPENMP >= 1: + print("openmp:mutate_with_body") + dprint_func_ir(func_ir, "func_ir") + print("blocks:", blocks, type(blocks)) + print("blk_start:", blk_start, type(blk_start)) + print("blk_end:", blk_end, type(blk_end)) + print("body_blocks:", body_blocks, type(body_blocks)) + print("extra:", extra, type(extra)) + assert extra is not None + _add_openmp_ir_nodes( + func_ir, blocks, blk_start, blk_end, body_blocks, extra + ) + func_ir._definitions = build_definitions(blocks) + if DEBUG_OPENMP >= 1: + print("post-with-removal") + dump_blocks(blocks) + dispatcher = dispatcher_factory(func_ir) + dispatcher.can_cache = True + + # Find live variables after the region to make sure we don't Del + # them if they are defined in the openmp region. 
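+        # The resulting live set is stored on the context (blk_end_live_map) and
+        # consulted by the patched lower_inst above so that Del instructions for
+        # variables still live after the region are skipped outside the region.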
+ cfg = compute_cfg_from_blocks(blocks) + usedefs = compute_use_defs(blocks) + live_map = compute_live_map(cfg, blocks, usedefs.usemap, usedefs.defmap) + self.blk_end_live_map = live_map[blk_end] + return dispatcher + + def __call__(self, args): + return PythonOpenmp(args) + + +def remove_indirections(clause): + try: + while len(clause) == 1 and isinstance(clause[0], list): + clause = clause[0] + except: + pass + return clause + + +class default_shared_val: + def __init__(self, val): + self.val = val + + +class UnspecifiedVarInDefaultNone(Exception): + pass + + +class ParallelForExtraCode(Exception): + pass + + +class ParallelForWrongLoopCount(Exception): + pass + + +class ParallelForInvalidCollapseCount(Exception): + pass + + +class NonconstantOpenmpSpecification(Exception): + pass + + +class NonStringOpenmpSpecification(Exception): + pass + + +class MultipleNumThreadsClauses(Exception): + pass + + +openmp_context = _OpenmpContextType() + + +def is_dsa(name): + return ( + name + in [ + "QUAL.OMP.FIRSTPRIVATE", + "QUAL.OMP.PRIVATE", + "QUAL.OMP.SHARED", + "QUAL.OMP.LASTPRIVATE", + "QUAL.OMP.TARGET.IMPLICIT", + ] + or name.startswith("QUAL.OMP.REDUCTION") + or name.startswith("QUAL.OMP.MAP") + ) + + +def get_dotted_type(x, typemap, lowerer): + xsplit = x.split("*") + cur_typ = typemap_lookup(typemap, xsplit[0]) + # print("xsplit:", xsplit, cur_typ, type(cur_typ)) + for field in xsplit[1:]: + dm = lowerer.context.data_model_manager.lookup(cur_typ) + findex = dm._fields.index(field) + cur_typ = dm._members[findex] + # print("dm:", dm, type(dm), dm._members, type(dm._members), dm._fields, type(dm._fields), findex, cur_typ, type(cur_typ)) + return cur_typ + + +def is_target_arg(name): + return ( + name in ["QUAL.OMP.FIRSTPRIVATE", "QUAL.OMP.TARGET.IMPLICIT"] + or name.startswith("QUAL.OMP.MAP") + or name.startswith("QUAL.OMP.REDUCTION") + ) + + +def is_pointer_target_arg(name, typ): + if name.startswith("QUAL.OMP.MAP"): + if isinstance(typ, types.npytypes.Array): + return True + else: + return True + if name in ["QUAL.OMP.FIRSTPRIVATE", "QUAL.OMP.PRIVATE"]: + return False + if name in ["QUAL.OMP.TARGET.IMPLICIT"]: + if isinstance(typ, types.npytypes.Array): + return True + else: + return False + return False + # print("is_pointer_target_arg:", name, typ, type(typ)) + assert False + + +def is_internal_var(var): + # Determine if a var is a Python var or an internal Numba var. + if var.is_temp: + return True + return var.unversioned_name != var.name + + +def remove_ssa(var_name, scope, loc): + # Get the base name of a variable, removing the SSA extension. 
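+    # For example, a versioned name such as "i.1" maps back to the user-level
+    # name "i"; names without an SSA suffix are returned unchanged.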
+ var = ir.Var(scope, var_name, loc) + return var.unversioned_name + + +def user_defined_var(var): + if not isinstance(var, str): + return False + return not var.startswith("$") + + +def has_user_defined_var(the_set): + for x in the_set: + if user_defined_var(x): + return True + return False + + +def get_user_defined_var(the_set): + ret = set() + for x in the_set: + if user_defined_var(x): + ret.add(x) + return ret + + +unique = 0 + + +def get_unique(): + global unique + ret = unique + unique += 1 + return ret + + +def is_private(x): + return x in [ + "QUAL.OMP.PRIVATE", + "QUAL.OMP.FIRSTPRIVATE", + "QUAL.OMP.LASTPRIVATE", + "QUAL.OMP.TARGET.IMPLICIT", + ] + + +def openmp_copy(a): + pass # should always be called through overload + + +@overload(openmp_copy) +def openmp_copy_overload(a): + if DEBUG_OPENMP >= 1: + print("openmp_copy:", a, type(a)) + if isinstance(a, types.npytypes.Array): + + def cimpl(a): + return np.copy(a) + + return cimpl + else: + + def cimpl(a): + return a + + return cimpl + + +def replace_ssa_var_callback(var, vardict): + assert isinstance(var, ir.Var) + while var.unversioned_name in vardict.keys(): + assert vardict[var.unversioned_name].name != var.unversioned_name + new_var = vardict[var.unversioned_name] + var = ir.Var(new_var.scope, new_var.name, new_var.loc) + return var + + +def replace_ssa_vars(blocks, vardict): + """replace variables (ir.Var to ir.Var) from dictionary (name -> ir.Var)""" + # remove identity values to avoid infinite loop + new_vardict = {} + for l, r in vardict.items(): + if l != r.name: + new_vardict[l] = r + visit_vars(blocks, replace_ssa_var_callback, new_vardict) + + +def get_blocks_between_start_end(blocks, start_block, end_block): + cfg = compute_cfg_from_blocks(blocks) + blocks_in_region = [start_block] + + def add_in_region(cfg, blk, blocks_in_region, end_block): + """For each successor in the CFG of the block we're currently + adding to blocks_in_region, add that successor to + blocks_in_region if it isn't the end_block. Then, + recursively call this routine for the added block to add + its successors. + """ + for out_blk, _ in cfg.successors(blk): + if out_blk != end_block and out_blk not in blocks_in_region: + blocks_in_region.append(out_blk) + add_in_region(cfg, out_blk, blocks_in_region, end_block) + + # Calculate all the Numba IR blocks in the target region. + add_in_region(cfg, start_block, blocks_in_region, end_block) + return blocks_in_region + + +class VarName(str): + pass + + +class OnlyClauseVar(VarName): + pass + + +# This Transformer visitor class just finds the referenced python names +# and puts them in a list of VarName. The default visitor function +# looks for list of VarNames in the args to that tree node and then +# concatenates them all together. The final return value is a list of +# VarName that are variables used in the openmp clauses. 
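+# Illustrative example (the exact clause syntax is only a sketch): for a region
+# entered as
+#
+#     with openmp_context("parallel for num_threads(nt)"):
+#         ...
+#
+# the collector returns ["nt"], and extract_args_from_openmp appends the
+# corresponding ir.Var to the context call's arguments so that usedef analysis
+# keeps nt alive even though it only appears inside the clause string.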
+class VarCollector(Transformer): + def __init__(self): + super(VarCollector, self).__init__() + + def PYTHON_NAME(self, args): + return [VarName(args)] + + def const_num_or_var(self, args): + return args[0] + + def num_threads_clause(self, args): + (_, num_threads) = args + if isinstance(num_threads, list): + assert len(num_threads) == 1 + return [OnlyClauseVar(num_threads[0])] + else: + return None + + def __default__(self, data, children, meta): + ret = [] + for c in children: + if isinstance(c, list) and len(c) > 0: + if isinstance(c[0], OnlyClauseVar): + ret.extend(c) + return ret + + +def add_tags_to_enclosing(func_ir, cur_block, tags): + enclosing_region = get_enclosing_region(func_ir, cur_block) + if enclosing_region: + for region in enclosing_region: + for tag in tags: + region.add_tag(tag) + + +def add_enclosing_region(func_ir, blocks, openmp_node): + if not hasattr(func_ir, "openmp_enclosing"): + func_ir.openmp_enclosing = {} + if not hasattr(func_ir, "openmp_regions"): + func_ir.openmp_regions = {} + func_ir.openmp_regions[openmp_node] = sorted(blocks) + for b in blocks: + if b not in func_ir.openmp_enclosing: + func_ir.openmp_enclosing[b] = [] + func_ir.openmp_enclosing[b].append(openmp_node) + + +def get_enclosing_region(func_ir, cur_block): + if not hasattr(func_ir, "openmp_enclosing"): + func_ir.openmp_enclosing = {} + if cur_block in func_ir.openmp_enclosing: + return func_ir.openmp_enclosing[cur_block] + else: + return None + + +def get_var_from_enclosing(enclosing_regions, var): + if not enclosing_regions: + return None + if len(enclosing_regions) == 0: + return None + return enclosing_regions[-1].get_var_dsa(var) + + +class OpenmpVisitor(Transformer): + target_num = 0 + + def __init__(self, func_ir, blocks, blk_start, blk_end, body_blocks, loc): + self.func_ir = func_ir + self.blocks = blocks + self.blk_start = blk_start + self.blk_end = blk_end + self.body_blocks = body_blocks + self.loc = loc + super(OpenmpVisitor, self).__init__() + + # --------- Non-parser functions -------------------- + + def remove_explicit_from_one( + self, varset, vars_in_explicit_clauses, clauses, scope, loc + ): + """Go through a set of variables and see if their non-SSA form is in an explicitly + provided data clause. If so, remove it from the set and add a clause so that the + SSA form gets the same data clause. + """ + if DEBUG_OPENMP >= 1: + print( + "remove_explicit start:", + sorted(varset), + sorted(vars_in_explicit_clauses), + ) + diff = set() + # For each variable in the set. + for v in sorted(varset): + # Get the non-SSA form. + flat = remove_ssa(v, scope, loc) + # Skip non-SSA introduced variables (i.e., Python vars). + if flat == v: + continue + if DEBUG_OPENMP >= 1: + print("remove_explicit:", v, flat, flat in vars_in_explicit_clauses) + # If we have the non-SSA form in an explicit data clause. + if flat in vars_in_explicit_clauses: + # We will remove it from the set. + diff.add(v) + # Copy the non-SSA variables data clause. + ccopy = copy.copy(vars_in_explicit_clauses[flat]) + # Change the name in the clause to the SSA form. + ccopy.arg = ir.Var(scope, v, loc) + # Add to the clause set. + clauses.append(ccopy) + # Remove the vars from the set that we added a clause for. 
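+        # For instance, if the user wrote private(x) and the region body uses the
+        # SSA name "x.1", then "x.1" was added to diff above and a copy of the
+        # PRIVATE clause retargeted at "x.1" was appended to clauses.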
+ varset.difference_update(diff) + if DEBUG_OPENMP >= 1: + print("remove_explicit end:", sorted(varset)) + + def remove_explicit_from_io_vars( + self, + inputs_to_region, + def_but_live_out, + private_to_region, + vars_in_explicit_clauses, + clauses, + non_user_explicits, + scope, + loc, + ): + """Remove vars in explicit data clauses from the auto-determined vars. + Then call remove_explicit_from_one to take SSA variants out of the auto-determined sets + and to create clauses so that SSA versions get the same clause as the explicit Python non-SSA var. + """ + inputs_to_region.difference_update(vars_in_explicit_clauses.keys()) + def_but_live_out.difference_update(vars_in_explicit_clauses.keys()) + private_to_region.difference_update(vars_in_explicit_clauses.keys()) + inputs_to_region.difference_update(non_user_explicits.keys()) + def_but_live_out.difference_update(non_user_explicits.keys()) + private_to_region.difference_update(non_user_explicits.keys()) + self.remove_explicit_from_one( + inputs_to_region, vars_in_explicit_clauses, clauses, scope, loc + ) + self.remove_explicit_from_one( + def_but_live_out, vars_in_explicit_clauses, clauses, scope, loc + ) + self.remove_explicit_from_one( + private_to_region, vars_in_explicit_clauses, clauses, scope, loc + ) + + def find_io_vars(self, selected_blocks): + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + scope = sblk.scope + + cfg = compute_cfg_from_blocks(self.blocks) + usedefs = compute_use_defs(self.blocks) + if DEBUG_OPENMP >= 1: + print("usedefs:", usedefs) + live_map = compute_live_map(cfg, self.blocks, usedefs.usemap, usedefs.defmap) + # Assumes enter_with is first statement in block. + inputs_to_region = live_map[self.blk_start] + if DEBUG_OPENMP >= 1: + print("live_map:", live_map) + print("inputs_to_region:", sorted(inputs_to_region), type(inputs_to_region)) + print("selected blocks:", sorted(selected_blocks)) + all_uses = set() + all_defs = set() + for label in selected_blocks: + all_uses = all_uses.union(usedefs.usemap[label]) + all_defs = all_defs.union(usedefs.defmap[label]) + # Filter out those vars live to the region but not used within it. + inputs_to_region = inputs_to_region.intersection(all_uses) + def_but_live_out = all_defs.difference(inputs_to_region).intersection( + live_map[self.blk_end] + ) + private_to_region = all_defs.difference(inputs_to_region).difference( + live_map[self.blk_end] + ) + + if DEBUG_OPENMP >= 1: + print("all_uses:", sorted(all_uses)) + print("inputs_to_region:", sorted(inputs_to_region)) + print("private_to_region:", sorted(private_to_region)) + print("def_but_live_out:", sorted(def_but_live_out)) + return inputs_to_region, def_but_live_out, private_to_region, live_map + + def get_explicit_vars(self, clauses): + user_vars = {} + non_user_vars = {} + privates = [] + for c in clauses: + if DEBUG_OPENMP >= 1: + print("get_explicit_vars:", c, type(c)) + if isinstance(c, openmp_tag): + if DEBUG_OPENMP >= 1: + print("arg:", c.arg, type(c.arg)) + if isinstance(c.arg, list): + carglist = c.arg + else: + carglist = [c.arg] + # carglist = c.arg if isinstance(c.arg, list) else [c.arg] + for carg in carglist: + if DEBUG_OPENMP >= 1: + print( + "carg:", + carg, + type(carg), + user_defined_var(carg), + is_dsa(c.name), + ) + # Extract the var name from the NameSlice. 
+ if isinstance(carg, NameSlice): + carg = carg.name + if isinstance(carg, str) and is_dsa(c.name): + if user_defined_var(carg): + user_vars[carg] = c + if is_private(c.name): + privates.append(carg) + else: + non_user_vars[carg] = c + return user_vars, privates, non_user_vars + + def filter_unused_vars(self, clauses, used_vars): + new_clauses = [] + for c in clauses: + if DEBUG_OPENMP >= 1: + print("filter_unused_vars:", c, type(c)) + if isinstance(c, openmp_tag): + if DEBUG_OPENMP >= 1: + print("arg:", c.arg, type(c.arg)) + assert not isinstance(c.arg, list) + if DEBUG_OPENMP >= 1: + print( + "c.arg:", + c.arg, + type(c.arg), + user_defined_var(c.arg), + is_dsa(c.name), + ) + + if ( + isinstance(c.arg, str) + and user_defined_var(c.arg) + and is_dsa(c.name) + ): + if c.arg in used_vars: + new_clauses.append(c) + else: + new_clauses.append(c) + return new_clauses + + def get_clause_privates(self, clauses, def_but_live_out, scope, loc): + # Get all the private clauses from the whole set of clauses. + private_clauses_vars = [ + remove_privatized(x.arg) + for x in clauses + if x.name in ["QUAL.OMP.PRIVATE", "QUAL.OMP.FIRSTPRIVATE"] + ] + # private_clauses_vars = [remove_privatized(x.arg) for x in clauses if x.name in ["QUAL.OMP.PRIVATE", "QUAL.OMP.FIRSTPRIVATE", "QUAL.OMP.LASTPRIVATE"]] + ret = {} + # Get a mapping of vars in private clauses to the SSA version of variable exiting the region. + for lo in def_but_live_out: + without_ssa = remove_ssa(lo, scope, loc) + if without_ssa in private_clauses_vars: + ret[without_ssa] = lo + return ret + + def make_implicit_explicit( + self, + scope, + vars_in_explicit, + explicit_clauses, + gen_shared, + inputs_to_region, + def_but_live_out, + private_to_region, + for_task=False, + ): + if for_task is None: + for_task = [] + if gen_shared: + for var_name in sorted(inputs_to_region): + if ( + for_task != False + and get_var_from_enclosing(for_task, var_name) != "QUAL.OMP.SHARED" + ): + explicit_clauses.append( + openmp_tag("QUAL.OMP.FIRSTPRIVATE", var_name) + ) + else: + explicit_clauses.append(openmp_tag("QUAL.OMP.SHARED", var_name)) + vars_in_explicit[var_name] = explicit_clauses[-1] + + for var_name in sorted(def_but_live_out): + if ( + for_task != False + and get_var_from_enclosing(for_task, var_name) != "QUAL.OMP.SHARED" + ): + explicit_clauses.append( + openmp_tag("QUAL.OMP.FIRSTPRIVATE", var_name) + ) + else: + explicit_clauses.append(openmp_tag("QUAL.OMP.SHARED", var_name)) + vars_in_explicit[var_name] = explicit_clauses[-1] + + # What to do below for task regions? 
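+        # Variables local to the region become PRIVATE; user-defined variables
+        # are added first, followed by compiler-generated temporaries.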
+ for var_name in sorted(private_to_region): + temp_var = ir.Var(scope, var_name, self.loc) + if not is_internal_var(temp_var): + explicit_clauses.append(openmp_tag("QUAL.OMP.PRIVATE", var_name)) + vars_in_explicit[var_name] = explicit_clauses[-1] + + for var_name in sorted(private_to_region): + temp_var = ir.Var(scope, var_name, self.loc) + if is_internal_var(temp_var): + explicit_clauses.append(openmp_tag("QUAL.OMP.PRIVATE", var_name)) + vars_in_explicit[var_name] = explicit_clauses[-1] + + def make_implicit_explicit_target( + self, + scope, + vars_in_explicit, + explicit_clauses, + gen_shared, + inputs_to_region, + def_but_live_out, + private_to_region, + ): + # unversioned_privates = set() # we get rid of SSA on the first openmp region so no SSA forms should be here + if gen_shared: + for var_name in sorted(inputs_to_region): + explicit_clauses.append( + openmp_tag( + "QUAL.OMP.TARGET.IMPLICIT" + if user_defined_var(var_name) + else "QUAL.OMP.PRIVATE", + var_name, + ) + ) + vars_in_explicit[var_name] = explicit_clauses[-1] + for var_name in sorted(def_but_live_out): + explicit_clauses.append( + openmp_tag( + "QUAL.OMP.TARGET.IMPLICIT" + if user_defined_var(var_name) + else "QUAL.OMP.PRIVATE", + var_name, + ) + ) + vars_in_explicit[var_name] = explicit_clauses[-1] + for var_name in sorted(private_to_region): + temp_var = ir.Var(scope, var_name, self.loc) + if not is_internal_var(temp_var): + explicit_clauses.append(openmp_tag("QUAL.OMP.PRIVATE", var_name)) + # explicit_clauses.append(openmp_tag("QUAL.OMP.TARGET.IMPLICIT" if user_defined_var(var_name) else "QUAL.OMP.PRIVATE", var_name)) + vars_in_explicit[var_name] = explicit_clauses[-1] + + for var_name in sorted(private_to_region): + temp_var = ir.Var(scope, var_name, self.loc) + if is_internal_var(temp_var): + explicit_clauses.append( + openmp_tag( + "QUAL.OMP.TARGET.IMPLICIT" + if user_defined_var(var_name) + else "QUAL.OMP.PRIVATE", + var_name, + ) + ) + vars_in_explicit[var_name] = explicit_clauses[-1] + + def add_explicits_to_start( + self, + scope, + vars_in_explicit, + explicit_clauses, + gen_shared, + start_tags, + keep_alive, + ): + start_tags.extend(explicit_clauses) + return [] + # tags_for_enclosing = [] + # for var in vars_in_explicit: + # if not is_private(vars_in_explicit[var].name): + # print("EVAR_COPY FOR", var) + # evar = ir.Var(scope, var, self.loc) + # evar_copy = scope.redefine("evar_copy_aets", self.loc) + # keep_alive.append(ir.Assign(evar, evar_copy, self.loc)) + # #keep_alive.append(ir.Assign(evar, evar, self.loc)) + # tags_for_enclosing.append(openmp_tag("QUAL.OMP.PRIVATE", evar_copy)) + # return tags_for_enclosing + + def flatten(self, all_clauses, start_block): + if DEBUG_OPENMP >= 1: + print("flatten", id(start_block)) + incoming_clauses = [remove_indirections(x) for x in all_clauses] + clauses = [] + default_shared = True + for clause in incoming_clauses: + if DEBUG_OPENMP >= 1: + print("clause:", clause, type(clause)) + if isinstance(clause, openmp_tag): + clauses.append(clause) + elif isinstance(clause, list): + clauses.extend(remove_indirections(clause)) + elif clause == "nowait": + clauses.append(openmp_tag("QUAL.OMP.NOWAIT")) + elif isinstance(clause, default_shared_val): + default_shared = clause.val + if DEBUG_OPENMP >= 1: + print("got new default_shared:", clause.val) + else: + if DEBUG_OPENMP >= 1: + print( + "Unknown clause type in incoming_clauses", clause, type(clause) + ) + assert 0 + + if hasattr(start_block, "openmp_replace_vardict"): + for clause in clauses: + # print("flatten out 
clause:", clause, clause.arg, type(clause.arg)) + for vardict in start_block.openmp_replace_vardict: + if clause.arg in vardict: + # print("clause.arg in vardict:", clause.arg, type(clause.arg), vardict[clause.arg], type(vardict[clause.arg])) + clause.arg = vardict[clause.arg].name + + return clauses, default_shared + + def add_replacement(self, blocks, replace_vardict): + for b in blocks.values(): + if not hasattr(b, "openmp_replace_vardict"): + b.openmp_replace_vardict = [] + b.openmp_replace_vardict.append(replace_vardict) + + def make_consts_unliteral_for_privates(self, privates, blocks): + for blk in blocks.values(): + for stmt in blk.body: + if ( + isinstance(stmt, ir.Assign) + and isinstance(stmt.value, ir.Const) + and stmt.target.name in privates + ): + stmt.value.use_literal_type = False + + def fix_empty_header(self, block, label): + if len(block.body) == 1: + assert isinstance(block.body[0], ir.Jump) + return self.blocks[block.body[0].target], block.body[0].target + return block, label + + def prepare_for_directive( + self, + clauses, + vars_in_explicit_clauses, + before_start, + after_start, + start_tags, + end_tags, + scope, + ): + start_tags = clauses + call_table, _ = get_call_table(self.blocks) + cfg = compute_cfg_from_blocks(self.blocks) + usedefs = compute_use_defs(self.blocks) + live_map = compute_live_map(cfg, self.blocks, usedefs.usemap, usedefs.defmap) + + def get_loops_in_region(all_loops): + loops = {} + for k, v in all_loops.items(): + if v.header >= self.blk_start and v.header <= self.blk_end: + loops[k] = v + return loops + + all_loops = cfg.loops() + if DEBUG_OPENMP >= 1: + print("all_loops:", all_loops) + print("live_map:", live_map) + print("body_blocks:", self.body_blocks) + + loops = get_loops_in_region(all_loops) + # Find the outer-most loop in this OpenMP region. + loops = list(filter_nested_loops(cfg, loops)) + + if DEBUG_OPENMP >= 1: + print("loops:", loops) + if len(loops) != 1: + raise ParallelForWrongLoopCount( + f"OpenMP parallel for regions must contain exactly one range based loop. The parallel for at line {self.loc} contains {len(loops)} loops." + ) + + collapse_tags = get_tags_of_type(clauses, "QUAL.OMP.COLLAPSE") + new_stmts_for_iterspace = [] + collapse_iterspace_block = set() + iterspace_vars = [] + if len(collapse_tags) > 0: + # Limit all_loops to just loops within the openmp region. + all_loops = get_loops_in_region(all_loops) + # In case of multiple collapse tags, use the last one. + collapse_tag = collapse_tags[-1] + # Remove collapse tags from clauses so they don't go to LLVM pass. + clauses[:] = [x for x in clauses if x not in collapse_tags] + # Add top level loop to loop_order list. + loop_order = list(filter_nested_loops(cfg, all_loops)) + if len(loop_order) != 1: + raise ParallelForWrongLoopCount( + f"OpenMP parallel for region must have only one top-level loop at line {self.loc}." + ) + # Determine how many nested loops we need to process. + collapse_value = collapse_tag.arg - 1 + # Make sure initial collapse value was >= 2. + if collapse_value <= 0: + raise ParallelForInvalidCollapseCount( + f"OpenMP parallel for regions with collapse clauses must be greather than or equal to 2 at line {self.loc}." + ) + + # Delete top-level loop from all_loops. + del all_loops[loop_order[-1].header] + # For remaining nested loops... + for _ in range(collapse_value): + # Get the next most top-level loop. + loops = list(filter_nested_loops(cfg, all_loops)) + # Make sure there is only one. 
+ if len(loops) != 1: + raise ParallelForWrongLoopCount( + f"OpenMP parallel for collapse regions must be perfectly nested for the parallel for at line {self.loc}." + ) + # Add this loop to the loops to process in order. + loop_order.append(loops[0]) + # Delete this loop from all_loops. + del all_loops[loop_order[-1].header] + + if DEBUG_OPENMP >= 2: + print("loop_order:", loop_order) + stmts_to_retain = [] + loop_bounds = [] + for loop in loop_order: + loop_entry = list(loop.entries)[0] + loop_exit = list(loop.exits)[0] + loop_header = loop.header + loop_entry_block = self.blocks[loop_entry] + loop_exit_block = self.blocks[loop_exit] + loop_header_block, _ = self.fix_empty_header( + self.blocks[loop_header], loop_header + ) + + # Copy all stmts from the loop entry block up to the ir.Global + # for range. + call_offset = None + for entry_block_index, stmt in enumerate(loop_entry_block.body): + found_range = False + if ( + isinstance(stmt, ir.Assign) + and isinstance(stmt.value, ir.Global) + and stmt.value.name == "range" + ): + found_range = True + range_target = stmt.target + found_call = False + for call_index in range( + entry_block_index + 1, len(loop_entry_block.body) + ): + call_stmt = loop_entry_block.body[call_index] + if ( + isinstance(call_stmt, ir.Assign) + and isinstance(call_stmt.value, ir.Expr) + and call_stmt.value.op == "call" + and call_stmt.value.func == range_target + ): + found_call = True + # Remove stmts that were retained. + loop_entry_block.body = loop_entry_block.body[ + entry_block_index: + ] + call_offset = call_index - entry_block_index + break + assert found_call + break + stmts_to_retain.append(stmt) + assert found_range + for header_block_index, stmt in enumerate(loop_header_block.body): + if ( + isinstance(stmt, ir.Assign) + and isinstance(stmt.value, ir.Expr) + and stmt.value.op == "iternext" + ): + iternext_inst = loop_header_block.body[header_block_index] + pair_first_inst = loop_header_block.body[header_block_index + 1] + pair_second_inst = loop_header_block.body[ + header_block_index + 2 + ] + + assert ( + isinstance(iternext_inst, ir.Assign) + and isinstance(iternext_inst.value, ir.Expr) + and iternext_inst.value.op == "iternext" + ) + assert ( + isinstance(pair_first_inst, ir.Assign) + and isinstance(pair_first_inst.value, ir.Expr) + and pair_first_inst.value.op == "pair_first" + ) + assert ( + isinstance(pair_second_inst, ir.Assign) + and isinstance(pair_second_inst.value, ir.Expr) + and pair_second_inst.value.op == "pair_second" + ) + stmts_to_retain.extend( + loop_header_block.body[header_block_index + 3 : -1] + ) + loop_index = pair_first_inst.target + break + stmts_to_retain.append(stmt) + loop_bounds.append((call_stmt.value.args[0], loop_index)) + if DEBUG_OPENMP >= 1: + print("collapse 1") + dump_blocks(self.blocks) + # For all the loops except the last... + for loop in loop_order[:-1]: + # Change the unneeded headers to just jump to the next block. 
+ loop_header = loop.header + loop_header_block, real_loop_header = self.fix_empty_header( + self.blocks[loop_header], loop_header + ) + collapse_iterspace_block.add(real_loop_header) + loop_header_block.body[-1] = ir.Jump( + loop_header_block.body[-1].truebr, loop_header_block.body[-1].loc + ) + last_eliminated_loop_header_block = loop_header_block + self.body_blocks = [ + x for x in self.body_blocks if x not in loop.entries + ] + self.body_blocks.remove(loop.header) + if DEBUG_OPENMP >= 1: + print("loop order:", loop_order) + print("loop bounds:", loop_bounds) + print("collapse 2") + dump_blocks(self.blocks) + last_loop = loop_order[-1] + last_loop_entry = list(last_loop.entries)[0] + last_loop_exit = list(last_loop.exits)[0] + last_loop_header = last_loop.header + last_loop_entry_block = self.blocks[last_loop_entry] + last_loop_exit_block = self.blocks[last_loop_exit] + last_loop_header_block, _ = self.fix_empty_header( + self.blocks[last_loop_header], loop_header + ) + last_loop_first_body_block = last_loop_header_block.body[-1].truebr + self.blocks[last_loop_first_body_block].body = ( + stmts_to_retain + self.blocks[last_loop_first_body_block].body + ) + last_loop_header_block.body[-1].falsebr = list(loop_order[0].exits)[0] + new_var_scope = last_loop_entry_block.body[0].target.scope + + # -------- Add vars to remember cumulative product of iteration space sizes. + new_iterspace_var = new_var_scope.redefine("new_iterspace0", self.loc) + start_tags.append( + openmp_tag("QUAL.OMP.FIRSTPRIVATE", new_iterspace_var.name) + ) + iterspace_vars.append(new_iterspace_var) + new_stmts_for_iterspace.append( + ir.Assign(loop_bounds[0][0], new_iterspace_var, self.loc) + ) + for lb_num, loop_bound in enumerate(loop_bounds[1:]): + mul_op = ir.Expr.binop( + operator.mul, new_iterspace_var, loop_bound[0], self.loc + ) + new_iterspace_var = new_var_scope.redefine( + "new_iterspace" + str(lb_num + 1), self.loc + ) + start_tags.append( + openmp_tag("QUAL.OMP.FIRSTPRIVATE", new_iterspace_var.name) + ) + iterspace_vars.append(new_iterspace_var) + new_stmts_for_iterspace.append( + ir.Assign(mul_op, new_iterspace_var, self.loc) + ) + # Change iteration space of innermost loop to the product of all the + # loops' iteration spaces. 
+ last_loop_entry_block.body[call_offset].value.args[0] = new_iterspace_var + + last_eliminated_loop_header_block.body = ( + new_stmts_for_iterspace + last_eliminated_loop_header_block.body + ) + + deconstruct_indices = [] + new_deconstruct_var = new_var_scope.redefine("deconstruct", self.loc) + deconstruct_indices.append( + ir.Assign(loop_bounds[-1][1], new_deconstruct_var, self.loc) + ) + for deconstruct_index in range(len(loop_bounds) - 1): + cur_iterspace_var = iterspace_vars[ + len(loop_bounds) - 2 - deconstruct_index + ] + cur_loop_bound = loop_bounds[deconstruct_index][1] + # if DEBUG_OPENMP >= 1: + # print("deconstructing", cur_iterspace_var) + # deconstruct_indices.append(ir.Print([new_deconstruct_var, cur_iterspace_var], None, self.loc)) + deconstruct_div = ir.Expr.binop( + operator.floordiv, new_deconstruct_var, cur_iterspace_var, self.loc + ) + new_deconstruct_var_loop = new_var_scope.redefine( + "deconstruct" + str(deconstruct_index), self.loc + ) + deconstruct_indices.append( + ir.Assign(deconstruct_div, cur_loop_bound, self.loc) + ) + # if DEBUG_OPENMP >= 1: + # deconstruct_indices.append(ir.Print([cur_loop_bound], None, self.loc)) + new_deconstruct_var_mul = new_var_scope.redefine( + "deconstruct_mul" + str(deconstruct_index), self.loc + ) + deconstruct_indices.append( + ir.Assign( + ir.Expr.binop( + operator.mul, cur_loop_bound, cur_iterspace_var, self.loc + ), + new_deconstruct_var_mul, + self.loc, + ) + ) + # if DEBUG_OPENMP >= 1: + # deconstruct_indices.append(ir.Print([new_deconstruct_var_mul], None, self.loc)) + deconstruct_indices.append( + ir.Assign( + ir.Expr.binop( + operator.sub, + new_deconstruct_var, + new_deconstruct_var_mul, + self.loc, + ), + new_deconstruct_var_loop, + self.loc, + ) + ) + # if DEBUG_OPENMP >= 1: + # deconstruct_indices.append(ir.Print([new_deconstruct_var_loop], None, self.loc)) + new_deconstruct_var = new_deconstruct_var_loop + deconstruct_indices.append( + ir.Assign(new_deconstruct_var, loop_bounds[-1][1], self.loc) + ) + + self.blocks[last_loop_first_body_block].body = ( + deconstruct_indices + self.blocks[last_loop_first_body_block].body + ) + + if DEBUG_OPENMP >= 1: + print("collapse 3", self.blk_start, self.blk_end) + dump_blocks(self.blocks) + + cfg = compute_cfg_from_blocks(self.blocks) + live_map = compute_live_map( + cfg, self.blocks, usedefs.usemap, usedefs.defmap + ) + all_loops = cfg.loops() + loops = get_loops_in_region(all_loops) + loops = list(filter_nested_loops(cfg, loops)) + if DEBUG_OPENMP >= 2: + print("loops after collapse:", loops) + if DEBUG_OPENMP >= 1: + print("blocks after collapse", self.blk_start, self.blk_end) + dump_blocks(self.blocks) + + def _get_loop_kind(func_var, call_table): + if func_var not in call_table: + return False + call = call_table[func_var] + if len(call) == 0: + return False + + return call[0] + + loop = loops[0] + entry = list(loop.entries)[0] + header = loop.header + exit = list(loop.exits)[0] + + loop_blocks_for_io = loop.entries.union(loop.body) + loop_blocks_for_io_minus_entry = loop_blocks_for_io - {entry} + non_loop_blocks = set(self.body_blocks) + non_loop_blocks.difference_update(loop_blocks_for_io) + non_loop_blocks.difference_update(collapse_iterspace_block) + # non_loop_blocks.difference_update({exit}) + + if DEBUG_OPENMP >= 1: + print("non_loop_blocks:", non_loop_blocks) + print("entry:", entry) + print("header:", header) + print("exit:", exit) + print("body_blocks:", self.body_blocks) + print("loop:", loop) + + # Find the first statement after any iterspace calculation ones for 
collapse. + first_stmt = self.blocks[entry].body[0] + # first_stmt = self.blocks[entry].body[len(new_stmts_for_iterspace)] + if ( + not isinstance(first_stmt, ir.Assign) + or not isinstance(first_stmt.value, ir.Global) + or first_stmt.value.name != "range" + ): + raise ParallelForExtraCode( + f"Extra code near line {self.loc} is not allowed before or after the loop in an OpenMP parallel for region." + ) + + live_end = live_map[self.blk_end] + for non_loop_block in non_loop_blocks: + nlb = self.blocks[non_loop_block] + if isinstance(nlb.body[0], ir.Jump): + # Non-loop empty blocks are fine. + continue + if ( + isinstance(nlb.body[-1], ir.Jump) + and nlb.body[-1].target == self.blk_end + ): + # Loop through all statements in block that jumps to the end of the region. + # If those are all assignments where the LHS is dead then they are safe. + for nlb_stmt in nlb.body[:-1]: + if isinstance(nlb_stmt, ir.PopBlock): + continue + + break + # if not isinstance(nlb_stmt, ir.Assign): + # break # Non-assignment is not known to be safe...will fallthrough to raise exception. + # if nlb_stmt.target.name in live_end: + # break # Non-dead variables in assignment is not safe...will fallthrough to raise exception. + else: + continue + raise ParallelForExtraCode( + f"Extra code near line {self.loc} is not allowed before or after the loop in an OpenMP parallel for region." + ) + + if DEBUG_OPENMP >= 1: + print("loop_blocks_for_io:", loop_blocks_for_io, entry, exit) + print("non_loop_blocks:", non_loop_blocks) + print("header:", header) + + entry_block = self.blocks[entry] + assert isinstance(entry_block.body[-1], ir.Jump) + assert entry_block.body[-1].target == header + exit_block = self.blocks[exit] + header_block = self.blocks[header] + extra_block = ( + None if len(header_block.body) > 1 else header_block.body[-1].target + ) + + latch_block_num = max(self.blocks.keys()) + 1 + + # We have to reformat the Numba style of loop to the form that the LLVM + # openmp pass supports. + header_preds = [x[0] for x in cfg.predecessors(header)] + entry_preds = list(set(header_preds).difference(loop.body)) + back_blocks = list(set(header_preds).intersection(loop.body)) + if DEBUG_OPENMP >= 1: + print("header_preds:", header_preds) + print("entry_preds:", entry_preds) + print("back_blocks:", back_blocks) + assert len(entry_preds) == 1 + entry_pred_label = entry_preds[0] + entry_pred = self.blocks[entry_pred_label] + if extra_block is not None: + header_block = self.blocks[extra_block] + header = extra_block + header_branch = header_block.body[-1] + post_header = {header_branch.truebr, header_branch.falsebr} + post_header.remove(exit) + if DEBUG_OPENMP >= 1: + print("post_header:", post_header) + post_header = self.blocks[list(post_header)[0]] + if DEBUG_OPENMP >= 1: + print("post_header:", post_header) + + normalized = True + + for inst_num, inst in enumerate(entry_block.body): + if ( + isinstance(inst, ir.Assign) + and isinstance(inst.value, ir.Expr) + and inst.value.op == "call" + ): + loop_kind = _get_loop_kind(inst.value.func.name, call_table) + if DEBUG_OPENMP >= 1: + print("loop_kind:", loop_kind) + if loop_kind != False and loop_kind == range: + range_inst = inst + range_args = inst.value.args + if DEBUG_OPENMP >= 1: + print("found one", loop_kind, inst, range_args) + + # ---------------------------------------------- + # Find getiter instruction for this range. 
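+                    # Save the getiter target; the loop's iteration count is
+                    # computed from it below via get_itercount.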
+ for entry_inst in entry_block.body[inst_num + 1 :]: + if ( + isinstance(entry_inst, ir.Assign) + and isinstance(entry_inst.value, ir.Expr) + and entry_inst.value.op == "getiter" + and entry_inst.value.value == range_inst.target + ): + getiter_inst = entry_inst + break + assert getiter_inst + if DEBUG_OPENMP >= 1: + print("getiter_inst:", getiter_inst) + # ---------------------------------------------- + + assert len(header_block.body) > 3 + if DEBUG_OPENMP >= 1: + print("header block before removing Numba range vars:") + dump_block(header, header_block) + + for ii in range(len(header_block.body)): + ii_inst = header_block.body[ii] + if ( + isinstance(ii_inst, ir.Assign) + and isinstance(ii_inst.value, ir.Expr) + and ii_inst.value.op == "iternext" + ): + iter_num = ii + break + + iternext_inst = header_block.body[iter_num] + pair_first_inst = header_block.body[iter_num + 1] + pair_second_inst = header_block.body[iter_num + 2] + + assert ( + isinstance(iternext_inst, ir.Assign) + and isinstance(iternext_inst.value, ir.Expr) + and iternext_inst.value.op == "iternext" + ) + assert ( + isinstance(pair_first_inst, ir.Assign) + and isinstance(pair_first_inst.value, ir.Expr) + and pair_first_inst.value.op == "pair_first" + ) + assert ( + isinstance(pair_second_inst, ir.Assign) + and isinstance(pair_second_inst.value, ir.Expr) + and pair_second_inst.value.op == "pair_second" + ) + # Remove those nodes from the IR. + header_block.body = ( + header_block.body[:iter_num] + header_block.body[iter_num + 3 :] + ) + if DEBUG_OPENMP >= 1: + print("header block after removing Numba range vars:") + dump_block(header, header_block) + + loop_index = pair_first_inst.target + if DEBUG_OPENMP >= 1: + print("loop_index:", loop_index, type(loop_index)) + # The loop_index from Numba's perspective is not what it is from the + # programmer's perspective. The OpenMP loop index is always private so + # we need to start from Numba's loop index (e.g., $48for_iter.3) and + # trace assignments from that through the header block and then find + # the first such assignment in the first loop block that the header + # branches to. + latest_index = loop_index + for hinst in header_block.body: + if isinstance(hinst, ir.Assign) and isinstance( + hinst.value, ir.Var + ): + if hinst.value.name == latest_index.name: + latest_index = hinst.target + for phinst in post_header.body: + if isinstance(phinst, ir.Assign) and isinstance( + phinst.value, ir.Var + ): + if phinst.value.name == latest_index.name: + latest_index = phinst.target + break + if DEBUG_OPENMP >= 1: + print("latest_index:", latest_index, type(latest_index)) + + if latest_index.name not in vars_in_explicit_clauses: + new_index_clause = openmp_tag( + "QUAL.OMP.PRIVATE", + ir.Var(loop_index.scope, latest_index.name, inst.loc), + ) + clauses.append(new_index_clause) + vars_in_explicit_clauses[latest_index.name] = new_index_clause + else: + if ( + vars_in_explicit_clauses[latest_index.name].name + != "QUAL.OMP.PRIVATE" + ): + pass + # throw error? FIX ME + + if DEBUG_OPENMP >= 1: + for clause in clauses: + print("post-latest_index clauses:", clause) + + start = 0 + step = 1 + size_var = range_args[0] + if len(range_args) == 2: + start = range_args[0] + size_var = range_args[1] + if len(range_args) == 3: + start = range_args[0] + size_var = range_args[1] + try: + step = self.func_ir.get_definition(range_args[2]) + # Only use get_definition to get a const if + # available. Otherwise use the variable. 
+ if not isinstance(step, (int, ir.Const)): + step = range_args[2] + except KeyError: + # If there is more than one definition possible for the + # step variable then just use the variable and don't try + # to convert to a const. + step = range_args[2] + if isinstance(step, ir.Const): + step = step.value + + if DEBUG_OPENMP >= 1: + print("size_var:", size_var, type(size_var)) + + omp_lb_var = loop_index.scope.redefine("$omp_lb", inst.loc) + before_start.append( + ir.Assign(ir.Const(0, inst.loc), omp_lb_var, inst.loc) + ) + + omp_iv_var = loop_index.scope.redefine("$omp_iv", inst.loc) + # before_start.append(ir.Assign(omp_lb_var, omp_iv_var, inst.loc)) + # Don't use omp_lb here because that makes a live-in to the region that + # becomes a parameter to an outlined target region. + after_start.append( + ir.Assign(ir.Const(0, inst.loc), omp_iv_var, inst.loc) + ) + # after_start.append(ir.Assign(omp_lb_var, omp_iv_var, inst.loc)) + + types_mod_var = loop_index.scope.redefine( + "$numba_types_mod", inst.loc + ) + types_mod = ir.Global("types", types, inst.loc) + types_mod_assign = ir.Assign(types_mod, types_mod_var, inst.loc) + before_start.append(types_mod_assign) + + int64_var = loop_index.scope.redefine("$int64_var", inst.loc) + int64_getattr = ir.Expr.getattr(types_mod_var, "int64", inst.loc) + int64_assign = ir.Assign(int64_getattr, int64_var, inst.loc) + before_start.append(int64_assign) + + get_itercount_var = loop_index.scope.redefine( + "$get_itercount", inst.loc + ) + get_itercount_global = ir.Global( + "get_itercount", get_itercount, inst.loc + ) + get_itercount_assign = ir.Assign( + get_itercount_global, get_itercount_var, inst.loc + ) + before_start.append(get_itercount_assign) + + itercount_var = loop_index.scope.redefine("$itercount", inst.loc) + itercount_expr = ir.Expr.call( + get_itercount_var, [getiter_inst.target], (), inst.loc + ) + # itercount_expr = ir.Expr.itercount(getiter_inst.target, inst.loc) + before_start.append( + ir.Assign(itercount_expr, itercount_var, inst.loc) + ) + + omp_ub_var = loop_index.scope.redefine("$omp_ub", inst.loc) + omp_ub_expr = ir.Expr.call(int64_var, [itercount_var], (), inst.loc) + before_start.append(ir.Assign(omp_ub_expr, omp_ub_var, inst.loc)) + + const1_var = loop_index.scope.redefine("$const1", inst.loc) + start_tags.append(openmp_tag("QUAL.OMP.PRIVATE", const1_var)) + const1_assign = ir.Assign( + ir.Const(1, inst.loc), const1_var, inst.loc + ) + before_start.append(const1_assign) + count_add_1 = ir.Expr.binop( + operator.sub, omp_ub_var, const1_var, inst.loc + ) + before_start.append(ir.Assign(count_add_1, omp_ub_var, inst.loc)) + + # before_start.append(ir.Print([omp_ub_var], None, inst.loc)) + + omp_start_var = loop_index.scope.redefine("$omp_start", inst.loc) + if start == 0: + start = ir.Const(start, inst.loc) + before_start.append(ir.Assign(start, omp_start_var, inst.loc)) + + # ---------- Create latch block ------------------------------- + latch_iv = omp_iv_var + + latch_block = ir.Block(scope, inst.loc) + const1_latch_var = loop_index.scope.redefine( + "$const1_latch", inst.loc + ) + start_tags.append(openmp_tag("QUAL.OMP.PRIVATE", const1_latch_var)) + const1_assign = ir.Assign( + ir.Const(1, inst.loc), const1_latch_var, inst.loc + ) + latch_block.body.append(const1_assign) + latch_assign = ir.Assign( + ir.Expr.binop( + operator.add, omp_iv_var, const1_latch_var, inst.loc + ), + latch_iv, + inst.loc, + ) + latch_block.body.append(latch_assign) + latch_block.body.append(ir.Jump(header, inst.loc)) + + self.blocks[latch_block_num] = 
latch_block + for bb in back_blocks: + if False: + str_var = scope.redefine("$str_var", inst.loc) + str_const = ir.Const("mid start:", inst.loc) + str_assign = ir.Assign(str_const, str_var, inst.loc) + str_print = ir.Print([str_var, size_var], None, inst.loc) + # before_start.append(str_assign) + # before_start.append(str_print) + self.blocks[bb].body = self.blocks[bb].body[:-1] + [ + str_assign, + str_print, + ir.Jump(latch_block_num, inst.loc), + ] + else: + self.blocks[bb].body[-1] = ir.Jump( + latch_block_num, inst.loc + ) + # ------------------------------------------------------------- + + # ---------- Header Manipulation ------------------------------ + step_var = loop_index.scope.redefine("$step_var", inst.loc) + detect_step_assign = ir.Assign( + ir.Const(0, inst.loc), step_var, inst.loc + ) + after_start.append(detect_step_assign) + + if isinstance(step, int): + step_assign = ir.Assign( + ir.Const(step, inst.loc), step_var, inst.loc + ) + elif isinstance(step, ir.Var): + step_assign = ir.Assign(step, step_var, inst.loc) + start_tags.append( + openmp_tag("QUAL.OMP.FIRSTPRIVATE", step.name) + ) + else: + print("Unsupported step:", step, type(step)) + raise NotImplementedError( + f"Unknown step type that isn't a constant or variable but {type(step)} instead." + ) + scale_var = loop_index.scope.redefine("$scale", inst.loc) + fake_iternext = ir.Assign( + ir.Const(0, inst.loc), iternext_inst.target, inst.loc + ) + fake_second = ir.Assign( + ir.Const(0, inst.loc), pair_second_inst.target, inst.loc + ) + scale_assign = ir.Assign( + ir.Expr.binop(operator.mul, step_var, omp_iv_var, inst.loc), + scale_var, + inst.loc, + ) + unnormalize_iv = ir.Assign( + ir.Expr.binop(operator.add, omp_start_var, scale_var, inst.loc), + loop_index, + inst.loc, + ) + cmp_var = loop_index.scope.redefine("$cmp", inst.loc) + iv_lte_ub = ir.Assign( + ir.Expr.binop(operator.le, omp_iv_var, omp_ub_var, inst.loc), + cmp_var, + inst.loc, + ) + old_branch = header_block.body[-1] + new_branch = ir.Branch( + cmp_var, old_branch.truebr, old_branch.falsebr, old_branch.loc + ) + body_label = old_branch.truebr + first_body_block = self.blocks[body_label] + new_end = [iv_lte_ub, new_branch] + # Turn this on to add printing to help debug at runtime. 
+ if False: + str_var = loop_index.scope.redefine("$str_var", inst.loc) + str_const = ir.Const("header1:", inst.loc) + str_assign = ir.Assign(str_const, str_var, inst.loc) + new_end.append(str_assign) + str_print = ir.Print( + [str_var, omp_start_var, omp_iv_var], None, inst.loc + ) + new_end.append(str_print) + + # Prepend original contents of header into the first body block minus the comparison + first_body_block.body = ( + [ + fake_iternext, + fake_second, + step_assign, + scale_assign, + unnormalize_iv, + ] + + header_block.body[:-1] + + first_body_block.body + ) + + header_block.body = new_end + # header_block.body = [fake_iternext, fake_second, unnormalize_iv] + header_block.body[:-1] + new_end + + # ------------------------------------------------------------- + + # const_start_var = loop_index.scope.redefine("$const_start", inst.loc) + # before_start.append(ir.Assign(ir.Const(0, inst.loc), const_start_var, inst.loc)) + # start_tags.append(openmp_tag("QUAL.OMP.FIRSTPRIVATE", const_start_var.name)) + start_tags.append( + openmp_tag("QUAL.OMP.NORMALIZED.IV", omp_iv_var.name) + ) + start_tags.append( + openmp_tag("QUAL.OMP.NORMALIZED.START", omp_start_var.name) + ) + start_tags.append( + openmp_tag("QUAL.OMP.NORMALIZED.LB", omp_lb_var.name) + ) + start_tags.append( + openmp_tag("QUAL.OMP.NORMALIZED.UB", omp_ub_var.name) + ) + start_tags.append(openmp_tag("QUAL.OMP.PRIVATE", omp_iv_var.name)) + start_tags.append( + openmp_tag("QUAL.OMP.FIRSTPRIVATE", omp_start_var.name) + ) + start_tags.append( + openmp_tag("QUAL.OMP.FIRSTPRIVATE", omp_lb_var.name) + ) + start_tags.append( + openmp_tag("QUAL.OMP.FIRSTPRIVATE", omp_ub_var.name) + ) + tags_for_enclosing = [ + cmp_var.name, + omp_lb_var.name, + omp_start_var.name, + omp_iv_var.name, + types_mod_var.name, + int64_var.name, + itercount_var.name, + omp_ub_var.name, + const1_var.name, + const1_latch_var.name, + get_itercount_var.name, + ] + [x.name for x in iterspace_vars] + tags_for_enclosing = [ + openmp_tag("QUAL.OMP.PRIVATE", x) for x in tags_for_enclosing + ] + # Don't blindly copy code here...this isn't doing what the other spots are doing with privatization. + add_tags_to_enclosing( + self.func_ir, self.blk_start, tags_for_enclosing + ) + # start_tags.append(openmp_tag("QUAL.OMP.NORMALIZED.IV", loop_index.name)) + # start_tags.append(openmp_tag("QUAL.OMP.NORMALIZED.UB", size_var.name)) + return ( + True, + loop_blocks_for_io, + loop_blocks_for_io_minus_entry, + entry_pred, + exit_block, + inst, + size_var, + step_var, + latest_index, + loop_index, + ) + + return False, None, None, None, None, None, None, None, None, None + + def some_for_directive( + self, args, main_start_tag, main_end_tag, first_clause, gen_shared + ): + if DEBUG_OPENMP >= 1: + print("some_for_directive", self.body_blocks) + start_tags = [openmp_tag(main_start_tag)] + end_tags = [openmp_tag(main_end_tag)] + clauses = self.some_data_clause_directive( + args, start_tags, end_tags, first_clause, has_loop=True + ) + + if "PARALLEL" in main_start_tag: + # ---- Back propagate THREAD_LIMIT to enclosed target region. ---- + self.parallel_back_prop(clauses) + + if len(list(filter(lambda x: x.name == "QUAL.OMP.NUM_THREADS", clauses))) > 1: + raise MultipleNumThreadsClauses( + f"Multiple num_threads clauses near line {self.loc} is not allowed in an OpenMP parallel region." 
+ ) + + # --------- Parser functions ------------------------ + + def barrier_directive(self, args): + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit barrier_directive", args, type(args)) + or_start = openmp_region_start([openmp_tag("DIR.OMP.BARRIER")], 0, self.loc) + or_start.requires_combined_acquire_release() + or_end = openmp_region_end( + or_start, [openmp_tag("DIR.OMP.END.BARRIER")], self.loc + ) + sblk.body = [or_start] + [or_end] + sblk.body[:] + + def taskwait_directive(self, args): + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit taskwait_directive", args, type(args)) + or_start = openmp_region_start([openmp_tag("DIR.OMP.TASKWAIT")], 0, self.loc) + or_start.requires_combined_acquire_release() + or_end = openmp_region_end( + or_start, [openmp_tag("DIR.OMP.END.TASKWAIT")], self.loc + ) + sblk.body = [or_start] + [or_end] + sblk.body[:] + + def taskyield_directive(self, args): + raise NotImplementedError("Taskyield currently unsupported.") + + # Don't need a rule for BARRIER. + # Don't need a rule for TASKWAIT. + # Don't need a rule for TASKYIELD. + + def taskgroup_directive(self, args): + raise NotImplementedError("Taskgroup currently unsupported.") + + # Don't need a rule for taskgroup_construct. + # Don't need a rule for TASKGROUP. + + # Don't need a rule for openmp_construct. + + # def teams_distribute_parallel_for_simd_clause(self, args): + # raise NotImplementedError("""Simd clause for target teams + # distribute parallel loop currently unsupported.""") + # if DEBUG_OPENMP >= 1: + # print("visit device_clause", args, type(args)) + + # Don't need a rule for for_simd_construct. + + def for_simd_directive(self, args): + raise NotImplementedError("For simd currently unsupported.") + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit for_simd_directive", args, type(args)) + + def for_simd_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit for_simd_clause", args, type(args), args[0]) + return args[0] + + def schedule_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit schedule_clause", args, type(args), args[0]) + return args[0] + + def dist_schedule_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit dist_schedule_clause", args, type(args), args[0]) + return args[0] + + # Don't need a rule for parallel_for_simd_construct. + + def parallel_for_simd_directive(self, args): + raise NotImplementedError("Parallel for simd currently unsupported.") + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit parallel_for_simd_directive", args, type(args)) + + def parallel_for_simd_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit parallel_for_simd_clause", args, type(args), args[0]) + return args[0] + + # Don't need a rule for target_data_construct. 
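+    # target_data_directive builds DIR.OMP.TARGET.DATA start/end tags from the
+    # flattened clauses, drops clauses for variables unused in the region, and
+    # brackets the region's body blocks with the resulting region markers.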
+ + def target_data_directive(self, args): + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit target_data_directive", args, type(args)) + + before_start = [] + after_start = [] + + clauses, default_shared = self.flatten(args[2:], sblk) + + if DEBUG_OPENMP >= 1: + for clause in clauses: + print("final clause:", clause) + + inputs_to_region, def_but_live_out, private_to_region, live_map = ( + self.find_io_vars(self.body_blocks) + ) + used_in_region = inputs_to_region | def_but_live_out | private_to_region + clauses = self.filter_unused_vars(clauses, used_in_region) + + start_tags = [openmp_tag("DIR.OMP.TARGET.DATA")] + clauses + end_tags = [openmp_tag("DIR.OMP.END.TARGET.DATA")] + + or_start = openmp_region_start(start_tags, 0, self.loc) + or_end = openmp_region_end(or_start, end_tags, self.loc) + sblk.body = before_start + [or_start] + after_start + sblk.body[:] + eblk.body = [or_end] + eblk.body[:] + + add_enclosing_region(self.func_ir, self.body_blocks, or_start) + + # Don't need a rule for DATA. + + def target_data_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit target_data_clause", args, type(args), args[0]) + (val,) = args + if isinstance(val, openmp_tag): + return [val] + elif isinstance(val, list): + return val + elif val == "nowait": + return openmp_tag("QUAL.OMP.NOWAIT") + else: + return val + + def target_enter_data_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit target_enter_data_clause", args, type(args), args[0]) + (val,) = args + if isinstance(val, openmp_tag): + return [val] + elif isinstance(val, list): + return val + elif val == "nowait": + return openmp_tag("QUAL.OMP.NOWAIT") + else: + return val + + def target_exit_data_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit target_exit_data_clause", args, type(args), args[0]) + (val,) = args + if isinstance(val, openmp_tag): + return [val] + elif isinstance(val, list): + return val + elif val == "nowait": + return openmp_tag("QUAL.OMP.NOWAIT") + else: + return val + + def device_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit device_clause", args, type(args)) + return [openmp_tag("QUAL.OMP.DEVICE", args[0])] + + def map_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit map_clause", args, type(args), args[0]) + if args[0] in ["to", "from", "alloc", "tofrom"]: + map_type = args[0].upper() + var_list = args[1] + assert len(args) == 2 + else: + map_type = "TOFROM" # is this default right? FIX ME + var_list = args[0] + ret = [] + for var in var_list: + ret.append(openmp_tag("QUAL.OMP.MAP." + map_type, var)) + return ret + + def map_enter_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit map_enter_clause", args, type(args), args[0]) + assert args[0] in ["to", "alloc"] + map_type = args[0].upper() + var_list = args[1] + assert len(args) == 2 + ret = [] + for var in var_list: + ret.append(openmp_tag("QUAL.OMP.MAP." + map_type, var)) + return ret + + def map_exit_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit map_exit_clause", args, type(args), args[0]) + assert args[0] in ["from", "release", "delete"] + map_type = args[0].upper() + var_list = args[1] + assert len(args) == 2 + ret = [] + for var in var_list: + ret.append(openmp_tag("QUAL.OMP.MAP." 
+ map_type, var)) + return ret + + def depend_with_modifier_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit depend_with_modifier_clause", args, type(args), args[0]) + dep_type = args[1].upper() + var_list = args[2] + assert len(args) == 3 + ret = [] + for var in var_list: + ret.append(openmp_tag("QUAL.OMP.DEPEND." + dep_type, var)) + return ret + + def map_type(self, args): + if DEBUG_OPENMP >= 1: + print("visit map_type", args, type(args), args[0]) + return str(args[0]) + + def map_enter_type(self, args): + if DEBUG_OPENMP >= 1: + print("visit map_enter_type", args, type(args), args[0]) + return str(args[0]) + + def map_exit_type(self, args): + if DEBUG_OPENMP >= 1: + print("visit map_exit_type", args, type(args), args[0]) + return str(args[0]) + + def update_motion_type(self, args): + if DEBUG_OPENMP >= 1: + print("visit update_motion_type", args, type(args), args[0]) + return str(args[0]) + + # Don't need a rule for TO. + # Don't need a rule for FROM. + # Don't need a rule for ALLOC. + # Don't need a rule for TOFROM. + # Don't need a rule for parallel_sections_construct. + + def parallel_sections_directive(self, args): + raise NotImplementedError("Parallel sections currently unsupported.") + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit parallel_sections_directive", args, type(args)) + + def parallel_sections_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit parallel_sections_clause", args, type(args), args[0]) + return args[0] + + # Don't need a rule for sections_construct. + + def sections_directive(self, args): + raise NotImplementedError("Sections directive currently unsupported.") + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit sections_directive", args, type(args)) + + # Don't need a rule for SECTIONS. + + def sections_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit sections_clause", args, type(args), args[0]) + return args[0] + + # Don't need a rule for section_construct. + + def section_directive(self, args): + raise NotImplementedError("Section directive currently unsupported.") + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit section_directive", args, type(args)) + + # Don't need a rule for SECTION. + # Don't need a rule for atomic_construct. + + def atomic_directive(self, args): + raise NotImplementedError("Atomic currently unsupported.") + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit atomic_directive", args, type(args)) + + # Don't need a rule for ATOMIC. + + def atomic_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit atomic_clause", args, type(args), args[0]) + return args[0] + + # Don't need a rule for READ. + # Don't need a rule for WRITE. + # Don't need a rule for UPDATE. + # Don't need a rule for CAPTURE. + # Don't need a rule for seq_cst_clause. + # Don't need a rule for critical_construct. 
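+    # critical_directive wraps the region in DIR.OMP.CRITICAL start/end markers
+    # with acquire/release semantics and, on exit, copies values defined inside
+    # the region back to the SSA names that were live coming into the region.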
+ + def critical_directive(self, args): + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + scope = sblk.scope + + if DEBUG_OPENMP >= 1: + print("visit critical_directive", args, type(args)) + or_start = openmp_region_start([openmp_tag("DIR.OMP.CRITICAL")], 0, self.loc) + or_start.requires_acquire_release() + or_end = openmp_region_end( + or_start, [openmp_tag("DIR.OMP.END.CRITICAL")], self.loc + ) + + inputs_to_region, def_but_live_out, private_to_region, live_map = ( + self.find_io_vars(self.body_blocks) + ) + inputs_to_region = {remove_ssa(x, scope, self.loc): x for x in inputs_to_region} + def_but_live_out = {remove_ssa(x, scope, self.loc): x for x in def_but_live_out} + common_keys = inputs_to_region.keys() & def_but_live_out.keys() + in_def_live_out = { + inputs_to_region[k]: def_but_live_out[k] for k in common_keys + } + if DEBUG_OPENMP >= 1: + print("inputs_to_region:", sorted(inputs_to_region)) + print("def_but_live_out:", sorted(def_but_live_out)) + print("in_def_live_out:", sorted(in_def_live_out)) + + reset = [] + for k, v in in_def_live_out.items(): + reset.append( + ir.Assign( + ir.Var(scope, v, self.loc), ir.Var(scope, k, self.loc), self.loc + ) + ) + + sblk.body = [or_start] + sblk.body[:] + eblk.body = reset + [or_end] + eblk.body[:] + + # Don't need a rule for CRITICAL. + # Don't need a rule for target_construct. + # Don't need a rule for target_teams_distribute_parallel_for_simd_construct. + + def teams_back_prop(self, clauses): + enclosing_regions = get_enclosing_region(self.func_ir, self.blk_start) + if DEBUG_OPENMP >= 1: + print("teams enclosing_regions:", enclosing_regions) + if not enclosing_regions: + return + + for enclosing_region in enclosing_regions[::-1]: + if not self.get_directive_match(enclosing_region.tags, "DIR.OMP.TARGET"): + continue + + nt_tag = self.get_clauses_by_name( + enclosing_region.tags, "QUAL.OMP.NUM_TEAMS" + ) + assert len(nt_tag) == 1 + cur_num_team_clauses = self.get_clauses_by_name( + clauses, "QUAL.OMP.NUM_TEAMS", remove_from_orig=True + ) + if len(cur_num_team_clauses) >= 1: + nt_tag[-1].arg = cur_num_team_clauses[-1].arg + else: + nt_tag[-1].arg = 0 + + nt_tag = self.get_clauses_by_name( + enclosing_region.tags, "QUAL.OMP.THREAD_LIMIT" + ) + assert len(nt_tag) == 1 + cur_num_team_clauses = self.get_clauses_by_name( + clauses, "QUAL.OMP.THREAD_LIMIT", remove_from_orig=True + ) + if len(cur_num_team_clauses) >= 1: + nt_tag[-1].arg = cur_num_team_clauses[-1].arg + else: + nt_tag[-1].arg = 0 + + return + + def check_distribute_nesting(self, dir_tag): + if "DISTRIBUTE" in dir_tag and "TEAMS" not in dir_tag: + enclosing_regions = get_enclosing_region(self.func_ir, self.blk_start) + if ( + len(enclosing_regions) < 1 + or "TEAMS" not in enclosing_regions[-1].tags[0].name + ): + raise NotImplementedError( + "DISTRIBUTE must be nested under or combined with TEAMS." 
+ ) + + def teams_directive(self, args): + if DEBUG_OPENMP >= 1: + print( + "visit teams_directive", args, type(args), self.blk_start, self.blk_end + ) + start_tags = [openmp_tag("DIR.OMP.TEAMS")] + end_tags = [openmp_tag("DIR.OMP.END.TEAMS")] + clauses = self.some_data_clause_directive(args, start_tags, end_tags, 1) + + self.teams_back_prop(clauses) + + def target_directive(self, args): + if sys.platform.startswith("darwin"): + print("ERROR: OpenMP target offloading is unavailable on Darwin") + sys.exit(-1) + self.some_target_directive(args, "TARGET", 1) + + def target_teams_directive(self, args): + self.some_target_directive(args, "TARGET.TEAMS", 2) + + def target_teams_distribute_directive(self, args): + self.some_target_directive(args, "TARGET.TEAMS.DISTRIBUTE", 3, has_loop=True) + + def target_loop_directive(self, args): + self.some_target_directive( + args, "TARGET.TEAMS.DISTRIBUTE.PARALLEL.LOOP", 2, has_loop=True + ) + + def target_teams_loop_directive(self, args): + self.some_target_directive( + args, "TARGET.TEAMS.DISTRIBUTE.PARALLEL.LOOP", 3, has_loop=True + ) + + def target_teams_distribute_parallel_for_directive(self, args): + self.some_target_directive( + args, "TARGET.TEAMS.DISTRIBUTE.PARALLEL.LOOP", 5, has_loop=True + ) + + def target_teams_distribute_parallel_for_simd_directive(self, args): + # Intentionally dropping "SIMD" from string as that typically isn't implemented on GPU. + self.some_target_directive( + args, "TARGET.TEAMS.DISTRIBUTE.PARALLEL.LOOP", 6, has_loop=True + ) + + def get_clauses_by_name(self, clauses, names, remove_from_orig=False): + if not isinstance(names, list): + names = [names] + + ret = list(filter(lambda x: x.name in names, clauses)) + if remove_from_orig: + clauses[:] = list(filter(lambda x: x.name not in names, clauses)) + return ret + + def get_clauses_by_start(self, clauses, names, remove_from_orig=False): + if not isinstance(names, list): + names = [names] + ret = list( + filter(lambda x: any([x.name.startswith(y) for y in names]), clauses) + ) + if remove_from_orig: + clauses[:] = list( + filter( + lambda x: any([not x.name.startswith(y) for y in names]), clauses + ) + ) + return ret + + def get_clauses_if_contains(self, clauses, names, remove_from_orig=False): + if not isinstance(names, list): + names = [names] + ret = list(filter(lambda x: any([y in x.name for y in names]), clauses)) + if remove_from_orig: + clauses[:] = list( + filter(lambda x: any([not y in x.name for y in names]), clauses) + ) + return ret + + def get_directive_if_contains(self, tags, name): + dir = [x for x in tags if x.name.startswith("DIR")] + assert len(dir) == 1, "Expected one directive tag" + ret = [x for x in dir if name in x.name] + return ret + + def get_directive_match(self, tags, name): + dir = [x for x in tags if x.name.startswith("DIR")] + assert len(dir) == 1, "Expected one directive tag" + ret = [x for x in dir if name == x.name] + return ret + + def target_enter_data_directive(self, args): + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit target_enter_data_directive", args, type(args)) + + clauses, _ = self.flatten(args[3:], sblk) + or_start = openmp_region_start( + [openmp_tag("DIR.OMP.TARGET.ENTER.DATA")] + clauses, 0, self.loc + ) + or_end = openmp_region_end( + or_start, [openmp_tag("DIR.OMP.END.TARGET.ENTER.DATA")], self.loc + ) + sblk.body = [or_start] + [or_end] + sblk.body[:] + + def target_exit_data_directive(self, args): + sblk = self.blocks[self.blk_start] + eblk = 
self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit target_exit_data_directive", args, type(args)) + + clauses, _ = self.flatten(args[3:], sblk) + or_start = openmp_region_start( + [openmp_tag("DIR.OMP.TARGET.EXIT.DATA")] + clauses, 0, self.loc + ) + or_end = openmp_region_end( + or_start, [openmp_tag("DIR.OMP.END.TARGET.EXIT.DATA")], self.loc + ) + sblk.body = [or_start] + [or_end] + sblk.body[:] + + def teams_distribute_parallel_for_simd_directive(self, args): + self.some_distribute_directive( + args, "TEAMS.DISTRIBUTE.PARALLEL.LOOP.SIMD", 5, has_loop=True + ) + + def teams_distribute_parallel_for_directive(self, args): + self.some_distribute_directive( + args, "TEAMS.DISTRIBUTE.PARALLEL.LOOP", 4, has_loop=True + ) + + def teams_distribute_directive(self, args): + self.some_distribute_directive(args, "TEAMS.DISTRIBUTE", 2, has_loop=True) + + def teams_distribute_simd_directive(self, args): + self.some_distribute_directive(args, "TEAMS.DISTRIBUTE.SIMD", 3, has_loop=True) + + def teams_loop_directive(self, args): + self.some_distribute_directive( + args, "TEAMS.DISTRIBUTE.PARALLEL.LOOP", 2, has_loop=True + ) + + def loop_directive(self, args): + # TODO Add error checking that a clause that the parser accepts if we find that + # loop can even take clauses, which we're not sure that it can. + enclosing_regions = get_enclosing_region(self.func_ir, self.blk_start) + if not enclosing_regions or len(enclosing_regions) < 1: + self.some_for_directive( + args, "DIR.OMP.PARALLEL.LOOP", "DIR.OMP.END.PARALLEL.LOOP", 1, True + ) + else: + if "DISTRIBUTE" in enclosing_regions[-1].tags[0].name: + self.some_distribute_directive(args, "PARALLEL.LOOP", 1, has_loop=True) + elif "TEAMS" in enclosing_regions[-1].tags[0].name: + self.some_distribute_directive( + args, "DISTRIBUTE.PARALLEL.LOOP", 1, has_loop=True + ) + else: + if "TARGET" in enclosing_regions[-1].tags[0].name: + self.some_distribute_directive( + args, "TEAMS.DISTRIBUTE.PARALLEL.LOOP", 1, has_loop=True + ) + else: + self.some_for_directive( + args, + "DIR.OMP.PARALLEL.LOOP", + "DIR.OMP.END.PARALLEL.LOOP", + 1, + True, + ) + + def distribute_directive(self, args): + self.some_distribute_directive(args, "DISTRIBUTE", 1, has_loop=True) + + def distribute_simd_directive(self, args): + self.some_distribute_directive(args, "DISTRIBUTE.SIMD", 2, has_loop=True) + + def distribute_parallel_for_directive(self, args): + self.some_distribute_directive( + args, "DISTRIBUTE.PARALLEL.LOOP", 3, has_loop=True + ) + + def distribute_parallel_for_simd_directive(self, args): + self.some_distribute_directive( + args, "DISTRIBUTE.PARALLEL.LOOP.SIMD", 4, has_loop=True + ) + + def some_distribute_directive(self, args, dir_tag, lexer_count, has_loop=False): + if DEBUG_OPENMP >= 1: + print( + "visit some_distribute_directive", + args, + type(args), + self.blk_start, + self.blk_end, + ) + + self.check_distribute_nesting(dir_tag) + + target_num = OpenmpVisitor.target_num + OpenmpVisitor.target_num += 1 + + dir_start_tag = "DIR.OMP." + dir_tag + dir_end_tag = "DIR.OMP.END." + dir_tag + start_tags = [openmp_tag(dir_start_tag, target_num)] + end_tags = [openmp_tag(dir_end_tag, target_num)] + + sblk = self.blocks[self.blk_start] + clauses, _ = self.flatten(args[lexer_count:], sblk) + + if "TEAMS" in dir_tag: + # NUM_TEAMS, THREAD_LIMIT are not in clauses, set them to 0 to + # use runtime defaults in teams, thread launching. 
+ if len(self.get_clauses_by_name(clauses, "QUAL.OMP.NUM_TEAMS")) == 0: + start_tags.append(openmp_tag("QUAL.OMP.NUM_TEAMS", 0)) + if len(self.get_clauses_by_name(clauses, "QUAL.OMP.THREAD_LIMIT")) == 0: + start_tags.append(openmp_tag("QUAL.OMP.THREAD_LIMIT", 0)) + self.teams_back_prop(clauses) + elif "PARALLEL" in dir_tag: + self.parallel_back_prop(clauses) + + if DEBUG_OPENMP >= 1: + for clause in clauses: + print("target clause:", clause) + + self.some_data_clause_directive( + clauses, start_tags, end_tags, 0, has_loop=has_loop, for_target=False + ) + + def some_target_directive(self, args, dir_tag, lexer_count, has_loop=False): + if DEBUG_OPENMP >= 1: + print( + "visit some_target_directive", + args, + type(args), + self.blk_start, + self.blk_end, + ) + + self.check_distribute_nesting(dir_tag) + + target_num = OpenmpVisitor.target_num + OpenmpVisitor.target_num += 1 + + dir_start_tag = "DIR.OMP." + dir_tag + dir_end_tag = "DIR.OMP.END." + dir_tag + start_tags = [openmp_tag(dir_start_tag, target_num)] + end_tags = [openmp_tag(dir_end_tag, target_num)] + + sblk = self.blocks[self.blk_start] + clauses, _ = self.flatten(args[lexer_count:], sblk) + + if "TEAMS" in dir_tag: + # When NUM_TEAMS, THREAD_LIMIT are not in clauses, set them to 0 to + # use runtime defaults in teams, thread launching, otherwise use + # existing clauses. + clause_num_teams = self.get_clauses_by_name(clauses, "QUAL.OMP.NUM_TEAMS") + if not clause_num_teams: + start_tags.append(openmp_tag("QUAL.OMP.NUM_TEAMS", 0)) + + # Use the THREAD_LIMIT clause value if it exists, regardless of a + # combined PARALLEL (see + # https://www.openmp.org/spec-html/5.0/openmpse15.html) since + # THREAD_LIMIT takes precedence. If clause does not exist, set to 0 + # or to NUM_THREADS of the combined PARALLEL (if this exists). + clause_thread_limit = self.get_clauses_by_name( + clauses, "QUAL.OMP.THREAD_LIMIT" + ) + if not clause_thread_limit: + thread_limit = 0 + if "PARALLEL" in dir_tag: + clause_num_threads = self.get_clauses_by_name( + clauses, "QUAL.OMP.NUM_THREADS" + ) + if clause_num_threads: + assert len(clause_num_threads) == 1, ( + "Expected single NUM_THREADS clause" + ) + thread_limit = clause_num_threads[0].arg + start_tags.append(openmp_tag("QUAL.OMP.THREAD_LIMIT", thread_limit)) + elif "PARALLEL" in dir_tag: + # PARALLEL in the directive (without TEAMS), set THREAD_LIMIT to NUM_THREADS clause + # (if NUM_THREADS exists), or 0 (if NUM_THREADS does not exist) + num_threads = 0 + clause_num_threads = self.get_clauses_by_name( + clauses, "QUAL.OMP.NUM_THREADS" + ) + if clause_num_threads: + assert len(clause_num_threads) == 1, ( + "Expected single NUM_THREADS clause" + ) + num_threads = clause_num_threads[0].arg + + # Replace existing THREAD_LIMIT clause. + clause_thread_limit = self.get_clauses_by_name( + clauses, "QUAL.OMP.THREAD_LIMIT", remove_from_orig=True + ) + clauses.append(openmp_tag("QUAL.OMP.THREAD_LIMIT", num_threads)) + else: + # Neither TEAMS or PARALLEL in directive, set teams, threads to 1. 
+ start_tags.append(openmp_tag("QUAL.OMP.NUM_TEAMS", 1)) + start_tags.append(openmp_tag("QUAL.OMP.THREAD_LIMIT", 1)) + + if DEBUG_OPENMP >= 1: + for clause in clauses: + print("target clause:", clause) + + self.some_data_clause_directive( + clauses, start_tags, end_tags, 0, has_loop=has_loop, for_target=True + ) + # self.some_data_clause_directive(args, start_tags, end_tags, lexer_count, has_loop=has_loop) + + def add_to_returns(self, stmts): + for blk in self.blocks.values(): + if isinstance(blk.body[-1], ir.Return): + blk.body = blk.body[:-1] + stmts + [blk.body[-1]] + + def add_block_in_order(self, new_block, insert_after_block): + """Insert a new block after the specified block while maintaining topological order""" + new_blocks = {} + # Copy blocks up to and including insert_after_block + for label, block in self.blocks.items(): + new_blocks[label] = block + if label == insert_after_block: + # Insert new block right after + # We add a fractional to make sure the block is sorted right + # after the insert_after_block and before its successor. + # TODO: Avoid this fractional addition. + new_block_num = label + 0.1 + new_blocks[new_block_num] = new_block + # Copy remaining blocks + for label, block in self.blocks.items(): + if label > insert_after_block: + new_blocks[label] = block + # new_blocks = flatten_labels(new_blocks) + self.blocks.clear() + self.blocks.update(new_blocks) + return new_block_num + + def some_data_clause_directive( + self, + args, + start_tags, + end_tags, + lexer_count, + has_loop=False, + for_target=False, + for_task=False, + ): + if DEBUG_OPENMP >= 1: + print( + "visit some_data_clause_directive", + args, + type(args), + self.blk_start, + self.blk_end, + ) + assert not (for_target and for_task) + + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + scope = sblk.scope + + if DEBUG_OPENMP >= 1: + for clause in args[lexer_count:]: + print("pre clause:", clause) + clauses, default_shared = self.flatten(args[lexer_count:], sblk) + if DEBUG_OPENMP >= 1: + for clause in clauses: + print("final clause:", clause) + + before_start = [] + after_start = [] + for_before_start = [] + for_after_start = [] + + # Get a dict mapping variables explicitly mentioned in the data clauses above to their openmp_tag. + vars_in_explicit_clauses, explicit_privates, non_user_explicits = ( + self.get_explicit_vars(clauses) + ) + if DEBUG_OPENMP >= 1: + print( + "vars_in_explicit_clauses:", + sorted(vars_in_explicit_clauses), + type(vars_in_explicit_clauses), + ) + for v in clauses: + print("vars_in_explicit clauses first:", v) + + if has_loop: + prepare_out = self.prepare_for_directive( + clauses, + vars_in_explicit_clauses, + for_before_start, + for_after_start, + start_tags, + end_tags, + scope, + ) + vars_in_explicit_clauses, explicit_privates, non_user_explicits = ( + self.get_explicit_vars(clauses) + ) + ( + found_loop, + blocks_for_io, + blocks_in_region, + entry_pred, + exit_block, + inst, + size_var, + step_var, + latest_index, + loop_index, + ) = prepare_out + assert found_loop + else: + blocks_for_io = self.body_blocks + blocks_in_region = get_blocks_between_start_end( + self.blocks, self.blk_start, self.blk_end + ) + entry_pred = sblk + exit_block = eblk + + # Do an analysis to get variable use information coming into and out of the region. 
+ inputs_to_region, def_but_live_out, private_to_region, live_map = ( + self.find_io_vars(blocks_for_io) + ) + live_out_copy = copy.copy(def_but_live_out) + + if DEBUG_OPENMP >= 1: + print("inputs_to_region:", sorted(inputs_to_region)) + print("def_but_live_out:", sorted(def_but_live_out)) + print("private_to_region:", sorted(private_to_region)) + for v in clauses: + print("clause after find_io_vars:", v) + + # Remove variables the user explicitly added to a clause from the auto-determined variables. + # This will also treat SSA forms of vars the same as their explicit Python var clauses. + self.remove_explicit_from_io_vars( + inputs_to_region, + def_but_live_out, + private_to_region, + vars_in_explicit_clauses, + clauses, + non_user_explicits, + scope, + self.loc, + ) + + if DEBUG_OPENMP >= 1: + for v in clauses: + print("clause after remove_explicit_from_io_vars:", v) + + if DEBUG_OPENMP >= 1: + for k, v in vars_in_explicit_clauses.items(): + print("vars_in_explicit before:", k, v) + for v in clauses: + print("vars_in_explicit clauses before:", v) + for k, v in non_user_explicits.items(): + print("non_user_explicits before:", k, v) + + if DEBUG_OPENMP >= 1: + print("inputs_to_region after remove_explicit:", sorted(inputs_to_region)) + print("def_but_live_out after remove_explicit:", sorted(def_but_live_out)) + print("private_to_region after remove_explicit:", sorted(private_to_region)) + + if not default_shared and ( + has_user_defined_var(inputs_to_region) + or has_user_defined_var(def_but_live_out) + or has_user_defined_var(private_to_region) + ): + user_defined_inputs = get_user_defined_var(inputs_to_region) + user_defined_def_live = get_user_defined_var(def_but_live_out) + user_defined_private = get_user_defined_var(private_to_region) + if DEBUG_OPENMP >= 1: + print("inputs users:", sorted(user_defined_inputs)) + print("def users:", sorted(user_defined_def_live)) + print("private users:", sorted(user_defined_private)) + raise UnspecifiedVarInDefaultNone( + "Variables with no data env clause in OpenMP region: " + + str( + user_defined_inputs.union(user_defined_def_live).union( + user_defined_private + ) + ) + ) + + if for_target: + self.make_implicit_explicit_target( + scope, + vars_in_explicit_clauses, + clauses, + True, + inputs_to_region, + def_but_live_out, + private_to_region, + ) + elif for_task: + self.make_implicit_explicit( + scope, + vars_in_explicit_clauses, + clauses, + True, + inputs_to_region, + def_but_live_out, + private_to_region, + for_task=get_enclosing_region(self.func_ir, self.blk_start), + ) + else: + self.make_implicit_explicit( + scope, + vars_in_explicit_clauses, + clauses, + True, + inputs_to_region, + def_but_live_out, + private_to_region, + ) + if DEBUG_OPENMP >= 1: + for k, v in vars_in_explicit_clauses.items(): + print("vars_in_explicit after:", k, v) + for v in clauses: + print("vars_in_explicit clauses after:", v) + vars_in_explicit_clauses, explicit_privates, non_user_explicits = ( + self.get_explicit_vars(clauses) + ) + if DEBUG_OPENMP >= 1: + print("post get_explicit_vars:", explicit_privates) + for k, v in vars_in_explicit_clauses.items(): + print("vars_in_explicit post:", k, v) + if DEBUG_OPENMP >= 1: + print("blocks_in_region:", blocks_in_region) + + self.make_consts_unliteral_for_privates(explicit_privates, self.blocks) + + # Returns a dict of private clause variables and their potentially SSA form at the end of the region. 
+ clause_privates = self.get_clause_privates( + clauses, live_out_copy, scope, self.loc + ) + + if DEBUG_OPENMP >= 1: + print("clause_privates:", sorted(clause_privates), type(clause_privates)) + print("inputs_to_region:", sorted(inputs_to_region)) + print("def_but_live_out:", sorted(def_but_live_out)) + print("live_out_copy:", sorted(live_out_copy)) + print("private_to_region:", sorted(private_to_region)) + + keep_alive = [] + tags_for_enclosing = self.add_explicits_to_start( + scope, vars_in_explicit_clauses, clauses, True, start_tags, keep_alive + ) + add_tags_to_enclosing(self.func_ir, self.blk_start, tags_for_enclosing) + + # or_start = openmp_region_start([openmp_tag("DIR.OMP.TARGET", target_num)] + clauses, 0, self.loc) + # or_end = openmp_region_end(or_start, [openmp_tag("DIR.OMP.END.TARGET", target_num)], self.loc) + # new_header_block_num = max(self.blocks.keys()) + 1 + + firstprivate_dead_after = list( + filter( + lambda x: x.name == "QUAL.OMP.FIRSTPRIVATE" + and x.arg not in live_map[self.blk_end], + start_tags, + ) + ) + + or_start = openmp_region_start( + start_tags, 0, self.loc, firstprivate_dead_after=firstprivate_dead_after + ) + or_end = openmp_region_end(or_start, end_tags, self.loc) + + if DEBUG_OPENMP >= 1: + for x in keep_alive: + print("keep_alive:", x) + for x in firstprivate_dead_after: + print("firstprivate_dead_after:", x) + + # Adding the openmp tags in topo order to avoid problems with code + # generation and with_lifting legalization. + # TODO: we should remove the requirement to process in topo order. There + # is state depending on topo order processing. + if has_loop: + new_header_block = ir.Block(scope, self.loc) + new_header_block.body = ( + [or_start] + after_start + for_after_start + [entry_pred.body[-1]] + ) + new_block_num = self.add_block_in_order(new_header_block, self.blk_start) + entry_pred.body = ( + entry_pred.body[:-1] + + before_start + + for_before_start + + [ir.Jump(new_block_num, self.loc)] + ) + + if for_task: + exit_block.body = [or_end] + exit_block.body + self.add_to_returns(keep_alive) + else: + exit_block.body = [or_end] + keep_alive + exit_block.body + else: + new_header_block = ir.Block(scope, self.loc) + new_header_block.body = [or_start] + after_start + sblk.body[:] + new_header_block_num = self.add_block_in_order( + new_header_block, self.blk_start + ) + sblk.body = before_start + [ir.Jump(new_header_block_num, self.loc)] + + # NOTE: or_start could also be inlined for correct codegen as + # follows. Favoring the add_block_in_order method for consistency. 
+ # sblk.body = before_start + [or_start] + after_start + sblk.body[:] + + if for_task: + eblk.body = [or_end] + eblk.body[:] + self.add_to_returns(keep_alive) + else: + eblk.body = [or_end] + keep_alive + eblk.body[:] + + add_enclosing_region(self.func_ir, self.body_blocks, or_start) + return clauses + + def target_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit target_clause", args, type(args), args[0]) + if isinstance(args[0], list): + print(args[0][0]) + (val,) = args + if isinstance(val, openmp_tag): + return [val] + elif isinstance(val, list): + return val + elif val == "nowait": + return openmp_tag("QUAL.OMP.NOWAIT") + else: + return val + # return args[0] + + def target_teams_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit target_teams_clause", args, type(args), args[0]) + if isinstance(args[0], list): + print(args[0][0]) + return args[0] + + def target_teams_distribute_parallel_for_simd_clause(self, args): + if DEBUG_OPENMP >= 1: + print( + "visit target_teams_distribute_parallel_for_simd_clause", + args, + type(args), + args[0], + ) + if isinstance(args[0], list): + print(args[0][0]) + return args[0] + + def teams_distribute_parallel_for_simd_clause(self, args): + if DEBUG_OPENMP >= 1: + print( + "visit teams_distribute_parallel_for_simd_clause", + args, + type(args), + args[0], + ) + if isinstance(args[0], list): + print(args[0][0]) + return args[0] + + def teams_distribute_parallel_for_clause(self, args): + if DEBUG_OPENMP >= 1: + print( + "visit teams_distribute_parallel_for_clause", args, type(args), args[0] + ) + if isinstance(args[0], list): + print(args[0][0]) + return args[0] + + def distribute_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit distribute_clause", args, type(args), args[0]) + if isinstance(args[0], list): + print(args[0][0]) + return args[0] + + def teams_distribute_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit teams_distribute_clause", args, type(args), args[0]) + if isinstance(args[0], list): + print(args[0][0]) + return args[0] + + def teams_distribute_simd_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit teams_distribute_simd_clause", args, type(args), args[0]) + if isinstance(args[0], list): + print(args[0][0]) + return args[0] + + def distribute_parallel_for_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit distribute_parallel_for_clause", args, type(args), args[0]) + if isinstance(args[0], list): + print(args[0][0]) + return args[0] + + def target_teams_distribute_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit target_teams_distribute_clause", args, type(args), args[0]) + if isinstance(args[0], list): + print(args[0][0]) + return args[0] + + def target_teams_distribute_parallel_for_clause(self, args): + if DEBUG_OPENMP >= 1: + print( + "visit target_teams_distribute_parallel_for_clause", + args, + type(args), + args[0], + ) + if isinstance(args[0], list): + print(args[0][0]) + return args[0] + + # Don't need a rule for target_update_construct. 
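+
+    # Sketch of a directive string the update visitors below handle (variable
+    # names are illustrative):
+    #   "target update to(a) from(b[0:n])"
+    # Each motion clause expands to one QUAL.OMP.MAP.TO / QUAL.OMP.MAP.FROM
+    # tag per listed variable or array section.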
+ + def target_update_directive(self, args): + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit target_update_directive", args, type(args)) + clauses, _ = self.flatten(args[2:], sblk) + or_start = openmp_region_start( + [openmp_tag("DIR.OMP.TARGET.UPDATE")] + clauses, 0, self.loc + ) + or_end = openmp_region_end( + or_start, [openmp_tag("DIR.OMP.END.TARGET.UPDATE")], self.loc + ) + sblk.body = [or_start] + [or_end] + sblk.body[:] + + def target_update_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit target_update_clause", args, type(args), args[0]) + # return args[0] + (val,) = args + if isinstance(val, openmp_tag): + return [val] + elif isinstance(val, list): + return val + else: + return val + + def motion_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit motion_clause", args, type(args)) + assert args[0] in ["to", "from"] + map_type = args[0].upper() + var_list = args[1] + assert len(args) == 2 + ret = [] + for var in var_list: + ret.append(openmp_tag("QUAL.OMP.MAP." + map_type, var)) + return ret + + def variable_array_section_list(self, args): + if DEBUG_OPENMP >= 1: + print("visit variable_array_section_list", args, type(args)) + if len(args) == 1: + return args + else: + args[0].append(args[1]) + return args[0] + + """ + def array_section(self, args): + if DEBUG_OPENMP >= 1: + print("visit array_section", args, type(args)) + return args + + def array_section_subscript(self, args): + if DEBUG_OPENMP >= 1: + print("visit array_section_subscript", args, type(args)) + return args + """ + + # Don't need a rule for TARGET. + # Don't need a rule for single_construct. + + def single_directive(self, args): + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit single_directive", args, type(args)) + or_start = openmp_region_start([openmp_tag("DIR.OMP.SINGLE")], 0, self.loc) + or_start.requires_acquire_release() + or_end = openmp_region_end( + or_start, [openmp_tag("DIR.OMP.END.SINGLE")], self.loc + ) + sblk.body = [or_start] + sblk.body[:] + eblk.body = [or_end] + eblk.body[:] + + def single_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit single_clause", args, type(args), args[0]) + return args[0] + + # Don't need a rule for unique_single_clause. + # def NOWAIT(self, args): + # return "nowait" + # Don't need a rule for NOWAIT. + # Don't need a rule for master_construct. + + def master_directive(self, args): + raise NotImplementedError("Master directive currently unsupported.") + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit master_directive", args, type(args)) + + # Don't need a rule for simd_construct. + + def simd_directive(self, args): + raise NotImplementedError("Simd directive currently unsupported.") + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit simd_directive", args, type(args)) + + # Don't need a rule for SIMD. + + def simd_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit simd_clause", args, type(args), args[0]) + return args[0] + + def aligned_clause(self, args): + raise NotImplementedError("Aligned clause currently unsupported.") + if DEBUG_OPENMP >= 1: + print("visit aligned_clause", args, type(args)) + + # Don't need a rule for declare_simd_construct. 
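+
+    # Note on the two emission patterns used by the directive visitors in this
+    # class: standalone directives (e.g. target update above) place both the
+    # region-start and region-end markers at the head of the start block,
+    # while block directives (e.g. single above) put the start marker in the
+    # start block and the end marker in the end block so the region spans the
+    # body of the with statement.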
+ + def declare_simd_directive_seq(self, args): + if DEBUG_OPENMP >= 1: + print("visit declare_simd_directive_seq", args, type(args), args[0]) + return args[0] + + def declare_simd_directive(self, args): + raise NotImplementedError("Declare simd directive currently unsupported.") + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit declare_simd_directive", args, type(args)) + + def declare_simd_clause(self, args): + raise NotImplementedError("Declare simd clauses currently unsupported.") + if DEBUG_OPENMP >= 1: + print("visit declare_simd_clause", args, type(args)) + + # Don't need a rule for ALIGNED. + + def inbranch_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit inbranch_clause", args, type(args), args[0]) + return args[0] + + # Don't need a rule for INBRANCH. + # Don't need a rule for NOTINBRANCH. + + def uniform_clause(self, args): + raise NotImplementedError("Uniform clause currently unsupported.") + if DEBUG_OPENMP >= 1: + print("visit uniform_clause", args, type(args)) + + # Don't need a rule for UNIFORM. + + def collapse_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit collapse_clause", args, type(args)) + return openmp_tag("QUAL.OMP.COLLAPSE", args[1]) + + # Don't need a rule for COLLAPSE. + # Don't need a rule for task_construct. + # Don't need a rule for TASK. + + def task_directive(self, args): + if DEBUG_OPENMP >= 1: + print("visit task_directive", args, type(args)) + + start_tags = [openmp_tag("DIR.OMP.TASK")] + end_tags = [openmp_tag("DIR.OMP.END.TASK")] + self.some_data_clause_directive(args, start_tags, end_tags, 1, for_task=True) + + def task_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit task_clause", args, type(args), args[0]) + return args[0] + + def unique_task_clause(self, args): + raise NotImplementedError("Task-related clauses currently unsupported.") + if DEBUG_OPENMP >= 1: + print("visit unique_task_clause", args, type(args)) + + # Don't need a rule for DEPEND. + # Don't need a rule for FINAL. + # Don't need a rule for UNTIED. + # Don't need a rule for MERGEABLE. + + def dependence_type(self, args): + if DEBUG_OPENMP >= 1: + print("visit dependence_type", args, type(args), args[0]) + return args[0] + + # Don't need a rule for IN. + # Don't need a rule for OUT. + # Don't need a rule for INOUT. + + def data_default_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit data_default_clause", args, type(args), args[0]) + return args[0] + + def data_sharing_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit data_sharing_clause", args, type(args), args[0]) + return args[0] + + def data_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit data_clause", args, type(args), args[0]) + return args[0] + + def private_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit private_clause", args, type(args), args[0]) + (_, var_list) = args + ret = [] + for var in var_list: + ret.append(openmp_tag("QUAL.OMP.PRIVATE", var)) + return ret + + # Don't need a rule for PRIVATE. + + def copyprivate_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit copyprivate_clause", args, type(args), args[0]) + (_, var_list) = args + ret = [] + for var in var_list: + ret.append(openmp_tag("QUAL.OMP.COPYPRIVATE", var)) + return ret + + # Don't need a rule for COPYPRIVATE. 
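+
+    # The data clause visitors (private, copyprivate, firstprivate, shared,
+    # etc.) each expand their variable list into one openmp_tag per variable;
+    # e.g. (sketch) "firstprivate(x, y)" yields
+    # [openmp_tag("QUAL.OMP.FIRSTPRIVATE", "x"),
+    #  openmp_tag("QUAL.OMP.FIRSTPRIVATE", "y")].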
+ + def firstprivate_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit firstprivate_clause", args, type(args), args[0]) + (_, var_list) = args + ret = [] + for var in var_list: + ret.append(openmp_tag("QUAL.OMP.FIRSTPRIVATE", var)) + return ret + + # Don't need a rule for FIRSTPRIVATE. + + def lastprivate_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit lastprivate_clause", args, type(args), args[0]) + (_, var_list) = args + ret = [] + for var in var_list: + ret.append(openmp_tag("QUAL.OMP.LASTPRIVATE", var)) + return ret + + # Don't need a rule for LASTPRIVATE. + + def shared_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit shared_clause", args, type(args), args[0]) + (_, var_list) = args + ret = [] + for var in var_list: + ret.append(openmp_tag("QUAL.OMP.SHARED", var)) + return ret + + # Don't need a rule for SHARED. + + def copyin_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit copyin_clause", args, type(args), args[0]) + (_, var_list) = args + ret = [] + for var in var_list: + ret.append(openmp_tag("QUAL.OMP.COPYIN", var)) + return ret + + # Don't need a rule for COPYIN. + # Don't need a rule for REDUCTION. + + def reduction_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit reduction_clause", args, type(args), args[0]) + + (_, red_op, red_list) = args + ret = [] + for shared in red_list: + ret.append(openmp_tag("QUAL.OMP.REDUCTION." + red_op, shared)) + return ret + + def default_shared_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit default_shared_clause", args, type(args)) + return default_shared_val(True) + + def default_none_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit default_none", args, type(args)) + return default_shared_val(False) + + def const_num_or_var(self, args): + if DEBUG_OPENMP >= 1: + print("visit const_num_or_var", args, type(args)) + return args[0] + + # Don't need a rule for parallel_construct. + + def parallel_back_prop(self, clauses): + enclosing_regions = get_enclosing_region(self.func_ir, self.blk_start) + if DEBUG_OPENMP >= 1: + print("parallel enclosing_regions:", enclosing_regions) + if not enclosing_regions: + return + + for enclosing_region in enclosing_regions[::-1]: + # If there is TEAMS in the enclosing region then THREAD_LIMIT is + # already set, do nothing. + if self.get_directive_if_contains(enclosing_region.tags, "TEAMS"): + return + if not self.get_directive_if_contains(enclosing_region.tags, "TARGET"): + continue + + # Set to 0 means "don't care", use implementation specific number of threads. + num_threads = 0 + num_threads_clause = self.get_clauses_by_name( + clauses, "QUAL.OMP.NUM_THREADS" + ) + if num_threads_clause: + assert len(num_threads_clause) == 1, ( + "Expected num_threads clause defined once" + ) + num_threads = num_threads_clause[0].arg + nt_tag = self.get_clauses_by_name( + enclosing_region.tags, "QUAL.OMP.THREAD_LIMIT" + ) + assert len(nt_tag) > 0 + + # If THREAD_LIMIT is less than requested NUM_THREADS or 1, + # increase it. This is still valid if THREAD_LIMIT is 0, since this + # means there was a parallel region before that did not specify + # NUM_THREADS so we can set to the concrete value of the sibling + # parallel region with the max value of NUM_THREADS. 
+ if num_threads > nt_tag[-1].arg or nt_tag[-1].arg == 1: + nt_tag[-1].arg = num_threads + return + + def parallel_directive(self, args): + if DEBUG_OPENMP >= 1: + print("visit parallel_directive", args, type(args)) + + start_tags = [openmp_tag("DIR.OMP.PARALLEL")] + end_tags = [openmp_tag("DIR.OMP.END.PARALLEL")] + clauses = self.some_data_clause_directive(args, start_tags, end_tags, 1) + + # sblk = self.blocks[self.blk_start] + # eblk = self.blocks[self.blk_end] + # scope = sblk.scope + + # before_start = [] + # after_start = [] + # clauses, default_shared = self.flatten(args[1:], sblk) + + if len(list(filter(lambda x: x.name == "QUAL.OMP.NUM_THREADS", clauses))) > 1: + raise MultipleNumThreadsClauses( + f"Multiple num_threads clauses near line {self.loc} is not allowed in an OpenMP parallel region." + ) + + if DEBUG_OPENMP >= 1: + for clause in clauses: + print("final clause:", clause) + + # ---- Back propagate THREAD_LIMIT to enclosed target region. ---- + self.parallel_back_prop(clauses) + + def parallel_clause(self, args): + (val,) = args + if DEBUG_OPENMP >= 1: + print("visit parallel_clause", args, type(args), args[0]) + return val + + def unique_parallel_clause(self, args): + (val,) = args + if DEBUG_OPENMP >= 1: + print("visit unique_parallel_clause", args, type(args), args[0]) + assert isinstance(val, openmp_tag) + return val + + def teams_clause(self, args): + (val,) = args + if DEBUG_OPENMP >= 1: + print("visit teams_clause", args, type(args), args[0]) + return val + + def num_teams_clause(self, args): + (_, num_teams) = args + if DEBUG_OPENMP >= 1: + print("visit num_teams_clause", args, type(args)) + + return openmp_tag("QUAL.OMP.NUM_TEAMS", num_teams, load=True) + + def thread_limit_clause(self, args): + (_, thread_limit) = args + if DEBUG_OPENMP >= 1: + print("visit thread_limit_clause", args, type(args)) + + return openmp_tag("QUAL.OMP.THREAD_LIMIT", thread_limit, load=True) + + def if_clause(self, args): + (_, if_val) = args + if DEBUG_OPENMP >= 1: + print("visit if_clause", args, type(args)) + + return openmp_tag("QUAL.OMP.IF", if_val, load=True) + + # Don't need a rule for IF. + + def num_threads_clause(self, args): + (_, num_threads) = args + if DEBUG_OPENMP >= 1: + print("visit num_threads_clause", args, type(args)) + + return openmp_tag("QUAL.OMP.NUM_THREADS", num_threads, load=True) + + # Don't need a rule for NUM_THREADS. + # Don't need a rule for PARALLEL. + # Don't need a rule for FOR. + # Don't need a rule for parallel_for_construct. + + def parallel_for_directive(self, args): + return self.some_for_directive( + args, "DIR.OMP.PARALLEL.LOOP", "DIR.OMP.END.PARALLEL.LOOP", 2, True + ) + + def parallel_for_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit parallel_for_clause", args, type(args), args[0]) + return args[0] + + # Don't need a rule for for_construct. + + def for_directive(self, args): + return self.some_for_directive( + args, "DIR.OMP.LOOP", "DIR.OMP.END.LOOP", 1, False + ) + + def for_clause(self, args): + (val,) = args + if DEBUG_OPENMP >= 1: + print("visit for_clause", args, type(args)) + if isinstance(val, openmp_tag): + return [val] + elif isinstance(val, list): + return val + elif val == "nowait": + return openmp_tag("QUAL.OMP.NOWAIT") + + def unique_for_clause(self, args): + (val,) = args + if DEBUG_OPENMP >= 1: + print("visit unique_for_clause", args, type(args)) + if isinstance(val, openmp_tag): + return val + elif val == "ordered": + return openmp_tag("QUAL.OMP.ORDERED", 0) + + # Don't need a rule for LINEAR. 
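+
+    # Example directive strings routed through the loop visitors above (a
+    # sketch; clause values are illustrative):
+    #   "parallel for num_threads(4)" -> DIR.OMP.PARALLEL.LOOP region with a
+    #       QUAL.OMP.NUM_THREADS tag carrying 4
+    #   "for ordered"                 -> DIR.OMP.LOOP region with a
+    #       QUAL.OMP.ORDERED tag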
+ + def linear_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit linear_clause", args, type(args), args[0]) + return args[0] + + """ + Linear_expr not in grammar + def linear_expr(self, args): + (_, var, step) = args + if DEBUG_OPENMP >= 1: + print("visit linear_expr", args, type(args)) + return openmp_tag("QUAL.OMP.LINEAR", [var, step]) + """ + + """ + def ORDERED(self, args): + if DEBUG_OPENMP >= 1: + print("visit ordered", args, type(args)) + return "ordered" + """ + + def sched_no_expr(self, args): + (_, kind) = args + if DEBUG_OPENMP >= 1: + print("visit sched_no_expr", args, type(args)) + if kind == "static": + return openmp_tag("QUAL.OMP.SCHEDULE.STATIC", 0) + elif kind == "dynamic": + return openmp_tag("QUAL.OMP.SCHEDULE.DYNAMIC", 0) + elif kind == "guided": + return openmp_tag("QUAL.OMP.SCHEDULE.GUIDED", 0) + elif kind == "runtime": + return openmp_tag("QUAL.OMP.SCHEDULE.RUNTIME", 0) + + def sched_expr(self, args): + (_, kind, num_or_var) = args + if DEBUG_OPENMP >= 1: + print("visit sched_expr", args, type(args), num_or_var, type(num_or_var)) + if kind == "static": + return openmp_tag("QUAL.OMP.SCHEDULE.STATIC", num_or_var, load=True) + elif kind == "dynamic": + return openmp_tag("QUAL.OMP.SCHEDULE.DYNAMIC", num_or_var, load=True) + elif kind == "guided": + return openmp_tag("QUAL.OMP.SCHEDULE.GUIDED", num_or_var, load=True) + elif kind == "runtime": + return openmp_tag("QUAL.OMP.SCHEDULE.RUNTIME", num_or_var, load=True) + + def SCHEDULE(self, args): + if DEBUG_OPENMP >= 1: + print("visit SCHEDULE", args, type(args)) + return "schedule" + + def schedule_kind(self, args): + (kind,) = args + if DEBUG_OPENMP >= 1: + print("visit schedule_kind", args, type(args)) + return kind + + # Don't need a rule for STATIC. + # Don't need a rule for DYNAMIC. + # Don't need a rule for GUIDED. + # Don't need a rule for RUNTIME. 
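+
+    # Schedule mapping sketch: "schedule(dynamic, 16)" flows through
+    # sched_expr above and becomes
+    # openmp_tag("QUAL.OMP.SCHEDULE.DYNAMIC", 16, load=True), while the
+    # chunk-less "schedule(static)" maps to
+    # openmp_tag("QUAL.OMP.SCHEDULE.STATIC", 0) via sched_no_expr.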
+ + """ + def STATIC(self, args): + if DEBUG_OPENMP >= 1: + print("visit STATIC", args, type(args)) + return "static" + + def DYNAMIC(self, args): + if DEBUG_OPENMP >= 1: + print("visit DYNAMIC", args, type(args)) + return "dynamic" + + def GUIDED(self, args): + if DEBUG_OPENMP >= 1: + print("visit GUIDED", args, type(args)) + return "guided" + + def RUNTIME(self, args): + if DEBUG_OPENMP >= 1: + print("visit RUNTIME", args, type(args)) + return "runtime" + """ + + def COLON(self, args): + if DEBUG_OPENMP >= 1: + print("visit COLON", args, type(args)) + return ":" + + def oslice(self, args): + if DEBUG_OPENMP >= 1: + print("visit oslice", args, type(args)) + start = None + end = None + if args[0] != ":": + start = args[0] + args = args[2:] + else: + args = args[1:] + + if len(args) > 0: + end = args[0] + return slice(start, end) + + def slice_list(self, args): + if DEBUG_OPENMP >= 1: + print("visit slice_list", args, type(args)) + if len(args) == 1: + return args + else: + args[0].append(args[1]) + return args[0] + + def name_slice(self, args): + if DEBUG_OPENMP >= 1: + print("visit name_slice", args, type(args)) + if len(args) == 1 or args[1] is None: + return args[0] + else: + return NameSlice(args[0], args[1:]) + + def var_list(self, args): + if DEBUG_OPENMP >= 1: + print("visit var_list", args, type(args)) + if len(args) == 1: + return args + else: + args[0].append(args[1]) + return args[0] + + def number_list(self, args): + if DEBUG_OPENMP >= 1: + print("visit number_list", args, type(args)) + if len(args) == 1: + return args + else: + args[0].append(args[1]) + return args[0] + + def ompx_attribute(self, args): + if DEBUG_OPENMP >= 1: + print("visit ompx_attribute", args, type(args), args[0]) + (_, attr, number_list) = args + return openmp_tag("QUAL.OMP.OMPX_ATTRIBUTE", (attr, number_list)) + + def PLUS(self, args): + if DEBUG_OPENMP >= 1: + print("visit PLUS", args, type(args)) + return "+" + + def MINUS(self, args): + if DEBUG_OPENMP >= 1: + print("visit MINUS", args, type(args)) + return "-" + + def STAR(self, args): + if DEBUG_OPENMP >= 1: + print("visit STAR", args, type(args)) + return "*" + + def reduction_operator(self, args): + arg = args[0] + if DEBUG_OPENMP >= 1: + print("visit reduction_operator", args, type(args), arg, type(arg)) + if arg == "+": + return "ADD" + elif arg == "-": + return "SUB" + elif arg == "*": + return "MUL" + assert 0 + + def threadprivate_directive(self, args): + raise NotImplementedError("Threadprivate currently unsupported.") + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit threadprivate_directive", args, type(args)) + + def cancellation_point_directive(self, args): + raise NotImplementedError("""Explicit cancellation points + currently unsupported.""") + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit cancellation_point_directive", args, type(args)) + + def construct_type_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit construct_type_clause", args, type(args), args[0]) + return args[0] + + def cancel_directive(self, args): + raise NotImplementedError("Cancel directive currently unsupported.") + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit cancel_directive", args, type(args)) + + # Don't need a rule for ORDERED. 
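+
+    # Reduction sketch (illustrative variable name): "reduction(+: acc)" maps
+    # "+" to "ADD" via reduction_operator above, and reduction_clause (defined
+    # earlier) then emits openmp_tag("QUAL.OMP.REDUCTION.ADD", "acc").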
+ + def flush_directive(self, args): + raise NotImplementedError("Flush directive currently unsupported.") + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit flush_directive", args, type(args)) + + def region_phrase(self, args): + raise NotImplementedError("No implementation for region phrase.") + if DEBUG_OPENMP >= 1: + print("visit region_phrase", args, type(args)) + + def PYTHON_NAME(self, args): + if DEBUG_OPENMP >= 1: + print("visit PYTHON_NAME", args, type(args), str(args)) + return str(args) + + def NUMBER(self, args): + if DEBUG_OPENMP >= 1: + print("visit NUMBER", args, type(args), str(args)) + return int(args) + + +openmp_grammar = r""" + openmp_statement: openmp_construct + | openmp_directive + openmp_directive: barrier_directive + | taskwait_directive + | taskyield_directive + | flush_directive + barrier_directive: BARRIER + taskwait_directive: TASKWAIT + taskyield_directive: TASKYIELD + BARRIER: "barrier" + TASKWAIT: "taskwait" + TASKYIELD: "taskyield" + taskgroup_directive: TASKGROUP + taskgroup_construct: taskgroup_directive + TASKGROUP: "taskgroup" + openmp_construct: parallel_construct + | parallel_for_construct + | for_construct + | single_construct + | task_construct + | teams_construct + | teams_distribute_construct + | teams_distribute_simd_construct + | teams_distribute_parallel_for_construct + | teams_distribute_parallel_for_simd_construct + | loop_construct + | teams_loop_construct + | target_construct + | target_teams_construct + | target_teams_distribute_construct + | target_teams_distribute_simd_construct + | target_teams_distribute_parallel_for_simd_construct + | target_teams_distribute_parallel_for_construct + | target_loop_construct + | target_teams_loop_construct + | target_enter_data_construct + | target_exit_data_construct + | distribute_construct + | distribute_simd_construct + | distribute_parallel_for_construct + | distribute_parallel_for_simd_construct + | critical_construct + | atomic_construct + | sections_construct + | section_construct + | simd_construct + | for_simd_construct + | parallel_for_simd_construct + | target_data_construct + | target_update_construct + | parallel_sections_construct + | master_construct + | ordered_construct + for_simd_construct: for_simd_directive + for_simd_directive: FOR SIMD [for_simd_clause*] + for_simd_clause: for_clause + | simd_clause + parallel_for_simd_construct: parallel_for_simd_directive + parallel_for_simd_directive: PARALLEL FOR SIMD [parallel_for_simd_clause*] + parallel_for_simd_clause: parallel_for_clause + | simd_clause + distribute_construct: distribute_directive + distribute_simd_construct: distribute_simd_directive + distribute_directive: DISTRIBUTE [distribute_clause*] + distribute_simd_directive: DISTRIBUTE SIMD [distribute_simd_clause*] + distribute_clause: private_clause + | firstprivate_clause + // | lastprivate_distribute_clause + | collapse_clause + | dist_schedule_clause + | allocate_clause + distribute_simd_clause: private_clause + | firstprivate_clause + // | lastprivate_distribute_clause + | collapse_clause + | dist_schedule_clause + | allocate_clause + | if_clause + // | safelen_clause + // | simdlen_clause + | linear_clause + | aligned_clause + // | nontemporal_clause + | reduction_clause + // | order_clause + + teams_distribute_clause: num_teams_clause + | thread_limit_clause + | data_default_clause + | private_clause + | firstprivate_clause + | data_sharing_clause + | reduction_clause + | allocate_clause + // | 
lastprivate_distribute_clause + | collapse_clause + | dist_schedule_clause + | ompx_attribute + + teams_distribute_simd_clause: num_teams_clause + | thread_limit_clause + | data_default_clause + | private_clause + | firstprivate_clause + | data_sharing_clause + | reduction_clause + | allocate_clause + // | lastprivate_distribute_clause + | collapse_clause + | dist_schedule_clause + | if_clause + // | safelen_clause + // | simdlen_clause + | linear_clause + | aligned_clause + // | nontemporal_clause + // | order_clause + | ompx_attribute + + distribute_parallel_for_construct: distribute_parallel_for_directive + distribute_parallel_for_directive: DISTRIBUTE PARALLEL FOR [distribute_parallel_for_clause*] + distribute_parallel_for_clause: if_clause + | num_threads_clause + | data_default_clause + | private_clause + | firstprivate_clause + | data_sharing_clause + | reduction_clause + | copyin_clause + // | proc_bind_clause + | allocate_clause + | lastprivate_clause + | linear_clause + | schedule_clause + | collapse_clause + | ORDERED + | NOWAIT + // | order_clause + | dist_schedule_clause + + distribute_parallel_for_simd_construct: distribute_parallel_for_simd_directive + distribute_parallel_for_simd_directive: DISTRIBUTE PARALLEL FOR SIMD [distribute_parallel_for_simd_clause*] + distribute_parallel_for_simd_clause: if_clause + | num_threads_clause + | data_default_clause + | private_clause + | firstprivate_clause + | data_sharing_clause + | reduction_clause + | copyin_clause + // | proc_bind_clause + | allocate_clause + | lastprivate_clause + | linear_clause + | schedule_clause + | collapse_clause + | ORDERED + | NOWAIT + // | order_clause + | dist_schedule_clause + // | safelen_clause + // | simdlen_clause + | aligned_clause + // | nontemporal_clause + + target_data_construct: target_data_directive + target_data_directive: TARGET DATA [target_data_clause*] + DATA: "data" + ENTER: "enter" + EXIT: "exit" + target_enter_data_construct: target_enter_data_directive + target_enter_data_directive: TARGET ENTER DATA [target_enter_data_clause*] + target_exit_data_construct: target_exit_data_directive + target_exit_data_directive: TARGET EXIT DATA [target_exit_data_clause*] + target_data_clause: device_clause + | map_clause + | if_clause + | NOWAIT + | depend_with_modifier_clause + target_enter_data_clause: device_clause + | map_enter_clause + | if_clause + | NOWAIT + | depend_with_modifier_clause + target_exit_data_clause: device_clause + | map_exit_clause + | if_clause + | NOWAIT + | depend_with_modifier_clause + device_clause: "device" "(" const_num_or_var ")" + map_clause: "map" "(" [map_type ":"] var_list ")" + map_type: ALLOC | TO | FROM | TOFROM + map_enter_clause: "map" "(" map_enter_type ":" var_list ")" + map_enter_type: ALLOC | TO + map_exit_clause: "map" "(" map_exit_type ":" var_list ")" + map_exit_type: FROM | RELEASE | DELETE + update_motion_type: TO | FROM + TO: "to" + FROM: "from" + ALLOC: "alloc" + TOFROM: "tofrom" + RELEASE: "release" + DELETE: "delete" + parallel_sections_construct: parallel_sections_directive + parallel_sections_directive: PARALLEL SECTIONS [parallel_sections_clause*] + parallel_sections_clause: unique_parallel_clause + | data_default_clause + | private_clause + | firstprivate_clause + | lastprivate_clause + | data_sharing_clause + | reduction_clause + sections_construct: sections_directive + sections_directive: SECTIONS [sections_clause*] + SECTIONS: "sections" + sections_clause: private_clause + | firstprivate_clause + | lastprivate_clause + | reduction_clause + 
| NOWAIT + section_construct: section_directive + section_directive: SECTION + SECTION: "section" + atomic_construct: atomic_directive + atomic_directive: ATOMIC [atomic_clause] [seq_cst_clause] + ATOMIC: "atomic" + atomic_clause: READ + | WRITE + | UPDATE + | CAPTURE + READ: "read" + WRITE: "write" + UPDATE: "update" + CAPTURE: "capture" + seq_cst_clause: "seq_cst" + critical_construct: critical_directive + critical_directive: CRITICAL + CRITICAL: "critical" + teams_construct: teams_directive + teams_directive: TEAMS [teams_clause*] + teams_distribute_directive: TEAMS DISTRIBUTE [teams_distribute_clause*] + teams_distribute_simd_directive: TEAMS DISTRIBUTE SIMD [teams_distribute_simd_clause*] + target_construct: target_directive + target_teams_distribute_parallel_for_simd_construct: target_teams_distribute_parallel_for_simd_directive + target_teams_distribute_parallel_for_construct: target_teams_distribute_parallel_for_directive + teams_distribute_parallel_for_construct: teams_distribute_parallel_for_directive + teams_distribute_parallel_for_simd_construct: teams_distribute_parallel_for_simd_directive + loop_construct: loop_directive + teams_loop_construct: teams_loop_directive + target_loop_construct: target_loop_directive + target_teams_loop_construct: target_teams_loop_directive + target_teams_construct: target_teams_directive + target_teams_distribute_construct: target_teams_distribute_directive + target_teams_distribute_simd_construct: target_teams_distribute_simd_directive + teams_distribute_construct: teams_distribute_directive + teams_distribute_simd_construct: teams_distribute_simd_directive + target_directive: TARGET [target_clause*] + HAS_DEVICE_ADDR: "has_device_addr" + has_device_addr_clause: HAS_DEVICE_ADDR "(" var_list ")" + target_clause: if_clause + | device_clause + | thread_limit_clause + | private_clause + | firstprivate_clause + // | in_reduction_clause + | map_clause + | is_device_ptr_clause + | has_device_addr_clause + // | defaultmap_clause + | NOWAIT + | allocate_clause + | depend_with_modifier_clause + // | uses_allocators_clause + | ompx_attribute + teams_clause: num_teams_clause + | thread_limit_clause + | data_default_clause + | private_clause + | firstprivate_clause + | data_sharing_clause + | reduction_clause + | allocate_clause + num_teams_clause: NUM_TEAMS "(" const_num_or_var ")" + NUM_TEAMS: "num_teams" + thread_limit_clause: THREAD_LIMIT "(" const_num_or_var ")" + THREAD_LIMIT: "thread_limit" + + dist_schedule_expr: DIST_SCHEDULE "(" STATIC ")" + dist_schedule_no_expr: DIST_SCHEDULE "(" STATIC "," const_num_or_var ")" + dist_schedule_clause: dist_schedule_expr + | dist_schedule_no_expr + DIST_SCHEDULE: "dist_schedule" + + target_teams_distribute_parallel_for_simd_directive: TARGET TEAMS DISTRIBUTE PARALLEL FOR SIMD [target_teams_distribute_parallel_for_simd_clause*] + target_teams_distribute_parallel_for_simd_clause: if_clause + | device_clause + | private_clause + | firstprivate_clause + // | in_reduction_clause + | map_clause + | is_device_ptr_clause + // | defaultmap_clause + | NOWAIT + | allocate_clause + | depend_with_modifier_clause + // | uses_allocators_clause + | num_teams_clause + | thread_limit_clause + | data_default_clause + | data_sharing_clause + | reduction_clause + | num_threads_clause + | copyin_clause + // | proc_bind_clause + | lastprivate_clause + | linear_clause + | schedule_clause + | collapse_clause + | ORDERED + // | order_clause + | dist_schedule_clause + // | safelen_clause + // | simdlen_clause + | aligned_clause + // | 
nontemporal_clause + | ompx_attribute + + teams_distribute_parallel_for_simd_directive: TEAMS DISTRIBUTE PARALLEL FOR SIMD [teams_distribute_parallel_for_simd_clause*] + teams_distribute_parallel_for_simd_clause: num_teams_clause + | thread_limit_clause + // | default_clause + | private_clause + | firstprivate_clause + | data_sharing_clause + | reduction_clause + | if_clause + | num_threads_clause + | copyin_clause + // | proc_bind_clause + | lastprivate_clause + | linear_clause + | schedule_clause + | collapse_clause + | ORDERED + | NOWAIT + // | order_clause + | dist_schedule_clause + // | safelen_clause + // | simdlen_clause + | aligned_clause + // | nontemporal_clause + // | in_reduction_clause + | map_clause + | is_device_ptr_clause + // | defaultmap_clause + | allocate_clause + | depend_with_modifier_clause + // | uses_allocators_clause + | data_default_clause + | ompx_attribute + + target_teams_distribute_parallel_for_directive: TARGET TEAMS DISTRIBUTE PARALLEL FOR [target_teams_distribute_parallel_for_clause*] + target_teams_distribute_parallel_for_clause: if_clause + | device_clause + | private_clause + | firstprivate_clause + // | in_reduction_clause + | map_clause + | is_device_ptr_clause + // | defaultmap_clause + | NOWAIT + | allocate_clause + | depend_with_modifier_clause + // | uses_allocators_clause + | num_teams_clause + | thread_limit_clause + | data_default_clause + | data_sharing_clause + | reduction_clause + | num_threads_clause + | copyin_clause + // | proc_bind_clause + | lastprivate_clause + | linear_clause + | schedule_clause + | collapse_clause + | ORDERED + // | order_clause + | dist_schedule_clause + | ompx_attribute + + teams_distribute_parallel_for_directive: TEAMS DISTRIBUTE PARALLEL FOR [teams_distribute_parallel_for_clause*] + teams_distribute_parallel_for_clause: num_teams_clause + | thread_limit_clause + | data_default_clause + | private_clause + | firstprivate_clause + | data_sharing_clause + | reduction_clause + | allocate_clause + | if_clause + | num_threads_clause + | copyin_clause + // | proc_bind_clause + | lastprivate_clause + | linear_clause + | schedule_clause + | collapse_clause + | ORDERED + | NOWAIT + // | order_clause + | dist_schedule_clause + | ompx_attribute + + LOOP: "loop" + + ompx_attribute: OMPX_ATTRIBUTE "(" PYTHON_NAME "(" number_list ")" ")" + OMPX_ATTRIBUTE: "ompx_attribute" + loop_directive: LOOP [teams_distribute_parallel_for_clause*] + teams_loop_directive: TEAMS LOOP [teams_distribute_parallel_for_clause*] + target_loop_directive: TARGET LOOP [target_teams_distribute_parallel_for_clause*] + target_teams_loop_directive: TARGET TEAMS LOOP [target_teams_distribute_parallel_for_clause*] + + target_teams_directive: TARGET TEAMS [target_teams_clause*] + target_teams_clause: if_clause + | device_clause + | private_clause + | firstprivate_clause + // | in_reduction_clause + | map_clause + | is_device_ptr_clause + // | defaultmap_clause + | NOWAIT + | allocate_clause + | depend_with_modifier_clause + // | uses_allocators_clause + | num_teams_clause + | thread_limit_clause + | data_default_clause + | data_sharing_clause + // | reduction_default_only_clause + | ompx_attribute + + target_teams_distribute_simd_directive: TARGET TEAMS DISTRIBUTE SIMD [target_teams_distribute_simd_clause*] + target_teams_distribute_simd_clause: if_clause + | device_clause + | private_clause + | firstprivate_clause + // | in_reduction_clause + | map_clause + | is_device_ptr_clause + // | defaultmap_clause + | NOWAIT + | allocate_clause + | 
depend_with_modifier_clause + // | uses_allocators_clause + | num_teams_clause + | thread_limit_clause + | data_default_clause + | data_sharing_clause + | reduction_clause + // | reduction_default_only_clause + | lastprivate_clause + | collapse_clause + | dist_schedule_clause + // | safelen_clause + // | simdlen_clause + | linear_clause + | aligned_clause + // | nontemporal_clause + // | order_clause + | ompx_attribute + + target_teams_distribute_directive: TARGET TEAMS DISTRIBUTE [target_teams_distribute_clause*] + target_teams_distribute_clause: if_clause + | device_clause + | private_clause + | firstprivate_clause + // | in_reduction_clause + | map_clause + | is_device_ptr_clause + // | defaultmap_clause + | NOWAIT + | allocate_clause + | depend_with_modifier_clause + // | uses_allocators_clause + | num_teams_clause + | thread_limit_clause + | data_default_clause + | data_sharing_clause + // | reduction_default_only_clause + | lastprivate_clause + | collapse_clause + | dist_schedule_clause + | ompx_attribute + + IS_DEVICE_PTR: "is_device_ptr" + is_device_ptr_clause: IS_DEVICE_PTR "(" var_list ")" + allocate_clause: ALLOCATE "(" allocate_parameter ")" + ALLOCATE: "allocate" + allocate_parameter: [const_num_or_var] var_list + + target_update_construct: target_update_directive + target_update_directive: TARGET UPDATE target_update_clause* + target_update_clause: motion_clause + | device_clause + | if_clause + motion_clause: update_motion_type "(" variable_array_section_list ")" + variable_array_section_list: PYTHON_NAME + // | array_section + | name_slice + | variable_array_section_list "," PYTHON_NAME + | variable_array_section_list "," name_slice + // | variable_array_section_list "," array_section + //array_section: PYTHON_NAME array_section_subscript + //array_section_subscript: array_section_subscript "[" [const_num_or_var] ":" [const_num_or_var] "]" + // | array_section_subscript "[" const_num_or_var "]" + // | "[" [const_num_or_var] ":" [const_num_or_var] "]" + // | "[" const_num_or_var "]" + TARGET: "target" + TEAMS: "teams" + DISTRIBUTE: "distribute" + single_construct: single_directive + single_directive: SINGLE [single_clause*] + SINGLE: "single" + single_clause: unique_single_clause + | private_clause + | firstprivate_clause + | NOWAIT + unique_single_clause: copyprivate_clause + NOWAIT: "nowait" + master_construct: master_directive + master_directive: "master" + simd_construct: simd_directive + simd_directive: SIMD [simd_clause*] + SIMD: "simd" + simd_clause: collapse_clause + | aligned_clause + | linear_clause + | uniform_clause + | reduction_clause + | inbranch_clause + aligned_clause: ALIGNED "(" var_list ")" + | ALIGNED "(" var_list ":" const_num_or_var ")" + declare_simd_construct: declare_simd_directive_seq + declare_simd_directive_seq: declare_simd_directive + | declare_simd_directive_seq declare_simd_directive + declare_simd_directive: SIMD [declare_simd_clause*] + declare_simd_clause: "simdlen" "(" const_num_or_var ")" + | aligned_clause + | linear_clause + | uniform_clause + | reduction_clause + | inbranch_clause + ALIGNED: "aligned" + inbranch_clause: INBRANCH | NOTINBRANCH + INBRANCH: "inbranch" + NOTINBRANCH: "notinbranch" + uniform_clause: UNIFORM "(" var_list ")" + UNIFORM: "uniform" + collapse_clause: COLLAPSE "(" const_num_or_var ")" + COLLAPSE: "collapse" + task_construct: task_directive + TASK: "task" + task_directive: TASK [task_clause*] + task_clause: unique_task_clause + | data_sharing_clause + | private_clause + | firstprivate_clause + | 
data_default_clause + unique_task_clause: if_clause + | UNTIED + | MERGEABLE + | FINAL "(" const_num_or_var ")" + | depend_with_modifier_clause + DEPEND: "depend" + FINAL: "final" + UNTIED: "untied" + MERGEABLE: "mergeable" + dependence_type: IN + | OUT + | INOUT + depend_with_modifier_clause: DEPEND "(" dependence_type ":" variable_array_section_list ")" + IN: "in" + OUT: "out" + INOUT: "inout" + data_default_clause: default_shared_clause + | default_none_clause + data_sharing_clause: shared_clause + data_clause: private_clause + | copyprivate_clause + | firstprivate_clause + | lastprivate_clause + | data_sharing_clause + | data_default_clause + | copyin_clause + | reduction_clause + private_clause: PRIVATE "(" var_list ")" + PRIVATE: "private" + copyprivate_clause: COPYPRIVATE "(" var_list ")" + COPYPRIVATE: "copyprivate" + firstprivate_clause: FIRSTPRIVATE "(" var_list ")" + FIRSTPRIVATE: "firstprivate" + lastprivate_clause: LASTPRIVATE "(" var_list ")" + LASTPRIVATE: "lastprivate" + shared_clause: SHARED "(" var_list ")" + SHARED: "shared" + copyin_clause: COPYIN "(" var_list ")" + COPYIN: "copyin" + REDUCTION: "reduction" + DEFAULT: "default" + reduction_clause: REDUCTION "(" reduction_operator ":" var_list ")" + default_shared_clause: DEFAULT "(" "shared" ")" + default_none_clause: DEFAULT "(" "none" ")" + const_num_or_var: NUMBER | PYTHON_NAME + parallel_construct: parallel_directive + parallel_directive: PARALLEL [parallel_clause*] + parallel_clause: unique_parallel_clause + | data_default_clause + | private_clause + | firstprivate_clause + | data_sharing_clause + | reduction_clause + unique_parallel_clause: if_clause | num_threads_clause + if_clause: IF "(" const_num_or_var ")" + IF: "if" + num_threads_clause: NUM_THREADS "(" const_num_or_var ")" + NUM_THREADS: "num_threads" + PARALLEL: "parallel" + FOR: "for" + parallel_for_construct: parallel_for_directive + parallel_for_directive: PARALLEL FOR [parallel_for_clause*] + parallel_for_clause: unique_parallel_clause + | unique_for_clause + | data_default_clause + | private_clause + | firstprivate_clause + | lastprivate_clause + | data_sharing_clause + | reduction_clause + for_construct: for_directive + for_directive: FOR [for_clause*] + for_clause: unique_for_clause | data_clause | NOWAIT + unique_for_clause: ORDERED + | schedule_clause + | collapse_clause + LINEAR: "linear" + linear_clause: LINEAR "(" var_list ":" const_num_or_var ")" + | LINEAR "(" var_list ")" + sched_no_expr: SCHEDULE "(" schedule_kind ")" + sched_expr: SCHEDULE "(" schedule_kind "," const_num_or_var ")" + schedule_clause: sched_no_expr + | sched_expr + SCHEDULE: "schedule" + schedule_kind: STATIC | DYNAMIC | GUIDED | RUNTIME | AUTO + STATIC: "static" + DYNAMIC: "dynamic" + GUIDED: "guided" + RUNTIME: "runtime" + AUTO: "auto" + COLON: ":" + oslice: [const_num_or_var] COLON [const_num_or_var] + slice_list: oslice | slice_list "," oslice + name_slice: PYTHON_NAME [ "[" slice_list "]" ] + var_list: name_slice | var_list "," name_slice + number_list: NUMBER | number_list "," NUMBER + PLUS: "+" + MINUS: "-" + STAR: "*" + reduction_operator: PLUS | "\\" | STAR | MINUS | "&" | "^" | "|" | "&&" | "||" + threadprivate_directive: "threadprivate" "(" var_list ")" + cancellation_point_directive: "cancellation point" construct_type_clause + construct_type_clause: PARALLEL + | SECTIONS + | FOR + | TASKGROUP + cancel_directive: "cancel" construct_type_clause [if_clause] + ordered_directive: ORDERED + ordered_construct: ordered_directive + ORDERED: "ordered" + flush_directive: 
"flush" "(" var_list ")" + + region_phrase: "(" PYTHON_NAME ")" + PYTHON_NAME: /[a-zA-Z_]\w*/ + + %import common.NUMBER + %import common.WS + %ignore WS + """ + +""" + name_slice: PYTHON_NAME [ "[" slice ["," slice]* "]" ] +""" + +openmp_parser = Lark(openmp_grammar, start="openmp_statement") +var_collector_parser = Lark(openmp_grammar, start="openmp_statement") + + +def remove_ssa_callback(var, unused): + assert isinstance(var, ir.Var) + new_var = ir.Var(var.scope, var.unversioned_name, var.loc) + return new_var + + +def remove_ssa_from_func_ir(func_ir): + typed_passes.PreLowerStripPhis()._strip_phi_nodes(func_ir) + # new_func_ir = typed_passes.PreLowerStripPhis()._strip_phi_nodes(func_ir) + # func_ir.blocks = new_func_ir.blocks + visit_vars(func_ir.blocks, remove_ssa_callback, None) + func_ir._definitions = build_definitions(func_ir.blocks) + + +def _add_openmp_ir_nodes(func_ir, blocks, blk_start, blk_end, body_blocks, extra): + """Given the starting and ending block of the with-context, + replaces the head block with a new block that has the starting + openmp ir nodes in it and adds the ending openmp ir nodes to + the end block. + """ + sblk = blocks[blk_start] + loc = sblk.loc + sblk.body = sblk.body[1:] + + args = extra["args"] + arg = args[0] + # If OpenMP argument is not a constant or not a string then raise exception + if not isinstance(arg, (ir.Const, ir.FreeVar)): + raise NonconstantOpenmpSpecification( + f"Non-constant OpenMP specification at line {arg.loc}" + ) + if not isinstance(arg.value, str): + raise NonStringOpenmpSpecification( + f"Non-string OpenMP specification at line {arg.loc}" + ) + + if DEBUG_OPENMP >= 1: + print("args:", args, type(args)) + print("arg:", arg, type(arg), arg.value, type(arg.value)) + parse_res = openmp_parser.parse(arg.value) + if DEBUG_OPENMP >= 1: + print(parse_res.pretty()) + visitor = OpenmpVisitor(func_ir, blocks, blk_start, blk_end, body_blocks, loc) + try: + visitor.transform(parse_res) + except VisitError as e: + raise e.__context__ + if isinstance(e.__context__, UnspecifiedVarInDefaultNone): + print(str(e.__context__)) + raise e.__context__ + else: + print( + "Internal error for OpenMp pragma '{}'".format(arg.value), + e.__context__, + type(e.__context__), + ) + sys.exit(-1) + except Exception as f: + print("generic transform exception") + exc_type, exc_obj, exc_tb = sys.exc_info() + fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] + print(exc_type, fname, exc_tb.tb_lineno) + print("Internal error for OpenMp pragma '{}'".format(arg.value)) + sys.exit(-2) + except: + print("fallthrough exception") + print("Internal error for OpenMP pragma '{}'".format(arg.value)) + sys.exit(-3) + assert blocks is visitor.blocks + + +class OpenmpExternalFunction(types.ExternalFunction): + def __call__(self, *args): + import inspect + + frm = inspect.stack()[1] + mod = inspect.getmodule(frm[0]) + if mod.__name__.startswith("numba") and not mod.__name__.startswith( + "numba.openmp.tests" + ): + return super(ExternalFunction, self).__call__(*args) + + ffi = FFI() + fname = self.symbol + ret_typ = str(self.sig.return_type) + + def numba_to_c(ret_typ): + if ret_typ == "int32": + return "int" + elif ret_typ == "none": + return "void" + elif ret_typ == "float64": + return "double" + else: + assert False + + ret_typ = numba_to_c(ret_typ) + arg_str = ",".join([numba_to_c(str(x)) for x in self.sig.args]) + proto = f"{ret_typ} {fname}({arg_str});" + ffi.cdef(proto) + # Should be loaded into the process by the load_library_permanently + # at the top of this 
file. + C = ffi.dlopen(None) + return getattr(C, fname)(*args) + + +model_register(OpenmpExternalFunction)(OpaqueModel) + +omp_set_num_threads = OpenmpExternalFunction( + "omp_set_num_threads", types.void(types.int32) +) +omp_get_thread_num = OpenmpExternalFunction("omp_get_thread_num", types.int32()) +omp_get_num_threads = OpenmpExternalFunction("omp_get_num_threads", types.int32()) +omp_get_wtime = OpenmpExternalFunction("omp_get_wtime", types.float64()) +omp_set_dynamic = OpenmpExternalFunction("omp_set_dynamic", types.void(types.int32)) +omp_set_nested = OpenmpExternalFunction("omp_set_nested", types.void(types.int32)) +omp_set_max_active_levels = OpenmpExternalFunction( + "omp_set_max_active_levels", types.void(types.int32) +) +omp_get_max_active_levels = OpenmpExternalFunction( + "omp_get_max_active_levels", types.int32() +) +omp_get_max_threads = OpenmpExternalFunction("omp_get_max_threads", types.int32()) +omp_get_num_procs = OpenmpExternalFunction("omp_get_num_procs", types.int32()) +omp_in_parallel = OpenmpExternalFunction("omp_in_parallel", types.int32()) +omp_get_thread_limit = OpenmpExternalFunction("omp_get_thread_limit", types.int32()) +omp_get_supported_active_levels = OpenmpExternalFunction( + "omp_get_supported_active_levels", types.int32() +) +omp_get_level = OpenmpExternalFunction("omp_get_level", types.int32()) +omp_get_active_level = OpenmpExternalFunction("omp_get_active_level", types.int32()) +omp_get_ancestor_thread_num = OpenmpExternalFunction( + "omp_get_ancestor_thread_num", types.int32(types.int32) +) +omp_get_team_size = OpenmpExternalFunction( + "omp_get_team_size", types.int32(types.int32) +) +omp_in_final = OpenmpExternalFunction("omp_in_finale", types.int32()) +omp_get_proc_bind = OpenmpExternalFunction("omp_get_proc_bind", types.int32()) +omp_get_num_places = OpenmpExternalFunction("omp_get_num_places", types.int32()) +omp_get_place_num_procs = OpenmpExternalFunction( + "omp_get_place_num_procs", types.int32(types.int32) +) +omp_get_place_num = OpenmpExternalFunction("omp_get_place_num", types.int32()) +omp_set_default_device = OpenmpExternalFunction( + "omp_set_default_device", types.int32(types.int32) +) +omp_get_default_device = OpenmpExternalFunction("omp_get_default_device", types.int32()) +omp_get_num_devices = OpenmpExternalFunction("omp_get_num_devices", types.int32()) +omp_get_device_num = OpenmpExternalFunction("omp_get_device_num", types.int32()) +omp_get_team_num = OpenmpExternalFunction("omp_get_team_num", types.int32()) +omp_get_num_teams = OpenmpExternalFunction("omp_get_num_teams", types.int32()) +omp_is_initial_device = OpenmpExternalFunction("omp_is_initial_device", types.int32()) +omp_get_initial_device = OpenmpExternalFunction("omp_get_initial_device", types.int32()) + + +def copy_np_array(x): + return np.copy(x) + + +# {meminfo, parent, ...} copy_np_array({meminfo, parent, ...}) + + +def create_native_np_copy(arg_typ): + # The cfunc wrapper of this function is what we need. 
+ copy_cres = compiler.compile_isolated(copy_np_array, (arg_typ,), arg_typ) + copy_name = getattr(copy_cres.fndesc, "llvm_cfunc_wrapper_name") + return (copy_name, copy_cres) + + +def omp_shared_array(size, dtype): + return np.empty(size, dtype=dtype) + + +@overload(omp_shared_array, target="cpu", inline="always", prefer_literal=True) +def omp_shared_array_overload(size, dtype): + assert isinstance(size, types.IntegerLiteral) + + def impl(size, dtype): + return np.empty(size, dtype=dtype) + + return impl + + +@overload(omp_shared_array, target="cuda", inline="always", prefer_literal=True) +def omp_shared_array_overload(size, dtype): + assert isinstance(size, types.IntegerLiteral) + + def impl(size, dtype): + return numba_cuda.shared.array(size, dtype) + + return impl diff --git a/numba/openmp/nrt/init.c b/numba/openmp/nrt/init.c new file mode 100644 index 000000000000..8c659aa3a6fe --- /dev/null +++ b/numba/openmp/nrt/init.c @@ -0,0 +1,3 @@ +extern void NRT_MemSys_init(); + +__attribute__((constructor)) static void PyOMP_NRT_Init() { NRT_MemSys_init(); } diff --git a/numba/openmp/pass/CGIntrinsicsOpenMP.cpp b/numba/openmp/pass/CGIntrinsicsOpenMP.cpp new file mode 100644 index 000000000000..861c059c9656 --- /dev/null +++ b/numba/openmp/pass/CGIntrinsicsOpenMP.cpp @@ -0,0 +1,3102 @@ +#include "llvm/Frontend/OpenMP/OMPConstants.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/CodeExtractor.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/IR/CFG.h" +#include +#include +#include + +#include "CGIntrinsicsOpenMP.h" +#include "DebugOpenMP.h" + +#define DEBUG_TYPE "intrinsics-openmp" + +using namespace llvm; +using namespace omp; +using namespace iomp; + +namespace { + +static CallInst *checkCreateCall(IRBuilderBase &Builder, FunctionCallee &Fn, + ArrayRef Args) { + auto PrintDebugOutput = [&]() { + dbgs() << "=== CGOpenMP checkCreateCall\n"; + dbgs() << "FunctionCallee: " << Fn.getCallee()->getName() << "\n"; + dbgs() << "FunctionCalee Type: " << *Fn.getFunctionType() << "\n"; + size_t ArgNo = 0; + for (Value *Arg : Args) { + dbgs() << "Arg " << ArgNo << ": " << *Arg << "\n"; + ArgNo++; + } + dbgs() << "=== End of CGOpenMP checkCreateCall\n"; + }; + DEBUG_ENABLE(PrintDebugOutput()); + + // Check number of parameters only for non-vararg functions. + if (!Fn.getFunctionType()->isVarArg()) + if (Args.size() != Fn.getFunctionType()->getNumParams()) { + DEBUG_ENABLE(dbgs() << "Mismatch argument size " << Args.size() << " != " + << Fn.getFunctionType()->getNumParams() << "\n"); + return nullptr; + } + + // Check argument types up to number params in the callee type to avoid + // checking varargs unknow types. 
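+  // Only the fixed parameters are verified; trailing vararg arguments pass
+  // through unchecked. On any mismatch the helper returns nullptr, and
+  // callers assert on the returned CallInst rather than emitting an
+  // ill-typed call.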
+ for (size_t I = 0; I < Fn.getFunctionType()->getNumParams(); ++I) + if (Args[I]->getType() != Fn.getFunctionType()->getParamType(I)) { + DEBUG_ENABLE(dbgs() << "Mismatch type at " << I << "\n"; + dbgs() << "Arg " << *Args[I] << "\n"; + dbgs() << "Expected type " + << *Fn.getFunctionType()->getParamType(I) << "\n";); + return nullptr; + } + + return Builder.CreateCall(Fn, Args); +} + +} // namespace + +void CGIntrinsicsOpenMP::setDeviceGlobalizedValues( + const ArrayRef GlobalizedValues) { + DeviceGlobalizedValues.clear(); + DeviceGlobalizedValues.insert(GlobalizedValues.begin(), + GlobalizedValues.end()); +} + +Value *CGIntrinsicsOpenMP::createScalarCast(Value *V, Type *DestTy) { + Value *Scalar = nullptr; + assert(V && "Expected non-null value"); + if (V->getType()->isPointerTy()) { + Value *Load = + OMPBuilder.Builder.CreateLoad(V->getType()->getPointerElementType(), V); + Scalar = OMPBuilder.Builder.CreateTruncOrBitCast(Load, DestTy); + } else { + Scalar = OMPBuilder.Builder.CreateTruncOrBitCast(V, DestTy); + } + + return Scalar; +} + +Function *CGIntrinsicsOpenMP::createOutlinedFunction( + DSAValueMapTy &DSAValueMap, ValueToValueMapTy *VMap, Function *OuterFn, + BasicBlock *StartBB, BasicBlock *EndBB, + SmallVectorImpl &CapturedVars, StringRef Suffix) { + SmallVector Privates; + SmallVector CapturedShared; + SmallVector CapturedFirstprivate; + SmallVector Reductions; + + InsertPointTy SavedIP = OMPBuilder.Builder.saveIP(); + + OpenMPIRBuilder::OutlineInfo OI; + OI.EntryBB = StartBB; + OI.ExitBB = EndBB; + SmallPtrSet BlockSet; + SmallVector BlockVector; + OI.collectBlocks(BlockSet, BlockVector); + + CodeExtractorAnalysisCache CEAC(*OuterFn); + CodeExtractor Extractor(BlockVector, /* DominatorTree */ nullptr, + /* AggregateArgs */ false, + /* BlockFrequencyInfo */ nullptr, + /* BranchProbabilityInfo */ nullptr, + /* AssumptionCache */ nullptr, + /* AllowVarArgs */ true, + /* AllowAlloca */ true, + /* Suffix */ "."); + + // Find inputs to, outputs from the code region. + BasicBlock *CommonExit = nullptr; + SetVector Inputs, Outputs, SinkingCands, HoistingCands; + Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit); + Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands); + + assert(Outputs.empty() && "Expected empty outputs from outlined region"); + assert(SinkingCands.empty() && "Expected empty alloca sinking candidates"); + + auto IsTempOrDefaultPrivate = [](Value *V) { + if(V->getName().startswith(".")) + return true; + + if(V->getName().startswith("excinfo")) + return true; + + if(V->getName() == "quot") + return true; + + if(V->getName() == "rem") + return true; + + return false; + }; + + // Scan Inputs and define any missing values as Privates. Those values must + // correspond to Numba-generated temporaries that should be privatized. + for (auto *V : Inputs) { + if (!DSAValueMap.count(V)) { + DEBUG_ENABLE(dbgs() << "Missing V " << *V + << " from DSAValueMap, will privatize\n"); + if (!IsTempOrDefaultPrivate(V)) + FATAL_ERROR("Expected Numba temporary value or default private, named starting " + "with . but got " + + V->getName().str()); + Privates.push_back(V); + continue; + } + + DSAType DSA = DSAValueMap[V].Type; + + DEBUG_ENABLE(dbgs() << "V " << *V << " from DSAValueMap Type " << DSA + << "\n"); + switch (DSA) { + case DSA_PRIVATE: + Privates.push_back(V); + break; + case DSA_FIRSTPRIVATE: + CapturedFirstprivate.push_back(V); + break; + case DSA_SHARED: + // Treat as shared to capture the pointer. 
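+      // Lastprivate and mapped values intentionally fall through to the
+      // shared capture: the outlined body only needs the captured pointer
+      // here, while copy-out (see the lastprivate handling in emitLoop) and
+      // data mapping are emitted elsewhere.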
+ case DSA_LASTPRIVATE: + case DSA_MAP_TO: + case DSA_MAP_FROM: + case DSA_MAP_TOFROM: + case DSA_MAP_STRUCT: + CapturedShared.push_back(V); + break; + case DSA_REDUCTION_ADD: + case DSA_REDUCTION_SUB: + case DSA_REDUCTION_MUL: + Reductions.push_back(V); + break; + default: + FATAL_ERROR("Unexpected DSA type"); + } + } + + SmallVector Params; + // tid + Params.push_back(OMPBuilder.Int32Ptr); + // bound_tid + Params.push_back(OMPBuilder.Int32Ptr); + for (auto *V : CapturedShared) + Params.push_back(V->getType()); + for (auto *V : CapturedFirstprivate) { + Type *VPtrElemTy = V->getType()->getPointerElementType(); + if (VPtrElemTy->isSingleValueType()) + // TODO: The OpenMP runtime expects and propagates arguments + // typed as Int64, thus we cast byval firstprivates to Int64. Using an + // aggregate to store arguments would avoid this peculiarity. + // Params.push_back(VPtrElemTy); + Params.push_back(OMPBuilder.Int64); + else + Params.push_back(V->getType()); + } + for (auto *V : Reductions) + Params.push_back(V->getType()); + + FunctionType *OutlinedFnTy = + FunctionType::get(OMPBuilder.Void, Params, /* isVarArgs */ false); + Function *OutlinedFn = + Function::Create(OutlinedFnTy, GlobalValue::InternalLinkage, + OuterFn->getName() + Suffix, M); + + // Name the parameters and add attributes. Shared are ordered before + // firstprivate in the parameter list. + OutlinedFn->arg_begin()->setName("global_tid"); + std::next(OutlinedFn->arg_begin())->setName("bound_tid"); + Function::arg_iterator AI = std::next(OutlinedFn->arg_begin(), 2); + int arg_no = 2; + for (auto *V : CapturedShared) { + AI->setName(V->getName() + ".shared"); + // Insert pointers in device globalized if they correspond to a device + // globalized pointer. + if (DeviceGlobalizedValues.contains(V)) + DeviceGlobalizedValues.insert(AI); + + OutlinedFn->addParamAttr(arg_no, Attribute::NonNull); + OutlinedFn->addParamAttr( + arg_no, Attribute::get(M.getContext(), Attribute::Dereferenceable, 8)); + ++AI; + ++arg_no; + } + for (auto *V : CapturedFirstprivate) { + Type *VPtrElemTy = V->getType()->getPointerElementType(); + if (VPtrElemTy->isSingleValueType()) { + AI->setName(V->getName() + ".firstprivate.byval"); + } else { + AI->setName(V->getName() + ".firstprivate"); + OutlinedFn->addParamAttr(arg_no, Attribute::NonNull); + OutlinedFn->addParamAttr( + arg_no, + Attribute::get(M.getContext(), Attribute::Dereferenceable, 8)); + } + ++AI; + ++arg_no; + } + for (auto *V : Reductions) { + AI->setName(V->getName() + ".red"); + OutlinedFn->addParamAttr(arg_no, Attribute::NonNull); + OutlinedFn->addParamAttr( + arg_no, Attribute::get(M.getContext(), Attribute::Dereferenceable, 8)); + ++AI; + ++arg_no; + } + + BasicBlock *OutlinedEntryBB = + BasicBlock::Create(M.getContext(), ".outlined.entry", OutlinedFn); + BasicBlock *OutlinedExitBB = + BasicBlock::Create(M.getContext(), ".outlined.exit", OutlinedFn); + + auto CreateAllocaAtEntry = [&](Type *Ty, Value *ArraySize = nullptr, + const Twine &Name = "") { + auto CurIP = OMPBuilder.Builder.saveIP(); + OMPBuilder.Builder.SetInsertPoint(OutlinedEntryBB, + OutlinedEntryBB->getFirstInsertionPt()); + Value *Alloca = OMPBuilder.Builder.CreateAlloca(Ty, nullptr, Name); + OMPBuilder.Builder.restoreIP(CurIP); + return Alloca; + }; + + OMPBuilder.Builder.SetInsertPoint(OutlinedEntryBB); + + OutlinedFn->addParamAttr(0, Attribute::NoAlias); + OutlinedFn->addParamAttr(1, Attribute::NoAlias); + OutlinedFn->addFnAttr(Attribute::NoUnwind); + OutlinedFn->addFnAttr(Attribute::NoRecurse); + + auto CollectUses = 
[&BlockSet](Value *V, SetVector &Uses) { + for (Use &U : V->uses()) + if (auto *UserI = dyn_cast(U.getUser())) + if (BlockSet.count(UserI->getParent())) + Uses.insert(&U); + }; + + auto ReplaceUses = [](SetVector &Uses, Value *ReplacementValue) { + for (Use *UPtr : Uses) + UPtr->set(ReplacementValue); + }; + + for (auto *V : Privates) { + SetVector Uses; + CollectUses(V, Uses); + + Type *VTy = V->getType()->getPointerElementType(); + Value *ReplacementValue = + CreateAllocaAtEntry(VTy, nullptr, V->getName() + ".private"); + // NOTE: We need to zero initialize privates because Numba reference + // counting breaks when those privates correspond to memory-managed + // data structures. + OMPBuilder.Builder.CreateStore(Constant::getNullValue(VTy), + ReplacementValue); + + if (VMap) + (*VMap)[V] = ReplacementValue; + + ReplaceUses(Uses, ReplacementValue); + } + + AI = std::next(OutlinedFn->arg_begin(), 2); + for (auto *V : CapturedShared) { + SetVector Uses; + CollectUses(V, Uses); + + Value *ReplacementValue = AI; + + if (VMap) + (*VMap)[V] = ReplacementValue; + + ReplaceUses(Uses, ReplacementValue); + ++AI; + } + + for (auto *V : CapturedFirstprivate) { + SetVector Uses; + CollectUses(V, Uses); + + Type *VPtrElemTy = V->getType()->getPointerElementType(); + Value *ReplacementValue = + CreateAllocaAtEntry(VPtrElemTy, nullptr, V->getName() + ".copy"); + if (VPtrElemTy->isSingleValueType()) { + // TODO: The OpenMP runtime expects and propagates arguments + // typed as Int64, thus we cast byval firstprivates to Int64. Using an + // aggregate to store arguments would avoid this peculiarity. + // OMPBuilder.Builder.CreateStore(AI, ReplacementValue); + Value *Alloca = CreateAllocaAtEntry(OMPBuilder.Int64); + + OMPBuilder.Builder.CreateStore(AI, Alloca); + Value *BitCast = OMPBuilder.Builder.CreateBitCast(Alloca, V->getType()); + Value *Load = OMPBuilder.Builder.CreateLoad(VPtrElemTy, BitCast); + OMPBuilder.Builder.CreateStore(Load, ReplacementValue); + } else { + Value *Load = OMPBuilder.Builder.CreateLoad(VPtrElemTy, AI, + V->getName() + ".reload"); + FunctionCallee CopyConstructor = DSAValueMap[V].CopyConstructor; + if (CopyConstructor) { + Value *Copy = OMPBuilder.Builder.CreateCall(CopyConstructor, {Load}); + OMPBuilder.Builder.CreateStore(Copy, ReplacementValue); + } else + OMPBuilder.Builder.CreateStore(Load, ReplacementValue); + } + + if (VMap) + (*VMap)[V] = ReplacementValue; + + ReplaceUses(Uses, ReplacementValue); + + ++AI; + } + + SmallVector ReductionInfos; + for (auto *V : Reductions) { + SetVector Uses; + CollectUses(V, Uses); + + if (VMap) + (*VMap)[V] = AI; + + InsertPointTy AllocaIP(OutlinedEntryBB, + OutlinedEntryBB->getFirstInsertionPt()); + + + Value *Priv = nullptr; + switch (DSAValueMap[V].Type) { + case DSA_REDUCTION_ADD: + Priv = CGReduction::emitInitAndAppendInfo( + OMPBuilder.Builder, AllocaIP, AI, ReductionInfos); + break; + case DSA_REDUCTION_SUB: + Priv = CGReduction::emitInitAndAppendInfo( + OMPBuilder.Builder, AllocaIP, AI, ReductionInfos); + break; + case DSA_REDUCTION_MUL: + Priv = CGReduction::emitInitAndAppendInfo( + OMPBuilder.Builder, AllocaIP, AI, ReductionInfos); + break; + default: + FATAL_ERROR("Unsupported reduction"); + } + + assert(Priv && "Expected non-null private reduction variable"); + ReplaceUses(Uses, Priv); + + ++AI; + } + + OMPBuilder.Builder.CreateBr(StartBB); + + EndBB->getTerminator()->setSuccessor(0, OutlinedExitBB); + OMPBuilder.Builder.SetInsertPoint(OutlinedExitBB); + OMPBuilder.Builder.CreateRetVoid(); + if (!ReductionInfos.empty()) + 
OMPBuilder.createReductions( + InsertPointTy(OutlinedExitBB, OutlinedExitBB->begin()), + InsertPointTy(OutlinedEntryBB, OutlinedEntryBB->begin()), + ReductionInfos); + + // Deterministic insertion of BBs, BlockVector needs ExitBB to move to the + // outlined function. + BlockVector.push_back(OI.ExitBB); + for (auto *BB : BlockVector) + BB->moveBefore(OutlinedExitBB); + + DEBUG_ENABLE(dbgs() << "=== Dump OutlinedFn\n" + << *OutlinedFn << "=== End of Dump OutlinedFn\n"); + + if (verifyFunction(*OutlinedFn, &errs())) + FATAL_ERROR("Verification of OutlinedFn failed!"); + + CapturedVars.append(CapturedShared); + CapturedVars.append(CapturedFirstprivate); + CapturedVars.append(Reductions); + + if (SavedIP.isSet()) + OMPBuilder.Builder.restoreIP(SavedIP); + + return OutlinedFn; +} + +CGIntrinsicsOpenMP::CGIntrinsicsOpenMP(Module &M) : OMPBuilder(M), M(M) { + OMPBuilder.initialize(); + + TgtOffloadEntryTy = StructType::create({OMPBuilder.Int8Ptr, + OMPBuilder.Int8Ptr, OMPBuilder.SizeTy, + OMPBuilder.Int32, OMPBuilder.Int32}, + "struct.__tgt_offload_entry"); + // OpenMP device runtime expects this global that controls debugging, default + // to 0 (no debugging enabled). + if (isOpenMPDeviceRuntime()) + OMPBuilder.createGlobalFlag(0, "__omp_rtl_debug_kind"); +} + +void CGIntrinsicsOpenMP::emitOMPParallel( + DSAValueMapTy &DSAValueMap, ValueToValueMapTy *VMap, const DebugLoc &DL, + Function *Fn, BasicBlock *BBEntry, BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB, FinalizeCallbackTy FiniCB, + ParRegionInfoStruct &ParRegionInfo) { + if (isOpenMPDeviceRuntime()) + emitOMPParallelDeviceRuntime(DSAValueMap, VMap, DL, Fn, BBEntry, StartBB, + EndBB, AfterBB, FiniCB, ParRegionInfo); + else + emitOMPParallelHostRuntime(DSAValueMap, VMap, DL, Fn, BBEntry, StartBB, + EndBB, AfterBB, FiniCB, ParRegionInfo); +} + +void CGIntrinsicsOpenMP::emitOMPParallelHostRuntime( + DSAValueMapTy &DSAValueMap, ValueToValueMapTy *VMap, const DebugLoc &DL, + Function *Fn, BasicBlock *BBEntry, BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB, FinalizeCallbackTy FiniCB, + ParRegionInfoStruct &ParRegionInfo) { + + // Set the insertion location at the end of the BBEntry. + BBEntry->getTerminator()->eraseFromParent(); + OMPBuilder.Builder.SetInsertPoint(BBEntry); + OMPBuilder.Builder.CreateBr(AfterBB); + + OMPBuilder.Builder.SetInsertPoint(BBEntry->getTerminator()); + OpenMPIRBuilder::LocationDescription Loc(OMPBuilder.Builder.saveIP(), DL); + OMPBuilder.Builder.SetCurrentDebugLocation(Loc.DL); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize); + Value *ThreadID = OMPBuilder.getOrCreateThreadID(Ident); + + SmallVector CapturedVars; + Function *OutlinedFn = + createOutlinedFunction(DSAValueMap, VMap, Fn, StartBB, EndBB, + CapturedVars, ".omp_outlined_parallel"); + + auto EmitForkCall = [&](InsertPointTy InsertIP) { + OMPBuilder.Builder.restoreIP(InsertIP); + + auto *OutlinedFnCast = OMPBuilder.Builder.CreateBitCast( + OutlinedFn, OMPBuilder.ParallelTaskPtr); + FunctionCallee ForkCall = + OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_fork_call); + SmallVector ForkArgs; + ForkArgs.append({Ident, OMPBuilder.Builder.getInt32(CapturedVars.size()), + OutlinedFnCast}); + + for (size_t Idx = 0; Idx < CapturedVars.size(); ++Idx) { + // Pass firstprivate scalar by value. 
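+      // The scalar is funneled through a temporary Int64 slot because
+      // __kmpc_fork_call forwards its trailing arguments as Int64 values
+      // (see the matching TODO in createOutlinedFunction); the outlined
+      // function reverses the cast when it reloads its .firstprivate.byval
+      // parameter.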
+ if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE && + CapturedVars[Idx] + ->getType() + ->getPointerElementType() + ->isSingleValueType()) { + // TODO: check type conversions. + Value *Alloca = OMPBuilder.Builder.CreateAlloca(OMPBuilder.Int64); + Type *VPtrElemTy = + CapturedVars[Idx]->getType()->getPointerElementType(); + Value *LoadV = + OMPBuilder.Builder.CreateLoad(VPtrElemTy, CapturedVars[Idx]); + Value *BitCast = OMPBuilder.Builder.CreateBitCast( + Alloca, CapturedVars[Idx]->getType()); + OMPBuilder.Builder.CreateStore(LoadV, BitCast); + Value *Load = OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, Alloca); + ForkArgs.push_back(Load); + continue; + } + + ForkArgs.push_back(CapturedVars[Idx]); + } + + OMPBuilder.Builder.CreateCall(ForkCall, ForkArgs); + }; + + auto EmitSerializedParallel = [&](InsertPointTy InsertIP) { + OMPBuilder.Builder.restoreIP(InsertIP); + + // Build calls __kmpc_serialized_parallel(&Ident, GTid); + Value *Args[] = {Ident, ThreadID}; + OMPBuilder.Builder.CreateCall(OMPBuilder.getOrCreateRuntimeFunctionPtr( + OMPRTL___kmpc_serialized_parallel), + Args); + + Value *ZeroAddr = OMPBuilder.Builder.CreateAlloca(OMPBuilder.Int32, nullptr, + ".zero.addr"); + OMPBuilder.Builder.CreateStore(Constant::getNullValue(OMPBuilder.Int32), + ZeroAddr); + // Zero for thread id, bound tid. + SmallVector OutlinedArgs = {ZeroAddr, ZeroAddr}; + for (size_t Idx = 0; Idx < CapturedVars.size(); ++Idx) { + // Pass firstprivate scalar by value. + if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE && + CapturedVars[Idx] + ->getType() + ->getPointerElementType() + ->isSingleValueType()) { + // TODO: check type conversions. + Type *VPtrElemTy = + CapturedVars[Idx]->getType()->getPointerElementType(); + Value *Load = + OMPBuilder.Builder.CreateLoad(VPtrElemTy, CapturedVars[Idx]); + OutlinedArgs.push_back(Load); + continue; + } + + OutlinedArgs.push_back(CapturedVars[Idx]); + } + + OMPBuilder.Builder.CreateCall(OutlinedFn, OutlinedArgs); + + // __kmpc_end_serialized_parallel(&Ident, GTid); + OMPBuilder.Builder.CreateCall(OMPBuilder.getOrCreateRuntimeFunctionPtr( + OMPRTL___kmpc_end_serialized_parallel), + Args); + }; + + if (ParRegionInfo.NumThreads) { + Value *NumThreads = + createScalarCast(ParRegionInfo.NumThreads, OMPBuilder.Int32); + assert(NumThreads && "Expected non-null num threads"); + Value *Args[] = {Ident, ThreadID, NumThreads}; + OMPBuilder.Builder.CreateCall(OMPBuilder.getOrCreateRuntimeFunctionPtr( + OMPRTL___kmpc_push_num_threads), + Args); + } + + if (ParRegionInfo.IfCondition) { + Instruction *ThenTI = nullptr, *ElseTI = nullptr; + Value *IfConditionEval = nullptr; + + if (ParRegionInfo.IfCondition->getType()->isFloatingPointTy()) + IfConditionEval = OMPBuilder.Builder.CreateFCmpUNE( + ParRegionInfo.IfCondition, + ConstantFP::get(ParRegionInfo.IfCondition->getType(), 0)); + else + IfConditionEval = OMPBuilder.Builder.CreateICmpNE( + ParRegionInfo.IfCondition, + ConstantInt::get(ParRegionInfo.IfCondition->getType(), 0)); + + assert(IfConditionEval && "Expected non-null condition"); + SplitBlockAndInsertIfThenElse(IfConditionEval, BBEntry->getTerminator(), + &ThenTI, &ElseTI); + + assert(ThenTI && "Expected non-null ThenTI"); + assert(ElseTI && "Expected non-null ElseTI"); + EmitForkCall(InsertPointTy(ThenTI->getParent(), ThenTI->getIterator())); + EmitSerializedParallel( + InsertPointTy(ElseTI->getParent(), ElseTI->getIterator())); + } else { + EmitForkCall( + InsertPointTy(BBEntry, BBEntry->getTerminator()->getIterator())); + } + + DEBUG_ENABLE(dbgs() << 
"=== Dump OuterFn\n" + << *Fn << "=== End of Dump OuterFn\n"); + + if (verifyFunction(*Fn, &errs())) + FATAL_ERROR("Verification of OuterFn failed!"); +} + +#if 0 +void CGIntrinsicsOpenMP::emitOMPParallelHostRuntimeOMPIRBuilder( + DSAValueMapTy &DSAValueMap, ValueToValueMapTy *VMap, + const DebugLoc &DL, Function *Fn, BasicBlock *BBEntry, BasicBlock *StartBB, + BasicBlock *EndBB, BasicBlock *AfterBB, FinalizeCallbackTy FiniCB, + ParRegionInfoStruct &ParRegionInfo) { + InsertPointTy BodyIP, BodyAllocaIP; + SmallVector ReductionInfos; + + auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, + Value &Orig, Value &Inner, + Value *&ReplacementValue) -> InsertPointTy { + auto It = DSAValueMap.find(&Orig); + DEBUG_ENABLE(dbgs() << "DSAValueMap for Orig " << Orig << " Inner " << Inner); + if (It != DSAValueMap.end()) + DEBUG_ENABLE(dbgs() << It->second.Type); + else + DEBUG_ENABLE(dbgs() << " (null)!"); + DEBUG_ENABLE(dbgs() << "\n "); + + if (It == DSAValueMap.end()) { + DSAValueMap[&Orig] = DSA_PRIVATE; + DEBUG_ENABLE(dbgs() << "Missing V " << Orig << " from DSAValueMap, will privatize\n"); + assert(Orig.getName().startswith(".") && + "Expected Numba temporary value, named starting with ."); + } + assert(It != DSAValueMap.end() && "Expected Value in DSAValueMap"); + + DSAType DSA = It->second.Type; + FunctionCallee CopyConstructor = It->second.CopyConstructor; + + if (DSA == DSA_PRIVATE) { + OMPBuilder.Builder.restoreIP(AllocaIP); + Type *VTy = Inner.getType()->getPointerElementType(); + ReplacementValue = OMPBuilder.Builder.CreateAlloca( + VTy, /*ArraySize */ nullptr, Inner.getName()); + // NOTE: We need to zero-out privates because Numba reference + // counting breaks when those privates correspond to memory-managed + // data structures. + OMPBuilder.Builder.CreateStore(Constant::getNullValue(VTy), + ReplacementValue); + DEBUG_ENABLE(dbgs() << "Privatizing Inner " << Inner << " -> to -> " + << *ReplacementValue << "\n"); + if (VMap) + (*VMap)[&Orig] = ReplacementValue; + } else if (DSA == DSA_FIRSTPRIVATE) { + OMPBuilder.Builder.restoreIP(AllocaIP); + Type *VTy = Inner.getType()->getPointerElementType(); + ReplacementValue = OMPBuilder.Builder.CreateAlloca( + VTy, /*ArraySize */ nullptr, Orig.getName() + ".copy"); + OMPBuilder.Builder.restoreIP(CodeGenIP); + Value *InnerLoad = + OMPBuilder.Builder.CreateLoad(VTy, &Inner, Orig.getName() + ".reload"); + if (CopyConstructor) { + Value *Copy = + OMPBuilder.Builder.CreateCall(CopyConstructor, {InnerLoad}); + OMPBuilder.Builder.CreateStore(Copy, ReplacementValue); + } else + OMPBuilder.Builder.CreateStore(InnerLoad, ReplacementValue); + + DEBUG_ENABLE(dbgs() << "Firstprivatizing Inner " << Inner << " -> to -> " + << *ReplacementValue << "\n"); + if (VMap) + (*VMap)[&Orig] = ReplacementValue; + } else if (DSA == DSA_REDUCTION_ADD) { + OMPBuilder.Builder.restoreIP(AllocaIP); + Type *VTy = Inner.getType()->getPointerElementType(); + Value *V = OMPBuilder.Builder.CreateAlloca(VTy, /* ArraySize */ nullptr, + Orig.getName() + ".red.priv"); + ReplacementValue = V; + if (VMap) + (*VMap)[&Orig] = ReplacementValue; + + OMPBuilder.Builder.restoreIP(CodeGenIP); + // Store idempotent value based on operation and type. 
+ // TODO: use emitInitAndAppendInfo in CGReduction + if (VTy->isIntegerTy()) + OMPBuilder.Builder.CreateStore(ConstantInt::get(VTy, 0), V); + else if (VTy->isFloatTy() || VTy->isDoubleTy()) + OMPBuilder.Builder.CreateStore(ConstantFP::get(VTy, 0.0), V); + else + assert(false && + "Unsupported type to init with idempotent reduction value"); + + ReductionInfos.push_back({VTy, &Orig, V, CGReduction::sumReduction, + CGReduction::sumAtomicReduction}); + + return OMPBuilder.Builder.saveIP(); + } else { + ReplacementValue = &Inner; + DEBUG_ENABLE(dbgs() << "Shared Inner " << Inner << " -> to -> " + << *ReplacementValue << "\n"); + } + + return CodeGenIP; + }; + + auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, + BasicBlock &ContinuationIP) { + BasicBlock *CGStartBB = CodeGenIP.getBlock(); + BasicBlock *CGEndBB = SplitBlock(CGStartBB, &*CodeGenIP.getPoint()); + assert(StartBB != nullptr && "StartBB should not be null"); + CGStartBB->getTerminator()->setSuccessor(0, StartBB); + assert(EndBB != nullptr && "EndBB should not be null"); + EndBB->getTerminator()->setSuccessor(0, CGEndBB); + + BodyIP = InsertPointTy(CGEndBB, CGEndBB->getFirstInsertionPt()); + BodyAllocaIP = AllocaIP; + }; + + IRBuilder<>::InsertPoint AllocaIP(&Fn->getEntryBlock(), + Fn->getEntryBlock().getFirstInsertionPt()); + + // Set the insertion location at the end of the BBEntry. + BBEntry->getTerminator()->eraseFromParent(); + + Value *IfConditionEval = nullptr; + if (ParRegionInfo.IfCondition) { + OMPBuilder.Builder.SetInsertPoint(BBEntry); + if (ParRegionInfo.IfCondition->getType()->isFloatingPointTy()) + IfConditionEval = OMPBuilder.Builder.CreateFCmpUNE( + ParRegionInfo.IfCondition, + ConstantFP::get(ParRegionInfo.IfCondition->getType(), 0)); + else + IfConditionEval = OMPBuilder.Builder.CreateICmpNE( + ParRegionInfo.IfCondition, + ConstantInt::get(ParRegionInfo.IfCondition->getType(), 0)); + } + + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(BBEntry, BBEntry->end()), DL); + + Value *NumThreads = nullptr; + // It is allowed to have a nullptr NumThreads, createParallel handles that. + if (ParRegionInfo.NumThreads) + NumThreads = createScalarCast(ParRegionInfo.NumThreads, OMPBuilder.Int32); + // TODO: support cancellable, binding. + InsertPointTy AfterIP = OMPBuilder.createParallel( + Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, + /* IfCondition */ IfConditionEval, + /* NumThreads */ NumThreads, OMP_PROC_BIND_default, + /* IsCancellable */ false); + + if (!ReductionInfos.empty()) + OMPBuilder.createReductions(BodyIP, BodyAllocaIP, ReductionInfos); + + BranchInst::Create(AfterBB, AfterIP.getBlock()); + + DEBUG_ENABLE(dbgs() << "=== Before Fn\n" << *Fn << "=== End of Before Fn\n"); + OMPBuilder.finalize(Fn); + DEBUG_ENABLE(dbgs() << "=== Finalize Fn\n" + << *Fn << "=== End of Finalize Fn\n"); +} +#endif + +void CGIntrinsicsOpenMP::emitOMPParallelDeviceRuntime( + DSAValueMapTy &DSAValueMap, ValueToValueMapTy *VMap, const DebugLoc &DL, + Function *Fn, BasicBlock *BBEntry, BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB, FinalizeCallbackTy FiniCB, + ParRegionInfoStruct &ParRegionInfo) { + // Extract parallel region + SmallVector CapturedVars; + Function *OutlinedFn = + createOutlinedFunction(DSAValueMap, VMap, Fn, StartBB, EndBB, + CapturedVars, ".omp_outlined_parallel"); + + // Create wrapper for worker threads + SmallVector Params; + // parallelism level, unused? 
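+  // The wrapper takes (int16 parallelism level, int32 thread id), the
+  // signature the device runtime invokes for worker threads from
+  // __kmpc_parallel_51; it recovers the captured variables through
+  // __kmpc_get_shared_variables and forwards them to the real outlined
+  // function.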
+ Params.push_back(OMPBuilder.Int16); + // tid + Params.push_back(OMPBuilder.Int32); + + FunctionType *OutlinedWrapperFnTy = + FunctionType::get(OMPBuilder.Void, Params, /* isVarArgs */ false); + Function *OutlinedWrapperFn = + Function::Create(OutlinedWrapperFnTy, GlobalValue::InternalLinkage, + OutlinedFn->getName() + ".wrapper", M); + BasicBlock *OutlinedWrapperEntryBB = + BasicBlock::Create(M.getContext(), "entry", OutlinedWrapperFn); + + // Code generation for the outlined wrapper function. + OMPBuilder.Builder.SetInsertPoint(OutlinedWrapperEntryBB); + + constexpr const int TIDArgNo = 1; + AllocaInst *TIDAddr = + OMPBuilder.Builder.CreateAlloca(OMPBuilder.Int32, nullptr, ".tid.addr"); + AllocaInst *ZeroAddr = + OMPBuilder.Builder.CreateAlloca(OMPBuilder.Int32, nullptr, ".zero.addr"); + AllocaInst *GlobalArgs = OMPBuilder.Builder.CreateAlloca( + OMPBuilder.Int8PtrPtr, nullptr, "global_args"); + + OMPBuilder.Builder.CreateStore(OutlinedWrapperFn->getArg(TIDArgNo), TIDAddr); + OMPBuilder.Builder.CreateStore(Constant::getNullValue(OMPBuilder.Int32), + ZeroAddr); + FunctionCallee KmpcGetSharedVariables = OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_get_shared_variables); + OMPBuilder.Builder.CreateCall(KmpcGetSharedVariables, {GlobalArgs}); + + SmallVector OutlinedFnArgs; + OutlinedFnArgs.push_back(TIDAddr); + OutlinedFnArgs.push_back(ZeroAddr); + + for (size_t Idx = 0; Idx < CapturedVars.size(); ++Idx) { + Value *LoadGlobalArgs = + OMPBuilder.Builder.CreateLoad(OMPBuilder.Int8PtrPtr, GlobalArgs); + Value *GEP = OMPBuilder.Builder.CreateConstInBoundsGEP1_64( + OMPBuilder.Int8Ptr, LoadGlobalArgs, Idx); + + // Pass firstprivate scalar by value. + if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE && + CapturedVars[Idx] + ->getType() + ->getPointerElementType() + ->isSingleValueType()) { + Type *VPtrElemTy = CapturedVars[Idx]->getType()->getPointerElementType(); + Value *Bitcast = + OMPBuilder.Builder.CreateBitCast(GEP, CapturedVars[Idx]->getType()); + Value *Load = OMPBuilder.Builder.CreateLoad(VPtrElemTy, Bitcast); + // TODO: Runtime expects values in Int64 type, fix with arguments in + // struct. 
+ AllocaInst *TmpInt64 = OMPBuilder.Builder.CreateAlloca( + OMPBuilder.Int64, nullptr, + CapturedVars[Idx]->getName() + "fpriv.byval"); + Value *Cast = OMPBuilder.Builder.CreateBitCast( + TmpInt64, CapturedVars[Idx]->getType()); + OMPBuilder.Builder.CreateStore(Load, Cast); + Value *ConvLoad = + OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, TmpInt64); + OutlinedFnArgs.push_back(ConvLoad); + + continue; + } + + Value *Bitcast = OMPBuilder.Builder.CreateBitCast( + GEP, CapturedVars[Idx]->getType()->getPointerTo()); + Value *Load = + OMPBuilder.Builder.CreateLoad(CapturedVars[Idx]->getType(), Bitcast); + OutlinedFnArgs.push_back(Load); + } + + FunctionCallee OutlinedFnCallee(OutlinedFn->getFunctionType(), OutlinedFn); + + auto *OutlinedCI = + checkCreateCall(OMPBuilder.Builder, OutlinedFnCallee, OutlinedFnArgs); + assert(OutlinedCI && "Expected valid call"); + OMPBuilder.Builder.CreateRetVoid(); + + if (verifyFunction(*OutlinedWrapperFn, &errs())) + FATAL_ERROR("Verification of OutlinedWrapperFn failed!"); + + DEBUG_ENABLE(dbgs() << "=== Dump OutlinedWrapper\n" + << *OutlinedWrapperFn + << "=== End of Dump OutlinedWrapper\n"); + + // Setup the call to kmpc_parallel_51 + BBEntry->getTerminator()->eraseFromParent(); + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(BBEntry, BBEntry->end()), DL); + + uint32_t SrcLocStrSize; + Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize); + OMPBuilder.Builder.restoreIP(Loc.IP); + OMPBuilder.Builder.SetCurrentDebugLocation(Loc.DL); + + // Create the address table of the global data. + // The number of outlined arguments without global_tid, bound_tid. + Value *NumCapturedArgs = + ConstantInt::get(OMPBuilder.SizeTy, CapturedVars.size()); + Type *CapturedVarsAddrsTy = + ArrayType::get(OMPBuilder.Int8Ptr, CapturedVars.size()); + + // TODO: Re-think allocas, move to start of caller. If the caller is outlined + // in an outer OpenMP region, dot naming ensures captured_var_addrs is a + // private value, since it's only used for setting up the call to + // kmpc_parallel_51. + auto PrevIP = OMPBuilder.Builder.saveIP(); + InsertPointTy AllocaIP(&Fn->getEntryBlock(), + Fn->getEntryBlock().getFirstInsertionPt()); + OMPBuilder.Builder.restoreIP(AllocaIP); + Value *CapturedVarsAddrs = OMPBuilder.Builder.CreateAlloca( + CapturedVarsAddrsTy, nullptr, ".captured_var_addrs"); + OMPBuilder.Builder.restoreIP(PrevIP); + + SmallVector GlobalAllocas; + for (size_t Idx = 0; Idx < CapturedVars.size(); ++Idx) { + DEBUG_ENABLE(dbgs() << "CapturedVar " << Idx << " " << *CapturedVars[Idx] + << "\n"); + Value *GEP = OMPBuilder.Builder.CreateConstInBoundsGEP2_64( + CapturedVarsAddrsTy, CapturedVarsAddrs, 0, Idx); + + // Pass firstprivate scalar by value. + if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE && + CapturedVars[Idx] + ->getType() + ->getPointerElementType() + ->isSingleValueType()) { + // TODO: check type conversions. + Value *BitCast = OMPBuilder.Builder.CreateBitCast(CapturedVars[Idx], + OMPBuilder.Int64Ptr); + Value *Load = OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, BitCast); + Value *IntToPtr = + OMPBuilder.Builder.CreateIntToPtr(Load, OMPBuilder.Int8Ptr); + OMPBuilder.Builder.CreateStore(IntToPtr, GEP); + + continue; + } + + // Allocate from global memory if the pointer is not globalized (not in the + // global address space). 
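+    // Non-globalized captures are copied into device shared memory with
+    // __kmpc_alloc_shared so worker threads can address them; the matching
+    // __kmpc_free_shared calls are emitted once __kmpc_parallel_51 returns.
+    // Already-globalized pointers are passed through as-is.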
+ FunctionCallee KmpcAllocShared = + OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_alloc_shared); + assert(CapturedVars[Idx]->getType()->isPointerTy() && + "Expected pointer type"); + + if (DeviceGlobalizedValues.contains(CapturedVars[Idx])) { + Value *Bitcast = OMPBuilder.Builder.CreateBitCast(CapturedVars[Idx], + OMPBuilder.Int8Ptr); + OMPBuilder.Builder.CreateStore(Bitcast, GEP); + } else { + Type *AllocTy = CapturedVars[Idx]->getType()->getPointerElementType(); + Value *Size = ConstantInt::get( + OMPBuilder.SizeTy, M.getDataLayout().getTypeAllocSize(AllocTy)); + CallBase *GlobalAlloc = + OMPBuilder.Builder.CreateCall(KmpcAllocShared, {Size}); + GlobalAlloc->addRetAttr( + llvm::Attribute::get(M.getContext(), llvm::Attribute::Alignment, 16)); + GlobalAllocas.push_back(GlobalAlloc); + // TODO: this assumes the type is trivally copyable, use the copy + // constructor for more complex types. + OMPBuilder.Builder.CreateMemCpy( + GlobalAlloc, GlobalAlloc->getPointerAlignment(M.getDataLayout()), + CapturedVars[Idx], + CapturedVars[Idx]->getPointerAlignment(M.getDataLayout()), Size); + + OMPBuilder.Builder.CreateStore(GlobalAlloc, GEP); + } + } + + Value *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize); + Value *ThreadID = OMPBuilder.getOrCreateThreadID(Ident); + + Value *IfCondition = ParRegionInfo.IfCondition; + Value *NumThreads = ParRegionInfo.NumThreads; + if (!IfCondition) + // Set condition to 1 (execute in parallel) if not set. + IfCondition = ConstantInt::get(OMPBuilder.Int32, 1); + + if (!NumThreads) + NumThreads = ConstantInt::get(OMPBuilder.Int32, -1); + else + NumThreads = + OMPBuilder.Builder.CreateTruncOrBitCast(NumThreads, OMPBuilder.Int32); + + assert(NumThreads && "Expected non-null NumThreads"); + + FunctionCallee KmpcParallel51 = + OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_parallel_51); + + // Set proc_bind to -1 by default as it is unused. 
+ assert(Ident && "Expected non-null Ident"); + assert(ThreadID && "Expected non-null ThreadID"); + assert(IfCondition && "Expected non-null IfCondition"); + assert(NumThreads && "Expected non-null NumThreads"); + assert(OutlinedWrapperFn && "Expected non-null OutlinedWrapperFn"); + assert(CapturedVarsAddrs && "Expected non-null CapturedVarsAddrs"); + assert(NumCapturedArgs && "Expected non-null NumCapturedArgs"); + + Value *ProcBind = OMPBuilder.Builder.getInt32(-1); + Value *OutlinedFnBitcast = + OMPBuilder.Builder.CreateBitCast(OutlinedFn, OMPBuilder.VoidPtr); + Value *OutlinedWrapperFnBitcast = + OMPBuilder.Builder.CreateBitCast(OutlinedWrapperFn, OMPBuilder.VoidPtr); + Value *CapturedVarAddrsBitcast = OMPBuilder.Builder.CreateBitCast( + CapturedVarsAddrs, OMPBuilder.VoidPtrPtr); + + SmallVector Args = {Ident, + ThreadID, + IfCondition, + NumThreads, + ProcBind, + OutlinedFnBitcast, + OutlinedWrapperFnBitcast, + CapturedVarAddrsBitcast, + NumCapturedArgs}; + + auto *CallKmpcParallel51 = + checkCreateCall(OMPBuilder.Builder, KmpcParallel51, Args); + assert(CallKmpcParallel51 && + "Expected non-null call instr from code generation"); + + FunctionCallee KmpcFreeShared = + OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_free_shared); + for (Value *GA : GlobalAllocas) { + Type *AllocTy = GA->getType()->getPointerElementType(); + Value *Size = ConstantInt::get(OMPBuilder.SizeTy, + M.getDataLayout().getTypeAllocSize(AllocTy)); + auto *CI = checkCreateCall(OMPBuilder.Builder, KmpcFreeShared, {GA, Size}); + assert(CI && "Expected valid call"); + } + + OMPBuilder.Builder.CreateBr(AfterBB); + + DEBUG_ENABLE(dbgs() << "=== Dump OuterFn\n" + << *Fn << "=== End of Dump OuterFn\n"); + + if (verifyFunction(*Fn, &errs())) + FATAL_ERROR("Verification of OuterFn failed!"); +} + +FunctionCallee CGIntrinsicsOpenMP::getKmpcForStaticInit(Type *Ty) { + DEBUG_ENABLE(dbgs() << "Type " << *Ty << "\n"); + unsigned Bitwidth = Ty->getIntegerBitWidth(); + DEBUG_ENABLE(dbgs() << "Bitwidth " << Bitwidth << "\n"); + if (Bitwidth == 32) + return OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_for_static_init_4u); + if (Bitwidth == 64) + return OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_for_static_init_8u); + + FATAL_ERROR("unknown OpenMP loop iterator bitwidth"); +} + +FunctionCallee CGIntrinsicsOpenMP::getKmpcDistributeStaticInit(Type *Ty) { + DEBUG_ENABLE(dbgs() << "Type " << *Ty << "\n"); + unsigned Bitwidth = Ty->getIntegerBitWidth(); + DEBUG_ENABLE(dbgs() << "Bitwidth " << Bitwidth << "\n"); + if (Bitwidth == 32) + return OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_distribute_static_init_4u); + if (Bitwidth == 64) + return OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_distribute_static_init_8u); + + FATAL_ERROR("unknown OpenMP loop iterator bitwidth"); +} + +void CGIntrinsicsOpenMP::emitLoop(DSAValueMapTy &DSAValueMap, + OMPLoopInfoStruct &OMPLoopInfo, + BasicBlock *StartBB, BasicBlock *ExitBB, + bool IsStandalone, bool IsDistribute, + bool IsDistributeParallelFor, + OMPDistributeInfoStruct *OMPDistributeInfo) { + DEBUG_ENABLE(dbgs() << "OMPLoopInfo.IV " << *OMPLoopInfo.IV << "\n"); + DEBUG_ENABLE(dbgs() << "OMPLoopInfo.UB " << *OMPLoopInfo.UB << "\n"); + assert(OMPLoopInfo.IV && "Expected non-null IV"); + assert(OMPLoopInfo.UB && "Expected non-null UB"); + + assert(static_cast(OMPLoopInfo.Sched) && + "Expected non-zero loop schedule"); + + BasicBlock *PreHeader = StartBB; + PreHeader->setName("omp.for.preheader"); + BasicBlock *Header = 
PreHeader->getUniqueSuccessor(); + assert(Header && "Expected unique successor header"); + Header->setName("omp.for.cond"); + BasicBlock *Exit = ExitBB; + Exit->setName("omp.for.exit"); + assert(Header && "Expected unique successor from PreHeader to Header"); + DEBUG_ENABLE(dbgs() << "=== PreHeader\n" + << *PreHeader << "=== End of PreHeader\n"); + DEBUG_ENABLE(dbgs() << "=== Header\n" << *Header << "=== End of Header\n"); + assert(Header->getTerminator()->getNumSuccessors() == 2 && + "Expected 2 successors (loopbody, exit)"); + BasicBlock *HeaderSuccBBs[2] = {Header->getTerminator()->getSuccessor(0), + Header->getTerminator()->getSuccessor(1)}; + BasicBlock *LoopBody = + (HeaderSuccBBs[0] == Exit ? HeaderSuccBBs[1] : HeaderSuccBBs[0]); + assert(LoopBody && "Expected non-null loop body basic block\n"); + + assert(Header->hasNPredecessors(2) && + "Expected exactly 2 predecessors to loop header (preheader, latch)"); + BasicBlock *HeaderPredBBs[2] = {*predecessors(Header).begin(), + *std::next(predecessors(Header).begin(), 1)}; + BasicBlock *Latch = + (HeaderPredBBs[0] == PreHeader ? HeaderPredBBs[1] : HeaderPredBBs[0]); + Latch->setName("omp.for.inc"); + assert(Latch && "Expected latch basicblock"); + + auto ClearBlockInstructions = [](BasicBlock *BB) { + // Remove all instructions in the BB, iterate backwards to avoid + // dangling uses for safe deletion. The BB becomes malformed and + // requires a terminator added. + while (!BB->empty()) { + Instruction &I = BB->back(); + assert(I.getNumUses() == 0 && "Expected no uses to delete"); + I.eraseFromParent(); + } + }; + // Clear Latch, Header. + ClearBlockInstructions(Latch); + ClearBlockInstructions(Header); + + DEBUG_ENABLE(dbgs() << "=== Exit\n" << *Exit << "=== End of Exit\n"); + + Type *IVTy = OMPLoopInfo.IV->getType()->getPointerElementType(); + SmallVector ReductionInfos; + + FunctionCallee LoopStaticInit = ((IsDistribute && isOpenMPDeviceRuntime()) + ? getKmpcDistributeStaticInit(IVTy) + : getKmpcForStaticInit(IVTy)); + FunctionCallee LoopStaticFini = + ((IsDistribute && isOpenMPDeviceRuntime()) + ? OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_distribute_static_fini) + : OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_for_static_fini)); + + const DebugLoc DL = PreHeader->getTerminator()->getDebugLoc(); + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(PreHeader, PreHeader->getTerminator()->getIterator()), DL); + + uint32_t SrcLocStrSize; + Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *SrcLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize); + Value *ThreadNum = nullptr; + + // Create allocas for static init values. + // TODO: Move the AllocaIP to the start of the containing function. + InsertPointTy AllocaIP(PreHeader, PreHeader->getFirstInsertionPt()); + Type *I32Type = Type::getInt32Ty(M.getContext()); + OMPBuilder.Builder.restoreIP(AllocaIP); + Value *PLastIter = + OMPBuilder.Builder.CreateAlloca(I32Type, nullptr, "omp.for.is_last"); + // Value *PStart = OMPBuilder.Builder.CreateAlloca(IVTy, nullptr, + // "omp.for.start"); + Value *PLowerBound = + OMPBuilder.Builder.CreateAlloca(IVTy, nullptr, "omp.for.lb"); + Value *PStride = + OMPBuilder.Builder.CreateAlloca(IVTy, nullptr, "omp.for.stride"); + Value *PUpperBound = + OMPBuilder.Builder.CreateAlloca(IVTy, nullptr, "omp.for.ub"); + + // Store distribute LB, UB to be used by combined loop constructs. 
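+  // For combined "distribute parallel for" constructs these saved bounds
+  // presumably seed the inner worksharing loop, so each team only iterates
+  // over the chunk assigned to it by the distribute schedule.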
+ if (IsDistribute) + if (OMPDistributeInfo) { + OMPDistributeInfo->LB = PLowerBound; + OMPDistributeInfo->UB = PUpperBound; + } + + // Create BasicBlock structure. + BasicBlock *MinUBBlock = + PreHeader->splitBasicBlock(PreHeader->getTerminator(), "omp.for.min.ub"); + BasicBlock *CapUBBlock = MinUBBlock->splitBasicBlock( + MinUBBlock->getTerminator(), "omp.for.cap.ub"); + BasicBlock *SetupLoopBlock = + CapUBBlock->splitBasicBlock(CapUBBlock->getTerminator(), "omp.for.setup"); + BasicBlock *ForEndBB = + ExitBB->splitBasicBlockBefore(ExitBB->getFirstInsertionPt()); + ForEndBB->setName("omp.for.end"); + + BasicBlock *DispatchCondBB = nullptr; + BasicBlock *DispatchIncBB = nullptr; + BasicBlock *DispatchEndBB = nullptr; + if (OMPLoopInfo.Sched == OMPScheduleType::StaticChunked || + OMPLoopInfo.Sched == OMPScheduleType::DistributeChunked) { + DispatchCondBB = SetupLoopBlock->splitBasicBlock( + SetupLoopBlock->getTerminator(), "omp.dispatch.cond"); + DispatchIncBB = ExitBB->splitBasicBlockBefore(ExitBB->getFirstInsertionPt(), + "omp.dispatch.inc"); + DispatchEndBB = ExitBB->splitBasicBlockBefore(ExitBB->getFirstInsertionPt(), + "omp.dispatch.end"); + } + + Constant *Zero_I32 = ConstantInt::get(I32Type, 0); + Constant *One = ConstantInt::get(IVTy, 1); + + // Extend PreHeader + { + OMPBuilder.Builder.SetInsertPoint(PreHeader->getTerminator()); + // Store the initial normalized upper bound to PUpperBound. + Value *LoadUB = OMPBuilder.Builder.CreateLoad(IVTy, OMPLoopInfo.UB); + OMPBuilder.Builder.CreateStore(LoadUB, PUpperBound); + + Value *LoadLB = OMPBuilder.Builder.CreateLoad(IVTy, OMPLoopInfo.LB); + OMPBuilder.Builder.CreateStore(LoadLB, PLowerBound); + OMPBuilder.Builder.CreateStore(One, PStride); + OMPBuilder.Builder.CreateStore(Zero_I32, PLastIter); + + // If Chunk is not specified (nullptr), default to one, complying with + // the OpenMP specification. + if (!OMPLoopInfo.Chunk) + OMPLoopInfo.Chunk = One; + Value *ChunkCast = OMPBuilder.Builder.CreateIntCast(OMPLoopInfo.Chunk, IVTy, + /*isSigned*/ false); + + Constant *SchedulingType = + ConstantInt::get(I32Type, static_cast(OMPLoopInfo.Sched)); + + ThreadNum = OMPBuilder.getOrCreateThreadID(SrcLoc); + DEBUG_ENABLE(dbgs() << "=== SchedulingType " << *SchedulingType << "\n"); + DEBUG_ENABLE(dbgs() << "=== PLowerBound " << *PLowerBound << "\n"); + DEBUG_ENABLE(dbgs() << "=== PUpperBound " << *PUpperBound << "\n"); + DEBUG_ENABLE(dbgs() << "=== PStride " << *PStride << "\n"); + DEBUG_ENABLE(dbgs() << "=== Incr " << *One << "\n"); + DEBUG_ENABLE(dbgs() << "=== Schedule " + << static_cast(OMPLoopInfo.Sched) << "\n"); + DEBUG_ENABLE(dbgs() << "=== Chunk " << *ChunkCast << "\n"); + OMPBuilder.Builder.CreateCall( + LoopStaticInit, {SrcLoc, ThreadNum, SchedulingType, PLastIter, + PLowerBound, PUpperBound, PStride, One, ChunkCast}); + } + + // Create MinUBBlock. 
+ { + OMPBuilder.Builder.SetInsertPoint(MinUBBlock, + MinUBBlock->getFirstInsertionPt()); + auto *LoadUB = OMPBuilder.Builder.CreateLoad(IVTy, PUpperBound); + auto *LoadGlobalUB = OMPBuilder.Builder.CreateLoad(IVTy, OMPLoopInfo.UB); + auto *Cond = OMPBuilder.Builder.CreateICmpUGT(LoadUB, LoadGlobalUB); + OMPBuilder.Builder.CreateCondBr(Cond, CapUBBlock, SetupLoopBlock); + MinUBBlock->getTerminator()->eraseFromParent(); + } + + // Create CapUBBlock + { + OMPBuilder.Builder.SetInsertPoint(CapUBBlock, + CapUBBlock->getFirstInsertionPt()); + auto *LoadGlobalUB = OMPBuilder.Builder.CreateLoad(IVTy, OMPLoopInfo.UB); + OMPBuilder.Builder.CreateStore(LoadGlobalUB, PUpperBound); + } + + // Create SetupLoopBlock + { + OMPBuilder.Builder.SetInsertPoint(SetupLoopBlock, + SetupLoopBlock->getFirstInsertionPt()); + Value *LoadLB = OMPBuilder.Builder.CreateLoad(IVTy, PLowerBound); + OMPBuilder.Builder.CreateStore(LoadLB, OMPLoopInfo.IV); + } + + // Create Header + { + auto SaveIP = OMPBuilder.Builder.saveIP(); + OMPBuilder.Builder.SetInsertPoint(Header); + auto *LoadIV = OMPBuilder.Builder.CreateLoad(IVTy, OMPLoopInfo.IV); + auto *LoadUB = OMPBuilder.Builder.CreateLoad(IVTy, PUpperBound); + auto *Cond = OMPBuilder.Builder.CreateICmpSLE(LoadIV, LoadUB); + OMPBuilder.Builder.CreateCondBr(Cond, LoopBody, ForEndBB); + OMPBuilder.Builder.restoreIP(SaveIP); + } + + // Create Latch. + { + auto SaveIP = OMPBuilder.Builder.saveIP(); + OMPBuilder.Builder.SetInsertPoint(Latch); + Value *LoadIV = OMPBuilder.Builder.CreateLoad(IVTy, OMPLoopInfo.IV); + if (IsDistribute && IsDistributeParallelFor) { + Value *LoadStride = OMPBuilder.Builder.CreateLoad(IVTy, PStride); + Value *Inc = OMPBuilder.Builder.CreateAdd(LoadIV, LoadStride); + OMPBuilder.Builder.CreateStore(Inc, OMPLoopInfo.IV); + } else { + Value *Inc = OMPBuilder.Builder.CreateAdd(LoadIV, One); + OMPBuilder.Builder.CreateStore(Inc, OMPLoopInfo.IV); + } + + // If it's a combined "distribute parallel for" with static/distribute + // chunked then fall through to the strided dispatch increment. 
+ if (IsDistributeParallelFor && + ((OMPLoopInfo.Sched == OMPScheduleType::StaticChunked) || + (OMPLoopInfo.Sched == OMPScheduleType::DistributeChunked))) + OMPBuilder.Builder.CreateBr(DispatchIncBB); + else + OMPBuilder.Builder.CreateBr(Header); + + OMPBuilder.Builder.restoreIP(SaveIP); + } + + assert(ThreadNum && "Expected non-null threadnum"); + if (OMPLoopInfo.Sched == OMPScheduleType::Static || + OMPLoopInfo.Sched == OMPScheduleType::Distribute) { + OMPBuilder.Builder.SetInsertPoint(ForEndBB, + ForEndBB->getFirstInsertionPt()); + OMPBuilder.Builder.CreateCall(LoopStaticFini, {SrcLoc, ThreadNum}); + } else if (OMPLoopInfo.Sched == OMPScheduleType::StaticChunked || + OMPLoopInfo.Sched == OMPScheduleType::DistributeChunked) { + assert(DispatchCondBB && "Expected non-null dispatch cond bb"); + assert(DispatchIncBB && "Expected non-null dispatch inc bb"); + assert(DispatchEndBB && "Expected non-null dispatch end bb"); + // Create DispatchCond + { + auto SaveIP = OMPBuilder.Builder.saveIP(); + DispatchCondBB->getTerminator()->eraseFromParent(); + OMPBuilder.Builder.SetInsertPoint(DispatchCondBB); + auto *LoadLB = OMPBuilder.Builder.CreateLoad(IVTy, PLowerBound); + OMPBuilder.Builder.CreateStore(LoadLB, OMPLoopInfo.IV); + auto *LoadIV = OMPBuilder.Builder.CreateLoad(IVTy, OMPLoopInfo.IV); + auto *LoadUB = OMPBuilder.Builder.CreateLoad(IVTy, PUpperBound); + auto *Cond = OMPBuilder.Builder.CreateICmpSLE(LoadIV, LoadUB); + OMPBuilder.Builder.CreateCondBr(Cond, Header, DispatchEndBB); + OMPBuilder.Builder.restoreIP(SaveIP); + } + // Create DispatchIncBB. + { + auto SaveIP = OMPBuilder.Builder.saveIP(); + DispatchIncBB->getTerminator()->eraseFromParent(); + OMPBuilder.Builder.SetInsertPoint(DispatchIncBB); + auto *LoadLB = OMPBuilder.Builder.CreateLoad(IVTy, PLowerBound); + auto *LoadStride = OMPBuilder.Builder.CreateLoad(IVTy, PStride); + auto *LBPlusStride = OMPBuilder.Builder.CreateAdd(LoadLB, LoadStride); + OMPBuilder.Builder.CreateStore(LBPlusStride, PLowerBound); + + auto *LoadUB = OMPBuilder.Builder.CreateLoad(IVTy, PUpperBound); + auto *UBPlusStride = OMPBuilder.Builder.CreateAdd(LoadUB, LoadStride); + OMPBuilder.Builder.CreateStore(UBPlusStride, PUpperBound); + + // OMPBuilder.Builder.CreateBr(DispatchCondBB); + OMPBuilder.Builder.CreateBr(MinUBBlock); + OMPBuilder.Builder.restoreIP(SaveIP); + } + // Create ForEndBB + { + ForEndBB->getTerminator()->eraseFromParent(); + OMPBuilder.Builder.SetInsertPoint(ForEndBB); + OMPBuilder.Builder.CreateBr(DispatchIncBB); + } + + // Create DispatchEndBB + { + OMPBuilder.Builder.SetInsertPoint(DispatchEndBB, + DispatchEndBB->getFirstInsertionPt()); + OMPBuilder.Builder.CreateCall(LoopStaticFini, {SrcLoc, ThreadNum}); + } + } else { + FATAL_ERROR("Unknown loop schedule type"); + } + + OpenMPIRBuilder::OutlineInfo OI; + OI.EntryBB = PreHeader; + OI.ExitBB = Exit; + SmallPtrSet BlockSet; + SmallVector BlockVector; + OI.collectBlocks(BlockSet, BlockVector); + + // TODO: De-duplicate privatization code. + auto PrivatizeWithReductions = [&]() { + auto CurrentIP = OMPBuilder.Builder.saveIP(); + for (auto &It : DSAValueMap) { + Value *Orig = It.first; + DSAType DSA = It.second.Type; + FunctionCallee CopyConstructor = It.second.CopyConstructor; + Value *ReplacementValue = nullptr; + Type *VTy = Orig->getType()->getPointerElementType(); + + if (DSA == DSA_SHARED) + continue; + + // Lastprivates are handled later, need elaborate codegen. 
+ if (DSA == DSA_LASTPRIVATE) + continue; + + // Store previous uses to set them to the ReplacementValue after + // privatization codegen. + SetVector Uses; + for (Use &U : Orig->uses()) + if (auto *UserI = dyn_cast(U.getUser())) + if (BlockSet.count(UserI->getParent())) + Uses.insert(&U); + + OMPBuilder.Builder.restoreIP(AllocaIP); + if (DSA == DSA_PRIVATE) { + ReplacementValue = OMPBuilder.Builder.CreateAlloca( + VTy, /*ArraySize */ nullptr, Orig->getName() + ".for.priv"); + OMPBuilder.Builder.CreateStore(Constant::getNullValue(VTy), + ReplacementValue); + } else if (DSA == DSA_FIRSTPRIVATE) { + Value *V = OMPBuilder.Builder.CreateLoad( + VTy, Orig, Orig->getName() + ".for.firstpriv.reload"); + ReplacementValue = OMPBuilder.Builder.CreateAlloca( + VTy, /*ArraySize */ nullptr, + Orig->getName() + ".for.firstpriv.copy"); + if (CopyConstructor) { + Value *Copy = OMPBuilder.Builder.CreateCall(CopyConstructor, {V}); + OMPBuilder.Builder.CreateStore(Copy, ReplacementValue); + } else + OMPBuilder.Builder.CreateStore(V, ReplacementValue); + } else if (DSA == DSA_REDUCTION_ADD) { + ReplacementValue = + CGReduction::emitInitAndAppendInfo( + OMPBuilder.Builder, OMPBuilder.Builder.saveIP(), Orig, + ReductionInfos); + } else if (DSA == DSA_REDUCTION_SUB) { + ReplacementValue = + CGReduction::emitInitAndAppendInfo( + OMPBuilder.Builder, OMPBuilder.Builder.saveIP(), Orig, + ReductionInfos); + } else + FATAL_ERROR("Unsupported privatization"); + + assert(ReplacementValue && "Expected non-null ReplacementValue"); + + for (Use *UPtr : Uses) + UPtr->set(ReplacementValue); + } + + OMPBuilder.Builder.restoreIP(CurrentIP); + }; + + auto EmitLastPrivate = [&](InsertPointTy CodeGenIP) { + auto ShouldReplace = [&BlockSet](Use &U) { + if (auto *UserI = dyn_cast(U.getUser())) + if (BlockSet.count(UserI->getParent())) + return true; + + return false; + }; + + for (auto &It : DSAValueMap) { + Value *Orig = It.first; + DSAType DSA = It.second.Type; + + if (DSA != DSA_LASTPRIVATE) + continue; + + FunctionCallee CopyConstructor = It.second.CopyConstructor; + Value *ReplacementValue = nullptr; + Type *VTy = Orig->getType()->getPointerElementType(); + + OMPBuilder.Builder.restoreIP(AllocaIP); + ReplacementValue = OMPBuilder.Builder.CreateAlloca( + VTy, /*ArraySize */ nullptr, Orig->getName() + ".for.lastpriv"); + OMPBuilder.Builder.CreateStore(Constant::getNullValue(VTy), + ReplacementValue); + Orig->replaceUsesWithIf(ReplacementValue, ShouldReplace); + + BasicBlock *InsertBB = CodeGenIP.getBlock(); + + BasicBlock *LastPrivCond = + SplitBlock(InsertBB, InsertBB->getTerminator()); + LastPrivCond->setName("omp.for.lastpriv.cond"); + BasicBlock *LastPrivThen = + SplitBlock(LastPrivCond, LastPrivCond->getTerminator()); + LastPrivThen->setName("omp.for.lastpriv.then"); + BasicBlock *LastPrivEnd = + SplitBlock(LastPrivThen, LastPrivThen->getTerminator()); + LastPrivEnd->setName("omp.for.lastpriv.end"); + OMPBuilder.Builder.SetInsertPoint(LastPrivThen->getTerminator()); + Value *Load = OMPBuilder.Builder.CreateLoad(VTy, ReplacementValue); + if (CopyConstructor) { + Value *Copy = OMPBuilder.Builder.CreateCall(CopyConstructor, {Load}); + OMPBuilder.Builder.CreateStore(Copy, Orig); + } else + OMPBuilder.Builder.CreateStore(Load, Orig); + + LastPrivCond->getTerminator()->eraseFromParent(); + OMPBuilder.Builder.SetInsertPoint(LastPrivCond); + Value *PLastIterLoad = + OMPBuilder.Builder.CreateLoad(OMPBuilder.Int32, PLastIter); + Value *Cond = OMPBuilder.Builder.CreateICmpNE( + PLastIterLoad, ConstantInt::get(OMPBuilder.Int32, 0)); + 
OMPBuilder.Builder.CreateCondBr(Cond, LastPrivThen, LastPrivEnd); + } + }; + + BasicBlock *FiniBB = + (OMPLoopInfo.Sched == OMPScheduleType::Static) ? ForEndBB : DispatchEndBB; + EmitLastPrivate(InsertPointTy(FiniBB, FiniBB->end())); + + // Emit reductions, barrier, privatize if standalone. + if (IsStandalone) { + PrivatizeWithReductions(); + if (!ReductionInfos.empty()) { + OMPBuilder.Builder.SetInsertPoint(ForEndBB->getTerminator()); + OMPBuilder.createReductions(OpenMPIRBuilder::LocationDescription( + OMPBuilder.Builder.saveIP(), Loc.DL), + AllocaIP, ReductionInfos); + } + + OMPBuilder.Builder.SetInsertPoint(ExitBB->getTerminator()); + OMPBuilder.createBarrier(OpenMPIRBuilder::LocationDescription( + OMPBuilder.Builder.saveIP(), Loc.DL), + omp::Directive::OMPD_for, + /* ForceSimpleCall */ false, + /* CheckCancelFlag */ false); + } + + if (verifyFunction(*PreHeader->getParent(), &errs())) + FATAL_ERROR("Verification of omp for lowering failed!"); +} + +void CGIntrinsicsOpenMP::emitOMPFor(DSAValueMapTy &DSAValueMap, + OMPLoopInfoStruct &OMPLoopInfo, + BasicBlock *StartBB, BasicBlock *ExitBB, + bool IsStandalone, + bool IsDistributeParallelFor) { + // Set default loop schedule. + if (static_cast(OMPLoopInfo.Sched) == 0) + OMPLoopInfo.Sched = + (isOpenMPDeviceRuntime() ? OMPScheduleType::StaticChunked + : OMPScheduleType::Static); + + emitLoop(DSAValueMap, OMPLoopInfo, StartBB, ExitBB, IsStandalone, false, + IsDistributeParallelFor); +} + +void CGIntrinsicsOpenMP::emitOMPTask(DSAValueMapTy &DSAValueMap, Function *Fn, + BasicBlock *BBEntry, BasicBlock *StartBB, + BasicBlock *EndBB, BasicBlock *AfterBB) { + // Define types. + // ************** START TYPE DEFINITION ************** // + enum { + TiedFlag = 0x1, + FinalFlag = 0x2, + DestructorsFlag = 0x8, + PriorityFlag = 0x20, + DetachableFlag = 0x40, + }; + + // This is a union for priority/firstprivate destructors, use the + // routine entry pointer to allocate space since it is larger than + // Int32Ty for priority, see kmp.h. Unused for now. + StructType *KmpCmplrdataTy = + StructType::create({OMPBuilder.TaskRoutineEntryPtr}); + StructType *KmpTaskTTy = + StructType::create({OMPBuilder.VoidPtr, OMPBuilder.TaskRoutineEntryPtr, + OMPBuilder.Int32, KmpCmplrdataTy, KmpCmplrdataTy}, + "struct.kmp_task_t"); + Type *KmpTaskTPtrTy = KmpTaskTTy->getPointerTo(); + + FunctionCallee KmpcOmpTaskAlloc = + OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_omp_task_alloc); + SmallVector SharedsTy; + SmallVector PrivatesTy; + for (auto &It : DSAValueMap) { + Value *OriginalValue = It.first; + if (It.second.Type == DSA_SHARED) + SharedsTy.push_back(OriginalValue->getType()); + else if (It.second.Type == DSA_PRIVATE || + It.second.Type == DSA_FIRSTPRIVATE) { + assert(isa(OriginalValue->getType()) && + "Expected private, firstprivate value with pointer type"); + // Store a copy of the value, thus get the pointer element type. 
+ PrivatesTy.push_back(OriginalValue->getType()->getPointerElementType()); + } else + FATAL_ERROR("Unknown DSA type"); + } + + StructType *KmpSharedsTTy = nullptr; + if (SharedsTy.empty()) + KmpSharedsTTy = StructType::create(M.getContext(), "struct.kmp_shareds"); + else + KmpSharedsTTy = StructType::create(SharedsTy, "struct.kmp_shareds"); + assert(KmpSharedsTTy && "Expected non-null KmpSharedsTTy"); + Type *KmpSharedsTPtrTy = KmpSharedsTTy->getPointerTo(); + StructType *KmpPrivatesTTy = + StructType::create(PrivatesTy, "struct.kmp_privates"); + Type *KmpPrivatesTPtrTy = KmpPrivatesTTy->getPointerTo(); + StructType *KmpTaskTWithPrivatesTy = StructType::create( + {KmpTaskTTy, KmpPrivatesTTy}, "struct.kmp_task_t_with_privates"); + Type *KmpTaskTWithPrivatesPtrTy = KmpTaskTWithPrivatesTy->getPointerTo(); + + // Declare the task entry function. + Function *TaskEntryFn = Function::Create( + OMPBuilder.TaskRoutineEntry, GlobalValue::InternalLinkage, + Fn->getAddressSpace(), Fn->getName() + ".omp_task_entry", &M); + // Name arguments. + TaskEntryFn->getArg(0)->setName(".global_tid"); + TaskEntryFn->getArg(1)->setName(".task_t_with_privates"); + + // Declare the task outlined function. + FunctionType *TaskOutlinedFnTy = + FunctionType::get(OMPBuilder.Void, + {OMPBuilder.Int32, OMPBuilder.Int32Ptr, + OMPBuilder.VoidPtr, KmpTaskTPtrTy, KmpSharedsTPtrTy}, + /*isVarArg=*/false); + Function *TaskOutlinedFn = Function::Create( + TaskOutlinedFnTy, GlobalValue::InternalLinkage, Fn->getAddressSpace(), + Fn->getName() + ".omp_task_outlined", &M); + TaskOutlinedFn->getArg(0)->setName(".global_tid"); + TaskOutlinedFn->getArg(1)->setName(".part_id"); + TaskOutlinedFn->getArg(2)->setName(".privates"); + TaskOutlinedFn->getArg(3)->setName(".task.data"); + TaskOutlinedFn->getArg(4)->setName(".shareds"); + + // ************** END TYPE DEFINITION ************** // + + // Emit kmpc_omp_task_alloc, kmpc_omp_task + { + const DebugLoc DL = BBEntry->getTerminator()->getDebugLoc(); + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(BBEntry, BBEntry->getTerminator()->getIterator()), DL); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *SrcLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize); + // TODO: parse clauses, for now fix flags to tied + unsigned TaskFlags = TiedFlag; + Value *SizeofShareds = nullptr; + if (KmpSharedsTTy->isEmptyTy()) + SizeofShareds = OMPBuilder.Builder.getInt64(0); + else + SizeofShareds = OMPBuilder.Builder.getInt64( + M.getDataLayout().getTypeAllocSize(KmpSharedsTTy)); + Value *SizeofKmpTaskTWithPrivates = OMPBuilder.Builder.getInt64( + M.getDataLayout().getTypeAllocSize(KmpTaskTWithPrivatesTy)); + OMPBuilder.Builder.SetInsertPoint(BBEntry, BBEntry->getFirstInsertionPt()); + Value *ThreadNum = OMPBuilder.getOrCreateThreadID(SrcLoc); + Value *KmpTaskTWithPrivatesVoidPtr = OMPBuilder.Builder.CreateCall( + KmpcOmpTaskAlloc, + {SrcLoc, ThreadNum, OMPBuilder.Builder.getInt32(TaskFlags), + SizeofKmpTaskTWithPrivates, SizeofShareds, TaskEntryFn}, + ".task.data"); + Value *KmpTaskTWithPrivates = OMPBuilder.Builder.CreateBitCast( + KmpTaskTWithPrivatesVoidPtr, KmpTaskTWithPrivatesPtrTy); + + const unsigned KmpTaskTIdx = 0; + const unsigned KmpSharedsIdx = 0; + Value *KmpTaskT = OMPBuilder.Builder.CreateStructGEP( + KmpTaskTWithPrivatesTy, KmpTaskTWithPrivates, KmpTaskTIdx); + Value *KmpSharedsGEP = + OMPBuilder.Builder.CreateStructGEP(KmpTaskTTy, KmpTaskT, KmpSharedsIdx); + Value *KmpSharedsVoidPtr = + 
OMPBuilder.Builder.CreateLoad(OMPBuilder.VoidPtr, KmpSharedsGEP); + Value *KmpShareds = + OMPBuilder.Builder.CreateBitCast(KmpSharedsVoidPtr, KmpSharedsTPtrTy); + const unsigned KmpPrivatesIdx = 1; + Value *KmpPrivates = OMPBuilder.Builder.CreateStructGEP( + KmpTaskTWithPrivatesTy, KmpTaskTWithPrivates, KmpPrivatesIdx); + + // Store shareds by reference, firstprivates by value, in task data + // storage. + unsigned SharedsGEPIdx = 0; + unsigned PrivatesGEPIdx = 0; + for (auto &It : DSAValueMap) { + Value *OriginalValue = It.first; + DSAType DSA = It.second.Type; + FunctionCallee CopyConstructor = It.second.CopyConstructor; + if (DSA == DSA_SHARED) { + Value *SharedGEP = OMPBuilder.Builder.CreateStructGEP( + KmpSharedsTTy, KmpShareds, SharedsGEPIdx, + OriginalValue->getName() + ".task.shared"); + OMPBuilder.Builder.CreateStore(OriginalValue, SharedGEP); + ++SharedsGEPIdx; + } else if (DSA == DSA_FIRSTPRIVATE) { + Value *FirstprivateGEP = OMPBuilder.Builder.CreateStructGEP( + KmpPrivatesTTy, KmpPrivates, PrivatesGEPIdx, + OriginalValue->getName() + ".task.firstprivate"); + Value *Load = OMPBuilder.Builder.CreateLoad( + OriginalValue->getType()->getPointerElementType(), OriginalValue); + if (CopyConstructor) { + Value *Copy = OMPBuilder.Builder.CreateCall(CopyConstructor, {Load}); + OMPBuilder.Builder.CreateStore(Copy, FirstprivateGEP); + } else + OMPBuilder.Builder.CreateStore(Load, FirstprivateGEP); + ++PrivatesGEPIdx; + } else if (DSA == DSA_PRIVATE) + ++PrivatesGEPIdx; + } + + FunctionCallee KmpcOmpTask = + OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_omp_task); + OMPBuilder.Builder.CreateCall( + KmpcOmpTask, {SrcLoc, ThreadNum, KmpTaskTWithPrivatesVoidPtr}); + } + + // Emit task entry function. + { + BasicBlock *TaskEntryBB = + BasicBlock::Create(M.getContext(), "entry", TaskEntryFn); + OMPBuilder.Builder.SetInsertPoint(TaskEntryBB); + const unsigned TaskTIdx = 0; + const unsigned PrivatesIdx = 1; + const unsigned SharedsIdx = 0; + Value *GTId = TaskEntryFn->getArg(0); + Value *KmpTaskTWithPrivates = OMPBuilder.Builder.CreateBitCast( + TaskEntryFn->getArg(1), KmpTaskTWithPrivatesPtrTy); + Value *KmpTaskT = OMPBuilder.Builder.CreateStructGEP( + KmpTaskTWithPrivatesTy, KmpTaskTWithPrivates, TaskTIdx, ".task.data"); + Value *SharedsGEP = OMPBuilder.Builder.CreateStructGEP( + KmpTaskTTy, KmpTaskT, SharedsIdx, ".shareds.gep"); + Value *SharedsVoidPtr = OMPBuilder.Builder.CreateLoad( + OMPBuilder.VoidPtr, SharedsGEP, ".shareds.void.ptr"); + Value *Shareds = OMPBuilder.Builder.CreateBitCast( + SharedsVoidPtr, KmpSharedsTPtrTy, ".shareds"); + + Value *Privates = nullptr; + if (PrivatesTy.empty()) { + Privates = Constant::getNullValue(OMPBuilder.VoidPtr); + } else { + Value *PrivatesTyped = OMPBuilder.Builder.CreateStructGEP( + KmpTaskTWithPrivatesTy, KmpTaskTWithPrivates, PrivatesIdx, + ".privates"); + Privates = OMPBuilder.Builder.CreateBitCast( + PrivatesTyped, OMPBuilder.VoidPtr, ".privates.void.ptr"); + } + assert(Privates && "Expected non-null privates"); + + const unsigned PartIdIdx = 2; + Value *PartId = OMPBuilder.Builder.CreateStructGEP(KmpTaskTTy, KmpTaskT, + PartIdIdx, ".part_id"); + OMPBuilder.Builder.CreateCall(TaskOutlinedFnTy, TaskOutlinedFn, + {GTId, PartId, Privates, KmpTaskT, Shareds}); + OMPBuilder.Builder.CreateRet(OMPBuilder.Builder.getInt32(0)); + } + + // Emit TaskOutlinedFn code. 
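+  // Reading aid: the entry thunk above has the kmp_routine_entry_t shape
+  //   i32 .omp_task_entry(i32 %gtid, kmp_task_t_with_privates* %task)
+  // expected by __kmpc_omp_task_alloc, and simply forwards to the body
+  //   void .omp_task_outlined(i32 %gtid, i32* %part_id, i8* %privates,
+  //                           kmp_task_t* %task_data, kmp_shareds* %shareds)
+  // whose blocks are filled in next.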
+ { + OpenMPIRBuilder::OutlineInfo OI; + OI.EntryBB = StartBB; + OI.ExitBB = EndBB; + SmallPtrSet OutlinedBlockSet; + SmallVector OutlinedBlockVector; + OI.collectBlocks(OutlinedBlockSet, OutlinedBlockVector); + BasicBlock *TaskOutlinedEntryBB = + BasicBlock::Create(M.getContext(), "entry", TaskOutlinedFn); + BasicBlock *TaskOutlinedExitBB = + BasicBlock::Create(M.getContext(), "exit", TaskOutlinedFn); + for (BasicBlock *BB : OutlinedBlockVector) + BB->moveBefore(TaskOutlinedExitBB); + // Explicitly move EndBB to the outlined functions, since OutlineInfo + // does not contain it in the OutlinedBlockVector. + EndBB->moveBefore(TaskOutlinedExitBB); + EndBB->getTerminator()->setSuccessor(0, TaskOutlinedExitBB); + + OMPBuilder.Builder.SetInsertPoint(TaskOutlinedEntryBB); + const unsigned KmpPrivatesArgNo = 2; + const unsigned KmpSharedsArgNo = 4; + Value *KmpPrivatesArgVoidPtr = TaskOutlinedFn->getArg(KmpPrivatesArgNo); + Value *KmpPrivatesArg = OMPBuilder.Builder.CreateBitCast( + KmpPrivatesArgVoidPtr, KmpPrivatesTPtrTy); + Value *KmpSharedsArg = TaskOutlinedFn->getArg(KmpSharedsArgNo); + + // Replace shareds, privates, firstprivates to refer to task data + // storage. + unsigned SharedsGEPIdx = 0; + unsigned PrivatesGEPIdx = 0; + for (auto &It : DSAValueMap) { + Value *OriginalValue = It.first; + Value *ReplacementValue = nullptr; + if (It.second.Type == DSA_SHARED) { + Value *SharedGEP = OMPBuilder.Builder.CreateStructGEP( + KmpSharedsTTy, KmpSharedsArg, SharedsGEPIdx, + OriginalValue->getName() + ".task.shared.gep"); + ReplacementValue = OMPBuilder.Builder.CreateLoad( + OriginalValue->getType(), SharedGEP, + OriginalValue->getName() + ".task.shared"); + ++SharedsGEPIdx; + } else if (It.second.Type == DSA_PRIVATE) { + Value *PrivateGEP = OMPBuilder.Builder.CreateStructGEP( + KmpPrivatesTTy, KmpPrivatesArg, PrivatesGEPIdx, + OriginalValue->getName() + ".task.private.gep"); + ReplacementValue = PrivateGEP; + // NOTE: Zero initialize private to avoid issue with Numba ref counting. 
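+        // (Assumed rationale for the note above: Numba's NRT may decref
+        // whatever sits in a private slot before its first assignment, so a
+        // zeroed slot keeps that decref a harmless no-op.)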
+ OMPBuilder.Builder.CreateStore( + Constant::getNullValue( + OriginalValue->getType()->getPointerElementType()), + ReplacementValue); + ++PrivatesGEPIdx; + } else if (It.second.Type == DSA_FIRSTPRIVATE) { + Value *FirstprivateGEP = OMPBuilder.Builder.CreateStructGEP( + KmpPrivatesTTy, KmpPrivatesArg, PrivatesGEPIdx, + OriginalValue->getName() + ".task.firstprivate.gep"); + ReplacementValue = FirstprivateGEP; + ++PrivatesGEPIdx; + } else + FATAL_ERROR("Unknown DSA type"); + + assert(ReplacementValue && "Expected non-null ReplacementValue"); + SmallVector Users(OriginalValue->users()); + for (User *U : Users) + if (Instruction *I = dyn_cast(U)) + if (OutlinedBlockSet.contains(I->getParent())) + I->replaceUsesOfWith(OriginalValue, ReplacementValue); + } + + OMPBuilder.Builder.CreateBr(StartBB); + OMPBuilder.Builder.SetInsertPoint(TaskOutlinedExitBB); + OMPBuilder.Builder.CreateRetVoid(); + BBEntry->getTerminator()->setSuccessor(0, AfterBB); + } +} + +void CGIntrinsicsOpenMP::emitOMPOffloadingEntry(const Twine &DevFuncName, + Value *EntryPtr, + Constant *&OMPOffloadEntry) { + + Constant *DevFuncNameConstant = + ConstantDataArray::getString(M.getContext(), DevFuncName.str()); + auto *GV = new GlobalVariable( + M, DevFuncNameConstant->getType(), + /* isConstant */ true, GlobalValue::InternalLinkage, DevFuncNameConstant, + ".omp_offloading.entry_name", nullptr, GlobalVariable::NotThreadLocal, + /* AddressSpace */ 0); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + + Constant *EntryConst = dyn_cast(EntryPtr); + assert(EntryConst && "Expected constant entry pointer"); + OMPOffloadEntry = ConstantStruct::get( + TgtOffloadEntryTy, + ConstantExpr::getPointerBitCastOrAddrSpaceCast(EntryConst, + OMPBuilder.VoidPtr), + ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, OMPBuilder.Int8Ptr), + ConstantInt::get(OMPBuilder.SizeTy, 0), + ConstantInt::get(OMPBuilder.Int32, 0), + ConstantInt::get(OMPBuilder.Int32, 0)); + auto *OMPOffloadEntryGV = new GlobalVariable( + M, TgtOffloadEntryTy, + /* isConstant */ true, GlobalValue::WeakAnyLinkage, OMPOffloadEntry, + ".omp_offloading.entry." + DevFuncName); + OMPOffloadEntryGV->setSection("omp_offloading_entries"); + OMPOffloadEntryGV->setAlignment(Align(1)); +} + +void CGIntrinsicsOpenMP::emitOMPOffloadingMappings( + InsertPointTy AllocaIP, DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap, + OffloadingMappingArgsTy &OffloadingMappingArgs, bool IsTargetRegion) { + + struct MapperInfo { + Value *BasePtr; + Value *Ptr; + Value *Size; + }; + + SmallVector MapperInfos; + // SmallVector OffloadSizes; + SmallVector OffloadMapTypes; + SmallVector OffloadMapNames; + + if (DSAValueMap.empty()) { + OffloadingMappingArgs.Size = 0; + OffloadingMappingArgs.BasePtrs = + Constant::getNullValue(OMPBuilder.VoidPtrPtr); + OffloadingMappingArgs.Ptrs = Constant::getNullValue(OMPBuilder.VoidPtrPtr); + OffloadingMappingArgs.Sizes = Constant::getNullValue(OMPBuilder.Int64Ptr); + OffloadingMappingArgs.MapTypes = + Constant::getNullValue(OMPBuilder.Int64Ptr); + OffloadingMappingArgs.MapNames = + Constant::getNullValue(OMPBuilder.VoidPtrPtr); + + return; + } + + auto EmitMappingEntry = [&](Value *Size, uint64_t MapType, Value *BasePtr, + Value *Ptr) { + OffloadMapTypes.push_back(ConstantInt::get(OMPBuilder.SizeTy, MapType)); + // TODO: maybe add debug info. 
+ uint32_t SrcLocStrSize; + OffloadMapNames.push_back(OMPBuilder.getOrCreateSrcLocStr( + BasePtr->getName(), "", 0, 0, SrcLocStrSize)); + DEBUG_ENABLE(dbgs() << "Emit mapping entry BasePtr " << *BasePtr << " Ptr " + << *Ptr << " Size " << *Size << " MapType " << MapType + << "\n"); + MapperInfos.push_back({BasePtr, Ptr, Size}); + }; + + auto GetMapType = [IsTargetRegion](DSAType DSA) { + uint64_t MapType; + // Determine the map type, completely or partly (structs). + switch (DSA) { + case DSA_FIRSTPRIVATE: + MapType = OMP_TGT_MAPTYPE_LITERAL; + if (IsTargetRegion) + MapType |= OMP_TGT_MAPTYPE_TARGET_PARAM; + break; + case DSA_MAP_ALLOC: + // Allocation is the default in the OpenMP runtime, no extra flags. + MapType = OMP_TGT_MAPTYPE_NONE; + if (IsTargetRegion) + MapType |= OMP_TGT_MAPTYPE_TARGET_PARAM; + break; + case DSA_MAP_TO: + MapType = OMP_TGT_MAPTYPE_TO; + if (IsTargetRegion) + MapType |= OMP_TGT_MAPTYPE_TARGET_PARAM; + break; + case DSA_MAP_FROM: + MapType = OMP_TGT_MAPTYPE_FROM; + if (IsTargetRegion) + MapType |= OMP_TGT_MAPTYPE_TARGET_PARAM; + break; + case DSA_MAP_TOFROM: + MapType = OMP_TGT_MAPTYPE_TO | OMP_TGT_MAPTYPE_FROM; + if (IsTargetRegion) + MapType |= OMP_TGT_MAPTYPE_TARGET_PARAM; + break; + case DSA_MAP_STRUCT: + MapType = OMP_TGT_MAPTYPE_NONE; + if (IsTargetRegion) + MapType |= OMP_TGT_MAPTYPE_TARGET_PARAM; + break; + case DSA_MAP_ALLOC_STRUCT: + // Allocation is the default in the OpenMP runtime, no extra flags. + MapType = OMP_TGT_MAPTYPE_NONE; + break; + case DSA_MAP_TO_STRUCT: + MapType = OMP_TGT_MAPTYPE_TO; + break; + case DSA_MAP_FROM_STRUCT: + MapType = OMP_TGT_MAPTYPE_FROM; + break; + case DSA_MAP_TOFROM_STRUCT: + MapType = OMP_TGT_MAPTYPE_TO | OMP_TGT_MAPTYPE_FROM; + break; + case DSA_PRIVATE: + // do nothing + break; + default: + FATAL_ERROR("Unknown mapping type"); + } + + return MapType; + }; + + // Keep track of argument position, needed for struct mappings. + for (auto &It : DSAValueMap) { + Value *V = It.first; + DSAType DSA = It.second.Type; + + // Emit the mapping entry. + Value *Size; + switch (DSA) { + case DSA_MAP_ALLOC: + case DSA_MAP_TO: + case DSA_MAP_FROM: + case DSA_MAP_TOFROM: + Size = ConstantInt::get(OMPBuilder.SizeTy, + M.getDataLayout().getTypeAllocSize(V->getType())); + EmitMappingEntry(Size, GetMapType(DSA), V, V); + break; + case DSA_FIRSTPRIVATE: { + auto *Load = OMPBuilder.Builder.CreateLoad( + V->getType()->getPointerElementType(), V); + // TODO: Runtime expects values in Int64 type, fix with arguments in + // struct. + AllocaInst *TmpInt64 = OMPBuilder.Builder.CreateAlloca( + OMPBuilder.Int64, nullptr, V->getName() + ".casted"); + Value *Cast = OMPBuilder.Builder.CreateBitCast( + TmpInt64, V->getType()); + auto *Store = OMPBuilder.Builder.CreateStore(Load, Cast); + Value *ScalarV= + OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, TmpInt64); + Size = ConstantInt::get(OMPBuilder.SizeTy, + M.getDataLayout().getTypeAllocSize( + V->getType()->getPointerElementType())); + EmitMappingEntry(Size, GetMapType(DSA), ScalarV, ScalarV); + break; + } + case DSA_MAP_STRUCT: { + Size = ConstantInt::get(OMPBuilder.SizeTy, + M.getDataLayout().getTypeAllocSize( + V->getType()->getPointerElementType())); + EmitMappingEntry(Size, GetMapType(DSA), V, V); + // Stores the argument position (starting from 1) of the parent + // struct, to be used to set MEMBER_OF in the map type. 
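+      // MEMBER_OF occupies the 16 most significant bits of the 64-bit map
+      // type (see OMP_TGT_MAPTYPE_MEMBER_OF); shifting ArgPos left by 48
+      // encodes it. For example, if the parent struct is mapping argument 3,
+      // every member carries 0x0003000000000000 on top of its own
+      // TO/FROM/PTR_AND_OBJ bits.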
+ size_t ArgPos = MapperInfos.size(); + + for (auto &FieldInfo : StructMappingInfoMap[V]) { + // MEMBER_OF(Argument Position) + const size_t MemberOfOffset = 48; + uint64_t MemberOfBits = ArgPos << MemberOfOffset; + uint64_t FieldMapType = GetMapType(FieldInfo.MapType) | MemberOfBits; + auto *FieldGEP = OMPBuilder.Builder.CreateInBoundsGEP( + V->getType()->getPointerElementType(), V, + {OMPBuilder.Builder.getInt32(0), FieldInfo.Index}); + + Value *BasePtr = nullptr; + Value *Ptr = nullptr; + + if (FieldGEP->getType()->getPointerElementType()->isPointerTy()) { + FieldMapType |= OMP_TGT_MAPTYPE_PTR_AND_OBJ; + BasePtr = FieldGEP; + auto *Load = OMPBuilder.Builder.CreateLoad( + BasePtr->getType()->getPointerElementType(), BasePtr); + Ptr = OMPBuilder.Builder.CreateInBoundsGEP( + Load->getType()->getPointerElementType(), Load, FieldInfo.Offset); + } else { + BasePtr = V; + Ptr = OMPBuilder.Builder.CreateInBoundsGEP( + FieldGEP->getType()->getPointerElementType(), FieldGEP, + FieldInfo.Offset); + } + + assert(BasePtr && "Expected non-null base pointer"); + assert(Ptr && "Expected non-null pointer"); + + auto ElementSize = ConstantInt::get( + OMPBuilder.SizeTy, M.getDataLayout().getTypeAllocSize( + Ptr->getType()->getPointerElementType())); + Value *NumElements = nullptr; + + // Load the value of NumElements if it is a pointer. + if (FieldInfo.NumElements->getType()->isPointerTy()) + NumElements = OMPBuilder.Builder.CreateLoad(OMPBuilder.SizeTy, + FieldInfo.NumElements); + else + NumElements = FieldInfo.NumElements; + + auto *Size = OMPBuilder.Builder.CreateMul(ElementSize, NumElements); + EmitMappingEntry(Size, FieldMapType, BasePtr, Ptr); + } + break; + } + case DSA_PRIVATE: { + // do nothing + break; + } + default: + FATAL_ERROR("Unknown mapping type"); + } + } + + auto EmitConstantArrayGlobalBitCast = [&](SmallVectorImpl &Vector, + Type *Ty, Type *DestTy, + StringRef Name) { + auto *Init = ConstantArray::get(ArrayType::get(Ty, Vector.size()), Vector); + auto *GV = new GlobalVariable(M, ArrayType::get(Ty, Vector.size()), + /* isConstant */ true, + GlobalVariable::PrivateLinkage, Init, Name); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + + return OMPBuilder.Builder.CreateBitCast(GV, DestTy); + }; + + // TODO: offload_sizes can be a global of constants for optimization if all + // sizes are constants. + // OffloadingMappingArgs.Sizes = + // EmitConstantArrayGlobalBitCast(OffloadSizes, OMPBuilder.SizeTy, + // OMPBuilder.Int64Ptr, ".offload_sizes"); + OffloadingMappingArgs.MapTypes = + EmitConstantArrayGlobalBitCast(OffloadMapTypes, OMPBuilder.SizeTy, + OMPBuilder.Int64Ptr, ".offload_maptypes"); + OffloadingMappingArgs.MapNames = EmitConstantArrayGlobalBitCast( + OffloadMapNames, OMPBuilder.Int8Ptr, OMPBuilder.VoidPtrPtr, + ".offload_mapnames"); + + auto EmitArrayAlloca = [&](size_t Size, Type *Ty, StringRef Name) { + InsertPointTy CodeGenIP = OMPBuilder.Builder.saveIP(); + + OMPBuilder.Builder.restoreIP(AllocaIP); + auto *Alloca = OMPBuilder.Builder.CreateAlloca(ArrayType::get(Ty, Size), + nullptr, Name); + + OMPBuilder.Builder.restoreIP(CodeGenIP); + + return Alloca; + }; + + auto *BasePtrsAlloca = EmitArrayAlloca(MapperInfos.size(), OMPBuilder.VoidPtr, + ".offload_baseptrs"); + auto *PtrsAlloca = + EmitArrayAlloca(MapperInfos.size(), OMPBuilder.VoidPtr, ".offload_ptrs"); + auto *SizesAlloca = + EmitArrayAlloca(MapperInfos.size(), OMPBuilder.SizeTy, ".offload_sizes"); + + size_t Idx = 0; + for (auto &MI : MapperInfos) { + // Store in the base pointers alloca. 
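+    // (The three allocas are parallel arrays: slot Idx in each one describes
+    // the same mapping argument, in lockstep with the constant
+    // .offload_maptypes and .offload_mapnames arrays emitted above.)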
+ auto *GEP = OMPBuilder.Builder.CreateInBoundsGEP( + BasePtrsAlloca->getType()->getPointerElementType(), BasePtrsAlloca, + {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(Idx)}); + auto *Bitcast = OMPBuilder.Builder.CreateBitCast( + GEP, MI.BasePtr->getType()->getPointerTo()); + OMPBuilder.Builder.CreateStore(MI.BasePtr, Bitcast); + + // Store in the pointers alloca. + GEP = OMPBuilder.Builder.CreateInBoundsGEP( + PtrsAlloca->getType()->getPointerElementType(), PtrsAlloca, + {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(Idx)}); + Bitcast = OMPBuilder.Builder.CreateBitCast( + GEP, MI.Ptr->getType()->getPointerTo()); + OMPBuilder.Builder.CreateStore(MI.Ptr, Bitcast); + + // Store in the sizes alloca. + GEP = OMPBuilder.Builder.CreateInBoundsGEP( + SizesAlloca->getType()->getPointerElementType(), SizesAlloca, + {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(Idx)}); + Bitcast = OMPBuilder.Builder.CreateBitCast( + GEP, MI.Size->getType()->getPointerTo()); + OMPBuilder.Builder.CreateStore(MI.Size, Bitcast); + + Idx++; + } + + OffloadingMappingArgs.Size = MapperInfos.size(); + OffloadingMappingArgs.BasePtrs = + OMPBuilder.Builder.CreateBitCast(BasePtrsAlloca, OMPBuilder.VoidPtrPtr); + OffloadingMappingArgs.Ptrs = + OMPBuilder.Builder.CreateBitCast(PtrsAlloca, OMPBuilder.VoidPtrPtr); + OffloadingMappingArgs.Sizes = OMPBuilder.Builder.CreateBitCast( + SizesAlloca, OMPBuilder.SizeTy->getPointerTo()); + + // OffloadingMappingArgs.BasePtrs = OMPBuilder.Builder.CreateInBoundsGEP( + // BasePtrsAlloca->getType()->getPointerElementType(), BasePtrsAlloca, + // {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(0)}); + // OffloadingMappingArgs.Ptrs = OMPBuilder.Builder.CreateInBoundsGEP( + // PtrsAlloca->getType()->getPointerElementType(), PtrsAlloca, + // {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(0)}); + // OffloadingMappingArgs.Sizes = OMPBuilder.Builder.CreateInBoundsGEP( + // SizesAlloca->getType()->getPointerElementType(), SizesAlloca, + // {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(0)}); +} + +void CGIntrinsicsOpenMP::emitOMPSingle(Function *Fn, BasicBlock *BBEntry, + BasicBlock *AfterBB, + BodyGenCallbackTy BodyGenCB, + FinalizeCallbackTy FiniCB) { + const DebugLoc DL = BBEntry->getTerminator()->getDebugLoc(); + BBEntry->getTerminator()->eraseFromParent(); + // Set the insertion location at the end of the BBEntry. + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(BBEntry, BBEntry->end()), DL); + + InsertPointTy AfterIP = + OMPBuilder.createSingle(Loc, BodyGenCB, FiniCB, /*DidIt*/ nullptr); + BranchInst::Create(AfterBB, AfterIP.getBlock()); + DEBUG_ENABLE(dbgs() << "=== Single Fn\n" << *Fn << "=== End of Single Fn\n"); +} + +void CGIntrinsicsOpenMP::emitOMPCritical(Function *Fn, BasicBlock *BBEntry, + BasicBlock *AfterBB, + BodyGenCallbackTy BodyGenCB, + FinalizeCallbackTy FiniCB) { + if (isOpenMPDeviceRuntime()) + FATAL_ERROR("Critical regions are not (yet) implemented on device"); + + const DebugLoc DL = BBEntry->getTerminator()->getDebugLoc(); + BBEntry->getTerminator()->eraseFromParent(); + // Set the insertion location at the end of the BBEntry. 
+ OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(BBEntry, BBEntry->end()), DL); + + InsertPointTy AfterIP = OMPBuilder.createCritical(Loc, BodyGenCB, FiniCB, "", + /*HintInst*/ nullptr); + BranchInst::Create(AfterBB, AfterIP.getBlock()); + DEBUG_ENABLE(dbgs() << "=== Critical Fn\n" + << *Fn << "=== End of Critical Fn\n"); +} + +void CGIntrinsicsOpenMP::emitOMPBarrier(Function *Fn, BasicBlock *BBEntry, + Directive DK) { + const DebugLoc DL = BBEntry->getTerminator()->getDebugLoc(); + // Set the insertion location at the end of the BBEntry. + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(BBEntry, BBEntry->getTerminator()->getIterator()), DL); + + // TODO: check ForceSimpleCall usage. + OMPBuilder.createBarrier(Loc, DK, + /*ForceSimpleCall*/ false, + /*CheckCancelFlag*/ true); + DEBUG_ENABLE(dbgs() << "=== Barrier Fn\n" << *Fn << "=== End of Barrier Fn\n"); +} + +void CGIntrinsicsOpenMP::emitOMPTaskwait(BasicBlock *BBEntry) { + const DebugLoc DL = BBEntry->getTerminator()->getDebugLoc(); + // Set the insertion location at the end of the BBEntry. + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(BBEntry, BBEntry->getTerminator()->getIterator()), DL); + + OMPBuilder.createTaskwait(Loc); +} + +GlobalVariable * +CGIntrinsicsOpenMP::emitOffloadingGlobals(StringRef DevWrapperFuncName, + ConstantDataArray *ELF) { + GlobalVariable *OMPRegionId = nullptr; + GlobalVariable *OMPOffloadEntries = nullptr; + + // TODO: assumes 1 target region, can we call tgt_register_lib + // multiple times? + OMPRegionId = new GlobalVariable( + M, OMPBuilder.Int8, /* isConstant */ true, GlobalValue::WeakAnyLinkage, + ConstantInt::get(OMPBuilder.Int8, 0), DevWrapperFuncName + ".region_id", + nullptr, GlobalVariable::NotThreadLocal, + /* AddressSpace */ 0); + + Constant *OMPOffloadEntry; + CGIntrinsicsOpenMP::emitOMPOffloadingEntry(DevWrapperFuncName, OMPRegionId, + OMPOffloadEntry); + + // TODO: do this at finalization when all entries have been + // found. + // TODO: assumes 1 device image, can we call tgt_register_lib + // multiple times? 
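+  // Each __tgt_offload_entry is { addr, name, size, flags, reserved }; for a
+  // target region size/flags/reserved stay 0 and addr is the region-id byte
+  // above. The array built below is what the registered binary descriptor
+  // hands to __tgt_register_lib to bind host entries to device symbols.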
+ auto *ArrayTy = ArrayType::get(TgtOffloadEntryTy, 1); + OMPOffloadEntries = + new GlobalVariable(M, ArrayTy, + /* isConstant */ true, GlobalValue::InternalLinkage, + ConstantArray::get(ArrayTy, {OMPOffloadEntry}), + ".omp_offloading.entries"); + + assert(OMPRegionId && "Expected non-null omp region id global"); + assert(OMPOffloadEntries && + "Expected non-null omp offloading entries constant"); + + auto EmitOffloadingBinaryGlobals = [&]() { + auto *GV = new GlobalVariable(M, ELF->getType(), /* isConstant */ true, + GlobalValue::InternalLinkage, ELF, + ".omp_offloading.device_image"); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + + StructType *TgtDeviceImageTy = StructType::create( + {OMPBuilder.Int8Ptr, OMPBuilder.Int8Ptr, + TgtOffloadEntryTy->getPointerTo(), TgtOffloadEntryTy->getPointerTo()}, + "struct.__tgt_device_image"); + + StructType *TgtBinDescTy = StructType::create( + {OMPBuilder.Int32, TgtDeviceImageTy->getPointerTo(), + TgtOffloadEntryTy->getPointerTo(), TgtOffloadEntryTy->getPointerTo()}, + "struct.__tgt_bin_desc"); + + auto *ArrayTy = ArrayType::get(TgtDeviceImageTy, 1); + auto *Zero = ConstantInt::get(OMPBuilder.SizeTy, 0); + auto *One = ConstantInt::get(OMPBuilder.SizeTy, 1); + auto *Size = ConstantInt::get(OMPBuilder.SizeTy, ELF->getNumElements()); + Constant *ZeroZero[] = {Zero, Zero}; + Constant *ZeroOne[] = {Zero, One}; + Constant *ZeroSize[] = {Zero, Size}; + + auto *ImageB = + ConstantExpr::getGetElementPtr(GV->getValueType(), GV, ZeroZero); + auto *ImageE = + ConstantExpr::getGetElementPtr(GV->getValueType(), GV, ZeroSize); + auto *EntriesB = ConstantExpr::getGetElementPtr( + OMPOffloadEntries->getValueType(), OMPOffloadEntries, ZeroZero); + auto *EntriesE = ConstantExpr::getGetElementPtr( + OMPOffloadEntries->getValueType(), OMPOffloadEntries, ZeroOne); + + auto *DeviceImageEntry = ConstantStruct::get(TgtDeviceImageTy, ImageB, + ImageE, EntriesB, EntriesE); + auto *DeviceImages = + new GlobalVariable(M, ArrayTy, + /* isConstant */ true, GlobalValue::InternalLinkage, + ConstantArray::get(ArrayTy, {DeviceImageEntry}), + ".omp_offloading.device_images"); + + auto *ImagesB = ConstantExpr::getGetElementPtr(DeviceImages->getValueType(), + DeviceImages, ZeroZero); + auto *DescInit = + ConstantStruct::get(TgtBinDescTy, + ConstantInt::get(OMPBuilder.Int32, + /* number of images */ 1), + ImagesB, EntriesB, EntriesE); + auto *BinDesc = + new GlobalVariable(M, DescInit->getType(), + /* isConstant */ true, GlobalValue::InternalLinkage, + DescInit, ".omp_offloading.descriptor"); + + // Add tgt_register_requires, tgt_register_lib, + // tgt_unregister_lib. + { + // tgt_register_requires. + auto *FuncTy = FunctionType::get(OMPBuilder.Void, /*isVarArg*/ false); + auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, + ".omp_offloading.requires_reg", &M); + Func->setSection(".text.startup"); + + // Get __tgt_register_lib function declaration. + auto *RegFuncTy = FunctionType::get(OMPBuilder.Void, OMPBuilder.Int64, + /*isVarArg*/ false); + FunctionCallee RegFuncC = + M.getOrInsertFunction("__tgt_register_requires", RegFuncTy); + + // Construct function body + IRBuilder<> Builder(BasicBlock::Create(M.getContext(), "entry", Func)); + // TODO: fix to pass the requirements enum value. + Builder.CreateCall(RegFuncC, ConstantInt::get(OMPBuilder.Int64, 1)); + Builder.CreateRetVoid(); + + // Add this function to constructors. 
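+      // (Note: this requires-registration ctor itself gets priority 0; the
+      // comment below describes the __tgt_register_lib ctor emitted next,
+      // which uses priority 1 so it runs after this one.)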
+ // Set priority to 1 so that __tgt_register_lib is executed + // AFTER + // __tgt_register_requires (we want to know what requirements + // have been asked for before we load a libomptarget plugin so + // that by the time the plugin is loaded it can report how + // many devices there are which can satisfy these + // requirements). + appendToGlobalCtors(M, Func, /*Priority*/ 0); + } + { + // ctor + auto *FuncTy = FunctionType::get(OMPBuilder.Void, /*isVarArg*/ false); + auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, + ".omp_offloading.descriptor_reg", &M); + Func->setSection(".text.startup"); + + // Get __tgt_register_lib function declaration. + auto *RegFuncTy = + FunctionType::get(OMPBuilder.Void, TgtBinDescTy->getPointerTo(), + /*isVarArg*/ false); + FunctionCallee RegFuncC = + M.getOrInsertFunction("__tgt_register_lib", RegFuncTy); + + // Construct function body + IRBuilder<> Builder(BasicBlock::Create(M.getContext(), "entry", Func)); + Builder.CreateCall(RegFuncC, BinDesc); + Builder.CreateRetVoid(); + + // Add this function to constructors. + // Set priority to 1 so that __tgt_register_lib is executed + // AFTER + // __tgt_register_requires (we want to know what requirements + // have been asked for before we load a libomptarget plugin so + // that by the time the plugin is loaded it can report how + // many devices there are which can satisfy these + // requirements). + appendToGlobalCtors(M, Func, /*Priority*/ 1); + } + { + auto *FuncTy = FunctionType::get(OMPBuilder.Void, /*isVarArg*/ false); + auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, + ".omp_offloading.descriptor_unreg", &M); + Func->setSection(".text.startup"); + + // Get __tgt_unregister_lib function declaration. + auto *UnRegFuncTy = + FunctionType::get(OMPBuilder.Void, TgtBinDescTy->getPointerTo(), + /*isVarArg*/ false); + FunctionCallee UnRegFuncC = + M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy); + + // Construct function body + IRBuilder<> Builder(BasicBlock::Create(M.getContext(), "entry", Func)); + Builder.CreateCall(UnRegFuncC, BinDesc); + Builder.CreateRetVoid(); + + // Add this function to global destructors. 
+ // Match priority of __tgt_register_lib + appendToGlobalDtors(M, Func, /*Priority*/ 1); + } + }; + + EmitOffloadingBinaryGlobals(); + + return OMPRegionId; +} + +void CGIntrinsicsOpenMP::emitOMPTarget(Function *Fn, BasicBlock *EntryBB, + BasicBlock *StartBB, BasicBlock *EndBB, + DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap, + TargetInfoStruct &TargetInfo, + OMPLoopInfoStruct *OMPLoopInfo, + bool IsDeviceTargetRegion) { + if (IsDeviceTargetRegion) + emitOMPTargetDevice(Fn, EntryBB, StartBB, EndBB, DSAValueMap, + StructMappingInfoMap, TargetInfo); + else + emitOMPTargetHost(Fn, EntryBB, StartBB, EndBB, DSAValueMap, + StructMappingInfoMap, TargetInfo, OMPLoopInfo); +} + +void CGIntrinsicsOpenMP::emitOMPTargetHost( + Function *Fn, BasicBlock *EntryBB, BasicBlock *StartBB, BasicBlock *EndBB, + DSAValueMapTy &DSAValueMap, StructMapTy &StructMappingInfoMap, + TargetInfoStruct &TargetInfo, OMPLoopInfoStruct *OMPLoopInfo) { + + Twine DevWrapperFuncName = getDevWrapperFuncPrefix() + TargetInfo.DevFuncName; + + GlobalVariable *OMPRegionId = + emitOffloadingGlobals(DevWrapperFuncName.str(), TargetInfo.ELF); + + const DebugLoc DL = EntryBB->getTerminator()->getDebugLoc(); + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(EntryBB, EntryBB->getTerminator()->getIterator()), DL); + + uint32_t SrcLocStrSize; + Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize); + + // TODO: should we use target_mapper without teams or the more general + // target_teams_mapper. Does the former buy us anything (less overhead?) + // FunctionCallee TargetMapper = + // OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_mapper); + // TODO: For nowait we need to enclose the host code in a task for async + // execution. + FunctionCallee TargetMapper = + (TargetInfo.NoWait ? OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___tgt_target_teams_nowait_mapper) + : OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___tgt_target_teams_mapper)); + OMPBuilder.Builder.SetInsertPoint(EntryBB->getTerminator()); + + // Emit mappings. + OffloadingMappingArgsTy OffloadingMappingArgs; + InsertPointTy AllocaIP(&Fn->getEntryBlock(), + Fn->getEntryBlock().getFirstInsertionPt()); + emitOMPOffloadingMappings(AllocaIP, DSAValueMap, StructMappingInfoMap, + OffloadingMappingArgs, /* isTargetRegion */ true); + + // Push the tripcount. 
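+  // When the target region wraps a distributed loop, record its trip count
+  // (computed here as the inclusive upper bound plus one) for the default
+  // device (-1) so the runtime can size teams and threads before launch.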
+ if (OMPLoopInfo) { + FunctionCallee TripcountMapper = OMPBuilder.getOrCreateRuntimeFunction( + M, + llvm::omp::RuntimeFunction::OMPRTL___kmpc_push_target_tripcount_mapper); + Value *Load = + OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, OMPLoopInfo->UB); + Value *Tripcount = OMPBuilder.Builder.CreateAdd( + Load, ConstantInt::get(OMPBuilder.Int64, 1)); + auto *CI = checkCreateCall( + OMPBuilder.Builder, TripcountMapper, + {Ident, ConstantInt::get(OMPBuilder.Int64, -1), Tripcount}); + assert(CI && "Expected valid call"); + } + + Value *NumTeams = createScalarCast(TargetInfo.NumTeams, OMPBuilder.Int32); + Value *ThreadLimit = + createScalarCast(TargetInfo.ThreadLimit, OMPBuilder.Int32); + + assert(NumTeams && "Expected non-null NumTeams"); + assert(ThreadLimit && "Expected non-null ThreadLimit"); + + SmallVector Args = { + Ident, ConstantInt::get(OMPBuilder.Int64, -1), + ConstantExpr::getBitCast(OMPRegionId, OMPBuilder.VoidPtr), + ConstantInt::get(OMPBuilder.Int32, OffloadingMappingArgs.Size), + OffloadingMappingArgs.BasePtrs, OffloadingMappingArgs.Ptrs, + OffloadingMappingArgs.Sizes, OffloadingMappingArgs.MapTypes, + OffloadingMappingArgs.MapNames, + // TODO: offload_mappers is null for now. + Constant::getNullValue(OMPBuilder.VoidPtrPtr), NumTeams, ThreadLimit}; + + if (TargetInfo.NoWait) { + // Add extra dependency information (unused for now). + Args.push_back(Constant::getNullValue(OMPBuilder.Int32)); + Args.push_back(Constant::getNullValue(OMPBuilder.Int8Ptr)); + Args.push_back(Constant::getNullValue(OMPBuilder.Int32)); + Args.push_back(Constant::getNullValue(OMPBuilder.Int8Ptr)); + } + + auto *OffloadResult = checkCreateCall(OMPBuilder.Builder, TargetMapper, Args); + assert(OffloadResult && "Expected non-null call inst from code generation"); + auto *Failed = OMPBuilder.Builder.CreateIsNotNull(OffloadResult); + OMPBuilder.Builder.CreateCondBr(Failed, StartBB, EndBB); + EntryBB->getTerminator()->eraseFromParent(); +} + +void CGIntrinsicsOpenMP::emitOMPTargetDevice(Function *Fn, BasicBlock *EntryBB, + BasicBlock *StartBB, + BasicBlock *EndBB, + DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap, + TargetInfoStruct &TargetInfo) { + // Emit the Numba wrapper offloading function. + SmallVector WrapperArgsTypes; + SmallVector WrapperArgsNames; + for (auto &It : DSAValueMap) { + Value *V = It.first; + DSAType DSA = It.second.Type; + + DEBUG_ENABLE(dbgs() << "V " << *V << " DSA " << DSA << "\n"); + switch (DSA) { + case DSA_FIRSTPRIVATE: + // TODO: Runtime expects firstprivate (scalars) typed as Int64. + WrapperArgsTypes.push_back(OMPBuilder.Int64); + WrapperArgsNames.push_back(V->getName()); + break; + case DSA_PRIVATE: + // do nothing + break; + default: + WrapperArgsTypes.push_back(V->getType()); + WrapperArgsNames.push_back(V->getName()); + } + } + + Twine DevWrapperFuncName = getDevWrapperFuncPrefix() + Fn->getName(); + FunctionType *NumbaWrapperFnTy = + FunctionType::get(OMPBuilder.Void, WrapperArgsTypes, + /* isVarArg */ false); + Function *NumbaWrapperFunc = Function::Create( + NumbaWrapperFnTy, GlobalValue::ExternalLinkage, DevWrapperFuncName, M); + + // Name the wrapper arguments for readability. + for (size_t I = 0; I < NumbaWrapperFunc->arg_size(); ++I) + NumbaWrapperFunc->getArg(I)->setName(WrapperArgsNames[I]); + + IRBuilder<> Builder( + BasicBlock::Create(M.getContext(), "entry", NumbaWrapperFunc)); + // Set up default arguments. Depends on the target architecture. 
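+  // Rough shape of the wrapper built below (illustrative, names made up):
+  // for a region capturing a mapped array `a` and a firstprivate scalar `n`,
+  //   void <DevWrapperFuncName>(double* a, i64 n) {
+  //     [target init;]
+  //     <Fn>(null /*ret*/, [null /*excinfo*/,] a, n reinterpreted to the
+  //          parameter type Fn expects);
+  //     [target deinit;]
+  //   }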
+ FunctionCallee DevFuncCallee(Fn); + // Set the callee device function with internal linkage to enable + // optimization. + Fn->setLinkage(GlobalValue::InternalLinkage); + SmallVector DevFuncArgs; + Triple TargetTriple(M.getTargetTriple()); + + // Adapt arguments to the Numba calling convention depending on target. First + // two arguments are Numba-generated pointers for return value and exceptions + // (if targeting the CPU), which are unused. Init to nullptr. + size_t ArgOffset; + DevFuncArgs.push_back(Constant::getNullValue(Fn->getArg(0)->getType())); + if (!isOpenMPDeviceRuntime()) { + DevFuncArgs.push_back(Constant::getNullValue(Fn->getArg(1)->getType())); + ArgOffset = 2; + } else { + ArgOffset = 1; + } + for (auto &Arg : NumbaWrapperFunc->args()) { + // TODO: Runtime expects all scalars typed as Int64. + if (!Arg.getType()->isPointerTy()) { + auto *ParamType = + DevFuncCallee.getFunctionType()->getParamType(ArgOffset + Arg.getArgNo()); + dbgs() << "ParamType " << *ParamType << "\n"; + AllocaInst *TmpInt64 = Builder.CreateAlloca(OMPBuilder.Int64, nullptr, + Arg.getName() + ".casted"); + Builder.CreateStore(&Arg, TmpInt64); + Value *Cast = Builder.CreateBitCast(TmpInt64, ParamType->getPointerTo()); + Value *ConvLoad = Builder.CreateLoad(ParamType, Cast); + DevFuncArgs.push_back(ConvLoad); + } else + DevFuncArgs.push_back(&Arg); + } + + bool IsSPMD = (TargetInfo.ExecMode == omp::OMP_TGT_EXEC_MODE_SPMD); + if (isOpenMPDeviceRuntime()) { + OpenMPIRBuilder::LocationDescription Loc(Builder); + auto IP = OMPBuilder.createTargetInit(Loc, /* IsSPMD */ IsSPMD, + /* RequiresFullRuntime */ false); + Builder.restoreIP(IP); + } + + auto *CI = checkCreateCall(Builder, DevFuncCallee, DevFuncArgs); + assert(CI && "Expected valid call"); + + if (isOpenMPDeviceRuntime()) { + OpenMPIRBuilder::LocationDescription Loc(Builder); + OMPBuilder.createTargetDeinit(Loc, /* IsSPMD */ IsSPMD, + /* RequiresFullRuntime */ false); + } + + Builder.CreateRetVoid(); + + if (isOpenMPDeviceRuntime()) { + assert(TargetInfo.ExecMode && "Expected non-zero ExecMode"); + // Emit OMP device globals and metadata. + // TODO: Make the exec_mode a parameter and use SPMD when possible. + auto *ExecModeGV = new GlobalVariable( + M, OMPBuilder.Int8, /* isConstant */ false, GlobalValue::WeakAnyLinkage, + Builder.getInt8(TargetInfo.ExecMode), + DevWrapperFuncName + "_exec_mode"); + appendToCompilerUsed(M, {ExecModeGV}); + + // Get "nvvm.annotations" metadata node. + // TODO: may need to adjust for AMD gpus. + NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations"); + + Metadata *MDVals[] = { + ConstantAsMetadata::get(NumbaWrapperFunc), + MDString::get(M.getContext(), "kernel"), + ConstantAsMetadata::get(ConstantInt::get(OMPBuilder.Int32, 1))}; + // Append metadata to nvvm.annotations. + MD->addOperand(MDNode::get(M.getContext(), MDVals)); + + // Add a function attribute for the kernel. + NumbaWrapperFunc->addFnAttr(Attribute::get(M.getContext(), "kernel")); + + } else { + // Generating an offloading entry is required by the x86_64 plugin. + Constant *OMPOffloadEntry; + emitOMPOffloadingEntry(DevWrapperFuncName, NumbaWrapperFunc, + OMPOffloadEntry); + } + // Add llvm.module.flags for "openmp", "openmp-device" to enable + // OpenMPOpt. 
+ M.addModuleFlag(llvm::Module::Max, "openmp", 50); + M.addModuleFlag(llvm::Module::Max, "openmp-device", 50); +} + +void CGIntrinsicsOpenMP::emitOMPTeamsDeviceRuntime( + DSAValueMapTy &DSAValueMap, ValueToValueMapTy *VMap, const DebugLoc &DL, + Function *Fn, BasicBlock *BBEntry, BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB, TeamsInfoStruct &TeamsInfo) { + SmallVector CapturedVars; + Function *OutlinedFn = + createOutlinedFunction(DSAValueMap, VMap, Fn, StartBB, EndBB, + CapturedVars, ".omp_outlined_teams"); + + // Set up the call to the teams outlined function. + BBEntry->getTerminator()->eraseFromParent(); + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(BBEntry, BBEntry->end()), DL); + + uint32_t SrcLocStrSize; + Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize); + OMPBuilder.Builder.restoreIP(Loc.IP); + OMPBuilder.Builder.SetCurrentDebugLocation(Loc.DL); + Value *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize); + Value *ThreadID = OMPBuilder.getOrCreateThreadID(Ident); + + assert(Ident && "Expected non-null Ident"); + assert(ThreadID && "Expected non-null ThreadID"); + + // Create global_tid, bound_tid (zero) to pass to the teams outlined function. + AllocaInst *ThreadIDAddr = OMPBuilder.Builder.CreateAlloca( + OMPBuilder.Int32, nullptr, ".threadid.addr"); + AllocaInst *ZeroAddr = + OMPBuilder.Builder.CreateAlloca(OMPBuilder.Int32, nullptr, ".zero.addr"); + OMPBuilder.Builder.CreateStore(ThreadID, ThreadIDAddr); + OMPBuilder.Builder.CreateStore(Constant::getNullValue(OMPBuilder.Int32), + ZeroAddr); + + FunctionCallee TeamsOutlinedFn(OutlinedFn); + SmallVector Args; + Args.append({ThreadIDAddr, ZeroAddr}); + + for (size_t Idx = 0; Idx < CapturedVars.size(); ++Idx) { + // Pass firstprivate scalar by value. + if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE && + CapturedVars[Idx] + ->getType() + ->getPointerElementType() + ->isSingleValueType()) { + Type *VPtrElemTy = CapturedVars[Idx]->getType()->getPointerElementType(); + Value *Load = + OMPBuilder.Builder.CreateLoad(VPtrElemTy, CapturedVars[Idx]); + // TODO: Runtime expects values in Int64 type, fix with arguments in + // struct. 
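+      // Widen the scalar through an i64-sized stack slot: store it through a
+      // pointer bitcast to its original type, then reload the slot as i64 so
+      // the by-value argument matches the runtime's word-sized convention.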
+ AllocaInst *TmpInt64 = OMPBuilder.Builder.CreateAlloca( + OMPBuilder.Int64, nullptr, + CapturedVars[Idx]->getName() + "fpriv.byval"); + Value *Cast = OMPBuilder.Builder.CreateBitCast( + TmpInt64, CapturedVars[Idx]->getType()); + OMPBuilder.Builder.CreateStore(Load, Cast); + Value *ConvLoad = + OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, TmpInt64); + Args.push_back(ConvLoad); + + continue; + } + Args.push_back(CapturedVars[Idx]); + } + + auto *CI = checkCreateCall(OMPBuilder.Builder, TeamsOutlinedFn, Args); + assert(CI && "Expected valid call"); + + OMPBuilder.Builder.CreateBr(AfterBB); + + DEBUG_ENABLE(dbgs() << "=== Dump OuterFn\n" + << *Fn << "=== End of Dump OuterFn\n"); + + if (verifyFunction(*Fn, &errs())) + FATAL_ERROR("Verification of OuterFn failed!"); +} + +void CGIntrinsicsOpenMP::emitOMPTeams(DSAValueMapTy &DSAValueMap, + ValueToValueMapTy *VMap, + const DebugLoc &DL, Function *Fn, + BasicBlock *BBEntry, BasicBlock *StartBB, + BasicBlock *EndBB, BasicBlock *AfterBB, + TeamsInfoStruct &TeamsInfo) { + if (isOpenMPDeviceRuntime()) + emitOMPTeamsDeviceRuntime(DSAValueMap, VMap, DL, Fn, BBEntry, StartBB, + EndBB, AfterBB, TeamsInfo); + else + emitOMPTeamsHostRuntime(DSAValueMap, VMap, DL, Fn, BBEntry, StartBB, EndBB, + AfterBB, TeamsInfo); +} + +void CGIntrinsicsOpenMP::emitOMPTeamsHostRuntime( + DSAValueMapTy &DSAValueMap, ValueToValueMapTy *VMap, const DebugLoc &DL, + Function *Fn, BasicBlock *BBEntry, BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB, TeamsInfoStruct &TeamsInfo) { + SmallVector CapturedVars; + Function *OutlinedFn = createOutlinedFunction( + DSAValueMap, /*ValueToValueMapTy */ VMap, Fn, StartBB, EndBB, + CapturedVars, ".omp_outlined_teams"); + + // Set up the call to the teams outlined function. + BBEntry->getTerminator()->eraseFromParent(); + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(BBEntry, BBEntry->end()), DL); + + uint32_t SrcLocStrSize; + Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize); + OMPBuilder.Builder.restoreIP(Loc.IP); + OMPBuilder.Builder.SetCurrentDebugLocation(Loc.DL); + Value *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize); + Value *ThreadID = OMPBuilder.getOrCreateThreadID(Ident); + + assert(Ident && "Expected non-null Ident"); + // Emit call to set the number of teams and thread limit. + if (TeamsInfo.NumTeams || TeamsInfo.ThreadLimit) { + Value *NumTeams = + (TeamsInfo.NumTeams + ? createScalarCast(TeamsInfo.NumTeams, OMPBuilder.Int32) + : Constant::getNullValue(OMPBuilder.Int32)); + Value *ThreadLimit = + (TeamsInfo.ThreadLimit + ? createScalarCast(TeamsInfo.ThreadLimit, OMPBuilder.Int32) + : Constant::getNullValue(OMPBuilder.Int32)); + FunctionCallee KmpcPushNumTeams = + OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_push_num_teams); + auto *CI = checkCreateCall(OMPBuilder.Builder, KmpcPushNumTeams, + {Ident, ThreadID, NumTeams, ThreadLimit}); + assert(CI && "Expected valid call"); + } + + FunctionCallee ForkTeams = + OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_fork_teams); + + SmallVector Args; + Value *NumCapturedVars = OMPBuilder.Builder.getInt32(CapturedVars.size()); + Args.append({Ident, NumCapturedVars, + OMPBuilder.Builder.CreateBitCast(OutlinedFn, + OMPBuilder.ParallelTaskPtr)}); + + for (size_t Idx = 0; Idx < CapturedVars.size(); ++Idx) { + // Pass firstprivate scalar by value. 
+ if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE && + CapturedVars[Idx] + ->getType() + ->getPointerElementType() + ->isSingleValueType()) { + Type *VPtrElemTy = CapturedVars[Idx]->getType()->getPointerElementType(); + Value *Load = + OMPBuilder.Builder.CreateLoad(VPtrElemTy, CapturedVars[Idx]); + // TODO: Runtime expects values in Int64 type, fix with arguments in + // struct. + AllocaInst *TmpInt64 = OMPBuilder.Builder.CreateAlloca( + OMPBuilder.Int64, nullptr, + CapturedVars[Idx]->getName() + ".fpriv.byval"); + Value *Cast = OMPBuilder.Builder.CreateBitCast( + TmpInt64, CapturedVars[Idx]->getType()); + OMPBuilder.Builder.CreateStore(Load, Cast); + Value *ConvLoad = + OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, TmpInt64); + Args.push_back(ConvLoad); + + continue; + } + Args.push_back(CapturedVars[Idx]); + } + + auto *CI = checkCreateCall(OMPBuilder.Builder, ForkTeams, Args); + assert(CI && "Expected valid call"); + + OMPBuilder.Builder.CreateBr(AfterBB); + + DEBUG_ENABLE(dbgs() << "=== Dump OuterFn\n" + << *Fn << "=== End of Dump OuterFn\n"); + + if (verifyFunction(*Fn, &errs())) + FATAL_ERROR("Verification of OuterFn failed!"); +} + +void CGIntrinsicsOpenMP::emitOMPTargetEnterData( + Function *Fn, BasicBlock *BBEntry, DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap) { + + const DebugLoc DL = BBEntry->getTerminator()->getDebugLoc(); + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(BBEntry, BBEntry->getTerminator()->getIterator()), DL); + + uint32_t SrcLocStrSize; + Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *SrcLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize); + + FunctionCallee TargetDataBeginMapper = OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___tgt_target_data_begin_mapper); + OMPBuilder.Builder.SetInsertPoint(BBEntry->getTerminator()); + + // Emit mappings. + OffloadingMappingArgsTy OffloadingMappingArgs; + InsertPointTy AllocaIP(&Fn->getEntryBlock(), + Fn->getEntryBlock().getFirstInsertionPt()); + emitOMPOffloadingMappings(AllocaIP, DSAValueMap, StructMappingInfoMap, + OffloadingMappingArgs, /* IsTargetRegion */ false); + + OMPBuilder.Builder.CreateCall( + TargetDataBeginMapper, + {SrcLoc, ConstantInt::get(OMPBuilder.Int64, -1), + ConstantInt::get(OMPBuilder.Int32, OffloadingMappingArgs.Size), + OffloadingMappingArgs.BasePtrs, OffloadingMappingArgs.Ptrs, + OffloadingMappingArgs.Sizes, OffloadingMappingArgs.MapTypes, + OffloadingMappingArgs.MapNames, + // TODO: offload_mappers is null for now. + Constant::getNullValue(OMPBuilder.VoidPtrPtr)}); +} + +void CGIntrinsicsOpenMP::emitOMPTargetExitData( + Function *Fn, BasicBlock *BBEntry, DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap) { + + const DebugLoc DL = BBEntry->getTerminator()->getDebugLoc(); + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(BBEntry, BBEntry->getTerminator()->getIterator()), DL); + + uint32_t SrcLocStrSize; + Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *SrcLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize); + + FunctionCallee TargetDataEndMapper = OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___tgt_target_data_end_mapper); + OMPBuilder.Builder.SetInsertPoint(BBEntry->getTerminator()); + + // Emit mappings. 
+ OffloadingMappingArgsTy OffloadingMappingArgs; + InsertPointTy AllocaIP(&Fn->getEntryBlock(), + Fn->getEntryBlock().getFirstInsertionPt()); + emitOMPOffloadingMappings(AllocaIP, DSAValueMap, StructMappingInfoMap, + OffloadingMappingArgs, /* IsTargetRegion */ false); + + OMPBuilder.Builder.CreateCall( + TargetDataEndMapper, + {SrcLoc, ConstantInt::get(OMPBuilder.Int64, -1), + ConstantInt::get(OMPBuilder.Int32, OffloadingMappingArgs.Size), + OffloadingMappingArgs.BasePtrs, OffloadingMappingArgs.Ptrs, + OffloadingMappingArgs.Sizes, OffloadingMappingArgs.MapTypes, + OffloadingMappingArgs.MapNames, + // TODO: offload_mappers is null for now. + Constant::getNullValue(OMPBuilder.VoidPtrPtr)}); +} + +void CGIntrinsicsOpenMP::emitOMPTargetData(Function *Fn, BasicBlock *BBEntry, + BasicBlock *BBExit, + DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap) { + // Re-use codegen from TARGET ENTER/EXIT DATA. + emitOMPTargetEnterData(Fn, BBEntry, DSAValueMap, StructMappingInfoMap); + emitOMPTargetExitData(Fn, BBExit, DSAValueMap, StructMappingInfoMap); +} + +void CGIntrinsicsOpenMP::emitOMPTargetUpdate( + Function *Fn, BasicBlock *BBEntry, DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap) { + const DebugLoc DL = BBEntry->getTerminator()->getDebugLoc(); + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(BBEntry, BBEntry->getTerminator()->getIterator()), DL); + + uint32_t SrcLocStrSize; + Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *SrcLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize); + + FunctionCallee TargetDataUpdateMapper = OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___tgt_target_data_update_mapper); + OMPBuilder.Builder.SetInsertPoint(BBEntry->getTerminator()); + + // Emit mappings. + OffloadingMappingArgsTy OffloadingMappingArgs; + InsertPointTy AllocaIP(&Fn->getEntryBlock(), + Fn->getEntryBlock().getFirstInsertionPt()); + emitOMPOffloadingMappings(AllocaIP, DSAValueMap, StructMappingInfoMap, + OffloadingMappingArgs, /* IsTargetRegion */ false); + + OMPBuilder.Builder.CreateCall( + TargetDataUpdateMapper, + {SrcLoc, ConstantInt::get(OMPBuilder.Int64, -1), + ConstantInt::get(OMPBuilder.Int32, OffloadingMappingArgs.Size), + OffloadingMappingArgs.BasePtrs, OffloadingMappingArgs.Ptrs, + OffloadingMappingArgs.Sizes, OffloadingMappingArgs.MapTypes, + OffloadingMappingArgs.MapNames, + // TODO: offload_mappers is null for now. 
+ Constant::getNullValue(OMPBuilder.VoidPtrPtr)}); +} + +void CGIntrinsicsOpenMP::emitOMPDistribute( + DSAValueMapTy &DSAValueMap, OMPLoopInfoStruct &OMPLoopInfo, + BasicBlock *StartBB, BasicBlock *ExitBB, bool IsStandalone, + bool IsDistributeParallelFor, OMPDistributeInfoStruct *DistributeInfo) { + if (static_cast(OMPLoopInfo.Sched) == 0) + OMPLoopInfo.Sched = OMPScheduleType::Distribute; + + emitLoop(DSAValueMap, OMPLoopInfo, StartBB, ExitBB, IsStandalone, true, + IsDistributeParallelFor, DistributeInfo); +} + +void CGIntrinsicsOpenMP::emitOMPDistributeParallelFor( + DSAValueMapTy &DSAValueMap, BasicBlock *StartBB, BasicBlock *ExitBB, + OMPLoopInfoStruct &OMPLoopInfo, ParRegionInfoStruct &ParRegionInfo, + bool IsStandalone) { + + Function *Fn = StartBB->getParent(); + const DebugLoc DL = StartBB->getTerminator()->getDebugLoc(); + + BasicBlock *DistPreheader = + StartBB->splitBasicBlock(StartBB->begin(), "omp.distribute.preheader"); + BasicBlock *DistHeader = DistPreheader->splitBasicBlock( + DistPreheader->begin(), "omp.distribute.header"); + BasicBlock *ForEntry = + DistHeader->splitBasicBlock(DistHeader->begin(), "omp.inner.for.entry"); + BasicBlock *ForBegin = + ForEntry->splitBasicBlock(ForEntry->begin(), "omp.inner.for.begin"); + BasicBlock *ForEnd = splitBlockBefore( + ExitBB, &*ExitBB->getFirstInsertionPt(), /*DomTreeUpdater*/ nullptr, + /*LoopInfo*/ nullptr, /*MemorySSAUpdater*/ nullptr); + ForEnd->setName("omp.inner.for.end"); + BasicBlock *ForExit = SplitBlock(ForEnd, ForEnd->getTerminator()); + ForExit->setName("omp.inner.for.exit"); + BasicBlock *ForExitAfter = SplitBlock(ForExit, ForExit->getTerminator()); + ForExitAfter->setName("omp.inner.for.exit.after"); + BasicBlock *DistInc = ForExitAfter->splitBasicBlock( + ForExitAfter->getTerminator(), "omp.distribute.inc"); + BasicBlock *DistExit = + DistInc->splitBasicBlock(DistInc->getTerminator(), "omp.distribute.exit"); + + // Create skeleton DistHeader + { + // Dummy condition to create the expected structure. + DistHeader->getTerminator()->eraseFromParent(); + OMPBuilder.Builder.SetInsertPoint(DistHeader); + auto *Cond = + OMPBuilder.Builder.CreateICmpSLE(OMPLoopInfo.IV, OMPLoopInfo.UB); + OMPBuilder.Builder.CreateCondBr(Cond, ForEntry, DistExit); + } + // Create skeleton DistInc + { + DistInc->getTerminator()->eraseFromParent(); + OMPBuilder.Builder.SetInsertPoint(DistInc); + OMPBuilder.Builder.CreateBr(DistHeader); + } + + OMPLoopInfo.Sched = (isOpenMPDeviceRuntime() ? OMPScheduleType::StaticChunked + : OMPScheduleType::Static); + emitOMPFor(DSAValueMap, OMPLoopInfo, ForBegin, ForEnd, IsStandalone, true); + BasicBlock *ParEntryBB = ForEntry; + DEBUG_ENABLE(dbgs() << "ParEntryBB " << ParEntryBB->getName() << "\n"); + BasicBlock *ParStartBB = ForBegin; + DEBUG_ENABLE(dbgs() << "ParStartBB " << ParStartBB->getName() << "\n"); + BasicBlock *ParEndBB = ForExit; + DEBUG_ENABLE(dbgs() << "ParEndBB " << ParEndBB->getName() << "\n"); + BasicBlock *ParAfterBB = ForExitAfter; + DEBUG_ENABLE(dbgs() << "ParAfterBB " << ParAfterBB->getName() << "\n"); + + emitOMPParallel( + DSAValueMap, nullptr, DL, Fn, ParEntryBB, ParStartBB, ParEndBB, + ParAfterBB, [](auto) {}, ParRegionInfo); + + // By default, to maximize performance on GPUs, we do static chunked with a + // chunk size equal to the block size when targeting the device runtime. 
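+  // In effect this is dist_schedule(static, <threads-per-block>) on the
+  // distribute loop: each team claims one block-sized chunk per round, so
+  // the threads of a team end up on consecutive iterations of the inner
+  // parallel for. The chunk value is queried from the device runtime below.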
+ if (isOpenMPDeviceRuntime()) { + OMPLoopInfo.Sched = OMPScheduleType::DistributeChunked; + // Extend DistPreheader + { + OMPBuilder.Builder.SetInsertPoint(DistPreheader, + DistPreheader->getFirstInsertionPt()); + + FunctionCallee NumTeamThreadsFn = OMPBuilder.getOrCreateRuntimeFunction( + M, llvm::omp::RuntimeFunction:: + OMPRTL___kmpc_get_hardware_num_threads_in_block); + Value *NumTeamThreads = + OMPBuilder.Builder.CreateCall(NumTeamThreadsFn, {}); + OMPLoopInfo.Chunk = NumTeamThreads; + } + } else { + OMPLoopInfo.Sched = OMPScheduleType::Distribute; + } + + OMPDistributeInfoStruct DistributeInfo; + emitOMPDistribute(DSAValueMap, OMPLoopInfo, DistPreheader, DistExit, + IsStandalone, true, &DistributeInfo); + + // Replace upper bound, lower bound to the "parallel for" with distribute + // bounds. + { + assert(DistributeInfo.LB && "Expected non-null distribute lower bound"); + assert(DistributeInfo.UB && "Expected non-null distribute upper bound"); + auto ShouldReplace = [&](Use &U) { + if (auto *UserI = dyn_cast(U.getUser())) + if (UserI->getParent() == ForEntry) + return true; + + return false; + }; + + // Replace the inner, parallel for loop LB, UB. + OMPLoopInfo.LB->replaceUsesWithIf(DistributeInfo.LB, ShouldReplace); + OMPLoopInfo.UB->replaceUsesWithIf(DistributeInfo.UB, ShouldReplace); + } +} + +void CGIntrinsicsOpenMP::emitOMPTargetTeamsDistributeParallelFor( + DSAValueMapTy &DSAValueMap, const DebugLoc &DL, Function *Fn, + BasicBlock *EntryBB, BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *ExitBB, BasicBlock *AfterBB, OMPLoopInfoStruct &OMPLoopInfo, + ParRegionInfoStruct &ParRegionInfo, TargetInfoStruct &TargetInfo, + StructMapTy &StructMappingInfoMap, bool IsDeviceTargetRegion) { + + emitOMPDistributeParallelFor(DSAValueMap, StartBB, ExitBB, OMPLoopInfo, + ParRegionInfo, + /* isStandalone */ false); + + emitOMPTargetTeams(DSAValueMap, nullptr, DL, Fn, EntryBB, + StartBB, EndBB, AfterBB, + TargetInfo, &OMPLoopInfo, StructMappingInfoMap, + IsDeviceTargetRegion); + + // Alternative codegen, starting from top-down and renaming values using the + // ValueToValueMap. +#if 0 + ValueToValueMapTy VMap; + // Lower target_teams. 
+ emitOMPTargetTeams(DSAValueMap, &VMap, DL, Fn, EntryBB, StartBB, EndBB, AfterBB, + TargetInfo, &OMPLoopInfo, StructMappingInfoMap, + IsDeviceTargetRegion); + + dbgs() << "=== VMap\n"; + for(auto VV : VMap) { + dbgs() << "V " << *VV.first << " -> " << *VV.second << "\n"; + } + dbgs() << "=== End of VMap\n"; + getchar(); + + // Update DSAValueMap + SmallVector ToDelete; + for(auto &It : DSAValueMap) { + Value *V = It.first; + if(!VMap.count(V)) + continue; + + DSAValueMap[VMap[V]] = It.second; + dbgs() << "Update DSAValueMap " << *VMap[V] << " ~> " << It.second.Type << "\n"; + ToDelete.push_back(V); + } + for(auto *V : ToDelete) { + dbgs() << "Update DSAValueMAp delete " << *V << "\n"; + DSAValueMap.erase(V); + } + + // Update OMPLoopInfo + OMPLoopInfo.IV = VMap[OMPLoopInfo.IV]; + OMPLoopInfo.Start = VMap[OMPLoopInfo.Start]; + OMPLoopInfo.LB = VMap[OMPLoopInfo.LB]; + OMPLoopInfo.UB = VMap[OMPLoopInfo.UB]; + + emitOMPDistributeParallelFor(DSAValueMap, StartBB, ExitBB, OMPLoopInfo, + ParRegionInfo, + /* isStandalone */ false); +#endif +} + +void CGIntrinsicsOpenMP::emitOMPTargetTeams( + DSAValueMapTy &DSAValueMap, ValueToValueMapTy *VMap, const DebugLoc &DL, + Function *Fn, BasicBlock *EntryBB, BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB, TargetInfoStruct &TargetInfo, + OMPLoopInfoStruct *OMPLoopInfo, StructMapTy &StructMappingInfoMap, + bool IsDeviceTargetRegion) { + + BasicBlock *TeamsEntryBB = SplitBlock(EntryBB, EntryBB->getTerminator()); + TeamsEntryBB->setName("omp.teams.entry"); + BasicBlock *TeamsStartBB = + splitBlockBefore(StartBB, &*StartBB->getFirstInsertionPt(), nullptr, + nullptr, nullptr, "omp.teams.start"); + BasicBlock *TeamsEndBB = + splitBlockBefore(EndBB, &*EndBB->getFirstInsertionPt(), nullptr, nullptr, + nullptr, "omp.teams.end"); + // TargetInfo contains teams info. + TeamsInfoStruct TeamsInfo; + TeamsInfo.NumTeams = TargetInfo.NumTeams; + TeamsInfo.ThreadLimit = TargetInfo.ThreadLimit; + emitOMPTeams(DSAValueMap, VMap, DL, Fn, TeamsEntryBB, TeamsStartBB, + TeamsEndBB, EndBB, TeamsInfo); + + emitOMPTarget(Fn, EntryBB, TeamsEntryBB, EndBB, DSAValueMap, + StructMappingInfoMap, TargetInfo, OMPLoopInfo, + IsDeviceTargetRegion); +} + +bool CGIntrinsicsOpenMP::isOpenMPDeviceRuntime() { + Triple TargetTriple(M.getTargetTriple()); + + if (TargetTriple.isNVPTX()) + return true; + + return false; +} + +template <> +Value *CGReduction::emitOperation(IRBuilderBase &IRB, + Value *LHS, Value *RHS) { + Type *VTy = RHS->getType(); + if (VTy->isIntegerTy()) + return IRB.CreateAdd(LHS, RHS, "red.add"); + else if (VTy->isFloatTy() || VTy->isDoubleTy()) + return IRB.CreateFAdd(LHS, RHS, "red.add"); + else + FATAL_ERROR("Unsupported type for reduction operation"); +} + +// OpenMP 5.1, 2.21.5, sub is the same as add. 
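+// (Partial results of a reduction(-:x) are still combined with +: each
+// thread starts from the identity 0, so summing the per-thread partials
+// yields the correct overall result.)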
+template <> +Value *CGReduction::emitOperation(IRBuilderBase &IRB, + Value *LHS, Value *RHS) { + return emitOperation(IRB, LHS, RHS); +} + +template <> +Value *CGReduction::emitOperation(IRBuilderBase &IRB, + Value *LHS, Value *RHS) { + Type *VTy = RHS->getType(); + if (VTy->isIntegerTy()) + return IRB.CreateMul(LHS, RHS, "red.mul"); + else if (VTy->isFloatTy() || VTy->isDoubleTy()) + return IRB.CreateFMul(LHS, RHS, "red.mul"); + else + FATAL_ERROR("Unsupported type for reduction operation"); +} + +template <> +InsertPointTy CGReduction::emitAtomicOperationRMW( + IRBuilderBase &IRB, Value *LHS, Value *Partial) { + IRB.CreateAtomicRMW(AtomicRMWInst::Add, LHS, Partial, None, + AtomicOrdering::Monotonic); + return IRB.saveIP(); +} + +// OpenMP 5.1, 2.21.5, sub is the same as add. +template <> +InsertPointTy CGReduction::emitAtomicOperationRMW( + IRBuilderBase &IRB, Value *LHS, Value *Partial) { + return emitAtomicOperationRMW(IRB, LHS, Partial); +} diff --git a/numba/openmp/pass/CGIntrinsicsOpenMP.h b/numba/openmp/pass/CGIntrinsicsOpenMP.h new file mode 100644 index 000000000000..f9b5d29291df --- /dev/null +++ b/numba/openmp/pass/CGIntrinsicsOpenMP.h @@ -0,0 +1,527 @@ +#ifndef LLVM_TRANSFORMS_INTRINSICS_OPENMP_CODEGEN_H +#define LLVM_TRANSFORMS_INTRINSICS_OPENMP_CODEGEN_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/Frontend/OpenMP/OMP.h.inc" +#include "llvm/Frontend/OpenMP/OMPConstants.h" +#include "llvm/Frontend/OpenMP/OMPIRBuilder.h" +#include "llvm/IR/Value.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include +#include +#include +#include +#include +#include + +#include "DebugOpenMP.h" + +using namespace llvm; +using namespace omp; + +using InsertPointTy = OpenMPIRBuilder::InsertPointTy; +using BodyGenCallbackTy = OpenMPIRBuilder::BodyGenCallbackTy; +using FinalizeCallbackTy = OpenMPIRBuilder::FinalizeCallbackTy; + +namespace iomp { +// TODO: expose clauses through namespace omp? 
+enum DSAType { + DSA_NONE, + DSA_PRIVATE, + DSA_FIRSTPRIVATE, + DSA_LASTPRIVATE, + DSA_SHARED, + DSA_REDUCTION_ADD, + DSA_REDUCTION_SUB, + DSA_REDUCTION_MUL, + DSA_MAP_ALLOC, + DSA_MAP_TO, + DSA_MAP_FROM, + DSA_MAP_TOFROM, + DSA_MAP_ALLOC_STRUCT, + DSA_MAP_TO_STRUCT, + DSA_MAP_FROM_STRUCT, + DSA_MAP_TOFROM_STRUCT, + DSA_MAP_STRUCT +}; + +struct DSATypeInfo { + DSAType Type; + FunctionCallee CopyConstructor; + + DSATypeInfo() : Type(DSA_NONE), CopyConstructor(nullptr) {} + DSATypeInfo(DSAType InType) : Type(InType), CopyConstructor(nullptr) {} + DSATypeInfo(DSAType InType, FunctionCallee InCopyConstructor) + : Type(InType), CopyConstructor(InCopyConstructor) {} + DSATypeInfo(const DSATypeInfo &DTI) { + Type = DTI.Type; + CopyConstructor = DTI.CopyConstructor; + } + DSATypeInfo &operator=(const DSATypeInfo &DTI) = default; +}; + +using DSAValueMapTy = MapVector; + +// using DSAValueMapTy = MapVector; + +static const DenseMap StringToDir = { + {"DIR.OMP.PARALLEL", OMPD_parallel}, + {"DIR.OMP.SINGLE", OMPD_single}, + {"DIR.OMP.CRITICAL", OMPD_critical}, + {"DIR.OMP.BARRIER", OMPD_barrier}, + {"DIR.OMP.LOOP", OMPD_for}, + {"DIR.OMP.PARALLEL.LOOP", OMPD_parallel_for}, + {"DIR.OMP.TASK", OMPD_task}, + {"DIR.OMP.TASKWAIT", OMPD_taskwait}, + {"DIR.OMP.TARGET", OMPD_target}, + {"DIR.OMP.TEAMS", OMPD_teams}, + {"DIR.OMP.DISTRIBUTE", OMPD_distribute}, + {"DIR.OMP.TEAMS.DISTRIBUTE", OMPD_teams_distribute}, + {"DIR.OMP.TEAMS.DISTRIBUTE.PARALLEL.LOOP", + OMPD_teams_distribute_parallel_for}, + {"DIR.OMP.TARGET.TEAMS", OMPD_target_teams}, + {"DIR.OMP.TARGET.DATA", OMPD_target_data}, + {"DIR.OMP.TARGET.ENTER.DATA", OMPD_target_enter_data}, + {"DIR.OMP.TARGET.EXIT.DATA", OMPD_target_exit_data}, + {"DIR.OMP.TARGET.UPDATE", OMPD_target_update}, + {"DIR.OMP.TARGET.TEAMS.DISTRIBUTE", OMPD_target_teams_distribute}, + {"DIR.OMP.DISTRIBUTE.PARALLEL.LOOP", OMPD_distribute_parallel_for}, + {"DIR.OMP.TARGET.TEAMS.DISTRIBUTE.PARALLEL.LOOP", + OMPD_target_teams_distribute_parallel_for}}; + +// TODO: add more reduction operators. +static const DenseMap StringToDSA = { + {"QUAL.OMP.PRIVATE", DSA_PRIVATE}, + {"QUAL.OMP.FIRSTPRIVATE", DSA_FIRSTPRIVATE}, + {"QUAL.OMP.LASTPRIVATE", DSA_LASTPRIVATE}, + {"QUAL.OMP.SHARED", DSA_SHARED}, + {"QUAL.OMP.REDUCTION.ADD", DSA_REDUCTION_ADD}, + {"QUAL.OMP.REDUCTION.SUB", DSA_REDUCTION_SUB}, + {"QUAL.OMP.REDUCTION.MUL", DSA_REDUCTION_MUL}, + {"QUAL.OMP.MAP.ALLOC", DSA_MAP_ALLOC}, + {"QUAL.OMP.MAP.TO", DSA_MAP_TO}, + {"QUAL.OMP.MAP.FROM", DSA_MAP_FROM}, + {"QUAL.OMP.MAP.TOFROM", DSA_MAP_TOFROM}, + {"QUAL.OMP.MAP.ALLOC.STRUCT", DSA_MAP_ALLOC_STRUCT}, + {"QUAL.OMP.MAP.TO.STRUCT", DSA_MAP_TO_STRUCT}, + {"QUAL.OMP.MAP.FROM.STRUCT", DSA_MAP_FROM_STRUCT}, + {"QUAL.OMP.MAP.TOFROM.STRUCT", DSA_MAP_TOFROM_STRUCT}}; + +/// Data attributes for each data reference used in an OpenMP target region. 
+enum tgt_map_type { + // No flags + OMP_TGT_MAPTYPE_NONE = 0x000, + // copy data from host to device + OMP_TGT_MAPTYPE_TO = 0x001, + // copy data from device to host + OMP_TGT_MAPTYPE_FROM = 0x002, + // copy regardless of the reference count + OMP_TGT_MAPTYPE_ALWAYS = 0x004, + // force unmapping of data + OMP_TGT_MAPTYPE_DELETE = 0x008, + // map the pointer as well as the pointee + OMP_TGT_MAPTYPE_PTR_AND_OBJ = 0x010, + // pass device base address to kernel + OMP_TGT_MAPTYPE_TARGET_PARAM = 0x020, + // return base device address of mapped data + OMP_TGT_MAPTYPE_RETURN_PARAM = 0x040, + // private variable - not mapped + OMP_TGT_MAPTYPE_PRIVATE = 0x080, + // copy by value - not mapped + OMP_TGT_MAPTYPE_LITERAL = 0x100, + // mapping is implicit + OMP_TGT_MAPTYPE_IMPLICIT = 0x200, + // copy data to device + OMP_TGT_MAPTYPE_CLOSE = 0x400, + // runtime error if not already allocated + OMP_TGT_MAPTYPE_PRESENT = 0x1000, + // descriptor for non-contiguous target-update + OMP_TGT_MAPTYPE_NON_CONTIG = 0x100000000000, + // member of struct, member given by [16 MSBs] - 1 + OMP_TGT_MAPTYPE_MEMBER_OF = 0xffff000000000000 +}; + +struct OffloadingMappingArgsTy { + Value *Sizes; + Value *MapTypes; + Value *MapNames; + Value *BasePtrs; + Value *Ptrs; + size_t Size; +}; + +struct FieldMappingInfo { + Value *Index; + Value *Offset; + Value *NumElements; + DSAType MapType; +}; + +using StructMapTy = MapVector>; + +struct OMPLoopInfoStruct { + Value *IV = nullptr; + Value *Start = nullptr; + Value *LB = nullptr; + Value *UB = nullptr; + // 0 is invalid, schedule will be set by the user or to reasonable defaults + // by the pass. + OMPScheduleType DistSched = static_cast(0); + OMPScheduleType Sched = static_cast(0); + Value *Chunk = nullptr; +}; + +struct OMPDistributeInfoStruct { + Value *UB = nullptr; + Value *LB = nullptr; +}; + +struct TargetInfoStruct { + StringRef DevFuncName; + ConstantDataArray *ELF = nullptr; + Value *NumTeams = nullptr; + Value *ThreadLimit = nullptr; + OMPTgtExecModeFlags ExecMode = (OMPTgtExecModeFlags)0; + bool NoWait = false; +}; + +struct ParRegionInfoStruct { + Value *NumThreads = nullptr; + Value *IfCondition = nullptr; +}; + +struct TeamsInfoStruct { + Value *NumTeams = nullptr; + Value *ThreadLimit = nullptr; +}; + +struct CGReduction { + template + static Value *emitOperation(IRBuilderBase &IRB, Value *LHS, Value *RHS); + + template + static OpenMPIRBuilder::InsertPointTy + reductionNonAtomic(OpenMPIRBuilder::InsertPointTy IP, Value *LHS, Value *RHS, + Value *&Result) { + IRBuilder<> Builder(IP.getBlock(), IP.getPoint()); + Result = emitOperation(Builder, LHS, RHS); + return Builder.saveIP(); + } + + template + static InsertPointTy emitAtomicOperationRMW(IRBuilderBase &IRB, Value *LHS, + Value *Partial); + + template + static InsertPointTy emitAtomicOperationCmpxchg(IRBuilderBase &IRB, + InsertPointTy IP, Type *VTy, + Value *LHS, Value *Partial) { + LLVMContext &Ctx = IRB.getContext(); + unsigned int Bitwidth = VTy->getScalarSizeInBits(); + auto *IntTy = + (Bitwidth == 64 ? Type::getInt64Ty(Ctx) : Type::getInt32Ty(Ctx)); + auto *IntPtrTy = + (Bitwidth == 64 ? Type::getInt64PtrTy(Ctx) : Type::getInt32PtrTy(Ctx)); + + auto SaveIP = IRB.saveIP(); + // TODO: move alloca to function entry point, may be outlined later, e.g., + // for nested under parallel. 
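+    // Sketch of the retry scheme implemented here: the destination is read
+    // through an integer pointer of equal width, the reduction op is applied
+    // on the bitcast value, and a monotonic cmpxchg publishes the result; on
+    // failure the observed value is kept in the temporary alloca below and
+    // the update is retried in the 'Retry' block.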
+    Value *AllocaTemp =
+        IRB.CreateAlloca(IntTy, nullptr, "atomic.alloca.tmp");
+    IRB.restoreIP(SaveIP);
+
+    Value *CastLHS =
+        IRB.CreateBitCast(LHS, IntPtrTy, LHS->getName() + ".cast.int");
+    auto *LoadAtomic =
+        IRB.CreateLoad(IntTy, CastLHS, LHS->getName() + ".load.atomic");
+    LoadAtomic->setAtomic(AtomicOrdering::Monotonic);
+
+    Value *CastFP = IRB.CreateBitCast(LoadAtomic, VTy, "cast.fp");
+    Value *RedOp = emitOperation<ReductionOperator>(IRB, CastFP, Partial);
+    Value *CastFAdd =
+        IRB.CreateBitCast(RedOp, IntTy, RedOp->getName() + ".cast.int");
+
+    auto *CmpXchg = IRB.CreateAtomicCmpXchg(CastLHS, LoadAtomic, CastFAdd,
+                                            None, AtomicOrdering::Monotonic,
+                                            AtomicOrdering::Monotonic);
+
+    auto *Returned = IRB.CreateExtractValue(CmpXchg, 0);
+    auto *StoreTemp = IRB.CreateStore(Returned, AllocaTemp);
+    auto *Cond = IRB.CreateExtractValue(CmpXchg, 1);
+    // Add unreachable as placeholder for splitting.
+    auto *Unreachable = IRB.CreateUnreachable();
+    auto *IfTrueTerm = SplitBlockAndInsertIfThen(Cond, Unreachable, false);
+    auto *ExitBlock = IfTrueTerm->getParent();
+    auto *Retry = ExitBlock->getSingleSuccessor();
+    assert(Retry && "Expected single successor tail block");
+    // Erase the fall-through branch.
+    IfTrueTerm->eraseFromParent();
+
+    SaveIP = IRB.saveIP();
+    IRB.SetInsertPoint(Retry, Retry->getFirstInsertionPt());
+    auto *LoadReturned = IRB.CreateLoad(IntTy, AllocaTemp);
+    auto *CastLoad = IRB.CreateBitCast(LoadReturned, VTy);
+    RedOp = emitOperation<ReductionOperator>(IRB, CastLoad, Partial);
+    CastFAdd =
+        IRB.CreateBitCast(RedOp, IntTy, RedOp->getName() + ".cast.int");
+    CmpXchg = IRB.CreateAtomicCmpXchg(CastLHS, LoadReturned, CastFAdd, None,
+                                      AtomicOrdering::Monotonic,
+                                      AtomicOrdering::Monotonic);
+    Returned = IRB.CreateExtractValue(CmpXchg, 0);
+    StoreTemp = IRB.CreateStore(Returned, AllocaTemp);
+    Cond = IRB.CreateExtractValue(CmpXchg, 1);
+    IRB.CreateCondBr(Cond, ExitBlock, Retry);
+    // Remove unreachable placeholder.
+    Unreachable->eraseFromParent();
+    IRB.restoreIP(SaveIP);
+
+    return InsertPointTy(ExitBlock, ExitBlock->getFirstInsertionPt());
+  }
+
+  template <DSAType ReductionOperator>
+  static OpenMPIRBuilder::InsertPointTy
+  reductionAtomic(OpenMPIRBuilder::InsertPointTy IP, Type *VTy, Value *LHS,
+                  Value *RHS) {
+    IRBuilder<> Builder(IP.getBlock(), IP.getPoint());
+    Value *Partial = Builder.CreateLoad(VTy, RHS, "red.partial");
+    if (VTy->isIntegerTy())
+      switch (ReductionOperator) {
+      case DSA_REDUCTION_ADD:
+      case DSA_REDUCTION_SUB:
+        return emitAtomicOperationRMW<ReductionOperator>(Builder, LHS,
+                                                         Partial);
+      case DSA_REDUCTION_MUL:
+        // RMW does not support mul.
+        return emitAtomicOperationCmpxchg<ReductionOperator>(Builder, IP, VTy,
+                                                             LHS, Partial);
+      default:
+        FATAL_ERROR("Unsupported reduction operation");
+      }
+    else if (VTy->isFloatTy() || VTy->isDoubleTy()) {
+      // NOTE: Using atomicrmw for floats is buggy for aarch64, fall back to
+      // cmpxchg codegen for now similarly to Clang. Revisit with newer LLVM
+      // versions.
+ // Builder.CreateAtomicRMW(AtomicRMWInst::FAdd, LHS, Partial, None, + // AtomicOrdering::Monotonic); + return emitAtomicOperationCmpxchg(Builder, IP, VTy, LHS, + Partial); + } else + FATAL_ERROR("Unsupported type for reductionAtomic"); + } + + template + static Value *emitInitAndAppendInfo( + IRBuilderBase &IRB, InsertPointTy AllocaIP, Value *Orig, + SmallVectorImpl &ReductionInfos) { + auto GetIdentityValue = []() { + switch (ReductionOperator) { + case DSA_REDUCTION_ADD: + case DSA_REDUCTION_SUB: + return 0; + case DSA_REDUCTION_MUL: + return 1; + default: + FATAL_ERROR("Unknown reduction type"); + } + }; + + Type *VTy = Orig->getType()->getPointerElementType(); + auto SaveIP = IRB.saveIP(); + IRB.restoreIP(AllocaIP); + Value *Priv = IRB.CreateAlloca(VTy, /* ArraySize */ nullptr, + Orig->getName() + ".red.priv"); + IRB.restoreIP(SaveIP); + + // Store identity value based on operation and type. + if (VTy->isIntegerTy()) { + IRB.CreateStore(ConstantInt::get(VTy, GetIdentityValue()), + Priv); + } else if (VTy->isFloatTy() || VTy->isDoubleTy()) { + IRB.CreateStore(ConstantFP::get(VTy, GetIdentityValue()), + Priv); + } + else + FATAL_ERROR( + "Unsupported type to init with identity reduction value"); + + ReductionInfos.push_back( + {VTy, Orig, Priv, + CGReduction::reductionNonAtomic, + CGReduction::reductionAtomic}); + + return Priv; + } +}; + +class CGIntrinsicsOpenMP { +public: + CGIntrinsicsOpenMP(Module &M); + + OpenMPIRBuilder OMPBuilder; + Module &M; + StructType *TgtOffloadEntryTy; + + StructType *getTgtOffloadEntryTy() { return TgtOffloadEntryTy; } + + void emitOMPParallel(DSAValueMapTy &DSAValueMap, ValueToValueMapTy *VMap, + const DebugLoc &DL, Function *Fn, BasicBlock *BBEntry, + BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB, FinalizeCallbackTy FiniCB, + ParRegionInfoStruct &ParRegionInfo); + + void emitOMPFor(DSAValueMapTy &DSAValueMap, OMPLoopInfoStruct &OMPLoopInfo, + BasicBlock *StartBB, BasicBlock *ExitBB, bool IsStandalone, + bool IsDistributeParallelFor); + + void emitOMPTask(DSAValueMapTy &DSAValueMap, Function *Fn, + BasicBlock *BBEntry, BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB); + + void emitOMPOffloadingEntry(const Twine &DevFuncName, Value *EntryPtr, + Constant *&OMPOffloadEntry); + + void emitOMPOffloadingMappings(InsertPointTy AllocaIP, + DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap, + OffloadingMappingArgsTy &OffloadingMappingArgs, + bool IsTargetRegion); + + void emitOMPSingle(Function *Fn, BasicBlock *BBEntry, BasicBlock *AfterBB, + BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB); + + void emitOMPCritical(Function *Fn, BasicBlock *BBEntry, BasicBlock *AfterBB, + BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB); + + void emitOMPBarrier(Function *Fn, BasicBlock *BBEntry, Directive DK); + + void emitOMPTaskwait(BasicBlock *BBEntry); + + void emitOMPTarget(Function *Fn, BasicBlock *BBEntry, BasicBlock *StartBB, + BasicBlock *EndBB, DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap, + TargetInfoStruct &TargetInfo, + OMPLoopInfoStruct *OMPLoopInfo, bool IsDeviceTargetRegion); + + void emitOMPTeams(DSAValueMapTy &DSAValueMap, ValueToValueMapTy *VMap, + const DebugLoc &DL, Function *Fn, BasicBlock *BBEntry, + BasicBlock *StartBB, BasicBlock *EndBB, BasicBlock *AfterBB, + TeamsInfoStruct &TeamsInfo); + + void emitOMPTargetData(Function *Fn, BasicBlock *BBEntry, BasicBlock *BBExit, + DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap); + + void emitOMPTargetEnterData(Function 
*Fn, BasicBlock *BBEntry, + DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap); + + void emitOMPTargetExitData(Function *Fn, BasicBlock *BBEntry, + DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap); + + void emitOMPTargetUpdate(Function *Fn, BasicBlock *BBEntry, + DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap); + + void emitOMPDistribute(DSAValueMapTy &DSAValueMap, + OMPLoopInfoStruct &OMPLoopInfo, BasicBlock *StartBB, + BasicBlock *ExitBB, bool IsStandalone, + bool IsDistributeParallelFor, + OMPDistributeInfoStruct *DistributeInfo = nullptr); + + void emitOMPDistributeParallelFor(DSAValueMapTy &DSAValueMap, + BasicBlock *StartBB, BasicBlock *ExitBB, + OMPLoopInfoStruct &OMPLoopInfo, + ParRegionInfoStruct &ParRegionInfo, + bool IsStandalone); + + void emitOMPTargetTeamsDistributeParallelFor( + DSAValueMapTy &DSAValueMap, const DebugLoc &DL, Function *Fn, + BasicBlock *EntryBB, BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *ExitBB, BasicBlock *AfterBB, OMPLoopInfoStruct &OMPLoopInfo, + ParRegionInfoStruct &ParRegionInfo, TargetInfoStruct &TargetInfo, + StructMapTy &StructMappingInfoMap, bool IsDeviceTargetRegion); + + void emitOMPTargetTeams(DSAValueMapTy &DSAValueMap, ValueToValueMapTy *VMap, + const DebugLoc &DL, Function *Fn, BasicBlock *EntryBB, + BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB, TargetInfoStruct &TargetInfo, + OMPLoopInfoStruct *OMPLoopInfo, + StructMapTy &StructMappingInfoMap, + bool IsDeviceTargetRegion); + + GlobalVariable *emitOffloadingGlobals(StringRef DevWrapperFuncName, + ConstantDataArray *ELF); + + Twine getDevWrapperFuncPrefix() { return "__omp_offload_numba_"; } + + Function *createOutlinedFunction(DSAValueMapTy &DSAValueMap, + ValueToValueMapTy *VMap, Function *OuterFn, + BasicBlock *StartBB, BasicBlock *EndBB, + SmallVectorImpl &CapturedVars, + StringRef Suffix); + + void setDeviceGlobalizedValues(const ArrayRef GlobalizedValues); + +private: + void emitOMPParallelDeviceRuntime(DSAValueMapTy &DSAValueMap, + ValueToValueMapTy *VMap, const DebugLoc &DL, + Function *Fn, BasicBlock *BBEntry, + BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB, + FinalizeCallbackTy FiniCB, + ParRegionInfoStruct &ParRegionInfo); + void emitOMPParallelHostRuntime(DSAValueMapTy &DSAValueMap, + ValueToValueMapTy *VMap, const DebugLoc &DL, + Function *Fn, BasicBlock *BBEntry, + BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB, + FinalizeCallbackTy FiniCB, + ParRegionInfoStruct &ParRegionInfo); + void emitOMPParallelHostRuntimeOMPIRBuilder( + DSAValueMapTy &DSAValueMap, ValueToValueMapTy *VMap, const DebugLoc &DL, + Function *Fn, BasicBlock *BBEntry, BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB, FinalizeCallbackTy FiniCB, + ParRegionInfoStruct &ParRegionInfo); + + void emitOMPTeamsDeviceRuntime(DSAValueMapTy &DSAValueMap, + ValueToValueMapTy *VMap, const DebugLoc &DL, + Function *Fn, BasicBlock *BBEntry, + BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB, + TeamsInfoStruct &TeamsInfo); + void emitOMPTeamsHostRuntime(DSAValueMapTy &DSAValueMap, + ValueToValueMapTy *VMap, const DebugLoc &DL, + Function *Fn, BasicBlock *BBEntry, + BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB, TeamsInfoStruct &TeamsInfo); + + void emitOMPTargetHost(Function *Fn, BasicBlock *BBEntry, BasicBlock *StartBB, + BasicBlock *EndBB, DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap, + TargetInfoStruct &TargetInfo, + OMPLoopInfoStruct *OMPLoopInfo); + + void 
emitOMPTargetDevice(Function *Fn, BasicBlock *BBEntry, + BasicBlock *StartBB, BasicBlock *EndBB, + DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap, + TargetInfoStruct &TargetInfo); + + void emitLoop(DSAValueMapTy &DSAValueMap, OMPLoopInfoStruct &OMPLoopInfo, + BasicBlock *StartBB, BasicBlock *ExitBB, bool IsStandalone, + bool IsDistribute, bool IsDistributeParallelFor, + OMPDistributeInfoStruct *OMPDistributeInfo = nullptr); + + FunctionCallee getKmpcForStaticInit(Type *Ty); + FunctionCallee getKmpcDistributeStaticInit(Type *Ty); + Value *createScalarCast(Value *V, Type *DestTy); + bool isOpenMPDeviceRuntime(); + + SmallPtrSet DeviceGlobalizedValues; +}; + +} // namespace iomp + +#endif \ No newline at end of file diff --git a/numba/openmp/pass/CMakeLists.txt b/numba/openmp/pass/CMakeLists.txt new file mode 100644 index 000000000000..a01fa0a46c71 --- /dev/null +++ b/numba/openmp/pass/CMakeLists.txt @@ -0,0 +1,40 @@ +cmake_minimum_required(VERSION 3.20) +project(pyomp-pass) + +# Set this to a valid LLVM installation dir +set(LT_LLVM_INSTALL_DIR "" CACHE PATH "LLVM installation directory") + +# Add the location of LLVMConfig.cmake to CMake search paths (so that +# find_package can locate it) +list(APPEND CMAKE_PREFIX_PATH "${LT_LLVM_INSTALL_DIR}/lib/cmake/llvm/") + +find_package(LLVM CONFIG) +if("${LLVM_VERSION_MAJOR}" VERSION_LESS 14) + message(FATAL_ERROR "Found LLVM ${LLVM_VERSION_MAJOR}, but need LLVM 14 or above") +endif() + +# HelloWorld includes headers from LLVM - update the include paths accordingly +include_directories(SYSTEM ${LLVM_INCLUDE_DIRS}) + +# Use the same C++ standard as LLVM does +set(CMAKE_CXX_STANDARD 17 CACHE STRING "") + +# LLVM is normally built without RTTI. Be consistent with that. +if(NOT LLVM_ENABLE_RTTI) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti") +endif() + +add_library(IntrinsicsOpenMP SHARED + CGIntrinsicsOpenMP.cpp + DebugOpenMP.cpp + IntrinsicsOpenMP.cpp) + +# Allow undefined symbols in shared objects on Darwin (this is the default +# behaviour on Linux) +target_link_libraries(IntrinsicsOpenMP + "$<$:-undefined dynamic_lookup>") + +install(TARGETS IntrinsicsOpenMP + EXPORT IntrinsicsOpenMP + LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX} +) diff --git a/numba/openmp/pass/DebugOpenMP.cpp b/numba/openmp/pass/DebugOpenMP.cpp new file mode 100644 index 000000000000..d0d01f4f7d7b --- /dev/null +++ b/numba/openmp/pass/DebugOpenMP.cpp @@ -0,0 +1,16 @@ +#include +#include +#include + +bool DebugOpenMPFlag; +void DebugOpenMPInit() { + char *DebugStr = getenv("NUMBA_DEBUG_OPENMP_LLVM_PASS"); + DebugOpenMPFlag = false; + if(DebugStr) + DebugOpenMPFlag = (std::stoi(DebugStr) >= 1); +} + +[[noreturn]] void fatalError(const std::string &msg, const char *file, int line) { + std::cerr << "Fatal error @ " << file << ":" << line << " :: " << msg << "\n"; + std::abort(); +} diff --git a/numba/openmp/pass/DebugOpenMP.h b/numba/openmp/pass/DebugOpenMP.h new file mode 100644 index 000000000000..c9814dc90553 --- /dev/null +++ b/numba/openmp/pass/DebugOpenMP.h @@ -0,0 +1,28 @@ +#ifndef DEBUG_OPENMP_H +#define DEBUG_OPENMP_H + +#include + +#ifdef NDEBUG + +#define DEBUG_ENABLE(X) +#define DebugOpenMPInit() + +#else + +extern bool DebugOpenMPFlag; +void DebugOpenMPInit(); + +#define DEBUG_ENABLE(X) \ + do { \ + if (DebugOpenMPFlag) { \ + X; \ + } \ + } while (false) + +#endif + +[[noreturn]] void fatalError(const std::string &msg, const char *file, int line); +#define FATAL_ERROR(msg) fatalError(msg, __FILE__, __LINE__) + +#endif diff --git 
a/numba/openmp/pass/IntrinsicsOpenMP.cpp b/numba/openmp/pass/IntrinsicsOpenMP.cpp new file mode 100644 index 000000000000..81b4e334cdd7 --- /dev/null +++ b/numba/openmp/pass/IntrinsicsOpenMP.cpp @@ -0,0 +1,732 @@ +//===- IntrinsicsOpenMP.cpp - Codegen OpenMP from IR intrinsics +//--------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements code generation for OpenMP from intrinsics embedded in +// the IR, using the OpenMPIRBuilder +// +//===-------------------------------------------------------------------------===// + +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Frontend/OpenMP/OMP.h.inc" +#include "llvm/Frontend/OpenMP/OMPConstants.h" +#include "llvm/Frontend/OpenMP/OMPIRBuilder.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Pass.h" +#include "llvm/Passes/PassBuilder.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include + +#include "IntrinsicsOpenMP.h" +#include "IntrinsicsOpenMP_CAPI.h" +#include "CGIntrinsicsOpenMP.h" +#include "DebugOpenMP.h" + +#include +#include + +using namespace llvm; +using namespace omp; +using namespace iomp; + +#define DEBUG_TYPE "intrinsics-openmp" + +// TODO: Increment. +STATISTIC(NumOpenMPRegions, "Counts number of OpenMP regions created"); + +namespace { + +class DirectiveRegionAnalysis; + +class DirectiveRegion; +SmallVector, 8> DirectiveRegionStorage; + +class DirectiveRegion { +public: + DirectiveRegion() = delete; + + void addNested(DirectiveRegionAnalysis &DRA, DirectiveRegion *DR); + + const SmallVector &getNested() const { return Nested; } + + CallBase *getEntry() const { return CBEntry; } + + CallBase *getExit() const { return CBExit; } + + void setParent(DirectiveRegion *P) { Parent = P; } + + DirectiveRegion *getParent() const { return Parent; } + + StringRef getTag() const { + return getEntry()->getOperandBundleAt(0).getTagName(); + } + + static DirectiveRegion *create(CallBase *CBEntry, CallBase *CBExit) { + // Use global storage of unique_ptr for auto-cleanup. + DirectiveRegionStorage.push_back( + std::unique_ptr(new DirectiveRegion{CBEntry, CBExit})); + return DirectiveRegionStorage.back().get(); + } + +private: + CallBase *CBEntry; + CallBase *CBExit; + DirectiveRegion *Parent; + SmallVector Nested; + + DirectiveRegion(CallBase *CBEntry, CallBase *CBExit) + : CBEntry(CBEntry), CBExit(CBExit), Parent(nullptr) {} +}; + +class DirectiveRegionAnalysis { +public: + explicit DirectiveRegionAnalysis(Function &F) : DT(F), PDT(F) {} + + bool directiveEncloses(DirectiveRegion *DR, DirectiveRegion *OtherDR) { + // Use DominatorTree for Entry and PostDominatorTree for Exit. + // PostDominator is effective for checking Exit when there are loops in + // the CFG, since dominance does not hold for graphs with cycles, but + // post-dominance does. 
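+    // For example, with a TARGET region enclosing a PARALLEL region, the
+    // TARGET entry dominates the PARALLEL entry and the TARGET exit
+    // post-dominates the PARALLEL exit, so the TARGET directive is reported
+    // as enclosing the PARALLEL one even when the enclosed body contains
+    // loops.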
+ if (DT.dominates(DR->getEntry(), OtherDR->getEntry()) && + PDT.dominates(DR->getExit(), OtherDR->getExit())) + return true; + + return false; + }; + + bool directiveEntryDominates(DirectiveRegion *DR, DirectiveRegion *OtherDR) { + if (DT.dominates(DR->getEntry(), OtherDR->getEntry())) + return true; + + return false; + } + +private: + DominatorTree DT; + PostDominatorTree PDT; +}; + +void DirectiveRegion::addNested(DirectiveRegionAnalysis &DRA, + DirectiveRegion *DR) { + // Insert in topological order. + auto Compare = [&DRA](DirectiveRegion *DR, DirectiveRegion *OtherDR) { + return DRA.directiveEntryDominates(DR, OtherDR); + }; + + Nested.insert(std::upper_bound(Nested.begin(), Nested.end(), DR, Compare), + DR); +} + +static SmallVector +collectGlobalizedValues(DirectiveRegion &Directive) { + + SmallVector GlobalizedValues; + + SmallVector OpBundles; + Directive.getEntry()->getOperandBundlesAsDefs(OpBundles); + for (OperandBundleDef &O : OpBundles) { + StringRef Tag = O.getTag(); + auto It = StringToDSA.find(Tag); + if (It == StringToDSA.end()) + continue; + + const ArrayRef &TagInputs = O.inputs(); + + DSAType DSATy = It->second; + + switch (DSATy) { + case iomp::DSA_FIRSTPRIVATE: + case iomp::DSA_PRIVATE: + continue; + default: + GlobalizedValues.push_back(TagInputs[0]); + } + } + + return GlobalizedValues; +} + +struct IntrinsicsOpenMP { + + IntrinsicsOpenMP() { + DebugOpenMPInit(); + } + + bool runOnModule(Module &M) { + // Codegen for nested or combined constructs assumes code is generated + // bottom-up, that is from the innermost directive to the outermost. This + // simplifies handling of DSA attributes by avoiding renaming values (tags + // contain pre-lowered values when defining the data sharing environment) + // when an outlined function privatizes them in the DSAValueMap. + DEBUG_ENABLE(dbgs() << "=== Start IntrinsicsOpenMPPass v4\n"); + + Function *RegionEntryF = M.getFunction("llvm.directive.region.entry"); + + // Return early for lack of directive intrinsics. + if (!RegionEntryF) { + DEBUG_ENABLE(dbgs() << "No intrinsics directives, exiting...\n"); + return false; + } + + DEBUG_ENABLE(dbgs() << "=== Dump Module\n" + << M << "=== End of Dump Module\n"); + + CGIntrinsicsOpenMP CGIOMP(M); + // Find all calls to directive intrinsics. + SmallMapVector, 8> + FunctionToDirectives; + + for (User *Usr : RegionEntryF->users()) { + CallBase *CBEntry = dyn_cast(Usr); + assert(CBEntry && "Expected call to directive entry"); + assert(CBEntry->getNumUses() == 1 && + "Expected single use of the directive entry"); + Use &U = *CBEntry->use_begin(); + CallBase *CBExit = dyn_cast(U.getUser()); + assert(CBExit && "Expected call to region exit intrinsic"); + Function *F = CBEntry->getFunction(); + assert(F == CBExit->getFunction() && + "Expected directive entry/exit in the same function"); + + DirectiveRegion *DM = DirectiveRegion::create(CBEntry, CBExit); + FunctionToDirectives[F].push_back(DM); + } + + SmallVector, 4> DirectiveListVector; + // Create directive lists per function, building trees of directive nests. + // Each list stores directives outermost to innermost (pre-order). + for (auto &FTD : FunctionToDirectives) { + // Find the dominator tree for the function to find directive lists. + Function &F = *FTD.first; + auto &DirectiveRegions = FTD.second; + DirectiveRegionAnalysis DRA{F}; + + // Construct directive tree nests. First, find immediate parents, then add + // nested children to parents. + + // Find immediate parents. 
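+      // Sketch of the search: every enclosing candidate is considered and
+      // the innermost one wins. E.g. for TARGET { TEAMS { PARALLEL } } the
+      // PARALLEL region's candidates are TARGET and TEAMS, and TEAMS becomes
+      // its immediate parent because TARGET also encloses TEAMS.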
+ for (auto *DR : DirectiveRegions) { + for (auto *OtherDR : DirectiveRegions) { + if (DR == OtherDR) + continue; + + if (!DRA.directiveEncloses(OtherDR, DR)) + continue; + + DirectiveRegion *Parent = DR->getParent(); + if (!Parent) { + DR->setParent(OtherDR); + continue; + } + + // If OtherDR is nested under Parent and encloses DR, then OtherDR is + // the immediate parent of DR. + if (DRA.directiveEncloses(Parent, OtherDR)) { + DR->setParent(OtherDR); + continue; + } + + // Else, OtherDR must be enclosing Parent. It is not OtherDR's + // immediate parent, hence no change to OtherDR. + assert(DRA.directiveEncloses(OtherDR, Parent)); + } + } + // Gather all root directives, add nested children. + SmallVector Roots; + for (auto *DR : DirectiveRegions) { + DirectiveRegion *Parent = DR->getParent(); + if (!Parent) { + Roots.push_back(DR); + continue; + } + + Parent->addNested(DRA, DR); + } + + // Travese the tree and add directives (outermost to innermost) + // in a list. + for (auto *Root : Roots) { + SmallVector DirectiveList; + + auto VisitNode = [&DirectiveList](DirectiveRegion *Node, int Depth, + auto &&VisitNode) -> void { + DirectiveList.push_back(Node); + for (auto *Nested : Node->getNested()) + VisitNode(Nested, Depth + 1, VisitNode); + }; + + VisitNode(Root, 0, VisitNode); + + DirectiveListVector.push_back(DirectiveList); + + auto PrintTree = [&]() { + dbgs() << " === TREE\n"; + auto PrintNode = [](DirectiveRegion *Node, int Depth, + auto &&PrintNode) -> void { + if (Depth) { + for (int I = 0; I < Depth; ++I) + dbgs() << " "; + dbgs() << "|_ "; + } + dbgs() << Node->getTag() << "\n"; + + for (auto *Nested : Node->getNested()) + PrintNode(Nested, Depth + 1, PrintNode); + }; + PrintNode(Root, 0, PrintNode); + dbgs() << " === END OF TREE\n"; + }; + DEBUG_ENABLE(PrintTree()); + + auto PrintList = [&]() { + dbgs() << " === List\n"; + for (auto *DR : DirectiveList) + dbgs() << DR->getTag() << " -> "; + dbgs() << "EOL\n"; + dbgs() << " === End of List\n"; + }; + DEBUG_ENABLE(PrintList()); + } + } + + // Iterate all directive lists and codegen. + for (auto &DirectiveList : DirectiveListVector) { + // If the outermost directive is a TARGET directive, collect globalized + // values to set for codegen. + // TODO: implement Directives as a class, parse each directive before + // codegen, optimize privatization. + auto *Outer = DirectiveList.front(); + if (Outer->getEntry()->getOperandBundleAt(0).getTagName().contains( + "TARGET")) { + auto GlobalizedValues = collectGlobalizedValues(*Outer); + CGIOMP.setDeviceGlobalizedValues(GlobalizedValues); + } + // Iterate post-order, from innermost to outermost to avoid renaming + // values in codegen. + for (auto It = DirectiveList.rbegin(), E = DirectiveList.rend(); It != E; + ++It) { + DirectiveRegion *DR = *It; + DEBUG_ENABLE(dbgs() << "Found Directive " << *DR->getEntry() << "\n"); + // Extract the directive kind and data sharing attributes of values + // from the operand bundles of the intrinsic call. + Directive Dir = OMPD_unknown; + SmallVector OpBundles; + DSAValueMapTy DSAValueMap; + + // RAII for directive metainfo structs. + OMPLoopInfoStruct OMPLoopInfo; + ParRegionInfoStruct ParRegionInfo; + TargetInfoStruct TargetInfo; + TeamsInfoStruct TeamsInfo; + + MapVector> + StructMappingInfoMap; + + bool IsDeviceTargetRegion = false; + + DR->getEntry()->getOperandBundlesAsDefs(OpBundles); + // TODO: parse clauses. 
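+        // For illustration, a combined 'target teams distribute parallel
+        // for' entry typically carries bundles such as
+        //   "DIR.OMP.TARGET.TEAMS.DISTRIBUTE.PARALLEL.LOOP"(),
+        //   "QUAL.OMP.NORMALIZED.IV"(%iv), "QUAL.OMP.NORMALIZED.UB"(%ub),
+        //   "QUAL.OMP.MAP.TOFROM"(%arr)
+        // The loop below dispatches on the tag prefix: DIR.* selects the
+        // directive, QUAL.* fills the loop/target/teams info structs or the
+        // DSAValueMap.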
+ for (OperandBundleDef &O : OpBundles) { + StringRef Tag = O.getTag(); + DEBUG_ENABLE(dbgs() << "OPB " << Tag << "\n"); + + // TODO: check for conflicting DSA, for example reduction variables + // cannot be set private. Should be done in Numba. + if (Tag.startswith("DIR")) { + auto It = StringToDir.find(Tag); + assert(It != StringToDir.end() && "Directive is not supported!"); + Dir = It->second; + } else if (Tag.startswith("QUAL")) { + const ArrayRef &TagInputs = O.inputs(); + if (Tag.startswith("QUAL.OMP.NORMALIZED.IV")) { + assert(O.input_size() == 1 && "Expected single IV value"); + OMPLoopInfo.IV = TagInputs[0]; + } else if (Tag.startswith("QUAL.OMP.NORMALIZED.START")) { + assert(O.input_size() == 1 && "Expected single START value"); + OMPLoopInfo.Start = TagInputs[0]; + } else if (Tag.startswith("QUAL.OMP.NORMALIZED.LB")) { + assert(O.input_size() == 1 && "Expected single LB value"); + OMPLoopInfo.LB = TagInputs[0]; + } else if (Tag.startswith("QUAL.OMP.NORMALIZED.UB")) { + assert(O.input_size() == 1 && "Expected single UB value"); + OMPLoopInfo.UB = TagInputs[0]; + } else if (Tag.startswith("QUAL.OMP.NUM_THREADS")) { + assert(O.input_size() == 1 && "Expected single NumThreads value"); + ParRegionInfo.NumThreads = TagInputs[0]; + } else if (Tag.startswith("QUAL.OMP.SCHEDULE")) { + // TODO: Add DIST_SCHEDULE for distribute loops. + assert(O.input_size() == 1 && + "Expected single chunking scheduling value"); + Constant *Zero = ConstantInt::get(TagInputs[0]->getType(), 0); + OMPLoopInfo.Chunk = TagInputs[0]; + + if (Tag == "QUAL.OMP.SCHEDULE.STATIC") { + if (TagInputs[0] == Zero) + OMPLoopInfo.Sched = OMPScheduleType::Static; + else { + OMPLoopInfo.Sched = OMPScheduleType::StaticChunked; + OMPLoopInfo.Chunk = TagInputs[0]; + } + } else + FATAL_ERROR("Unsupported scheduling type"); + } else if (Tag.startswith("QUAL.OMP.IF")) { + assert(O.input_size() == 1 && + "Expected single if condition value"); + ParRegionInfo.IfCondition = TagInputs[0]; + } else if (Tag.startswith("QUAL.OMP.TARGET.DEV_FUNC")) { + assert(O.input_size() == 1 && + "Expected a single device function name"); + ConstantDataArray *DevFuncArray = + dyn_cast(TagInputs[0]); + assert(DevFuncArray && + "Expected constant string for the device function"); + TargetInfo.DevFuncName = DevFuncArray->getAsString(); + } else if (Tag.startswith("QUAL.OMP.TARGET.ELF")) { + assert(O.input_size() == 1 && + "Expected a single elf image string"); + ConstantDataArray *ELF = + dyn_cast(TagInputs[0]); + assert(ELF && "Expected constant string for ELF"); + TargetInfo.ELF = ELF; + } else if (Tag.startswith("QUAL.OMP.DEVICE")) { + // TODO: Handle device selection for target regions. 
+ } else if (Tag.startswith("QUAL.OMP.NUM_TEAMS")) { + assert(O.input_size() == 1 && "Expected single NumTeams value"); + switch (Dir) { + case OMPD_target: + TargetInfo.NumTeams = TagInputs[0]; + break; + case OMPD_teams: + case OMPD_teams_distribute: + case OMPD_teams_distribute_parallel_for: + TeamsInfo.NumTeams = TagInputs[0]; + break; + case OMPD_target_teams: + case OMPD_target_teams_distribute: + TargetInfo.NumTeams = TagInputs[0]; + TeamsInfo.NumTeams = TagInputs[0]; + break; + case OMPD_target_teams_distribute_parallel_for: + TargetInfo.NumTeams = TagInputs[0]; + TeamsInfo.NumTeams = TagInputs[0]; + break; + default: + FATAL_ERROR("Unsupported qualifier in directive"); + } + } else if (Tag.startswith("QUAL.OMP.THREAD_LIMIT")) { + assert(O.input_size() == 1 && + "Expected single ThreadLimit value"); + switch (Dir) { + case OMPD_target: + TargetInfo.ThreadLimit = TagInputs[0]; + break; + case OMPD_teams: + case OMPD_teams_distribute: + case OMPD_teams_distribute_parallel_for: + TeamsInfo.ThreadLimit = TagInputs[0]; + break; + case OMPD_target_teams: + case OMPD_target_teams_distribute: + case OMPD_target_teams_distribute_parallel_for: + TargetInfo.ThreadLimit = TagInputs[0]; + TeamsInfo.ThreadLimit = TagInputs[0]; + break; + default: + FATAL_ERROR("Unsupported qualifier in directive"); + } + } else if (Tag.startswith("QUAL.OMP.NOWAIT")) { + switch (Dir) { + case OMPD_target: + case OMPD_target_teams: + case OMPD_target_teams_distribute: + case OMPD_target_teams_distribute_parallel_for: + TargetInfo.NoWait = true; + break; + default: + FATAL_ERROR("Unsupported nowait qualifier in directive"); + } + } else /* DSA Qualifiers */ { + auto It = StringToDSA.find(Tag); + assert(It != StringToDSA.end() && "DSA type not found in map"); + if (It->second == DSA_MAP_ALLOC_STRUCT || + It->second == DSA_MAP_TO_STRUCT || + It->second == DSA_MAP_FROM_STRUCT || + It->second == DSA_MAP_TOFROM_STRUCT) { + assert((TagInputs.size() - 1) == 3 && + "Expected input triple for struct mapping"); + Value *Index = TagInputs[1]; + Value *Offset = TagInputs[2]; + Value *NumElements = TagInputs[3]; + StructMappingInfoMap[TagInputs[0]].push_back( + {Index, Offset, NumElements, It->second}); + + DSAValueMap[TagInputs[0]] = DSATypeInfo(DSA_MAP_STRUCT); + } else { + // This firstprivate includes a copy-constructor operand. + if ((It->second == DSA_FIRSTPRIVATE || + It->second == DSA_LASTPRIVATE) && + TagInputs.size() == 2) { + Value *V = TagInputs[0]; + ConstantDataArray *CopyFnNameArray = + dyn_cast(TagInputs[1]); + assert(CopyFnNameArray && "Expected constant string for the " + "copy-constructor function"); + StringRef CopyFnName = CopyFnNameArray->getAsString(); + FunctionCallee CopyConstructor = M.getOrInsertFunction( + CopyFnName, V->getType()->getPointerElementType(), + V->getType()->getPointerElementType()); + DSAValueMap[TagInputs[0]] = + DSATypeInfo(It->second, CopyConstructor); + } else + // Sink for DSA qualifiers that do not require special + // handling. + DSAValueMap[TagInputs[0]] = DSATypeInfo(It->second); + } + } + } else if (Tag == "OMP.DEVICE") + IsDeviceTargetRegion = true; + else + FATAL_ERROR(("Unknown tag " + Tag).str().c_str()); + } + + assert(Dir != OMPD_unknown && "Expected valid OMP directive"); + + // Gather info. + BasicBlock *BBEntry = DR->getEntry()->getParent(); + Function *Fn = BBEntry->getParent(); + const DebugLoc DL = BBEntry->getTerminator()->getDebugLoc(); + + // Create the basic block structure to isolate the outlined region. 
+ // Structure: BBEntry -> StartBB -> BBExit -> EndBB -> AfterBB + // TODO: Reverse naming on BBExit and EndBB? + BasicBlock *StartBB = SplitBlock(BBEntry, DR->getEntry()); + assert(BBEntry->getUniqueSuccessor() == StartBB && + "Expected unique successor at region start BB"); + + BasicBlock *BBExit = DR->getExit()->getParent(); + BasicBlock *EndBB = SplitBlock(BBExit, DR->getExit()->getNextNode()); + assert(BBExit->getUniqueSuccessor() == EndBB && + "Expected unique successor at region end BB"); + BasicBlock *AfterBB = SplitBlock(EndBB, &*EndBB->getFirstInsertionPt()); + + DEBUG_ENABLE(dbgs() << "BBEntry " << BBEntry->getName() << "\n"); + DEBUG_ENABLE(dbgs() << "StartBB " << StartBB->getName() << "\n"); + DEBUG_ENABLE(dbgs() << "BBExit " << BBExit->getName() << "\n"); + DEBUG_ENABLE(dbgs() << "EndBB " << EndBB->getName() << "\n"); + DEBUG_ENABLE(dbgs() << "AfterBB " << AfterBB->getName() << "\n"); + + // Define the default BodyGenCB lambda. + auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, + BasicBlock &ContinuationIP) { + BasicBlock *CGStartBB = CodeGenIP.getBlock(); + BasicBlock *CGEndBB = SplitBlock(CGStartBB, &*CodeGenIP.getPoint()); + assert(StartBB != nullptr && "StartBB should not be null"); + CGStartBB->getTerminator()->setSuccessor(0, StartBB); + assert(EndBB != nullptr && "EndBB should not be null"); + EndBB->getTerminator()->setSuccessor(0, CGEndBB); + }; + + // Define the default FiniCB lambda. + auto FiniCB = [&](InsertPointTy CodeGenIP) {}; + + // Remove intrinsics of OpenMP tags, first CBExit to also remove use + // of CBEntry, then CBEntry. + DR->getExit()->eraseFromParent(); + DR->getEntry()->eraseFromParent(); + + if (Dir == OMPD_parallel) { + CGIOMP.emitOMPParallel(DSAValueMap, nullptr, DL, Fn, BBEntry, StartBB, + EndBB, AfterBB, FiniCB, ParRegionInfo); + } else if (Dir == OMPD_single) { + CGIOMP.emitOMPSingle(Fn, BBEntry, AfterBB, BodyGenCB, FiniCB); + } else if (Dir == OMPD_critical) { + CGIOMP.emitOMPCritical(Fn, BBEntry, AfterBB, BodyGenCB, FiniCB); + } else if (Dir == OMPD_barrier) { + CGIOMP.emitOMPBarrier(Fn, BBEntry, OMPD_barrier); + } else if (Dir == OMPD_for) { + CGIOMP.emitOMPFor(DSAValueMap, OMPLoopInfo, StartBB, BBExit, + /* IsStandalone */ true, false); + } else if (Dir == OMPD_parallel_for) { + CGIOMP.emitOMPFor(DSAValueMap, OMPLoopInfo, StartBB, BBExit, + /* IsStandalone */ false, false); + CGIOMP.emitOMPParallel(DSAValueMap, nullptr, DL, Fn, BBEntry, StartBB, + EndBB, AfterBB, FiniCB, ParRegionInfo); + } else if (Dir == OMPD_task) { + CGIOMP.emitOMPTask(DSAValueMap, Fn, BBEntry, StartBB, EndBB, AfterBB); + } else if (Dir == OMPD_taskwait) { + CGIOMP.emitOMPTaskwait(BBEntry); + } else if (Dir == OMPD_target) { + TargetInfo.ExecMode = OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC; + CGIOMP.emitOMPTarget(Fn, BBEntry, StartBB, EndBB, DSAValueMap, + StructMappingInfoMap, TargetInfo, + /* OMPLoopInfo */ nullptr, IsDeviceTargetRegion); + } else if (Dir == OMPD_teams) { + CGIOMP.emitOMPTeams(DSAValueMap, nullptr, DL, Fn, BBEntry, StartBB, + EndBB, AfterBB, TeamsInfo); + } else if (Dir == OMPD_distribute) { + CGIOMP.emitOMPDistribute(DSAValueMap, OMPLoopInfo, StartBB, BBExit, + /* IsStandalone */ true, false); + } else if (Dir == OMPD_teams_distribute) { + CGIOMP.emitOMPDistribute(DSAValueMap, OMPLoopInfo, StartBB, BBExit, + /* IsStandalone */ false, false); + CGIOMP.emitOMPTeams(DSAValueMap, nullptr, DL, Fn, BBEntry, StartBB, + EndBB, AfterBB, TeamsInfo); + } else if (Dir == OMPD_teams_distribute_parallel_for) { + 
          CGIOMP.emitOMPDistributeParallelFor(DSAValueMap, StartBB, BBExit,
+                                              OMPLoopInfo, ParRegionInfo,
+                                              /* IsStandalone */ false);
+          CGIOMP.emitOMPTeams(DSAValueMap, nullptr, DL, Fn, BBEntry, StartBB,
+                              EndBB, AfterBB, TeamsInfo);
+        } else if (Dir == OMPD_target_teams) {
+          TargetInfo.ExecMode = OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
+          CGIOMP.emitOMPTargetTeams(DSAValueMap, nullptr, DL, Fn, BBEntry,
+                                    StartBB, EndBB, AfterBB, TargetInfo,
+                                    /* OMPLoopInfo */ nullptr,
+                                    StructMappingInfoMap, IsDeviceTargetRegion);
+        } else if (Dir == OMPD_target_data) {
+          if (IsDeviceTargetRegion)
+            FATAL_ERROR("Target data should never appear inside a "
+                        "device target region");
+          CGIOMP.emitOMPTargetData(Fn, BBEntry, BBExit, DSAValueMap,
+                                   StructMappingInfoMap);
+        } else if (Dir == OMPD_target_enter_data) {
+          if (IsDeviceTargetRegion)
+            FATAL_ERROR("Target enter data should never appear inside a "
+                        "device target region");
+
+          CGIOMP.emitOMPTargetEnterData(Fn, BBEntry, DSAValueMap,
+                                        StructMappingInfoMap);
+        } else if (Dir == OMPD_target_exit_data) {
+          if (IsDeviceTargetRegion)
+            FATAL_ERROR("Target exit data should never appear inside a "
+                        "device target region");
+
+          CGIOMP.emitOMPTargetExitData(Fn, BBEntry, DSAValueMap,
+                                       StructMappingInfoMap);
+        } else if (Dir == OMPD_target_update) {
+          if (IsDeviceTargetRegion)
+            FATAL_ERROR("Target update should never appear inside a "
+                        "device target region");
+
+          CGIOMP.emitOMPTargetUpdate(Fn, BBEntry, DSAValueMap,
+                                     StructMappingInfoMap);
+        } else if (Dir == OMPD_target_teams_distribute) {
+          TargetInfo.ExecMode = OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
+          CGIOMP.emitOMPDistribute(DSAValueMap, OMPLoopInfo, StartBB, BBExit,
+                                   /* IsStandalone */ false, false);
+          CGIOMP.emitOMPTargetTeams(DSAValueMap, nullptr, DL, Fn, BBEntry,
+                                    StartBB, EndBB, AfterBB, TargetInfo,
+                                    &OMPLoopInfo, StructMappingInfoMap,
+                                    IsDeviceTargetRegion);
+        } else if (Dir == OMPD_distribute_parallel_for) {
+          CGIOMP.emitOMPDistributeParallelFor(DSAValueMap, StartBB, BBExit,
+                                              OMPLoopInfo, ParRegionInfo,
+                                              /* IsStandalone */ false);
+        } else if (Dir == OMPD_target_teams_distribute_parallel_for) {
+          TargetInfo.ExecMode = OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD;
+          CGIOMP.emitOMPTargetTeamsDistributeParallelFor(
+              DSAValueMap, DL, Fn, BBEntry, StartBB, EndBB, BBExit, AfterBB,
+              OMPLoopInfo, ParRegionInfo, TargetInfo, StructMappingInfoMap,
+              IsDeviceTargetRegion);
+        } else {
+          FATAL_ERROR("Unknown directive");
+        }
+
+        if (verifyFunction(*Fn, &errs()))
+          FATAL_ERROR("Verification of IntrinsicsOpenMP lowering failed!");
+      }
+    }
+
+    DEBUG_ENABLE(dbgs() << "=== Dump Lowered Module\n"
+                        << M << "=== End of Dump Lowered Module\n");
+
+    DEBUG_ENABLE(dbgs() << "=== End of IntrinsicsOpenMP pass\n");
+
+    return true;
+  }
+
+};
+} // namespace
+
+// Legacy PM registration.
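+// The pass is registered with both pass managers. The legacy registration
+// below backs the C API entry point LLVMAddIntrinsicsOpenMPPass() declared in
+// IntrinsicsOpenMP_CAPI.h; the new-PM plugin further down can also be driven
+// standalone. As a usage sketch (exact flags depend on the local LLVM
+// install):
+//   opt -load-pass-plugin=./libIntrinsicsOpenMP.so \
+//       -passes=intrinsics-openmp input.ll -S -o lowered.ll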
+struct LegacyIntrinsicsOpenmMPPass : public ModulePass { + static char ID; // Pass identification, replacement for typeid + LegacyIntrinsicsOpenmMPPass() : ModulePass(ID) {} + + bool runOnModule(Module &M) override { + IntrinsicsOpenMP IOMP; + return IOMP.runOnModule(M); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + ModulePass::getAnalysisUsage(AU); + } +}; + +char LegacyIntrinsicsOpenmMPPass::ID = 0; +static RegisterPass X("intrinsics-openmp", + "Legacy IntrinsicsOpenMP Pass"); + +ModulePass *llvm::createIntrinsicsOpenMPPass() { + return new LegacyIntrinsicsOpenmMPPass(); +} + +void LLVMAddIntrinsicsOpenMPPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createIntrinsicsOpenMPPass()); +} + +// New PM registration. + +class IntrinsicsOpenMPPass : public PassInfoMixin { +public: + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM) { + IntrinsicsOpenMP IOMP; + bool Changed = IOMP.runOnModule(M); + + if (Changed) + return PreservedAnalyses::none(); + + return PreservedAnalyses::all(); + + } + + // Run always to lower OpenMP intrinsics. + static bool isRequired() { return true; } +}; + +llvm::PassPluginLibraryInfo getIntrinsicsOpenMPPluginInfo() { + return {LLVM_PLUGIN_API_VERSION, "IntrinsicsOpenMP", LLVM_VERSION_STRING, + [](PassBuilder &PB) { + PB.registerPipelineParsingCallback( + [](StringRef Name, ModulePassManager &MPM, + ArrayRef) { + if (Name == "intrinsics-openmp") { + MPM.addPass(IntrinsicsOpenMPPass()); + return true; + } + return false; + }); + }}; +} + +extern "C" LLVM_ATTRIBUTE_WEAK ::llvm::PassPluginLibraryInfo +llvmGetPassPluginInfo() { + return getIntrinsicsOpenMPPluginInfo(); +} diff --git a/numba/openmp/pass/IntrinsicsOpenMP.h b/numba/openmp/pass/IntrinsicsOpenMP.h new file mode 100644 index 000000000000..3d44f3f92312 --- /dev/null +++ b/numba/openmp/pass/IntrinsicsOpenMP.h @@ -0,0 +1,14 @@ +#ifndef LLVM_TRANSFORMS_INTRINSICS_OPENMP_H +#define LLVM_TRANSFORMS_INTRINSICS_OPENMP_H + +#include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" + +namespace llvm { + + +ModulePass *createIntrinsicsOpenMPPass(); + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_INTRINSICS_OPENMP_H \ No newline at end of file diff --git a/numba/openmp/pass/IntrinsicsOpenMP_CAPI.h b/numba/openmp/pass/IntrinsicsOpenMP_CAPI.h new file mode 100644 index 000000000000..b0d0b67bca81 --- /dev/null +++ b/numba/openmp/pass/IntrinsicsOpenMP_CAPI.h @@ -0,0 +1,23 @@ +#ifndef LLVM_C_TRANSFORMS_INTRINSICS_OPENMP_H +#define LLVM_C_TRANSFORMS_INTRINSICS_OPENMP_H + +#include "llvm-c/ExternC.h" +#include "llvm-c/Types.h" + +LLVM_C_EXTERN_C_BEGIN + +/** + * @defgroup LLVMCTransformsIntrinsicsOpenMP IntrinsicsOpenMP transformations + * @ingroup LLVMCTransforms + * + * @{ + */ + +/** See llvm::createIntrinsicsOpenMPPass function. 
*/ +void LLVMAddIntrinsicsOpenMPPass(LLVMPassManagerRef PM); + +/** + * @} + */ +LLVM_C_EXTERN_C_END +#endif \ No newline at end of file diff --git a/numba/openmp/tests/test_openmp.py b/numba/openmp/tests/test_openmp.py new file mode 100644 index 000000000000..17aba2cb8b02 --- /dev/null +++ b/numba/openmp/tests/test_openmp.py @@ -0,0 +1,4959 @@ +import contextlib +import math +import time +import dis +import numbers +import os +import platform +import sys +import subprocess +import warnings +from functools import reduce +import numpy as np +from numpy.random import randn +import operator +from collections import defaultdict, namedtuple +import copy +from itertools import cycle, chain +import subprocess as subp + +from numba import typeof +from numba.core import ( + types, + utils, + typing, + errors, + ir, + rewrites, + typed_passes, + inline_closurecall, + config, + compiler, + cpu, +) +from numba.extending import ( + overload_method, + register_model, + typeof_impl, + unbox, + NativeValue, + models, +) +from numba.core.registry import cpu_target +from numba.core.annotations import type_annotations +from numba.core.ir_utils import ( + find_callname, + guard, + build_definitions, + get_definition, + is_getitem, + is_setitem, + index_var_of_get_setitem, +) +from numba.np.unsafe.ndarray import empty_inferred as unsafe_empty +from numba.core.bytecode import ByteCodeIter +from numba.core.compiler import ( + compile_isolated, + Flags, + CompilerBase, + DefaultPassBuilder, +) +from numba.core.compiler_machinery import register_pass, AnalysisPass +from numba.core.typed_passes import IRLegalization +from numba.tests.support import ( + TestCase, + captured_stdout, + MemoryLeakMixin, + override_env_config, + linux_only, + tag, + _32bit, + needs_blas, + needs_lapack, + disabled_test, + skip_unless_scipy, + needs_subprocess, +) +import numba.openmp as openmp +from numba.openmp import njit +from numba.openmp import openmp_context as omp +from numba.openmp import ( + omp_set_num_threads, + omp_get_thread_num, + omp_get_num_threads, + omp_get_wtime, + omp_set_nested, + omp_set_max_active_levels, + omp_set_dynamic, + omp_get_max_active_levels, + omp_get_max_threads, + omp_get_num_procs, + UnspecifiedVarInDefaultNone, + NonconstantOpenmpSpecification, + NonStringOpenmpSpecification, + omp_get_thread_limit, + ParallelForExtraCode, + ParallelForWrongLoopCount, + omp_in_parallel, + omp_get_level, + omp_get_active_level, + omp_get_team_size, + omp_get_ancestor_thread_num, + omp_get_team_num, + omp_get_num_teams, + omp_in_final, + omp_shared_array, +) +import cmath +import unittest + +# NOTE: Each OpenMP test class is run in separate subprocess, this is to reduce +# memory pressure in CI settings. The environment variable "SUBPROC_TEST" is +# used to determine whether a test is skipped or not, such that if you want to +# run any OpenMP test directly this environment variable can be set. The +# subprocesses running the test classes set this environment variable as the new +# process starts which enables the tests within the process. The decorator +# @needs_subprocess is used to ensure the appropriate test skips are made. + +# +# class TestOpenmpRunner(TestCase): +# _numba_parallel_test_ = False +# +# # Each test class can run for 30 minutes before time out. +# _TIMEOUT = 1800 +# +# """This is the test runner for all the OpenMP tests, it runs them in +# subprocesses as described above. The convention for the test method naming +# is: `test_` where is the name of the test class in +# this module. 
+# """ +# def runner(self): +# themod = self.__module__ +# test_clazz_name = self.id().split('.')[-1].split('_')[-1] +# # don't specify a given test, it's an entire class that needs running +# self.subprocess_test_runner(test_module=themod, +# test_class=test_clazz_name, +# timeout=self._TIMEOUT) +# +# """ +# def test_TestOpenmpBasic(self): +# self.runner() +# """ +# +# def test_TestOpenmpRoutinesEnvVariables(self): +# self.runner() +# +# def test_TestOpenmpParallelForResults(self): +# self.runner() +# +# def test_TestOpenmpWorksharingSchedule(self): +# self.runner() +# +# def test_TestOpenmpParallelClauses(self): +# self.runner() +# +# def test_TestOpenmpDataClauses(self): +# self.runner() +# +# def test_TestOpenmpConstraints(self): +# self.runner() +# +# def test_TestOpenmpConcurrency(self): +# self.runner() +# +# def test_TestOpenmpTask(self): +# self.runner() +# +# def test_TestOpenmpTaskloop(self): +# self.runner() +# +# def test_TestOpenmpTarget(self): +# self.runner() +# +# def test_TestOpenmpPi(self): +# self.runner() + + +x86_only = unittest.skipIf( + platform.machine() not in ("i386", "x86_64"), "x86 only test" +) + + +def null_comparer(a, b): + """ + Used with check_arq_equality to indicate that we do not care + whether the value of the parameter at the end of the function + has a particular value. + """ + pass + + +@contextlib.contextmanager +def override_config(name, value): + """ + Return a context manager that temporarily sets an openmp config variable + *name* to *value*. *name* must be the name of an existing variable + in openmp. + """ + old_value = getattr(openmp, name) + setattr(openmp, name, value) + try: + yield + finally: + setattr(openmp, name, old_value) + + +# @needs_subprocess +class TestOpenmpBase(TestCase): + """ + Base class for testing OpenMP. + Provides functions for compilation and three way comparison between + python functions, njit'd functions and njit'd functions with + OpenMP disabled. + + To set a default value or state for all the tests in a class, set + a variable *var* inside the class where *var* is: + + - MAX_THREADS - Thread team size for parallel regions. + - MAX_ACTIVE_LEVELS - Number of nested parallel regions capable of + running in parallel. 
+ """ + + _numba_parallel_test_ = False + + skip_disabled = int(os.environ.get("OVERRIDE_TEST_SKIP", 0)) != 0 + run_target = int(os.environ.get("RUN_TARGET", 0)) != 0 + test_devices = os.environ.get("TEST_DEVICES", "") + + env_vars = { + "OMP_NUM_THREADS": omp_get_num_procs(), + "OMP_MAX_ACTIVE_LEVELS": 1, + "OMP_DYNAMIC": True, + } + + def __init__(self, *args): + # flags for njit() + self.cflags = Flags() + self.cflags.enable_ssa = False + self.cflags.nrt = True + + super(TestOpenmpBase, self).__init__(*args) + + def setUp(self): + omp_set_num_threads( + getattr(self, "MAX_THREADS", TestOpenmpBase.env_vars.get("OMP_NUM_THREADS")) + ) + omp_set_max_active_levels( + getattr( + self, + "MAX_ACTIVE_LEVELS", + TestOpenmpBase.env_vars.get("OMP_MAX_ACTIVE_LEVELS"), + ) + ) + self.beforeThreads = omp_get_max_threads() + self.beforeLevels = omp_get_max_active_levels() + + def tearDown(self): + omp_set_num_threads(self.beforeThreads) + omp_set_max_active_levels(self.beforeLevels) + + def _compile_this(self, func, sig, flags): + return compile_isolated(func, sig, flags=flags) + + def compile_njit_openmp_disabled(self, func, sig): + with override_config("OPENMP_DISABLED", True): + return self._compile_this(func, sig, flags=self.cflags) + + def compile_njit(self, func, sig): + return self._compile_this(func, sig, flags=self.cflags) + + def compile_all(self, pyfunc, *args, **kwargs): + sig = tuple([typeof(x) for x in args]) + + # compile the OpenMP-disabled njit function + cdfunc = self.compile_njit_openmp_disabled(pyfunc, sig) + + # compile a standard njit of the original function + cfunc = self.compile_njit(pyfunc, sig) + + return cfunc, cdfunc + + def assert_outputs_equal(self, *outputs): + assert len(outputs) > 1 + + for op_num in range(len(outputs) - 1): + op1, op2 = outputs[op_num], outputs[op_num + 1] + if isinstance(op1, (bool, np.bool_)): + assert isinstance(op2, (bool, np.bool_)) + elif not isinstance(op1, numbers.Number) or not isinstance( + op2, numbers.Number + ): + self.assertEqual(type(op1), type(op2)) + + if isinstance(op1, np.ndarray): + np.testing.assert_almost_equal(op1, op2) + elif isinstance(op1, (tuple, list)): + assert len(op1) == len(op2) + for i in range(len(op1)): + self.assert_outputs_equal(op1[i], op2[i]) + elif isinstance(op1, (bool, np.bool_, str, type(None))): + assert op1 == op2 + elif isinstance(op1, numbers.Number): + np.testing.assert_approx_equal(op1, op2) + else: + raise ValueError("Unsupported output type encountered") + + def check_openmp_vs_others(self, pyfunc, cfunc, cdfunc, *args, **kwargs): + """ + Checks python, njit and njit without OpenMP impls produce the same result. + + Arguments: + pyfunc - the python function to test + cfunc - CompilerResult from njit of pyfunc + cdfunc - CompilerResult from OpenMP-disabled njit of pyfunc + args - arguments for the function being tested + Keyword Arguments: + scheduler_type - 'signed', 'unsigned' or None, default is None. + Supply in cases where the presence of a specific + scheduler is to be asserted. + fastmath_pcres - a fastmath parallel compile result, if supplied + will be run to make sure the result is correct + check_arg_equality - some functions need to check that a + parameter is modified rather than a certain + value returned. 
If this keyword argument + is supplied, it should be a list of + comparison functions such that the i'th + function in the list is used to compare the + i'th parameter of the njit and OpenMP-disabled + functions against the i'th parameter of the + standard Python function, asserting if they + differ. The length of this list must be equal + to the number of parameters to the function. + The null comparator is available for use + when you do not desire to test if some + particular parameter is changed. + Remaining kwargs are passed to np.testing.assert_almost_equal + """ + check_args_for_equality = kwargs.pop("check_arg_equality", None) + + def copy_args(*args): + if not args: + return tuple() + new_args = [] + for x in args: + if isinstance(x, np.ndarray): + new_args.append(x.copy("k")) + elif isinstance(x, np.number): + new_args.append(x.copy()) + elif isinstance(x, numbers.Number): + new_args.append(x) + elif isinstance(x, tuple): + new_args.append(copy.deepcopy(x)) + elif isinstance(x, list): + new_args.append(x[:]) + elif isinstance(x, str): + new_args.append(x) + else: + raise ValueError("Unsupported argument type encountered") + return tuple(new_args) + + # python result + py_args = copy_args(*args) + py_expected = pyfunc(*py_args) + + # njit result + njit_args = copy_args(*args) + njit_output = cfunc.entry_point(*njit_args) + + # OpenMP-disabled result + openmp_disabled_args = copy_args(*args) + openmp_disabled_output = cdfunc.entry_point(*openmp_disabled_args) + + if check_args_for_equality is None: + self.assert_outputs_equal(py_expected, njit_output, openmp_disabled_output) + else: + assert len(py_args) == len(check_args_for_equality) + for pyarg, njitarg, noomparg, argcomp in zip( + py_args, njit_args, openmp_disabled_args, check_args_for_equality + ): + argcomp(njitarg, pyarg, **kwargs) + argcomp(noomparg, pyarg, **kwargs) + + # TODO: remove this check function and check_openmp_vs_others and check + # directly expected results. + def check(self, pyfunc, *args, **kwargs): + """Checks that pyfunc compiles for *args under njit OpenMP-disabled and + njit and asserts that all version execute and produce the same result + """ + cfunc, cdfunc = self.compile_all(pyfunc, *args) + self.check_openmp_vs_others(pyfunc, cfunc, cdfunc, *args, **kwargs) + + def check_variants(self, impl, arg_gen, **kwargs): + """Run self.check(impl, ...) on array data generated from arg_gen.""" + for args in arg_gen(): + with self.subTest(list(map(typeof, args))): + self.check(impl, *args, **kwargs) + + +class TestPipeline(object): + def __init__(self, typingctx, targetctx, args, test_ir): + self.state = compiler.StateDict() + self.state.typingctx = typingctx + self.state.targetctx = targetctx + self.state.args = args + self.state.func_ir = test_ir + self.state.typemap = None + self.state.return_type = None + self.state.calltypes = None + self.state.metadata = {} + + +# +# class TestOpenmpBasic(TestOpenmpBase): +# """OpenMP smoke tests. 
These tests check the most basic +# functionality""" +# +# def __init__(self, *args): +# TestOpenmpBase.__init__(self, *args) + + +class TestOpenmpRoutinesEnvVariables(TestOpenmpBase): + MAX_THREADS = 5 + + def __init__(self, *args): + TestOpenmpBase.__init__(self, *args) + + """ + def test_func_get_wtime(self): + @njit + def test_impl(t): + start = omp_get_wtime() + time.sleep(t) + return omp_get_wtime() - start + t = 0.5 + np.testing.assert_approx_equal(test_impl(t), t, signifcant=2) + """ + + def test_func_get_max_threads(self): + @njit + def test_impl(): + omp_set_dynamic(0) + o_nt = omp_get_max_threads() + count = 0 + with omp("parallel"): + i_nt = omp_get_max_threads() + with omp("critical"): + count += 1 + return count, i_nt, o_nt + + nt = self.MAX_THREADS + with override_env_config("OMP_NUM_THREADS", str(nt)): + r = test_impl() + assert r[0] == r[1] == r[2] == nt + + def test_func_get_num_threads(self): + @njit + def test_impl(): + omp_set_dynamic(0) + o_nt = omp_get_num_threads() + count = 0 + with omp("parallel"): + i_nt = omp_get_num_threads() + with omp("critical"): + count += 1 + return (count, i_nt), o_nt + + nt = self.MAX_THREADS + with override_env_config("OMP_NUM_THREADS", str(nt)): + r = test_impl() + assert r[0][0] == r[0][1] == nt + assert r[1] == 1 + + def test_func_set_num_threads(self): + @njit + def test_impl(n1, n2): + omp_set_dynamic(0) + omp_set_num_threads(n1) + count1 = 0 + count2 = 0 + with omp("parallel"): + with omp("critical"): + count1 += 1 + omp_set_num_threads(n2) + with omp("parallel"): + with omp("critical"): + count2 += 1 + return count1, count2 + + nt = 32 + with override_env_config("OMP_NUM_THREADS", str(4)): + r = test_impl(nt, 20) + assert r[0] == r[1] == nt + + def test_func_set_max_active_levels(self): + @njit + def test_impl(n1, n2, n3): + omp_set_dynamic(0) + omp_set_max_active_levels(2) + omp_set_num_threads(n2) + count1, count2, count3 = 0, 0, 0 + with omp("parallel num_threads(n1)"): + with omp("single"): + with omp("parallel"): + with omp("single"): + omp_set_num_threads(n3) + with omp("parallel"): + with omp("critical"): + count3 += 1 + with omp("critical"): + count2 += 1 + with omp("critical"): + count1 += 1 + return count1, count2, count3 + + n1, n2 = 3, 4 + r = test_impl(n1, n2, 5) + assert r[0] == n1 + assert r[1] == n2 + assert r[2] == 1 + + def test_func_get_ancestor_thread_num(self): + @njit + def test_impl(): + oa = omp_get_ancestor_thread_num(0) + with omp("parallel"): + with omp("single"): + m1 = omp_get_ancestor_thread_num(0) + f1 = omp_get_ancestor_thread_num(1) + s1 = omp_get_ancestor_thread_num(2) + tn1 = omp_get_thread_num() + with omp("parallel"): + m2 = omp_get_ancestor_thread_num(0) + f2 = omp_get_ancestor_thread_num(1) + s2 = omp_get_ancestor_thread_num(2) + tn2 = omp_get_thread_num() + return oa, (m1, f1, s1, tn1), (m2, f2, s2, tn2) + + oa, r1, r2 = test_impl() + assert oa == r1[0] == r2[0] == 0 + assert r1[1] == r1[3] == r2[1] + assert r1[2] == -1 + assert r2[2] == r2[3] + + def test_func_get_team_size(self): + @njit + def test_impl(n1, n2): + omp_set_max_active_levels(2) + oa = omp_get_team_size(0) + with omp("parallel num_threads(n1)"): + with omp("single"): + m1 = omp_get_team_size(0) + f1 = omp_get_team_size(1) + s1 = omp_get_team_size(2) + nt1 = omp_get_num_threads() + with omp("parallel num_threads(n2)"): + with omp("single"): + m2 = omp_get_team_size(0) + f2 = omp_get_team_size(1) + s2 = omp_get_team_size(2) + nt2 = omp_get_num_threads() + return oa, (m1, f1, s1, nt1), (m2, f2, s2, nt2) + + n1, n2 = 6, 8 + oa, 
r1, r2 = test_impl(n1, n2) + assert oa == r1[0] == r2[0] == 1 + assert r1[1] == r1[3] == r2[1] == n1 + assert r1[2] == -1 + assert r2[2] == r2[3] == n2 + + def test_func_get_level(self): + @njit + def test_impl(): + oa = omp_get_level() + with omp("parallel if(0)"): + f = omp_get_level() + with omp("parallel num_threads(1)"): + s = omp_get_level() + with omp("parallel"): + t = omp_get_level() + return oa, f, s, t + + for i, l in enumerate(test_impl()): + assert i == l + + def test_func_get_active_level(self): + @njit + def test_impl(): + oa = omp_get_active_level() + with omp("parallel if(0)"): + f = omp_get_active_level() + with omp("parallel num_threads(1)"): + s = omp_get_active_level() + with omp("parallel"): + t = omp_get_active_level() + return oa, f, s, t + + r = test_impl() + for i in range(3): + assert r[i] == 0 + assert r[3] == 1 + + def test_func_in_parallel(self): + @njit + def test_impl(): + omp_set_dynamic(0) + omp_set_max_active_levels(1) # 1 because first region is inactive + oa = omp_in_parallel() + with omp("parallel num_threads(1)"): + ia = omp_in_parallel() + with omp("parallel"): + n1a = omp_in_parallel() + with omp("single"): + with omp("parallel"): + n2a = omp_in_parallel() + with omp("parallel if(0)"): + ua = omp_in_parallel() + return oa, ia, n1a, n2a, ua + + r = test_impl() + assert r[0] == False + assert r[1] == False + assert r[2] == True + assert r[3] == True + assert r[4] == False + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_func_in_final(self): + @njit + def test_impl(N, c): + a = np.arange(N)[::-1] + fa = np.zeros(N) + fia = np.zeros(N) + with omp("parallel"): + with omp("single"): + for i in range(len(a)): + e = a[i] + with omp("task final(e >= c)"): + fa[i] = omp_in_final() + with omp("task"): + fia[i] = omp_in_final() + return fa, fia + + N, c = 25, 10 + r = test_impl(N, c) + np.testing.assert_array_equal(r[0], np.concatenate(np.ones(N - c), np.zeros(c))) + np.testing.assert_array_equal(r[0], r[1]) + + +class TestOpenmpParallelForResults(TestOpenmpBase): + def __init__(self, *args): + TestOpenmpBase.__init__(self, *args) + + def test_parallel_for_set_elements(self): + def test_impl(v): + with omp("parallel for"): + for i in range(len(v)): + v[i] = 1.0 + return v + + self.check(test_impl, np.zeros(100)) + + def test_separate_parallel_for_set_elements(self): + def test_impl(v): + with omp("parallel"): + with omp("for"): + for i in range(len(v)): + v[i] = 1.0 + return v + + self.check(test_impl, np.zeros(100)) + + def test_parallel_for_const_var_omp_statement(self): + def test_impl(v): + ovar = "parallel for" + with omp(ovar): + for i in range(len(v)): + v[i] = 1.0 + return v + + self.check(test_impl, np.zeros(100)) + + def test_parallel_for_string_conditional(self): + def test_impl(S): + capitalLetters = 0 + with omp("parallel for reduction(+:capitalLetters)"): + for i in range(len(S)): + if S[i].isupper(): + capitalLetters += 1 + return capitalLetters + + self.check(test_impl, "OpenMPstrTEST") + + def test_parallel_for_tuple(self): + def test_impl(t): + len_total = 0 + with omp("parallel for reduction(+:len_total)"): + for i in range(len(t)): + len_total += len(t[i]) + return len_total + + self.check(test_impl, ("32", "4", "test", "567", "re", "")) + + def test_parallel_for_range_step_2(self): + def test_impl(N): + a = np.zeros(N, dtype=np.int32) + with omp("parallel for"): + for i in range(0, len(a), 2): + a[i] = i + 1 + + return a + + self.check(test_impl, 12) + + def test_parallel_for_range_step_arg(self): + def 
test_impl(N, step): + a = np.zeros(N, dtype=np.int32) + with omp("parallel for"): + for i in range(0, len(a), step): + a[i] = i + 1 + + return a + + self.check(test_impl, 12, 2) + + def test_parallel_for_incremented_step(self): + @njit + def test_impl(v, n): + for i in range(n): + with omp("parallel for"): + for j in range(0, len(v), i + 1): + v[j] = i + 1 + return v + + self.check(test_impl, np.zeros(100), 3) + + def test_parallel_for_range_backward_step(self): + def test_impl(N): + a = np.zeros(N, dtype=np.int32) + with omp("parallel for"): + for i in range(N - 1, -1, -1): + a[i] = i + 1 + + return a + + self.check(test_impl, 12) + + """ + def test_parallel_for_dictionary(self): + def test_impl(N, c): + l = {} + with omp("parallel for"): + for i in range(N): + l[i] = i % c + return l + self.check(test_impl, 32, 5) + """ + + def test_parallel_for_num_threads(self): + def test_impl(nt): + a = np.zeros(nt) + with omp("parallel num_threads(nt)"): + with omp("for"): + for i in range(nt): + a[i] = i + return a + + self.check(test_impl, 15) + + def test_parallel_for_only_inside_var(self): + @njit + def test_impl(nt): + a = np.zeros(nt) + with omp("parallel num_threads(nt) private(x)"): + with omp("for private(x)"): + for i in range(nt): + x = 0 + # print("out:", i, x, i + x, nt) + a[i] = i + x + return a + + nt = 12 + np.testing.assert_array_equal(test_impl(nt), np.arange(nt)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_parallel_for_ordered(self): + @njit + def test_impl(N, c): + a = np.zeros(N) + b = np.zeros(N) + with omp("parallel for ordered"): + for i in range(1, N): + b[i] = b[i - 1] + c + with omp("ordered"): + a[i] = a[i - 1] + c + return a + + N, c = 30, 4 + r = test_impl(N, c) + rc = np.arange(0, N * c, c) + np.testing.assert_array_equal(r[0], rc) + assert not np.array_equal(r[1], rc) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_parallel_for_collapse(self): + @njit + def test_impl(n1, n2, n3): + ia = np.zeros(n1) + ja = np.zeros((n1, n2)) + ka = np.zeros((n1, n2, n3)) + with omp("parallel for collapse(2)"): + for i in range(n1): + ia[i] = omp_get_thread_num() + for j in range(n2): + ja[i][j] = omp_get_thread_num() + for k in range(n3): + ka[i][j][k] = omp_get_thread_num() + return ia, ja, ka + + ia, ja, ka = test_impl(5, 3, 2) + print(ia) + print(ja) + for a1i in range(len(ja)): + with self.assertRaises(AssertionError) as raises: + np.testing.assert_equal(ia[a1i], ja[a1i]) # Scalar to array + for a1i in range(len(ka)): + for a2i in range(a1i): + # Scalar to array + np.testing.assert_equal(ja[a1i][a2i], ka[a1i][a2i]) + + +class TestOpenmpWorksharingSchedule(TestOpenmpBase): + def __init__(self, *args): + TestOpenmpBase.__init__(self, *args) + + """ + def test_static_work_calculation(self): + def test_impl(N, nt): + v = np.zeros(N) + step = -2 + omp_set_num_threads(nt) + with omp("parallel private(thread_num)"): + running_omp = omp_in_parallel() + thread_num = omp_get_thread_num() + if not running_omp: + iters = N // abs(step) + itersPerThread = iters // nt + finishToThread = {} + for t in range(N): + f = itersPerThread*(t+1)-1 + min(iters%itersPerThread, t+1) + finishToThread[f] = t + with omp("for schedule(static)"): + for index, i in enumerate(range(N-1, N%2 - 1, -2)): + if not running_omp: + for finish in finishToThread.keys(): + if index <= finish: + thread_num = finishToThread[finish] + if i % (thread_num+1) == 0: + v[i] = i/(thread_num+1) + print(v) + return v + self.check(test_impl, 100, 8) + """ + + # 
Giorgis pass doesn't support static with chunksize yet? + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Abort - unimplemented") + def test_avg_sched_const(self): + def test_impl(n, a): + b = np.zeros(n) + nt = 5 + with omp("parallel for num_threads(nt) schedule(static, 4)"): + for i in range(1, n): + b[i] = (a[i] + a[i - 1]) / 2.0 + + return b + + self.check(test_impl, 10, np.ones(10)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Abort - unimplemented") + def test_avg_sched_var(self): + def test_impl(n, a): + b = np.zeros(n) + nt = 5 + ss = 4 + with omp("parallel for num_threads(nt) schedule(static, ss)"): + for i in range(1, n): + b[i] = (a[i] + a[i - 1]) / 2.0 + + return b + + self.check(test_impl, 10, np.ones(10)) + + def test_static_distribution(self): + @njit + def test_impl(nt, c): + a = np.empty(nt * c) + with omp("parallel for num_threads(nt) schedule(static)"): + for i in range(nt * c): + a[i] = omp_get_thread_num() + return a + + nt, c = 8, 3 + r = test_impl(nt, c) + for tn in range(nt): + indices = np.sort(np.where(r == tn)[0]) + si = indices[0] + np.testing.assert_array_equal(indices, np.arange(si, si + c)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_static_chunk_distribution(self): + @njit + def test_impl(nt, c, cs): + a = np.empty(nt * c) + with omp("parallel for num_threads(nt) schedule(static, cs)"): + for i in range(nt * c): + a[i] = omp_get_thread_num() + return a + + nt, c, cs = 8, 6, 3 + r = test_impl(nt, c, cs) + for tn in range(nt): + indices = np.sort(np.where(r == tn)[0]) + for i in range(c // cs): + si = indices[i * cs] + np.testing.assert_array_equal( + indices, np.arange(si, min(len(r), si + cs)) + ) + + def test_static_consistency(self): + @njit + def test_impl(nt, c, cs): + a = np.empty(nt * c) + b = np.empty(nt * c) + with omp("parallel num_threads(8)"): + with omp("for schedule(static)"): + for i in range(nt * c): + a[i] = omp_get_thread_num() + with omp("for schedule(static)"): + for i in range(nt * c): + b[i] = omp_get_thread_num() + return a, b + + r = test_impl(8, 7, 5) + np.testing.assert_array_equal(r[0], r[1]) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_dynamic_distribution(self): + @njit + def test_impl(nt, c, cs): + a = np.empty(nt * c) + with omp("parallel for num_threads(nt) schedule(dynamic)"): + for i in range(nt * c): + a[i] = omp_get_thread_num() + return a + + nt, c, cs = 10, 2, 1 + r = test_impl(nt, c, cs) + a = np.zeros(nt) + for tn in range(nt): + indices = np.sort(np.where(r == tn)[0]) + if len(indices > 0): + for i in range(c // cs): + si = indices[i * cs] + np.testing.assert_array_equal( + indices, np.arange(si, min(len(r), si + cs)) + ) + else: + a[tn] = 1 + assert np.any(a) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_guided_distribution(self): + @njit + def test_impl(nt, c, cs): + a = np.empty(nt * c) + with omp("parallel for num_threads(nt) schedule(guided, cs)"): + for i in range(nt * c): + a[i] = omp_get_thread_num() + return a + + nt, c, cs = 8, 6, 3 + r = test_impl(nt, c, cs) + chunksizes = [] + cur_tn = r[0] + cur_chunk = 0 + for e in r: + if e == cur_tn: + cur_chunk += 1 + else: + chunksizes.append(cur_chunk) + cur_chunk = 1 + chunksizes.append(cur_chunk) + ca = np.array(chunksizes) + np.testing.assert_array_equal(ca, np.sort(ca)[::-1]) + assert ca[-2] >= cs + + +class TestOpenmpParallelClauses(TestOpenmpBase): + def __init__(self, *args): + TestOpenmpBase.__init__(self, *args) + + def 
test_num_threads_clause(self): + @njit + def test_impl(N, c1, c2): + omp_set_dynamic(0) + omp_set_max_active_levels(2) + omp_set_num_threads(N + c1) + d_count = 0 + n_count = 0 + nc_count = 0 + a_count = 0 + with omp("parallel num_threads(N) shared(c2)"): + with omp("critical"): + d_count += 1 + with omp("parallel"): + with omp("critical"): + n_count += 1 + with omp("single"): + with omp("parallel num_threads(6)"): + with omp("critical"): + nc_count += 1 + with omp("parallel"): + with omp("critical"): + a_count += 1 + return d_count, a_count, n_count, nc_count + + a, b, c = 13, 3, 6 + r = test_impl(a, b, c) + assert r[0] == a + assert r[1] == a + b + assert r[2] == a * (a + b) + assert r[3] == c + + def test_if_clause(self): + @njit + def test_impl(s): + rp = 2 # Should also work with anything non-zero + drp = 0 + ar = np.zeros(s, dtype=np.int32) + adr = np.zeros(s, dtype=np.int32) + par = np.full(s, 2, dtype=np.int32) + padr = np.full(s, 2, dtype=np.int32) + + omp_set_num_threads(s) + omp_set_dynamic(0) + with omp("parallel for if(rp)"): + for i in range(s): + ar[omp_get_thread_num()] = 1 + par[i] = omp_in_parallel() + with omp("parallel for if(drp)"): + for i in range(s): + adr[omp_get_thread_num()] = 1 + padr[i] = omp_in_parallel() + return ar, adr, par, padr + + size = 20 + r = test_impl(size) + np.testing.assert_array_equal(r[0], np.ones(size)) + rc = np.zeros(size) + rc[0] = 1 + np.testing.assert_array_equal(r[1], rc) + np.testing.assert_array_equal(r[2], np.ones(size)) + np.testing.assert_array_equal(r[3], np.zeros(size)) + + def test_avg_arr_prev_two_elements_base(self): + def test_impl(n, a): + b = np.zeros(n) + omp_set_num_threads(5) + + with omp("parallel for"): + for i in range(1, n): + b[i] = (a[i] + a[i - 1]) / 2.0 + return b + + self.check(test_impl, 10, np.ones(10)) + + def test_avg_num_threads_clause(self): + def test_impl(n, a): + b = np.zeros(n) + with omp("parallel for num_threads(5)"): + for i in range(1, n): + b[i] = (a[i] + a[i - 1]) / 2.0 + + return b + + self.check(test_impl, 10, np.ones(10)) + + def test_avg_num_threads_clause_var(self): + def test_impl(n, a): + b = np.zeros(n) + nt = 5 + with omp("parallel for num_threads(nt)"): + for i in range(1, n): + b[i] = (a[i] + a[i - 1]) / 2.0 + + return b + + self.check(test_impl, 10, np.ones(10)) + + # Uses apparently unsupported chunking. 
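+    # The two tests below combine an if clause with schedule(static, <chunk>);
+    # the explicit chunk size is what is believed to be unsupported here.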
+ @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Abort - unimplemented") + def test_avg_if_const(self): + def test_impl(n, a): + b = np.zeros(n) + nt = 5 + with omp("parallel for if(1) num_threads(nt) schedule(static, 4)"): + for i in range(1, n): + b[i] = (a[i] + a[i - 1]) / 2.0 + + return b + + self.check(test_impl, 10, np.ones(10)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Abort - unimplemented") + def test_avg_if_var(self): + def test_impl(n, a): + b = np.zeros(n) + nt = 5 + ss = 4 + do_if = 1 + with omp("parallel for if(do_if) num_threads(nt) schedule(static, ss)"): + for i in range(1, n): + b[i] = (a[i] + a[i - 1]) / 2.0 + + return b + + self.check(test_impl, 10, np.ones(10)) + + def test_teams1(self): + def test_impl(): + a = 1 + with omp("teams"): + with omp("parallel"): + a = 123 + return a + + self.check(test_impl) + + +class TestReductions(TestOpenmpBase): + def __init__(self, *args): + TestOpenmpBase.__init__(self, *args) + + def test_parallel_reduction_add_int(self): + @njit + def test_impl(): + redux = 0 + nthreads = 0 + with omp("parallel reduction(+:redux)"): + thread_id = omp_get_thread_num() + if thread_id == 0: + nthreads = omp_get_num_threads() + redux = 1 + return redux, nthreads + + redux, nthreads = test_impl() + self.assertGreater(nthreads, 1) + self.assertEqual(redux, nthreads) + + def test_parallel_reduction_sub_int(self): + @njit + def test_impl(): + redux = 0 + nthreads = 0 + with omp("parallel reduction(-:redux)"): + thread_id = omp_get_thread_num() + if thread_id == 0: + nthreads = omp_get_num_threads() + redux = 1 + return redux, nthreads + + redux, nthreads = test_impl() + self.assertGreater(nthreads, 1) + self.assertEqual(redux, nthreads) + + def test_parallel_reduction_mul_int(self): + @njit + def test_impl(): + redux = 1 + nthreads = 0 + with omp("parallel reduction(*:redux) num_threads(8)"): + thread_id = omp_get_thread_num() + if thread_id == 0: + nthreads = omp_get_num_threads() + redux = 2 + return redux, nthreads + + redux, nthreads = test_impl() + self.assertGreater(nthreads, 1) + self.assertEqual(redux, 2**nthreads) + + def test_parallel_reduction_add_fp64(self): + @njit + def test_impl(): + redux = np.float64(0.0) + nthreads = np.float64(0.0) + with omp("parallel reduction(+:redux)"): + thread_id = omp_get_thread_num() + if thread_id == 0: + nthreads = omp_get_num_threads() + redux = np.float64(1.0) + return redux, nthreads + + redux, nthreads = test_impl() + self.assertGreater(nthreads, 1) + self.assertEqual(redux, 1.0 * nthreads) + + def test_parallel_reduction_sub_fp64(self): + @njit + def test_impl(): + redux = np.float64(0.0) + nthreads = np.float64(0.0) + with omp("parallel reduction(-:redux)"): + thread_id = omp_get_thread_num() + if thread_id == 0: + nthreads = omp_get_num_threads() + redux = np.float64(1.0) + return redux, nthreads + + redux, nthreads = test_impl() + self.assertGreater(nthreads, 1) + self.assertEqual(redux, 1.0 * nthreads) + + def test_parallel_reduction_mul_fp64(self): + @njit + def test_impl(): + redux = np.float64(1.0) + nthreads = np.float64(0.0) + with omp("parallel reduction(*:redux) num_threads(8)"): + thread_id = omp_get_thread_num() + if thread_id == 0: + nthreads = omp_get_num_threads() + redux = np.float64(2.0) + return redux, nthreads + + redux, nthreads = test_impl() + self.assertGreater(nthreads, 1) + self.assertEqual(redux, 2.0**nthreads) + + def test_parallel_reduction_add_fp32(self): + @njit + def test_impl(): + redux = np.float32(0.0) + nthreads = np.float32(0.0) + with omp("parallel 
reduction(+:redux)"): + thread_id = omp_get_thread_num() + if thread_id == 0: + nthreads = omp_get_num_threads() + redux = np.float32(1.0) + return redux, nthreads + + redux, nthreads = test_impl() + self.assertGreater(nthreads, 1) + self.assertEqual(redux, 1.0 * nthreads) + + def test_parallel_reduction_sub_fp32(self): + @njit + def test_impl(): + redux = np.float32(0.0) + nthreads = np.float32(0.0) + with omp("parallel reduction(-:redux)"): + thread_id = omp_get_thread_num() + if thread_id == 0: + nthreads = omp_get_num_threads() + redux = np.float32(1.0) + return redux, nthreads + + redux, nthreads = test_impl() + self.assertGreater(nthreads, 1) + self.assertEqual(redux, 1.0 * nthreads) + + def test_parallel_reduction_mul_fp32(self): + @njit + def test_impl(): + redux = np.float32(1.0) + nthreads = np.float32(0.0) + with omp("parallel reduction(*:redux) num_threads(8)"): + thread_id = omp_get_thread_num() + if thread_id == 0: + nthreads = omp_get_num_threads() + redux = np.float32(2.0) + return redux, nthreads + + redux, nthreads = test_impl() + self.assertGreater(nthreads, 1) + self.assertEqual(redux, 2.0**nthreads) + + def test_parallel_for_reduction_add_int(self): + @njit + def test_impl(): + redux = 0 + with omp("parallel for reduction(+:redux)"): + for i in range(10): + redux += 1 + return redux + + redux = test_impl() + self.assertEqual(redux, 10) + + def test_parallel_for_reduction_sub_int(self): + @njit + def test_impl(): + redux = 0 + with omp("parallel for reduction(-:redux)"): + for i in range(10): + redux += 1 + return redux + + redux = test_impl() + self.assertEqual(redux, 10) + + def test_parallel_for_reduction_mul_int(self): + @njit + def test_impl(): + redux = 1 + with omp("parallel for reduction(*:redux)"): + for i in range(10): + redux *= 2 + return redux + + redux = test_impl() + self.assertEqual(redux, 2**10) + + def test_parallel_for_reduction_add_fp64(self): + @njit + def test_impl(): + redux = np.float64(0.0) + with omp("parallel for reduction(+:redux)"): + for i in range(10): + redux += np.float64(1.0) + return redux + + redux = test_impl() + self.assertEqual(redux, 10.0) + + def test_parallel_for_reduction_sub_fp64(self): + @njit + def test_impl(): + redux = np.float64(0.0) + with omp("parallel for reduction(-:redux)"): + for i in range(10): + redux += np.float64(1.0) + return redux + + redux = test_impl() + self.assertEqual(redux, 10.0) + + def test_parallel_for_reduction_mul_fp64(self): + @njit + def test_impl(): + redux = np.float64(1.0) + with omp("parallel for reduction(*:redux)"): + for i in range(10): + redux *= np.float64(2.0) + return redux + + redux = test_impl() + self.assertEqual(redux, 2.0**10) + + def test_parallel_for_reduction_add_fp32(self): + @njit + def test_impl(): + redux = np.float32(0.0) + with omp("parallel for reduction(+:redux)"): + for i in range(10): + redux += np.float32(1.0) + return redux + + redux = test_impl() + self.assertEqual(redux, 10.0) + + def test_parallel_for_reduction_sub_fp32(self): + @njit + def test_impl(): + redux = np.float32(0.0) + with omp("parallel for reduction(-:redux)"): + for i in range(10): + redux += np.float32(1.0) + return redux + + redux = test_impl() + self.assertEqual(redux, 10.0) + + def test_parallel_for_reduction_mul_fp32(self): + @njit + def test_impl(): + redux = np.float32(1.0) + with omp("parallel for reduction(*:redux)"): + for i in range(10): + redux *= np.float32(2.0) + return redux + + redux = test_impl() + self.assertEqual(redux, 2.0**10) + + def test_parallel_reduction_add_int_10(self): + 
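+        # This test and the other *_10 variants start the reduction variable
+        # at 10 to check that its initial value is combined into the final
+        # result (OpenMP folds the original value into the reduction).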
@njit + def test_impl(): + redux = 10 + nthreads = 0 + with omp("parallel reduction(+:redux)"): + thread_id = omp_get_thread_num() + if thread_id == 0: + nthreads = omp_get_num_threads() + redux = 1 + return redux, nthreads + + redux, nthreads = test_impl() + self.assertGreater(nthreads, 1) + self.assertEqual(redux, nthreads + 10) + + def test_parallel_reduction_add_fp32_10(self): + @njit + def test_impl(): + redux = np.float32(10.0) + nthreads = np.float32(0.0) + with omp("parallel reduction(+:redux)"): + thread_id = omp_get_thread_num() + if thread_id == 0: + nthreads = omp_get_num_threads() + redux = np.float32(1.0) + return redux, nthreads + + redux, nthreads = test_impl() + self.assertGreater(nthreads, 1) + self.assertEqual(redux, 1.0 * nthreads + 10.0) + + def test_parallel_reduction_add_fp64_10(self): + @njit + def test_impl(): + redux = np.float64(10.0) + nthreads = np.float64(0.0) + with omp("parallel reduction(+:redux)"): + thread_id = omp_get_thread_num() + if thread_id == 0: + nthreads = omp_get_num_threads() + redux = np.float64(1.0) + return redux, nthreads + + redux, nthreads = test_impl() + self.assertGreater(nthreads, 1) + self.assertEqual(redux, 1.0 * nthreads + 10.0) + + def test_parallel_for_reduction_add_int_10(self): + @njit + def test_impl(): + redux = 10 + with omp("parallel for reduction(+:redux)"): + for i in range(10): + redux += 1 + return redux + + redux = test_impl() + self.assertEqual(redux, 10 + 10) + + def test_parallel_for_reduction_add_fp32(self): + @njit + def test_impl(): + redux = np.float32(0.0) + with omp("parallel for reduction(+:redux)"): + for i in range(10): + redux += np.float32(1.0) + return redux + + redux = test_impl() + self.assertEqual(redux, 10.0) + + def test_parallel_for_reduction_add_fp64_10(self): + @njit + def test_impl(): + redux = np.float64(10.0) + with omp("parallel for reduction(+:redux)"): + for i in range(10): + redux += np.float64(1.0) + return redux + + redux = test_impl() + self.assertEqual(redux, 10.0 + 10.0) + + +class TestOpenmpDataClauses(TestOpenmpBase): + def __init__(self, *args): + TestOpenmpBase.__init__(self, *args) + + def test_default_none(self): + @njit + def test_impl(N): + a = np.zeros(N, dtype=np.int32) + x = 7 + with omp("parallel for default(none)"): + for i in range(N): + y = i + x + a[i] = y + z = i + + return a, z + + with self.assertRaises(UnspecifiedVarInDefaultNone) as raises: + test_impl(100) + self.assertIn("Variables with no data env clause", str(raises.exception)) + + def test_data_sharing_default(self): + @njit + def test_impl(N, M): + x = np.zeros(N) + y = np.zeros(N) + z = 3.14 + i = 7 + with omp("parallel private(i)"): + yn = M + 1 + zs = z + with omp("for"): + for i in range(N): + y[i] = yn + 2 * (i + 1) + with omp("for"): + for i in range(N): + x[i] = y[i] - i + with omp("critical"): + z += 3 + return x, y, zs, z, i + + N, M = 10, 5 + r = test_impl(N, M) + np.testing.assert_array_equal(r[0], np.arange(M + 3, M + N + 3)) + np.testing.assert_array_equal(r[1], np.arange(M + 3, M + 2 * N + 2, 2)) + assert r[2] == 3.14 + assert r[3] == 3.14 + 3 * N + assert r[4] == 7 + + def test_variables(self): + @njit + def test_impl(): + x = 5 + y = 3 + zfp = 2 + zsh = 7 + nerr = 0 + nsing = 0 + NTHREADS = 4 + numthrds = 0 + omp_set_num_threads(NTHREADS) + vals = np.zeros(NTHREADS) + valsfp = np.zeros(NTHREADS) + + with omp("""parallel private(x) shared(zsh) + firstprivate(zfp) private(ID)"""): + ID = omp_get_thread_num() + with omp("single"): + nsing = nsing + 1 + numthrds = omp_get_num_threads() + if y != 
3: + nerr = nerr + 1 + print( + "Shared Default status failure y = ", + y, + " It should equal 3", + ) + + # verify each thread sees the same variable vsh + with omp("critical"): + zsh = zsh + ID + + # test first private + zfp = zfp + ID + valsfp[ID] = zfp + + # setup test to see if each thread got its own x value + x = ID + vals[ID] = x + + # Shared clause test: assumes zsh starts at 7 and we add up IDs from 4 threads + if zsh != 13: + print("Shared clause or critical failed", zsh) + nerr = nerr + 1 + + # Single Test: How many threads updated nsing? + if nsing != 1: + print(" Single test failed", nsing) + nerr = nerr + 1 + + # Private clause test: did each thread get its own x variable? + for i in range(numthrds): + if int(vals[i]) != i: + print("Private clause failed", numthrds, i, vals[i]) + nerr = nerr + 1 + + # First private clause test: each thread should get 2 + ID for up to 4 threads + for i in range(numthrds): + if int(valsfp[i]) != 2 + i: + print("Firstprivate clause failed", numthrds, i, valsfp[i]) + nerr = nerr + 1 + + # Test number of threads + if numthrds > NTHREADS: + print("Number of threads error: too many threads", numthrds, NTHREADS) + nerr = nerr + 1 + + if nerr > 0: + print( + nerr, + """ errors when testing parallel, private, shared, + firstprivate, critical and single""", + ) + + return nerr + + assert test_impl() == 0 + + def test_privates(self): + def test_impl(N): + a = np.zeros(N, dtype=np.int32) + x = 7 + with omp("""parallel for firstprivate(x) private(y) + lastprivate(zzzz) private(private_index) shared(a) + firstprivate(N) default(none)"""): + for private_index in range(N): + y = private_index + x + a[private_index] = y + zzzz = private_index + + return a, zzzz + + self.check(test_impl, 100) + + def test_private_retain_value(self): + @njit + def test_impl(): + x = 5 + with omp("parallel private(x)"): + x = 13 + return x + + assert test_impl() == 5 + + def test_private_retain_value_param(self): + @njit + def test_impl(x): + with omp("parallel private(x)"): + x = 13 + return x + + assert test_impl(5) == 5 + + def test_private_retain_value_for(self): + @njit + def test_impl(): + x = 5 + with omp("parallel private(x)"): + with omp("for"): + for i in range(10): + x = i + return x + + assert test_impl() == 5 + + def test_private_retain_value_for_param(self): + @njit + def test_impl(x): + with omp("parallel private(x)"): + with omp("for"): + for i in range(10): + x = i + return x + + assert test_impl(5) == 5 + + def test_private_retain_value_combined_for(self): + @njit + def test_impl(): + x = 5 + with omp("parallel for private(x)"): + for i in range(10): + x = i + return x + + assert test_impl() == 5 + + def test_private_retain_value_combined_for_param(self): + @njit + def test_impl(x): + with omp("parallel for private(x)"): + for i in range(10): + x = i + return x + + assert test_impl(5) == 5 + + def test_private_retain_two_values(self): + @njit + def test_impl(): + x = 5 + y = 7 + with omp("parallel private(x,y)"): + x = 13 + y = 40 + return x, y + + assert test_impl() == (5, 7) + + def test_private_retain_array(self): + @njit + def test_impl(N, x): + a = np.ones(N) + with omp("parallel private(a)"): + with omp("single"): + sa = a + a = np.zeros(N) + with omp("for"): + for i in range(N): + a[i] = x + return a, sa + + r = test_impl(10, 3) + np.testing.assert_array_equal(r[0], np.ones(r[0].shape)) + with self.assertRaises(AssertionError) as raises: + np.testing.assert_array_equal(r[1], np.ones(r[0].shape)) + + def test_private_divide_work(self): + def test_impl(v, 
npoints): + omp_set_num_threads(3) + + with omp("""parallel default(shared) + private(iam,nt,ipoints,istart)"""): + iam = omp_get_thread_num() + nt = omp_get_num_threads() + ipoints = npoints // nt + istart = iam * ipoints + if iam == nt - 1: + ipoints = npoints - istart + for i in range(ipoints): + v[istart + i] = 123.456 + return v + + self.check(test_impl, np.zeros(12), 12) + + def test_firstprivate(self): + @njit + def test_impl(x, y): + with omp("parallel firstprivate(x)"): + xs = x + x = y + return xs, x + + x, y = 5, 3 + self.assert_outputs_equal(test_impl(x, y), (x, x)) + + def test_lastprivate_for(self): + @njit + def test_impl(N): + a = np.zeros(N) + si = 0 + with omp("parallel for lastprivate(si)"): + for i in range(N): + si = i + 1 + a[i] = si + return si, a + + N = 10 + r = test_impl(N) + assert r[0] == N + np.testing.assert_array_equal(r[1], np.arange(1, N + 1)) + + def test_lastprivate_non_one_step(self): + @njit + def test_impl(n1, n2, s): + a = np.zeros(math.ceil((n2 - n1) / s)) + rl = np.arange(n1, n2, s) + with omp("parallel for lastprivate(si)"): + for i in range(len(rl)): + si = rl[i] + 1 + a[i] = si + return si, a + + n1, n2, s = 4, 26, 3 + r = test_impl(n1, n2, s) + ra = np.arange(n1, n2, s) + 1 + assert r[0] == ra[-1] + np.testing.assert_array_equal(r[1], ra) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_lastprivate_sections(self): + @njit + def test_impl(N2, si): + a = np.zeros(N2) + with omp("parallel shared(sis1)"): + with omp("sections lastprivate(si)"): + sis1 = si + # N1 = number of sections + with omp("section"): + si = 0 + with omp("section"): + si = 1 + with omp("section"): + si = 2 + sis2 = si + with omp("sections lastprivate(si)"): + # N2 = number of sections + with omp("section"): + i = 0 + si = N2 - i + a[i] = si + with omp("section"): + i = 1 + si = N2 - i + a[i] = si + with omp("section"): + i = 2 + si = N2 - i + a[i] = si + with omp("section"): + i = 3 + si = N2 - i + a[i] = si + return si, sis1, sis2, a + + N1, N2, d = 3, 4, 5 + r = test_impl(N2, d) + assert r[0] == 1 + assert r[1] != d + assert r[2] == N1 - 1 + np.testing.assert_array_equal(r[3], np.arange(N2, 0, -1)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_lastprivate_conditional(self): + @njit + def test_impl(N, c1, c2): + a = np.arange(0, N * 2, c2) + num = 0 + with omp("parallel"): + with omp("for lastprivate(conditional: num)"): + for i in range(N): + if i < c1: + num = a[i] + c2 + return num + + c1, c2 = 11, 3 + assert test_impl(15, c1, c2) == c1 * c2 + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_threadprivate(self): + @njit + def test_impl(N, c): + omp_set_num_threads(N) + a = np.zeros(N) + ra = np.zeros(N) + val = 0 + with omp("threadprivate(val)"): + pass + with omp("parallel private(tn, sn)"): + tn = omp_get_thread_num() + sn = c + tn + val = sn + a[tn] = sn + with omp("parallel private(tn)"): + tn = omp_get_thread_num() + ra[tn] = 1 if val == a[tn] else 0 + return ra + + nt = 8 + np.testing.assert_array_equal(test_impl(nt, 5), np.ones(nt)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_copyin(self): + @njit + def test_impl(nt, n1, n2, n3): + xsa1 = np.zeros(nt) + xsa2 = np.zeros(nt) + x = n1 + with omp("threadprivate(x)"): + pass + x = n2 + with omp("parallel num_threads(nt) copyin(x) private(tn)"): + tn = omp_get_thread_num() + xsa1[tn] = x + if tn == 0: + x = n3 + with omp("parallel copyin(x)"): + xsa2[omp_get_thread_num()] = x + 
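+            # xsa1 holds each thread's copy of the threadprivate x after
+            # copyin from the initial value n2; xsa2 holds it after the second
+            # region copies in thread 0's updated value n3.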
return xsa1, xsa2 + + nt, n2, n3 = 10, 12.5, 7.1 + r = test_impl(nt, 4.3, n2, n3) + np.testing.assert_array_equal(r[0], np.full(nt, n2)) + np.testing.assert_array_equal(r[1], np.full(nt, n3)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_copyin_nested(self): + def test_impl(nt1, nt2, mt, n1, n2, n3): + omp_set_nested(1) + omp_set_dynamic(0) + xsa1 = np.zeros(nt1) + xsa2 = np.zeros(nt2) + x = n1 + with omp("threadprivate(x)"): + pass + x = n2 + with omp("parallel num_threads(nt1) copyin(x) private(tn)"): + tn = omp_get_thread_num() + xsa1[tn] = x + if tn == mt: + x = n3 + with omp("parallel num_threads(nt2) copyin(x)"): + xsa2[omp_get_thread_num()] = x + return xsa1, xsa2 + + nt1, nt2, n2, n3 = 10, 4, 12.5, 7.1 + r = test_impl(nt1, nt2, 2, 4.3, n2, n3) + np.testing.assert_array_equal(r[0], np.full(nt1, n2)) + np.testing.assert_array_equal(r[1], np.full(nt2, n3)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_copyprivate(self): + @njit + def test_impl(nt, n1, n2, n3): + x = n1 + a = np.zeros(nt) + xsa = np.zeros(nt) + ar = np.zeros(nt) + omp_set_num_threads(nt) + with omp("parallel firstprivate(x, a) private(tn)"): + with omp("single copyprivate(x, a)"): + x = n2 + a = np.full(nt, n3) + tn = omp_get_thread_num() + xsa[tn] = x + ar[tn] = a[tn] + return xsa, a, ar + + nt, n2, n3 = 16, 12, 3 + r = test_impl(nt, 5, n2, n3) + np.testing.assert_array_equal(r[0], np.full(nt, n2)) + self.assert_outputs_equal(r[1], r[2], np.full(nt, n3)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_linear_clause(self): + @njit + def test_impl(N): + a = np.arange(N) + 1 + b = np.zeros(N // 2) + + linearj = 0 + with omp("parallel for linear(linearj:1)"): + for i in range(0, N, 2): + b[linearj] = a[i] * 2 + + return b, linearj + + N = 50 + r = test_impl(N) + np.testing.assert_array_equal(r[0], np.arange(2, N * 2 - 1, 4)) + assert r[1] == N // 2 - 1 + + +class TestOpenmpConstraints(TestOpenmpBase): + """Tests designed to confirm that errors occur when expected, or + to see how OpenMP behaves in various circumstances""" + + def __init__(self, *args): + TestOpenmpBase.__init__(self, *args) + + def test_parallel_for_no_for_loop(self): + @njit + def test_impl(): + with omp("parallel for"): + pass + + with self.assertRaises(ParallelForWrongLoopCount) as raises: + test_impl() + self.assertIn( + "OpenMP parallel for regions must contain exactly one", + str(raises.exception), + ) + + def test_parallel_for_multiple_for_loops(self): + @njit + def test_impl(): + a = np.zeros(4) + with omp("parallel for"): + for i in range(2): + a[i] = 1 + for i in range(2, 4): + a[i] = 1 + + with self.assertRaises(ParallelForWrongLoopCount) as raises: + test_impl() + self.assertIn( + "OpenMP parallel for regions must contain exactly one", + str(raises.exception), + ) + + def test_statement_before_parallel_for(self): + @njit + def test_impl(): + a = np.zeros(4) + with omp("parallel for"): + print("Fail") + for i in range(4): + a[i] = i + return a + + with self.assertRaises(ParallelForExtraCode) as raises: + test_impl() + self.assertIn("Extra code near line", str(raises.exception)) + + def test_statement_after_parallel_for(self): + @njit + def test_impl(): + a = np.zeros(4) + with omp("parallel for"): + for i in range(4): + a[i] = i + print("Fail") + return a + + with self.assertRaises(ParallelForExtraCode) as raises: + a = test_impl() + print("a", a) + self.assertIn("Extra code near line", str(raises.exception)) + + def 
test_nonstring_var_omp_statement(self): + @njit + def test_impl(v): + ovar = 7 + with omp(ovar): + for i in range(len(v)): + v[i] = 1.0 + return v + + with self.assertRaises(NonStringOpenmpSpecification) as raises: + test_impl(np.zeros(100)) + self.assertIn("Non-string OpenMP specification at line", str(raises.exception)) + + def test_parallel_for_nonconst_var_omp_statement(self): + @njit + def test_impl(v): + ovar = "parallel " + ovar += "for" + with omp(ovar): + for i in range(len(v)): + v[i] = 1.0 + return v + + with self.assertRaises(NonconstantOpenmpSpecification) as raises: + test_impl(np.zeros(100)) + self.assertIn( + "Non-constant OpenMP specification at line", str(raises.exception) + ) + + # def test_parallel_for_blocking_if(self): + # @njit + # def test_impl(): + # n = 0 + # with omp("parallel"): + # half_threads = omp_get_num_threads()//2 + # if omp_get_thread_num() < half_threads: + # with omp("for reduction(+:n)"): + # for _ in range(half_threads): + # n += 1 + # return n + + # #with self.assertRaises(AssertionError) as raises: + # # njit(test_impl) + # test_impl() + # #print(str(raises.exception)) + + def test_parallel_for_delaying_condition(self): + @njit + def test_impl(): + n = 0 + with omp("parallel private(lc)"): + lc = 0 + while lc < omp_get_thread_num(): + lc += 1 + with omp("for reduction(+:n)"): + for _ in range(omp_get_num_threads()): + n += 1 + return n + + test_impl() + + def test_parallel_for_nowait(self): + @njit + def test_impl(nt): + a = np.zeros(nt) + with omp("parallel for num_threads(nt) nowait"): + for i in range(nt): + a[omp_get_thread_num] = i + return a + + with self.assertRaises(Exception) as raises: + test_impl(12) + self.assertIn("No terminal matches", str(raises.exception)) + + def test_parallel_double_num_threads(self): + @njit + def test_impl(nt1, nt2): + count = 0 + with omp("parallel num_threads(nt1) num_threads(nt2)"): + with omp("critical"): + count += 1 + print(count) + return count + + with self.assertRaises(Exception) as raises: + test_impl(5, 7) + + def test_conditional_barrier(self): + @njit + def test_impl(nt): + hp = nt // 2 + a = np.zeros(hp) + b = np.zeros(nt - hp) + with omp("parallel num_threads(nt) private(tn)"): + tn = omp_get_thread_num() + if tn < hp: + with omp("barrier"): + pass + a[tn] = 1 + else: + with omp("barrier"): + pass + b[tn - hp] = 1 + return a, b + + # The spec seems to say this should be an error but in practice maybe not? 
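+        # (Threads take different branches and therefore encounter two
+        # different barrier statements; OpenMP expects a barrier to be
+        # encountered by all threads of a team or by none, hence the
+        # uncertainty above.)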
+ # with self.assertRaises(Exception) as raises: + test_impl(12) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Hangs") + def test_closely_nested_for_loops(self): + @njit + def test_impl(N): + a = np.zeros((N, N)) + with omp("parallel"): + with omp("for"): + for i in range(N): + with omp("for"): + for j in range(N): + a[i][j] = 1 + return a + + with self.assertRaises(Exception) as raises: + test_impl(4) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Hangs") + def test_nested_critical(self): + @njit + def test_impl(): + num = 0 + with omp("parallel"): + with omp("critical"): + num += 1 + with omp("critical"): + num -= 1 + return num + + with self.assertRaises(Exception) as raises: + test_impl() + + +class TestOpenmpConcurrency(TestOpenmpBase): + def __init__(self, *args): + TestOpenmpBase.__init__(self, *args) + + def test_parallel_region(self): + @njit + def test_impl(): + a = 1 + with omp("parallel"): + a += 1 + + test_impl() + + def test_single(self): + @njit + def test_impl(nt): + omp_set_num_threads(nt) + a = np.zeros(4, dtype=np.int64) + with omp("parallel"): + with omp("single"): + a[0] += 1 + return a + + np.testing.assert_array_equal(test_impl(4), np.array([1, 0, 0, 0])) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_master(self): + @njit + def test_impl(nt): + omp_set_num_threads(nt) + a = np.ones(4, dtype=np.int64) + with omp("parallel"): + with omp("master"): + a[0] += omp_get_thread_num() + return a + + np.testing.assert_array_equal(test_impl(4), np.array([0, 1, 1, 1])) + + def test_critical_threads1(self): + @njit + def test_impl(N, iters): + omp_set_num_threads(N) + count = 0 + p = 0 + sum = 0 + with omp("parallel"): + with omp("barrier"): + pass + with omp("for private(p, sum)"): + for _ in range(iters): + with omp("critical"): + p = count + sum = 0 + for i in range(10000): + if i % 2 == 0: + sum += 1 + else: + sum -= 1 + p += 1 + sum + count = p + return count + + iters = 1000 + self.check(test_impl, 2, iters) + + def test_critical_threads2(self): + @njit + def test_impl(N): + omp_set_num_threads(N) + ca = np.zeros(N) + sum = 0 + with omp("parallel private(sum) shared(c)"): + c = N + with omp("barrier"): + pass + with omp("critical"): + ca[omp_get_thread_num()] = c - 1 + # Sleep + sum = 0 + for i in range(10000): + if i % 2 == 0: + sum += 1 + else: + sum -= 1 + c -= 1 + sum + return np.sort(ca) + + nt = 16 + np.testing.assert_array_equal(test_impl(nt), np.arange(nt)) + + def test_critical_result(self): + @njit + def test_impl(N): + omp_set_num_threads(N) + count = 0 + with omp("parallel"): + if omp_get_thread_num() < N // 2: + with omp("critical"): + count += 1 + else: + with omp("critical"): + count += 1 + return count + + nt = 16 + assert test_impl(nt) == nt + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_named_critical(self): + @njit + def test_impl(N): + omp_set_num_threads(N) + a = np.zeros((2, N)) + sa = np.zeros(N) + with omp("parallel private(a0c, sum, tn)"): + tn = omp_get_thread_num() + with omp("barrier"): + pass + with omp("critical (a)"): + # Sleep + sum = 0 + for j in range(1000): + if j % 2 == 0: + sum += 1 + else: + sum -= 1 + a[0][tn] = 1 + sum + with omp("critical (b)"): + a0c = np.copy(a[0]) + # Sleep + sum = 0 + for j in range(10000): + if j % 2 == 0: + sum += 1 + else: + sum -= 1 + a[1][tn] = 1 + sum + sa[tn] = 1 if a[0] != a0c else 0 + return a, sa + + nt = 16 + r = test_impl(nt) + np.testing.assert_array_equal(r[0], np.ones((2, nt))) + assert np.any(r[1]) + + # 
Revisit - how to prove atomic works without a race condition? + # def test_atomic_threads(self): + # def test_impl(N, iters): + # omp_set_num_threads(N) + # count = 0 + # p = 0 + # sum = 0 + # with omp("parallel"): + # with omp("barrier"): + # pass + # with omp("for private(p, sum)"): + # for _ in range(iters): + # with omp("atomic"): + # p = count + # sum = 0 + # for i in range(10000): + # if i % 2 == 0: + # sum += 1 + # else: + # sum -= 1 + # p += 1 + sum + # count = p + # return count + # iters = 1000 + # self.check(test_impl, 2, iters) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_atomic(self): + @njit + def test_impl(nt, N, c): + omp_set_num_threads(nt) + a = np.zeros(N) + with omp("parallel for private(b, index)"): + for i in range(nt): + b = 0 + index = i % N + with omp("atomic write"): + a[index] = nt % c + with omp("barrier"): + pass + with omp("atomic read"): + b = a[index - 1] + index + with omp("barrier"): + pass + with omp("atomic update"): + a[index] += b + return a + + nt, N, c = 27, 8, 6 + rc = np.zeros(N) + # ba = np.zeros(nt) + # for i in range(nt): + # index = i % N + # rc[index] = nt % c + # print("rc1:", rc) + + # for i in range(nt): + # index = i % N + # ba[i] = rc[index-1] + index + + # for i in range(nt): + # index = i % N + # rc[index] += ba[i] + # print("rc2:", rc) + + for i in range(nt): + index = i % N + ts = nt // N + ts += 1 if index < nt % N else 0 + rc[index] = nt % c + (nt % c + index) * ts + + np.testing.assert_array_equal(test_impl(nt, N, c), rc) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_atomic_capture(self): + @njit + def test_impl(nt, N, c): + s = math.ceil(N // 2) + a = np.zeros(s) + sva = np.zeros(N) + tns = np.zeros(N) + with omp("parallel for num_threads(nt) private(sv, index)"): + for i in range(N): + index = i % s + tns[i] = omp_get_thread_num() + with omp("atomic write"): + a[index] = index * c + 1 + with omp("barrier"): + pass + with omp("atomic capture"): + sv = a[index - 1] + a[index - 1] += sv + (tns[i] % c + 1) + # sva[index] = sv + return a, sva, tns + + nt, N, c = 16, 30, 7 + r1, r2, tns = test_impl(nt, N, c) + size = math.ceil(N // 2) + rc = np.arange(1, (size - 1) * c + 2, c) + # np.testing.assert_array_equal(r2, np.roll(rc, 1)) + for i in range(N): + index = i % size + rc[index - 1] += rc[index - 1] + (tns[i] % c + 1) + np.testing.assert_array_equal(r1, rc) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_parallel_sections(self): + @njit + def test_impl(nt): + ta0 = np.zeros(nt) + ta1 = np.zeros(nt) + secpa = np.zeros(nt) + + with omp("parallel sections num_threads(nt)"): + with omp("section"): + ta0[omp_get_thread_num()] += 1 + secpa[0] = omp_in_parallel() + with omp("section"): + ta1[omp_get_thread_num()] += 1 + secpa[1] = omp_in_parallel() + print(ta0, ta1) + return ta0, ta0, secpa + + NT = 2 # Must equal the number of section directives in the test + r = test_impl(NT) + assert np.sum(r[0]) == 1 + assert np.sum(r[1]) == 1 + assert np.sum(r[2]) == NT + np.testing.assert_array_equal(r[0] + r[1], np.ones(NT)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Abort - needs fix") + def test_barrier(self): + @njit + def test_impl(nt, iters, c): + a = np.zeros(nt) + ac = np.zeros((nt, nt)) + x = iters // c + iters = x * c + sum = 0 + with omp("parallel num_threads(nt) private(tn, sum)"): + tn = omp_get_thread_num() + with omp("critical"): + sum = 0 + for i in range(iters): + if i % x == 0: + sum += 1 + a[tn] = sum + with 
omp("barrier"): + pass + for j in range(nt): + ac[tn][j] = a[j] + return ac + + nt, c = 15, 12 + r = test_impl(nt, 10000, c) + a = np.full(nt, c) + for i in range(nt): + np.testing.assert_array_equal(r[i], a) + + # def test_for_nowait(self): + # @njit + # def test_impl(nt, n, c1, c2): + # a = np.zeros(n) + # b = np.zeros(n) + # ac = np.zeros((nt, n)) + # sum = 0 + # with omp("parallel num_threads(nt) private(tn)"): + # tn = omp_get_thread_num() + # with omp("for nowait schedule(static) private(sum)"): + # for i in range(n): + # # Sleep + # sum = 0 + # for j in range(i * 1000): + # if j % 2 == 0: + # sum += 1 + # else: + # sum -= 1 + # a[i] = i * c1 + sum + # for j in range(nt): + # ac[tn][j] = a[j] + # with omp("for schedule(static)"): + # for i in range(n): + # b[i] = a[i] + c2 + # return b, ac + # nt, n, c1, c2 = 8, 30, 5, -7 + # r = test_impl(nt, n, c1, c2) + # a = np.arange(n) * c1 + # np.testing.assert_array_equal(r[0], a + c2) + # arc = [np.array_equal(r[1][i], a) for i in range(nt)] + # assert(not np.all(arc)) + # + # def test_nowait_result(self): + # def test_impl(n, m, a, b, y, z): + # omp_set_num_threads(5) + # + # with omp("parallel"): + # with omp("for nowait"): + # for i in range(1, n): + # b[i] = (a[i] + a[i-1]) / 2.0 + # with omp("for nowait"): + # for i in range(m): + # y[i] = math.sqrt(z[i]) + # + # return b, y + # n, m = 10, 20 + # self.check(test_impl, n, m, np.ones(n), np.zeros(n), + # np.zeros(m), np.full(m, 13)) + + def test_nested_parallel_for(self): + @njit + def test_impl(nt): + omp_set_num_threads(nt) + omp_set_nested(1) + omp_set_dynamic(0) + a = np.zeros((nt, nt), dtype=np.int32) + with omp("parallel for"): + for i in range(nt): + with omp("parallel for"): + for j in range(nt): + a[i][j] = omp_get_thread_num() + return a + + nt = 8 + r = test_impl(nt) + for i in range(len(r)): + np.testing.assert_array_equal(np.sort(r[i]), np.arange(nt)) + + def test_nested_parallel_regions_1(self): + @njit + def test_impl(nt1, nt2): + omp_set_dynamic(0) + omp_set_max_active_levels(2) + ca = np.zeros(nt1) + omp_set_num_threads(nt1) + with omp("parallel private(tn)"): + tn = omp_get_thread_num() + with omp("parallel num_threads(3)"): + with omp("critical"): + ca[tn] += 1 + with omp("single"): + ats = omp_get_ancestor_thread_num(1) == tn + ts = omp_get_team_size(1) + return ca, ats, ts + + nt1, nt2 = 6, 3 + r = test_impl(nt1, nt2) + np.testing.assert_array_equal(r[0], np.full(nt1, nt2)) + assert r[1] == True + assert r[2] == nt1 + + def test_nested_parallel_regions_2(self): + @njit + def set_array(a): + tn = omp_get_thread_num() + a[tn][0] = omp_get_max_active_levels() + a[tn][1] = omp_get_num_threads() + a[tn][2] = omp_get_max_threads() + a[tn][3] = omp_get_level() + a[tn][4] = omp_get_team_size(1) + a[tn][5] = omp_in_parallel() + + @njit + def test_impl(mal, n1, n2, n3): + omp_set_max_active_levels(mal) + omp_set_dynamic(0) + omp_set_num_threads(n1) + a = np.zeros((n2, 6), dtype=np.int32) + b = np.zeros((n1, 6), dtype=np.int32) + with omp("parallel"): + omp_set_num_threads(n2) + with omp("single"): + with omp("parallel"): + omp_set_num_threads(n3) + set_array(a) + set_array(b) + + return a, b + + mal, n1, n2, n3 = 8, 2, 4, 5 + a, b = test_impl(mal, n1, n2, n3) + for i in range(n2): + np.testing.assert_array_equal(a[i], np.array([8, n2, n3, 2, n1, 1])) + for i in range(n1): + np.testing.assert_array_equal(b[i], np.array([8, n1, n2, 1, n1, 1])) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Abort / Segmentation Fault") + def test_parallel_two_dimensional_array(self): + 
@njit + def test_impl(N): + omp_set_dynamic(0) + omp_set_num_threads(N) + a = np.zeros((N, 2), dtype=np.int32) + with omp("parallel private(tn)"): + tn = omp_get_thread_num() + a[tn][0] = 1 + a[tn][1] = 2 + return a + + N = 5 + r = test_impl(N) + for i in range(N): + np.testing.assert_array_equal(r[i], np.array([1, 2])) + + +class TestOpenmpTask(TestOpenmpBase): + def __init__(self, *args): + TestOpenmpBase.__init__(self, *args) + + def test_task_basic(self): + def test_impl(ntsks): + a = np.zeros(ntsks) + with omp("parallel"): + with omp("single"): + for i in range(ntsks): + with omp("task"): + a[i] = 1 + return a + + self.check(test_impl, 15) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Sometimes segmentation fault") + def test_task_thread_assignment(self): + @njit + def test_impl(ntsks): + a = np.empty(ntsks) + with omp("parallel"): + with omp("single"): + for i in range(ntsks): + with omp("task"): + a[i] = omp_get_thread_num() + return a + + with self.assertRaises(AssertionError) as raises: + v = test_impl(15) + np.testing.assert_equal(v[0], v) + + def test_task_data_sharing_default(self): + @njit + def test_impl(n1, n2): + x = n1 + with omp("parallel private(y)"): + y = n1 + with omp("single"): + with omp("task"): + xa = x == n1 + ya = y == n1 + x, y = n2, n2 + with omp("taskwait"): + ysave = y + return (x, ysave), (xa, ya) + + n1, n2 = 1, 2 + r = test_impl(n1, n2) + self.assert_outputs_equal(r[1], (True, True)) + self.assert_outputs_equal(r[0], (n2, n1)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Segmentation fault") + def test_task_single_implicit_barrier(self): + @njit + def test_impl(ntsks): + a = np.zeros(ntsks) + with omp("parallel"): + with omp("single"): + for i in range(ntsks): + with omp("task private(sum)"): + # Sleep + sum = 0 + for j in range(10000): + if j % 2 == 0: + sum += 1 + else: + sum -= 1 + a[i] = 1 + sum + # with omp("barrier"): + # pass + sa = np.copy(a) + return sa + + ntsks = 15 + r = test_impl(ntsks) + np.testing.assert_array_equal(r, np.ones(ntsks)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Segmentation fault") + def test_task_single_nowait(self): + @njit + def test_impl(ntsks): + a = np.zeros(ntsks) + with omp("parallel"): + with omp("single nowait"): + for i in range(ntsks): + with omp("task private(sum)"): + sum = 0 + for j in range(10000): + if j % 2 == 0: + sum += 1 + else: + sum -= 1 + a[i] = 1 + sum + sa = np.copy(a) + return sa + + with self.assertRaises(AssertionError) as raises: + ntsks = 15 + r = test_impl(ntsks) + np.testing.assert_array_equal(r, np.ones(ntsks)) + + # Error with commented out code, other version never finished running + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Error") + def test_task_barrier(self): + @njit + def test_impl(nt): + omp_set_num_threads(nt) + a = np.zeros((nt + 1) * nt / 2) + # a = np.zeros(10) + with omp("parallel"): + with omp("single"): + for tn in range(nt): + with omp("task"): + for i in range(tn + 1): + with omp("task"): + a[i] = omp_get_thread_num() + 1 + with omp("barrier"): + ret = np.all(a) + return ret + + assert test_impl(4) + + def test_taskwait(self): + def test_impl(ntsks): + a = np.zeros(ntsks) + with omp("parallel private(i)"): + with omp("single"): + for i in range(ntsks): + with omp("task private(sum) private(j)"): + sum = 0 + for j in range(10000): + if j % 2 == 0: + sum += 1 + else: + sum -= 1 + a[i] = 1 + sum + with omp("taskwait"): + ret = np.all(a) + return ret + + self.check(test_impl, 15) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, 
"Sometimes segmentation fault") + def test_taskwait_descendants(self): + @njit + def test_impl(ntsks, dtsks): + a = np.zeros(ntsks) + da = np.zeros((ntsks, dtsks)) + with omp("parallel"): + with omp("single"): + for i in range(ntsks): + with omp("task"): + a[i] = 1 + for j in range(dtsks): + with omp("task private(sum)"): + sum = 0 + for k in range(10000): + if k % 2 == 0: + sum += 1 + else: + sum -= 1 + da[i][j] = 1 + sum + with omp("taskwait"): + ac = np.copy(a) + dac = np.copy(da) + with omp("barrier"): + pass + return ac, dac + + r = test_impl(15, 10) + np.testing.assert_array_equal(r[0], np.ones(r[0].shape)) + with self.assertRaises(AssertionError) as raises: + np.testing.assert_array_equal(r[1], np.ones(r[1].shape)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_undeferred_task(self): + @njit + def test_impl(): + with omp("parallel"): + flag = 1 + with omp("single"): + with omp("task if(1) private(sum)"): + sum = 0 + for i in range(10000): + if i % 2 == 0: + sum += 1 + else: + sum -= 1 + r = flag + sum + flag = 0 + return r + + assert test_impl() + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_untied_task_thread_assignment(self): + @njit + def test_impl(ntsks): + start_nums = np.zeros(ntsks) + current_nums = np.zeros(ntsks) + with omp("parallel"): + with omp("single"): + for i in range(ntsks): + with omp("task untied private(sum)"): + start_nums[i] = omp_get_thread_num() + with omp("task if(0) shared(sum)"): + # Sleep + sum = 0 + for j in range(10000): + if j % 2 == 0: + sum += 1 + else: + sum -= 1 + current_nums[i] = omp_get_thread_num() + sum + with omp("barrier"): + pass + return start_nums, current_nums + + with self.assertRaises(AssertionError) as raises: + sids, cids = test_impl(15) + np.testing.assert_array_equal(sids, cids) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_taskyield_thread_assignment(self): + @njit + def test_impl(ntsks): + start_nums = np.zeros(ntsks) + finish_nums = np.zeros(ntsks) + yielded_tasks = np.zeros(ntsks) + with omp("parallel"): + with omp("single"): + for i in range(ntsks): + with omp("task private(stn, start_i, finish_i, diff)"): + stn = omp_get_thread_num() + start_i = np.where(start_nums == stn)[0] + finish_i = np.where(finish_nums == stn)[0] + diff = np.zeros(len(start_i), dtype=np.int64) + for sindex in range(len(start_i)): + for findex in range(len(finish_i)): + if start_i[sindex] == finish_i[findex]: + break + else: + diff[sindex] = start_i[sindex] + for dindex in diff[diff != 0]: + yielded_tasks[dindex] = 1 + start_nums[i] = stn + with omp("taskyield"): + pass + finish_nums[i] = omp_get_thread_num() + with omp("barrier"): + pass + return yielded_tasks + + yt = test_impl(50) + assert np.any(yt) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_final_task_thread_assignment(self): + @njit + def test_impl(ntsks, c): + final_nums = np.zeros(ntsks) + included_nums = np.zeros(ntsks) + da = np.zeros(ntsks) + with omp("parallel"): + with omp("single"): + for i in range(ntsks): + with omp("task final(i>c) private(sum, d)"): + ftask_num = i + final_nums[ftask_num] = omp_get_thread_num() + # If it is a final task, generate an included task + if ftask_num > c: + d = 1 + with omp("task private(sum)"): + itask_num = ftask_num + # Sleep + sum = 0 + for j in range(10000): + if j % 2 == 0: + sum += 1 + else: + sum -= 1 + included_nums[itask_num] = omp_get_thread_num() + da[itask_num] = d + sum + d = 0 + + return 
final_nums, included_nums, da + + ntsks, c = 15, 5 + fns, ins, da = test_impl(ntsks, c) + np.testing.assert_array_equal(fns[c:], ins[c:]) + np.testing.assert_array_equal(da, np.ones(ntsks)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_taskgroup(self): + @njit + def test_impl(ntsks, dtsks): + a = np.zeros(ntsks) + with omp("parallel"): + with omp("single"): + with omp("taskgroup"): + for i in range(ntsks): + with omp("task"): + for _ in range(dtsks): + with omp("task"): + # Sleep + sum = 0 + for j in range(10000): + if j % 2 == 0: + sum += 1 + else: + sum -= 1 + a[i] = 1 + sum + sa = np.copy(a) + return a, sa + + ntsks = 15 + r = test_impl(ntsks, 10) + np.testing.assert_array_equal(r[0], np.ones(ntsks)) + np.testing.assert_array_equal(r[1], np.ones(ntsks)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_task_priority(self): + @njit + def test_impl(ntsks): + a = np.zeros(ntsks) + count = 0 + with omp("parallel"): + with omp("single"): + for i in range(ntsks): + with omp("task priority(i)"): + count += i + 1 + a[i] = count + return a + + ntsks = 15 + r = test_impl(ntsks) + rc = np.zeros(ntsks) + for i in range(ntsks): + rc[i] = sum(range(i + 1, ntsks + 1)) + np.testing.assert_array_equal(r, rc) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_task_mergeable(self): + @njit + def test_impl(ntsks, c1, c2): + a = np.zeros(ntsks) + with omp("parallel"): + with omp("single"): + for i in range(ntsks): + with omp("task private(x)"): + x = c1 + with omp("task mergeable if(0)"): + x = c2 + a[i] = x + return a + + ntsks, c1, c2 = 75, 2, 3 + assert c2 in test_impl(ntsks, c1, c2) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_task_depend(self): + def test_impl(ntsks): + a = np.zeros(ntsks) + da = np.zeros(ntsks) + with omp("parallel"): + with omp("single"): + for i in range(ntsks): + with omp("task private(x, done)"): + x = 1 + done = False + with omp("task shared(x) depend(out: x)"): + x = 5 + with omp("""task shared(done, x) + depend(out: done) depend(inout: x)"""): + x += i + done = True + with omp("""task shared(done, x) + depend(in: done) depend(inout: x)"""): + x *= i + da[i] = 1 if done else 0 + with omp("task shared(x) depend(in: x)"): + a[i] = x + return a, da + + self.check(test_impl, 15) + + # Affinity clause should not affect result + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_task_affinity(self): + def test_impl(ntsks, const): + a = np.zeros(ntsks) + with omp("parallel"): + with omp("single"): + for i in range(ntsks): + with omp("task firstprivate(i)"): + with omp("""task shared(b) depend(out: b) + affinity(a)"""): + b = np.full(i, const) + with omp("""task shared(b) depend(in: b) + affinity(a)"""): + a[i] = np.sum(b) + return a + + self.check(test_impl, 15, 4) + + def test_shared_array(self): + def test_impl(mode): + if mode == 0: + return + + b = np.zeros(100) + with omp("parallel"): + with omp("single"): + a = np.ones(100) + c = 0 + d = 0 + if mode > 1: + with omp("task shared(a, c)"): + c = a.sum() + with omp("task shared(a, d)"): + d = a.sum() + with omp("taskwait"): + b[:] = c + d + + return b + + self.check(test_impl, 0) + self.check(test_impl, 1) + self.check(test_impl, 2) + + +@unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") +class TestOpenmpTaskloop(TestOpenmpBase): + def __init__(self, *args): + TestOpenmpBase.__init__(self, *args) + + def test_taskloop_basic(self): + def 
test_impl(ntsks): + a = np.zeros(ntsks) + with omp("parallel"): + with omp("single"): + with omp("taskloop"): + for i in range(ntsks): + a[i] = 1 + return a + + self.check(test_impl, 15) + + def test_taskloop_num_tasks(self): + @njit + def test_impl(nt, iters, ntsks): + a = np.zeros(ntsks) + with omp("parallel num_threads(nt)"): + with omp("single"): + with omp("taskloop num_tasks(ntsks)"): + for i in range(iters): + a[i] = omp_get_thread_num() + return a + + nt, iters, ntsks = 8, 10, 4 + assert len(np.unique(test_impl(nt, iters, ntsks))) <= ntsks + + def test_taskloop_grainsize(self): + @njit + def test_impl(nt, iters, ntsks): + a = np.zeros(ntsks) + with omp("parallel num_threads(nt)"): + with omp("single"): + iters_per_task = iters // ntsks + with omp("taskloop grainsize(iters_per_task)"): + for i in range(iters): + a[i] = omp_get_thread_num() + return a + + nt, iters, ntsks = 8, 10, 4 + assert len(np.unique(test_impl(nt, iters, ntsks))) <= ntsks + + def test_taskloop_nogroup(self): + @njit + def test_impl(ntsks): + a = np.zeros(ntsks) + sa = np.zeros(ntsks) + with omp("parallel"): + with omp("single"): + s = 0 + with omp("taskloop nogroup num_tasks(ntsks)"): + for i in range(ntsks): + a[i] = 1 + sa[i] = s + with omp("task priority(1)"): + s = 1 + return a, sa + + ntsks = 15 + r = test_impl(ntsks) + np.testing.assert_array_equal(r[0], np.ones(ntsks)) + np.testing.assert_array_equal(r[1], np.ones(ntsks)) + + def test_taskloop_collapse(self): + @njit + def test_impl(ntsks, nt): + fl = np.zeros(ntsks) + sl = np.zeros(ntsks) + tl = np.zeros(ntsks) + omp_set_num_threads(nt) + with omp("parallel"): + with omp("single"): + with omp("taskloop collapse(2) num_tasks(ntsks)"): + for i in range(ntsks): + fl[i] = omp_get_thread_num() + for j in range(1): + sl[i] = omp_get_thread_num() + for k in range(1): + tl[i] = omp_get_thread_num() + + return fl, sl, tl + + r = test_impl(25, 4) + with self.assertRaises(AssertionError) as raises: + np.testing.assert_array_equal(r[0], r[1]) + np.testing.assert_array_equal(r[1], r[2]) + + +@linux_only +@unittest.skipUnless( + TestOpenmpBase.skip_disabled or TestOpenmpBase.run_target, "Unimplemented" +) +class TestOpenmpTarget(TestOpenmpBase): + """ + OpenMP target offloading tests. TEST_DEVICES is a required env var to + specify the device numbers to run the tests on: 0 for host backend, 1 for + CUDA backend. It is expected to be a comma-separated list of integer values. + """ + + devices = [] + assert TestOpenmpBase.test_devices, ( + "Expected env var TEST_DEVICES (comma-separated list of device numbers)" + ) + devices = [int(devno) for devno in TestOpenmpBase.test_devices.split(",")] + assert devices, "Expected non-empty test devices list" + + def __init__(self, *args): + TestOpenmpBase.__init__(self, *args) + + @classmethod + def is_testing_cpu(cls): + return 1 in cls.devices + + # How to check for nowait? + # Currently checks only compilation. + # Numba optimizes the whole target away? This runs too fast. 
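+    # NOTE: methods prefixed with "target_" are not collected by unittest
+    # directly; the loop that follows this class wraps each of them into a
+    # "test_" method which runs once per device listed in
+    # TestOpenmpTarget.devices, as a subtest.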
+ def target_nowait(self, device): + target_pragma = f"target nowait device({device})" + + @njit + def test_impl(): + with omp(target_pragma): + a = 0 + for i in range(1000000): + for j in range(1000000): + for k in range(1000000): + a += math.sqrt(i) + math.sqrt(j) + math.sqrt(k) + + test_impl() + + def target_nest_parallel_default_threadlimit(self, device): + target_pragma = f"target device({device}) map(from: teams, threads)" + + @njit + def test_impl(): + teams = 0 + threads = 0 + with omp(target_pragma): + with omp("parallel"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams = omp_get_num_teams() + threads = omp_get_num_threads() + return teams, threads + + teams, threads = test_impl() + np.testing.assert_equal(teams, 1) + self.assertGreater(threads, 1) + + def target_nest_parallel_set_numthreads(self, device): + target_pragma = f"target device({device}) map(from: teams, threads)" + + @njit + def test_impl(): + teams = 0 + threads = 0 + with omp(target_pragma): + with omp("parallel num_threads(32)"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams = omp_get_num_teams() + threads = omp_get_num_threads() + return teams, threads + + teams, threads = test_impl() + np.testing.assert_equal(teams, 1) + np.testing.assert_equal(threads, 32) + + def target_nest_teams_default_numteams(self, device): + target_pragma = f"target device({device}) map(from: teams, threads)" + + @njit + def test_impl(): + teams = 0 + threads = 0 + with omp(target_pragma): + with omp("teams"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams = omp_get_num_teams() + threads = omp_get_num_threads() + return teams, threads + + teams, threads = test_impl() + # GPU device(0) starts >1 teams each with 1 thread. + if device == 0: + self.assertGreater(teams, 1) + self.assertEqual(threads, 1) + # CPU device(1) starts 1 team with >1 threads. + elif device == 1: + self.assertEqual(teams, 1) + self.assertGreater(threads, 1) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_nest_teams_set_numteams(self, device): + target_pragma = f"target device({device}) map(from: teams, threads)" + + @njit + def test_impl(): + teams = 0 + threads = 0 + with omp(target_pragma): + with omp("teams num_teams(32)"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams = omp_get_num_teams() + threads = omp_get_num_threads() + return teams, threads + + teams, threads = test_impl() + if device == 0: + self.assertEqual(teams, 32) + elif device == 1: + self.assertLessEqual(teams, 32) + else: + raise ValueError(f"Device {device} must be 0 or 1") + self.assertGreaterEqual(threads, 1) + + def target_nest_teams_nest_parallel_default_numteams_threadlimit(self, device): + target_pragma = f"target device({device}) map(from: teams, threads)" + + @njit + def test_impl(): + teams = 0 + threads = 0 + with omp(target_pragma): + with omp("teams"): + with omp("parallel"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams = omp_get_num_teams() + threads = omp_get_num_threads() + return teams, threads + + teams, threads = test_impl() + # For GPU, impl. creates multiple threads and teams. + if device == 0: + self.assertGreater(teams, 1) + self.assertGreater(threads, 1) + # For CPU, impl. creates 1 teams with multiple threads. 
+ elif device == 1: + self.assertEqual(teams, 1) + self.assertGreater(threads, 1) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_nest_teams_nest_parallel_set_numteams(self, device): + target_pragma = f"target device({device}) map(from: teams, threads)" + + @njit + def test_impl(): + teams = 0 + threads = 0 + with omp(target_pragma): + with omp("teams num_teams(32)"): + with omp("parallel"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams = omp_get_num_teams() + threads = omp_get_num_threads() + return teams, threads + + teams, threads = test_impl() + if device == 0: + self.assertEqual(teams, 32) + elif device == 1: + self.assertGreaterEqual(teams, 1) + else: + raise ValueError(f"Device {device} must be 0 or 1") + self.assertGreaterEqual(threads, 1) + + def target_nest_teams_nest_parallel_set_threadlimit(self, device): + target_pragma = f"target device({device}) map(from: teams, threads)" + + @njit + def test_impl(): + teams = 0 + threads = 0 + with omp(target_pragma): + with omp("teams thread_limit(32)"): + with omp("parallel"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams = omp_get_num_teams() + threads = omp_get_num_threads() + return teams, threads + + teams, threads = test_impl() + # For GPU, impl. creates > 1 teams. + if device == 0: + self.assertGreater(teams, 1) + self.assertEqual(threads, 32) + # For CPU, impl. creates exactly 1 team. + elif device == 1: + self.assertEqual(teams, 1) + self.assertLessEqual(threads, 32) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_nest_teams_nest_parallel_set_numteams_threadlimit(self, device): + target_pragma = f"target device({device}) map(from: teams, threads)" + + @njit + def test_impl(): + teams = 0 + threads = 0 + with omp(target_pragma): + with omp("teams num_teams(32) thread_limit(32)"): + with omp("parallel"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams = omp_get_num_teams() + threads = omp_get_num_threads() + return teams, threads + + teams, threads = test_impl() + self.assertGreaterEqual(teams, 1) + if device == 0: + self.assertEqual(teams, 32) + self.assertEqual(threads, 32) + elif device == 1: + self.assertLessEqual(teams, 32) + self.assertLessEqual(threads, 32) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_nest_teams_nest_parallel_set_numteams_threadlimit_gt_numthreads( + self, device + ): + target_pragma = f"target device({device}) map(from: teams, threads)" + + @njit + def test_impl(): + teams = 0 + threads = 0 + with omp(target_pragma): + with omp("teams num_teams(32) thread_limit(64)"): + with omp("parallel num_threads(32)"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams = omp_get_num_teams() + threads = omp_get_num_threads() + return teams, threads + + teams, threads = test_impl() + self.assertGreaterEqual(teams, 1) + if device == 0: + self.assertEqual(teams, 32) + self.assertEqual(threads, 32) + elif device == 1: + self.assertLessEqual(teams, 32) + self.assertLessEqual(threads, 32) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_nest_teams_nest_parallel_set_numteams_threadlimit_lt_numthreads( + self, device + ): + target_pragma = f"target device({device}) map(from: teams, threads)" + + @njit + def test_impl(): + teams = 0 + threads = 0 + with omp(target_pragma): + # 
THREAD_LIMIT takes precedence over NUM_THREADS. + with omp("teams num_teams(32) thread_limit(64)"): + with omp("parallel num_threads(128)"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams = omp_get_num_teams() + threads = omp_get_num_threads() + return teams, threads + + teams, threads = test_impl() + self.assertGreaterEqual(teams, 1) + if device == 0: + self.assertEqual(teams, 32) + self.assertEqual(threads, 64) + elif device == 1: + self.assertLessEqual(teams, 32) + self.assertLessEqual(threads, 64) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_nest_parallel_multiple_set_numthreads(self, device): + target_pragma = ( + f"target device({device}) map(from: teams1, threads1, teams2, threads2)" + ) + + @njit + def test_impl(): + teams1 = 0 + threads1 = 0 + teams2 = 0 + threads2 = 0 + with omp(target_pragma): + with omp("parallel num_threads(32)"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams1 = omp_get_num_teams() + threads1 = omp_get_num_threads() + with omp("parallel num_threads(256)"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams2 = omp_get_num_teams() + threads2 = omp_get_num_threads() + return teams1, threads1, teams2, threads2 + + teams1, threads1, teams2, threads2 = test_impl() + np.testing.assert_equal(teams1, 1) + np.testing.assert_equal(threads1, 32) + np.testing.assert_equal(teams2, 1) + np.testing.assert_equal(threads2, 256) + + def target_nest_parallel_multiple_default_numthreads(self, device): + target_pragma = ( + f"target device({device}) map(from: teams1, threads1, teams2, threads2)" + ) + + @njit + def test_impl(): + teams1 = 0 + threads1 = 0 + teams2 = 0 + threads2 = 0 + with omp(target_pragma): + with omp("parallel"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams1 = omp_get_num_teams() + threads1 = omp_get_num_threads() + with omp("parallel"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams2 = omp_get_num_teams() + threads2 = omp_get_num_threads() + return teams1, threads1, teams2, threads2 + + teams1, threads1, teams2, threads2 = test_impl() + np.testing.assert_equal(teams1, 1) + self.assertGreater(threads1, 1) + np.testing.assert_equal(teams2, 1) + self.assertGreater(threads2, 1) + + def target_nest_parallel_multiple_set_numthreads_byone(self, device): + target_pragma = f"target device({device}) map(from: max_threads, teams1, threads1, teams2, threads2)" + + @njit + def test_impl(): + max_threads = 0 + teams1 = 0 + threads1 = 0 + teams2 = 0 + threads2 = 0 + with omp(target_pragma): + max_threads = omp_get_max_threads() + with omp("parallel"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams1 = omp_get_num_teams() + threads1 = omp_get_num_threads() + with omp("parallel num_threads(256)"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams2 = omp_get_num_teams() + threads2 = omp_get_num_threads() + return max_threads, teams1, threads1, teams2, threads2 + + # NOTE: max_threads for device(0) is the number of threads set by the + # sibling parallel legion with the highest num_threads clause. + # For device(1), is the number of max threads as determined by the host + # runtime. 
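+        # Concretely: the largest sibling num_threads clause below is 256, so
+        # on device(0) omp_get_max_threads() is expected to report 256; the
+        # default parallel region then matches it (threads1 == max_threads)
+        # and the explicit num_threads(256) region reports threads2 == 256.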
+ max_threads, teams1, threads1, teams2, threads2 = test_impl() + np.testing.assert_equal(teams1, 1) + np.testing.assert_equal(threads1, max_threads) + np.testing.assert_equal(teams2, 1) + np.testing.assert_equal(threads2, 256) + + def target_nest_parallel(self, device): + # TODO: map should be "from" instead of "tofrom" once this is fixed. + target_pragma = f"target device({device}) map(from: a)" + # NOTE: num_threads should be a multiple of warp size, e.g. for NVIDIA + # V100 it is 32, the OpenMP runtime floors non-multiple of warp size. + # TODO: Newer LLVM versions should not have this restriction. + parallel_pragma = ( + "parallel num_threads(32)" # + (" shared(a)" if explicit else "") + ) + + @njit + def test_impl(): + a = np.zeros(32, dtype=np.int64) + with omp(target_pragma): + with omp(parallel_pragma): + thread_id = omp_get_thread_num() + a[thread_id] = 1 + return a + + r = test_impl() + np.testing.assert_equal(r, np.full(32, 1)) + + def target_parallel_for_range_step_arg(self, device): + target_pragma = f"target device({device}) map(tofrom: a)" + parallel_pragma = "parallel for" + N = 10 + step = 2 + + @njit + def test_impl(): + a = np.zeros(N, dtype=np.int32) + with omp(target_pragma): + with omp(parallel_pragma): + for i in range(0, len(a), step): + a[i] = i + 1 + + return a + + r = test_impl() + np.testing.assert_equal(r, np.array([1, 0, 3, 0, 5, 0, 7, 0, 9, 0])) + + def target_parallel_for_incremented_step(self, device): + target_pragma = f"target device({device}) map(tofrom: a)" + parallel_pragma = "parallel for" + N = 10 + step_range = 3 + + @njit + def test_impl(): + a = np.zeros(N, dtype=np.int32) + for i in range(step_range): + with omp(target_pragma): + with omp(parallel_pragma): + for j in range(0, len(a), i + 1): + a[j] = i + 1 + return a + + r = test_impl() + np.testing.assert_equal(r, np.array([3, 1, 2, 3, 2, 1, 3, 1, 2, 3])) + + def target_teams(self, device): + target_pragma = ( + f"target teams num_teams(100) device({device}) map(from: a, nteams)" + ) + + @njit + def test_impl(): + a = np.zeros(100, dtype=np.int64) + nteams = 0 + with omp(target_pragma): + team_id = omp_get_team_num() + if team_id == 0: + nteams = omp_get_num_teams() + a[team_id] = 1 + return a, nteams + + r, nteams = test_impl() + if device == 0: + np.testing.assert_equal(r, np.full(100, 1)) + elif device == 1: + np.testing.assert_equal(r[:nteams], np.full(nteams, 1)) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_nest_teams(self, device): + target_pragma = f"target device({device}) map(from: a, nteams)" + + @njit + def test_impl(): + a = np.zeros(100, dtype=np.int64) + nteams = 0 + with omp(target_pragma): + with omp("teams num_teams(100)"): + team_id = omp_get_team_num() + if team_id == 0: + nteams = omp_get_num_teams() + a[team_id] = 1 + return a, nteams + + r, nteams = test_impl() + if device == 0: + np.testing.assert_equal(r, np.full(100, 1)) + elif device == 1: + np.testing.assert_equal(r[:nteams], np.full(nteams, 1)) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_nest_teams_from_shared_expl_scalar(self, device): + target_pragma = f"target device({device}) map(from: s)" + + @njit + def test_impl(): + s = 0 + with omp(target_pragma): + with omp("teams num_teams(100) shared(s)"): + team_id = omp_get_team_num() + if team_id == 0: + s = 1 + return s + + s = test_impl() + np.testing.assert_equal(s, 1) + + def target_nest_teams_from_shared_impl_scalar(self, device): + target_pragma = f"target device({device}) map(from: s)" + + @njit + 
def test_impl(): + s = 0 + with omp(target_pragma): + with omp("teams num_teams(100)"): + team_id = omp_get_team_num() + if team_id == 0: + s = 1 + return s + + s = test_impl() + np.testing.assert_equal(s, 1) + + def target_nest_teams_tofrom_shared_expl_scalar(self, device): + target_pragma = f"target device({device}) map(tofrom: s)" + + @njit + def test_impl(): + s = 0 + with omp(target_pragma): + with omp("teams num_teams(100) shared(s)"): + team_id = omp_get_team_num() + if team_id == 0: + s = 1 + return s + + s = test_impl() + np.testing.assert_equal(s, 1) + + def target_nest_teams_tofrom_shared_impl_scalar(self, device): + target_pragma = f"target device({device}) map(tofrom: s)" + + @njit + def test_impl(): + s = 0 + ss = np.zeros(1) + with omp(target_pragma): + with omp("teams num_teams(100)"): + team_id = omp_get_team_num() + if team_id == 0: + s = 1 + ss[0] = 1 + return s, ss + + s, ss = test_impl() + np.testing.assert_equal(s, 1) + np.testing.assert_equal(ss, 1) + + def target_teams_nest_parallel(self, device): + target_pragma = f"target teams device({device}) num_teams(10) thread_limit(32) map(tofrom: teams, threads)" + + @njit + def test_impl(): + teams = 0 + threads = 0 + with omp(target_pragma): + with omp("parallel"): + team_id = omp_get_team_num() + thread_id = omp_get_thread_num() + if team_id == 0 and thread_id == 0: + teams = omp_get_num_teams() + threads = omp_get_num_threads() + return teams, threads + + teams, threads = test_impl() + if device == 0: + self.assertEqual(teams, 10) + self.assertEqual(threads, 32) + elif device == 1: + self.assertLessEqual(teams, 10) + self.assertLessEqual(threads, 32) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_teams_nest_parallel_set_thread_limit(self, device): + target_pragma = f"target device({device}) map(tofrom: teams, threads)" + + @njit + def test_impl(): + teams = 0 + threads = 0 + with omp(target_pragma): + with omp("teams num_teams(10) thread_limit(32)"): + with omp("parallel"): + team_id = omp_get_team_num() + thread_id = omp_get_thread_num() + if team_id == 0 and thread_id == 0: + teams = omp_get_num_teams() + threads = omp_get_num_threads() + return teams, threads + + teams, threads = test_impl() + if device == 0: + self.assertEqual(teams, 10) + self.assertEqual(threads, 32) + elif device == 1: + self.assertLessEqual(teams, 10) + self.assertLessEqual(threads, 32) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_map_to_scalar(self, device): + target_pragma = f"target device({device}) map(to: x) map(from: r)" + + @njit + def test_impl(x): + with omp(target_pragma): + x += 1 + r = x + return r + + x = 42 + r = test_impl(x) + np.testing.assert_equal(r, 43) + + def target_map_to_array(self, device): + target_pragma = f"target device({device}) map(to: a) map(from: r)" + + @njit + def test_impl(a): + with omp(target_pragma): + r = 0 + for i in range(len(a)): + r += a[i] + return r + + n = 10 + a = np.ones(n) + r = test_impl(a) + # r is the sum of array elements (ones-array), thus must equal s. 
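+        # (the expected value here is n, the length of the ones-array).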
+ np.testing.assert_equal(r, n) + + def target_map_from_scalar(self, device): + target_pragma = f"target device({device}) map(from: x)" + + @njit + def test_impl(x): + with omp(target_pragma): + x = 43 + return x + + x = 42 + r = test_impl(x) + np.testing.assert_equal(r, 43) + + def target_map_tofrom_scalar(self, device): + target_pragma = f"target device({device}) map(tofrom: x)" + + @njit + def test_impl(x): + with omp(target_pragma): + x += 1 + return x + + x = 42 + r = test_impl(x) + np.testing.assert_equal(r, 43) + + def target_multiple_map_tofrom_scalar(self, device): + target_pragma = f"target device({device}) map(tofrom: x)" + + @njit + def test_impl(x): + with omp(target_pragma): + x += 1 + with omp(target_pragma): + x += 1 + return x + + x = 42 + r = test_impl(x) + np.testing.assert_equal(r, 44) + + def target_map_from_array(self, device): + target_pragma = f"target device({device}) map(from: a)" + + @njit + def test_impl(n): + a = np.zeros(n, dtype=np.int64) + with omp(target_pragma): + for i in range(len(a)): + a[i] = 42 + return a + + n = 10 + r = test_impl(n) + np.testing.assert_array_equal(r, np.full(n, 42)) + + def target_map_slice_in_mapping(self, device): + target_pragma = f"target device({device}) map(a[50:100]) map(to: b[100:150])" + + @njit + def test_impl(n): + a = np.zeros(n) + b = np.arange(n) + with omp(target_pragma): + for i in range(50): + # These b accesses are within the transferred region. + a[i + 50] = b[i + 100] + return a + + n = 200 + r = test_impl(n) + np.testing.assert_array_equal(r[0:50], np.zeros(50)) + np.testing.assert_array_equal(r[50:100], np.arange(n)[100:150]) + np.testing.assert_array_equal(r[100:200], np.zeros(100)) + + def target_map_slice_read_out_mapping(self, device): + target_pragma = f"target device({device}) map(a[50:100]) map(to: b[100:150])" + + @njit + def test_impl(n): + a = np.zeros(n) + b = np.arange(n) + with omp(target_pragma): + for i in range(50): + # These b accesses are outside the transferred region. + # Should get whatever happens to be in memory at that point. + # We assume that isn't arange(50:100). + a[i + 50] = b[i + 50] + return a + + n = 200 + r = test_impl(n) + np.testing.assert_array_equal(r[0:50], np.zeros(50)) + # Make sure that the range 50-100 was not transferred. + assert not np.array_equal(r[50:100], np.arange(n)[50:100]) + np.testing.assert_array_equal(r[100:200], np.zeros(100)) + + def target_map_tofrom_array(self, device): + target_pragma = f"target device({device}) map(tofrom: a)" + + @njit + def test_impl(a): + with omp(target_pragma): + for i in range(len(a)): + a[i] += 1 + return a + + n = 10 + a = np.full(n, 42) + r = test_impl(a) + np.testing.assert_array_equal(r, np.full(n, 43)) + + def target_nest_parallel_for(self, device): + target_pragma = f"target device({device}) map(tofrom: a, sched)" + + @njit + def test_impl(a, sched): + with omp(target_pragma): + with omp("parallel for num_threads(256)"): + for i in range(len(a)): + a[i] = 1 + thread_id = omp_get_thread_num() + sched[i] = thread_id + return a, sched + + n = 1000 + a = np.zeros(n) + sched = np.zeros(n) + r, sched = test_impl(a, sched) + np.testing.assert_array_equal(r, np.ones(n)) + # u = unique thread ids that processed the array, c = number of iters + # each unique thread id has processed. + u, c = np.unique(sched, return_counts=True) + # test that 256 threads executed. + np.testing.assert_equal(len(u), 256) + # test that each thread executed more than 1 iteration. 
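+        # (the loop below only requires that every one of those thread ids
+        # processed at least one iteration).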
+ for ci in c: + self.assertGreater(ci, 0) + + def target_nest_teams_distribute(self, device): + target_pragma = f"target device({device}) map(tofrom: a, sched)" + + @njit + def test_impl(a, sched): + with omp(target_pragma): + with omp("teams distribute"): + for i in range(len(a)): + a[i] = 1 + team_id = omp_get_team_num() + sched[i] = team_id + return a, sched + + n = 100 + a = np.zeros(n) + sched = np.zeros(n) + r, sched = test_impl(a, sched) + np.testing.assert_array_equal(r, np.ones(n)) + # u = unique teams ids that processed the array, c = number of iters + # each unique team id has processed. + u, c = np.unique(sched, return_counts=True) + if device == 0: + # For GPU, OpenMP creates as many teams as the number of iterations, + # where each team leader executes one iteration. + np.testing.assert_equal(len(u), n) + np.testing.assert_array_equal(c, np.ones(n)) + elif device == 1: + # For CPU, OpenMP creates 1 teams with 1 thread processing all n + # iterations. + np.testing.assert_equal(len(u), 1) + np.testing.assert_array_equal(c, [100]) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_teams_distribute(self, device): + target_pragma = ( + f"target teams distribute device({device}) map(tofrom: a, sched)" + ) + + @njit + def test_impl(a, sched): + with omp(target_pragma): + for i in range(len(a)): + a[i] = 1 + team_id = omp_get_team_num() + sched[i] = team_id + return a, sched + + n = 1000 + a = np.zeros(n) + sched = np.zeros(n) + r, sched = test_impl(a, sched) + np.testing.assert_array_equal(r, np.ones(n)) + # u = unique teams ids that processed the array, c = number of iters + # each unique team id has processed. + u, c = np.unique(sched, return_counts=True) + if device == 0: + # For GPU, impl. creates as many teams as the number of iterations, + # where each team leader executes one iteration. + np.testing.assert_equal(len(u), n) + np.testing.assert_array_equal(c, np.ones(n)) + elif device == 1: + # For CPU, impl. creates 1 team which processes all iterations. + np.testing.assert_equal(len(u), 1) + np.testing.assert_array_equal(c, [1000]) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_teams_distribute_set_num_teams(self, device): + target_pragma = ( + f"target teams distribute device({device}) map(tofrom: a) num_teams(4)" + ) + + @njit + def test_impl(a, sched): + with omp(target_pragma): + for i in range(len(a)): + a[i] = 1 + team_id = omp_get_team_num() + sched[i] = team_id + return a, sched + + n = 1000 + a = np.zeros(n) + sched = np.zeros(n) + r, sched = test_impl(a, sched) + np.testing.assert_array_equal(r, np.ones(n)) + # u = unique teams ids that processed the array, c = number of iters + # each unique team id has processed. 
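+        # With num_teams(4) and n = 1000 iterations, the distribute schedule
+        # is expected to hand each of the 4 teams exactly 250 iterations.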
+ u, c = np.unique(sched, return_counts=True) + np.testing.assert_equal(len(u), 4) + np.testing.assert_array_equal(c, np.full(4, 250)) + + def target_firstprivate_scalar_explicit(self, device): + target_pragma = f"target device({device}) firstprivate(s)" + + @njit + def test_impl(s): + with omp(target_pragma): + s = 43 + return s + + s = 42 + r = test_impl(s) + np.testing.assert_equal(r, 42) + + def target_firstprivate_scalar_implicit(self, device): + target_pragma = f"target device({device})" + + @njit + def test_impl(s): + with omp(target_pragma): + s = 43 + return s + + s = 42 + r = test_impl(s) + np.testing.assert_equal(r, 42) + + def target_data_from(self, device): + target_data_pragma = f"""target data device({device}) + map(from: a)""" + target_pragma = f"target device({device})" + + @njit + def test_impl(): + a = np.ones(10) + with omp(target_data_pragma): + with omp(target_pragma): + for i in range(len(a)): + a[i] = 42 + return a + + a = test_impl() + np.testing.assert_array_equal(a, np.full(10, 42)) + + def target_data_to(self, device): + target_data_pragma = f"""target data device({device}) + map(to: a) map(from: b)""" + target_pragma = f"target device({device})" + + @njit + def test_impl(): + a = np.ones(10) + b = np.zeros(10) + with omp(target_data_pragma): + with omp(target_pragma): + for i in range(len(a)): + a[i] = 42 + b[i] = a[i] + return a, b + + a, b = test_impl() + np.testing.assert_array_equal(a, np.ones(10)) + np.testing.assert_array_equal(b, np.full(10, 42)) + + def target_data_tofrom(self, device): + target_data_pragma = f"""target data device({device}) + map(tofrom: s, a)""" + target_pragma = f"target device({device})" + + @njit + def test_impl(): + s = 0 + a = np.ones(10) + with omp(target_data_pragma): + with omp(target_pragma): + for i in range(len(a)): + a[i] += 41 + s = 42 + return s, a + + s, a = test_impl() + # s is a FIRSTPRIVATE in the target region, so changes do not affect + # host s despite FROM mapping. 
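+        # The array a, by contrast, is mapped and does come back updated:
+        # each element starts at 1 and has 41 added in the target region,
+        # giving 42.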
+ np.testing.assert_equal(s, 0) + np.testing.assert_array_equal(a, np.full(10, 42)) + + def target_data_alloc_from(self, device): + target_data_pragma = f"""target data device({device}) + map(alloc: a) map(from: b)""" + target_pragma = f"target device({device})" + + @njit + def test_impl(): + a = np.ones(10) + b = np.zeros(10) + with omp(target_data_pragma): + with omp(target_pragma): + for i in range(len(a)): + a[i] = 42 + b[i] = a[i] + return a, b + + a, b = test_impl() + np.testing.assert_array_equal(a, np.ones(10)) + np.testing.assert_array_equal(b, np.full(10, 42)) + + def target_data_mix_to_from(self, device): + target_data_pragma = f"""target data device({device}) + map(to: a) map(from: b)""" + target_pragma = f"target device({device})" + + @njit + def test_impl(): + a = np.ones(10) + b = np.ones(10) + with omp(target_data_pragma): + with omp(target_pragma): + for i in range(len(a)): + a[i] = 42 + b[i] = 42 + return a, b + + a, b = test_impl() + np.testing.assert_array_equal(a, np.ones(10)) + np.testing.assert_array_equal(b, np.full(10, 42)) + + def target_update_from(self, device): + target_data_pragma = f"""target data device({device}) + map(to: a)""" + target_pragma = f"target device({device})" + target_update_pragma = f"target update from(a) device({device})" + + @njit + def test_impl(): + a = np.ones(10) + with omp(target_data_pragma): + with omp(target_pragma): + for i in range(len(a)): + a[i] = 42 + with omp(target_update_pragma): + pass + return a + + a = test_impl() + np.testing.assert_array_equal(a, np.full(10, 42)) + + def target_update_to(self, device): + target_data_pragma = f"""target data device({device}) + map(from: a)""" + target_pragma = f"target device({device})" + target_update_pragma = f"target update to(a) device({device})" + + @njit + def test_impl(): + a = np.ones(10) + with omp(target_data_pragma): + a += 1 + + with omp(target_update_pragma): + pass + + with omp(target_pragma): + for i in range(len(a)): + a[i] += 1 + return a + + a = test_impl() + np.testing.assert_array_equal(a, np.full(10, 3)) + + def target_update_to_from(self, device): + target_data_pragma = f"""target data device({device}) + map(to: a)""" + target_pragma = f"target device({device})" + target_update_to_pragma = f"target update to(a) device({device})" + target_update_from_pragma = f"target update from(a) device({device})" + + @njit + def test_impl(): + a = np.ones(10) + with omp(target_data_pragma): + a += 1 + + with omp(target_update_to_pragma): + pass + + with omp(target_pragma): + for i in range(len(a)): + a[i] += 1 + + with omp(target_update_from_pragma): + pass + + a += 1 + return a + + a = test_impl() + np.testing.assert_array_equal(a, np.full(10, 4)) + + # WEIRD: breaks when runs alone, passes if runs with all tests. + def target_enter_exit_data_to_from_hostonly(self, device): + target_enter = f"""target enter data device({device}) + map(to: a)""" + + target_exit = f"""target exit data device({device}) + map(from: a)""" + + @njit + def test_impl(): + a = np.ones(10) + with omp(target_enter): + pass + + a += 1 + + # XXX: Test passes if uncommented! + # with omp("target device(1)"): + # pass + + with omp(target_exit): + pass + + return a + + a = test_impl() + np.testing.assert_array_equal(a, np.full(10, 1)) + + # WEIRD: breaks when runs alone, passes if runs with all tests. 
+ def target_data_tofrom_hostonly(self, device): + target_data = f"""target data device({device}) + map(tofrom: a)""" + + @njit + def test_impl(): + a = np.ones(10) + with omp(target_data): + a += 1 + + # XXX: Test passes if uncommented! + # with omp("target device(1)"): + # pass + + return a + + a = test_impl() + np.testing.assert_array_equal(a, np.full(10, 1)) + + def target_data_update(self, device): + target_pragma = f"target teams distribute parallel for device({device})" + target_data = f"target data map(from:a) device({device})" + target_update = f"target update to(a) device({device})" + + @njit + def test_impl(a): + with omp(target_data): + for rep in range(10): + # Target update resets a to ones. + with omp(target_update): + pass + with omp(target_pragma): + for i in range(len(a)): + a[i] += 1 + + a = np.ones(4) + test_impl(a) + np.testing.assert_array_equal(a, np.full(4, 2)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Abort - unimplemented") + def target_data_nest_multiple_target(self, device): + target_data_pragma = f"""target data device({device}) map(to: a) + map(tofrom: b) map(from: as1, as2, bs1, bs2)""" + target_pragma = f"target device({device})" + + @njit + def test_impl(s, n1, n2): + a = np.full(s, n1) + as1 = np.empty(s, dtype=a.dtype) + as2 = np.empty(s, dtype=a.dtype) + b = n1 + with omp(target_data_pragma): + with omp(target_pragma): + as1[:] = a + bs1 = b + with omp(target_pragma): + for i in range(s): + a[i] = n2 + b = n2 + with omp(target_pragma): + as2[:] = a + bs2 = b + return a, as1, as2, b, bs1, bs2 + + s, n1, n2 = 50, 1, 2 + ao, a1, a2, bo, b1, b2 = test_impl(s, n1, n2) + np.testing.assert_array_equal(ao, np.full(s, n1)) + np.testing.assert_array_equal(a1, np.full(s, n1)) + np.testing.assert_array_equal(a2, np.full(s, n2)) + assert bo == n2 + assert b1 == n1 + assert b2 == n2 + + @unittest.skip("Creates map entries that aren't cleared.") + def target_enter_exit_data_array_sections(self, device): + target_enter_pragma = ( + f"target enter data map(to: a[0:3], b[bstart:bstop]) device({device})" + ) + target_exit_pragma = f"target exit data map(from: a[0:3]) device({device})" + target_pragma = f"target teams distribute parallel for device({device})" + + @njit + def test_impl(): + bstart = 0 + bstop = 3 + a = np.array([1, 2, 3]) + b = np.array([3, 2, 1]) + with omp(target_enter_pragma): + with omp(target_pragma): + for i in range(1): + a[0] = 42 + b[0] = 42 + + with omp(target_exit_pragma): + pass + + return a, b + + a, b = test_impl() + np.testing.assert_array_equal(a, [42, 2, 3]) + np.testing.assert_array_equal(b, [3, 2, 1]) + + def target_enter_exit_data(self, device): + target_enter_pragma = f"""target enter data device({device}) + map(to: scalar) map(to: array)""" + target_exit_pragma = f"""target exit data device({device}) + map(from: scalar, array)""" + target_pragma = f"target device({device})" + + @njit + def test_impl(scalar, array): + with omp(target_enter_pragma): + pass + + with omp(target_pragma): + scalar += 1 + for i in range(len(array)): + array[i] += 1 + + with omp(target_exit_pragma): + pass + + return scalar, array + + n = 10 + s = 42 + a = np.full(n, 42) + r_s, r_a = test_impl(s, a) + # NOTE: This is confusing but spec compliant and matches OpenMP target + # offloading of the C/C++ version: scalar is implicitly a firstprivate + # thus it does not copy back to the host although it is in a "from" map + # of the target exit data directive. + + # TODO: we may want to revise Python behavior and copy back scalar too. 
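+        # Hence the checks below: the device-side increment of the scalar is
+        # discarded (the host still sees 42), while the mapped array comes
+        # back incremented to 43.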
+ np.testing.assert_equal(r_s, 42) + np.testing.assert_array_equal(r_a, np.full(n, 43)) + + def target_enter_exit_data_alloc(self, device): + target_enter_pragma = f"""target enter data device({device}) + map(alloc: a)""" + target_exit_pragma = f"target exit data device({device}) map(from: a)" + target_pragma = f"target device({device})" + + @njit + def test_impl(a): + with omp(target_enter_pragma): + pass + with omp(target_pragma): + for i in range(len(a)): + a[i] = 1 + with omp(target_exit_pragma): + pass + + return a + + n = 100 + a = np.zeros(n) + r = test_impl(a) + np.testing.assert_array_equal(r, np.ones(n)) + + def target_teams_distribute_parallel_for(self, device): + target_pragma = f"""target teams distribute parallel for + device({device}) num_teams(4) + map(tofrom: s, a, sched_team, sched_thread)""" + + @njit + def test_impl(a, sched_team, sched_thread): + s = 42 + with omp(target_pragma): + for i in range(len(a)): + a[i] = 1 + team_id = omp_get_team_num() + sched_team[i] = team_id + thread_id = omp_get_thread_num() + sched_thread[i] = thread_id + if i == 0 and team_id == 0 and thread_id == 0: + s += 1 + return s, a, sched_team, sched_thread + + n = 1024 + a = np.zeros(n) + sched_team = np.zeros(n) + sched_thread = np.zeros(n) + s, r, sched_team, sched_thread = test_impl(a, sched_team, sched_thread) + self.assertEqual(s, 43) + np.testing.assert_array_equal(r, np.ones(n)) + # u_team stores unique ids of teams, c_team stores how many iterations + # each time executed. + u_team, c_team = np.unique(sched_team, return_counts=True) + # u_thread stores unique ids of threads (regardless of team), c_thread + # stores how many iterations threads of the same unique id executed. + u_thread, c_thread = np.unique(sched_thread, return_counts=True) + if device == 0: + # there are 4 teams each with a unique id starting from 0. + self.assertEqual(len(u_team), 4) + np.testing.assert_array_equal(u_team, np.arange(0, len(u_team))) + # each team should execute 1024/4 = 256 iterations. + np.testing.assert_array_equal(c_team, np.full(len(c_team), n / len(u_team))) + # Expect equal number of iterations per thread id across teams. + np.testing.assert_array_equal( + c_thread, np.full(len(u_thread), n / len(u_thread)) + ) + elif device == 1: + self.assertLessEqual(len(u_team), 4) + np.testing.assert_array_equal(u_team, np.arange(0, len(u_team))) + # Divide (integer) n iterations by number of teams and add the + # remainder. + chunk = n // len(u_team) + rem = n % len(u_team) + chunks = np.full(len(u_team), chunk) + chunks[:rem] += 1 + np.testing.assert_array_equal(c_team, chunks) + + # Divide (integer) per team iterations by number of threads and add the + # remainder. 
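+            # For example, a team that got 342 iterations split over 4 thread
+            # ids yields [86, 86, 85, 85]: 342 // 4 = 85 per thread, with the
+            # remainder of 2 added to the first two threads.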
+ chunks_thread = np.zeros(len(u_thread)) + for i in range(len(u_team)): + chunk = chunks[i] // len(u_thread) + rem = chunks[i] % len(u_thread) + chunk_thread = np.full(len(u_thread), chunk) + chunk_thread[:rem] += 1 + chunks_thread += chunk_thread + + np.testing.assert_array_equal(c_thread, chunks_thread) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + @unittest.skip("Fix unexpected QUAL.OMP.THREAD_LIMIT") + def target_teams_nest_distribute_parallel_for(self, device): + target_pragma = f"""target teams device({device}) num_teams(4) + map(tofrom: s, a, sched_team, sched_thread)""" + dist_parfor_pragma = "distribute parallel for num_threads(256)" + + @njit + def test_impl(a, sched_team, sched_thread): + s = 42 + with omp(target_pragma): + with omp(dist_parfor_pragma): + for i in range(len(a)): + a[i] = 1 + team_id = omp_get_team_num() + sched_team[i] = team_id + thread_id = omp_get_thread_num() + sched_thread[i] = thread_id + if i == 0 and team_id == 0 and thread_id == 0: + s += 1 + return s, a, sched_team, sched_thread + + n = 1024 + a = np.zeros(n) + sched_team = np.zeros(n) + sched_thread = np.zeros(n) + s, r, sched_team, sched_thread = test_impl(a, sched_team, sched_thread) + np.testing.assert_equal(s, 43) + np.testing.assert_array_equal(r, np.ones(n)) + u_team, c_team = np.unique(sched_team, return_counts=True) + # there are 4 teams each with a unique id starting from 0. + np.testing.assert_equal(len(u_team), 4) + np.testing.assert_array_equal(u_team, np.arange(0, len(u_team))) + # each team should execute 1024/4 = 256 iterations. + np.testing.assert_array_equal(c_team, np.full(len(c_team), n / len(u_team))) + u_thread, c_thread = np.unique(sched_thread, return_counts=True) + # testing thread scheduling is tricky: OpenMP runtime sets aside a warp + # for the "sequential" target region execution. + # TODO: update tests as newer LLVM version lift the above limitations. + self.assertGreaterEqual(len(u_thread), n / len(u_team) - 32) + for c_thread_i in c_thread: + # threads from team 0 will execute more iterations (see above + # comment on removed warp). 
+ self.assertGreaterEqual(c_thread_i, 4) + + def target_teams_nest_parallel_fpriv_shared_scalar(self, device): + target_pragma = f"target teams num_teams(1) thread_limit(32) device({device}) map(from: threads)" + + @njit + def test_impl(): + s = 42 + r = np.zeros(32) + threads = 0 + with omp(target_pragma): + with omp("parallel firstprivate(s)"): + threadno = omp_get_thread_num() + if threadno == 0: + threads = omp_get_num_threads() + s += 1 + r[threadno] = s + return s, r, threads + + s, r, threads = test_impl() + self.assertEqual(s, 42) + self.assertLessEqual(threads, 32) + np.testing.assert_array_equal(r[:threads], np.full(threads, 43)) + + def target_nest_parallel_float_fpriv(self, device): + target_pragma = f"target device({device}) map(from: r)" + + @njit + def test_impl(): + s = np.float32(42.0) + r = np.float32(0.0) + with omp(target_pragma): + with omp("parallel firstprivate(s)"): + threadno = omp_get_thread_num() + if threadno == 0: + r = s + 1 + return r + + r = test_impl() + np.testing.assert_equal(r, 43.0) + + def target_nest_teams_float_fpriv(self, device): + target_pragma = f"target device({device}) map(from: r)" + + @njit + def test_impl(): + s = np.float32(42.0) + r = np.float32(0.0) + with omp(target_pragma): + with omp("teams firstprivate(s)"): + teamno = omp_get_thread_num() + if teamno == 0: + r = s + 1 + return r + + r = test_impl() + np.testing.assert_equal(r, 43.0) + + @unittest.skip("Frontend codegen error") + def target_teams_nest_parallel_fpriv_shared_array(self, device): + target_pragma = f"target teams num_teams(1) thread_limit(32) device({device})" + + # FIX: frontend fails to emit copy constructor, error: + # add_llvm_module is not supported on the CUDACodelibrary + # QUESTION: in which address space does the copy constructor create the copy on the GPU? 
+ @njit + def test_impl(): + s = np.zeros(32) + with omp(target_pragma): + with omp("parallel firstprivate(s)"): + print("parallel s", s[0]) + teams = omp_get_num_teams() + threads = omp_get_num_threads() + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + print("teams", teams, "threads", threads) + + test_impl() + input("ok?") + + def target_teams_shared_array(self, device): + target_pragma = f"target teams num_teams(10) map(tofrom: a) map(from: nteams) device({device})" + + @njit + def test_impl(): + a = np.zeros(10, dtype=np.int32) + nteams = 0 + + with omp(target_pragma): + team_shared_array = np.empty(10, dtype=np.int32) + team_id = omp_get_team_num() + + if team_id == 0: + nteams = omp_get_num_teams() + + for i in range(10): + team_shared_array[i] = team_id + + lasum = 0 + for i in range(10): + lasum += team_shared_array[i] + a[team_id] = lasum + + return a, nteams + + r, nteams = test_impl() + expected = np.arange(10) * 10 + if device == 0: + np.testing.assert_array_equal(r, expected) + elif device == 1: + np.testing.assert_array_equal(r[:nteams], expected[:nteams]) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_teams_shared_array_2d(self, device): + target_pragma = f"target teams num_teams(10) map(tofrom: a) map(from: nteams) device({device})" + + @njit + def test_impl(): + a = np.zeros((10, 2, 2), dtype=np.int32) + nteams = 0 + + with omp(target_pragma): + team_shared_array = np.empty((2, 2), dtype=np.int32) + team_id = omp_get_team_num() + + if team_id == 0: + nteams = omp_get_num_teams() + + for i in range(2): + for j in range(2): + team_shared_array[i, j] = team_id + + for i in range(2): + for j in range(2): + a[team_id, i, j] = team_shared_array[i, j] + return a, nteams + + a, nteams = test_impl() + expected = np.empty((10, 2, 2)) + for i in range(10): + expected[i] = np.full((2, 2), i) + if device == 0: + np.testing.assert_array_equal(a, expected) + elif device == 1: + np.testing.assert_array_equal(a[:nteams], expected[:nteams]) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_local_array(self, device): + target_pragma = f"target teams num_teams(1) map(tofrom: a) map(from: nthreads) device({device})" + + @njit + def test_impl(): + a = np.zeros((32, 10), dtype=np.int32) + nthreads = 0 + with omp(target_pragma): + with omp("parallel num_threads(32)"): + local_array = np.empty(10, dtype=np.int32) + tid = omp_get_thread_num() + if tid == 0: + nthreads = omp_get_num_threads() + for i in range(10): + local_array[i] = tid + for i in range(10): + a[tid, i] = local_array[i] + return a, nthreads + + a, nthreads = test_impl() + expected = np.empty((32, 10), dtype=np.int32) + for i in range(32): + expected[i] = [i] * 10 + if device == 0: + self.assertEqual(nthreads, 32) + np.testing.assert_array_equal(a, expected) + elif device == 1: + # CPU num_threads are capped by number of cores, which can be less + # than the provided value. + self.assertLessEqual(nthreads, 32) + np.testing.assert_array_equal(a[:nthreads], expected[:nthreads]) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_teams_parallel_shared_array(self, device): + target_pragma = f"target teams num_teams(10) map(tofrom: a) map(from: nteams, nthreads) device({device})" + + @njit + def test_impl(): + # save data from 10 teams each of 32 threads (maximally). 
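+            # Row index is the team id, column index is the thread id; each
+            # entry ends up holding the thread id (cf. the np.tile
+            # expectation below).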
+ a = np.zeros((10, 32), dtype=np.int32) + nteams = 0 + nthreads = 0 + + with omp(target_pragma): + team_shared_array = np.empty(32, dtype=np.int32) + team_id = omp_get_team_num() + if team_id == 0: + nteams = omp_get_num_teams() + nthreads = omp_get_num_threads() + + with omp("parallel num_threads(32)"): + thread_local_array = np.empty(10, dtype=np.int32) + for i in range(10): + thread_local_array[i] = omp_get_thread_num() + + lasum = 0 + for i in range(10): + lasum += thread_local_array[i] + team_shared_array[omp_get_thread_num()] = lasum / 10 + + for i in range(32): + a[team_id, i] = team_shared_array[i] + + return a, nteams, nthreads + + r, nteams, nthreads = test_impl() + expected = np.tile(np.arange(32), (10, 1)) + if device == 0: + np.testing.assert_array_equal(r, expected) + elif device == 1: + np.testing.assert_array_equal( + r[:nteams, :nthreads], expected[:nteams, :nthreads] + ) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_teams_loop_collapse(self, device): + target_pragma = f"""target teams loop collapse(2) + device({device}) + map(tofrom: a, b, c)""" + + @njit + def test_impl(n): + a = np.ones((n, n)) + b = np.ones((n, n)) + c = np.zeros((n, n)) + with omp(target_pragma): + for i in range(n): + for j in range(n): + c[i, j] = a[i, j] + b[i, j] + return c + + n = 10 + c = test_impl(n) + np.testing.assert_array_equal(c, np.full((n, n), 2)) + + def target_nest_teams_nest_loop_collapse(self, device): + target_pragma = f"""target device({device}) map(tofrom: a, b, c)""" + + @njit + def test_impl(n): + a = np.ones((n, n)) + b = np.ones((n, n)) + c = np.zeros((n, n)) + with omp(target_pragma): + with omp("teams"): + with omp("loop collapse(2)"): + for i in range(n): + for j in range(n): + c[i, j] = a[i, j] + b[i, j] + return c + + n = 10 + c = test_impl(n) + np.testing.assert_array_equal(c, np.full((n, n), 2)) + + +for memberName in dir(TestOpenmpTarget): + if memberName.startswith("target"): + test_func = getattr(TestOpenmpTarget, memberName) + + def make_func_with_subtest(func): + def func_with_subtest(self): + for device in TestOpenmpTarget.devices: + with self.subTest(device=device): + func(self, device) + + return func_with_subtest + + setattr( + TestOpenmpTarget, + "test_" + test_func.__name__, + make_func_with_subtest(test_func), + ) + + +class TestOpenmpPi(TestOpenmpBase): + def __init__(self, *args): + TestOpenmpBase.__init__(self, *args) + + def test_pi_loop(self): + def test_impl(num_steps): + step = 1.0 / num_steps + + the_sum = 0.0 + omp_set_num_threads(4) + + with omp("parallel"): + with omp("for reduction(+:the_sum) schedule(static)"): + for j in range(num_steps): + x = ((j - 1) - 0.5) * step + the_sum += 4.0 / (1.0 + x * x) + + pi = step * the_sum + return pi + + self.check(test_impl, 100000) + + def test_pi_loop_combined(self): + def test_impl(num_steps): + step = 1.0 / num_steps + + the_sum = 0.0 + omp_set_num_threads(4) + + with omp("parallel for reduction(+:the_sum) schedule(static)"): + for j in range(num_steps): + x = ((j - 1) - 0.5) * step + the_sum += 4.0 / (1.0 + x * x) + + pi = step * the_sum + return pi + + self.check(test_impl, 100000) + + def test_pi_loop_directive(self): + def test_impl(num_steps): + step = 1.0 / num_steps + + the_sum = 0.0 + omp_set_num_threads(4) + + with omp("loop reduction(+:the_sum) schedule(static)"): + for j in range(num_steps): + x = ((j - 1) - 0.5) * step + the_sum += 4.0 / (1.0 + x * x) + + pi = step * the_sum + return pi + + self.check(test_impl, 100000) + + def test_pi_spmd(self): + def 
test_impl(num_steps): + step = 1.0 / num_steps + MAX_THREADS = 8 + tsum = np.zeros(MAX_THREADS) + + j = 4 + omp_set_num_threads(j) + full_sum = 0.0 + + with omp("parallel private(tid, numthreads, local_sum, x)"): + tid = omp_get_thread_num() + numthreads = omp_get_num_threads() + local_sum = 0.0 + + for i in range(tid, num_steps, numthreads): + x = (i + 0.5) * step + local_sum += 4.0 / (1.0 + x * x) + + tsum[tid] = local_sum + + for k in range(j): + full_sum += tsum[k] + + pi = step * full_sum + return pi + + self.check(test_impl, 10000000) + + def test_pi_task(self): + def test_pi_comp(Nstart, Nfinish, step): + MIN_BLK = 256 + pi_sum = 0.0 + if Nfinish - Nstart < MIN_BLK: + for i in range(Nstart, Nfinish): + x = (i + 0.5) * step + pi_sum += 4.0 / (1.0 + x * x) + else: + iblk = Nfinish - Nstart + pi_sum1 = 0.0 + pi_sum2 = 0.0 + cut = Nfinish - (iblk // 2) + with omp("task shared(pi_sum1)"): + pi_sum1 = test_pi_comp(Nstart, cut, step) + with omp("task shared(pi_sum2)"): + pi_sum2 = test_pi_comp(cut, Nfinish, step) + with omp("taskwait"): + pi_sum = pi_sum1 + pi_sum2 + return pi_sum + + @njit + def test_pi_comp_njit(Nstart, Nfinish, step): + MIN_BLK = 256 + pi_sum = 0.0 + if Nfinish - Nstart < MIN_BLK: + for i in range(Nstart, Nfinish): + x = (i + 0.5) * step + pi_sum += 4.0 / (1.0 + x * x) + else: + iblk = Nfinish - Nstart + pi_sum1 = 0.0 + pi_sum2 = 0.0 + cut = Nfinish - (iblk // 2) + with omp("task shared(pi_sum1)"): + pi_sum1 = test_pi_comp_njit(Nstart, cut, step) + with omp("task shared(pi_sum2)"): + pi_sum2 = test_pi_comp_njit(cut, Nfinish, step) + with omp("taskwait"): + pi_sum = pi_sum1 + pi_sum2 + return pi_sum + + def test_impl(lb, num_steps, pi_comp_func): + step = 1.0 / num_steps + + j = 4 + omp_set_num_threads(j) + full_sum = 0.0 + + with omp("parallel"): + with omp("single"): + full_sum = pi_comp_func(lb, num_steps, step) + + pi = step * full_sum + return pi + + py_output = test_impl(0, 1024, test_pi_comp) + njit_output = njit(test_impl)(0, 1024, test_pi_comp_njit) + self.assert_outputs_equal(py_output, njit_output) + + +if __name__ == "__main__": + unittest.main() From 7f3ba0a3f9dc921674d55566fd4f4c7e3644fd3a Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Tue, 22 Apr 2025 03:51:10 -0700 Subject: [PATCH 03/28] Cleanup openmp testing - Fix override_config - Rename openmp_context alias to openmp instead of omp to align with examples and documentation --- numba/openmp/tests/test_openmp.py | 971 +++++++++++++++--------------- 1 file changed, 486 insertions(+), 485 deletions(-) diff --git a/numba/openmp/tests/test_openmp.py b/numba/openmp/tests/test_openmp.py index 17aba2cb8b02..37ff05930ea3 100644 --- a/numba/openmp/tests/test_openmp.py +++ b/numba/openmp/tests/test_openmp.py @@ -74,9 +74,10 @@ skip_unless_scipy, needs_subprocess, ) -import numba.openmp as openmp + +import numba.openmp from numba.openmp import njit -from numba.openmp import openmp_context as omp +from numba.openmp import openmp_context as openmp from numba.openmp import ( omp_set_num_threads, omp_get_thread_num, @@ -195,12 +196,12 @@ def override_config(name, value): *name* to *value*. *name* must be the name of an existing variable in openmp. 
""" - old_value = getattr(openmp, name) - setattr(openmp, name, value) + old_value = getattr(numba.openmp, name) + setattr(numba.openmp, name, value) try: yield finally: - setattr(openmp, name, old_value) + setattr(numba.openmp, name, old_value) # @needs_subprocess @@ -440,9 +441,9 @@ def test_impl(): omp_set_dynamic(0) o_nt = omp_get_max_threads() count = 0 - with omp("parallel"): + with openmp("parallel"): i_nt = omp_get_max_threads() - with omp("critical"): + with openmp("critical"): count += 1 return count, i_nt, o_nt @@ -457,9 +458,9 @@ def test_impl(): omp_set_dynamic(0) o_nt = omp_get_num_threads() count = 0 - with omp("parallel"): + with openmp("parallel"): i_nt = omp_get_num_threads() - with omp("critical"): + with openmp("critical"): count += 1 return (count, i_nt), o_nt @@ -476,12 +477,12 @@ def test_impl(n1, n2): omp_set_num_threads(n1) count1 = 0 count2 = 0 - with omp("parallel"): - with omp("critical"): + with openmp("parallel"): + with openmp("critical"): count1 += 1 omp_set_num_threads(n2) - with omp("parallel"): - with omp("critical"): + with openmp("parallel"): + with openmp("critical"): count2 += 1 return count1, count2 @@ -497,17 +498,17 @@ def test_impl(n1, n2, n3): omp_set_max_active_levels(2) omp_set_num_threads(n2) count1, count2, count3 = 0, 0, 0 - with omp("parallel num_threads(n1)"): - with omp("single"): - with omp("parallel"): - with omp("single"): + with openmp("parallel num_threads(n1)"): + with openmp("single"): + with openmp("parallel"): + with openmp("single"): omp_set_num_threads(n3) - with omp("parallel"): - with omp("critical"): + with openmp("parallel"): + with openmp("critical"): count3 += 1 - with omp("critical"): + with openmp("critical"): count2 += 1 - with omp("critical"): + with openmp("critical"): count1 += 1 return count1, count2, count3 @@ -521,13 +522,13 @@ def test_func_get_ancestor_thread_num(self): @njit def test_impl(): oa = omp_get_ancestor_thread_num(0) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): m1 = omp_get_ancestor_thread_num(0) f1 = omp_get_ancestor_thread_num(1) s1 = omp_get_ancestor_thread_num(2) tn1 = omp_get_thread_num() - with omp("parallel"): + with openmp("parallel"): m2 = omp_get_ancestor_thread_num(0) f2 = omp_get_ancestor_thread_num(1) s2 = omp_get_ancestor_thread_num(2) @@ -545,14 +546,14 @@ def test_func_get_team_size(self): def test_impl(n1, n2): omp_set_max_active_levels(2) oa = omp_get_team_size(0) - with omp("parallel num_threads(n1)"): - with omp("single"): + with openmp("parallel num_threads(n1)"): + with openmp("single"): m1 = omp_get_team_size(0) f1 = omp_get_team_size(1) s1 = omp_get_team_size(2) nt1 = omp_get_num_threads() - with omp("parallel num_threads(n2)"): - with omp("single"): + with openmp("parallel num_threads(n2)"): + with openmp("single"): m2 = omp_get_team_size(0) f2 = omp_get_team_size(1) s2 = omp_get_team_size(2) @@ -570,11 +571,11 @@ def test_func_get_level(self): @njit def test_impl(): oa = omp_get_level() - with omp("parallel if(0)"): + with openmp("parallel if(0)"): f = omp_get_level() - with omp("parallel num_threads(1)"): + with openmp("parallel num_threads(1)"): s = omp_get_level() - with omp("parallel"): + with openmp("parallel"): t = omp_get_level() return oa, f, s, t @@ -585,11 +586,11 @@ def test_func_get_active_level(self): @njit def test_impl(): oa = omp_get_active_level() - with omp("parallel if(0)"): + with openmp("parallel if(0)"): f = omp_get_active_level() - with omp("parallel num_threads(1)"): + with openmp("parallel 
num_threads(1)"): s = omp_get_active_level() - with omp("parallel"): + with openmp("parallel"): t = omp_get_active_level() return oa, f, s, t @@ -604,14 +605,14 @@ def test_impl(): omp_set_dynamic(0) omp_set_max_active_levels(1) # 1 because first region is inactive oa = omp_in_parallel() - with omp("parallel num_threads(1)"): + with openmp("parallel num_threads(1)"): ia = omp_in_parallel() - with omp("parallel"): + with openmp("parallel"): n1a = omp_in_parallel() - with omp("single"): - with omp("parallel"): + with openmp("single"): + with openmp("parallel"): n2a = omp_in_parallel() - with omp("parallel if(0)"): + with openmp("parallel if(0)"): ua = omp_in_parallel() return oa, ia, n1a, n2a, ua @@ -629,13 +630,13 @@ def test_impl(N, c): a = np.arange(N)[::-1] fa = np.zeros(N) fia = np.zeros(N) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): for i in range(len(a)): e = a[i] - with omp("task final(e >= c)"): + with openmp("task final(e >= c)"): fa[i] = omp_in_final() - with omp("task"): + with openmp("task"): fia[i] = omp_in_final() return fa, fia @@ -651,7 +652,7 @@ def __init__(self, *args): def test_parallel_for_set_elements(self): def test_impl(v): - with omp("parallel for"): + with openmp("parallel for"): for i in range(len(v)): v[i] = 1.0 return v @@ -660,8 +661,8 @@ def test_impl(v): def test_separate_parallel_for_set_elements(self): def test_impl(v): - with omp("parallel"): - with omp("for"): + with openmp("parallel"): + with openmp("for"): for i in range(len(v)): v[i] = 1.0 return v @@ -671,7 +672,7 @@ def test_impl(v): def test_parallel_for_const_var_omp_statement(self): def test_impl(v): ovar = "parallel for" - with omp(ovar): + with openmp(ovar): for i in range(len(v)): v[i] = 1.0 return v @@ -681,7 +682,7 @@ def test_impl(v): def test_parallel_for_string_conditional(self): def test_impl(S): capitalLetters = 0 - with omp("parallel for reduction(+:capitalLetters)"): + with openmp("parallel for reduction(+:capitalLetters)"): for i in range(len(S)): if S[i].isupper(): capitalLetters += 1 @@ -692,7 +693,7 @@ def test_impl(S): def test_parallel_for_tuple(self): def test_impl(t): len_total = 0 - with omp("parallel for reduction(+:len_total)"): + with openmp("parallel for reduction(+:len_total)"): for i in range(len(t)): len_total += len(t[i]) return len_total @@ -702,7 +703,7 @@ def test_impl(t): def test_parallel_for_range_step_2(self): def test_impl(N): a = np.zeros(N, dtype=np.int32) - with omp("parallel for"): + with openmp("parallel for"): for i in range(0, len(a), 2): a[i] = i + 1 @@ -713,7 +714,7 @@ def test_impl(N): def test_parallel_for_range_step_arg(self): def test_impl(N, step): a = np.zeros(N, dtype=np.int32) - with omp("parallel for"): + with openmp("parallel for"): for i in range(0, len(a), step): a[i] = i + 1 @@ -725,7 +726,7 @@ def test_parallel_for_incremented_step(self): @njit def test_impl(v, n): for i in range(n): - with omp("parallel for"): + with openmp("parallel for"): for j in range(0, len(v), i + 1): v[j] = i + 1 return v @@ -735,7 +736,7 @@ def test_impl(v, n): def test_parallel_for_range_backward_step(self): def test_impl(N): a = np.zeros(N, dtype=np.int32) - with omp("parallel for"): + with openmp("parallel for"): for i in range(N - 1, -1, -1): a[i] = i + 1 @@ -747,7 +748,7 @@ def test_impl(N): def test_parallel_for_dictionary(self): def test_impl(N, c): l = {} - with omp("parallel for"): + with openmp("parallel for"): for i in range(N): l[i] = i % c return l @@ -757,8 +758,8 @@ def test_impl(N, c): def 
test_parallel_for_num_threads(self): def test_impl(nt): a = np.zeros(nt) - with omp("parallel num_threads(nt)"): - with omp("for"): + with openmp("parallel num_threads(nt)"): + with openmp("for"): for i in range(nt): a[i] = i return a @@ -769,8 +770,8 @@ def test_parallel_for_only_inside_var(self): @njit def test_impl(nt): a = np.zeros(nt) - with omp("parallel num_threads(nt) private(x)"): - with omp("for private(x)"): + with openmp("parallel num_threads(nt) private(x)"): + with openmp("for private(x)"): for i in range(nt): x = 0 # print("out:", i, x, i + x, nt) @@ -786,10 +787,10 @@ def test_parallel_for_ordered(self): def test_impl(N, c): a = np.zeros(N) b = np.zeros(N) - with omp("parallel for ordered"): + with openmp("parallel for ordered"): for i in range(1, N): b[i] = b[i - 1] + c - with omp("ordered"): + with openmp("ordered"): a[i] = a[i - 1] + c return a @@ -806,7 +807,7 @@ def test_impl(n1, n2, n3): ia = np.zeros(n1) ja = np.zeros((n1, n2)) ka = np.zeros((n1, n2, n3)) - with omp("parallel for collapse(2)"): + with openmp("parallel for collapse(2)"): for i in range(n1): ia[i] = omp_get_thread_num() for j in range(n2): @@ -837,7 +838,7 @@ def test_impl(N, nt): v = np.zeros(N) step = -2 omp_set_num_threads(nt) - with omp("parallel private(thread_num)"): + with openmp("parallel private(thread_num)"): running_omp = omp_in_parallel() thread_num = omp_get_thread_num() if not running_omp: @@ -847,7 +848,7 @@ def test_impl(N, nt): for t in range(N): f = itersPerThread*(t+1)-1 + min(iters%itersPerThread, t+1) finishToThread[f] = t - with omp("for schedule(static)"): + with openmp("for schedule(static)"): for index, i in enumerate(range(N-1, N%2 - 1, -2)): if not running_omp: for finish in finishToThread.keys(): @@ -866,7 +867,7 @@ def test_avg_sched_const(self): def test_impl(n, a): b = np.zeros(n) nt = 5 - with omp("parallel for num_threads(nt) schedule(static, 4)"): + with openmp("parallel for num_threads(nt) schedule(static, 4)"): for i in range(1, n): b[i] = (a[i] + a[i - 1]) / 2.0 @@ -880,7 +881,7 @@ def test_impl(n, a): b = np.zeros(n) nt = 5 ss = 4 - with omp("parallel for num_threads(nt) schedule(static, ss)"): + with openmp("parallel for num_threads(nt) schedule(static, ss)"): for i in range(1, n): b[i] = (a[i] + a[i - 1]) / 2.0 @@ -892,7 +893,7 @@ def test_static_distribution(self): @njit def test_impl(nt, c): a = np.empty(nt * c) - with omp("parallel for num_threads(nt) schedule(static)"): + with openmp("parallel for num_threads(nt) schedule(static)"): for i in range(nt * c): a[i] = omp_get_thread_num() return a @@ -909,7 +910,7 @@ def test_static_chunk_distribution(self): @njit def test_impl(nt, c, cs): a = np.empty(nt * c) - with omp("parallel for num_threads(nt) schedule(static, cs)"): + with openmp("parallel for num_threads(nt) schedule(static, cs)"): for i in range(nt * c): a[i] = omp_get_thread_num() return a @@ -929,11 +930,11 @@ def test_static_consistency(self): def test_impl(nt, c, cs): a = np.empty(nt * c) b = np.empty(nt * c) - with omp("parallel num_threads(8)"): - with omp("for schedule(static)"): + with openmp("parallel num_threads(8)"): + with openmp("for schedule(static)"): for i in range(nt * c): a[i] = omp_get_thread_num() - with omp("for schedule(static)"): + with openmp("for schedule(static)"): for i in range(nt * c): b[i] = omp_get_thread_num() return a, b @@ -946,7 +947,7 @@ def test_dynamic_distribution(self): @njit def test_impl(nt, c, cs): a = np.empty(nt * c) - with omp("parallel for num_threads(nt) schedule(dynamic)"): + with openmp("parallel for 
num_threads(nt) schedule(dynamic)"): for i in range(nt * c): a[i] = omp_get_thread_num() return a @@ -971,7 +972,7 @@ def test_guided_distribution(self): @njit def test_impl(nt, c, cs): a = np.empty(nt * c) - with omp("parallel for num_threads(nt) schedule(guided, cs)"): + with openmp("parallel for num_threads(nt) schedule(guided, cs)"): for i in range(nt * c): a[i] = omp_get_thread_num() return a @@ -1007,18 +1008,18 @@ def test_impl(N, c1, c2): n_count = 0 nc_count = 0 a_count = 0 - with omp("parallel num_threads(N) shared(c2)"): - with omp("critical"): + with openmp("parallel num_threads(N) shared(c2)"): + with openmp("critical"): d_count += 1 - with omp("parallel"): - with omp("critical"): + with openmp("parallel"): + with openmp("critical"): n_count += 1 - with omp("single"): - with omp("parallel num_threads(6)"): - with omp("critical"): + with openmp("single"): + with openmp("parallel num_threads(6)"): + with openmp("critical"): nc_count += 1 - with omp("parallel"): - with omp("critical"): + with openmp("parallel"): + with openmp("critical"): a_count += 1 return d_count, a_count, n_count, nc_count @@ -1041,11 +1042,11 @@ def test_impl(s): omp_set_num_threads(s) omp_set_dynamic(0) - with omp("parallel for if(rp)"): + with openmp("parallel for if(rp)"): for i in range(s): ar[omp_get_thread_num()] = 1 par[i] = omp_in_parallel() - with omp("parallel for if(drp)"): + with openmp("parallel for if(drp)"): for i in range(s): adr[omp_get_thread_num()] = 1 padr[i] = omp_in_parallel() @@ -1065,7 +1066,7 @@ def test_impl(n, a): b = np.zeros(n) omp_set_num_threads(5) - with omp("parallel for"): + with openmp("parallel for"): for i in range(1, n): b[i] = (a[i] + a[i - 1]) / 2.0 return b @@ -1075,7 +1076,7 @@ def test_impl(n, a): def test_avg_num_threads_clause(self): def test_impl(n, a): b = np.zeros(n) - with omp("parallel for num_threads(5)"): + with openmp("parallel for num_threads(5)"): for i in range(1, n): b[i] = (a[i] + a[i - 1]) / 2.0 @@ -1087,7 +1088,7 @@ def test_avg_num_threads_clause_var(self): def test_impl(n, a): b = np.zeros(n) nt = 5 - with omp("parallel for num_threads(nt)"): + with openmp("parallel for num_threads(nt)"): for i in range(1, n): b[i] = (a[i] + a[i - 1]) / 2.0 @@ -1101,7 +1102,7 @@ def test_avg_if_const(self): def test_impl(n, a): b = np.zeros(n) nt = 5 - with omp("parallel for if(1) num_threads(nt) schedule(static, 4)"): + with openmp("parallel for if(1) num_threads(nt) schedule(static, 4)"): for i in range(1, n): b[i] = (a[i] + a[i - 1]) / 2.0 @@ -1116,7 +1117,7 @@ def test_impl(n, a): nt = 5 ss = 4 do_if = 1 - with omp("parallel for if(do_if) num_threads(nt) schedule(static, ss)"): + with openmp("parallel for if(do_if) num_threads(nt) schedule(static, ss)"): for i in range(1, n): b[i] = (a[i] + a[i - 1]) / 2.0 @@ -1127,8 +1128,8 @@ def test_impl(n, a): def test_teams1(self): def test_impl(): a = 1 - with omp("teams"): - with omp("parallel"): + with openmp("teams"): + with openmp("parallel"): a = 123 return a @@ -1144,7 +1145,7 @@ def test_parallel_reduction_add_int(self): def test_impl(): redux = 0 nthreads = 0 - with omp("parallel reduction(+:redux)"): + with openmp("parallel reduction(+:redux)"): thread_id = omp_get_thread_num() if thread_id == 0: nthreads = omp_get_num_threads() @@ -1160,7 +1161,7 @@ def test_parallel_reduction_sub_int(self): def test_impl(): redux = 0 nthreads = 0 - with omp("parallel reduction(-:redux)"): + with openmp("parallel reduction(-:redux)"): thread_id = omp_get_thread_num() if thread_id == 0: nthreads = omp_get_num_threads() @@ 
-1176,7 +1177,7 @@ def test_parallel_reduction_mul_int(self): def test_impl(): redux = 1 nthreads = 0 - with omp("parallel reduction(*:redux) num_threads(8)"): + with openmp("parallel reduction(*:redux) num_threads(8)"): thread_id = omp_get_thread_num() if thread_id == 0: nthreads = omp_get_num_threads() @@ -1192,7 +1193,7 @@ def test_parallel_reduction_add_fp64(self): def test_impl(): redux = np.float64(0.0) nthreads = np.float64(0.0) - with omp("parallel reduction(+:redux)"): + with openmp("parallel reduction(+:redux)"): thread_id = omp_get_thread_num() if thread_id == 0: nthreads = omp_get_num_threads() @@ -1208,7 +1209,7 @@ def test_parallel_reduction_sub_fp64(self): def test_impl(): redux = np.float64(0.0) nthreads = np.float64(0.0) - with omp("parallel reduction(-:redux)"): + with openmp("parallel reduction(-:redux)"): thread_id = omp_get_thread_num() if thread_id == 0: nthreads = omp_get_num_threads() @@ -1224,7 +1225,7 @@ def test_parallel_reduction_mul_fp64(self): def test_impl(): redux = np.float64(1.0) nthreads = np.float64(0.0) - with omp("parallel reduction(*:redux) num_threads(8)"): + with openmp("parallel reduction(*:redux) num_threads(8)"): thread_id = omp_get_thread_num() if thread_id == 0: nthreads = omp_get_num_threads() @@ -1240,7 +1241,7 @@ def test_parallel_reduction_add_fp32(self): def test_impl(): redux = np.float32(0.0) nthreads = np.float32(0.0) - with omp("parallel reduction(+:redux)"): + with openmp("parallel reduction(+:redux)"): thread_id = omp_get_thread_num() if thread_id == 0: nthreads = omp_get_num_threads() @@ -1256,7 +1257,7 @@ def test_parallel_reduction_sub_fp32(self): def test_impl(): redux = np.float32(0.0) nthreads = np.float32(0.0) - with omp("parallel reduction(-:redux)"): + with openmp("parallel reduction(-:redux)"): thread_id = omp_get_thread_num() if thread_id == 0: nthreads = omp_get_num_threads() @@ -1272,7 +1273,7 @@ def test_parallel_reduction_mul_fp32(self): def test_impl(): redux = np.float32(1.0) nthreads = np.float32(0.0) - with omp("parallel reduction(*:redux) num_threads(8)"): + with openmp("parallel reduction(*:redux) num_threads(8)"): thread_id = omp_get_thread_num() if thread_id == 0: nthreads = omp_get_num_threads() @@ -1287,7 +1288,7 @@ def test_parallel_for_reduction_add_int(self): @njit def test_impl(): redux = 0 - with omp("parallel for reduction(+:redux)"): + with openmp("parallel for reduction(+:redux)"): for i in range(10): redux += 1 return redux @@ -1299,7 +1300,7 @@ def test_parallel_for_reduction_sub_int(self): @njit def test_impl(): redux = 0 - with omp("parallel for reduction(-:redux)"): + with openmp("parallel for reduction(-:redux)"): for i in range(10): redux += 1 return redux @@ -1311,7 +1312,7 @@ def test_parallel_for_reduction_mul_int(self): @njit def test_impl(): redux = 1 - with omp("parallel for reduction(*:redux)"): + with openmp("parallel for reduction(*:redux)"): for i in range(10): redux *= 2 return redux @@ -1323,7 +1324,7 @@ def test_parallel_for_reduction_add_fp64(self): @njit def test_impl(): redux = np.float64(0.0) - with omp("parallel for reduction(+:redux)"): + with openmp("parallel for reduction(+:redux)"): for i in range(10): redux += np.float64(1.0) return redux @@ -1335,7 +1336,7 @@ def test_parallel_for_reduction_sub_fp64(self): @njit def test_impl(): redux = np.float64(0.0) - with omp("parallel for reduction(-:redux)"): + with openmp("parallel for reduction(-:redux)"): for i in range(10): redux += np.float64(1.0) return redux @@ -1347,7 +1348,7 @@ def 
test_parallel_for_reduction_mul_fp64(self): @njit def test_impl(): redux = np.float64(1.0) - with omp("parallel for reduction(*:redux)"): + with openmp("parallel for reduction(*:redux)"): for i in range(10): redux *= np.float64(2.0) return redux @@ -1359,7 +1360,7 @@ def test_parallel_for_reduction_add_fp32(self): @njit def test_impl(): redux = np.float32(0.0) - with omp("parallel for reduction(+:redux)"): + with openmp("parallel for reduction(+:redux)"): for i in range(10): redux += np.float32(1.0) return redux @@ -1371,7 +1372,7 @@ def test_parallel_for_reduction_sub_fp32(self): @njit def test_impl(): redux = np.float32(0.0) - with omp("parallel for reduction(-:redux)"): + with openmp("parallel for reduction(-:redux)"): for i in range(10): redux += np.float32(1.0) return redux @@ -1383,7 +1384,7 @@ def test_parallel_for_reduction_mul_fp32(self): @njit def test_impl(): redux = np.float32(1.0) - with omp("parallel for reduction(*:redux)"): + with openmp("parallel for reduction(*:redux)"): for i in range(10): redux *= np.float32(2.0) return redux @@ -1396,7 +1397,7 @@ def test_parallel_reduction_add_int_10(self): def test_impl(): redux = 10 nthreads = 0 - with omp("parallel reduction(+:redux)"): + with openmp("parallel reduction(+:redux)"): thread_id = omp_get_thread_num() if thread_id == 0: nthreads = omp_get_num_threads() @@ -1412,7 +1413,7 @@ def test_parallel_reduction_add_fp32_10(self): def test_impl(): redux = np.float32(10.0) nthreads = np.float32(0.0) - with omp("parallel reduction(+:redux)"): + with openmp("parallel reduction(+:redux)"): thread_id = omp_get_thread_num() if thread_id == 0: nthreads = omp_get_num_threads() @@ -1428,7 +1429,7 @@ def test_parallel_reduction_add_fp64_10(self): def test_impl(): redux = np.float64(10.0) nthreads = np.float64(0.0) - with omp("parallel reduction(+:redux)"): + with openmp("parallel reduction(+:redux)"): thread_id = omp_get_thread_num() if thread_id == 0: nthreads = omp_get_num_threads() @@ -1443,7 +1444,7 @@ def test_parallel_for_reduction_add_int_10(self): @njit def test_impl(): redux = 10 - with omp("parallel for reduction(+:redux)"): + with openmp("parallel for reduction(+:redux)"): for i in range(10): redux += 1 return redux @@ -1455,7 +1456,7 @@ def test_parallel_for_reduction_add_fp32(self): @njit def test_impl(): redux = np.float32(0.0) - with omp("parallel for reduction(+:redux)"): + with openmp("parallel for reduction(+:redux)"): for i in range(10): redux += np.float32(1.0) return redux @@ -1467,7 +1468,7 @@ def test_parallel_for_reduction_add_fp64_10(self): @njit def test_impl(): redux = np.float64(10.0) - with omp("parallel for reduction(+:redux)"): + with openmp("parallel for reduction(+:redux)"): for i in range(10): redux += np.float64(1.0) return redux @@ -1485,7 +1486,7 @@ def test_default_none(self): def test_impl(N): a = np.zeros(N, dtype=np.int32) x = 7 - with omp("parallel for default(none)"): + with openmp("parallel for default(none)"): for i in range(N): y = i + x a[i] = y @@ -1504,16 +1505,16 @@ def test_impl(N, M): y = np.zeros(N) z = 3.14 i = 7 - with omp("parallel private(i)"): + with openmp("parallel private(i)"): yn = M + 1 zs = z - with omp("for"): + with openmp("for"): for i in range(N): y[i] = yn + 2 * (i + 1) - with omp("for"): + with openmp("for"): for i in range(N): x[i] = y[i] - i - with omp("critical"): + with openmp("critical"): z += 3 return x, y, zs, z, i @@ -1540,10 +1541,10 @@ def test_impl(): vals = np.zeros(NTHREADS) valsfp = np.zeros(NTHREADS) - with omp("""parallel private(x) shared(zsh) + with 
openmp("""parallel private(x) shared(zsh) firstprivate(zfp) private(ID)"""): ID = omp_get_thread_num() - with omp("single"): + with openmp("single"): nsing = nsing + 1 numthrds = omp_get_num_threads() if y != 3: @@ -1555,7 +1556,7 @@ def test_impl(): ) # verify each thread sees the same variable vsh - with omp("critical"): + with openmp("critical"): zsh = zsh + ID # test first private @@ -1608,7 +1609,7 @@ def test_privates(self): def test_impl(N): a = np.zeros(N, dtype=np.int32) x = 7 - with omp("""parallel for firstprivate(x) private(y) + with openmp("""parallel for firstprivate(x) private(y) lastprivate(zzzz) private(private_index) shared(a) firstprivate(N) default(none)"""): for private_index in range(N): @@ -1624,7 +1625,7 @@ def test_private_retain_value(self): @njit def test_impl(): x = 5 - with omp("parallel private(x)"): + with openmp("parallel private(x)"): x = 13 return x @@ -1633,7 +1634,7 @@ def test_impl(): def test_private_retain_value_param(self): @njit def test_impl(x): - with omp("parallel private(x)"): + with openmp("parallel private(x)"): x = 13 return x @@ -1643,8 +1644,8 @@ def test_private_retain_value_for(self): @njit def test_impl(): x = 5 - with omp("parallel private(x)"): - with omp("for"): + with openmp("parallel private(x)"): + with openmp("for"): for i in range(10): x = i return x @@ -1654,8 +1655,8 @@ def test_impl(): def test_private_retain_value_for_param(self): @njit def test_impl(x): - with omp("parallel private(x)"): - with omp("for"): + with openmp("parallel private(x)"): + with openmp("for"): for i in range(10): x = i return x @@ -1666,7 +1667,7 @@ def test_private_retain_value_combined_for(self): @njit def test_impl(): x = 5 - with omp("parallel for private(x)"): + with openmp("parallel for private(x)"): for i in range(10): x = i return x @@ -1676,7 +1677,7 @@ def test_impl(): def test_private_retain_value_combined_for_param(self): @njit def test_impl(x): - with omp("parallel for private(x)"): + with openmp("parallel for private(x)"): for i in range(10): x = i return x @@ -1688,7 +1689,7 @@ def test_private_retain_two_values(self): def test_impl(): x = 5 y = 7 - with omp("parallel private(x,y)"): + with openmp("parallel private(x,y)"): x = 13 y = 40 return x, y @@ -1699,11 +1700,11 @@ def test_private_retain_array(self): @njit def test_impl(N, x): a = np.ones(N) - with omp("parallel private(a)"): - with omp("single"): + with openmp("parallel private(a)"): + with openmp("single"): sa = a a = np.zeros(N) - with omp("for"): + with openmp("for"): for i in range(N): a[i] = x return a, sa @@ -1717,7 +1718,7 @@ def test_private_divide_work(self): def test_impl(v, npoints): omp_set_num_threads(3) - with omp("""parallel default(shared) + with openmp("""parallel default(shared) private(iam,nt,ipoints,istart)"""): iam = omp_get_thread_num() nt = omp_get_num_threads() @@ -1734,7 +1735,7 @@ def test_impl(v, npoints): def test_firstprivate(self): @njit def test_impl(x, y): - with omp("parallel firstprivate(x)"): + with openmp("parallel firstprivate(x)"): xs = x x = y return xs, x @@ -1747,7 +1748,7 @@ def test_lastprivate_for(self): def test_impl(N): a = np.zeros(N) si = 0 - with omp("parallel for lastprivate(si)"): + with openmp("parallel for lastprivate(si)"): for i in range(N): si = i + 1 a[i] = si @@ -1763,7 +1764,7 @@ def test_lastprivate_non_one_step(self): def test_impl(n1, n2, s): a = np.zeros(math.ceil((n2 - n1) / s)) rl = np.arange(n1, n2, s) - with omp("parallel for lastprivate(si)"): + with openmp("parallel for lastprivate(si)"): for i in 
range(len(rl)): si = rl[i] + 1 a[i] = si @@ -1780,32 +1781,32 @@ def test_lastprivate_sections(self): @njit def test_impl(N2, si): a = np.zeros(N2) - with omp("parallel shared(sis1)"): - with omp("sections lastprivate(si)"): + with openmp("parallel shared(sis1)"): + with openmp("sections lastprivate(si)"): sis1 = si # N1 = number of sections - with omp("section"): + with openmp("section"): si = 0 - with omp("section"): + with openmp("section"): si = 1 - with omp("section"): + with openmp("section"): si = 2 sis2 = si - with omp("sections lastprivate(si)"): + with openmp("sections lastprivate(si)"): # N2 = number of sections - with omp("section"): + with openmp("section"): i = 0 si = N2 - i a[i] = si - with omp("section"): + with openmp("section"): i = 1 si = N2 - i a[i] = si - with omp("section"): + with openmp("section"): i = 2 si = N2 - i a[i] = si - with omp("section"): + with openmp("section"): i = 3 si = N2 - i a[i] = si @@ -1824,8 +1825,8 @@ def test_lastprivate_conditional(self): def test_impl(N, c1, c2): a = np.arange(0, N * 2, c2) num = 0 - with omp("parallel"): - with omp("for lastprivate(conditional: num)"): + with openmp("parallel"): + with openmp("for lastprivate(conditional: num)"): for i in range(N): if i < c1: num = a[i] + c2 @@ -1842,14 +1843,14 @@ def test_impl(N, c): a = np.zeros(N) ra = np.zeros(N) val = 0 - with omp("threadprivate(val)"): + with openmp("threadprivate(val)"): pass - with omp("parallel private(tn, sn)"): + with openmp("parallel private(tn, sn)"): tn = omp_get_thread_num() sn = c + tn val = sn a[tn] = sn - with omp("parallel private(tn)"): + with openmp("parallel private(tn)"): tn = omp_get_thread_num() ra[tn] = 1 if val == a[tn] else 0 return ra @@ -1864,15 +1865,15 @@ def test_impl(nt, n1, n2, n3): xsa1 = np.zeros(nt) xsa2 = np.zeros(nt) x = n1 - with omp("threadprivate(x)"): + with openmp("threadprivate(x)"): pass x = n2 - with omp("parallel num_threads(nt) copyin(x) private(tn)"): + with openmp("parallel num_threads(nt) copyin(x) private(tn)"): tn = omp_get_thread_num() xsa1[tn] = x if tn == 0: x = n3 - with omp("parallel copyin(x)"): + with openmp("parallel copyin(x)"): xsa2[omp_get_thread_num()] = x return xsa1, xsa2 @@ -1889,15 +1890,15 @@ def test_impl(nt1, nt2, mt, n1, n2, n3): xsa1 = np.zeros(nt1) xsa2 = np.zeros(nt2) x = n1 - with omp("threadprivate(x)"): + with openmp("threadprivate(x)"): pass x = n2 - with omp("parallel num_threads(nt1) copyin(x) private(tn)"): + with openmp("parallel num_threads(nt1) copyin(x) private(tn)"): tn = omp_get_thread_num() xsa1[tn] = x if tn == mt: x = n3 - with omp("parallel num_threads(nt2) copyin(x)"): + with openmp("parallel num_threads(nt2) copyin(x)"): xsa2[omp_get_thread_num()] = x return xsa1, xsa2 @@ -1915,8 +1916,8 @@ def test_impl(nt, n1, n2, n3): xsa = np.zeros(nt) ar = np.zeros(nt) omp_set_num_threads(nt) - with omp("parallel firstprivate(x, a) private(tn)"): - with omp("single copyprivate(x, a)"): + with openmp("parallel firstprivate(x, a) private(tn)"): + with openmp("single copyprivate(x, a)"): x = n2 a = np.full(nt, n3) tn = omp_get_thread_num() @@ -1937,7 +1938,7 @@ def test_impl(N): b = np.zeros(N // 2) linearj = 0 - with omp("parallel for linear(linearj:1)"): + with openmp("parallel for linear(linearj:1)"): for i in range(0, N, 2): b[linearj] = a[i] * 2 @@ -1959,7 +1960,7 @@ def __init__(self, *args): def test_parallel_for_no_for_loop(self): @njit def test_impl(): - with omp("parallel for"): + with openmp("parallel for"): pass with self.assertRaises(ParallelForWrongLoopCount) as raises: @@ 
-1973,7 +1974,7 @@ def test_parallel_for_multiple_for_loops(self): @njit def test_impl(): a = np.zeros(4) - with omp("parallel for"): + with openmp("parallel for"): for i in range(2): a[i] = 1 for i in range(2, 4): @@ -1990,7 +1991,7 @@ def test_statement_before_parallel_for(self): @njit def test_impl(): a = np.zeros(4) - with omp("parallel for"): + with openmp("parallel for"): print("Fail") for i in range(4): a[i] = i @@ -2004,7 +2005,7 @@ def test_statement_after_parallel_for(self): @njit def test_impl(): a = np.zeros(4) - with omp("parallel for"): + with openmp("parallel for"): for i in range(4): a[i] = i print("Fail") @@ -2019,7 +2020,7 @@ def test_nonstring_var_omp_statement(self): @njit def test_impl(v): ovar = 7 - with omp(ovar): + with openmp(ovar): for i in range(len(v)): v[i] = 1.0 return v @@ -2033,7 +2034,7 @@ def test_parallel_for_nonconst_var_omp_statement(self): def test_impl(v): ovar = "parallel " ovar += "for" - with omp(ovar): + with openmp(ovar): for i in range(len(v)): v[i] = 1.0 return v @@ -2048,10 +2049,10 @@ def test_impl(v): # @njit # def test_impl(): # n = 0 - # with omp("parallel"): + # with openmp("parallel"): # half_threads = omp_get_num_threads()//2 # if omp_get_thread_num() < half_threads: - # with omp("for reduction(+:n)"): + # with openmp("for reduction(+:n)"): # for _ in range(half_threads): # n += 1 # return n @@ -2065,11 +2066,11 @@ def test_parallel_for_delaying_condition(self): @njit def test_impl(): n = 0 - with omp("parallel private(lc)"): + with openmp("parallel private(lc)"): lc = 0 while lc < omp_get_thread_num(): lc += 1 - with omp("for reduction(+:n)"): + with openmp("for reduction(+:n)"): for _ in range(omp_get_num_threads()): n += 1 return n @@ -2080,7 +2081,7 @@ def test_parallel_for_nowait(self): @njit def test_impl(nt): a = np.zeros(nt) - with omp("parallel for num_threads(nt) nowait"): + with openmp("parallel for num_threads(nt) nowait"): for i in range(nt): a[omp_get_thread_num] = i return a @@ -2093,8 +2094,8 @@ def test_parallel_double_num_threads(self): @njit def test_impl(nt1, nt2): count = 0 - with omp("parallel num_threads(nt1) num_threads(nt2)"): - with omp("critical"): + with openmp("parallel num_threads(nt1) num_threads(nt2)"): + with openmp("critical"): count += 1 print(count) return count @@ -2108,14 +2109,14 @@ def test_impl(nt): hp = nt // 2 a = np.zeros(hp) b = np.zeros(nt - hp) - with omp("parallel num_threads(nt) private(tn)"): + with openmp("parallel num_threads(nt) private(tn)"): tn = omp_get_thread_num() if tn < hp: - with omp("barrier"): + with openmp("barrier"): pass a[tn] = 1 else: - with omp("barrier"): + with openmp("barrier"): pass b[tn - hp] = 1 return a, b @@ -2129,10 +2130,10 @@ def test_closely_nested_for_loops(self): @njit def test_impl(N): a = np.zeros((N, N)) - with omp("parallel"): - with omp("for"): + with openmp("parallel"): + with openmp("for"): for i in range(N): - with omp("for"): + with openmp("for"): for j in range(N): a[i][j] = 1 return a @@ -2145,10 +2146,10 @@ def test_nested_critical(self): @njit def test_impl(): num = 0 - with omp("parallel"): - with omp("critical"): + with openmp("parallel"): + with openmp("critical"): num += 1 - with omp("critical"): + with openmp("critical"): num -= 1 return num @@ -2164,7 +2165,7 @@ def test_parallel_region(self): @njit def test_impl(): a = 1 - with omp("parallel"): + with openmp("parallel"): a += 1 test_impl() @@ -2174,8 +2175,8 @@ def test_single(self): def test_impl(nt): omp_set_num_threads(nt) a = np.zeros(4, dtype=np.int64) - with omp("parallel"): - 
with omp("single"): + with openmp("parallel"): + with openmp("single"): a[0] += 1 return a @@ -2187,8 +2188,8 @@ def test_master(self): def test_impl(nt): omp_set_num_threads(nt) a = np.ones(4, dtype=np.int64) - with omp("parallel"): - with omp("master"): + with openmp("parallel"): + with openmp("master"): a[0] += omp_get_thread_num() return a @@ -2201,12 +2202,12 @@ def test_impl(N, iters): count = 0 p = 0 sum = 0 - with omp("parallel"): - with omp("barrier"): + with openmp("parallel"): + with openmp("barrier"): pass - with omp("for private(p, sum)"): + with openmp("for private(p, sum)"): for _ in range(iters): - with omp("critical"): + with openmp("critical"): p = count sum = 0 for i in range(10000): @@ -2227,11 +2228,11 @@ def test_impl(N): omp_set_num_threads(N) ca = np.zeros(N) sum = 0 - with omp("parallel private(sum) shared(c)"): + with openmp("parallel private(sum) shared(c)"): c = N - with omp("barrier"): + with openmp("barrier"): pass - with omp("critical"): + with openmp("critical"): ca[omp_get_thread_num()] = c - 1 # Sleep sum = 0 @@ -2251,12 +2252,12 @@ def test_critical_result(self): def test_impl(N): omp_set_num_threads(N) count = 0 - with omp("parallel"): + with openmp("parallel"): if omp_get_thread_num() < N // 2: - with omp("critical"): + with openmp("critical"): count += 1 else: - with omp("critical"): + with openmp("critical"): count += 1 return count @@ -2270,11 +2271,11 @@ def test_impl(N): omp_set_num_threads(N) a = np.zeros((2, N)) sa = np.zeros(N) - with omp("parallel private(a0c, sum, tn)"): + with openmp("parallel private(a0c, sum, tn)"): tn = omp_get_thread_num() - with omp("barrier"): + with openmp("barrier"): pass - with omp("critical (a)"): + with openmp("critical (a)"): # Sleep sum = 0 for j in range(1000): @@ -2283,7 +2284,7 @@ def test_impl(N): else: sum -= 1 a[0][tn] = 1 + sum - with omp("critical (b)"): + with openmp("critical (b)"): a0c = np.copy(a[0]) # Sleep sum = 0 @@ -2308,12 +2309,12 @@ def test_impl(N): # count = 0 # p = 0 # sum = 0 - # with omp("parallel"): - # with omp("barrier"): + # with openmp("parallel"): + # with openmp("barrier"): # pass - # with omp("for private(p, sum)"): + # with openmp("for private(p, sum)"): # for _ in range(iters): - # with omp("atomic"): + # with openmp("atomic"): # p = count # sum = 0 # for i in range(10000): @@ -2333,19 +2334,19 @@ def test_atomic(self): def test_impl(nt, N, c): omp_set_num_threads(nt) a = np.zeros(N) - with omp("parallel for private(b, index)"): + with openmp("parallel for private(b, index)"): for i in range(nt): b = 0 index = i % N - with omp("atomic write"): + with openmp("atomic write"): a[index] = nt % c - with omp("barrier"): + with openmp("barrier"): pass - with omp("atomic read"): + with openmp("atomic read"): b = a[index - 1] + index - with omp("barrier"): + with openmp("barrier"): pass - with omp("atomic update"): + with openmp("atomic update"): a[index] += b return a @@ -2382,15 +2383,15 @@ def test_impl(nt, N, c): a = np.zeros(s) sva = np.zeros(N) tns = np.zeros(N) - with omp("parallel for num_threads(nt) private(sv, index)"): + with openmp("parallel for num_threads(nt) private(sv, index)"): for i in range(N): index = i % s tns[i] = omp_get_thread_num() - with omp("atomic write"): + with openmp("atomic write"): a[index] = index * c + 1 - with omp("barrier"): + with openmp("barrier"): pass - with omp("atomic capture"): + with openmp("atomic capture"): sv = a[index - 1] a[index - 1] += sv + (tns[i] % c + 1) # sva[index] = sv @@ -2414,11 +2415,11 @@ def test_impl(nt): ta1 = np.zeros(nt) 
secpa = np.zeros(nt) - with omp("parallel sections num_threads(nt)"): - with omp("section"): + with openmp("parallel sections num_threads(nt)"): + with openmp("section"): ta0[omp_get_thread_num()] += 1 secpa[0] = omp_in_parallel() - with omp("section"): + with openmp("section"): ta1[omp_get_thread_num()] += 1 secpa[1] = omp_in_parallel() print(ta0, ta1) @@ -2440,15 +2441,15 @@ def test_impl(nt, iters, c): x = iters // c iters = x * c sum = 0 - with omp("parallel num_threads(nt) private(tn, sum)"): + with openmp("parallel num_threads(nt) private(tn, sum)"): tn = omp_get_thread_num() - with omp("critical"): + with openmp("critical"): sum = 0 for i in range(iters): if i % x == 0: sum += 1 a[tn] = sum - with omp("barrier"): + with openmp("barrier"): pass for j in range(nt): ac[tn][j] = a[j] @@ -2467,9 +2468,9 @@ def test_impl(nt, iters, c): # b = np.zeros(n) # ac = np.zeros((nt, n)) # sum = 0 - # with omp("parallel num_threads(nt) private(tn)"): + # with openmp("parallel num_threads(nt) private(tn)"): # tn = omp_get_thread_num() - # with omp("for nowait schedule(static) private(sum)"): + # with openmp("for nowait schedule(static) private(sum)"): # for i in range(n): # # Sleep # sum = 0 @@ -2481,7 +2482,7 @@ def test_impl(nt, iters, c): # a[i] = i * c1 + sum # for j in range(nt): # ac[tn][j] = a[j] - # with omp("for schedule(static)"): + # with openmp("for schedule(static)"): # for i in range(n): # b[i] = a[i] + c2 # return b, ac @@ -2496,11 +2497,11 @@ def test_impl(nt, iters, c): # def test_impl(n, m, a, b, y, z): # omp_set_num_threads(5) # - # with omp("parallel"): - # with omp("for nowait"): + # with openmp("parallel"): + # with openmp("for nowait"): # for i in range(1, n): # b[i] = (a[i] + a[i-1]) / 2.0 - # with omp("for nowait"): + # with openmp("for nowait"): # for i in range(m): # y[i] = math.sqrt(z[i]) # @@ -2516,9 +2517,9 @@ def test_impl(nt): omp_set_nested(1) omp_set_dynamic(0) a = np.zeros((nt, nt), dtype=np.int32) - with omp("parallel for"): + with openmp("parallel for"): for i in range(nt): - with omp("parallel for"): + with openmp("parallel for"): for j in range(nt): a[i][j] = omp_get_thread_num() return a @@ -2535,12 +2536,12 @@ def test_impl(nt1, nt2): omp_set_max_active_levels(2) ca = np.zeros(nt1) omp_set_num_threads(nt1) - with omp("parallel private(tn)"): + with openmp("parallel private(tn)"): tn = omp_get_thread_num() - with omp("parallel num_threads(3)"): - with omp("critical"): + with openmp("parallel num_threads(3)"): + with openmp("critical"): ca[tn] += 1 - with omp("single"): + with openmp("single"): ats = omp_get_ancestor_thread_num(1) == tn ts = omp_get_team_size(1) return ca, ats, ts @@ -2569,10 +2570,10 @@ def test_impl(mal, n1, n2, n3): omp_set_num_threads(n1) a = np.zeros((n2, 6), dtype=np.int32) b = np.zeros((n1, 6), dtype=np.int32) - with omp("parallel"): + with openmp("parallel"): omp_set_num_threads(n2) - with omp("single"): - with omp("parallel"): + with openmp("single"): + with openmp("parallel"): omp_set_num_threads(n3) set_array(a) set_array(b) @@ -2593,7 +2594,7 @@ def test_impl(N): omp_set_dynamic(0) omp_set_num_threads(N) a = np.zeros((N, 2), dtype=np.int32) - with omp("parallel private(tn)"): + with openmp("parallel private(tn)"): tn = omp_get_thread_num() a[tn][0] = 1 a[tn][1] = 2 @@ -2612,10 +2613,10 @@ def __init__(self, *args): def test_task_basic(self): def test_impl(ntsks): a = np.zeros(ntsks) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): for i in range(ntsks): - with omp("task"): + with 
openmp("task"): a[i] = 1 return a @@ -2626,10 +2627,10 @@ def test_task_thread_assignment(self): @njit def test_impl(ntsks): a = np.empty(ntsks) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): for i in range(ntsks): - with omp("task"): + with openmp("task"): a[i] = omp_get_thread_num() return a @@ -2641,14 +2642,14 @@ def test_task_data_sharing_default(self): @njit def test_impl(n1, n2): x = n1 - with omp("parallel private(y)"): + with openmp("parallel private(y)"): y = n1 - with omp("single"): - with omp("task"): + with openmp("single"): + with openmp("task"): xa = x == n1 ya = y == n1 x, y = n2, n2 - with omp("taskwait"): + with openmp("taskwait"): ysave = y return (x, ysave), (xa, ya) @@ -2662,10 +2663,10 @@ def test_task_single_implicit_barrier(self): @njit def test_impl(ntsks): a = np.zeros(ntsks) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): for i in range(ntsks): - with omp("task private(sum)"): + with openmp("task private(sum)"): # Sleep sum = 0 for j in range(10000): @@ -2674,7 +2675,7 @@ def test_impl(ntsks): else: sum -= 1 a[i] = 1 + sum - # with omp("barrier"): + # with openmp("barrier"): # pass sa = np.copy(a) return sa @@ -2688,10 +2689,10 @@ def test_task_single_nowait(self): @njit def test_impl(ntsks): a = np.zeros(ntsks) - with omp("parallel"): - with omp("single nowait"): + with openmp("parallel"): + with openmp("single nowait"): for i in range(ntsks): - with omp("task private(sum)"): + with openmp("task private(sum)"): sum = 0 for j in range(10000): if j % 2 == 0: @@ -2715,14 +2716,14 @@ def test_impl(nt): omp_set_num_threads(nt) a = np.zeros((nt + 1) * nt / 2) # a = np.zeros(10) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): for tn in range(nt): - with omp("task"): + with openmp("task"): for i in range(tn + 1): - with omp("task"): + with openmp("task"): a[i] = omp_get_thread_num() + 1 - with omp("barrier"): + with openmp("barrier"): ret = np.all(a) return ret @@ -2731,10 +2732,10 @@ def test_impl(nt): def test_taskwait(self): def test_impl(ntsks): a = np.zeros(ntsks) - with omp("parallel private(i)"): - with omp("single"): + with openmp("parallel private(i)"): + with openmp("single"): for i in range(ntsks): - with omp("task private(sum) private(j)"): + with openmp("task private(sum) private(j)"): sum = 0 for j in range(10000): if j % 2 == 0: @@ -2742,7 +2743,7 @@ def test_impl(ntsks): else: sum -= 1 a[i] = 1 + sum - with omp("taskwait"): + with openmp("taskwait"): ret = np.all(a) return ret @@ -2754,13 +2755,13 @@ def test_taskwait_descendants(self): def test_impl(ntsks, dtsks): a = np.zeros(ntsks) da = np.zeros((ntsks, dtsks)) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): for i in range(ntsks): - with omp("task"): + with openmp("task"): a[i] = 1 for j in range(dtsks): - with omp("task private(sum)"): + with openmp("task private(sum)"): sum = 0 for k in range(10000): if k % 2 == 0: @@ -2768,10 +2769,10 @@ def test_impl(ntsks, dtsks): else: sum -= 1 da[i][j] = 1 + sum - with omp("taskwait"): + with openmp("taskwait"): ac = np.copy(a) dac = np.copy(da) - with omp("barrier"): + with openmp("barrier"): pass return ac, dac @@ -2784,10 +2785,10 @@ def test_impl(ntsks, dtsks): def test_undeferred_task(self): @njit def test_impl(): - with omp("parallel"): + with openmp("parallel"): flag = 1 - with omp("single"): - with omp("task if(1) private(sum)"): + with openmp("single"): + 
with openmp("task if(1) private(sum)"): sum = 0 for i in range(10000): if i % 2 == 0: @@ -2806,12 +2807,12 @@ def test_untied_task_thread_assignment(self): def test_impl(ntsks): start_nums = np.zeros(ntsks) current_nums = np.zeros(ntsks) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): for i in range(ntsks): - with omp("task untied private(sum)"): + with openmp("task untied private(sum)"): start_nums[i] = omp_get_thread_num() - with omp("task if(0) shared(sum)"): + with openmp("task if(0) shared(sum)"): # Sleep sum = 0 for j in range(10000): @@ -2820,7 +2821,7 @@ def test_impl(ntsks): else: sum -= 1 current_nums[i] = omp_get_thread_num() + sum - with omp("barrier"): + with openmp("barrier"): pass return start_nums, current_nums @@ -2835,10 +2836,10 @@ def test_impl(ntsks): start_nums = np.zeros(ntsks) finish_nums = np.zeros(ntsks) yielded_tasks = np.zeros(ntsks) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): for i in range(ntsks): - with omp("task private(stn, start_i, finish_i, diff)"): + with openmp("task private(stn, start_i, finish_i, diff)"): stn = omp_get_thread_num() start_i = np.where(start_nums == stn)[0] finish_i = np.where(finish_nums == stn)[0] @@ -2852,10 +2853,10 @@ def test_impl(ntsks): for dindex in diff[diff != 0]: yielded_tasks[dindex] = 1 start_nums[i] = stn - with omp("taskyield"): + with openmp("taskyield"): pass finish_nums[i] = omp_get_thread_num() - with omp("barrier"): + with openmp("barrier"): pass return yielded_tasks @@ -2869,16 +2870,16 @@ def test_impl(ntsks, c): final_nums = np.zeros(ntsks) included_nums = np.zeros(ntsks) da = np.zeros(ntsks) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): for i in range(ntsks): - with omp("task final(i>c) private(sum, d)"): + with openmp("task final(i>c) private(sum, d)"): ftask_num = i final_nums[ftask_num] = omp_get_thread_num() # If it is a final task, generate an included task if ftask_num > c: d = 1 - with omp("task private(sum)"): + with openmp("task private(sum)"): itask_num = ftask_num # Sleep sum = 0 @@ -2903,13 +2904,13 @@ def test_taskgroup(self): @njit def test_impl(ntsks, dtsks): a = np.zeros(ntsks) - with omp("parallel"): - with omp("single"): - with omp("taskgroup"): + with openmp("parallel"): + with openmp("single"): + with openmp("taskgroup"): for i in range(ntsks): - with omp("task"): + with openmp("task"): for _ in range(dtsks): - with omp("task"): + with openmp("task"): # Sleep sum = 0 for j in range(10000): @@ -2932,10 +2933,10 @@ def test_task_priority(self): def test_impl(ntsks): a = np.zeros(ntsks) count = 0 - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): for i in range(ntsks): - with omp("task priority(i)"): + with openmp("task priority(i)"): count += i + 1 a[i] = count return a @@ -2952,12 +2953,12 @@ def test_task_mergeable(self): @njit def test_impl(ntsks, c1, c2): a = np.zeros(ntsks) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): for i in range(ntsks): - with omp("task private(x)"): + with openmp("task private(x)"): x = c1 - with omp("task mergeable if(0)"): + with openmp("task mergeable if(0)"): x = c2 a[i] = x return a @@ -2970,23 +2971,23 @@ def test_task_depend(self): def test_impl(ntsks): a = np.zeros(ntsks) da = np.zeros(ntsks) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): for i in 
range(ntsks): - with omp("task private(x, done)"): + with openmp("task private(x, done)"): x = 1 done = False - with omp("task shared(x) depend(out: x)"): + with openmp("task shared(x) depend(out: x)"): x = 5 - with omp("""task shared(done, x) + with openmp("""task shared(done, x) depend(out: done) depend(inout: x)"""): x += i done = True - with omp("""task shared(done, x) + with openmp("""task shared(done, x) depend(in: done) depend(inout: x)"""): x *= i da[i] = 1 if done else 0 - with omp("task shared(x) depend(in: x)"): + with openmp("task shared(x) depend(in: x)"): a[i] = x return a, da @@ -2997,14 +2998,14 @@ def test_impl(ntsks): def test_task_affinity(self): def test_impl(ntsks, const): a = np.zeros(ntsks) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): for i in range(ntsks): - with omp("task firstprivate(i)"): - with omp("""task shared(b) depend(out: b) + with openmp("task firstprivate(i)"): + with openmp("""task shared(b) depend(out: b) affinity(a)"""): b = np.full(i, const) - with omp("""task shared(b) depend(in: b) + with openmp("""task shared(b) depend(in: b) affinity(a)"""): a[i] = np.sum(b) return a @@ -3017,17 +3018,17 @@ def test_impl(mode): return b = np.zeros(100) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): a = np.ones(100) c = 0 d = 0 if mode > 1: - with omp("task shared(a, c)"): + with openmp("task shared(a, c)"): c = a.sum() - with omp("task shared(a, d)"): + with openmp("task shared(a, d)"): d = a.sum() - with omp("taskwait"): + with openmp("taskwait"): b[:] = c + d return b @@ -3045,9 +3046,9 @@ def __init__(self, *args): def test_taskloop_basic(self): def test_impl(ntsks): a = np.zeros(ntsks) - with omp("parallel"): - with omp("single"): - with omp("taskloop"): + with openmp("parallel"): + with openmp("single"): + with openmp("taskloop"): for i in range(ntsks): a[i] = 1 return a @@ -3058,9 +3059,9 @@ def test_taskloop_num_tasks(self): @njit def test_impl(nt, iters, ntsks): a = np.zeros(ntsks) - with omp("parallel num_threads(nt)"): - with omp("single"): - with omp("taskloop num_tasks(ntsks)"): + with openmp("parallel num_threads(nt)"): + with openmp("single"): + with openmp("taskloop num_tasks(ntsks)"): for i in range(iters): a[i] = omp_get_thread_num() return a @@ -3072,10 +3073,10 @@ def test_taskloop_grainsize(self): @njit def test_impl(nt, iters, ntsks): a = np.zeros(ntsks) - with omp("parallel num_threads(nt)"): - with omp("single"): + with openmp("parallel num_threads(nt)"): + with openmp("single"): iters_per_task = iters // ntsks - with omp("taskloop grainsize(iters_per_task)"): + with openmp("taskloop grainsize(iters_per_task)"): for i in range(iters): a[i] = omp_get_thread_num() return a @@ -3088,14 +3089,14 @@ def test_taskloop_nogroup(self): def test_impl(ntsks): a = np.zeros(ntsks) sa = np.zeros(ntsks) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): s = 0 - with omp("taskloop nogroup num_tasks(ntsks)"): + with openmp("taskloop nogroup num_tasks(ntsks)"): for i in range(ntsks): a[i] = 1 sa[i] = s - with omp("task priority(1)"): + with openmp("task priority(1)"): s = 1 return a, sa @@ -3111,9 +3112,9 @@ def test_impl(ntsks, nt): sl = np.zeros(ntsks) tl = np.zeros(ntsks) omp_set_num_threads(nt) - with omp("parallel"): - with omp("single"): - with omp("taskloop collapse(2) num_tasks(ntsks)"): + with openmp("parallel"): + with openmp("single"): + with openmp("taskloop collapse(2) num_tasks(ntsks)"): for i in 
range(ntsks): fl[i] = omp_get_thread_num() for j in range(1): @@ -3162,7 +3163,7 @@ def target_nowait(self, device): @njit def test_impl(): - with omp(target_pragma): + with openmp(target_pragma): a = 0 for i in range(1000000): for j in range(1000000): @@ -3178,8 +3179,8 @@ def target_nest_parallel_default_threadlimit(self, device): def test_impl(): teams = 0 threads = 0 - with omp(target_pragma): - with omp("parallel"): + with openmp(target_pragma): + with openmp("parallel"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: @@ -3198,8 +3199,8 @@ def target_nest_parallel_set_numthreads(self, device): def test_impl(): teams = 0 threads = 0 - with omp(target_pragma): - with omp("parallel num_threads(32)"): + with openmp(target_pragma): + with openmp("parallel num_threads(32)"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: @@ -3218,8 +3219,8 @@ def target_nest_teams_default_numteams(self, device): def test_impl(): teams = 0 threads = 0 - with omp(target_pragma): - with omp("teams"): + with openmp(target_pragma): + with openmp("teams"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: @@ -3246,8 +3247,8 @@ def target_nest_teams_set_numteams(self, device): def test_impl(): teams = 0 threads = 0 - with omp(target_pragma): - with omp("teams num_teams(32)"): + with openmp(target_pragma): + with openmp("teams num_teams(32)"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: @@ -3271,9 +3272,9 @@ def target_nest_teams_nest_parallel_default_numteams_threadlimit(self, device): def test_impl(): teams = 0 threads = 0 - with omp(target_pragma): - with omp("teams"): - with omp("parallel"): + with openmp(target_pragma): + with openmp("teams"): + with openmp("parallel"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: @@ -3300,9 +3301,9 @@ def target_nest_teams_nest_parallel_set_numteams(self, device): def test_impl(): teams = 0 threads = 0 - with omp(target_pragma): - with omp("teams num_teams(32)"): - with omp("parallel"): + with openmp(target_pragma): + with openmp("teams num_teams(32)"): + with openmp("parallel"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: @@ -3326,9 +3327,9 @@ def target_nest_teams_nest_parallel_set_threadlimit(self, device): def test_impl(): teams = 0 threads = 0 - with omp(target_pragma): - with omp("teams thread_limit(32)"): - with omp("parallel"): + with openmp(target_pragma): + with openmp("teams thread_limit(32)"): + with openmp("parallel"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: @@ -3355,9 +3356,9 @@ def target_nest_teams_nest_parallel_set_numteams_threadlimit(self, device): def test_impl(): teams = 0 threads = 0 - with omp(target_pragma): - with omp("teams num_teams(32) thread_limit(32)"): - with omp("parallel"): + with openmp(target_pragma): + with openmp("teams num_teams(32) thread_limit(32)"): + with openmp("parallel"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: @@ -3385,9 +3386,9 @@ def target_nest_teams_nest_parallel_set_numteams_threadlimit_gt_numthreads( def test_impl(): teams = 0 threads = 0 - with omp(target_pragma): - with omp("teams num_teams(32) thread_limit(64)"): - with omp("parallel num_threads(32)"): + with openmp(target_pragma): + with openmp("teams num_teams(32) 
thread_limit(64)"): + with openmp("parallel num_threads(32)"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: @@ -3415,10 +3416,10 @@ def target_nest_teams_nest_parallel_set_numteams_threadlimit_lt_numthreads( def test_impl(): teams = 0 threads = 0 - with omp(target_pragma): + with openmp(target_pragma): # THREAD_LIMIT takes precedence over NUM_THREADS. - with omp("teams num_teams(32) thread_limit(64)"): - with omp("parallel num_threads(128)"): + with openmp("teams num_teams(32) thread_limit(64)"): + with openmp("parallel num_threads(128)"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: @@ -3448,14 +3449,14 @@ def test_impl(): threads1 = 0 teams2 = 0 threads2 = 0 - with omp(target_pragma): - with omp("parallel num_threads(32)"): + with openmp(target_pragma): + with openmp("parallel num_threads(32)"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: teams1 = omp_get_num_teams() threads1 = omp_get_num_threads() - with omp("parallel num_threads(256)"): + with openmp("parallel num_threads(256)"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: @@ -3480,14 +3481,14 @@ def test_impl(): threads1 = 0 teams2 = 0 threads2 = 0 - with omp(target_pragma): - with omp("parallel"): + with openmp(target_pragma): + with openmp("parallel"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: teams1 = omp_get_num_teams() threads1 = omp_get_num_threads() - with omp("parallel"): + with openmp("parallel"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: @@ -3511,15 +3512,15 @@ def test_impl(): threads1 = 0 teams2 = 0 threads2 = 0 - with omp(target_pragma): + with openmp(target_pragma): max_threads = omp_get_max_threads() - with omp("parallel"): + with openmp("parallel"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: teams1 = omp_get_num_teams() threads1 = omp_get_num_threads() - with omp("parallel num_threads(256)"): + with openmp("parallel num_threads(256)"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: @@ -3550,8 +3551,8 @@ def target_nest_parallel(self, device): @njit def test_impl(): a = np.zeros(32, dtype=np.int64) - with omp(target_pragma): - with omp(parallel_pragma): + with openmp(target_pragma): + with openmp(parallel_pragma): thread_id = omp_get_thread_num() a[thread_id] = 1 return a @@ -3568,8 +3569,8 @@ def target_parallel_for_range_step_arg(self, device): @njit def test_impl(): a = np.zeros(N, dtype=np.int32) - with omp(target_pragma): - with omp(parallel_pragma): + with openmp(target_pragma): + with openmp(parallel_pragma): for i in range(0, len(a), step): a[i] = i + 1 @@ -3588,8 +3589,8 @@ def target_parallel_for_incremented_step(self, device): def test_impl(): a = np.zeros(N, dtype=np.int32) for i in range(step_range): - with omp(target_pragma): - with omp(parallel_pragma): + with openmp(target_pragma): + with openmp(parallel_pragma): for j in range(0, len(a), i + 1): a[j] = i + 1 return a @@ -3606,7 +3607,7 @@ def target_teams(self, device): def test_impl(): a = np.zeros(100, dtype=np.int64) nteams = 0 - with omp(target_pragma): + with openmp(target_pragma): team_id = omp_get_team_num() if team_id == 0: nteams = omp_get_num_teams() @@ -3628,8 +3629,8 @@ def target_nest_teams(self, device): def test_impl(): a = np.zeros(100, 
dtype=np.int64) nteams = 0 - with omp(target_pragma): - with omp("teams num_teams(100)"): + with openmp(target_pragma): + with openmp("teams num_teams(100)"): team_id = omp_get_team_num() if team_id == 0: nteams = omp_get_num_teams() @@ -3650,8 +3651,8 @@ def target_nest_teams_from_shared_expl_scalar(self, device): @njit def test_impl(): s = 0 - with omp(target_pragma): - with omp("teams num_teams(100) shared(s)"): + with openmp(target_pragma): + with openmp("teams num_teams(100) shared(s)"): team_id = omp_get_team_num() if team_id == 0: s = 1 @@ -3666,8 +3667,8 @@ def target_nest_teams_from_shared_impl_scalar(self, device): @njit def test_impl(): s = 0 - with omp(target_pragma): - with omp("teams num_teams(100)"): + with openmp(target_pragma): + with openmp("teams num_teams(100)"): team_id = omp_get_team_num() if team_id == 0: s = 1 @@ -3682,8 +3683,8 @@ def target_nest_teams_tofrom_shared_expl_scalar(self, device): @njit def test_impl(): s = 0 - with omp(target_pragma): - with omp("teams num_teams(100) shared(s)"): + with openmp(target_pragma): + with openmp("teams num_teams(100) shared(s)"): team_id = omp_get_team_num() if team_id == 0: s = 1 @@ -3699,8 +3700,8 @@ def target_nest_teams_tofrom_shared_impl_scalar(self, device): def test_impl(): s = 0 ss = np.zeros(1) - with omp(target_pragma): - with omp("teams num_teams(100)"): + with openmp(target_pragma): + with openmp("teams num_teams(100)"): team_id = omp_get_team_num() if team_id == 0: s = 1 @@ -3718,8 +3719,8 @@ def target_teams_nest_parallel(self, device): def test_impl(): teams = 0 threads = 0 - with omp(target_pragma): - with omp("parallel"): + with openmp(target_pragma): + with openmp("parallel"): team_id = omp_get_team_num() thread_id = omp_get_thread_num() if team_id == 0 and thread_id == 0: @@ -3744,9 +3745,9 @@ def target_teams_nest_parallel_set_thread_limit(self, device): def test_impl(): teams = 0 threads = 0 - with omp(target_pragma): - with omp("teams num_teams(10) thread_limit(32)"): - with omp("parallel"): + with openmp(target_pragma): + with openmp("teams num_teams(10) thread_limit(32)"): + with openmp("parallel"): team_id = omp_get_team_num() thread_id = omp_get_thread_num() if team_id == 0 and thread_id == 0: @@ -3769,7 +3770,7 @@ def target_map_to_scalar(self, device): @njit def test_impl(x): - with omp(target_pragma): + with openmp(target_pragma): x += 1 r = x return r @@ -3783,7 +3784,7 @@ def target_map_to_array(self, device): @njit def test_impl(a): - with omp(target_pragma): + with openmp(target_pragma): r = 0 for i in range(len(a)): r += a[i] @@ -3800,7 +3801,7 @@ def target_map_from_scalar(self, device): @njit def test_impl(x): - with omp(target_pragma): + with openmp(target_pragma): x = 43 return x @@ -3813,7 +3814,7 @@ def target_map_tofrom_scalar(self, device): @njit def test_impl(x): - with omp(target_pragma): + with openmp(target_pragma): x += 1 return x @@ -3826,9 +3827,9 @@ def target_multiple_map_tofrom_scalar(self, device): @njit def test_impl(x): - with omp(target_pragma): + with openmp(target_pragma): x += 1 - with omp(target_pragma): + with openmp(target_pragma): x += 1 return x @@ -3842,7 +3843,7 @@ def target_map_from_array(self, device): @njit def test_impl(n): a = np.zeros(n, dtype=np.int64) - with omp(target_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] = 42 return a @@ -3858,7 +3859,7 @@ def target_map_slice_in_mapping(self, device): def test_impl(n): a = np.zeros(n) b = np.arange(n) - with omp(target_pragma): + with openmp(target_pragma): for i in range(50): # These 
b accesses are within the transferred region. a[i + 50] = b[i + 100] @@ -3877,7 +3878,7 @@ def target_map_slice_read_out_mapping(self, device): def test_impl(n): a = np.zeros(n) b = np.arange(n) - with omp(target_pragma): + with openmp(target_pragma): for i in range(50): # These b accesses are outside the transferred region. # Should get whatever happens to be in memory at that point. @@ -3897,7 +3898,7 @@ def target_map_tofrom_array(self, device): @njit def test_impl(a): - with omp(target_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] += 1 return a @@ -3912,8 +3913,8 @@ def target_nest_parallel_for(self, device): @njit def test_impl(a, sched): - with omp(target_pragma): - with omp("parallel for num_threads(256)"): + with openmp(target_pragma): + with openmp("parallel for num_threads(256)"): for i in range(len(a)): a[i] = 1 thread_id = omp_get_thread_num() @@ -3939,8 +3940,8 @@ def target_nest_teams_distribute(self, device): @njit def test_impl(a, sched): - with omp(target_pragma): - with omp("teams distribute"): + with openmp(target_pragma): + with openmp("teams distribute"): for i in range(len(a)): a[i] = 1 team_id = omp_get_team_num() @@ -3975,7 +3976,7 @@ def target_teams_distribute(self, device): @njit def test_impl(a, sched): - with omp(target_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] = 1 team_id = omp_get_team_num() @@ -4009,7 +4010,7 @@ def target_teams_distribute_set_num_teams(self, device): @njit def test_impl(a, sched): - with omp(target_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] = 1 team_id = omp_get_team_num() @@ -4032,7 +4033,7 @@ def target_firstprivate_scalar_explicit(self, device): @njit def test_impl(s): - with omp(target_pragma): + with openmp(target_pragma): s = 43 return s @@ -4045,7 +4046,7 @@ def target_firstprivate_scalar_implicit(self, device): @njit def test_impl(s): - with omp(target_pragma): + with openmp(target_pragma): s = 43 return s @@ -4061,8 +4062,8 @@ def target_data_from(self, device): @njit def test_impl(): a = np.ones(10) - with omp(target_data_pragma): - with omp(target_pragma): + with openmp(target_data_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] = 42 return a @@ -4079,8 +4080,8 @@ def target_data_to(self, device): def test_impl(): a = np.ones(10) b = np.zeros(10) - with omp(target_data_pragma): - with omp(target_pragma): + with openmp(target_data_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] = 42 b[i] = a[i] @@ -4099,8 +4100,8 @@ def target_data_tofrom(self, device): def test_impl(): s = 0 a = np.ones(10) - with omp(target_data_pragma): - with omp(target_pragma): + with openmp(target_data_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] += 41 s = 42 @@ -4121,8 +4122,8 @@ def target_data_alloc_from(self, device): def test_impl(): a = np.ones(10) b = np.zeros(10) - with omp(target_data_pragma): - with omp(target_pragma): + with openmp(target_data_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] = 42 b[i] = a[i] @@ -4141,8 +4142,8 @@ def target_data_mix_to_from(self, device): def test_impl(): a = np.ones(10) b = np.ones(10) - with omp(target_data_pragma): - with omp(target_pragma): + with openmp(target_data_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] = 42 b[i] = 42 @@ -4161,11 +4162,11 @@ def target_update_from(self, device): @njit def test_impl(): a = np.ones(10) - with omp(target_data_pragma): - with omp(target_pragma): + with openmp(target_data_pragma): + with 
openmp(target_pragma): for i in range(len(a)): a[i] = 42 - with omp(target_update_pragma): + with openmp(target_update_pragma): pass return a @@ -4181,13 +4182,13 @@ def target_update_to(self, device): @njit def test_impl(): a = np.ones(10) - with omp(target_data_pragma): + with openmp(target_data_pragma): a += 1 - with omp(target_update_pragma): + with openmp(target_update_pragma): pass - with omp(target_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] += 1 return a @@ -4205,17 +4206,17 @@ def target_update_to_from(self, device): @njit def test_impl(): a = np.ones(10) - with omp(target_data_pragma): + with openmp(target_data_pragma): a += 1 - with omp(target_update_to_pragma): + with openmp(target_update_to_pragma): pass - with omp(target_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] += 1 - with omp(target_update_from_pragma): + with openmp(target_update_from_pragma): pass a += 1 @@ -4235,16 +4236,16 @@ def target_enter_exit_data_to_from_hostonly(self, device): @njit def test_impl(): a = np.ones(10) - with omp(target_enter): + with openmp(target_enter): pass a += 1 # XXX: Test passes if uncommented! - # with omp("target device(1)"): + # with openmp("target device(1)"): # pass - with omp(target_exit): + with openmp(target_exit): pass return a @@ -4260,11 +4261,11 @@ def target_data_tofrom_hostonly(self, device): @njit def test_impl(): a = np.ones(10) - with omp(target_data): + with openmp(target_data): a += 1 # XXX: Test passes if uncommented! - # with omp("target device(1)"): + # with openmp("target device(1)"): # pass return a @@ -4279,12 +4280,12 @@ def target_data_update(self, device): @njit def test_impl(a): - with omp(target_data): + with openmp(target_data): for rep in range(10): # Target update resets a to ones. 
- with omp(target_update): + with openmp(target_update): pass - with omp(target_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] += 1 @@ -4304,15 +4305,15 @@ def test_impl(s, n1, n2): as1 = np.empty(s, dtype=a.dtype) as2 = np.empty(s, dtype=a.dtype) b = n1 - with omp(target_data_pragma): - with omp(target_pragma): + with openmp(target_data_pragma): + with openmp(target_pragma): as1[:] = a bs1 = b - with omp(target_pragma): + with openmp(target_pragma): for i in range(s): a[i] = n2 b = n2 - with omp(target_pragma): + with openmp(target_pragma): as2[:] = a bs2 = b return a, as1, as2, b, bs1, bs2 @@ -4340,13 +4341,13 @@ def test_impl(): bstop = 3 a = np.array([1, 2, 3]) b = np.array([3, 2, 1]) - with omp(target_enter_pragma): - with omp(target_pragma): + with openmp(target_enter_pragma): + with openmp(target_pragma): for i in range(1): a[0] = 42 b[0] = 42 - with omp(target_exit_pragma): + with openmp(target_exit_pragma): pass return a, b @@ -4364,15 +4365,15 @@ def target_enter_exit_data(self, device): @njit def test_impl(scalar, array): - with omp(target_enter_pragma): + with openmp(target_enter_pragma): pass - with omp(target_pragma): + with openmp(target_pragma): scalar += 1 for i in range(len(array)): array[i] += 1 - with omp(target_exit_pragma): + with openmp(target_exit_pragma): pass return scalar, array @@ -4398,12 +4399,12 @@ def target_enter_exit_data_alloc(self, device): @njit def test_impl(a): - with omp(target_enter_pragma): + with openmp(target_enter_pragma): pass - with omp(target_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] = 1 - with omp(target_exit_pragma): + with openmp(target_exit_pragma): pass return a @@ -4421,7 +4422,7 @@ def target_teams_distribute_parallel_for(self, device): @njit def test_impl(a, sched_team, sched_thread): s = 42 - with omp(target_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] = 1 team_id = omp_get_team_num() @@ -4489,8 +4490,8 @@ def target_teams_nest_distribute_parallel_for(self, device): @njit def test_impl(a, sched_team, sched_thread): s = 42 - with omp(target_pragma): - with omp(dist_parfor_pragma): + with openmp(target_pragma): + with openmp(dist_parfor_pragma): for i in range(len(a)): a[i] = 1 team_id = omp_get_team_num() @@ -4532,8 +4533,8 @@ def test_impl(): s = 42 r = np.zeros(32) threads = 0 - with omp(target_pragma): - with omp("parallel firstprivate(s)"): + with openmp(target_pragma): + with openmp("parallel firstprivate(s)"): threadno = omp_get_thread_num() if threadno == 0: threads = omp_get_num_threads() @@ -4553,8 +4554,8 @@ def target_nest_parallel_float_fpriv(self, device): def test_impl(): s = np.float32(42.0) r = np.float32(0.0) - with omp(target_pragma): - with omp("parallel firstprivate(s)"): + with openmp(target_pragma): + with openmp("parallel firstprivate(s)"): threadno = omp_get_thread_num() if threadno == 0: r = s + 1 @@ -4570,8 +4571,8 @@ def target_nest_teams_float_fpriv(self, device): def test_impl(): s = np.float32(42.0) r = np.float32(0.0) - with omp(target_pragma): - with omp("teams firstprivate(s)"): + with openmp(target_pragma): + with openmp("teams firstprivate(s)"): teamno = omp_get_thread_num() if teamno == 0: r = s + 1 @@ -4590,8 +4591,8 @@ def target_teams_nest_parallel_fpriv_shared_array(self, device): @njit def test_impl(): s = np.zeros(32) - with omp(target_pragma): - with omp("parallel firstprivate(s)"): + with openmp(target_pragma): + with openmp("parallel firstprivate(s)"): print("parallel s", s[0]) teams = omp_get_num_teams() threads = 
omp_get_num_threads() @@ -4611,7 +4612,7 @@ def test_impl(): a = np.zeros(10, dtype=np.int32) nteams = 0 - with omp(target_pragma): + with openmp(target_pragma): team_shared_array = np.empty(10, dtype=np.int32) team_id = omp_get_team_num() @@ -4645,7 +4646,7 @@ def test_impl(): a = np.zeros((10, 2, 2), dtype=np.int32) nteams = 0 - with omp(target_pragma): + with openmp(target_pragma): team_shared_array = np.empty((2, 2), dtype=np.int32) team_id = omp_get_team_num() @@ -4679,8 +4680,8 @@ def target_local_array(self, device): def test_impl(): a = np.zeros((32, 10), dtype=np.int32) nthreads = 0 - with omp(target_pragma): - with omp("parallel num_threads(32)"): + with openmp(target_pragma): + with openmp("parallel num_threads(32)"): local_array = np.empty(10, dtype=np.int32) tid = omp_get_thread_num() if tid == 0: @@ -4716,14 +4717,14 @@ def test_impl(): nteams = 0 nthreads = 0 - with omp(target_pragma): + with openmp(target_pragma): team_shared_array = np.empty(32, dtype=np.int32) team_id = omp_get_team_num() if team_id == 0: nteams = omp_get_num_teams() nthreads = omp_get_num_threads() - with omp("parallel num_threads(32)"): + with openmp("parallel num_threads(32)"): thread_local_array = np.empty(10, dtype=np.int32) for i in range(10): thread_local_array[i] = omp_get_thread_num() @@ -4759,7 +4760,7 @@ def test_impl(n): a = np.ones((n, n)) b = np.ones((n, n)) c = np.zeros((n, n)) - with omp(target_pragma): + with openmp(target_pragma): for i in range(n): for j in range(n): c[i, j] = a[i, j] + b[i, j] @@ -4777,9 +4778,9 @@ def test_impl(n): a = np.ones((n, n)) b = np.ones((n, n)) c = np.zeros((n, n)) - with omp(target_pragma): - with omp("teams"): - with omp("loop collapse(2)"): + with openmp(target_pragma): + with openmp("teams"): + with openmp("loop collapse(2)"): for i in range(n): for j in range(n): c[i, j] = a[i, j] + b[i, j] @@ -4820,8 +4821,8 @@ def test_impl(num_steps): the_sum = 0.0 omp_set_num_threads(4) - with omp("parallel"): - with omp("for reduction(+:the_sum) schedule(static)"): + with openmp("parallel"): + with openmp("for reduction(+:the_sum) schedule(static)"): for j in range(num_steps): x = ((j - 1) - 0.5) * step the_sum += 4.0 / (1.0 + x * x) @@ -4838,7 +4839,7 @@ def test_impl(num_steps): the_sum = 0.0 omp_set_num_threads(4) - with omp("parallel for reduction(+:the_sum) schedule(static)"): + with openmp("parallel for reduction(+:the_sum) schedule(static)"): for j in range(num_steps): x = ((j - 1) - 0.5) * step the_sum += 4.0 / (1.0 + x * x) @@ -4855,7 +4856,7 @@ def test_impl(num_steps): the_sum = 0.0 omp_set_num_threads(4) - with omp("loop reduction(+:the_sum) schedule(static)"): + with openmp("loop reduction(+:the_sum) schedule(static)"): for j in range(num_steps): x = ((j - 1) - 0.5) * step the_sum += 4.0 / (1.0 + x * x) @@ -4875,7 +4876,7 @@ def test_impl(num_steps): omp_set_num_threads(j) full_sum = 0.0 - with omp("parallel private(tid, numthreads, local_sum, x)"): + with openmp("parallel private(tid, numthreads, local_sum, x)"): tid = omp_get_thread_num() numthreads = omp_get_num_threads() local_sum = 0.0 @@ -4907,11 +4908,11 @@ def test_pi_comp(Nstart, Nfinish, step): pi_sum1 = 0.0 pi_sum2 = 0.0 cut = Nfinish - (iblk // 2) - with omp("task shared(pi_sum1)"): + with openmp("task shared(pi_sum1)"): pi_sum1 = test_pi_comp(Nstart, cut, step) - with omp("task shared(pi_sum2)"): + with openmp("task shared(pi_sum2)"): pi_sum2 = test_pi_comp(cut, Nfinish, step) - with omp("taskwait"): + with openmp("taskwait"): pi_sum = pi_sum1 + pi_sum2 return pi_sum @@ -4928,11 +4929,11 
@@ def test_pi_comp_njit(Nstart, Nfinish, step): pi_sum1 = 0.0 pi_sum2 = 0.0 cut = Nfinish - (iblk // 2) - with omp("task shared(pi_sum1)"): + with openmp("task shared(pi_sum1)"): pi_sum1 = test_pi_comp_njit(Nstart, cut, step) - with omp("task shared(pi_sum2)"): + with openmp("task shared(pi_sum2)"): pi_sum2 = test_pi_comp_njit(cut, Nfinish, step) - with omp("taskwait"): + with openmp("taskwait"): pi_sum = pi_sum1 + pi_sum2 return pi_sum @@ -4943,8 +4944,8 @@ def test_impl(lb, num_steps, pi_comp_func): omp_set_num_threads(j) full_sum = 0.0 - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): full_sum = pi_comp_func(lb, num_steps, step) pi = step * full_sum From c392b45c6c76db47e239f8ec071aa80f37c87dff Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Tue, 22 Apr 2025 19:55:56 -0700 Subject: [PATCH 04/28] Update pyproject.toml, setup.py and clean up --- buildscripts/conda-recipes/llvmlite/bld.bat | 25 ----- buildscripts/conda-recipes/llvmlite/build.sh | 14 --- .../llvmlite/conda_build_config.yaml | 12 --- buildscripts/conda-recipes/llvmlite/meta.yaml | 53 ---------- .../conda-recipes/llvmlite/run_test.py | 6 -- buildscripts/conda-recipes/numba/bld.bat | 3 - buildscripts/conda-recipes/numba/build.sh | 17 ---- buildscripts/conda-recipes/numba/license.txt | 24 ----- buildscripts/conda-recipes/numba/meta.yaml | 99 ------------------- buildscripts/conda-recipes/numba/run_test.bat | 23 ----- buildscripts/conda-recipes/numba/run_test.sh | 84 ---------------- .../{numba => pyomp}/conda_build_config.yaml | 0 buildscripts/conda-recipes/pyomp/meta.yaml | 35 +++++-- pyproject.toml | 15 +-- setup.py | 70 ++++++++----- 15 files changed, 80 insertions(+), 400 deletions(-) delete mode 100755 buildscripts/conda-recipes/llvmlite/bld.bat delete mode 100644 buildscripts/conda-recipes/llvmlite/build.sh delete mode 100644 buildscripts/conda-recipes/llvmlite/conda_build_config.yaml delete mode 100644 buildscripts/conda-recipes/llvmlite/meta.yaml delete mode 100644 buildscripts/conda-recipes/llvmlite/run_test.py delete mode 100644 buildscripts/conda-recipes/numba/bld.bat delete mode 100755 buildscripts/conda-recipes/numba/build.sh delete mode 100644 buildscripts/conda-recipes/numba/license.txt delete mode 100644 buildscripts/conda-recipes/numba/meta.yaml delete mode 100644 buildscripts/conda-recipes/numba/run_test.bat delete mode 100644 buildscripts/conda-recipes/numba/run_test.sh rename buildscripts/conda-recipes/{numba => pyomp}/conda_build_config.yaml (100%) diff --git a/buildscripts/conda-recipes/llvmlite/bld.bat b/buildscripts/conda-recipes/llvmlite/bld.bat deleted file mode 100755 index d7342e249d5f..000000000000 --- a/buildscripts/conda-recipes/llvmlite/bld.bat +++ /dev/null @@ -1,25 +0,0 @@ - -@rem Let CMake know about the LLVM install path, for find_package() -set CMAKE_PREFIX_PATH=%LIBRARY_PREFIX% - -@rem VS2019 uses a different naming convention for platforms than older version -if "%ARCH%"=="32" ( - @rem VS2017: - @rem set CMAKE_GENERATOR_ARCH= - set CMAKE_GENERATOR_ARCH=Win32 -) else ( - @rem VS2017 - @rem set CMAKE_GENERATOR_ARCH=Win64 - set CMAKE_GENERATOR_ARCH=x64 -) -set CMAKE_GENERATOR=Visual Studio 16 2019 -set CMAKE_GENERATOR_TOOLKIT=v142 - -@rem Ensure there are no build leftovers (CMake can complain) -if exist ffi\build rmdir /S /Q ffi\build - -%PYTHON% -S setup.py install -if errorlevel 1 exit 1 - -%PYTHON% runtests.py -if errorlevel 1 exit 1 diff --git a/buildscripts/conda-recipes/llvmlite/build.sh 
b/buildscripts/conda-recipes/llvmlite/build.sh deleted file mode 100644 index 9f3d87cc3ea0..000000000000 --- a/buildscripts/conda-recipes/llvmlite/build.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -set -x - -export PYTHONNOUSERSITE=1 - -# Enables static linking of stdlibc++ -export LLVMLITE_CXX_STATIC_LINK=1 -# cmake is broken for osx builds. -#export LLVMLITE_USE_CMAKE=1 -export LLVMLITE_SHARED=1 - -$PYTHON setup.py build --force -$PYTHON setup.py install diff --git a/buildscripts/conda-recipes/llvmlite/conda_build_config.yaml b/buildscripts/conda-recipes/llvmlite/conda_build_config.yaml deleted file mode 100644 index 81b7d08c3d19..000000000000 --- a/buildscripts/conda-recipes/llvmlite/conda_build_config.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Numba/llvmlite stack needs an older compiler for backwards compatability. -c_compiler_version: # [linux] - - 7 # [linux and (x86_64 or ppc64le)] - - 9 # [linux and aarch64] - -cxx_compiler_version: # [linux] - - 7 # [linux and (x86_64 or ppc64le)] - - 9 # [linux and aarch64] - -fortran_compiler_version: # [linux] - - 7 # [linux and (x86_64 or ppc64le)] - - 9 # [linux and aarch64] diff --git a/buildscripts/conda-recipes/llvmlite/meta.yaml b/buildscripts/conda-recipes/llvmlite/meta.yaml deleted file mode 100644 index 0a15258e499e..000000000000 --- a/buildscripts/conda-recipes/llvmlite/meta.yaml +++ /dev/null @@ -1,53 +0,0 @@ -package: - name: llvmlite - version: pyomp_0.40 - -source: - git_url: https://github.com/Python-for-HPC/llvmliteWithOpenmp.git - git_rev: ce7b659c6a62aa4466d6b3894573f9900f8a1451 - git_depth: 1 - -build: - string: py{{ PY_VER }}h{{ PKG_HASH }}_{{ (GITHUB_HEAD_SHA | default(''))[:7] ~ (CI_COMMIT_SHA | default(''))[:7] }} - script_env: - - PY_VCRUNTIME_REDIST - - GITHUB_HEAD_SHA - - CI_COMMIT_SHA - -requirements: - build: - # We cannot do this on macOS as the llvm-config from the - # toolchain conflicts with the same from llvmdev, the - # build.sh deals with it! 
- - {{ compiler('c') }} # [not (osx or armv6l or armv7l or win)] - - {{ compiler('cxx') }} # [not (osx or armv6l or armv7l or win)] - # The DLL build uses cmake on Windows - - cmake # [win] - - make # [unix] - host: - - python - # On channel https://anaconda.org/numba/ - - llvmdev 14.0.6 - - llvm 14.0.6 - - vs2015_runtime # [win] - # llvmdev is built with libz compression support - - zlib # [unix and not (armv6l or armv7l)] - # requires libxml2 - - libxml2 # [win] - run: - - python >=3.8,<=3.10 - - vs2015_runtime # [win] - # osx has dynamically linked libstdc++ - - libcxx >=4.0.1 # [osx] - -test: - imports: - - llvmlite - - llvmlite.binding - commands: - - python -m llvmlite.tests - -about: - home: https://github.com/numba/llvmlite - license: New BSD License - summary: A lightweight LLVM python binding for writing JIT compilers diff --git a/buildscripts/conda-recipes/llvmlite/run_test.py b/buildscripts/conda-recipes/llvmlite/run_test.py deleted file mode 100644 index 5591c0533f36..000000000000 --- a/buildscripts/conda-recipes/llvmlite/run_test.py +++ /dev/null @@ -1,6 +0,0 @@ -import os -from llvmlite.tests import main - -# Enable tests for distribution only -os.environ['LLVMLITE_DIST_TEST'] = '' -main() diff --git a/buildscripts/conda-recipes/numba/bld.bat b/buildscripts/conda-recipes/numba/bld.bat deleted file mode 100644 index 6372f3a4d2e9..000000000000 --- a/buildscripts/conda-recipes/numba/bld.bat +++ /dev/null @@ -1,3 +0,0 @@ -%PYTHON% setup.py build install --single-version-externally-managed --record=record.txt - -exit /b %errorlevel% diff --git a/buildscripts/conda-recipes/numba/build.sh b/buildscripts/conda-recipes/numba/build.sh deleted file mode 100755 index 4ae35afafb09..000000000000 --- a/buildscripts/conda-recipes/numba/build.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -if [[ "$(uname -s)" == *"Linux"* ]] && [[ "$(uname -p)" == *"86"* ]]; then - EXTRA_BUILD_EXT_FLAGS="--werror --wall" -else - EXTRA_BUILD_EXT_FLAGS="" -fi - -if [[ "$(uname -s)" == *"Linux"* ]] && ([[ "$(uname -p)" == *"ppc64le"* ]] || [[ "$(uname -p)" == *"aarch64"* ]]); then - # To workaround https://github.com/numba/numba/issues/7302 - # because of a python build problem that the -pthread could be stripped. - export CC="$CC -pthread" - export CXX="$CXX -pthread" -fi - -MACOSX_DEPLOYMENT_TARGET=10.10 $PYTHON setup.py \ - build_static build_ext $EXTRA_BUILD_EXT_FLAGS build install --single-version-externally-managed --record=record.txt diff --git a/buildscripts/conda-recipes/numba/license.txt b/buildscripts/conda-recipes/numba/license.txt deleted file mode 100644 index 7d19426e7a09..000000000000 --- a/buildscripts/conda-recipes/numba/license.txt +++ /dev/null @@ -1,24 +0,0 @@ -Copyright (c) 2012, Anaconda, Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - -Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - -Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/buildscripts/conda-recipes/numba/meta.yaml b/buildscripts/conda-recipes/numba/meta.yaml deleted file mode 100644 index 947d49816e4b..000000000000 --- a/buildscripts/conda-recipes/numba/meta.yaml +++ /dev/null @@ -1,99 +0,0 @@ -{% set version = "pyomp_0.57" %} - -package: - name: numba - #version: {{ GIT_DESCRIBE_TAG }} - version: {{ version }} - -source: - git_url: https://github.com/Python-for-HPC/numbaWithOpenmp.git - git_rev: f7a481ef05bcc2a11dfead708854530800b534c8 - git_depth: 1 - -build: - string: np{{ NPY_VER }}py{{ PY_VER }}h{{ PKG_HASH }}_{{ version }}_{{ (GITHUB_HEAD_SHA | default(''))[:7] ~ (CI_COMMIT_SHA | default(''))[:7] }} - entry_points: - - numba = numba.misc.numba_entry:main - script_env: - - PY_VCRUNTIME_REDIST - - GITHUB_HEAD_SHA - - CI_COMMIT_SHA - missing_dso_whitelist: # [osx] - # optional dependency: required only when omp is chosen as the backend for - # the threading layer - - lib/libiomp5.dylib # [osx] - #ignore_run_exports: - # tbb-devel triggers hard dependency on tbb, this is not the case. - - tbb # [not (aarch64 or ppc64le or win32)] - -requirements: - # build and run dependencies are duplicated to avoid setuptools issues - # when we also set install_requires in setup.py - build: - - {{ compiler('c') }} - - {{ compiler('cxx') }} - # OpenMP headers from llvm needed for OSX. - host: - - python - - numpy >=1.22.3, <1.25 - - llvm-openmp-dev - - setuptools - - importlib_metadata # [py<39] - - llvmlite pyomp_0.40.* - # TBB devel version is to match TBB libs. - # NOTE: ppc64le and aarch64 are pending testing so excluded for now, win32 - # is not a supported parallel target. - - tbb-devel >=2021.6 # [not (aarch64 or ppc64le or win32)] - run: - - python - # NumPy 1.22.0, 1.22.1, 1.22.2 are all broken for ufuncs, see #7756 - - numpy >=1.22.3, <1.25 - - importlib_metadata # [py<39] - # On channel https://anaconda.org/numba/ - - llvmlite pyomp_0.40.* - - lark-parser - - cffi - - llvm-openmp-dev - run_constrained: - # If TBB is present it must be at least version 2021.6 - - tbb >=2021.6 # [not (aarch64 or ppc64le or win32)] - # avoid confusion from openblas bugs - - libopenblas !=0.3.6 # [x86_64] - # 0.3.17 buggy on M1 silicon - # https://github.com/xianyi/OpenBLAS/blob/v0.3.20/Changelog.txt#L118 - # https://github.com/numba/numba/issues/7822#issuecomment-1063229855 - # Exclude 0.3.20 too - # https://github.com/numba/numba/issues/8096 - - libopenblas >=0.3.18, !=0.3.20 # [arm64] - # CUDA 10.2 or later is required for CUDA support - - cudatoolkit >=10.2 - # scipy 1.0 or later - - scipy >=1.0 - # CUDA Python 11.6 or later - - cuda-python >=11.6 - -test: - requires: - - jinja2 - # Required to test optional Numba features - - cffi - - scipy - - ipython # [not aarch64] - # for pycc - - setuptools - - tbb >=2021.6 # [not (aarch64 or ppc64le or win32)] - # this is clobbering as run depends on llvm-openmp-dev - #- llvm-openmp # [osx] - # This is for driving gdb tests - - pexpect # [linux64] - # For testing ipython - - ipykernel - # Need these for AOT. 
Do not init msvc as it may not be present - - {{ compiler('c') }} # [not (win or aarch64)] - - {{ compiler('cxx') }} # [not (win or aarch64)] - -about: - home: https://numba.pydata.org/ - license: BSD - license_file: LICENSE - summary: a just-in-time Python function compiler based on LLVM diff --git a/buildscripts/conda-recipes/numba/run_test.bat b/buildscripts/conda-recipes/numba/run_test.bat deleted file mode 100644 index a62aac845713..000000000000 --- a/buildscripts/conda-recipes/numba/run_test.bat +++ /dev/null @@ -1,23 +0,0 @@ -set NUMBA_DEVELOPER_MODE=1 -set NUMBA_DISABLE_ERROR_MESSAGE_HIGHLIGHTING=1 -set NUMBA_CAPTURED_ERRORS=new_style -set PYTHONFAULTHANDLER=1 - -@rem no parallel target support for 32 bit windows and no TBB packages -if "%ARCH%"=="32" ( - set NUMBA_DISABLE_TBB=1 -) - -@rem Check Numba executable is there -numba -h - -@rem Run system info tool -numba -s - -@rem Check test discovery works -python -m numba.tests.test_runtests - -@rem Run the whole test suite -python -m numba.runtests -b -m -- %TESTS_TO_RUN% - -if errorlevel 1 exit 1 diff --git a/buildscripts/conda-recipes/numba/run_test.sh b/buildscripts/conda-recipes/numba/run_test.sh deleted file mode 100644 index b6caec9ac810..000000000000 --- a/buildscripts/conda-recipes/numba/run_test.sh +++ /dev/null @@ -1,84 +0,0 @@ -#!/bin/bash - -set -e - -export NUMBA_DEVELOPER_MODE=1 -export NUMBA_DISABLE_ERROR_MESSAGE_HIGHLIGHTING=1 -export NUMBA_CAPTURED_ERRORS="new_style" -export PYTHONFAULTHANDLER=1 -# Required OpenMP test env var (for offloading). -export TEST_DEVICES=0 - -# Disable NumPy dispatching to AVX512_SKX feature extensions if the chip is -# reported to support the feature and NumPy >= 1.22 as this results in the use -# of low accuracy SVML libm replacements in ufunc loops. -_NPY_CMD='from numba.misc import numba_sysinfo;\ - sysinfo=numba_sysinfo.get_sysinfo();\ - print(sysinfo["NumPy AVX512_SKX detected"] and - sysinfo["NumPy Version"]>="1.22")' -NUMPY_DETECTS_AVX512_SKX_NP_GT_122=$(python -c "$_NPY_CMD") -echo "NumPy >= 1.22 with AVX512_SKX detected: $NUMPY_DETECTS_AVX512_SKX_NP_GT_122" - -if [[ "$NUMPY_DETECTS_AVX512_SKX_NP_GT_122" == "True" ]]; then - export NPY_DISABLE_CPU_FEATURES="AVX512_SKX" -fi - - -unamestr=`uname` -if [[ "$unamestr" == 'Linux' ]]; then - # Test if catchsegv exists, not by default in recent libc. - if catchsegv --version; then - SEGVCATCH=catchsegv - else - SEGVCATCH="" - fi -elif [[ "$unamestr" == 'Darwin' ]]; then - SEGVCATCH="" -else - echo Error -fi - -# limit CPUs in use on PPC64LE, fork() issues -# occur on high core count systems -archstr=`uname -m` -if [[ "$archstr" == 'ppc64le' ]]; then - TEST_NPROCS=16 -fi - -# Check Numba executable is there -numba -h - -# run system info tool -numba -s - -# Check test discovery works -python -m numba.tests.test_runtests - -# Disable tests for package building. -exit 0 - -if nvidia-smi --list-gpus; then - echo "Found NVIDIA GPU, enable OpenMP offloading tests" - export RUN_TARGET=1 -else - echo "Missing NVIDIA GPU, disable OpenMP offloading tests" - export RUN_TARGET=0 -fi - -# Run the whole test suite -# Test only openmp for brevity. We may want to enable the full numba tests, -# which include openmp, on larger runners. -TESTS_TO_RUN="numba.tests.test_openmp" -# Run OpenMP tests in a single-process since they use multiple cores by -# multi-threading. Using multiple processes for testing will very probably slow -# things down. 
-# XXX: Using -m $TEST_NPROCS, even if with 1 process, hangs on github runners -# when running the full testsuite, while individual tests pass. This requires -# more investigation. Some observations: 1) running the full test suite creates -# new threads for each region, the old ones are blocked in a futex for -# destruction, 2) it is possible that in small github runners this starves cpu -# time, 3) there may be implications with "-m 1" vs. no flag on how the runtime -# library is inited/de-inited. - -echo "Running: $SEGVCATCH python -m numba.runtests -v -- $TESTS_TO_RUN" -$SEGVCATCH python -m numba.runtests -v -- $TESTS_TO_RUN diff --git a/buildscripts/conda-recipes/numba/conda_build_config.yaml b/buildscripts/conda-recipes/pyomp/conda_build_config.yaml similarity index 100% rename from buildscripts/conda-recipes/numba/conda_build_config.yaml rename to buildscripts/conda-recipes/pyomp/conda_build_config.yaml diff --git a/buildscripts/conda-recipes/pyomp/meta.yaml b/buildscripts/conda-recipes/pyomp/meta.yaml index f2bd067b96fc..4ee7193abc7e 100644 --- a/buildscripts/conda-recipes/pyomp/meta.yaml +++ b/buildscripts/conda-recipes/pyomp/meta.yaml @@ -1,17 +1,36 @@ +{% set version = "0.2.0" %} + package: name: pyomp - version: 0.1.3 + version: {{ version }} + +source: + path: ../../.. build: - string: {{ (GITHUB_HEAD_SHA | default(''))[:7] ~ (CI_COMMIT_SHA | default(''))[:7] }} + string: py{{ PY_VER }}h{{ PKG_HASH }}_{{GIT_DESCRIBE_HASH}}_{{ GIT_DESCRIBE_NUMBER }} script_env: - - PY_VCRUNTIME_REDIST - - GITHUB_HEAD_SHA - - CI_COMMIT_SHA + - PY_VCRUNTIME_REDIST # [win] + script: + - {{ PYTHON }} -m pip install . -vv requirements: + build: + - {{ compiler('c') }} + - {{ compiler('cxx') }} + host: + - python + - pip + - setuptools + - numba >=0.57, <0.58 run: - - python >=3.8, <=3.10 - - numba pyomp_0.57.* + - numba >=0.57, <0.58 + - lark + - cffi + - llvm-openmp-dev + about: - summary: "PyOMP metapackage, OpenMP for portable CPU/GPU parallel programming in Python." + home: https://github.com/Python-for-HPC/PyOMP + license: BSD-2-Clause + license_file: LICENSE + summary: "PyOMP: OpenMP for portable CPU/GPU parallel programming in Python using Numba." 
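For orientation, the recipe above now builds the package with pip against the pyproject.toml and setup.py changes that follow, and the installed code is imported as the numba.openmp package. The sketch below mirrors the context-manager style exercised by the renamed tests earlier in the series; it is an illustration only, and the exact export names (openmp_context aliased to openmp, plus the omp_* query helpers) are assumptions rather than a confirmed public API.

# Minimal usage sketch, assuming numba.openmp exports openmp_context and the
# omp_* helpers that the test suite above calls.
from numba import njit
from numba.openmp import openmp_context as openmp
from numba.openmp import omp_get_thread_num, omp_get_num_threads

@njit
def hello_openmp():
    # Open a parallel region with four threads; each thread reports its id.
    with openmp("parallel num_threads(4)"):
        print("thread", omp_get_thread_num(), "of", omp_get_num_threads())

hello_openmp()
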
diff --git a/pyproject.toml b/pyproject.toml index 46b76e28f22c..af4b697973bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools>=61.0", "wheel", "numba>=0.57, <0.58", "cmake>=3.20"] +requires = ["setuptools>=77.0", "wheel", "numba>=0.57, <0.58", "cmake>=3.20"] build-backend = "setuptools.build_meta" [project] @@ -7,11 +7,10 @@ name = "pyomp" version = "0.2.0" description = "Python OpenMP library based on Numba" readme = "README.md" -requires-python = ">=3.8, <=3.12" -license = { text = "BSD 2-Clause License" } +requires-python = ">=3.8, <3.12" +license = "BSD-2-Clause" classifiers = [ "Programming Language :: Python :: 3", - "License :: OSI Approved :: BSD License", "Operating System :: OS Independent", "Development Status :: 4 - Beta", "Intended Audience :: Developers", @@ -23,9 +22,5 @@ dependencies = ["numba>=0.57, <0.58", "lark", "cffi"] Homepage = "https://github.com/Python-for-HPC/PyOMP" Issues = "https://github.com/Python-for-HPC/PyOMP/issues" -[tool.setuptools.packages.find] -where = ["."] -include = ["numba.openmp"] - -[tool.setuptools.package-data] -"numba.openmp" = ["libs/*"] +[tool.setuptools] +packages = ["numba.openmp"] diff --git a/setup.py b/setup.py index 487343fcc112..d854b277c4c0 100644 --- a/setup.py +++ b/setup.py @@ -2,26 +2,28 @@ import os import numba import sysconfig -import numpy as np import subprocess +import shutil +import numpy as np from pathlib import Path from setuptools import setup, Extension from setuptools.command.build_ext import build_ext from setuptools.command.build_clib import build_clib -numba_dir = os.path.dirname(numba.__file__) +temp_dir = Path("numba/openmp/nrt/numba_src") + bundle_lib = ( "bundle", { "sources": [ "numba/openmp/nrt/init.c", - f"{numba_dir}/_helpermod.c", - f"{numba_dir}/cext/utils.c", - f"{numba_dir}/cext/dictobject.c", - f"{numba_dir}/cext/listobject.c", - f"{numba_dir}/core/runtime/_nrt_pythonmod.c", - f"{numba_dir}/core/runtime/nrt.cpp", + f"{temp_dir}/_helpermod.c", + f"{temp_dir}/cext/utils.c", + f"{temp_dir}/cext/dictobject.c", + f"{temp_dir}/cext/listobject.c", + f"{temp_dir}/core/runtime/_nrt_pythonmod.c", + f"{temp_dir}/core/runtime/nrt.cpp", ], "include_dirs": [ sysconfig.get_paths()["include"], @@ -31,23 +33,42 @@ ) -class BuildStaticBundle(build_clib): +class BuildStaticNRTBundle(build_clib): def finalize_options(self): super().finalize_options() - self.build_temp = (Path("numba/openmp/nrt") / self.build_temp).absolute() - self.build_temp.mkdir(parents=True, exist_ok=True) - self.build_temp = str(self.build_temp) - self.build_clib = str(Path("numba/openmp/libs").absolute()) + # Copy numba tree installation to the build directory for building the + # static library using relative paths. + numba_dir = numba.__path__[0] + shutil.copytree( + numba_dir, + temp_dir, + ignore=shutil.ignore_patterns( + "*.py", + "*.pyc", + "*.so", + "*.dylib", + "__pycache__", + ), + dirs_exist_ok=True, + ) + + self.build_clib = "numba/openmp/libs" + + def run(self): + super().run() + + # Clean up files after build is completed. + shutil.rmtree(temp_dir, ignore_errors=True) class CMakeExtension(Extension): def __init__(self, name, sourcedir): - # don't invoke the original build_ext for this special extension + # Don't invoke the original build_ext for this special extension. 
super().__init__(name, sources=[]) self.sourcedir = sourcedir -class BuildPass(build_ext): +class BuildIntrinsicsOpenMPPass(build_ext): def run(self): for ext in self.extensions: if isinstance(ext, CMakeExtension): @@ -56,24 +77,26 @@ def run(self): super().run() def build_cmake(self, ext): - build_dir = (Path(ext.sourcedir) / self.build_temp).absolute() + # Delete build directory if it exists to avoid errors with stale + # CMakeCache.txt leftovers. + shutil.rmtree(self.build_temp, ignore_errors=True) + subprocess.run( [ "cmake", "-S", ext.sourcedir, "-B", - build_dir, - "--install-prefix", - Path("numba/openmp/libs").absolute(), + self.build_temp, "-DCMAKE_BUILD_TYPE=Release", + "-DCMAKE_INSTALL_PREFIX=numba/openmp/libs", ], check=True, ) - subprocess.run(["cmake", "--build", build_dir, "-j"], check=True) + subprocess.run(["cmake", "--build", self.build_temp, "-j"], check=True) subprocess.run( - ["cmake", "--install", build_dir], + ["cmake", "--install", self.build_temp], check=True, ) @@ -81,5 +104,8 @@ def build_cmake(self, ext): setup( libraries=[bundle_lib], ext_modules=[CMakeExtension("libIntrinsicsOpenMP", "numba/openmp/pass")], - cmdclass={"build_clib": BuildStaticBundle, "build_ext": BuildPass}, + cmdclass={ + "build_clib": BuildStaticNRTBundle, + "build_ext": BuildIntrinsicsOpenMPPass, + }, ) From 57ac3d82c2a3c5d92ae9b3b9e9b9a19cc8604d0b Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Tue, 22 Apr 2025 23:03:37 -0700 Subject: [PATCH 05/28] Update conda builders and simplify llvm-openmp-dev meta.yaml --- .github/workflows/build-upload-conda-base.yml | 131 +----------------- .gitlab/jobs/lassen.yml | 15 +- .../conda-recipes/llvm-openmp-dev/meta.yaml | 6 +- buildscripts/gitlab/create-conda-pkgs.sh | 8 -- 4 files changed, 10 insertions(+), 150 deletions(-) diff --git a/.github/workflows/build-upload-conda-base.yml b/.github/workflows/build-upload-conda-base.yml index 6a9186b1e737..e5713bd04580 100644 --- a/.github/workflows/build-upload-conda-base.yml +++ b/.github/workflows/build-upload-conda-base.yml @@ -34,7 +34,6 @@ env: git clone https://github.com/Python-for-HPC/PyOMP; cd PyOMP; git checkout ${{ inputs.commit }}; - export GITHUB_HEAD_SHA=${{ inputs.commit }}; jobs: # Job to deploy llvm-openmp-dev, runs once as it is independent of the python @@ -48,9 +47,6 @@ jobs: os: [ubuntu-latest, macOS-latest] steps: - uses: actions/checkout@v4 - - name: Set env for HEAD SHA - run: - echo "GITHUB_HEAD_SHA=${{ inputs.commit }}" >> $GITHUB_ENV - name: Create and activate conda env uses: conda-incubator/setup-miniconda@v3 with: @@ -69,43 +65,10 @@ jobs: --token ${{ secrets.ANACONDA_TOKEN }} \ buildscripts/conda-recipes/llvm-openmp-dev; - # Job to deploy llvmlite and numba, matrixed on os and python version. - conda-deploy-llvmlite: + # Job to deploy the pyomp metapackage matrixed on the python version. 
+ conda-deploy-pyomp: needs: conda-deploy-llvm-openmp-dev - name: llvmlite ${{ matrix.os }} ${{ matrix.python-version }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - #os: [ubuntu-latest, macOS-latest, windows-latest] - os: [ubuntu-latest, macOS-latest] - python-version: ["3.8", "3.9", "3.10"] - steps: - - uses: actions/checkout@v4 - - name: Set env for HEAD SHA - run: - echo "GITHUB_HEAD_SHA=${{ inputs.commit }}" >> $GITHUB_ENV - - name: Create and activate conda env - uses: conda-incubator/setup-miniconda@v3 - with: - python-version: ${{ matrix.python-version }} - environment-file: ${{ inputs.env }} - auto-update-conda: false - auto-activate-base: false - show-channel-urls: true - - name: Build and upload llvmlite - run: | - conda remove --name base conda-anaconda-telemetry - conda install -q -y -c conda-forge conda-build conda-verify anaconda-client; - conda config --set anaconda_upload yes; - conda build --user python-for-hpc --label ${{ inputs.label }} \ - -c python-for-hpc -c conda-forge \ - --python ${{ matrix.python-version }} --token ${{ secrets.ANACONDA_TOKEN }} \ - buildscripts/conda-recipes/llvmlite; - - # Job to deploy numba, matrixed on os and python version. - conda-deploy-numba: - needs: conda-deploy-llvmlite - name: numba ${{ matrix.os }} ${{ matrix.python-version }} + name: pyomp ${{ matrix.os }} ${{ matrix.python-version }} runs-on: ${{ matrix.os }} strategy: matrix: @@ -114,42 +77,6 @@ jobs: python-version: ["3.8", "3.9", "3.10"] steps: - uses: actions/checkout@v4 - - name: Set env for HEAD SHA - run: - echo "GITHUB_HEAD_SHA=${{ inputs.commit }}" >> $GITHUB_ENV - - name: Create and activate conda env - uses: conda-incubator/setup-miniconda@v3 - with: - python-version: ${{ matrix.python-version }} - environment-file: ${{ inputs.env }} - auto-update-conda: false - auto-activate-base: false - show-channel-urls: true - - name: Build and upload numba - run: | - conda remove --name base conda-anaconda-telemetry - conda install -q -y -c conda-forge conda-build conda-verify anaconda-client; - conda config --set anaconda_upload yes; - conda build --user python-for-hpc --label ${{ inputs.label }} \ - -c python-for-hpc -c conda-forge \ - --python ${{ matrix.python-version }} --token ${{ secrets.ANACONDA_TOKEN }} \ - buildscripts/conda-recipes/numba; - - # Job to deploy the pyomp metapackage, runs once as it is independent of the - # python version. 
- conda-deploy-pyomp: - needs: conda-deploy-numba - name: pyomp ${{ matrix.os }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - #os: [ubuntu-latest, macOS-latest, windows-latest] - os: [ubuntu-latest, macOS-latest] - steps: - - uses: actions/checkout@v4 - - name: Set env for HEAD SHA - run: - echo "GITHUB_HEAD_SHA=${{ inputs.commit }}" >> $GITHUB_ENV - name: Create and activate conda env uses: conda-incubator/setup-miniconda@v3 with: @@ -164,7 +91,7 @@ jobs: conda install -q -y -c conda-forge conda-build conda-verify anaconda-client; conda config --set anaconda_upload yes; conda build --user python-for-hpc --label ${{ inputs.label }} \ - -c python-for-hpc -c conda-forge \ + -c python-for-hpc -c conda-forge --python ${{ matrix.python-version }} \ --token ${{ secrets.ANACONDA_TOKEN }} \ buildscripts/conda-recipes/pyomp; @@ -191,58 +118,13 @@ jobs: buildscripts/conda-recipes/llvm-openmp-dev; " - conda-deploy-llvmlite-linux-arm64: + conda-deploy-pyomp-linux-arm64: needs: conda-deploy-llvm-openmp-dev-linux-arm64 - name: llvmlite linux-arm64 - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.8", "3.9", "3.10"] - steps: - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - with: - platforms: linux/arm64 - - name: Deploy llvmlite - run: | - docker run --platform linux/arm64 ubuntu:22.04 /bin/bash -el -c " - ${{ env.SETUP_BASE }} - ${{ env.SETUP_MINICONDA }} - ${{ env.SETUP_REPO }} - conda build --user python-for-hpc --label ${{ inputs.label }} \ - -c python-for-hpc -c conda-forge \ - --python ${{ matrix.python-version}} --token ${{ secrets.ANACONDA_TOKEN }} \ - buildscripts/conda-recipes/llvmlite - " - - conda-deploy-numba-linux-arm64: - needs: conda-deploy-llvmlite-linux-arm64 - name: numba linux-arm64 + name: pyomp linux-arm64 runs-on: ubuntu-latest strategy: matrix: python-version: ["3.8", "3.9", "3.10"] - steps: - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - with: - platforms: linux/arm64 - - name: Deploy numba - run: | - docker run --platform linux/arm64 ubuntu:22.04 /bin/bash -el -c " - ${{ env.SETUP_BASE }} - ${{ env.SETUP_MINICONDA }} - ${{ env.SETUP_REPO }} - conda build --user python-for-hpc --label ${{ inputs.label }} \ - -c python-for-hpc -c conda-forge \ - --python ${{ matrix.python-version}} --token ${{ secrets.ANACONDA_TOKEN }} \ - buildscripts/conda-recipes/numba; - " - - conda-deploy-pyomp-linux-arm64: - needs: conda-deploy-numba-linux-arm64 - name: pyomp linux-arm64 - runs-on: ubuntu-latest steps: - name: Set up QEMU uses: docker/setup-qemu-action@v3 @@ -256,6 +138,7 @@ jobs: ${{ env.SETUP_REPO }} conda build --user python-for-hpc --label ${{ inputs.label }} \ -c python-for-hpc -c conda-forge \ + --python ${{ matrix.python-version}} \ --token ${{ secrets.ANACONDA_TOKEN }} \ buildscripts/conda-recipes/pyomp; " \ No newline at end of file diff --git a/.gitlab/jobs/lassen.yml b/.gitlab/jobs/lassen.yml index b394c9a55898..05e1db7bd6ea 100644 --- a/.gitlab/jobs/lassen.yml +++ b/.gitlab/jobs/lassen.yml @@ -66,21 +66,8 @@ build-llvm-openmp-dev-lassen: variables: PYOMP_CI_BUILD_PKG: "llvm-openmp-dev" -build-llvmlite-lassen: +build-pyomp-lassen: extends: [.base-job, .python-variants] needs: ["build-llvm-openmp-dev-lassen"] - variables: - PYOMP_CI_BUILD_PKG: "llvmlite" - -build-numba-lassen: - extends: [.base-job, .python-variants] - needs: ["build-llvmlite-lassen"] - variables: - PYOMP_CI_BUILD_PKG: "numba" - -build-pyomp-lassen: - extends: .base-job - needs: ["build-numba-lassen"] variables: PYOMP_CI_BUILD_PKG: "pyomp" - diff --git 
a/buildscripts/conda-recipes/llvm-openmp-dev/meta.yaml b/buildscripts/conda-recipes/llvm-openmp-dev/meta.yaml index 7b440465ead0..93df663256ce 100644 --- a/buildscripts/conda-recipes/llvm-openmp-dev/meta.yaml +++ b/buildscripts/conda-recipes/llvm-openmp-dev/meta.yaml @@ -10,11 +10,9 @@ source: build: merge_build_host: False - string: h{{ PKG_HASH }}_{{ (GITHUB_HEAD_SHA | default(''))[:7] ~ (CI_COMMIT_SHA | default(''))[:7] }} + string: h{{ PKG_HASH }} script_env: - - PY_VCRUNTIME_REDIST - - GITHUB_HEAD_SHA - - CI_COMMIT_SHA + - PY_VCRUNTIME_REDIST # [win] requirements: build: diff --git a/buildscripts/gitlab/create-conda-pkgs.sh b/buildscripts/gitlab/create-conda-pkgs.sh index 5c7f30b66cbc..7fe406cc45d1 100755 --- a/buildscripts/gitlab/create-conda-pkgs.sh +++ b/buildscripts/gitlab/create-conda-pkgs.sh @@ -40,14 +40,6 @@ case ${PYOMP_CI_BUILD_PKG} in deploy_conda "llvm-openmp-dev" ;; - "llvmlite") - deploy_conda "llvmlite" - ;; - - "numba") - deploy_conda "numba" - ;; - "pyomp") deploy_conda "pyomp" ;; From d2d888dd199fea957a20902c3eaf356d4d2e0af1 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Tue, 22 Apr 2025 23:31:38 -0700 Subject: [PATCH 06/28] Fix to get commit hash for the pyomp build string --- .github/workflows/build-upload-conda-base.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/build-upload-conda-base.yml b/.github/workflows/build-upload-conda-base.yml index e5713bd04580..3e5119e386ac 100644 --- a/.github/workflows/build-upload-conda-base.yml +++ b/.github/workflows/build-upload-conda-base.yml @@ -77,6 +77,10 @@ jobs: python-version: ["3.8", "3.9", "3.10"] steps: - uses: actions/checkout@v4 + # Checkout the repo with history to get the commit hash for the build + # string. + with: + fetch-depth: 0 - name: Create and activate conda env uses: conda-incubator/setup-miniconda@v3 with: From 21bedc076d7ecece49d3dbdec200d03ac9d53781 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 23 Apr 2025 00:43:30 -0700 Subject: [PATCH 07/28] Add llvmdev dependency in pyomp meta.yaml --- buildscripts/conda-recipes/pyomp/meta.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/buildscripts/conda-recipes/pyomp/meta.yaml b/buildscripts/conda-recipes/pyomp/meta.yaml index 4ee7193abc7e..e1d548a30725 100644 --- a/buildscripts/conda-recipes/pyomp/meta.yaml +++ b/buildscripts/conda-recipes/pyomp/meta.yaml @@ -23,11 +23,13 @@ requirements: - pip - setuptools - numba >=0.57, <0.58 + - llvmdev 14.* run: - numba >=0.57, <0.58 - lark - cffi - llvm-openmp-dev + - llvmdev 14.* about: home: https://github.com/Python-for-HPC/PyOMP From 4233b5a5f0e4b5fb2f1d86eb07b25072b771461b Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 23 Apr 2025 00:59:36 -0700 Subject: [PATCH 08/28] Revert license format in pyomp meta.yaml for backwards compat --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index af4b697973bb..113a1bc2e3f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ version = "0.2.0" description = "Python OpenMP library based on Numba" readme = "README.md" requires-python = ">=3.8, <3.12" -license = "BSD-2-Clause" +license = { text = "BSD 2-Clause" } classifiers = [ "Programming Language :: Python :: 3", "Operating System :: OS Independent", From a02c13522e51327a794d209fc2fb62360fa1fcee Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 23 Apr 2025 08:46:58 -0700 Subject: [PATCH 09/28] Update pyomp meta.yaml for cmake as build dep --- 
buildscripts/conda-recipes/pyomp/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/buildscripts/conda-recipes/pyomp/meta.yaml b/buildscripts/conda-recipes/pyomp/meta.yaml index e1d548a30725..81e93ed4d90a 100644 --- a/buildscripts/conda-recipes/pyomp/meta.yaml +++ b/buildscripts/conda-recipes/pyomp/meta.yaml @@ -18,6 +18,7 @@ requirements: build: - {{ compiler('c') }} - {{ compiler('cxx') }} + - cmake host: - python - pip From 314a5a6d2a3cf9d5b4e90a2efa711318f5564d9b Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 23 Apr 2025 08:47:37 -0700 Subject: [PATCH 10/28] Use the arm64 github runner --- .github/workflows/build-upload-conda-base.yml | 89 ++++++++++--------- 1 file changed, 45 insertions(+), 44 deletions(-) diff --git a/.github/workflows/build-upload-conda-base.yml b/.github/workflows/build-upload-conda-base.yml index 3e5119e386ac..9028d73302a0 100644 --- a/.github/workflows/build-upload-conda-base.yml +++ b/.github/workflows/build-upload-conda-base.yml @@ -44,7 +44,7 @@ jobs: strategy: matrix: #os: [ubuntu-latest, macOS-latest, windows-latest] - os: [ubuntu-latest, macOS-latest] + os: [ubuntu-latest, macOS-latest, ubuntu-24.04-arm] steps: - uses: actions/checkout@v4 - name: Create and activate conda env @@ -102,47 +102,48 @@ jobs: ################################################################ # Deploy on ARM64 using QEMU and a docker container for building. ################################################################ - conda-deploy-llvm-openmp-dev-linux-arm64: - name: llvm-openmp-dev linux-arm64 - runs-on: ubuntu-latest - steps: - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - with: - platforms: linux/arm64 - - name: Deploy llvm-openmp-dev - run: | - docker run --platform linux/arm64 ubuntu:22.04 /bin/bash -xel -c " - ${{ env.SETUP_BASE }} - ${{ env.SETUP_MINICONDA }} - ${{ env.SETUP_REPO }} - conda build --user python-for-hpc --label ${{ inputs.label }} \ - -c python-for-hpc -c conda-forge \ - --token ${{ secrets.ANACONDA_TOKEN }} \ - buildscripts/conda-recipes/llvm-openmp-dev; - " + #conda-deploy-llvm-openmp-dev-linux-arm64: + # name: llvm-openmp-dev linux-arm64 + # runs-on: ubuntu-latest + # steps: + # - name: Set up QEMU + # uses: docker/setup-qemu-action@v3 + # with: + # platforms: linux/arm64 + # - name: Deploy llvm-openmp-dev + # run: | + # docker run --platform linux/arm64 ubuntu:22.04 /bin/bash -xel -c " + # ${{ env.SETUP_BASE }} + # ${{ env.SETUP_MINICONDA }} + # ${{ env.SETUP_REPO }} + # conda build --user python-for-hpc --label ${{ inputs.label }} \ + # -c python-for-hpc -c conda-forge \ + # --token ${{ secrets.ANACONDA_TOKEN }} \ + # buildscripts/conda-recipes/llvm-openmp-dev; + # " - conda-deploy-pyomp-linux-arm64: - needs: conda-deploy-llvm-openmp-dev-linux-arm64 - name: pyomp linux-arm64 - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.8", "3.9", "3.10"] - steps: - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - with: - platforms: linux/arm64 - - name: Deploy pyomp - run: | - docker run --platform linux/arm64 ubuntu:22.04 /bin/bash -el -c " - ${{ env.SETUP_BASE }} - ${{ env.SETUP_MINICONDA }} - ${{ env.SETUP_REPO }} - conda build --user python-for-hpc --label ${{ inputs.label }} \ - -c python-for-hpc -c conda-forge \ - --python ${{ matrix.python-version}} \ - --token ${{ secrets.ANACONDA_TOKEN }} \ - buildscripts/conda-recipes/pyomp; - " \ No newline at end of file + #conda-deploy-pyomp-linux-arm64: + # needs: conda-deploy-llvm-openmp-dev-linux-arm64 + # name: pyomp linux-arm64 + # runs-on: 
ubuntu-latest + # strategy: + # matrix: + # python-version: ["3.8", "3.9", "3.10"] + # steps: + # - name: Set up QEMU + # uses: docker/setup-qemu-action@v3 + # with: + # platforms: linux/arm64 + # - name: Deploy pyomp + # run: | + # docker run --platform linux/arm64 ubuntu:22.04 /bin/bash -el -c " + # ${{ env.SETUP_BASE }} + # ${{ env.SETUP_MINICONDA }} + # ${{ env.SETUP_REPO }} + # conda build --user python-for-hpc --label ${{ inputs.label }} \ + # -c python-for-hpc -c conda-forge \ + # --python ${{ matrix.python-version}} \ + # --token ${{ secrets.ANACONDA_TOKEN }} \ + # buildscripts/conda-recipes/pyomp; + # " + \ No newline at end of file From 5d761345913fc440e2ad55b73fa23bef7a553ba1 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 23 Apr 2025 09:01:51 -0700 Subject: [PATCH 11/28] Trigger CI [run gitlab ci] From daebe46c59717ddc56b2a37a69ea49a7f4822ad2 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 23 Apr 2025 10:01:53 -0700 Subject: [PATCH 12/28] Cleanup github workflow for conda deployment --- .github/workflows/build-upload-conda-base.yml | 80 ++----------------- 1 file changed, 5 insertions(+), 75 deletions(-) diff --git a/.github/workflows/build-upload-conda-base.yml b/.github/workflows/build-upload-conda-base.yml index 9028d73302a0..4b1a5f43b454 100644 --- a/.github/workflows/build-upload-conda-base.yml +++ b/.github/workflows/build-upload-conda-base.yml @@ -13,28 +13,6 @@ on: required: true type: string -# Rembember: you have to escape characters like $ with \$. -env: - SETUP_BASE: | - apt-get -qq update > /dev/null; - apt-get -qq upgrade > /dev/null; - apt-get -qq install wget git > /dev/null; - useradd -ms /bin/bash pyompuser; - su pyompuser; - cd /home/pyompuser; - SETUP_MINICONDA: | - wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-\$(uname -m).sh -O miniconda.sh; - bash miniconda.sh -b -u -p ./miniconda3; - rm -rf miniconda.sh; - export PATH=/home/pyompuser/miniconda3/bin:\${PATH}; - conda remove --name base conda-anaconda-telemetry - conda install -q -y -c conda-forge conda-build conda-verify anaconda-client; - conda config --set anaconda_upload yes; - SETUP_REPO: | - git clone https://github.com/Python-for-HPC/PyOMP; - cd PyOMP; - git checkout ${{ inputs.commit }}; - jobs: # Job to deploy llvm-openmp-dev, runs once as it is independent of the python # version. @@ -43,7 +21,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - #os: [ubuntu-latest, macOS-latest, windows-latest] + # TODO: Add windows. os: [ubuntu-latest, macOS-latest, ubuntu-24.04-arm] steps: - uses: actions/checkout@v4 @@ -72,8 +50,8 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - #os: [ubuntu-latest, macOS-latest, windows-latest] - os: [ubuntu-latest, macOS-latest] + # TODO: Add windows. + os: [ubuntu-latest, macOS-latest, ubuntu-24.04-arm] python-version: ["3.8", "3.9", "3.10"] steps: - uses: actions/checkout@v4 @@ -95,55 +73,7 @@ jobs: conda install -q -y -c conda-forge conda-build conda-verify anaconda-client; conda config --set anaconda_upload yes; conda build --user python-for-hpc --label ${{ inputs.label }} \ - -c python-for-hpc -c conda-forge --python ${{ matrix.python-version }} \ + -c python-for-hpc -c conda-forge \ + --python ${{ matrix.python-version }} \ --token ${{ secrets.ANACONDA_TOKEN }} \ buildscripts/conda-recipes/pyomp; - - ################################################################ - # Deploy on ARM64 using QEMU and a docker container for building. 
- ################################################################ - #conda-deploy-llvm-openmp-dev-linux-arm64: - # name: llvm-openmp-dev linux-arm64 - # runs-on: ubuntu-latest - # steps: - # - name: Set up QEMU - # uses: docker/setup-qemu-action@v3 - # with: - # platforms: linux/arm64 - # - name: Deploy llvm-openmp-dev - # run: | - # docker run --platform linux/arm64 ubuntu:22.04 /bin/bash -xel -c " - # ${{ env.SETUP_BASE }} - # ${{ env.SETUP_MINICONDA }} - # ${{ env.SETUP_REPO }} - # conda build --user python-for-hpc --label ${{ inputs.label }} \ - # -c python-for-hpc -c conda-forge \ - # --token ${{ secrets.ANACONDA_TOKEN }} \ - # buildscripts/conda-recipes/llvm-openmp-dev; - # " - - #conda-deploy-pyomp-linux-arm64: - # needs: conda-deploy-llvm-openmp-dev-linux-arm64 - # name: pyomp linux-arm64 - # runs-on: ubuntu-latest - # strategy: - # matrix: - # python-version: ["3.8", "3.9", "3.10"] - # steps: - # - name: Set up QEMU - # uses: docker/setup-qemu-action@v3 - # with: - # platforms: linux/arm64 - # - name: Deploy pyomp - # run: | - # docker run --platform linux/arm64 ubuntu:22.04 /bin/bash -el -c " - # ${{ env.SETUP_BASE }} - # ${{ env.SETUP_MINICONDA }} - # ${{ env.SETUP_REPO }} - # conda build --user python-for-hpc --label ${{ inputs.label }} \ - # -c python-for-hpc -c conda-forge \ - # --python ${{ matrix.python-version}} \ - # --token ${{ secrets.ANACONDA_TOKEN }} \ - # buildscripts/conda-recipes/pyomp; - # " - \ No newline at end of file From ce9f26f59493395593f1cb5e74d21e6580ee0786 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 23 Apr 2025 10:02:46 -0700 Subject: [PATCH 13/28] Update gitlab ci - Remove need for trigger ([run gitlab ci] to run - Use per job conda pkgs dir to avoid conflicts with multiple runners --- .gitlab-ci.yml | 2 -- .gitlab/subscribed-pipelines.yml | 2 -- buildscripts/gitlab/create-conda-pkgs.sh | 4 ++++ 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c73c68fc9558..851416009774 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -85,8 +85,6 @@ stages: strategy: depend forward: pipeline_variables: true - rules: - - if: ($CI_COMMIT_MESSAGE =~ /\[run gitlab ci\]/ || $CI_COMMIT_TAG) include: # Sets ID tokens for every job using `default:` diff --git a/.gitlab/subscribed-pipelines.yml b/.gitlab/subscribed-pipelines.yml index 714f1d78af49..265a344ba850 100644 --- a/.gitlab/subscribed-pipelines.yml +++ b/.gitlab/subscribed-pipelines.yml @@ -23,8 +23,6 @@ --data "{ \"state\": \"failure\", \"target_url\": \"${CI_PIPELINE_URL}\", \"description\": \"GitLab ${CI_MACHINE} down\", \"context\": \"ci/gitlab/${CI_MACHINE}\" }" exit 1 fi - rules: - - if: ($CI_COMMIT_MESSAGE =~ /\[run gitlab ci\]/ || $CI_COMMIT_TAG) ### # Trigger a build-and-test pipeline for a machine. diff --git a/buildscripts/gitlab/create-conda-pkgs.sh b/buildscripts/gitlab/create-conda-pkgs.sh index 7fe406cc45d1..a143f7e6d8a9 100755 --- a/buildscripts/gitlab/create-conda-pkgs.sh +++ b/buildscripts/gitlab/create-conda-pkgs.sh @@ -7,6 +7,10 @@ else LABEL="test" fi +# Set pkg dir per job to avoid conflicts. 
+export CONDA_PKGS_DIRS=/tmp/ggeorgak/conda-pkgs-${CI_JOB_ID} +mkdir -p "$CONDA_PKGS_DIRS" + function deploy_conda() { PKG="${1}" From ebad638f757205676e58c94cd5b7a6970fc871f1 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 23 Apr 2025 11:19:54 -0700 Subject: [PATCH 14/28] Add tests --- buildscripts/conda-recipes/pyomp/run_test.sh | 61 ++++++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 buildscripts/conda-recipes/pyomp/run_test.sh diff --git a/buildscripts/conda-recipes/pyomp/run_test.sh b/buildscripts/conda-recipes/pyomp/run_test.sh new file mode 100644 index 000000000000..d14a16db5ce1 --- /dev/null +++ b/buildscripts/conda-recipes/pyomp/run_test.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +set -e + +export NUMBA_DEVELOPER_MODE=1 +export NUMBA_DISABLE_ERROR_MESSAGE_HIGHLIGHTING=1 +export NUMBA_CAPTURED_ERRORS="new_style" +export PYTHONFAULTHANDLER=1 + +# Disable NumPy dispatching to AVX512_SKX feature extensions if the chip is +# reported to support the feature and NumPy >= 1.22 as this results in the use +# of low accuracy SVML libm replacements in ufunc loops. +_NPY_CMD='from numba.misc import numba_sysinfo;\ + sysinfo=numba_sysinfo.get_sysinfo();\ + print(sysinfo["NumPy AVX512_SKX detected"] and + sysinfo["NumPy Version"]>="1.22")' +NUMPY_DETECTS_AVX512_SKX_NP_GT_122=$(python -c "$_NPY_CMD") +echo "NumPy >= 1.22 with AVX512_SKX detected: $NUMPY_DETECTS_AVX512_SKX_NP_GT_122" + +if [[ "$NUMPY_DETECTS_AVX512_SKX_NP_GT_122" == "True" ]]; then + export NPY_DISABLE_CPU_FEATURES="AVX512_SKX" +fi + +unamestr=`uname` +if [[ "$unamestr" == 'Linux' ]]; then + # Test if catchsegv exists, not by default in recent libc. + if catchsegv --version; then + SEGVCATCH=catchsegv + else + SEGVCATCH="" + fi +elif [[ "$unamestr" == 'Darwin' ]]; then + SEGVCATCH="" +else + echo Error +fi + +# Run OpenMP tests in a single-process since they use multiple cores by +# multi-threading. Using multiple processes for testing will very probably slow +# things down. +# XXX: Using -m $TEST_NPROCS, even if with 1 process, hangs on github runners +# when running the full testsuite, while individual tests pass. This requires +# more investigation. Some observations: 1) running the full test suite creates +# new threads for each region, the old ones are blocked in a futex for +# destruction, 2) it is possible that in small github runners this starves cpu +# time, 3) there may be implications with "-m 1" vs. no flag on how the runtime +# library is inited/de-inited. + +echo "=> Run OpenMP CPU parallelism tests" +echo "=> Running: TEST_DEVICES=0 RUN_TARGET=0 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp" +# TODO: remove requiring the unused TEST_DEVICES. 
+TEST_DEVICES=0 RUN_TARGET=0 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp 2>&1 + +echo "=> Run OpenMP offloading tests on CPU (device 1)" +echo "=> Running: TEST_DEVICES=1 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget" +TEST_DEVICES=1 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget 2>&1 +if nvidia-smi --list-gpus; then + echo "=> Found NVIDIA GPU, Run OpenMP offloading tests on GPU (device 0)" + echo "=> Running: TEST_DEVICES=0 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget" + TEST_DEVICES=0 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget 2>&1 +fi diff --git a/pyproject.toml b/pyproject.toml index 113a1bc2e3f4..66178c9b56a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,4 +23,4 @@ Homepage = "https://github.com/Python-for-HPC/PyOMP" Issues = "https://github.com/Python-for-HPC/PyOMP/issues" [tool.setuptools] -packages = ["numba.openmp"] +packages = ["numba.openmp", "numba.openmp.libs", "numba.openmp.tests"] From 76b66ce1a531b22ad4cd5f287bf0c31abd58e947 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 23 Apr 2025 11:37:19 -0700 Subject: [PATCH 15/28] Remove numba.openmp.libs - It is created at build time, hence breaks the toml unless removed --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 66178c9b56a4..7316153ba0e8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,4 +23,4 @@ Homepage = "https://github.com/Python-for-HPC/PyOMP" Issues = "https://github.com/Python-for-HPC/PyOMP/issues" [tool.setuptools] -packages = ["numba.openmp", "numba.openmp.libs", "numba.openmp.tests"] +packages = ["numba.openmp", "numba.openmp.tests"] From f3b25b84b15e29e2f4de4126ea0e5627ac1cc7e9 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 23 Apr 2025 19:13:48 -0700 Subject: [PATCH 16/28] Debug --- buildscripts/conda-recipes/pyomp/run_test.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/buildscripts/conda-recipes/pyomp/run_test.sh b/buildscripts/conda-recipes/pyomp/run_test.sh index d14a16db5ce1..3adc734f119e 100644 --- a/buildscripts/conda-recipes/pyomp/run_test.sh +++ b/buildscripts/conda-recipes/pyomp/run_test.sh @@ -59,3 +59,7 @@ if nvidia-smi --list-gpus; then echo "=> Running: TEST_DEVICES=0 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget" TEST_DEVICES=0 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget 2>&1 fi + +opt_path=$(which opt) +echo "OPT_PATH ${opt_path}" +opt --version From 9f21ac2326824f64ac78ad056267ddd21f04432d Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 23 Apr 2025 19:23:31 -0700 Subject: [PATCH 17/28] Debug --- buildscripts/conda-recipes/pyomp/run_test.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/buildscripts/conda-recipes/pyomp/run_test.sh b/buildscripts/conda-recipes/pyomp/run_test.sh index 3adc734f119e..190fd359596e 100644 --- a/buildscripts/conda-recipes/pyomp/run_test.sh +++ b/buildscripts/conda-recipes/pyomp/run_test.sh @@ -35,6 +35,10 @@ else echo Error fi +opt_path=$(which opt) +echo "OPT_PATH ${opt_path}" +opt --version + # Run OpenMP tests in a single-process since they use multiple cores by # multi-threading. 
Using multiple processes for testing will very probably slow # things down. @@ -60,6 +64,3 @@ if nvidia-smi --list-gpus; then TEST_DEVICES=0 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget 2>&1 fi -opt_path=$(which opt) -echo "OPT_PATH ${opt_path}" -opt --version From e7e31ec301d3130cea4ed193d9f9bfe80b8cd3df Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Thu, 24 Apr 2025 09:17:01 -0700 Subject: [PATCH 18/28] Refactor to fix packaging and cleanup - Move nrt static and the llvm plugin pass under libs subdirectory - Rename libbundle to libnrt_static - Fix setup.py to move build libraries in wheel-used build directory --- buildscripts/conda-recipes/pyomp/run_test.sh | 5 -- numba/openmp/__init__.py | 2 +- numba/openmp/{ => libs}/nrt/init.c | 0 .../{ => libs}/pass/CGIntrinsicsOpenMP.cpp | 0 .../{ => libs}/pass/CGIntrinsicsOpenMP.h | 0 numba/openmp/{ => libs}/pass/CMakeLists.txt | 0 numba/openmp/{ => libs}/pass/DebugOpenMP.cpp | 0 numba/openmp/{ => libs}/pass/DebugOpenMP.h | 0 .../{ => libs}/pass/IntrinsicsOpenMP.cpp | 0 .../openmp/{ => libs}/pass/IntrinsicsOpenMP.h | 0 .../{ => libs}/pass/IntrinsicsOpenMP_CAPI.h | 0 pyproject.toml | 4 +- setup.py | 57 ++++++++++--------- 13 files changed, 34 insertions(+), 34 deletions(-) rename numba/openmp/{ => libs}/nrt/init.c (100%) rename numba/openmp/{ => libs}/pass/CGIntrinsicsOpenMP.cpp (100%) rename numba/openmp/{ => libs}/pass/CGIntrinsicsOpenMP.h (100%) rename numba/openmp/{ => libs}/pass/CMakeLists.txt (100%) rename numba/openmp/{ => libs}/pass/DebugOpenMP.cpp (100%) rename numba/openmp/{ => libs}/pass/DebugOpenMP.h (100%) rename numba/openmp/{ => libs}/pass/IntrinsicsOpenMP.cpp (100%) rename numba/openmp/{ => libs}/pass/IntrinsicsOpenMP.h (100%) rename numba/openmp/{ => libs}/pass/IntrinsicsOpenMP_CAPI.h (100%) diff --git a/buildscripts/conda-recipes/pyomp/run_test.sh b/buildscripts/conda-recipes/pyomp/run_test.sh index 190fd359596e..d14a16db5ce1 100644 --- a/buildscripts/conda-recipes/pyomp/run_test.sh +++ b/buildscripts/conda-recipes/pyomp/run_test.sh @@ -35,10 +35,6 @@ else echo Error fi -opt_path=$(which opt) -echo "OPT_PATH ${opt_path}" -opt --version - # Run OpenMP tests in a single-process since they use multiple cores by # multi-threading. Using multiple processes for testing will very probably slow # things down. @@ -63,4 +59,3 @@ if nvidia-smi --list-gpus; then echo "=> Running: TEST_DEVICES=0 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget" TEST_DEVICES=0 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget 2>&1 fi - diff --git a/numba/openmp/__init__.py b/numba/openmp/__init__.py index 52e31d4a45c4..1ceeb87bef08 100644 --- a/numba/openmp/__init__.py +++ b/numba/openmp/__init__.py @@ -2628,7 +2628,7 @@ def prepend_device_to_func_name(outlined_ir): # Do whole archive to include all symbols, esp. for the # PyOMP_NRT_Init constructor. 
"--whole-archive", - libpath / "libbundle.a", + libpath / "libnrt_static.a", "--no-whole-archive", "-o", filename_so, diff --git a/numba/openmp/nrt/init.c b/numba/openmp/libs/nrt/init.c similarity index 100% rename from numba/openmp/nrt/init.c rename to numba/openmp/libs/nrt/init.c diff --git a/numba/openmp/pass/CGIntrinsicsOpenMP.cpp b/numba/openmp/libs/pass/CGIntrinsicsOpenMP.cpp similarity index 100% rename from numba/openmp/pass/CGIntrinsicsOpenMP.cpp rename to numba/openmp/libs/pass/CGIntrinsicsOpenMP.cpp diff --git a/numba/openmp/pass/CGIntrinsicsOpenMP.h b/numba/openmp/libs/pass/CGIntrinsicsOpenMP.h similarity index 100% rename from numba/openmp/pass/CGIntrinsicsOpenMP.h rename to numba/openmp/libs/pass/CGIntrinsicsOpenMP.h diff --git a/numba/openmp/pass/CMakeLists.txt b/numba/openmp/libs/pass/CMakeLists.txt similarity index 100% rename from numba/openmp/pass/CMakeLists.txt rename to numba/openmp/libs/pass/CMakeLists.txt diff --git a/numba/openmp/pass/DebugOpenMP.cpp b/numba/openmp/libs/pass/DebugOpenMP.cpp similarity index 100% rename from numba/openmp/pass/DebugOpenMP.cpp rename to numba/openmp/libs/pass/DebugOpenMP.cpp diff --git a/numba/openmp/pass/DebugOpenMP.h b/numba/openmp/libs/pass/DebugOpenMP.h similarity index 100% rename from numba/openmp/pass/DebugOpenMP.h rename to numba/openmp/libs/pass/DebugOpenMP.h diff --git a/numba/openmp/pass/IntrinsicsOpenMP.cpp b/numba/openmp/libs/pass/IntrinsicsOpenMP.cpp similarity index 100% rename from numba/openmp/pass/IntrinsicsOpenMP.cpp rename to numba/openmp/libs/pass/IntrinsicsOpenMP.cpp diff --git a/numba/openmp/pass/IntrinsicsOpenMP.h b/numba/openmp/libs/pass/IntrinsicsOpenMP.h similarity index 100% rename from numba/openmp/pass/IntrinsicsOpenMP.h rename to numba/openmp/libs/pass/IntrinsicsOpenMP.h diff --git a/numba/openmp/pass/IntrinsicsOpenMP_CAPI.h b/numba/openmp/libs/pass/IntrinsicsOpenMP_CAPI.h similarity index 100% rename from numba/openmp/pass/IntrinsicsOpenMP_CAPI.h rename to numba/openmp/libs/pass/IntrinsicsOpenMP_CAPI.h diff --git a/pyproject.toml b/pyproject.toml index 7316153ba0e8..9842df1f018c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools>=77.0", "wheel", "numba>=0.57, <0.58", "cmake>=3.20"] +requires = ["setuptools>=77.0.3", "wheel", "numba>=0.57, <0.58", "cmake>=3.20"] build-backend = "setuptools.build_meta" [project] @@ -8,7 +8,7 @@ version = "0.2.0" description = "Python OpenMP library based on Numba" readme = "README.md" requires-python = ">=3.8, <3.12" -license = { text = "BSD 2-Clause" } +license = "BSD-2-Clause" classifiers = [ "Programming Language :: Python :: 3", "Operating System :: OS Independent", diff --git a/setup.py b/setup.py index d854b277c4c0..d7fa0025120d 100644 --- a/setup.py +++ b/setup.py @@ -1,29 +1,19 @@ -# setup.py -import os import numba import sysconfig import subprocess import shutil import numpy as np -from pathlib import Path from setuptools import setup, Extension from setuptools.command.build_ext import build_ext from setuptools.command.build_clib import build_clib -temp_dir = Path("numba/openmp/nrt/numba_src") - -bundle_lib = ( - "bundle", +nrt_static = ( + "nrt_static", { + # We extend those sources with the ones from the numba tree. 
"sources": [ - "numba/openmp/nrt/init.c", - f"{temp_dir}/_helpermod.c", - f"{temp_dir}/cext/utils.c", - f"{temp_dir}/cext/dictobject.c", - f"{temp_dir}/cext/listobject.c", - f"{temp_dir}/core/runtime/_nrt_pythonmod.c", - f"{temp_dir}/core/runtime/nrt.cpp", + "numba/openmp/libs/nrt/init.c", ], "include_dirs": [ sysconfig.get_paths()["include"], @@ -33,15 +23,15 @@ ) -class BuildStaticNRTBundle(build_clib): +class BuildStaticNRT(build_clib): def finalize_options(self): super().finalize_options() - # Copy numba tree installation to the build directory for building the + # Copy numba tree installation to the temp directory for building the # static library using relative paths. numba_dir = numba.__path__[0] shutil.copytree( numba_dir, - temp_dir, + f"{self.build_temp}/numba_src", ignore=shutil.ignore_patterns( "*.py", "*.pyc", @@ -52,13 +42,28 @@ def finalize_options(self): dirs_exist_ok=True, ) - self.build_clib = "numba/openmp/libs" + libname, build_info = self.libraries[0] + if libname != "nrt_static": + raise Exception("Expected library name 'nrt_static'") + if len(self.libraries) != 1: + raise Exception("Expected only the `nrt_static' library in the list") - def run(self): - super().run() + sources = build_info["sources"] + sources.extend( + [ + f"{self.build_temp}/numba_src/_helpermod.c", + f"{self.build_temp}/numba_src/cext/utils.c", + f"{self.build_temp}/numba_src/cext/dictobject.c", + f"{self.build_temp}/numba_src/cext/listobject.c", + f"{self.build_temp}/numba_src/core/runtime/_nrt_pythonmod.c", + f"{self.build_temp}/numba_src/core/runtime/nrt.cpp", + ] + ) - # Clean up files after build is completed. - shutil.rmtree(temp_dir, ignore_errors=True) + # Get build_lib directory from the 'build' command. + build_cmd = self.get_finalized_command("build") + # Build the static library in the wheel output build directory. 
+ self.build_clib = f"{build_cmd.build_lib}/numba/openmp/libs" class CMakeExtension(Extension): @@ -89,7 +94,7 @@ def build_cmake(self, ext): "-B", self.build_temp, "-DCMAKE_BUILD_TYPE=Release", - "-DCMAKE_INSTALL_PREFIX=numba/openmp/libs", + f"-DCMAKE_INSTALL_PREFIX={self.build_lib}/numba/openmp/libs", ], check=True, ) @@ -102,10 +107,10 @@ def build_cmake(self, ext): setup( - libraries=[bundle_lib], - ext_modules=[CMakeExtension("libIntrinsicsOpenMP", "numba/openmp/pass")], + libraries=[nrt_static], + ext_modules=[CMakeExtension("libIntrinsicsOpenMP", "numba/openmp/libs/pass")], cmdclass={ - "build_clib": BuildStaticNRTBundle, + "build_clib": BuildStaticNRT, "build_ext": BuildIntrinsicsOpenMPPass, }, ) From 6aeecfded82fe6f9bd167b175a1ddac525f45a34 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Thu, 24 Apr 2025 09:46:18 -0700 Subject: [PATCH 19/28] Change again pyproject license for old format --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9842df1f018c..fef5f6064cd9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ version = "0.2.0" description = "Python OpenMP library based on Numba" readme = "README.md" requires-python = ">=3.8, <3.12" -license = "BSD-2-Clause" +license = { text = "BSD-2-Clause" } classifiers = [ "Programming Language :: Python :: 3", "Operating System :: OS Independent", From 2c455b75da6a9021a0c04a3fdd42ac7409088404 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Thu, 24 Apr 2025 10:09:01 -0700 Subject: [PATCH 20/28] Do not load libomptarget if there's no gpu to avoid error --- numba/openmp/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/numba/openmp/__init__.py b/numba/openmp/__init__.py index 1ceeb87bef08..9613f78bd58b 100644 --- a/numba/openmp/__init__.py +++ b/numba/openmp/__init__.py @@ -480,6 +480,10 @@ def _init(): if sys_platform.startswith("darwin") or sys_platform.startswith("win32"): return + # libomptarget errors out if gpu is not available. + if not numba_cuda.is_available(): + return + omptargetlib = llvm_libpath + "/libomptarget.so" if DEBUG_OPENMP >= 1: print("Found OpenMP target runtime library at", omptargetlib) From c44a9d423ed14ef531cd3b958c406528bd485b74 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Thu, 24 Apr 2025 11:08:37 -0700 Subject: [PATCH 21/28] Fix map_clause parsing --- numba/openmp/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/numba/openmp/__init__.py b/numba/openmp/__init__.py index 9613f78bd58b..7baefacb8ecb 100644 --- a/numba/openmp/__init__.py +++ b/numba/openmp/__init__.py @@ -5248,8 +5248,9 @@ def map_clause(self, args): var_list = args[1] assert len(args) == 2 else: - map_type = "TOFROM" # is this default right? FIX ME - var_list = args[0] + # TODO: is this default right? + map_type = "TOFROM" + var_list = args[1] ret = [] for var in var_list: ret.append(openmp_tag("QUAL.OMP.MAP." + map_type, var)) From c0f5def1fb22ae047a999af2e8d6614e42824e7f Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Thu, 24 Apr 2025 12:25:53 -0700 Subject: [PATCH 22/28] Revert "Do not load libomptarget if there's no gpu to avoid error" This reverts commit 2c455b75da6a9021a0c04a3fdd42ac7409088404. 
--- numba/openmp/__init__.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/numba/openmp/__init__.py b/numba/openmp/__init__.py index 7baefacb8ecb..bd94bcd09913 100644 --- a/numba/openmp/__init__.py +++ b/numba/openmp/__init__.py @@ -480,10 +480,6 @@ def _init(): if sys_platform.startswith("darwin") or sys_platform.startswith("win32"): return - # libomptarget errors out if gpu is not available. - if not numba_cuda.is_available(): - return - omptargetlib = llvm_libpath + "/libomptarget.so" if DEBUG_OPENMP >= 1: print("Found OpenMP target runtime library at", omptargetlib) From b85eab28d2bc0b4dbf82a280b504ed411ab619b5 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Thu, 24 Apr 2025 14:42:07 -0700 Subject: [PATCH 23/28] Use the compiler driver to create shared lib of host offload targets - Avoids missing symbols errors that libomptarget host RTL expects --- numba/openmp/__init__.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/numba/openmp/__init__.py b/numba/openmp/__init__.py index bd94bcd09913..cfc7e57a40ed 100644 --- a/numba/openmp/__init__.py +++ b/numba/openmp/__init__.py @@ -2622,14 +2622,16 @@ def prepend_device_to_func_name(outlined_ir): subprocess.run( [ - "ld", + # Use the compiler driver to create the shared library + # and avoid missing symbols. + "c++", "-shared", filename_o, # Do whole archive to include all symbols, esp. for the # PyOMP_NRT_Init constructor. - "--whole-archive", + "-Wl,--whole-archive", libpath / "libnrt_static.a", - "--no-whole-archive", + "-Wl,--no-whole-archive", "-o", filename_so, ], From d64b9c9317ad2b67638a17d9e474ff9c1ebc6f87 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Thu, 24 Apr 2025 17:29:53 -0700 Subject: [PATCH 24/28] Clone with full history for versioning in gitlab ci --- buildscripts/gitlab/create-conda-pkgs.sh | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/buildscripts/gitlab/create-conda-pkgs.sh b/buildscripts/gitlab/create-conda-pkgs.sh index a143f7e6d8a9..71088faed009 100755 --- a/buildscripts/gitlab/create-conda-pkgs.sh +++ b/buildscripts/gitlab/create-conda-pkgs.sh @@ -7,6 +7,18 @@ else LABEL="test" fi +# Create a temporary directory for the build to clone the full repo for package +# versioning. +TMPDIR=/tmp/ggeorgak/${CI_JOB_ID} +mkdir -p ${TMPDIR} +pushd ${TMPDIR} + +# Clone and fetch the commit with history for package versioning. +git clone https://github.com/${GITHUB_PROJECT_ORG}/${GITHUB_PROJECT_NAME}.git --single-branch +cd ${GITHUB_PROJECT_NAME} +git fetch origin ${CI_COMMIT_SHA} +git checkout ${CI_COMMIT_SHA} + # Set pkg dir per job to avoid conflicts. 
export CONDA_PKGS_DIRS=/tmp/ggeorgak/conda-pkgs-${CI_JOB_ID} mkdir -p "$CONDA_PKGS_DIRS" @@ -22,13 +34,13 @@ function deploy_conda() { export CONDA_BLD_PATH="/tmp/ggeorgak/conda-build-${PYOMP_CI_BUILD_PKG}-noarch" conda build --no-lock --no-locking --user python-for-hpc --label ${LABEL} \ -c python-for-hpc/label/${LABEL} -c conda-forge \ - ${CI_PROJECT_DIR}/buildscripts/conda-recipes/${PKG} + buildscripts/conda-recipes/${PKG} else export CONDA_BLD_PATH="/tmp/ggeorgak/conda-build-${PYOMP_CI_BUILD_PKG}-${PYOMP_CI_PYTHON_VERSION}" conda build --no-lock --no-locking --user python-for-hpc --label ${LABEL} \ -c python-for-hpc/label/${LABEL} -c conda-forge \ --python ${PYOMP_CI_PYTHON_VERSION} \ - ${CI_PROJECT_DIR}/buildscripts/conda-recipes/${PKG} + buildscripts/conda-recipes/${PKG} fi rm -rf ${CONDA_BLD_PATH} @@ -55,3 +67,4 @@ case ${PYOMP_CI_BUILD_PKG} in esac +popd From 5873d1540b114dd4913bc8f6d8e043137fce1d85 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Thu, 24 Apr 2025 18:09:04 -0700 Subject: [PATCH 25/28] Simplify github workflows - Remove unused commit input - Add paths on pull_request event --- .github/workflows/build-upload-conda-base.yml | 3 --- .github/workflows/build-upload-conda-test.yml | 8 ++++++-- .github/workflows/build-upload-conda.yml | 3 +-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build-upload-conda-base.yml b/.github/workflows/build-upload-conda-base.yml index 4b1a5f43b454..e66950aa1bf1 100644 --- a/.github/workflows/build-upload-conda-base.yml +++ b/.github/workflows/build-upload-conda-base.yml @@ -6,9 +6,6 @@ on: label: required: true type: string - commit: - required: true - type: string env: required: true type: string diff --git a/.github/workflows/build-upload-conda-test.yml b/.github/workflows/build-upload-conda-test.yml index e4f17f9d0d8d..9f26990014e4 100644 --- a/.github/workflows/build-upload-conda-test.yml +++ b/.github/workflows/build-upload-conda-test.yml @@ -2,6 +2,11 @@ name: Deploy conda pkgs (test) on: pull_request: + paths: + - "buildscripts/conda-recipes/**" + - ".github/workflows/build-upload-conda-test.yml" + - ".github/workflows/build-upload-conda-base.yml" + - "numba/**" workflow_dispatch: jobs: @@ -9,6 +14,5 @@ jobs: uses: ./.github/workflows/build-upload-conda-base.yml with: label: test - commit: ${{ github.event.pull_request.head.sha }} env: .github/workflows/envs/env-test.yml - secrets: inherit \ No newline at end of file + secrets: inherit diff --git a/.github/workflows/build-upload-conda.yml b/.github/workflows/build-upload-conda.yml index 565be3389f25..96e69538316b 100644 --- a/.github/workflows/build-upload-conda.yml +++ b/.github/workflows/build-upload-conda.yml @@ -10,10 +10,9 @@ jobs: uses: ./.github/workflows/build-upload-conda-base.yml with: label: main - commit: ${{ github.sha }} env: .github/workflows/envs/env.yml secrets: inherit deploy-containers: needs: deploy-conda uses: ./.github/workflows/build-containers.yml - secrets: inherit \ No newline at end of file + secrets: inherit From fd9df32ae9e8ddfacc45ca8b9b3415114f5c3e36 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Fri, 25 Apr 2025 06:12:03 -0700 Subject: [PATCH 26/28] Update examples and code --- buildscripts/containers/examples/hello.py | 5 +- buildscripts/containers/examples/pi_loop.py | 6 +- buildscripts/containers/examples/pi_spmd.py | 19 ++- buildscripts/containers/examples/pi_task.py | 32 +++-- docs/source/usage.rst | 4 +- examples/TestDataEnv.py | 112 ++++++++-------- examples/dgemm_ompGPU.py | 137 
+++++++++++--------- examples/piParLoopGPU_BUD.py | 18 +-- examples/piParLoopGPU_BUD_combined.py | 16 ++- examples/piParLoopGPU_loop.py | 16 ++- examples/pi_loop.py | 6 +- examples/pi_spmd.py | 19 ++- examples/pi_task.py | 30 +++-- 13 files changed, 243 insertions(+), 177 deletions(-) diff --git a/buildscripts/containers/examples/hello.py b/buildscripts/containers/examples/hello.py index 11de2aa5f76e..e4f2f93f3c14 100644 --- a/buildscripts/containers/examples/hello.py +++ b/buildscripts/containers/examples/hello.py @@ -1,9 +1,12 @@ -from numba import njit +from numba.openmp import njit from numba.openmp import openmp_context as openmp from numba.openmp import omp_get_thread_num, omp_get_num_threads + @njit def hello(): with openmp("parallel"): print("Hello thread", omp_get_thread_num(), " of ", omp_get_num_threads()) + + hello() diff --git a/buildscripts/containers/examples/pi_loop.py b/buildscripts/containers/examples/pi_loop.py index 39f52d18db3a..356f2a1d2d69 100644 --- a/buildscripts/containers/examples/pi_loop.py +++ b/buildscripts/containers/examples/pi_loop.py @@ -1,6 +1,7 @@ -from numba import njit +from numba.openmp import njit from numba.openmp import openmp_context as openmp + @njit def calc_pi(): num_steps = 100000 @@ -10,10 +11,11 @@ def calc_pi(): with openmp("parallel for reduction(+:the_sum) schedule(static)"): for j in range(num_steps): c = step - x = ((j-1) - 0.5) * step + x = ((j - 1) - 0.5) * step the_sum += 4.0 / (1.0 + x * x) pi = step * the_sum return pi + print("pi =", calc_pi()) diff --git a/buildscripts/containers/examples/pi_spmd.py b/buildscripts/containers/examples/pi_spmd.py index 38e582b81967..9a6775c33cdc 100644 --- a/buildscripts/containers/examples/pi_spmd.py +++ b/buildscripts/containers/examples/pi_spmd.py @@ -1,14 +1,20 @@ -from numba import njit +from numba.openmp import njit from numba.openmp import openmp_context as openmp -from numba.openmp import omp_set_num_threads, omp_get_thread_num, omp_get_num_threads, omp_get_wtime +from numba.openmp import ( + omp_set_num_threads, + omp_get_thread_num, + omp_get_num_threads, + omp_get_wtime, +) import numpy as np + @njit def f1(): num_steps = 100000000 step = 1.0 / num_steps - MAX_THREADS=8 - for j in range(1,MAX_THREADS+1): + MAX_THREADS = 8 + for j in range(1, MAX_THREADS + 1): tsum = np.zeros(j) omp_set_num_threads(j) @@ -25,15 +31,16 @@ def f1(): x = (i + 0.5) * step local_sum += 4.0 / (1.0 + x * x) -# print("foo:", j, tid, local_sum) + # print("foo:", j, tid, local_sum) tsum[tid] = local_sum -# print("tsum:", tsum) + # print("tsum:", tsum) full_sum = np.sum(tsum) pi = step * full_sum runtime = omp_get_wtime() - start_time print("pi = ", pi, "runtime = ", runtime, j) + f1() print("DONE") diff --git a/buildscripts/containers/examples/pi_task.py b/buildscripts/containers/examples/pi_task.py index fe793c25fc5a..e8f1edb58c10 100644 --- a/buildscripts/containers/examples/pi_task.py +++ b/buildscripts/containers/examples/pi_task.py @@ -1,12 +1,18 @@ -from numba import njit +from numba.openmp import njit from numba.openmp import openmp_context as openmp -from numba.openmp import omp_set_num_threads, omp_get_thread_num, omp_get_num_threads, omp_get_wtime +from numba.openmp import ( + omp_set_num_threads, + omp_get_thread_num, + omp_get_num_threads, + omp_get_wtime, +) import numpy as np + @njit def pi_comp(Nstart, Nfinish, step): MIN_BLK = 256 - #MIN_BLK = 1024*1024*256 + # MIN_BLK = 1024*1024*256 pi_sum = 0.0 if Nfinish - Nstart < MIN_BLK: for i in range(Nstart, Nfinish): @@ -16,25 +22,26 @@ def pi_comp(Nstart, 
Nfinish, step): iblk = Nfinish - Nstart pi_sum1 = 0.0 pi_sum2 = 0.0 - cut = Nfinish-(iblk // 2) + cut = Nfinish - (iblk // 2) with openmp("task shared(pi_sum1)"): pi_sum1 = pi_comp(Nstart, cut, step) with openmp("task shared(pi_sum2)"): pi_sum2 = pi_comp(cut, Nfinish, step) with openmp("taskwait"): pi_sum = pi_sum1 + pi_sum2 -# pi_sum1 = pi_comp(Nstart, cut, step) -# pi_sum2 = pi_comp(cut, Nfinish, step) -# pi_sum = pi_sum1 + pi_sum2 - return pi_sum + # pi_sum1 = pi_comp(Nstart, cut, step) + # pi_sum2 = pi_comp(cut, Nfinish, step) + # pi_sum = pi_sum1 + pi_sum2 + return pi_sum + @njit def f1(lb, num_steps): step = 1.0 / num_steps - MAX_THREADS=4 + MAX_THREADS = 4 tsum = np.zeros(MAX_THREADS) - for j in range(1,MAX_THREADS+1): + for j in range(1, MAX_THREADS + 1): omp_set_num_threads(j) full_sum = 0.0 start_time = omp_get_wtime() @@ -48,9 +55,10 @@ def f1(lb, num_steps): runtime = omp_get_wtime() - start_time print("pi = ", pi, "runtime = ", runtime, j) + lb = 0 num_steps = 1024 -#num_steps = 1024*1024*1024 -#num_steps = 1000000000 +# num_steps = 1024*1024*1024 +# num_steps = 1000000000 f1(lb, num_steps) print("DONE") diff --git a/docs/source/usage.rst b/docs/source/usage.rst index c4fc6cd353ab..e1f289088def 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -15,7 +15,7 @@ Diving right in, this is a minimal, parallel `hello world` example: .. code-block:: python :linenos: - from numba import njit + from numba.openmp import njit from numba.openmp import openmp_context as openmp from numba.openmp import omp_get_thread_num @@ -69,7 +69,7 @@ thread-blocks on the GPU device: .. code-block:: python :linenos: - from numba import njit + from numba.openmp import njit from numba.openmp import openmp_context as openmp from numba.openmp import omp_get_thread_num import numpy as np diff --git a/examples/TestDataEnv.py b/examples/TestDataEnv.py index 7c05e7d43ecb..311e596bac3b 100755 --- a/examples/TestDataEnv.py +++ b/examples/TestDataEnv.py @@ -1,81 +1,91 @@ # # Test individual constructs from OpenMP # -from numba import njit +from numba.openmp import njit import numpy as np from numba.openmp import openmp_context as openmp -from numba.openmp import omp_get_wtime, omp_get_thread_num, omp_get_num_threads,omp_set_num_threads +from numba.openmp import ( + omp_get_wtime, + omp_get_thread_num, + omp_get_num_threads, + omp_set_num_threads, +) + ############################################################################## @njit def testOMP(): - x = 5 - y = 3 - zfp = 2 - zsh = 7 + x = 5 + y = 3 + zfp = 2 + zsh = 7 nerr = 0 nsing = 0 NTHREADS = 4 numthrds = 0 omp_set_num_threads(NTHREADS) - vals = np.zeros(NTHREADS) - valsfp = np.zeros(NTHREADS) + vals = np.zeros(NTHREADS) + valsfp = np.zeros(NTHREADS) - with openmp ("parallel private(x) shared(zsh) firstprivate(zfp) private(ID)"): - ID = omp_get_thread_num() - with openmp("single"): - nsing = nsing+1 + with openmp("parallel private(x) shared(zsh) firstprivate(zfp) private(ID)"): + ID = omp_get_thread_num() + with openmp("single"): + nsing = nsing + 1 numthrds = omp_get_num_threads() - if (y != 3): - nerr = nerr+1 - print("Shared Default status failure y = ",y," It should equal 3"); - with openmp("single"): - if(x == 5): - pass -# nerr = nerr+1 -# print("Private clause failed, variable x = original variable ",x," it should be undefined") + if y != 3: + nerr = nerr + 1 + print("Shared Default status failure y = ", y, " It should equal 3") + with openmp("single"): + if x == 5: + pass + # nerr = nerr+1 + # print("Private clause failed, variable x = 
original variable ",x," it should be undefined") - # verify each thread sees the same variable vsh - with openmp("critical"): - zsh = zsh+ID + # verify each thread sees the same variable vsh + with openmp("critical"): + zsh = zsh + ID - # test first private - zfp = zfp+ID - valsfp[ID] = zfp + # test first private + zfp = zfp + ID + valsfp[ID] = zfp - # setup test to see if each thread got it's own x value - x = ID - vals[ID] = x + # setup test to see if each thread got it's own x value + x = ID + vals[ID] = x -# Shared clause test: assumes zsh starts at 7 and we add up IDs from 4 threads - if(zsh != 13): - print("Shared clause or critical failed",zsh) - nerr = nerr+1 + # Shared clause test: assumes zsh starts at 7 and we add up IDs from 4 threads + if zsh != 13: + print("Shared clause or critical failed", zsh) + nerr = nerr + 1 -# Single Test: How many threads updated nsing? - if(nsing!=1): - print(" Single test failed",nsing) - nerr = nerr+1 + # Single Test: How many threads updated nsing? + if nsing != 1: + print(" Single test failed", nsing) + nerr = nerr + 1 -# Private clause test: did each thread get its own x variable? + # Private clause test: did each thread get its own x variable? for i in range(numthrds): - if(int(vals[i]) != i): - print("Private clause failed",numthrds,i,vals[i]) - nerr = nerr+1 + if int(vals[i]) != i: + print("Private clause failed", numthrds, i, vals[i]) + nerr = nerr + 1 -# First private clause test: each thread should get 2 + ID for up to 4 threads + # First private clause test: each thread should get 2 + ID for up to 4 threads for i in range(numthrds): - if(int(valsfp[i]) != 2+i): - print("Firstprivate clause failed",numthrds,i,valsfp[i]) - nerr = nerr+1 + if int(valsfp[i]) != 2 + i: + print("Firstprivate clause failed", numthrds, i, valsfp[i]) + nerr = nerr + 1 + + # Test number of threads + if numthrds > NTHREADS: + print("Number of threads error: too many threads", numthrds, NTHREADS) + nerr = nerr + 1 -# Test number of threads - if(numthrds > NTHREADS): - print("Number of threads error: too many threads",numthrds,NTHREADS) - nerr = nerr+1 - - print(nerr," errors when testing parallel, private, shared, firstprivate, critical and single") + print( + nerr, + " errors when testing parallel, private, shared, firstprivate, critical and single", + ) return nerr + errors = testOMP() diff --git a/examples/dgemm_ompGPU.py b/examples/dgemm_ompGPU.py index 20a30590e41c..c3e8131a58b5 100644 --- a/examples/dgemm_ompGPU.py +++ b/examples/dgemm_ompGPU.py @@ -30,7 +30,7 @@ # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. -#******************************************************************* +# ******************************************************************* # # NAME: dgemm # @@ -38,12 +38,12 @@ # dense multiplication is carried out # # USAGE: The program takes as input the matrix order, -# the number of times the matrix-matrix multiplication +# the number of times the matrix-matrix multiplication # is carried out. # # <# iterations> # -# The output consists of diagnostics to make sure the +# The output consists of diagnostics to make sure the # algorithm worked, and of timing statistics. # # HISTORY: Written by Rob Van der Wijngaart, February 2009. 
@@ -52,75 +52,89 @@ # ******************************************************************* import sys -from numba import njit +from numba.openmp import njit from numba.openmp import openmp_context as openmp -from numba.openmp import omp_set_num_threads, omp_get_thread_num, omp_get_num_threads, omp_get_wtime +from numba.openmp import ( + omp_set_num_threads, + omp_get_thread_num, + omp_get_num_threads, + omp_get_wtime, +) import numpy as np -#from time import process_time as timer +# from time import process_time as timer + -#@njit(enable_ssa=False, cache=True) What does "enable_ssa" mean? +# @njit(enable_ssa=False, cache=True) What does "enable_ssa" mean? @njit(fastmath=True) -def dgemm(iters,order): +def dgemm(iters, order): # ******************************************************************** # ** Allocate space for the input and transpose matrix # ******************************************************************** - print('inside dgemm') - A = np.zeros((order,order)) - B = np.zeros((order,order)) - C = np.zeros((order,order)) + print("inside dgemm") + A = np.zeros((order, order)) + B = np.zeros((order, order)) + C = np.zeros((order, order)) -# It can be very important to initialize data with the same threads -# as you will use when computing. + # It can be very important to initialize data with the same threads + # as you will use when computing. with openmp("parallel for schedule(static)"): - for i in range(order): - A[:,i] = float(i) - B[:,i] = float(i) - -# print(omp_get_num_threads()) - for kiter in range(0,iters+1): - if kiter==1: - t0 = omp_get_wtime() - tSum=0.0 - tsqSum=0.0 - with openmp("target teams distribute parallel for private(j,k)"): - for i in range(order): - for k in range(order): - for j in range(order): - C[i][j] += A[i][k] * B[k][j] - if kiter>0: - tkiter = omp_get_wtime() - t = tkiter - t0 - tSum = tSum + t - tsqSum = tsqSum+t*t - t0 = tkiter - - dgemmAve = tSum/iters - dgemmStdDev = ((tsqSum-iters*dgemmAve*dgemmAve)/(iters-1))**0.5 - print('finished with computations') + for i in range(order): + A[:, i] = float(i) + B[:, i] = float(i) + + # print(omp_get_num_threads()) + for kiter in range(0, iters + 1): + if kiter == 1: + t0 = omp_get_wtime() + tSum = 0.0 + tsqSum = 0.0 + with openmp("target teams distribute parallel for private(j,k)"): + for i in range(order): + for k in range(order): + for j in range(order): + C[i][j] += A[i][k] * B[k][j] + if kiter > 0: + tkiter = omp_get_wtime() + t = tkiter - t0 + tSum = tSum + t + tsqSum = tsqSum + t * t + t0 = tkiter + + dgemmAve = tSum / iters + dgemmStdDev = ((tsqSum - iters * dgemmAve * dgemmAve) / (iters - 1)) ** 0.5 + print("finished with computations") # ******************************************************************** # ** Analyze and output results. 
# ******************************************************************** - checksum = 0.0; + checksum = 0.0 for i in range(order): for j in range(order): - checksum += C[i][j]; - - ref_checksum = order*order*order - ref_checksum *= 0.25*(order-1.0)*(order-1.0) - ref_checksum *= (iters+1) - epsilon=1.e-8 - if abs((checksum - ref_checksum)/ref_checksum) < epsilon: - print('Solution validates') - nflops = 2.0*order*order*order - recipDiff = (1.0/(dgemmAve-dgemmStdDev) - 1.0/(dgemmAve+dgemmStdDev)) - GfStdDev = 1.e-6*nflops*recipDiff/2.0 - print('nflops: ',nflops) - print('Rate: ',1.e-6*nflops/dgemmAve,' +/- (MF/s): ',GfStdDev) + checksum += C[i][j] + + ref_checksum = order * order * order + ref_checksum *= 0.25 * (order - 1.0) * (order - 1.0) + ref_checksum *= iters + 1 + epsilon = 1.0e-8 + if abs((checksum - ref_checksum) / ref_checksum) < epsilon: + print("Solution validates") + nflops = 2.0 * order * order * order + recipDiff = 1.0 / (dgemmAve - dgemmStdDev) - 1.0 / (dgemmAve + dgemmStdDev) + GfStdDev = 1.0e-6 * nflops * recipDiff / 2.0 + print("nflops: ", nflops) + print("Rate: ", 1.0e-6 * nflops / dgemmAve, " +/- (MF/s): ", GfStdDev) else: - print('ERROR: Checksum = ', checksum,', Reference checksum = ', ref_checksum,'\n') + print( + "ERROR: Checksum = ", + checksum, + ", Reference checksum = ", + ref_checksum, + "\n", + ) + + # sys.exit("ERROR: solution did not validate") @@ -128,23 +142,22 @@ def dgemm(iters,order): # read and test input parameters # ******************************************************************** -print('Parallel Research Kernels version ') #, PRKVERSION -print('Python Dense matrix-matrix multiplication: C = A x B') +print("Parallel Research Kernels version ") # , PRKVERSION +print("Python Dense matrix-matrix multiplication: C = A x B") if len(sys.argv) != 3: - print('argument count = ', len(sys.argv)) - sys.exit("Usage: ./dgemm <# iterations> ") + print("argument count = ", len(sys.argv)) + sys.exit("Usage: ./dgemm <# iterations> ") itersIn = int(sys.argv[1]) if itersIn < 1: - sys.exit("ERROR: iterations must be >= 1") + sys.exit("ERROR: iterations must be >= 1") orderIn = int(sys.argv[2]) if orderIn < 1: sys.exit("ERROR: order must be >= 1") -print('Number of iterations = ', itersIn) -print('Matrix order = ', orderIn) +print("Number of iterations = ", itersIn) +print("Matrix order = ", orderIn) dgemm(itersIn, orderIn) - diff --git a/examples/piParLoopGPU_BUD.py b/examples/piParLoopGPU_BUD.py index 0fc5f950b243..9c01d81c1283 100644 --- a/examples/piParLoopGPU_BUD.py +++ b/examples/piParLoopGPU_BUD.py @@ -1,23 +1,25 @@ -from numba import njit +from numba.openmp import njit from numba.openmp import openmp_context as openmp from numba.openmp import omp_get_num_threads from numba.openmp import omp_get_thread_num + @njit def piFunc(NumSteps): - step = 1.0/NumSteps - sum = 0.0 - start_time = omp_get_wtime() + step = 1.0 / NumSteps + sum = 0.0 + start_time = omp_get_wtime() with openmp("target"): - with openmp("teams distribute parallel for private(x) reduction(+:sum)"): - for i in range(NumSteps): - x = (i+0.5)*step - sum += 4.0/(1.0 + x*x) + with openmp("teams distribute parallel for private(x) reduction(+:sum)"): + for i in range(NumSteps): + x = (i + 0.5) * step + sum += 4.0 / (1.0 + x * x) pi = step * sum runtime = omp_get_wtime() - start_time print("pi = ", pi, "runtime = ", runtime) return pi + pi = piFunc(10000000) print(pi) diff --git a/examples/piParLoopGPU_BUD_combined.py b/examples/piParLoopGPU_BUD_combined.py index a4ab403295bc..399f89fd3c49 100644 --- 
a/examples/piParLoopGPU_BUD_combined.py +++ b/examples/piParLoopGPU_BUD_combined.py @@ -1,21 +1,23 @@ -from numba import njit +from numba.openmp import njit from numba.openmp import openmp_context as openmp from numba.openmp import omp_get_wtime + @njit def piFunc(NumSteps): - step = 1.0/NumSteps - sum = 0.0 - start_time = omp_get_wtime() + step = 1.0 / NumSteps + sum = 0.0 + start_time = omp_get_wtime() with openmp("target teams distribute parallel for private(x) reduction(+:sum)"): - for i in range(NumSteps): - x = (i+0.5)*step - sum += 4.0/(1.0 + x*x) + for i in range(NumSteps): + x = (i + 0.5) * step + sum += 4.0 / (1.0 + x * x) pi = step * sum runtime = omp_get_wtime() - start_time print("pi = ", pi, "runtime = ", runtime) return pi + pi = piFunc(10000000) print(pi) diff --git a/examples/piParLoopGPU_loop.py b/examples/piParLoopGPU_loop.py index 68ba5c939b2f..d74219eb221a 100644 --- a/examples/piParLoopGPU_loop.py +++ b/examples/piParLoopGPU_loop.py @@ -1,23 +1,25 @@ -from numba import njit +from numba.openmp import njit from numba.openmp import openmp_context as openmp from numba.openmp import omp_get_num_threads from numba.openmp import omp_get_thread_num + @njit def piFunc(NumSteps): - step = 1.0/NumSteps - sum = 0.0 + step = 1.0 / NumSteps + sum = 0.0 start_time = omp_get_wtime() with openmp("target "): - with openmp("loop private(x) reduction(+:sum)"): - for i in range(NumSteps): - x = (i+0.5)*step - sum += 4.0/(1.0 + x*x) + with openmp("loop private(x) reduction(+:sum)"): + for i in range(NumSteps): + x = (i + 0.5) * step + sum += 4.0 / (1.0 + x * x) pi = step * sum runtime = omp_get_wtime() - start_time print("pi = ", pi, "runtime = ", runtime) return pi + pi = piFunc(10000000) print(pi) diff --git a/examples/pi_loop.py b/examples/pi_loop.py index e1168f0670b6..5da5025eb69c 100644 --- a/examples/pi_loop.py +++ b/examples/pi_loop.py @@ -1,6 +1,7 @@ -from numba import njit +from numba.openmp import njit from numba.openmp import openmp_context as openmp + @njit def calc_pi(): num_steps = 100000 @@ -9,10 +10,11 @@ def calc_pi(): the_sum = 0.0 with openmp("parallel for reduction(+:the_sum) schedule(static)"): for j in range(num_steps): - x = ((j-1) - 0.5) * step + x = ((j - 1) - 0.5) * step the_sum += 4.0 / (1.0 + x * x) pi = step * the_sum return pi + print("pi =", calc_pi()) diff --git a/examples/pi_spmd.py b/examples/pi_spmd.py index 38e582b81967..9a6775c33cdc 100644 --- a/examples/pi_spmd.py +++ b/examples/pi_spmd.py @@ -1,14 +1,20 @@ -from numba import njit +from numba.openmp import njit from numba.openmp import openmp_context as openmp -from numba.openmp import omp_set_num_threads, omp_get_thread_num, omp_get_num_threads, omp_get_wtime +from numba.openmp import ( + omp_set_num_threads, + omp_get_thread_num, + omp_get_num_threads, + omp_get_wtime, +) import numpy as np + @njit def f1(): num_steps = 100000000 step = 1.0 / num_steps - MAX_THREADS=8 - for j in range(1,MAX_THREADS+1): + MAX_THREADS = 8 + for j in range(1, MAX_THREADS + 1): tsum = np.zeros(j) omp_set_num_threads(j) @@ -25,15 +31,16 @@ def f1(): x = (i + 0.5) * step local_sum += 4.0 / (1.0 + x * x) -# print("foo:", j, tid, local_sum) + # print("foo:", j, tid, local_sum) tsum[tid] = local_sum -# print("tsum:", tsum) + # print("tsum:", tsum) full_sum = np.sum(tsum) pi = step * full_sum runtime = omp_get_wtime() - start_time print("pi = ", pi, "runtime = ", runtime, j) + f1() print("DONE") diff --git a/examples/pi_task.py b/examples/pi_task.py index b6470935cd4b..e8f1edb58c10 100644 --- a/examples/pi_task.py +++ 
b/examples/pi_task.py @@ -1,12 +1,18 @@ -from numba import njit +from numba.openmp import njit from numba.openmp import openmp_context as openmp -from numba.openmp import omp_set_num_threads, omp_get_thread_num, omp_get_num_threads, omp_get_wtime +from numba.openmp import ( + omp_set_num_threads, + omp_get_thread_num, + omp_get_num_threads, + omp_get_wtime, +) import numpy as np + @njit def pi_comp(Nstart, Nfinish, step): MIN_BLK = 256 - #MIN_BLK = 1024*1024*256 + # MIN_BLK = 1024*1024*256 pi_sum = 0.0 if Nfinish - Nstart < MIN_BLK: for i in range(Nstart, Nfinish): @@ -16,25 +22,26 @@ def pi_comp(Nstart, Nfinish, step): iblk = Nfinish - Nstart pi_sum1 = 0.0 pi_sum2 = 0.0 - cut = Nfinish-(iblk // 2) + cut = Nfinish - (iblk // 2) with openmp("task shared(pi_sum1)"): pi_sum1 = pi_comp(Nstart, cut, step) with openmp("task shared(pi_sum2)"): pi_sum2 = pi_comp(cut, Nfinish, step) with openmp("taskwait"): pi_sum = pi_sum1 + pi_sum2 -# pi_sum1 = pi_comp(Nstart, cut, step) -# pi_sum2 = pi_comp(cut, Nfinish, step) -# pi_sum = pi_sum1 + pi_sum2 + # pi_sum1 = pi_comp(Nstart, cut, step) + # pi_sum2 = pi_comp(cut, Nfinish, step) + # pi_sum = pi_sum1 + pi_sum2 return pi_sum + @njit def f1(lb, num_steps): step = 1.0 / num_steps - MAX_THREADS=4 + MAX_THREADS = 4 tsum = np.zeros(MAX_THREADS) - for j in range(1,MAX_THREADS+1): + for j in range(1, MAX_THREADS + 1): omp_set_num_threads(j) full_sum = 0.0 start_time = omp_get_wtime() @@ -48,9 +55,10 @@ def f1(lb, num_steps): runtime = omp_get_wtime() - start_time print("pi = ", pi, "runtime = ", runtime, j) + lb = 0 num_steps = 1024 -#num_steps = 1024*1024*1024 -#num_steps = 1000000000 +# num_steps = 1024*1024*1024 +# num_steps = 1000000000 f1(lb, num_steps) print("DONE") From 1548c4ba76ae559fdadb50d1d4414bf86e1ae7f1 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Fri, 25 Apr 2025 06:42:46 -0700 Subject: [PATCH 27/28] Update README and RTD conf.py --- README.md | 116 ++++++++++++++++++++++++++++---------------- docs/source/conf.py | 41 ++++++++++------ 2 files changed, 100 insertions(+), 57 deletions(-) diff --git a/README.md b/README.md index 64dc24bfbb91..68e3ebb650f6 100644 --- a/README.md +++ b/README.md @@ -3,64 +3,66 @@ [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Python-for-HPC/binder/HEAD) # PyOMP -OpenMP for Python in Numba for CPU/GPU parallel programming. +OpenMP for Python CPU/GPU parallel programming, powered by Numba. -Currently, PyOMP is distributed as a full version of Numba which is based on a -Numba version a few versions behind mainline. -Since Numba is available for every combination of the past few Python versions -and the past few NumPy versions and various operating systems and architectures, -there is quite an extensive build infrastructure required to get all these -combinations and recently we have sorted out some of these combinations. -The architecture and operating system combinations that currently work are: -linux-64 (x86_64), osx-arm64 (mac), linux-arm64, and linux-ppc64le. -These distributions are available with the `conda` command in the next section. +PyOMP provides a familiar interface for CPU/GPU programming using OpenMP +abstractions adapted for Python. +Besides effortless programmability, PyOMP generates fast code using Numba's JIT +compiler based on LLVM, which is competitive with equivalent C/C++ implementations. -Due to PyOMP using the LLVM OpenMP infrastructure, we also inherit its -limitations which means that GPU support is only available on Linux. 
+PyOMP is developed and distributed as an *extension* to Numba, so it uses +Numba as a dependency. +It is currently tested with Numba versions 0.57.x, 0.58.x on the following +architecture and operating system combinations: linux-64 (x86_64), osx-arm64 +(mac), linux-arm64, and linux-ppc64le. +Installation is possible through `conda`, detailed in the next section. -In the future, we plan on converting PyOMP to a Numba extension which should eliminate the Python and NumPy versioning issues. +As PyOMP builds on to of the LLVM OpenMP infrastructure, it also inherits its +limitations: GPU support is only available on Linux. +Also, PyOMP currently supports only NVIDIA GPUs with AMD GPU support planned for. ## Installation ### Conda -PyOMP is distributed as a package through Conda, currently supporting linux-64 -(x86_64), osx-arm64 (mac), linux-arm64, and linux-ppc64le architectures. +PyOMP is distributed through Conda, easily installable using the following command: ```bash conda install -c python-for-hpc -c conda-forge pyomp ``` +Besides a standard installation, we also provide the following options to +quickly try out PyOMP online or through a container. -## Trying it out +### Trying it out -### Binder +#### Binder You can try it out for free on a multi-core CPU in JupyterLab at the following link: https://mybinder.org/v2/gh/Python-for-HPC/binder/HEAD -### Docker +#### Docker We also provide pre-built containers for arm64 and amd64 architectures with PyOMP and Jupyter pre-installed. The following show how to access the container through the terminal or using -jupyter. +Jupyter. First pull the container -``` +```bash docker pull ghcr.io/python-for-hpc/pyomp:latest ``` To use the terminal, run a shell on the container -``` +```bash docker run -it ghcr.io/python-for-hpc/pyomp:latest /bin/bash ``` To use Jupyter, run without arguments and forward port 8888. -``` +```bash docker run -it -p 8888:8888 ghcr.io/python-for-hpc/pyomp:latest ``` Jupyter will start as a service on localhost with token authentication by default. Grep the url with the token from the output and copy it to the browser. -``` +```bash ... [I 2024-09-15 17:24:47.912 ServerApp] http://127.0.0.1:8888/tree?token= ... @@ -68,40 +70,72 @@ Grep the url with the token from the output and copy it to the browser. ## Usage -Import Numba and add the `@njit` decorator to the function in which you want to use OpenMP. -Add `with` contexts for each OpenMP region you want to have, importing the -context `openmp_context` from the `numba.openmp` module. +From `numba.openmp` import the `@njit` decorator and the `openmp_context` to +create OpenMP regions using `with` contexts. +Decorate with `njit` the function you want to parallelize with OpenMP and +describe parallelism in OpenMP directives using `with` contexts. +Enjoy the simplicity of OpenMP with Python syntax and parallel performance. For a list of supported OpenMP directives and more detailed information, check out the [Documentation](https://pyomp.readthedocs.io). -PyOMP supports both CPU and GPU programming for NVIDIA GPUs through the `target` + +PyOMP supports both CPU and GPU programming implementing OpenMP's `target` directive for offloading. -For GPU programming, PyOMP supports the `device` clause and by convention the -default without using the clause or providing `device(0)` always refers to the -accelerator GPU device. -It is also possible to use the host as a multi-core CPU target device setting `device(1)`. 
+For GPU programming, PyOMP supports the `device` clause, with `device(0)` by +convention offloading to a GPU device. +It is also possible to use the host as a multi-core CPU target device (mainly +for testing purposes) by setting `device(1)`. ### Example -This is an example of calculating $\pi$ with PyOMP with a `parallel for` loop. +This is an example of calculating $\pi$ with PyOMP with a `parallel for` loop +using CPU parallelism: ```python -from numba import njit +from numba.openmp import njit from numba.openmp import openmp_context as openmp @njit -def calc_pi(): - num_steps = 100000 +def calc_pi(num_steps): step = 1.0 / num_steps - - the_sum = 0.0 - with openmp("parallel for reduction(+:the_sum) schedule(static)"): + red_sum = 0.0 + with openmp("parallel for reduction(+:red_sum) schedule(static)"): for j in range(num_steps): x = ((j-1) - 0.5) * step - the_sum += 4.0 / (1.0 + x * x) + red_sum += 4.0 / (1.0 + x * x) - pi = step * the_sum + pi = step * red_sum return pi -print("pi =", calc_pi()) +print("pi =", calc_pi(1000000)) ``` + +and this is the same example using GPU offloading: + +```python +from numba.openmp import njit +from numba.openmp import openmp_context as openmp +from numba.openmp import omp_get_thread_num + +@njit +def calc_pi(num_steps): + step = 1.0/num_steps + red_sum = 0.0 + with openmp("target map(tofrom: red_sum)"): + with openmp("loop private(x) reduction(+:red_sum)"): + for i in range(num_steps): + tid = omp_get_thread_num() + x = (i+0.5)*step + red_sum += 4.0 / (1.0 + x*x) + + pi = step * red_sum + print("pi=", pi) + +print("pi =", calc_pi(1000000)) +``` + +## Support + +We welcome any feedback, bug reports, or feature requests. +Please open an [Issue](https://github.com/Python-for-HPC/PyOMP/issues) or post +in [Discussions](https://github.com/Python-for-HPC/PyOMP/discussions). \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index 7a30782a0ade..2750a959b4cf 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,35 +1,44 @@ # Configuration file for the Sphinx documentation builder. 
+import subprocess + # -- Project information -project = 'PyOMP' -copyright = '2024, PyOMP developers' -author = 'Giorgis Georgakoudis' +project = "PyOMP" +copyright = "2024, PyOMP developers" +author = "Giorgis Georgakoudis" -release = '0.1' -version = '0.1.0' +try: + release = ( + subprocess.check_output(["git", "describe", "--tags", "--abbrev=0"]) + .strip() + .decode() + ) +except subprocess.CalledProcessError: + release = "latest" +version = release # -- General configuration extensions = [ - 'sphinx.ext.duration', - 'sphinx.ext.doctest', - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.intersphinx', + "sphinx.ext.duration", + "sphinx.ext.doctest", + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.intersphinx", ] intersphinx_mapping = { - 'python': ('https://docs.python.org/3/', None), - 'sphinx': ('https://www.sphinx-doc.org/en/master/', None), + "python": ("https://docs.python.org/3/", None), + "sphinx": ("https://www.sphinx-doc.org/en/master/", None), } -intersphinx_disabled_domains = ['std'] +intersphinx_disabled_domains = ["std"] -templates_path = ['_templates'] +templates_path = ["_templates"] # -- Options for HTML output -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" # -- Options for EPUB output -epub_show_urls = 'footnote' \ No newline at end of file +epub_show_urls = "footnote" From 8cb57b818af0fef25309d4702ef4061b94d5f3cc Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Fri, 25 Apr 2025 07:09:46 -0700 Subject: [PATCH 28/28] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 68e3ebb650f6..eb8853f6ab09 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ architecture and operating system combinations: linux-64 (x86_64), osx-arm64 (mac), linux-arm64, and linux-ppc64le. Installation is possible through `conda`, detailed in the next section. -As PyOMP builds on to of the LLVM OpenMP infrastructure, it also inherits its +As PyOMP builds on top of the LLVM OpenMP infrastructure, it also inherits its limitations: GPU support is only available on Linux. Also, PyOMP currently supports only NVIDIA GPUs with AMD GPU support planned for.