From 67a27443f5ae8c469b8304c76f2a922e316b8703 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Tue, 22 Apr 2025 03:38:55 -0700 Subject: [PATCH 01/28] Add python packaging files --- pyproject.toml | 31 ++++++++++++++++++ setup.py | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 pyproject.toml create mode 100644 setup.py diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000000..46b76e28f22c --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,31 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel", "numba>=0.57, <0.58", "cmake>=3.20"] +build-backend = "setuptools.build_meta" + +[project] +name = "pyomp" +version = "0.2.0" +description = "Python OpenMP library based on Numba" +readme = "README.md" +requires-python = ">=3.8, <=3.12" +license = { text = "BSD 2-Clause License" } +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Topic :: Software Development :: Compilers", +] +dependencies = ["numba>=0.57, <0.58", "lark", "cffi"] + +[project.urls] +Homepage = "https://github.com/Python-for-HPC/PyOMP" +Issues = "https://github.com/Python-for-HPC/PyOMP/issues" + +[tool.setuptools.packages.find] +where = ["."] +include = ["numba.openmp"] + +[tool.setuptools.package-data] +"numba.openmp" = ["libs/*"] diff --git a/setup.py b/setup.py new file mode 100644 index 000000000000..487343fcc112 --- /dev/null +++ b/setup.py @@ -0,0 +1,85 @@ +# setup.py +import os +import numba +import sysconfig +import numpy as np +import subprocess +from pathlib import Path +from setuptools import setup, Extension +from setuptools.command.build_ext import build_ext +from setuptools.command.build_clib import build_clib + + +numba_dir = os.path.dirname(numba.__file__) +bundle_lib = ( + "bundle", + { + "sources": [ + "numba/openmp/nrt/init.c", + f"{numba_dir}/_helpermod.c", + f"{numba_dir}/cext/utils.c", + f"{numba_dir}/cext/dictobject.c", + f"{numba_dir}/cext/listobject.c", + f"{numba_dir}/core/runtime/_nrt_pythonmod.c", + f"{numba_dir}/core/runtime/nrt.cpp", + ], + "include_dirs": [ + sysconfig.get_paths()["include"], + np.get_include(), + ], + }, +) + + +class BuildStaticBundle(build_clib): + def finalize_options(self): + super().finalize_options() + self.build_temp = (Path("numba/openmp/nrt") / self.build_temp).absolute() + self.build_temp.mkdir(parents=True, exist_ok=True) + self.build_temp = str(self.build_temp) + self.build_clib = str(Path("numba/openmp/libs").absolute()) + + +class CMakeExtension(Extension): + def __init__(self, name, sourcedir): + # don't invoke the original build_ext for this special extension + super().__init__(name, sources=[]) + self.sourcedir = sourcedir + + +class BuildPass(build_ext): + def run(self): + for ext in self.extensions: + if isinstance(ext, CMakeExtension): + self.build_cmake(ext) + return + super().run() + + def build_cmake(self, ext): + build_dir = (Path(ext.sourcedir) / self.build_temp).absolute() + subprocess.run( + [ + "cmake", + "-S", + ext.sourcedir, + "-B", + build_dir, + "--install-prefix", + Path("numba/openmp/libs").absolute(), + "-DCMAKE_BUILD_TYPE=Release", + ], + check=True, + ) + + subprocess.run(["cmake", "--build", build_dir, "-j"], check=True) + subprocess.run( + ["cmake", "--install", build_dir], + check=True, + ) + + +setup( + libraries=[bundle_lib], + ext_modules=[CMakeExtension("libIntrinsicsOpenMP", 
"numba/openmp/pass")], + cmdclass={"build_clib": BuildStaticBundle, "build_ext": BuildPass}, +) From 45e1f358eee7068ff99396411e344c23c11271e8 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Tue, 22 Apr 2025 03:40:57 -0700 Subject: [PATCH 02/28] Add pyomp numba extension files --- numba/openmp/__init__.py | 7778 +++++++++++++++++++++ numba/openmp/nrt/init.c | 3 + numba/openmp/pass/CGIntrinsicsOpenMP.cpp | 3102 ++++++++ numba/openmp/pass/CGIntrinsicsOpenMP.h | 527 ++ numba/openmp/pass/CMakeLists.txt | 40 + numba/openmp/pass/DebugOpenMP.cpp | 16 + numba/openmp/pass/DebugOpenMP.h | 28 + numba/openmp/pass/IntrinsicsOpenMP.cpp | 732 ++ numba/openmp/pass/IntrinsicsOpenMP.h | 14 + numba/openmp/pass/IntrinsicsOpenMP_CAPI.h | 23 + numba/openmp/tests/test_openmp.py | 4959 +++++++++++++ 11 files changed, 17222 insertions(+) create mode 100644 numba/openmp/__init__.py create mode 100644 numba/openmp/nrt/init.c create mode 100644 numba/openmp/pass/CGIntrinsicsOpenMP.cpp create mode 100644 numba/openmp/pass/CGIntrinsicsOpenMP.h create mode 100644 numba/openmp/pass/CMakeLists.txt create mode 100644 numba/openmp/pass/DebugOpenMP.cpp create mode 100644 numba/openmp/pass/DebugOpenMP.h create mode 100644 numba/openmp/pass/IntrinsicsOpenMP.cpp create mode 100644 numba/openmp/pass/IntrinsicsOpenMP.h create mode 100644 numba/openmp/pass/IntrinsicsOpenMP_CAPI.h create mode 100644 numba/openmp/tests/test_openmp.py diff --git a/numba/openmp/__init__.py b/numba/openmp/__init__.py new file mode 100644 index 000000000000..52e31d4a45c4 --- /dev/null +++ b/numba/openmp/__init__.py @@ -0,0 +1,7778 @@ +import warnings +from numba.core.withcontexts import WithContext, _CallContextType +from lark import Lark, Transformer +from lark.exceptions import VisitError +from numba.core.ir_utils import ( + get_call_table, + dump_blocks, + dprint_func_ir, + replace_vars, + apply_copy_propagate_extensions, + visit_vars_extensions, + remove_dels, + visit_vars_inner, + visit_vars, + get_name_var_table, + replace_var_names, + get_definition, + build_definitions, + dead_code_elimination, + mk_unique_var, + find_topo_order, + flatten_labels, +) +from numba.core.analysis import ( + compute_cfg_from_blocks, + compute_use_defs, + compute_live_map, + _fix_loop_exit, +) +from numba.core import ( + ir, + config, + types, + typeinfer, + cgutils, + compiler, + transforms, + bytecode, + typed_passes, + imputils, + typing, + cpu, + compiler_machinery, +) +from numba.core.compiler_machinery import PassManager +from numba.core.compiler import DefaultPassBuilder +from numba.core.untyped_passes import ( + TranslateByteCode, + FixupArgs, + IRProcessing, + InlineClosureLikes, + RewriteSemanticConstants, + DeadBranchPrune, + GenericRewrites, + RewriteDynamicRaises, + MakeFunctionToJitFunction, + InlineInlinables, + FindLiterallyCalls, + LiteralUnroll, + LiteralPropagationSubPipelinePass, + WithLifting, +) +from numba import np as numba_np +from numba import cuda as numba_cuda +from numba.core.controlflow import CFGraph +from numba.core.ssa import _run_ssa +from numba.extending import overload, intrinsic +from numba.core.callconv import ( + BaseCallConv, + MinimalCallConv, + errcode_t, + RETCODE_OK, + Status, + excinfo_t, + CPUCallConv, +) +from functools import cached_property +from numba.core.datamodel.registry import register_default as model_register +from numba.core.datamodel.registry import default_manager as model_manager +from numba.core.datamodel.models import OpaqueModel +from numba.core.types.functions import Dispatcher, ExternalFunction 
+from numba.core.dispatcher import _FunctionCompiler +from numba.np.ufunc import array_exprs +from cffi import FFI +import llvmlite.binding as ll +import llvmlite.ir as lir +import operator +import sys +import copy +import os +import numpy as np +from numba.core.analysis import ir_extension_usedefs, _use_defs_result +from numba.core.lowering import Lower +from numba.core.codegen import AOTCodeLibrary, JITCodeLibrary +from numba.cuda import descriptor as cuda_descriptor, compiler as cuda_compiler +from numba.cuda.target import CUDACallConv +import subprocess +import tempfile +import types as python_types +import numba +from pathlib import Path + +llvm_binpath = None +llvm_libpath = None +libpath = Path(__file__).parent / "libs" + + +###### START OF NUMBA EXTENSIONS ###### + + +### ir_utils.py +def dump_block(label, block): + print(label, ":") + for stmt in block.body: + print(" ", stmt) + + +### + + +### analysis.py +def filter_nested_loops(cfg, loops): + blocks_in_loop = set() + # get loop bodies + for loop in loops.values(): + insiders = set(loop.body) | set(loop.entries) | set(loop.exits) + insiders.discard(loop.header) + blocks_in_loop |= insiders + # find loop that is not part of other loops + for loop in loops.values(): + if loop.header not in blocks_in_loop: + yield _fix_loop_exit(cfg, loop) + + +### + + +### config.py +def safe_readenv(name, ctor, default): + value = os.environ.get(name, default) + try: + return ctor(value) + except Exception: + warnings.warn( + "environ %s defined but failed to parse '%s'" % (name, value), + RuntimeWarning, + ) + return default + + +DEBUG_OPENMP = safe_readenv("NUMBA_DEBUG_OPENMP", int, 0) +if DEBUG_OPENMP > 0 and config.DEBUG_ARRAY_OPT == 0: + config.DEBUG_ARRAY_OPT = 1 +DEBUG_OPENMP_LLVM_PASS = safe_readenv("NUMBA_DEBUG_OPENMP_LLVM_PASS", int, 0) +OPENMP_DISABLED = safe_readenv("NUMBA_OPENMP_DISABLED", int, 0) +OPENMP_DEVICE_TOOLCHAIN = safe_readenv("NUMBA_OPENMP_DEVICE_TOOLCHAIN", int, 0) +### + + +class LowerNoSROA(Lower): + @property + def _disable_sroa_like_opt(self): + # Always return True for this instance + return True + + def lower_assign_inst(self, orig, inst): + # This fixes assignments for Arg instructions when the target is a + # CPointer. It sets the backing storage to the pointer of the argument + # itself. + if isinstance(self.context, OpenmpCPUTargetContext) or isinstance( + self.context, OpenmpCUDATargetContext + ): + value = inst.value + if isinstance(value, ir.Arg): + argname = value.name + argty = self.typeof("arg." + argname) + if isinstance(argty, types.CPointer): + llty = self.context.get_value_type(argty) + ptr = lir.values.Argument(self.module, llty, "arg." + argname) + self.varmap[value.name] = ptr + return + + return orig(self, inst) + + def lower_return_inst(self, orig, inst): + if isinstance(self.context, OpenmpCUDATargetContext): + # This fixes Return instructions for CUDA device functions in an + # OpenMP target region. It avoids setting a value to the return + # value pointer argument, which otherwise breaks OpenMP code + # generation (looks like an upstream miscompilation) by DCE any + # memory effects (e.g., to other pointer arguments from a tofrom + # mapping.) 
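+            # Only the outlined device entry function (device_func_name) gets
+            # this special-cased return; any other function lowered with this
+            # context falls through to the default return handling below.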
+ if self.fndesc.qualname == self.context.device_func_name: + self.call_conv._return_errcode_raw(self.builder, RETCODE_OK) + return + return orig(self, inst) + + +def run_intrinsics_openmp_pass(ll_module): + libpass = ( + libpath / f"libIntrinsicsOpenMP.{'dylib' if sys.platform == 'darwin' else 'so'}" + ) + + try: + r = subprocess.run( + [ + llvm_binpath + "/opt", + "-f", + f"-load-pass-plugin={libpass}", + "-passes=intrinsics-openmp", + ], + input=ll_module.as_bitcode(), + check=True, + capture_output=True, + ) + except subprocess.CalledProcessError as e: + print("Error running LLVM pass:", e, file=sys.stderr) + print("Command:", e.cmd, file=sys.stderr) + print("Return code:", e.returncode, file=sys.stderr) + print("Output:", e.output.decode(), file=sys.stderr) + print("Error output:", e.stderr.decode(), file=sys.stderr) + raise + + if DEBUG_OPENMP_LLVM_PASS >= 1: + print(r.stderr.decode(), file=sys.stderr) + + bitcode_output = r.stdout + lowered_module = ll.parse_bitcode(bitcode_output) + + return lowered_module + + +class CustomCPUCodeLibrary(JITCodeLibrary): + def add_llvm_module(self, ll_module): + lowered_module = run_intrinsics_openmp_pass(ll_module) + super().add_llvm_module(lowered_module) + + def _finalize_specific(self): + super()._finalize_specific() + ll.ExecutionEngine.run_static_constructors(self._codegen._engine._ee) + + +class CustomAOTCPUCodeLibrary(AOTCodeLibrary): + def add_llvm_module(self, ll_module): + lowered_module = run_intrinsics_openmp_pass(ll_module) + super().add_llvm_module(lowered_module) + + +class CustomFunctionCompiler(_FunctionCompiler): + def _customize_flags(self, flags): + # We need to disable SSA form for OpenMP analysis to detect variables + # used within regions. + flags.enable_ssa = False + return flags + + +class CustomCompiler(compiler.CompilerBase): + @staticmethod + def custom_untyped_pipeline(state, name="untyped-openmp"): + """Returns an untyped part of the nopython OpenMP pipeline""" + pm = PassManager(name) + if state.func_ir is None: + pm.add_pass(TranslateByteCode, "analyzing bytecode") + pm.add_pass(FixupArgs, "fix up args") + pm.add_pass(IRProcessing, "processing IR") + + # inline closures early in case they are using nonlocal's + # see issue #6585. + pm.add_pass(InlineClosureLikes, "inline calls to locally defined closures") + + # pre typing + if not state.flags.no_rewrites: + pm.add_pass(RewriteSemanticConstants, "rewrite semantic constants") + pm.add_pass(DeadBranchPrune, "dead branch pruning") + pm.add_pass(GenericRewrites, "nopython rewrites") + + pm.add_pass(RewriteDynamicRaises, "rewrite dynamic raises") + + # convert any remaining closures into functions + pm.add_pass( + MakeFunctionToJitFunction, "convert make_function into JIT functions" + ) + # inline functions that have been determined as inlinable and rerun + # branch pruning, this needs to be run after closures are inlined as + # the IR repr of a closure masks call sites if an inlinable is called + # inside a closure + pm.add_pass(InlineInlinables, "inline inlinable functions") + if not state.flags.no_rewrites: + pm.add_pass(DeadBranchPrune, "dead branch pruning") + + pm.add_pass(FindLiterallyCalls, "find literally calls") + pm.add_pass(LiteralUnroll, "handles literal_unroll") + + if state.flags.enable_ssa: + assert False, "SSA form is not supported in OpenMP" + + pm.add_pass(LiteralPropagationSubPipelinePass, "Literal propagation") + # Run WithLifting late to for make_implicit_explicit to work. 
TODO: We + # should create a pass that does this instead of replicating and hacking + # the untyped pipeline. This handling may also negatively affect + # optimizations. + pm.add_pass(WithLifting, "Handle with contexts") + + pm.finalize() + return pm + + def define_pipelines(self): + # compose pipeline from untyped, typed and lowering parts + dpb = DefaultPassBuilder + pm = PassManager("omp") + untyped_passes = self.custom_untyped_pipeline(self.state) + pm.passes.extend(untyped_passes.passes) + + typed_passes = dpb.define_typed_pipeline(self.state) + pm.passes.extend(typed_passes.passes) + + lowering_passes = dpb.define_nopython_lowering_pipeline(self.state) + pm.passes.extend(lowering_passes.passes) + + pm.finalize() + return [pm] + + +class CustomContext(cpu.CPUContext): + def post_lowering(self, mod, library): + if hasattr(library, "openmp") and library.openmp: + post_lowering_openmp(mod) + super().post_lowering(mod, library) + + +### decorators + + +def jit(*args, **kws): + """ + Equivalent to jit(nopython=True, nogil=True) + """ + if "nopython" in kws: + warnings.warn("nopython is set for njit and is ignored", RuntimeWarning) + if "forceobj" in kws: + warnings.warn("forceobj is set for njit and is ignored", RuntimeWarning) + del kws["forceobj"] + kws.update({"nopython": True, "nogil": True}) + dispatcher = numba.jit(*args, **kws) + dispatcher._compiler.__class__ = CustomFunctionCompiler + dispatcher._compiler.pipeline_class = CustomCompiler + return dispatcher + + +def njit(*args, **kws): + return jit(*args, **kws) + + +class OpenmpCUDATargetContext(cuda_descriptor.CUDATargetContext): + def __init__(self, name, typingctx, target="cuda"): + super().__init__(typingctx, target) + self.device_func_name = name + + def post_lowering(self, mod, library): + if hasattr(library, "openmp") and library.openmp: + post_lowering_openmp(mod) + super().post_lowering(mod, library) + + @cached_property + def call_conv(self): + return CUDACallConv(self) + + +class OpenmpCPUTargetContext(CustomContext): + def __init__(self, name, typingctx, target="cpu"): + super().__init__(typingctx, target) + self.device_func_name = name + + +##### END OF NUMBA EXTENSIONS ###### + + +###### START OF LLVMLITE EXTENSIONS ###### +def get_decl(alloca): + if not isinstance(alloca, lir.instructions.AllocaInstr): + raise TypeError("Expected AllocaInstr, got %s" % type(alloca)) + return '{0} %"{1}"'.format(alloca.type, alloca._get_name()) + + +# TODO: Upstream to llvmlite, it's part of the langref. +class TokenType(lir.Type): + """ + The type for tokens. From the LLVM Language Reference. + + 'The token type is used when a value is associated with an + instruction but all uses of the value must not attempt to + introspect or obscure it. As such, it is not appropriate + to have a phi or select of type token.' + """ + + def _to_string(self): + return "token" + + def __eq__(self, other): + return isinstance(other, TokenType) + + def __hash__(self): + return hash(TokenType) + + +class CallInstrWithOperandBundle(lir.instructions.CallInstr): + def set_tags(self, tags): + self.tags = tags + + # TODO: This is ugly duplication, we should upstream to llvmlite. 
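+    # self.tags holds the OpenMP clause string produced by
+    # openmp_tag_list_to_str() further down, e.g.
+    # '[ "DIR.OMP.PARALLEL"(), "QUAL.OMP.PRIVATE"(i64* %"x") ]'; descr() below
+    # appends it right after the attribute list so the printed call carries
+    # the clauses in operand-bundle-like syntax.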
+ def descr(self, buf, add_metadata=True): + def descr_arg(i, a): + if i in self.arg_attributes: + attrs = " ".join(self.arg_attributes[i]._to_list()) + " " + else: + attrs = "" + return "{0} {1}{2}".format(a.type, attrs, a.get_reference()) + + args = ", ".join([descr_arg(i, a) for i, a in enumerate(self.args)]) + + fnty = self.callee.function_type + # Only print function type if variable-argument + if fnty.var_arg: + ty = fnty + # Otherwise, just print the return type. + else: + # Fastmath flag work only in this case + ty = fnty.return_type + callee_ref = "{0} {1}".format(ty, self.callee.get_reference()) + if self.cconv: + callee_ref = "{0} {1}".format(self.cconv, callee_ref) + + tail_marker = "" + if self.tail: + tail_marker = "{0} ".format(self.tail) + + buf.append( + "{tail}{op}{fastmath} {callee}({args}){attr}{tags}{meta}\n".format( + tail=tail_marker, + op=self.opname, + fastmath="".join([" " + attr for attr in self.fastmath]), + callee=callee_ref, + args=args, + attr="".join([" " + attr for attr in self.attributes]), + tags=(" " + self.tags if self.tags is not None else ""), + meta=( + self._stringify_metadata(leading_comma=True) if add_metadata else "" + ), + ) + ) + + +###### END OF LLVMLITE EXTENSIONS ###### + + +def _init(): + global llvm_binpath + global llvm_libpath + + sys_platform = sys.platform + + llvm_version = ( + subprocess.check_output(["llvm-config", "--version"]).decode().strip() + ) + if llvm_version != "14.0.6": + raise RuntimeError( + f"Incompatible LLVM version {llvm_version}, PyOMP expects LLVM 14.0.6" + ) + + llvm_binpath = subprocess.check_output(["llvm-config", "--bindir"]).decode().strip() + llvm_libpath = subprocess.check_output(["llvm-config", "--libdir"]).decode().strip() + iomplib = ( + llvm_libpath + "/libomp" + (".dylib" if sys_platform == "darwin" else ".so") + ) + if DEBUG_OPENMP >= 1: + print("Found OpenMP runtime library at", iomplib) + ll.load_library_permanently(iomplib) + + # libomptarget is unavailable on apple, windows, so return. 
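+    # Only the host runtime (libomp) loaded above is usable on those
+    # platforms; device offloading needs libomptarget, which is loaded below
+    # on other platforms.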
+ if sys_platform.startswith("darwin") or sys_platform.startswith("win32"): + return + + omptargetlib = llvm_libpath + "/libomptarget.so" + if DEBUG_OPENMP >= 1: + print("Found OpenMP target runtime library at", omptargetlib) + ll.load_library_permanently(omptargetlib) + + +_init() + + +# ---------------------------------------------------------------------------------------------- + + +class NameSlice: + def __init__(self, name, the_slice): + self.name = name + self.the_slice = the_slice + + def __str__(self): + return "NameSlice(" + str(self.name) + "," + str(self.the_slice) + ")" + + +class StringLiteral: + def __init__(self, x): + self.x = x + + +@intrinsic +def get_itercount(typingctx, it): + if isinstance(it, types.RangeIteratorType): + sig = typing.signature(it.yield_type, it) + + def codegen(context, builder, signature, args): + assert len(args) == 1 + val = args[0] + pair = context.make_helper(builder, it, val) + return builder.load(pair.count) + + return sig, codegen + + +def remove_privatized(x): + if isinstance(x, ir.Var): + x = x.name + + if isinstance(x, str) and x.endswith("%privatized"): + return x[: len(x) - len("%privatized")] + else: + return x + + +def remove_all_privatized(x): + new_x = None + while new_x != x: + new_x = x + x = remove_privatized(new_x) + + return new_x + + +def typemap_lookup(typemap, x): + orig_x = x + if isinstance(x, ir.Var): + x = x.name + + while True: + if x in typemap: + return typemap[x] + new_x = remove_privatized(x) + if new_x == x: + break + else: + x = new_x + + tkeys = typemap.keys() + + # Get basename (without privatized) + x = remove_all_privatized(x) + + potential_keys = list(filter(lambda y: y.startswith(x), tkeys)) + + for pkey in potential_keys: + pkey_base = remove_all_privatized(pkey) + if pkey_base == x: + return typemap[pkey] + + raise KeyError(f"{orig_x} and all of its non-privatized names not found in typemap") + + +class openmp_tag(object): + def __init__(self, name, arg=None, load=False, non_arg=False, omp_slice=None): + self.name = name + self.arg = arg + self.load = load + self.loaded_arg = None + self.xarginfo = [] + self.non_arg = non_arg + self.omp_slice = omp_slice + + def __getstate__(self): + state = self.__dict__.copy() + if isinstance(self.arg, lir.instructions.AllocaInstr): + del state["arg"] + return state + + def __setstate__(self, state): + self.__dict__.update(state) + if not hasattr(self, "arg"): + self.arg = None + + def var_in(self, var): + assert isinstance(var, str) + + if isinstance(self.arg, ir.Var): + return self.arg.name == var + + if isinstance(self.arg, str): + return self.arg == var + + return False + + def arg_size(self, x, lowerer): + if DEBUG_OPENMP >= 2: + print("arg_size:", x, type(x)) + if isinstance(x, NameSlice): + x = x.name + if isinstance(x, ir.Var): + # Make sure the var referred to has been alloc'ed already. 
+ lowerer._alloca_var(x.name, lowerer.fndesc.typemap[x.name]) + if self.load: + assert False + else: + arg_str = lowerer.getvar(x.name) + return lowerer.context.get_abi_sizeof(arg_str.type.pointee) + elif isinstance(x, lir.instructions.AllocaInstr): + return lowerer.context.get_abi_sizeof(x.type.pointee) + elif isinstance(x, str): + xtyp = lowerer.fndesc.typemap[x] + if DEBUG_OPENMP >= 1: + print("xtyp:", xtyp, type(xtyp)) + lowerer._alloca_var(x, xtyp) + if self.load: + assert False + else: + arg_str = lowerer.getvar(x) + return lowerer.context.get_abi_sizeof(arg_str.type.pointee) + elif isinstance(x, int): + assert False + else: + print("unknown arg type:", x, type(x)) + assert False + + def arg_to_str( + self, x, lowerer, struct_lower=False, var_table=None, gen_copy=False + ): + if DEBUG_OPENMP >= 1: + print("arg_to_str:", x, type(x), self.load, type(self.load)) + if struct_lower: + assert isinstance(x, str) + assert var_table is not None + + typemap = lowerer.fndesc.typemap + + if isinstance(x, NameSlice): + if DEBUG_OPENMP >= 2: + print("nameslice found:", x) + x = x.name + if isinstance(x, ir.Var): + # Make sure the var referred to has been alloc'ed already. + lowerer._alloca_var(x.name, typemap_lookup(typemap, x)) + if self.load: + if not self.loaded_arg: + self.loaded_arg = lowerer.loadvar(x.name) + lop = self.loaded_arg.operands[0] + loptype = lop.type + pointee = loptype.pointee + ref = self.loaded_arg._get_reference() + decl = str(pointee) + " " + ref + else: + arg_str = lowerer.getvar(x.name) + if isinstance(arg_str, lir.values.Argument): + decl = str(arg_str) + else: + decl = get_decl(arg_str) + elif isinstance(x, lir.instructions.AllocaInstr): + decl = get_decl(x) + elif isinstance(x, str): + if "*" in x: + xsplit = x.split("*") + assert len(xsplit) == 2 + # xtyp = get_dotted_type(x, typemap, lowerer) + xtyp = typemap_lookup(typemap, xsplit[0]) + if DEBUG_OPENMP >= 1: + print("xtyp:", xtyp, type(xtyp)) + lowerer._alloca_var(x, xtyp) + if self.load: + if not self.loaded_arg: + self.loaded_arg = lowerer.loadvar(x) + lop = self.loaded_arg.operands[0] + loptype = lop.type + pointee = loptype.pointee + ref = self.loaded_arg._get_reference() + decl = str(pointee) + " " + ref + assert len(xsplit) == 1 + else: + arg_str = lowerer.getvar(xsplit[0]) + # arg_str = lowerer.getvar(x) + if isinstance(arg_str, lir.Argument): + decl = str(arg_str) + else: + decl = get_decl(arg_str) + if len(xsplit) > 1: + cur_typ = xtyp + field_indices = [] + for field in xsplit[1:]: + dm = lowerer.context.data_model_manager.lookup(cur_typ) + findex = dm._fields.index(field) + field_indices.append("i32 " + str(findex)) + cur_typ = dm._members[findex] + fi_str = ",".join(field_indices) + decl += f", {fi_str}" + # decl = f"SCOPE({decl}, {fi_str})" + else: + xtyp = typemap_lookup(typemap, x) + if DEBUG_OPENMP >= 1: + print("xtyp:", xtyp, type(xtyp)) + lowerer._alloca_var(x, xtyp) + if self.load: + if not self.loaded_arg: + self.loaded_arg = lowerer.loadvar(x) + lop = self.loaded_arg.operands[0] + loptype = lop.type + pointee = loptype.pointee + ref = self.loaded_arg._get_reference() + decl = str(pointee) + " " + ref + else: + arg_str = lowerer.getvar(x) + if isinstance(arg_str, lir.values.Argument): + decl = str(arg_str) + elif isinstance(arg_str, lir.instructions.AllocaInstr): + decl = get_decl(arg_str) + else: + assert False, ( + f"Don't know how to get decl string for variable {arg_str} of type {type(arg_str)}" + ) + + if struct_lower and isinstance(xtyp, types.npytypes.Array): + dm = 
lowerer.context.data_model_manager.lookup(xtyp) + cur_tag_ndim = xtyp.ndim + stride_typ = lowerer.context.get_value_type( + types.intp + ) # lir.Type.int(64) + stride_abi_size = lowerer.context.get_abi_sizeof(stride_typ) + array_var = var_table[self.arg] + if DEBUG_OPENMP >= 1: + print( + "Found array mapped:", + self.name, + self.arg, + xtyp, + type(xtyp), + stride_typ, + type(stride_typ), + stride_abi_size, + array_var, + type(array_var), + ) + size_var = ir.Var(None, self.arg + "_size_var", array_var.loc) + # size_var = array_var.scope.redefine("size_var", array_var.loc) + size_getattr = ir.Expr.getattr(array_var, "size", array_var.loc) + size_assign = ir.Assign(size_getattr, size_var, array_var.loc) + typemap[size_var.name] = types.int64 + lowerer._alloca_var(size_var.name, typemap[size_var.name]) + lowerer.lower_inst(size_assign) + data_field = dm._fields.index("data") + shape_field = dm._fields.index("shape") + strides_field = dm._fields.index("strides") + size_lowered = get_decl(lowerer.getvar(size_var.name)) + fixed_size = cur_tag_ndim + # fixed_size = stride_abi_size * cur_tag_ndim + decl += f", i32 {data_field}, i64 0, {size_lowered}" + decl += f", i32 {shape_field}, i64 0, i64 {fixed_size}" + decl += f", i32 {strides_field}, i64 0, i64 {fixed_size}" + + # see core/datamodel/models.py + # struct_tags.append(openmp_tag(cur_tag.name, cur_tag.arg + "*data", non_arg=True, omp_slice=(0,lowerer.loadvar(size_var.name)))) + # struct_tags.append(openmp_tag(cur_tag.name, cur_tag.arg + "*shape", non_arg=True, omp_slice=(0,stride_abi_size * cur_tag_ndim))) + # struct_tags.append(openmp_tag(cur_tag.name, cur_tag.arg + "*strides", non_arg=True, omp_slice=(0,stride_abi_size * cur_tag_ndim))) + + if gen_copy and isinstance(xtyp, types.npytypes.Array): + native_np_copy, copy_cres = create_native_np_copy(xtyp) + lowerer.library.add_llvm_module(copy_cres.library._final_module) + nnclen = len(native_np_copy) + decl += f', [{nnclen} x i8] c"{native_np_copy}"' + elif isinstance(x, StringLiteral): + decl = str(cgutils.make_bytearray(x.x)) + elif isinstance(x, int): + decl = "i32 " + str(x) + else: + print("unknown arg type:", x, type(x)) + + if self.omp_slice is not None: + + def handle_var(x): + if isinstance(x, ir.Var): + loaded_size = lowerer.loadvar(x.name) + loaded_op = loaded_size.operands[0] + loaded_pointee = loaded_op.type.pointee + ret = str(loaded_pointee) + " " + loaded_size._get_reference() + else: + ret = "i64 " + str(x) + return ret + + start_slice = handle_var(self.omp_slice[0]) + end_slice = handle_var(self.omp_slice[1]) + decl += f", {start_slice}, {end_slice}" + # decl = f"SLICE({decl}, {self.omp_slice[0]}, {self.omp_slice[1]})" + + return decl + + def post_entry(self, lowerer): + for xarginfo, xarginfo_args, x, alloca_tuple_list in self.xarginfo: + loaded_args = [ + lowerer.builder.load(alloca_tuple[2]) + for alloca_tuple in alloca_tuple_list + ] + fa_res = xarginfo.from_arguments(lowerer.builder, tuple(loaded_args)) + # fa_res = xarginfo.from_arguments(lowerer.builder,tuple([xarg for xarg in xarginfo_args])) + assert len(fa_res) == 1 + lowerer.storevar(fa_res[0], x) + + def add_length_firstprivate(self, x, lowerer): + if self.name == "QUAL.OMP.FIRSTPRIVATE": + return [x] + # return [x, self.arg_size(x, lowerer)] + # return [x, lowerer.context.get_constant(types.uintp, self.arg_size(x, lowerer))] + else: + return [x] + + def unpack_arg(self, x, lowerer, xarginfo_list): + if isinstance(x, ir.Var): + return self.add_length_firstprivate(x, lowerer), None + elif isinstance(x, 
lir.instructions.AllocaInstr): + return self.add_length_firstprivate(x, lowerer), None + elif isinstance(x, str): + xtyp = lowerer.fndesc.typemap[x] + if DEBUG_OPENMP >= 2: + print("xtyp:", xtyp, type(xtyp)) + if self.load: + return self.add_length_firstprivate(x, lowerer), None + else: + names_to_unpack = [] + # names_to_unpack = ["QUAL.OMP.FIRSTPRIVATE"] + # names_to_unpack = ["QUAL.OMP.PRIVATE", "QUAL.OMP.FIRSTPRIVATE"] + if ( + isinstance(xtyp, types.npytypes.Array) + and self.name in names_to_unpack + ): + # from core/datamodel/packer.py + xarginfo = lowerer.context.get_arg_packer((xtyp,)) + xloaded = lowerer.loadvar(x) + xarginfo_args = list( + xarginfo.as_arguments(lowerer.builder, [xloaded]) + ) + xarg_alloca_vars = [] + for xarg in xarginfo_args: + if DEBUG_OPENMP >= 2: + print( + "xarg:", + type(xarg), + xarg, + "agg:", + xarg.aggregate, + type(xarg.aggregate), + "ind:", + xarg.indices, + ) + print(xarg.aggregate.type.elements[xarg.indices[0]]) + alloca_name = "$alloca_" + xarg.name + alloca_typ = xarg.aggregate.type.elements[xarg.indices[0]] + alloca_res = lowerer.alloca_lltype(alloca_name, alloca_typ) + if DEBUG_OPENMP >= 2: + print( + "alloca:", + alloca_name, + alloca_typ, + alloca_res, + alloca_res.get_reference(), + ) + xarg_alloca_vars.append((alloca_name, alloca_typ, alloca_res)) + lowerer.builder.store(xarg, alloca_res) + xarginfo_list.append((xarginfo, xarginfo_args, x, xarg_alloca_vars)) + rets = [] + for i, xarg in enumerate(xarg_alloca_vars): + rets.append(xarg[2]) + if i == 4: + alloca_name = "$alloca_total_size_" + str(x) + if DEBUG_OPENMP >= 2: + print("alloca_name:", alloca_name) + alloca_typ = lowerer.context.get_value_type( + types.intp + ) # lir.Type.int(64) + alloca_res = lowerer.alloca_lltype(alloca_name, alloca_typ) + if DEBUG_OPENMP >= 2: + print( + "alloca:", + alloca_name, + alloca_typ, + alloca_res, + alloca_res.get_reference(), + ) + mul_res = lowerer.builder.mul( + lowerer.builder.load(xarg_alloca_vars[2][2]), + lowerer.builder.load(xarg_alloca_vars[3][2]), + ) + lowerer.builder.store(mul_res, alloca_res) + rets.append(alloca_res) + else: + rets.append(self.arg_size(xarg[2], lowerer)) + return rets, [x] + else: + return self.add_length_firstprivate(x, lowerer), None + elif isinstance(x, int): + return self.add_length_firstprivate(x, lowerer), None + else: + print("unknown arg type:", x, type(x)) + + return self.add_length_firstprivate(x, lowerer), None + + def unpack_arrays(self, lowerer): + if isinstance(self.arg, list): + arg_list = self.arg + elif self.arg is not None: + arg_list = [self.arg] + else: + return [self] + new_xarginfo = [] + unpack_res = [self.unpack_arg(arg, lowerer, new_xarginfo) for arg in arg_list] + new_args = [x[0] for x in unpack_res] + arrays_to_private = [] + for x in unpack_res: + if x[1]: + arrays_to_private.append(x[1]) + ot_res = openmp_tag(self.name, sum(new_args, []), self.load) + ot_res.xarginfo = new_xarginfo + return [ot_res] + ( + [] + if len(arrays_to_private) == 0 + else [openmp_tag("QUAL.OMP.PRIVATE", sum(arrays_to_private, []), self.load)] + ) + + def lower(self, lowerer, debug): + decl = "" + if debug and DEBUG_OPENMP >= 1: + print("openmp_tag::lower", self.name, self.arg, type(self.arg)) + + if isinstance(self.arg, list): + arg_list = self.arg + elif self.arg is not None: + arg_list = [self.arg] + else: + arg_list = [] + typemap = lowerer.fndesc.typemap + assert len(arg_list) <= 1 + + if self.name == "QUAL.OMP.TARGET.IMPLICIT": + assert False # shouldn't get here anymore + + name_to_use = self.name + + is_array 
= self.arg in typemap and isinstance( + typemap[self.arg], types.npytypes.Array + ) + + gen_copy = name_to_use in ["QUAL.OMP.FIRSTPRIVATE", "QUAL.OMP.LASTPRIVATE"] + + if ( + name_to_use + in [ + "QUAL.OMP.MAP.TOFROM", + "QUAL.OMP.MAP.TO", + "QUAL.OMP.MAP.FROM", + "QUAL.OMP.MAP.ALLOC", + ] + and is_array + ): + # name_to_use += ".STRUCT" + # var_table = get_name_var_table(lowerer.func_ir.blocks) + # decl = ",".join([self.arg_to_str(x, lowerer, struct_lower=True, var_table=var_table) for x in arg_list]) + decl = ",".join( + [ + self.arg_to_str(x, lowerer, struct_lower=False, gen_copy=gen_copy) + for x in arg_list + ] + ) + else: + decl = ",".join( + [ + self.arg_to_str(x, lowerer, struct_lower=False, gen_copy=gen_copy) + for x in arg_list + ] + ) + + return '"' + name_to_use + '"(' + decl + ")" + + def replace_vars_inner(self, var_dict): + if isinstance(self.arg, ir.Var): + self.arg = replace_vars_inner(self.arg, var_dict) + + def add_to_usedef_set(self, use_set, def_set, start): + assert start == True or start == False + if DEBUG_OPENMP >= 3: + print("add_to_usedef_set", start, self.name, "is_dsa=", is_dsa(self.name)) + + def add_arg(arg, the_set): + if isinstance(self.arg, ir.Var): + the_set.add(self.arg.name) + elif isinstance(self.arg, str): + the_set.add(self.arg) + elif isinstance(self.arg, NameSlice): + assert isinstance(self.arg.name, str), "Expected str in NameSlice arg" + the_set.add(self.arg.name) + # TODO: Create a good error check mechanism. + # else: ? + + if self.name.startswith("DIR.OMP"): + assert not isinstance(self.arg, (ir.Var, str)) + return + + if self.name in [ + "QUAL.OMP.MAP.TO", + "QUAL.OMP.IF", + "QUAL.OMP.NUM_THREADS", + "QUAL.OMP.NUM_TEAMS", + "QUAL.OMP.THREAD_LIMIT", + "QUAL.OMP.SCHEDULE.STATIC", + "QUAL.OMP.SCHEDULE.RUNTIME", + "QUAL.OMP.SCHEDULE.GUIDED", + "QUAL.OMP.SCHEDULE_DYNAMIC", + "QUAL.OMP.FIRSTPRIVATE", + "QUAL.OMP.COPYIN", + "QUAL.OMP.COPYPRIVATE", + "QUAL.OMP.NORMALIZED.LB", + "QUAL.OMP.NORMALIZED.START", + "QUAL.OMP.NORMALIZED.UB", + "QUAL.OMP.MAP.TO.STRUCT", + ]: + if start: + add_arg(self.arg, use_set) + elif self.name in [ + "QUAL.OMP.PRIVATE", + "QUAL.OMP.LINEAR", + "QUAL.OMP.NORMALIZED.IV", + "QUAL.OMP.MAP.ALLOC", + "QUAL.OMP.MAP.ALLOC.STRUCT", + ]: + # Intentionally do nothing. + pass + elif self.name in ["QUAL.OMP.SHARED"]: + add_arg(self.arg, use_set) + elif self.name in [ + "QUAL.OMP.MAP.TOFROM", + "QUAL.OMP.TARGET.IMPLICIT", + "QUAL.OMP.MAP.TOFROM.STRUCT", + ]: + if start: + add_arg(self.arg, use_set) + else: + add_arg(self.arg, use_set) + add_arg(self.arg, def_set) + elif self.name in [ + "QUAL.OMP.MAP.FROM", + "QUAL.OMP.LASTPRIVATE", + "QUAL.OMP.MAP.FROM.STRUCT", + ] or self.name.startswith("QUAL.OMP.REDUCTION"): + if not start: + add_arg(self.arg, use_set) + add_arg(self.arg, def_set) + else: + # All other clauses should not have a variable argument. 
+ if isinstance(self.arg, (ir.Var, str)): + print("Bad usedef tag:", self.name, self.arg) + assert not isinstance(self.arg, (ir.Var, str)) + + def __str__(self): + return ( + "openmp_tag(" + + str(self.name) + + "," + + str(self.arg) + + ( + "" + if self.omp_slice is None + else f", omp_slice({self.omp_slice[0]},{self.omp_slice[1]})" + ) + + ")" + ) + + def __repr__(self): + return self.__str__() + + +def openmp_tag_list_to_str(tag_list, lowerer, debug): + tag_strs = [x.lower(lowerer, debug) for x in tag_list] + return "[ " + ", ".join(tag_strs) + " ]" + + +def list_vars_from_tags(tags): + used_vars = [] + for t in tags: + if isinstance(t.arg, ir.Var): + used_vars.append(t.arg) + return used_vars + + +def openmp_region_alloca(obj, alloca_instr, typ): + obj.alloca(alloca_instr, typ) + + +def push_alloca_callback(lowerer, callback, data, builder): + # cgutils.push_alloca_callbacks(callback, data) + if not hasattr(builder, "_lowerer_push_alloca_callbacks"): + builder._lowerer_push_alloca_callbacks = 0 + builder._lowerer_push_alloca_callbacks += 1 + + +def pop_alloca_callback(lowerer, builder): + # cgutils.pop_alloca_callbacks() + builder._lowerer_push_alloca_callbacks -= 1 + + +def in_openmp_region(builder): + if hasattr(builder, "_lowerer_push_alloca_callbacks"): + return builder._lowerer_push_alloca_callbacks > 0 + else: + return False + + +def find_target_start_end(func_ir, target_num): + start_block = None + end_block = None + + for label, block in func_ir.blocks.items(): + if isinstance(block.body[0], openmp_region_start): + block_target_num = block.body[0].has_target() + if target_num == block_target_num: + start_block = label + if start_block is not None and end_block is not None: + return start_block, end_block + elif isinstance(block.body[0], openmp_region_end): + block_target_num = block.body[0].start_region.has_target() + if target_num == block_target_num: + end_block = label + if start_block is not None and end_block is not None: + return start_block, end_block + + dprint_func_ir(func_ir, "find_target_start_end") + print("target_num:", target_num) + assert False + + +def get_tags_of_type(clauses, ctype): + ret = [] + for c in clauses: + if c.name == ctype: + ret.append(c) + return ret + + +def copy_one(x, calltypes): + if DEBUG_OPENMP >= 2: + print("copy_one:", x, type(x)) + if isinstance(x, ir.Loc): + return copy.copy(x) + elif isinstance(x, ir.Expr): + if x in calltypes: + ctyp = calltypes[x] + else: + ctyp = None + ret = ir.Expr( + copy_one(x.op, calltypes), + copy_one(x.loc, calltypes), + **copy_one(x._kws, calltypes), + ) + if ctyp and ret not in calltypes: + calltypes[ret] = ctyp + return ret + elif isinstance(x, dict): + return {k: copy_one(v, calltypes) for k, v in x.items()} + elif isinstance(x, list): + return [copy_one(v, calltypes) for v in x] + elif isinstance(x, tuple): + return tuple([copy_one(v, calltypes) for v in x]) + elif isinstance(x, ir.Const): + return ir.Const( + copy_one(x.value, calltypes), copy_one(x.loc, calltypes), x.use_literal_type + ) + elif isinstance( + x, + ( + int, + float, + str, + ir.Global, + python_types.BuiltinFunctionType, + ir.UndefinedType, + type(None), + types.functions.ExternalFunction, + ), + ): + return x + elif isinstance(x, ir.Var): + return ir.Var(x.scope, copy_one(x.name, calltypes), copy_one(x.loc, calltypes)) + elif isinstance(x, ir.Del): + return ir.Del(copy_one(x.value, calltypes), copy_one(x.loc, calltypes)) + elif isinstance(x, ir.Jump): + return ir.Jump(copy_one(x.target, calltypes), copy_one(x.loc, calltypes)) + elif 
isinstance(x, ir.Return): + return ir.Return(copy_one(x.value, calltypes), copy_one(x.loc, calltypes)) + elif isinstance(x, ir.Branch): + return ir.Branch( + copy_one(x.cond, calltypes), + copy_one(x.truebr, calltypes), + copy_one(x.falsebr, calltypes), + copy_one(x.loc, calltypes), + ) + elif isinstance(x, ir.Print): + ctyp = calltypes[x] + ret = copy.copy(x) + calltypes[ret] = ctyp + return ret + elif isinstance(x, ir.Assign): + return ir.Assign( + copy_one(x.value, calltypes), + copy_one(x.target, calltypes), + copy_one(x.loc, calltypes), + ) + elif isinstance(x, ir.Arg): + return ir.Arg( + copy_one(x.name, calltypes), + copy_one(x.index, calltypes), + copy_one(x.loc, calltypes), + ) + elif isinstance(x, ir.SetItem): + ctyp = calltypes[x] + ret = ir.SetItem( + copy_one(x.target, calltypes), + copy_one(x.index, calltypes), + copy_one(x.value, calltypes), + copy_one(x.loc, calltypes), + ) + calltypes[ret] = ctyp + return ret + elif isinstance(x, ir.StaticSetItem): + ctyp = calltypes[x] + ret = ir.StaticSetItem( + copy_one(x.target, calltypes), + copy_one(x.index, calltypes), + copy_one(x.index_var, calltypes), + copy_one(x.value, calltypes), + copy_one(x.loc, calltypes), + ) + calltypes[ret] = ctyp + return ret + elif isinstance(x, ir.FreeVar): + return ir.FreeVar( + copy_one(x.index, calltypes), + copy_one(x.name, calltypes), + copy_one(x.value, calltypes), + copy_one(x.loc, calltypes), + ) + elif isinstance(x, slice): + return slice( + copy_one(x.start, calltypes), + copy_one(x.stop, calltypes), + copy_one(x.step, calltypes), + ) + elif isinstance(x, ir.PopBlock): + return ir.PopBlock(copy_one(x.loc, calltypes)) + elif isinstance(x, ir.SetAttr): + ctyp = calltypes[x] + ret = ir.SetAttr( + copy_one(x.target, calltypes), + copy_one(x.attr, calltypes), + copy_one(x.value, calltypes), + copy_one(x.loc, calltypes), + ) + calltypes[ret] = ctyp + return ret + elif isinstance(x, ir.DelAttr): + return ir.DelAttr( + copy_one(x.target, calltypes), + copy_one(x.attr, calltypes), + copy_one(x.loc, calltypes), + ) + elif isinstance(x, types.Type): + return x # Don't copy types. + print("Failed to handle the following type when copying target IR.", type(x), x) + assert False + + +def copy_ir(input_ir, calltypes, depth=1): + assert depth >= 0 and depth <= 1 + + # This is a depth 0 copy. + cur_ir = input_ir.copy() + if depth == 1: + for blk in cur_ir.blocks.values(): + for i in range(len(blk.body)): + if not isinstance( + blk.body[i], (openmp_region_start, openmp_region_end) + ): + blk.body[i] = copy_one(blk.body[i], calltypes) + + return cur_ir + + +def is_target_tag(x): + ret = x.startswith("DIR.OMP.TARGET") and x not in [ + "DIR.OMP.TARGET.DATA", + "DIR.OMP.TARGET.ENTER.DATA", + "DIR.OMP.TARGET.EXIT.DATA", + ] + return ret + + +def replace_np_empty_with_cuda_shared( + outlined_ir, typemap, calltypes, prefix, typingctx +): + if DEBUG_OPENMP >= 2: + print("starting replace_np_empty_with_cuda_shared") + outlined_ir = outlined_ir.blocks + converted_arrays = [] + consts = {} + topo_order = find_topo_order(outlined_ir) + mode = 0 # 0 = non-target region, 1 = target region, 2 = teams region, 3 = teams parallel region + # For each block in topological order... + for label in topo_order: + block = outlined_ir[label] + new_block_body = [] + blen = len(block.body) + index = 0 + # For each statement in the block. + while index < blen: + stmt = block.body[index] + # Adjust mode based on the start of an openmp region. 
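+            # The mode also picks the replacement allocator used further down:
+            # np.empty becomes numba.cuda.shared.array inside a teams region
+            # (mode 2) and numba.cuda.local.array inside a teams+parallel
+            # region (mode 3).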
+ if isinstance(stmt, openmp_region_start): + if "TARGET" in stmt.tags[0].name: + assert mode == 0 + mode = 1 + if "TEAMS" in stmt.tags[0].name and mode == 1: + mode = 2 + if "PARALLEL" in stmt.tags[0].name and mode == 2: + mode = 3 + new_block_body.append(stmt) + # Adjust mode based on the end of an openmp region. + elif isinstance(stmt, openmp_region_end): + if mode == 3 and "PARALLEL" in stmt.tags[0].name: + mode = 2 + if mode == 2 and "TEAMS" in stmt.tags[0].name: + mode = 1 + if mode == 1 and "TARGET" in stmt.tags[0].name: + mode = 0 + new_block_body.append(stmt) + # Fix calltype for the np.empty call to have literal as first + # arg and include explicit dtype. + elif ( + isinstance(stmt, ir.Assign) + and isinstance(stmt.value, ir.Expr) + and stmt.value.op == "call" + and stmt.value.func in converted_arrays + ): + size = consts[stmt.value.args[0].name] + # The 1D case where the dimension size is directly a const. + if isinstance(size, ir.Const): + size = size.value + signature = calltypes[stmt.value] + signature_args = ( + types.scalars.IntegerLiteral(size), + types.functions.NumberClass(signature.return_type.dtype), + ) + del calltypes[stmt.value] + calltypes[stmt.value] = typing.templates.Signature( + signature.return_type, signature_args, signature.recvr + ) + # The 2D+ case where the dimension sizes are in a tuple. + elif isinstance(size, ir.Expr): + signature = calltypes[stmt.value] + signature_args = ( + types.Tuple( + [ + types.scalars.IntegerLiteral(consts[x.name].value) + for x in size.items + ] + ), + types.functions.NumberClass(signature.return_type.dtype), + ) + del calltypes[stmt.value] + calltypes[stmt.value] = typing.templates.Signature( + signature.return_type, signature_args, signature.recvr + ) + + # These lines will force the function to be in the data structures that lowering uses. + afnty = typemap[stmt.value.func.name] + afnty.get_call_type(typingctx, signature_args, {}) + if len(stmt.value.args) == 1: + dtype_to_use = signature.return_type.dtype + # If dtype in kwargs then remove it. + if len(stmt.value.kws) > 0: + for kwarg in stmt.value.kws: + if kwarg[0] == "dtype": + stmt.value.kws = list( + filter(lambda x: x[0] != "dtype", stmt.value.kws) + ) + break + new_block_body.append( + ir.Assign( + ir.Global("np", np, lhs.loc), + ir.Var(lhs.scope, mk_unique_var(".np_global"), lhs.loc), + lhs.loc, + ) + ) + typemap[new_block_body[-1].target.name] = types.Module(np) + new_block_body.append( + ir.Assign( + ir.Expr.getattr( + new_block_body[-1].target, str(dtype_to_use), lhs.loc + ), + ir.Var(lhs.scope, mk_unique_var(".np_dtype"), lhs.loc), + lhs.loc, + ) + ) + typemap[new_block_body[-1].target.name] = ( + types.functions.NumberClass(signature.return_type.dtype) + ) + stmt.value.args.append(new_block_body[-1].target) + else: + raise NotImplementedError( + "np.empty having more than shape and dtype arguments not yet supported." + ) + new_block_body.append(stmt) + # Keep track of variables assigned from consts or from build_tuples make up exclusively of + # variables assigned from consts. + elif isinstance(stmt, ir.Assign) and ( + isinstance(stmt.value, ir.Const) + or ( + isinstance(stmt.value, ir.Expr) + and stmt.value.op == "build_tuple" + and all([x.name in consts for x in stmt.value.items]) + ) + ): + consts[stmt.target.name] = stmt.value + new_block_body.append(stmt) + # If we see a global for the numpy module. 
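+            # The lookahead below additionally requires the next statement to
+            # be a getattr of "empty" on that module and that we are inside a
+            # target region (mode > 0); otherwise both statements are kept
+            # unchanged.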
+ elif ( + isinstance(stmt, ir.Assign) + and isinstance(stmt.value, ir.Global) + and isinstance(stmt.value.value, python_types.ModuleType) + and stmt.value.value.__name__ == "numpy" + ): + lhs = stmt.target + index += 1 + next_stmt = block.body[index] + # And the next statement is a getattr for the name "empty" on the numpy module + # and we are in a target region. + if ( + isinstance(next_stmt, ir.Assign) + and isinstance(next_stmt.value, ir.Expr) + and next_stmt.value.value == lhs + and next_stmt.value.op == "getattr" + and next_stmt.value.attr == "empty" + and mode > 0 + ): + # Remember that we are converting this np.empty into a CUDA call. + converted_arrays.append(next_stmt.target) + + # Create numba.cuda module variable. + new_block_body.append( + ir.Assign( + ir.Global("numba", numba, lhs.loc), + ir.Var( + lhs.scope, mk_unique_var(".cuda_shared_global"), lhs.loc + ), + lhs.loc, + ) + ) + typemap[new_block_body[-1].target.name] = types.Module(numba) + new_block_body.append( + ir.Assign( + ir.Expr.getattr(new_block_body[-1].target, "cuda", lhs.loc), + ir.Var( + lhs.scope, + mk_unique_var(".cuda_shared_getattr"), + lhs.loc, + ), + lhs.loc, + ) + ) + typemap[new_block_body[-1].target.name] = types.Module(numba.cuda) + + if mode == 1: + raise NotImplementedError( + "np.empty used in non-teams or parallel target region" + ) + pass + elif mode == 2: + # Create numba.cuda.shared module variable. + new_block_body.append( + ir.Assign( + ir.Expr.getattr( + new_block_body[-1].target, "shared", lhs.loc + ), + ir.Var( + lhs.scope, + mk_unique_var(".cuda_shared_getattr"), + lhs.loc, + ), + lhs.loc, + ) + ) + typemap[new_block_body[-1].target.name] = types.Module( + numba.cuda.stubs.shared + ) + elif mode == 3: + # Create numba.cuda.local module variable. + new_block_body.append( + ir.Assign( + ir.Expr.getattr( + new_block_body[-1].target, "local", lhs.loc + ), + ir.Var( + lhs.scope, + mk_unique_var(".cuda_local_getattr"), + lhs.loc, + ), + lhs.loc, + ) + ) + typemap[new_block_body[-1].target.name] = types.Module( + numba.cuda.stubs.local + ) + + # Change the typemap for the original function variable for np.empty. + afnty = typingctx.resolve_getattr( + typemap[new_block_body[-1].target.name], "array" + ) + del typemap[next_stmt.target.name] + typemap[next_stmt.target.name] = afnty + # Change the variable that previously was assigned np.empty to now be one of + # the CUDA array allocators. 
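+                    # The getattr below rebinds that variable to
+                    # numba.cuda.{shared,local}.array; the literal-shape call
+                    # signature installed earlier in this function matches
+                    # these allocators, whose shape argument must be a
+                    # compile-time constant.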
+ new_block_body.append( + ir.Assign( + ir.Expr.getattr( + new_block_body[-1].target, "array", lhs.loc + ), + next_stmt.target, + lhs.loc, + ) + ) + else: + new_block_body.append(stmt) + new_block_body.append(next_stmt) + else: + new_block_body.append(stmt) + index += 1 + block.body = new_block_body + + +class openmp_region_start(ir.Stmt): + def __init__(self, tags, region_number, loc, firstprivate_dead_after=None): + if DEBUG_OPENMP >= 2: + print("region ids openmp_region_start::__init__", id(self)) + self.tags = tags + self.region_number = region_number + self.loc = loc + self.omp_region_var = None + self.omp_metadata = None + self.tag_vars = set() + self.normal_iv = None + self.target_copy = False + self.firstprivate_dead_after = ( + [] if firstprivate_dead_after is None else firstprivate_dead_after + ) + for tag in self.tags: + if isinstance(tag.arg, ir.Var): + self.tag_vars.add(tag.arg.name) + elif isinstance(tag.arg, str): + self.tag_vars.add(tag.arg) + elif isinstance(tag.arg, NameSlice): + self.tag_vars.add(tag.arg.name) + + if tag.name == "QUAL.OMP.NORMALIZED.IV": + self.normal_iv = tag.arg + if DEBUG_OPENMP >= 1: + print("tags:", self.tags) + print("tag_vars:", sorted(self.tag_vars)) + self.acq_res = False + self.acq_rel = False + self.alloca_queue = [] + self.end_region = None + + def __getstate__(self): + state = self.__dict__.copy() + return state + + def __setstate__(self, state): + self.__dict__.update(state) + + def replace_var_names(self, namedict): + for i in range(len(self.tags)): + if isinstance(self.tags[i].arg, ir.Var): + if self.tags[i].arg.name in namedict: + var = self.tags[i].arg + self.tags[i].arg = ir.Var(var.scope, namedict[var.name], var.log) + elif isinstance(self.tags[i].arg, str): + if "*" in self.tags[i].arg: + xsplit = self.tags[i].arg.split("*") + assert len(xsplit) == 2 + if xsplit[0] in namedict: + self.tags[i].arg = namedict[xsplit[0]] + "*" + xsplit[1] + else: + if self.tags[i].arg in namedict: + self.tags[i].arg = namedict[self.tags[i].arg] + + def add_tag(self, tag): + tag_arg_str = None + if isinstance(tag.arg, ir.Var): + tag_arg_str = tag.arg.name + elif isinstance(tag.arg, str): + tag_arg_str = tag.arg + elif isinstance(tag.arg, lir.instructions.AllocaInstr): + tag_arg_str = tag.arg._get_name() + else: + assert False + if isinstance(tag_arg_str, str): + self.tag_vars.add(tag_arg_str) + self.tags.append(tag) + + def get_var_dsa(self, var): + assert isinstance(var, str) + for tag in self.tags: + if is_dsa(tag.name) and tag.var_in(var): + return tag.name + return None + + def requires_acquire_release(self): + pass + # self.acq_res = True + + def requires_combined_acquire_release(self): + pass + # self.acq_rel = True + + def has_target(self): + for t in self.tags: + if is_target_tag(t.name): + return t.arg + return None + + def list_vars(self): + return list_vars_from_tags(self.tags) + + def update_tags(self): + with self.builder.goto_block(self.block): + cur_instr = -1 + + while True: + last_instr = self.builder.block.instructions[cur_instr] + if ( + isinstance(last_instr, lir.instructions.CallInstr) + and last_instr.tags is not None + and len(last_instr.tags) > 0 + ): + break + cur_instr -= 1 + + last_instr.tags = openmp_tag_list_to_str(self.tags, self.lowerer, False) + if DEBUG_OPENMP >= 1: + print("last_tags:", last_instr.tags, type(last_instr.tags)) + + def alloca(self, alloca_instr, typ): + # We can't process these right away since the processing required can + # lead to infinite recursion. 
So, we just accumulate them in a queue + # and then process them later at the end_region marker so that the + # variables are guaranteed to exist in their full form so that when we + # process them then they won't lead to infinite recursion. + self.alloca_queue.append((alloca_instr, typ)) + + def process_alloca_queue(self): + # This should be old code...making sure with the assertion. + assert len(self.alloca_queue) == 0 + has_update = False + for alloca_instr, typ in self.alloca_queue: + has_update = self.process_one_alloca(alloca_instr, typ) or has_update + if has_update: + self.update_tags() + self.alloca_queue = [] + + def post_lowering_process_alloca_queue(self, enter_directive): + has_update = False + if DEBUG_OPENMP >= 1: + print("starting post_lowering_process_alloca_queue") + for alloca_instr, typ in self.alloca_queue: + has_update = self.process_one_alloca(alloca_instr, typ) or has_update + if has_update: + if DEBUG_OPENMP >= 1: + print( + "post_lowering_process_alloca_queue has update:", + enter_directive.tags, + ) + enter_directive.tags = openmp_tag_list_to_str( + self.tags, self.lowerer, False + ) + # LLVM IR is doing some string caching and the following line is necessary to + # reset that caching so that the original tag text can be overwritten above. + enter_directive._clear_string_cache() + if DEBUG_OPENMP >= 1: + print( + "post_lowering_process_alloca_queue updated tags:", + enter_directive.tags, + ) + self.alloca_queue = [] + + def process_one_alloca(self, alloca_instr, typ): + avar = alloca_instr.name + if DEBUG_OPENMP >= 1: + print( + "openmp_region_start process_one_alloca:", + id(self), + alloca_instr, + avar, + typ, + type(alloca_instr), + self.tag_vars, + ) + + has_update = False + if ( + self.normal_iv is not None + and avar != self.normal_iv + and avar.startswith(self.normal_iv) + ): + for i in range(len(self.tags)): + if DEBUG_OPENMP >= 1: + print("Replacing normalized iv with", avar) + self.tags[i].arg = avar + has_update = True + break + + if not self.needs_implicit_vars(): + return has_update + if avar not in self.tag_vars: + if DEBUG_OPENMP >= 1: + print( + f"LLVM variable {avar} didn't previously exist in the list of vars so adding as private." + ) + self.add_tag( + openmp_tag("QUAL.OMP.PRIVATE", alloca_instr) + ) # is FIRSTPRIVATE right here? + has_update = True + return has_update + + def needs_implicit_vars(self): + first_tag = self.tags[0] + if ( + first_tag.name == "DIR.OMP.PARALLEL" + or first_tag.name == "DIR.OMP.PARALLEL.LOOP" + or first_tag.name == "DIR.OMP.TASK" + ): + return True + return False + + def update_context(self, context, builder): + cctyp = type(context.call_conv) + # print("start update_context id(context)", id(context), "id(const.call_conv)", id(context.call_conv), "cctyp", cctyp, "id(cctyp)", id(cctyp)) + + if ( + not hasattr(cctyp, "pyomp_patch_installed") + or cctyp.pyomp_patch_installed == False + ): + cctyp.pyomp_patch_installed = True + # print("update_context", "id(cctyp.return_user_exec)", id(cctyp.return_user_exc), "id(context)", id(context)) + setattr(cctyp, "orig_return_user_exc", cctyp.return_user_exc) + + def pyomp_return_user_exc(self, builder, *args, **kwargs): + # print("pyomp_return_user_exc") + # Handle exceptions in OpenMP regions by emitting a trap and an + # unreachable terminator. 
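+                # This is presumably because the normal error-status return
+                # path is not available inside an outlined OpenMP region, so
+                # the region aborts instead of propagating the exception.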
+ if in_openmp_region(builder): + fnty = lir.types.FunctionType(lir.types.VoidType(), []) + fn = builder.module.declare_intrinsic("llvm.trap", (), fnty) + builder.call(fn, []) + builder.unreachable() + return + self.orig_return_user_exc(builder, *args, **kwargs) + + setattr(cctyp, "return_user_exc", pyomp_return_user_exc) + # print("after", id(pyomp_return_user_exc), id(cctyp.return_user_exc)) + + setattr( + cctyp, "orig_return_status_propagate", cctyp.return_status_propagate + ) + + def pyomp_return_status_propagate(self, builder, *args, **kwargs): + if in_openmp_region(builder): + return + self.orig_return_status_propagate(builder, *args, **kwargs) + + setattr(cctyp, "return_status_propagate", pyomp_return_status_propagate) + + cemtyp = type(context.error_model) + # print("start update_context id(context)", id(context), "id(const.error_model)", id(context.error_model), "cemtyp", cemtyp, "id(cemtyp)", id(cemtyp)) + + if ( + not hasattr(cemtyp, "pyomp_patch_installed") + or cemtyp.pyomp_patch_installed == False + ): + cemtyp.pyomp_patch_installed = True + # print("update_context", "id(cemtyp.return_user_exec)", id(cemtyp.fp_zero_division), "id(context)", id(context)) + setattr(cemtyp, "orig_fp_zero_division", cemtyp.fp_zero_division) + + def pyomp_fp_zero_division(self, builder, *args, **kwargs): + # print("pyomp_fp_zero_division") + if in_openmp_region(builder): + return False + return self.orig_fp_zero_division(builder, *args, **kwargs) + + setattr(cemtyp, "fp_zero_division", pyomp_fp_zero_division) + # print("after", id(pyomp_fp_zero_division), id(cemtyp.fp_zero_division)) + + pyapi = context.get_python_api(builder) + ptyp = type(pyapi) + + if ( + not hasattr(ptyp, "pyomp_patch_installed") + or ptyp.pyomp_patch_installed == False + ): + ptyp.pyomp_patch_installed = True + # print("update_context", "id(ptyp.emit_environment_sentry)", id(ptyp.emit_environment_sentry), "id(context)", id(context)) + setattr(ptyp, "orig_emit_environment_sentry", ptyp.emit_environment_sentry) + + def pyomp_emit_environment_sentry(self, *args, **kwargs): + builder = self.builder + # print("pyomp_emit_environment_sentry") + if in_openmp_region(builder): + return False + return self.orig_emit_environment_sentry(*args, **kwargs) + + setattr(ptyp, "emit_environment_sentry", pyomp_emit_environment_sentry) + # print("after", id(pyomp_emit_environment_sentry), id(ptyp.emit_environment_sentry)) + + def fix_dispatchers(self, typemap, typingctx, cuda_target): + fixup_dict = {} + for k, v in typemap.items(): + if isinstance(v, Dispatcher) and not isinstance( + v, numba_cuda.types.CUDADispatcher + ): + # targetoptions = v.targetoptions.copy() + # targetoptions['device'] = True + # targetoptions['debug'] = targetoptions.get('debug', False) + # targetoptions['opt'] = targetoptions.get('opt', True) + vdispatcher = v.dispatcher + vdispatcher.targetoptions.pop("nopython", None) + vdispatcher.targetoptions.pop("boundscheck", None) + disp = typingctx.resolve_value_type(vdispatcher) + fixup_dict[k] = disp + for sig in vdispatcher.overloads.keys(): + disp.dispatcher.compile_device(sig, cuda_target=cuda_target) + + for k, v in fixup_dict.items(): + del typemap[k] + typemap[k] = v + + def lower(self, lowerer): + typingctx = lowerer.context.typing_context + targetctx = lowerer.context + typemap = lowerer.fndesc.typemap + calltypes = lowerer.fndesc.calltypes + context = lowerer.context + builder = lowerer.builder + mod = builder.module + library = lowerer.library + library.openmp = True + self.block = builder.block + self.builder = 
builder + self.lowerer = lowerer + self.update_context(context, builder) + if DEBUG_OPENMP >= 1: + print( + "region ids lower:block", + id(self), + self, + id(self.block), + self.block, + type(self.block), + self.tags, + len(self.tags), + "builder_id:", + id(self.builder), + "block_id:", + id(self.block), + ) + for k, v in lowerer.func_ir.blocks.items(): + print("block post copy:", k, id(v), id(v.body)) + + # Convert implicit tags to explicit form now that we have typing info. + for i in range(len(self.tags)): + cur_tag = self.tags[i] + if cur_tag.name == "QUAL.OMP.TARGET.IMPLICIT": + if isinstance( + typemap_lookup(typemap, cur_tag.arg), types.npytypes.Array + ): + cur_tag.name = "QUAL.OMP.MAP.TOFROM" + else: + cur_tag.name = "QUAL.OMP.FIRSTPRIVATE" + + if DEBUG_OPENMP >= 1: + for otag in self.tags: + print("otag:", otag, type(otag.arg)) + + # Remove LLVM vars that might have been added if this is an OpenMP + # region inside a target region. + count_alloca_instr = len( + list( + filter( + lambda x: isinstance(x.arg, lir.instructions.AllocaInstr), self.tags + ) + ) + ) + assert count_alloca_instr == 0 + # self.tags = list(filter(lambda x: not isinstance(x.arg, lir.instructions.AllocaInstr), self.tags)) + if DEBUG_OPENMP >= 1: + print("after LLVM tag filter", self.tags, len(self.tags)) + for otag in self.tags: + print("otag:", otag, type(otag.arg)) + + host_side_target_tags = [] + target_num = self.has_target() + + def add_struct_tags(self, var_table): + extras_before = [] + struct_tags = [] + for i in range(len(self.tags)): + cur_tag = self.tags[i] + if cur_tag.name in [ + "QUAL.OMP.MAP.TOFROM", + "QUAL.OMP.MAP.TO", + "QUAL.OMP.MAP.FROM", + "QUAL.OMP.MAP.ALLOC", + ]: + cur_tag_var = cur_tag.arg + if isinstance(cur_tag_var, NameSlice): + cur_tag_var = cur_tag_var.name + assert isinstance(cur_tag_var, str) + cur_tag_typ = typemap_lookup(typemap, cur_tag_var) + if isinstance(cur_tag_typ, types.npytypes.Array): + cur_tag_ndim = cur_tag_typ.ndim + stride_typ = lowerer.context.get_value_type( + types.intp + ) # lir.Type.int(64) + stride_abi_size = context.get_abi_sizeof(stride_typ) + array_var = var_table[cur_tag_var] + if DEBUG_OPENMP >= 1: + print( + "Found array mapped:", + cur_tag.name, + cur_tag.arg, + cur_tag_typ, + type(cur_tag_typ), + stride_typ, + type(stride_typ), + stride_abi_size, + array_var, + type(array_var), + ) + uniqueness = get_unique() + if isinstance(cur_tag.arg, NameSlice): + the_slice = cur_tag.arg.the_slice[0][0] + assert the_slice.step is None + if isinstance(the_slice.start, int): + start_index_var = ir.Var( + None, + f"{cur_tag_var}_start_index_var{target_num}{uniqueness}", + array_var.loc, + ) + start_assign = ir.Assign( + ir.Const(the_slice.start, array_var.loc), + start_index_var, + array_var.loc, + ) + + typemap[start_index_var.name] = types.int64 + lowerer.lower_inst(start_assign) + extras_before.append(start_assign) + lowerer._alloca_var( + start_index_var.name, typemap[start_index_var.name] + ) + lowerer.loadvar(start_index_var.name) + else: + start_index_var = the_slice.start + assert isinstance(start_index_var, str) + start_index_var = ir.Var( + None, start_index_var, array_var.loc + ) + if isinstance(the_slice.stop, int): + end_index_var = ir.Var( + None, + f"{cur_tag_var}_end_index_var{target_num}{uniqueness}", + array_var.loc, + ) + end_assign = ir.Assign( + ir.Const(the_slice.stop, array_var.loc), + end_index_var, + array_var.loc, + ) + typemap[end_index_var.name] = types.int64 + lowerer.lower_inst(end_assign) + extras_before.append(end_assign) + 
lowerer._alloca_var( + end_index_var.name, typemap[end_index_var.name] + ) + lowerer.loadvar(end_index_var.name) + else: + end_index_var = the_slice.stop + assert isinstance(end_index_var, str) + end_index_var = ir.Var( + None, end_index_var, array_var.loc + ) + + num_elements_var = ir.Var( + None, + f"{cur_tag_var}_num_elements_var{target_num}{uniqueness}", + array_var.loc, + ) + size_binop = ir.Expr.binop( + operator.sub, + end_index_var, + start_index_var, + array_var.loc, + ) + size_assign = ir.Assign( + size_binop, num_elements_var, array_var.loc + ) + calltypes[size_binop] = typing.signature( + types.int64, types.int64, types.int64 + ) + else: + start_index_var = 0 + num_elements_var = ir.Var( + None, + f"{cur_tag_var}_num_elements_var{target_num}{uniqueness}", + array_var.loc, + ) + size_getattr = ir.Expr.getattr( + array_var, "size", array_var.loc + ) + size_assign = ir.Assign( + size_getattr, num_elements_var, array_var.loc + ) + + typemap[num_elements_var.name] = types.int64 + lowerer.lower_inst(size_assign) + extras_before.append(size_assign) + lowerer._alloca_var( + num_elements_var.name, typemap[num_elements_var.name] + ) + + # see core/datamodel/models.py + lowerer.loadvar(num_elements_var.name) # alloca the var + + # see core/datamodel/models.py + if isinstance(start_index_var, ir.Var): + lowerer.loadvar(start_index_var.name) # alloca the var + if isinstance(num_elements_var, ir.Var): + lowerer.loadvar(num_elements_var.name) # alloca the var + struct_tags.append( + openmp_tag( + cur_tag.name + ".STRUCT", + cur_tag_var + "*data", + non_arg=True, + omp_slice=(start_index_var, num_elements_var), + ) + ) + struct_tags.append( + openmp_tag( + "QUAL.OMP.MAP.TO.STRUCT", + cur_tag_var + "*shape", + non_arg=True, + omp_slice=(0, 1), + ) + ) + struct_tags.append( + openmp_tag( + "QUAL.OMP.MAP.TO.STRUCT", + cur_tag_var + "*strides", + non_arg=True, + omp_slice=(0, 1), + ) + ) + # Peel off NameSlice, it served its purpose and is not + # needed by the rest of compilation. + if isinstance(cur_tag.arg, NameSlice): + cur_tag.arg = cur_tag.arg.name + + return struct_tags, extras_before + + if self.tags[0].name in [ + "DIR.OMP.TARGET.DATA", + "DIR.OMP.TARGET.ENTER.DATA", + "DIR.OMP.TARGET.EXIT.DATA", + "DIR.OMP.TARGET.UPDATE", + ]: + var_table = get_name_var_table(lowerer.func_ir.blocks) + struct_tags, extras_before = add_struct_tags(self, var_table) + self.tags.extend(struct_tags) + for extra in extras_before: + lowerer.lower_inst(extra) + + elif target_num is not None and self.target_copy != True: + var_table = get_name_var_table(lowerer.func_ir.blocks) + + ompx_attrs = list( + filter(lambda x: x.name == "QUAL.OMP.OMPX_ATTRIBUTE", self.tags) + ) + self.tags = list( + filter(lambda x: x.name != "QUAL.OMP.OMPX_ATTRIBUTE", self.tags) + ) + selected_device = 0 + device_tags = get_tags_of_type(self.tags, "QUAL.OMP.DEVICE") + if len(device_tags) > 0: + device_tag = device_tags[-1] + if isinstance(device_tag.arg, int): + selected_device = device_tag.arg + else: + assert False + if DEBUG_OPENMP >= 1: + print("new selected device:", selected_device) + + struct_tags, extras_before = add_struct_tags(self, var_table) + self.tags.extend(struct_tags) + if DEBUG_OPENMP >= 1: + for otag in self.tags: + print("tag in target:", otag, type(otag.arg)) + + from numba.core.compiler import Compiler, Flags + + if DEBUG_OPENMP >= 1: + print("openmp start region lower has target", type(lowerer.func_ir)) + # Make a copy of the host IR being lowered. 
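+            # The copy is what gets outlined and compiled for the device below;
+            # working on a copy keeps the host IR, which this lowerer is still
+            # in the middle of lowering, untouched.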
+ dprint_func_ir(lowerer.func_ir, "original func_ir") + func_ir = copy_ir(lowerer.func_ir, calltypes) + dprint_func_ir(func_ir, "copied func_ir") + if DEBUG_OPENMP >= 1: + for k, v in lowerer.func_ir.blocks.items(): + print( + "region ids block post copy:", + k, + id(v), + id(func_ir.blocks[k]), + id(v.body), + id(func_ir.blocks[k].body), + ) + + remove_dels(func_ir.blocks) + + dprint_func_ir(func_ir, "func_ir after remove_dels") + + def fixup_openmp_pairs(blocks): + """The Numba IR nodes for the start and end of an OpenMP region + contain references to each other. When a target region is + outlined that contains these pairs of IR nodes then if we + simply shallow copy them then they'll point to their original + matching pair in the original IR. In this function, we go + through and find what should be matching pairs in the + outlined (target) IR and make those copies point to each + other. + """ + start_dict = {} + end_dict = {} + + # Go through the blocks in the original IR and create a mapping + # between the id of the start nodes with their block label and + # position in the block. Likewise, do the same for end nodes. + for label, block in func_ir.blocks.items(): + for bindex, bstmt in enumerate(block.body): + if isinstance(bstmt, openmp_region_start): + if DEBUG_OPENMP >= 2: + print("region ids found region start", id(bstmt)) + start_dict[id(bstmt)] = (label, bindex) + elif isinstance(bstmt, openmp_region_end): + if DEBUG_OPENMP >= 2: + print( + "region ids found region end", + id(bstmt.start_region), + id(bstmt), + ) + end_dict[id(bstmt.start_region)] = (label, bindex) + assert len(start_dict) == len(end_dict) + + # For each start node that we found above, create a copy in the target IR + # and fixup the references of the copies to point at each other. + for start_id, blockindex in start_dict.items(): + start_block, sbindex = blockindex + + end_block_index = end_dict[start_id] + end_block, ebindex = end_block_index + + if DEBUG_OPENMP >= 2: + start_pre_copy = blocks[start_block].body[sbindex] + end_pre_copy = blocks[end_block].body[ebindex] + + # Create copy of the OpenMP start and end nodes in the target outlined IR. + blocks[start_block].body[sbindex] = copy.copy( + blocks[start_block].body[sbindex] + ) + blocks[end_block].body[ebindex] = copy.copy( + blocks[end_block].body[ebindex] + ) + # Reset some fields in the start OpenMP region because the target IR + # has not been lowered yet. + start_region = blocks[start_block].body[sbindex] + start_region.builder = None + start_region.block = None + start_region.lowerer = None + start_region.target_copy = True + start_region.tags = copy.deepcopy(start_region.tags) + # Remove unnecessary num_teams, thread_limit tags when + # emitting a target directive within a kernel to avoid + # extraneous arguments in the kernel function. + if start_region.has_target() == target_num: + start_region.tags.append(openmp_tag("OMP.DEVICE")) + end_region = blocks[end_block].body[ebindex] + # assert(start_region.omp_region_var is None) + assert len(start_region.alloca_queue) == 0 + # Make start and end copies point at each other. 
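+                    # Illustrative example (block labels made up): if the original
+                    # IR paired a start node at (block 10, index 0) with an end node
+                    # at (block 24, index 3), the shallow copies created above at
+                    # those same coordinates are re-linked here so the target IR no
+                    # longer points back at the host-side pair.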
+ end_region.start_region = start_region + start_region.end_region = end_region + if DEBUG_OPENMP >= 2: + print( + f"region ids fixup start: {id(start_pre_copy)}->{id(start_region)} end: {id(end_pre_copy)}->{id(end_region)}" + ) + + fixup_openmp_pairs(func_ir.blocks) + state = compiler.StateDict() + fndesc = lowerer.fndesc + state.typemap = fndesc.typemap + state.calltypes = fndesc.calltypes + state.argtypes = fndesc.argtypes + state.return_type = fndesc.restype + if DEBUG_OPENMP >= 1: + print("context:", context, type(context)) + print("targetctx:", targetctx, type(targetctx)) + print("state:", state, dir(state)) + print("fndesc:", fndesc, type(fndesc)) + print("func_ir type:", type(func_ir)) + dprint_func_ir(func_ir, "target func_ir") + internal_codegen = targetctx._internal_codegen + + # Find the start and end IR blocks for this offloaded region. + start_block, end_block = find_target_start_end(func_ir, target_num) + end_target_node = func_ir.blocks[end_block].body[0] + + if DEBUG_OPENMP >= 1: + print("start_block:", start_block) + print("end_block:", end_block) + + blocks_in_region = get_blocks_between_start_end( + func_ir.blocks, start_block, end_block + ) + if DEBUG_OPENMP >= 1: + print("lower blocks_in_region:", blocks_in_region) + + # Find the variables that cross the boundary between the target + # region and the non-target host-side code. + ins, outs = transforms.find_region_inout_vars( + blocks=func_ir.blocks, + livemap=func_ir.variable_lifetime.livemap, + callfrom=start_block, + returnto=end_block, + body_block_ids=blocks_in_region, + ) + + def add_mapped_to_ins(ins, tags): + for tag in tags: + if tag.arg in ins: + continue + + if tag.name in ["QUAL.OMP.FIRSTPRIVATE", "QUAL.OMP.MAP.FROM"]: + ins.append(tag.arg) + + add_mapped_to_ins(ins, self.tags) + + normalized_ivs = get_tags_of_type(self.tags, "QUAL.OMP.NORMALIZED.IV") + if DEBUG_OPENMP >= 1: + print("ivs ins", normalized_ivs, ins, outs) + for niv in normalized_ivs: + if DEBUG_OPENMP >= 1: + print("Removing normalized iv from ins", niv.arg) + if niv.arg in ins: + ins.remove(niv.arg) + # Get the types of the variables live-in to the target region. + target_args_unordered = ins + list(set(outs) - set(ins)) + if DEBUG_OPENMP >= 1: + print("ins:", ins, type(ins)) + print("outs:", outs, type(outs)) + print("args:", state.args) + print("rettype:", state.return_type, type(state.return_type)) + print("target_args_unordered:", target_args_unordered) + # Re-use Numba loop lifting code to extract the target region as + # its own function. 
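+            # _loop_lift_prepare_loop_func below rewrites the selected blocks into
+            # function form: live-in variables become arguments and the region ends
+            # in a plain return, which is the shape needed to derive a standalone
+            # device function from these blocks.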
+ region_info = transforms._loop_lift_info( + loop=None, + inputs=ins, + # outputs=outs, + outputs=(), + callfrom=start_block, + returnto=end_block, + ) + + region_blocks = dict((k, func_ir.blocks[k]) for k in blocks_in_region) + + if DEBUG_OPENMP >= 1: + print("region_info:", region_info) + transforms._loop_lift_prepare_loop_func(region_info, region_blocks) + # exit_block_label = max(region_blocks.keys()) + # region_blocks[exit_block_label].body = [] + # exit_scope = region_blocks[exit_block_label].scope + # tmp = exit_scope.make_temp(loc=func_ir.loc) + # region_blocks[exit_block_label].append(ir.Assign(value=ir.Const(0, func_ir.loc), target=tmp, loc=func_ir.loc)) + # region_blocks[exit_block_label].append(ir.Return(value=tmp, loc=func_ir.loc)) + + target_args = [] + outline_arg_typs = [] + # outline_arg_typs = [None] * len(target_args_unordered) + for tag in self.tags: + if DEBUG_OPENMP >= 1: + print(1, "target_arg?", tag, tag.non_arg, is_target_arg(tag.name)) + if ( + tag.arg in target_args_unordered + and not tag.non_arg + and is_target_arg(tag.name) + ): + target_args.append(tag.arg) + # target_arg_index = target_args.index(tag.arg) + atyp = get_dotted_type(tag.arg, typemap, lowerer) + if is_pointer_target_arg(tag.name, atyp): + # outline_arg_typs[target_arg_index] = types.CPointer(atyp) + outline_arg_typs.append(types.CPointer(atyp)) + if DEBUG_OPENMP >= 1: + print(1, "found cpointer target_arg", tag, atyp, id(atyp)) + else: + # outline_arg_typs[target_arg_index] = atyp + outline_arg_typs.append(atyp) + if DEBUG_OPENMP >= 1: + print(1, "found target_arg", tag, atyp, id(atyp)) + + if DEBUG_OPENMP >= 1: + print("target_args:", target_args) + print("target_args_unordered:", target_args_unordered) + print("outline_arg_typs:", outline_arg_typs) + print("extras_before:", extras_before, start_block) + for eb in extras_before: + print(eb) + + assert len(target_args) == len(target_args_unordered) + assert len(target_args) == len(outline_arg_typs) + + # Create the outlined IR from the blocks in the region, making the + # variables crossing into the regions argument. + outlined_ir = func_ir.derive( + blocks=region_blocks, + arg_names=tuple(target_args), + arg_count=len(target_args), + force_non_generator=True, + ) + outlined_ir.blocks[start_block].body = ( + extras_before + outlined_ir.blocks[start_block].body + ) + for stmt in outlined_ir.blocks[min(outlined_ir.blocks.keys())].body: + if isinstance(stmt, ir.Assign) and isinstance(stmt.value, ir.Arg): + stmt.value.index = target_args.index(stmt.value.name) + + def prepend_device_to_func_name(outlined_ir): + # Change the name of the outlined function to prepend the + # word "device" to the function name. + fparts = outlined_ir.func_id.func_qualname.split(".") + fparts[-1] = "device" + str(target_num) + fparts[-1] + outlined_ir.func_id.func_qualname = ".".join(fparts) + outlined_ir.func_id.func_name = fparts[-1] + uid = next(bytecode.FunctionIdentity._unique_ids) + outlined_ir.func_id.unique_name = "{}${}".format( + outlined_ir.func_id.func_qualname, uid + ) + + prepend_device_to_func_name(outlined_ir) + device_func_name = outlined_ir.func_id.func_qualname + if DEBUG_OPENMP >= 1: + print( + "outlined_ir:", + type(outlined_ir), + type(outlined_ir.func_id), + outlined_ir.arg_names, + device_func_name, + ) + dprint_func_ir(outlined_ir, "outlined_ir") + + # Create a copy of the state and the typemap inside of it so that changes + # for compiling the outlined IR don't effect the original compilation state + # of the host. 
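+            # Only the typemap gets its own copy; the device-specific entries added
+            # below (e.g. CPointer types for the outlined arguments) must not leak
+            # back into the host typemap.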
+ state_copy = copy.copy(state) + state_copy.typemap = copy.copy(typemap) + + entry_block_num = min(outlined_ir.blocks.keys()) + entry_block = outlined_ir.blocks[entry_block_num] + if DEBUG_OPENMP >= 1: + print("entry_block:", entry_block) + for x in entry_block.body: + print(x) + rev_arg_assigns = [] + # Add entries in the copied typemap for the arguments to the outlined IR. + for idx, zipvar in enumerate(zip(target_args, outline_arg_typs)): + var_in, vtyp = zipvar + arg_name = "arg." + var_in + state_copy.typemap.pop(arg_name, None) + state_copy.typemap[arg_name] = vtyp + + last_block = outlined_ir.blocks[end_block] + last_block.body = ( + [end_target_node] + + last_block.body[:-1] + + rev_arg_assigns + + last_block.body[-1:] + ) + + assert isinstance(last_block.body[-1], ir.Return) + # Add typemap entry for the empty tuple return type. + state_copy.typemap[last_block.body[-1].value.name] = types.none + # end test + + if DEBUG_OPENMP >= 1: + print("selected_device:", selected_device) + + if selected_device == 1: + flags = Flags() + flags.enable_ssa = False + device_lowerer_pipeline = OnlyLower + + subtarget = OpenmpCPUTargetContext( + device_func_name, targetctx.typing_context + ) + # Copy everything (like registries) from cpu context into the new OpenMPCPUTargetContext subtarget + # except call_conv which is the whole point of that class so that the minimal call convention is used. + subtarget.__dict__.update( + { + k: targetctx.__dict__[k] + for k in targetctx.__dict__.keys() - {"call_conv"} + } + ) + # subtarget.install_registry(imputils.builtin_registry) + # Turn off the Numba runtime (incref and decref mostly) for the target compilation. + subtarget.enable_nrt = False + typingctx_outlined = targetctx.typing_context + + import numba.core.codegen as codegen + + subtarget._internal_codegen = codegen.AOTCPUCodegen( + mod.name + f"$device{selected_device}" + ) + subtarget._internal_codegen._library_class = CustomAOTCPUCodeLibrary + subtarget._internal_codegen._engine.set_object_cache(None, None) + device_target = subtarget + elif selected_device == 0: + from numba.core import target_extension + + orig_target = getattr( + target_extension._active_context, + "target", + target_extension._active_context_default, + ) + target_extension._active_context.target = "cuda" + + flags = cuda_compiler.CUDAFlags() + + typingctx_outlined = cuda_descriptor.cuda_target.typing_context + device_target = OpenmpCUDATargetContext( + device_func_name, typingctx_outlined + ) + device_target.fndesc = fndesc + # device_target = cuda_descriptor.cuda_target.target_context + + device_lowerer_pipeline = OnlyLowerCUDA + openmp_cuda_target = numba_cuda.descriptor.CUDATarget("openmp_cuda") + openmp_cuda_target._typingctx = typingctx_outlined + openmp_cuda_target._targetctx = device_target + self.fix_dispatchers( + state_copy.typemap, typingctx_outlined, openmp_cuda_target + ) + + typingctx_outlined.refresh() + device_target.refresh() + dprint_func_ir(outlined_ir, "outlined_ir before replace np.empty") + replace_np_empty_with_cuda_shared( + outlined_ir, + state_copy.typemap, + calltypes, + device_func_name, + typingctx_outlined, + ) + dprint_func_ir(outlined_ir, "outlined_ir after replace np.empty") + else: + raise NotImplementedError("Unsupported OpenMP device number") + + device_target.state_copy = state_copy + # Do not compile (generate native code), just lower (to LLVM) + flags.no_compile = True + flags.no_cpython_wrapper = True + flags.no_cfunc_wrapper = True + # What to do here? 
+ flags.forceinline = True + # Propagate fastmath flag on the outer function to the inner outlined compile. + # TODO: find a good way to handle fastmath. Clang has + # fp-contractions on by default for GPU code. + # flags.fastmath = True#state_copy.flags.fastmath + flags.release_gil = True + flags.nogil = True + flags.inline = "always" + # Create a pipeline that only lowers the outlined target code. No need to + # compile because it has already gone through those passes. + if DEBUG_OPENMP >= 1: + print( + "outlined_ir:", + outlined_ir, + type(outlined_ir), + outlined_ir.arg_names, + ) + dprint_func_ir(outlined_ir, "outlined_ir") + dprint_func_ir(func_ir, "target after outline func_ir") + dprint_func_ir(lowerer.func_ir, "original func_ir") + print("state_copy.typemap:", state_copy.typemap) + print("region ids before compile_ir") + print( + "===================================================================================" + ) + print( + "===================================================================================" + ) + print( + "===================================================================================" + ) + print( + "===================================================================================" + ) + print( + "===================================================================================" + ) + print( + "===================================================================================" + ) + print( + "===================================================================================" + ) + + cres = compiler.compile_ir( + typingctx_outlined, + device_target, + outlined_ir, + outline_arg_typs, + types.none, + flags, + {}, + pipeline_class=device_lowerer_pipeline, + is_lifted_loop=False, + ) # tried this as True since code derived from loop lifting code but it goes through the pipeline twice and messes things up + + if DEBUG_OPENMP >= 2: + print("cres:", type(cres)) + print("fndesc:", cres.fndesc, cres.fndesc.mangled_name) + print("metadata:", cres.metadata) + cres_library = cres.library + if DEBUG_OPENMP >= 2: + print("cres_library:", type(cres_library)) + sys.stdout.flush() + cres_library._ensure_finalized() + if DEBUG_OPENMP >= 2: + print("ensure_finalized:") + sys.stdout.flush() + + if DEBUG_OPENMP >= 1: + print("region ids compile_ir") + print( + "===================================================================================" + ) + print( + "===================================================================================" + ) + print( + "===================================================================================" + ) + print( + "===================================================================================" + ) + print( + "===================================================================================" + ) + print( + "===================================================================================" + ) + print( + "===================================================================================" + ) + + for k, v in lowerer.func_ir.blocks.items(): + print( + "block post copy:", + k, + id(v), + id(func_ir.blocks[k]), + id(v.body), + id(func_ir.blocks[k].body), + ) + + shared_ext = ".so" + if sys.platform.startswith("win"): + shared_ext = ".dll" + + # TODO: move device pipelines in numba proper. 
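+            # Two offload paths follow: device 1 compiles the outlined IR for the
+            # host CPU and links it with "ld -shared" into a shared library for the
+            # libomptarget host plugin, while device 0 goes through the CUDA path
+            # (link against libdevice and the libomptarget NVPTX bitcode, then
+            # generate PTX and a cubin).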
+ if selected_device == 1: + if DEBUG_OPENMP >= 1: + with open(cres_library.name + ".ll", "w") as f: + f.write(cres_library.get_llvm_str()) + + fd_o, filename_o = tempfile.mkstemp(".o") + fd_so, filename_so = tempfile.mkstemp(shared_ext) + + target_elf = cres_library.emit_native_object() + with open(filename_o, "wb") as f: + f.write(target_elf) + + # Create shared library as required by the libomptarget host + # plugin. + + subprocess.run( + [ + "ld", + "-shared", + filename_o, + # Do whole archive to include all symbols, esp. for the + # PyOMP_NRT_Init constructor. + "--whole-archive", + libpath / "libbundle.a", + "--no-whole-archive", + "-o", + filename_so, + ], + check=True, + ) + + with open(filename_so, "rb") as f: + target_elf = f.read() + if DEBUG_OPENMP >= 1: + print("filename_o", filename_o, "filename_so", filename_so) + + os.close(fd_o) + os.remove(filename_o) + os.close(fd_so) + os.remove(filename_so) + + if DEBUG_OPENMP >= 1: + print("target_elf:", type(target_elf), len(target_elf)) + sys.stdout.flush() + elif selected_device == 0: + import numba.cuda.api as cudaapi + import numba.cuda.cudadrv.libs as cudalibs + from numba.cuda.cudadrv import driver + from numba.core.llvm_bindings import create_pass_manager_builder + from numba.cuda.codegen import CUDA_TRIPLE + + class OpenMPCUDACodegen: + def __init__(self): + self.cc = cudaapi.get_current_device().compute_capability + self.sm = "sm_" + str(self.cc[0]) + str(self.cc[1]) + self.libdevice_path = cudalibs.get_libdevice() + with open(self.libdevice_path, "rb") as f: + self.libs_mod = ll.parse_bitcode(f.read()) + self.libomptarget_arch = ( + llvm_libpath + "/libomptarget-new-nvptx-" + self.sm + ".bc" + ) + with open(self.libomptarget_arch, "rb") as f: + libomptarget_mod = ll.parse_bitcode(f.read()) + # Link in device, openmp libraries. + self.libs_mod.link_in(libomptarget_mod) + # Initialize asm printers to codegen ptx. + ll.initialize_all_targets() + ll.initialize_all_asmprinters() + target = ll.Target.from_triple(CUDA_TRIPLE) + self.tm = target.create_target_machine(cpu=self.sm, opt=3) + + def _get_target_image_in_memory(self, mod, filename_prefix): + if DEBUG_OPENMP_LLVM_PASS >= 1: + with open(filename_prefix + ".ll", "w") as f: + f.write(str(mod)) + + # Lower openmp intrinsics. + mod = run_intrinsics_openmp_pass(mod) + with ll.create_module_pass_manager() as pm: + pm.add_cfg_simplification_pass() + pm.run(mod) + + if DEBUG_OPENMP_LLVM_PASS >= 1: + with open(filename_prefix + "-intrinsics_omp.ll", "w") as f: + f.write(str(mod)) + + mod.link_in(self.libs_mod, preserve=True) + # Internalize non-kernel function definitions. + for func in mod.functions: + if func.is_declaration: + continue + if func.linkage != ll.Linkage.external: + continue + if "__omp_offload_numba" in func.name: + continue + func.linkage = "internal" + + with ll.create_module_pass_manager() as pm: + self.tm.add_analysis_passes(pm) + pm.add_global_dce_pass() + pm.run(mod) + + if DEBUG_OPENMP_LLVM_PASS >= 1: + with open( + filename_prefix + "-intrinsics_omp-linked.ll", "w" + ) as f: + f.write(str(mod)) + + # Run passes for optimization, including target-specific passes. + # Run function passes. + with ll.create_function_pass_manager(mod) as pm: + self.tm.add_analysis_passes(pm) + with create_pass_manager_builder( + opt=3, slp_vectorize=True, loop_vectorize=True + ) as pmb: + # TODO: upstream adjust_pass_manager to llvmlite? 
+ # self.tm.adjust_pass_manager(pmb) + pmb.populate(pm) + for func in mod.functions: + pm.initialize() + pm.run(func) + pm.finalize() + + # Run module passes. + with ll.create_module_pass_manager() as pm: + self.tm.add_analysis_passes(pm) + with create_pass_manager_builder( + opt=3, slp_vectorize=True, loop_vectorize=True + ) as pmb: + # TODO: upstream adjust_pass_manager to llvmlite? + # self.tm.adjust_pass_manager(pmb) + pmb.populate(pm) + pm.run(mod) + + if DEBUG_OPENMP_LLVM_PASS >= 1: + mod.verify() + with open( + filename_prefix + "-intrinsics_omp-linked-opt.ll", "w" + ) as f: + f.write(str(mod)) + + # Generate ptx assemlby. + ptx = self.tm.emit_assembly(mod) + + if DEBUG_OPENMP_LLVM_PASS >= 1: + with open( + filename_prefix + "-intrinsics_omp-linked-opt.s", "w" + ) as f: + f.write(ptx) + + linker_kwargs = {} + for x in ompx_attrs: + linker_kwargs[x.arg[0]] = ( + tuple(x.arg[1]) if len(x.arg[1]) > 1 else x.arg[1][0] + ) + # NOTE: DO NOT set cc, since the linker will always + # compile for the existing GPU context and it is + # incompatible with the launch_bounds ompx_attribute. + linker = driver.Linker.new(**linker_kwargs) + linker.add_ptx(ptx.encode()) + cubin = linker.complete() + + if DEBUG_OPENMP_LLVM_PASS >= 1: + with open( + filename_prefix + "-intrinsics_omp-linked-opt.o", "wb" + ) as f: + f.write(cubin) + + return cubin + + def _get_target_image_toolchain(self, mod, filename_prefix): + with open(filename_prefix + ".ll", "w") as f: + f.write(str(mod)) + + # Lower openmp intrinsics. + mod = run_intrinsics_openmp_pass(mod) + with ll.create_module_pass_manager() as pm: + pm.add_cfg_simplification_pass() + pm.run(mod) + + with open(filename_prefix + "-intrinsics_omp.ll", "w") as f: + f.write(str(mod)) + + if DEBUG_OPENMP >= 1: + print("libomptarget_arch", self.libomptarget_arch) + subprocess.run( + [ + llvm_binpath + "/llvm-link", + "--suppress-warnings", + "--internalize", + "-S", + filename_prefix + "-intrinsics_omp.ll", + self.libomptarget_arch, + self.libdevice_path, + "-o", + filename_prefix + "-intrinsics_omp-linked.ll", + ], + check=True, + ) + subprocess.run( + [ + llvm_binpath + "/opt", + "-S", + "-O3", + filename_prefix + "-intrinsics_omp-linked.ll", + "-o", + filename_prefix + "-intrinsics_omp-linked-opt.ll", + ], + check=True, + ) + + subprocess.run( + [ + llvm_binpath + "/llc", + "-O3", + "-march=nvptx64", + f"-mcpu={self.sm}", + f"-mattr=+ptx64,+{self.sm}", + filename_prefix + "-intrinsics_omp-linked-opt.ll", + "-o", + filename_prefix + "-intrinsics_omp-linked-opt.s", + ], + check=True, + ) + + subprocess.run( + [ + "ptxas", + "-m64", + "--gpu-name", + self.sm, + filename_prefix + "-intrinsics_omp-linked-opt.s", + "-o", + filename_prefix + "-intrinsics_omp-linked-opt.o", + ], + check=True, + ) + with open( + filename_prefix + "-intrinsics_omp-linked-opt.o", "rb" + ) as f: + target_elf = f.read() + return target_elf + + def get_target_image(self, cres): + filename_prefix = cres_library.name + allmods = cres_library.modules + linked_mod = ll.parse_assembly(str(allmods[0])) + for mod in allmods[1:]: + linked_mod.link_in(ll.parse_assembly(str(mod))) + if OPENMP_DEVICE_TOOLCHAIN >= 1: + return self._get_target_image_toolchain( + linked_mod, filename_prefix + ) + else: + return self._get_target_image_in_memory( + linked_mod, filename_prefix + ) + + target_extension._active_context.target = orig_target + omp_cuda_cg = OpenMPCUDACodegen() + target_elf = omp_cuda_cg.get_target_image(cres) + else: + raise NotImplementedError("Unsupported OpenMP device number") + + # if cuda 
then run ptxas on the cres and pass that + + # bytes_array_typ = lir.ArrayType(cgutils.voidptr_t, len(target_elf)) + # bytes_array_typ = lir.ArrayType(cgutils.int8_t, len(target_elf)) + # dev_image = cgutils.add_global_variable(mod, bytes_array_typ, ".omp_offloading.device_image") + # dev_image.initializer = lir.Constant.array(cgutils.int8_t, target_elf) + # dev_image.initializer = lir.Constant.array(cgutils.int8_t, target_elf) + add_target_globals_in_numba = int( + os.environ.get("NUMBA_OPENMP_ADD_TARGET_GLOBALS", 0) + ) + if add_target_globals_in_numba != 0: + elftext = cgutils.make_bytearray(target_elf) + dev_image = targetctx.insert_unique_const( + mod, ".omp_offloading.device_image", elftext + ) + mangled_name = cgutils.make_bytearray( + cres.fndesc.mangled_name.encode("utf-8") + b"\x00" + ) + mangled_var = targetctx.insert_unique_const( + mod, ".omp_offloading.entry_name", mangled_name + ) + + llvmused_typ = lir.ArrayType(cgutils.voidptr_t, 2) + llvmused_gv = cgutils.add_global_variable( + mod, llvmused_typ, "llvm.used" + ) + llvmused_syms = [ + lir.Constant.bitcast(dev_image, cgutils.voidptr_t), + lir.Constant.bitcast(mangled_var, cgutils.voidptr_t), + ] + llvmused_gv.initializer = lir.Constant.array( + cgutils.voidptr_t, llvmused_syms + ) + llvmused_gv.linkage = "appending" + else: + host_side_target_tags.append( + openmp_tag( + "QUAL.OMP.TARGET.DEV_FUNC", + StringLiteral(cres.fndesc.mangled_name.encode("utf-8")), + ) + ) + host_side_target_tags.append( + openmp_tag("QUAL.OMP.TARGET.ELF", StringLiteral(target_elf)) + ) + + if DEBUG_OPENMP >= 1: + dprint_func_ir(func_ir, "target after outline compiled func_ir") + + llvm_token_t = TokenType() + fnty = lir.FunctionType(llvm_token_t, []) + tags_to_include = self.tags + host_side_target_tags + # tags_to_include = list(filter(lambda x: x.name != "DIR.OMP.TARGET", tags_to_include)) + self.filtered_tag_length = len(tags_to_include) + if DEBUG_OPENMP >= 1: + print("filtered_tag_length:", self.filtered_tag_length) + + if len(tags_to_include) > 0: + if DEBUG_OPENMP >= 1: + print("push_alloca_callbacks") + + push_alloca_callback(lowerer, openmp_region_alloca, self, builder) + tag_str = openmp_tag_list_to_str(tags_to_include, lowerer, True) + pre_fn = builder.module.declare_intrinsic( + "llvm.directive.region.entry", (), fnty + ) + assert self.omp_region_var is None + self.omp_region_var = builder.call(pre_fn, [], tail=False) + self.omp_region_var.__class__ = CallInstrWithOperandBundle + self.omp_region_var.set_tags(tag_str) + # This is used by the post-lowering pass over LLVM to add LLVM alloca + # vars to the Numba IR openmp node and then when the exit of the region + # is detected then the tags in the enter directive are updated. + self.omp_region_var.save_orig_numba_openmp = self + if DEBUG_OPENMP >= 2: + print("setting omp_region_var", self.omp_region_var._get_name()) + if self.acq_res: + builder.fence("acquire") + if self.acq_rel: + builder.fence("acq_rel") + + for otag in self.tags: # should be tags_to_include? 
+ otag.post_entry(lowerer) + + if DEBUG_OPENMP >= 1: + sys.stdout.flush() + + def __str__(self): + return ( + "openmp_region_start " + + ", ".join([str(x) for x in self.tags]) + + " target=" + + str(self.target_copy) + ) + + +class OnlyLower(compiler.CompilerBase): + def __init__(self, typingctx, targetctx, library, args, restype, flags, locals): + super().__init__(typingctx, targetctx, library, args, restype, flags, locals) + self.state.typemap = targetctx.state_copy.typemap + self.state.calltypes = targetctx.state_copy.calltypes + + def define_pipelines(self): + pms = [] + if not self.state.flags.force_pyobject: + pms.append( + compiler.DefaultPassBuilder.define_nopython_lowering_pipeline( + self.state + ) + ) + return pms + + +class OnlyLowerCUDA(numba_cuda.compiler.CUDACompiler): + def __init__(self, typingctx, targetctx, library, args, restype, flags, locals): + super().__init__(typingctx, targetctx, library, args, restype, flags, locals) + self.state.typemap = targetctx.state_copy.typemap + self.state.calltypes = targetctx.state_copy.calltypes + + def define_pipelines(self): + pm = compiler_machinery.PassManager("cuda") + pm.add_pass(numba_cuda.compiler.CUDALegalization, "CUDA legalization") + lowering_passes = self.define_cuda_lowering_pipeline(self.state) + pm.passes.extend(lowering_passes.passes) + pm.finalize() + return [pm] + + +class openmp_region_end(ir.Stmt): + def __init__(self, start_region, tags, loc): + if DEBUG_OPENMP >= 1: + print("region ids openmp_region_end::__init__", id(self), id(start_region)) + self.start_region = start_region + self.tags = tags + self.loc = loc + self.start_region.end_region = self + + def __new__(cls, *args, **kwargs): + instance = super(openmp_region_end, cls).__new__(cls) + # print("openmp_region_end::__new__", id(instance)) + return instance + + def list_vars(self): + return list_vars_from_tags(self.tags) + + def lower(self, lowerer): + typingctx = lowerer.context.typing_context + targetctx = lowerer.context + typemap = lowerer.fndesc.typemap + context = lowerer.context + builder = lowerer.builder + library = lowerer.library + + if DEBUG_OPENMP >= 2: + print("openmp_region_end::lower", id(self), id(self.start_region)) + sys.stdout.flush() + + if self.start_region.acq_res: + builder.fence("release") + + if DEBUG_OPENMP >= 1: + print("pop_alloca_callbacks") + + if DEBUG_OPENMP >= 2: + print("start_region tag length:", self.start_region.filtered_tag_length) + + if self.start_region.filtered_tag_length > 0: + llvm_token_t = TokenType() + fnty = lir.FunctionType(lir.VoidType(), [llvm_token_t]) + # The callback is only needed if llvm.directive.region.entry was added + # which only happens if tag length > 0. + pop_alloca_callback(lowerer, builder) + + # Process the accumulated allocas in the start region. 
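+            # The queue was filled by the alloca callback while the region body was
+            # lowered.  process_alloca_queue asserts it is empty here because the
+            # real work has moved to the post-lowering LLVM pass, which calls
+            # post_lowering_process_alloca_queue at the matching region exit.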
+ self.start_region.process_alloca_queue() + + assert self.start_region.omp_region_var != None + if DEBUG_OPENMP >= 2: + print( + "before adding exit", self.start_region.omp_region_var._get_name() + ) + + for fp in filter( + lambda x: x.name == "QUAL.OMP.FIRSTPRIVATE", self.start_region.tags + ): + new_del = ir.Del(fp.arg, self.loc) + lowerer.lower_inst(new_del) + + pre_fn = builder.module.declare_intrinsic( + "llvm.directive.region.exit", (), fnty + ) + or_end_call = builder.call( + pre_fn, [self.start_region.omp_region_var], tail=True + ) + or_end_call.__class__ = CallInstrWithOperandBundle + or_end_call.set_tags(openmp_tag_list_to_str(self.tags, lowerer, True)) + + if DEBUG_OPENMP >= 1: + print( + "OpenMP end lowering firstprivate_dead_after len:", + len(self.start_region.firstprivate_dead_after), + ) + + for fp in self.start_region.firstprivate_dead_after: + new_del = ir.Del(fp.arg, self.loc) + lowerer.lower_inst(new_del) + + def __str__(self): + return "openmp_region_end " + ", ".join([str(x) for x in self.tags]) + + def has_target(self): + for t in self.tags: + if is_target_tag(t.name): + return t.arg + return None + + +def compute_cfg_from_llvm_blocks(blocks): + cfg = CFGraph() + name_to_index = {} + for b in blocks: + # print("b:", b.name, type(b.name)) + cfg.add_node(b.name) + + for bindex, b in enumerate(blocks): + term = b.terminator + # print("term:", b.name, term, type(term)) + if isinstance(term, lir.instructions.Branch): + cfg.add_edge(b.name, term.operands[0].name) + name_to_index[b.name] = (bindex, [term.operands[0].name]) + elif isinstance(term, lir.instructions.ConditionalBranch): + cfg.add_edge(b.name, term.operands[1].name) + cfg.add_edge(b.name, term.operands[2].name) + name_to_index[b.name] = ( + bindex, + [term.operands[1].name, term.operands[2].name], + ) + elif isinstance(term, lir.instructions.Ret): + name_to_index[b.name] = (bindex, []) + elif isinstance(term, lir.instructions.SwitchInstr): + cfg.add_edge(b.name, term.default.name) + for _, blk in term.cases: + cfg.add_edge(b.name, blk.name) + out_blks = [x[1].name for x in term.cases] + out_blks.append(term.default.name) + name_to_index[b.name] = (bindex, out_blks) + elif isinstance(term, lir.instructions.Unreachable): + pass + else: + print("Unknown term:", term, type(term)) + assert False # Should never get here. + + cfg.set_entry_point("entry") + cfg.process() + return cfg, name_to_index + + +def compute_llvm_topo_order(blocks): + cfg, name_to_index = compute_cfg_from_llvm_blocks(blocks) + post_order = [] + seen = set() + + def _dfs_rec(node): + if node not in seen: + seen.add(node) + succs = cfg._succs[node] + + # If there are no successors then we are done. + # This is the case for an unreachable. + if not succs: + return + + # This is needed so that the inside of loops are + # handled first before their exits. 
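+            # The reversed post-order produced by this DFS is a reverse post-order
+            # of the CFG with back edges skipped, i.e. an approximate topological
+            # order of the LLVM blocks.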
+ nexts = name_to_index[node][1] + if len(nexts) == 2: + succs = [nexts[1], nexts[0]] + + for dest in succs: + if (node, dest) not in cfg._back_edges: + _dfs_rec(dest) + post_order.append(node) + + _dfs_rec(cfg.entry_point()) + post_order.reverse() + return post_order, name_to_index + + +class CollectUnknownLLVMVarsPrivate(lir.transforms.Visitor): + def __init__(self): + self.active_openmp_directives = [] + self.start_num = 0 + + # Override the default function visitor to go in topo order + def visit_Function(self, func): + self._function = func + if len(func.blocks) == 0: + return None + if DEBUG_OPENMP >= 1: + print("Collect visit_Function:", func.blocks, type(func.blocks)) + topo_order, name_to_index = compute_llvm_topo_order(func.blocks) + topo_order = list(topo_order) + if DEBUG_OPENMP >= 1: + print("topo_order:", topo_order) + + for bbname in topo_order: + if DEBUG_OPENMP >= 1: + print("Visiting block:", bbname) + self.visit_BasicBlock(func.blocks[name_to_index[bbname][0]]) + + if DEBUG_OPENMP >= 1: + print("Collect visit_Function done") + + def visit_Instruction(self, instr): + if len(self.active_openmp_directives) > 0: + if DEBUG_OPENMP >= 1: + print("Collect instr:", instr, type(instr)) + for op in instr.operands: + if isinstance(op, lir.AllocaInstr): + if DEBUG_OPENMP >= 1: + print("Collect AllocaInstr operand:", op, op.name) + for directive in self.active_openmp_directives: + directive.save_orig_numba_openmp.alloca(op, None) + else: + if DEBUG_OPENMP >= 2: + print("non-alloca:", op, type(op)) + pass + + if isinstance(instr, lir.CallInstr): + if instr.callee.name == "llvm.directive.region.entry": + if DEBUG_OPENMP >= 1: + print( + "Collect Found openmp region entry:", + instr, + type(instr), + "\n", + instr.tags, + type(instr.tags), + id(self), + len(self.active_openmp_directives), + ) + self.active_openmp_directives.append(instr) + if DEBUG_OPENMP >= 1: + print("post append:", len(self.active_openmp_directives)) + assert hasattr(instr, "save_orig_numba_openmp") + if instr.callee.name == "llvm.directive.region.exit": + if DEBUG_OPENMP >= 1: + print( + "Collect Found openmp region exit:", + instr, + type(instr), + "\n", + instr.tags, + type(instr.tags), + id(self), + len(self.active_openmp_directives), + ) + enter_directive = self.active_openmp_directives.pop() + enter_directive.save_orig_numba_openmp.post_lowering_process_alloca_queue( + enter_directive + ) + + +def post_lowering_openmp(mod): + if DEBUG_OPENMP >= 1: + print("post_lowering_openmp") + + # This will gather the information. + collect_fixup = CollectUnknownLLVMVarsPrivate() + collect_fixup.visit(mod) + + if DEBUG_OPENMP >= 1: + print("post_lowering_openmp done") + + +# Callback for ir_extension_usedefs +def openmp_region_start_defs(region, use_set=None, def_set=None): + assert isinstance(region, openmp_region_start) + if use_set is None: + use_set = set() + if def_set is None: + def_set = set() + for tag in region.tags: + tag.add_to_usedef_set(use_set, def_set, start=True) + return _use_defs_result(usemap=use_set, defmap=def_set) + + +def openmp_region_end_defs(region, use_set=None, def_set=None): + assert isinstance(region, openmp_region_end) + if use_set is None: + use_set = set() + if def_set is None: + def_set = set() + # We refer to the clauses from the corresponding start of the region. 
+ start_region = region.start_region + for tag in start_region.tags: + tag.add_to_usedef_set(use_set, def_set, start=False) + return _use_defs_result(usemap=use_set, defmap=def_set) + + +# Extend usedef analysis to support openmp_region_start/end nodes. +ir_extension_usedefs[openmp_region_start] = openmp_region_start_defs +ir_extension_usedefs[openmp_region_end] = openmp_region_end_defs + + +def openmp_region_start_infer(prs, typeinferer): + pass + + +def openmp_region_end_infer(pre, typeinferer): + pass + + +typeinfer.typeinfer_extensions[openmp_region_start] = openmp_region_start_infer +typeinfer.typeinfer_extensions[openmp_region_end] = openmp_region_end_infer + + +def _lower_openmp_region_start(lowerer, prs): + # TODO: if we set it always in numba_fixups we can remove from here + if isinstance(lowerer.context, OpenmpCPUTargetContext) or isinstance( + lowerer.context, OpenmpCUDATargetContext + ): + pass + else: + lowerer.library.__class__ = CustomCPUCodeLibrary + lowerer.context.__class__ = CustomContext + prs.lower(lowerer) + + +def _lower_openmp_region_end(lowerer, pre): + # TODO: if we set it always in numba_fixups we can remove from here + if isinstance(lowerer.context, OpenmpCPUTargetContext) or isinstance( + lowerer.context, OpenmpCUDATargetContext + ): + pass + else: + lowerer.library.__class__ = CustomCPUCodeLibrary + lowerer.context.__class__ = CustomContext + pre.lower(lowerer) + + +def apply_copies_openmp_region( + region, var_dict, name_var_table, typemap, calltypes, save_copies +): + for i in range(len(region.tags)): + region.tags[i].replace_vars_inner(var_dict) + + +apply_copy_propagate_extensions[openmp_region_start] = apply_copies_openmp_region +apply_copy_propagate_extensions[openmp_region_end] = apply_copies_openmp_region + + +def visit_vars_openmp_region(region, callback, cbdata): + for i in range(len(region.tags)): + if DEBUG_OPENMP >= 1: + print("visit_vars before", region.tags[i], type(region.tags[i].arg)) + region.tags[i].arg = visit_vars_inner(region.tags[i].arg, callback, cbdata) + if DEBUG_OPENMP >= 1: + print("visit_vars after", region.tags[i]) + + +visit_vars_extensions[openmp_region_start] = visit_vars_openmp_region +visit_vars_extensions[openmp_region_end] = visit_vars_openmp_region + +# ---------------------------------------------------------------------------------------------- + + +class PythonOpenmp: + def __init__(self, *args): + self.args = args + + def __enter__(self): + pass + + def __exit__(self, typ, val, tb): + pass + + +def iscall(x): + if isinstance(x, ir.Assign): + return isinstance(x.value, ir.Expr) and x.value.op == "call" + elif isinstance(x, ir.Expr): + return x.op == "call" + else: + return False + + +def extract_args_from_openmp(func_ir): + """Find all the openmp context calls in the function and then + use the VarCollector transformer to find all the Python variables + referenced in the openmp clauses. We then add those variables as + regular arguments to the openmp context call just so Numba's + usedef analysis is able to keep variables alive that are only + referenced in openmp clauses. 
+ """ + func_ir._definitions = build_definitions(func_ir.blocks) + var_table = get_name_var_table(func_ir.blocks) + for block in func_ir.blocks.values(): + for inst in block.body: + if iscall(inst): + func_def = get_definition(func_ir, inst.value.func) + if isinstance(func_def, ir.Global) and isinstance( + func_def.value, _OpenmpContextType + ): + str_def = get_definition(func_ir, inst.value.args[0]) + if not isinstance(str_def, ir.Const) or not isinstance( + str_def.value, str + ): + # The non-const openmp string error is handled later. + continue + assert isinstance(str_def, ir.Const) and isinstance( + str_def.value, str + ) + parse_res = var_collector_parser.parse(str_def.value) + visitor = VarCollector() + try: + visit_res = visitor.transform(parse_res) + inst.value.args.extend([var_table[x] for x in visit_res]) + except Exception as f: + print("generic transform exception") + exc_type, exc_obj, exc_tb = sys.exc_info() + fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] + print(exc_type, fname, exc_tb.tb_lineno) + # print("Internal error for OpenMp pragma '{}'".format(arg.value)) + sys.exit(-2) + except: + print("fallthrough exception") + # print("Internal error for OpenMp pragma '{}'".format(arg.value)) + sys.exit(-3) + + +def remove_empty_blocks(blocks): + found = True + while found: + found = False + empty_block = None + for label, block in blocks.items(): + if len(block.body) == 1: + assert isinstance(block.body[-1], ir.Jump) + empty_block = label + next_block = block.body[-1].target + break + + if empty_block is not None: + del blocks[empty_block] + + found = True + for block in blocks.values(): + last_stmt = block.body[-1] + if isinstance(last_stmt, ir.Jump): + if last_stmt.target == empty_block: + block.body[-1] = ir.Jump(next_block, last_stmt.loc) + elif isinstance(last_stmt, ir.Branch): + if last_stmt.truebr == empty_block: + block.body[-1] = ir.Branch( + last_stmt.cond, next_block, last_stmt.falsebr, last_stmt.loc + ) + elif block.body[-1].falsebr == empty_block: + block.body[-1] = ir.Branch( + last_stmt.cond, last_stmt.truebr, next_block, last_stmt.loc + ) + elif isinstance(last_stmt, ir.Return): + # Intentionally do nothing. + pass + else: + print(type(last_stmt)) + assert False + + +class _OpenmpContextType(WithContext): + is_callable = True + first_time = True + blk_end_live_map = set() + + def do_numba_fixups(self): + from numba import core + + orig_lower_inst = core.lowering.Lower.lower_inst + core.lowering.Lower.orig_lower_inst = orig_lower_inst + + orig_lower = core.lowering.Lower.lower + core.lowering.Lower.orig_lower = orig_lower + + # Use method to retrieve the outside region live map, which is updated + # during the with-context mutation. + def get_blk_end_live_map(): + return self.blk_end_live_map + + def new_lower(self, inst): + if not isinstance(self, LowerNoSROA): + self.__class__ = LowerNoSROA + if isinstance(inst, openmp_region_start): + return _lower_openmp_region_start(self, inst) + elif isinstance(inst, openmp_region_end): + return _lower_openmp_region_end(self, inst) + # TODO: instead of monkey patching for Del instructions outside the + # openmp region do: (1) either outline to create a function scope + # that will decouple the lifetime of variables inside the OpenMP + # region, (2) or subclass the PostProcessor to extend use-def + # analysis with OpenMP lifetime information. + elif isinstance(inst, ir.Del): + # Lower Del normally in the openmp region. 
+ if in_openmp_region(self.builder): + return self.orig_lower_inst(inst) + + # Lower the Del instruction ONLY if the variable is not live + # after the openmp region. + if inst.value not in get_blk_end_live_map(): + return self.orig_lower_inst(inst) + elif isinstance(inst, ir.Assign): + return self.lower_assign_inst(orig_lower_inst, inst) + elif isinstance(inst, ir.Return): + return self.lower_return_inst(orig_lower_inst, inst) + else: + return self.orig_lower_inst(inst) + + core.lowering.Lower.lower_inst = new_lower + + def mutate_with_body( + self, + func_ir, + blocks, + blk_start, + blk_end, + body_blocks, + dispatcher_factory, + extra, + ): + if _OpenmpContextType.first_time == True: + _OpenmpContextType.first_time = False + self.do_numba_fixups() + + if DEBUG_OPENMP >= 1: + print("pre-dead-code") + dump_blocks(blocks) + if not OPENMP_DISABLED and not hasattr(func_ir, "has_openmp_region"): + # We can't do dead code elimination at this point because if an argument + # is used only in an openmp clause then it is detected as dead and is + # eliminated. We'd have to run through the IR and find all the + # openmp regions and extract the vars used there and then modify the + # IR with something fake just to take the var alive. The other approach + # would be to modify dead code elimination to find the vars referenced + # in openmp context strings. + extract_args_from_openmp(func_ir) + # dead_code_elimination(func_ir) + remove_ssa_from_func_ir(func_ir) + # remove_empty_blocks(blocks) + func_ir.has_openmp_region = True + if DEBUG_OPENMP >= 1: + print("pre-with-removal") + dump_blocks(blocks) + if OPENMP_DISABLED: + # If OpenMP disabled, do nothing except remove the enter_with marker. + sblk = blocks[blk_start] + sblk.body = sblk.body[1:] + else: + if DEBUG_OPENMP >= 1: + print("openmp:mutate_with_body") + dprint_func_ir(func_ir, "func_ir") + print("blocks:", blocks, type(blocks)) + print("blk_start:", blk_start, type(blk_start)) + print("blk_end:", blk_end, type(blk_end)) + print("body_blocks:", body_blocks, type(body_blocks)) + print("extra:", extra, type(extra)) + assert extra is not None + _add_openmp_ir_nodes( + func_ir, blocks, blk_start, blk_end, body_blocks, extra + ) + func_ir._definitions = build_definitions(blocks) + if DEBUG_OPENMP >= 1: + print("post-with-removal") + dump_blocks(blocks) + dispatcher = dispatcher_factory(func_ir) + dispatcher.can_cache = True + + # Find live variables after the region to make sure we don't Del + # them if they are defined in the openmp region. 
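+        # The resulting live set is stored on the context (blk_end_live_map) and
+        # consulted by the patched lower_inst above so that Del instructions for
+        # variables still live after the region are skipped outside the region.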
+ cfg = compute_cfg_from_blocks(blocks) + usedefs = compute_use_defs(blocks) + live_map = compute_live_map(cfg, blocks, usedefs.usemap, usedefs.defmap) + self.blk_end_live_map = live_map[blk_end] + return dispatcher + + def __call__(self, args): + return PythonOpenmp(args) + + +def remove_indirections(clause): + try: + while len(clause) == 1 and isinstance(clause[0], list): + clause = clause[0] + except: + pass + return clause + + +class default_shared_val: + def __init__(self, val): + self.val = val + + +class UnspecifiedVarInDefaultNone(Exception): + pass + + +class ParallelForExtraCode(Exception): + pass + + +class ParallelForWrongLoopCount(Exception): + pass + + +class ParallelForInvalidCollapseCount(Exception): + pass + + +class NonconstantOpenmpSpecification(Exception): + pass + + +class NonStringOpenmpSpecification(Exception): + pass + + +class MultipleNumThreadsClauses(Exception): + pass + + +openmp_context = _OpenmpContextType() + + +def is_dsa(name): + return ( + name + in [ + "QUAL.OMP.FIRSTPRIVATE", + "QUAL.OMP.PRIVATE", + "QUAL.OMP.SHARED", + "QUAL.OMP.LASTPRIVATE", + "QUAL.OMP.TARGET.IMPLICIT", + ] + or name.startswith("QUAL.OMP.REDUCTION") + or name.startswith("QUAL.OMP.MAP") + ) + + +def get_dotted_type(x, typemap, lowerer): + xsplit = x.split("*") + cur_typ = typemap_lookup(typemap, xsplit[0]) + # print("xsplit:", xsplit, cur_typ, type(cur_typ)) + for field in xsplit[1:]: + dm = lowerer.context.data_model_manager.lookup(cur_typ) + findex = dm._fields.index(field) + cur_typ = dm._members[findex] + # print("dm:", dm, type(dm), dm._members, type(dm._members), dm._fields, type(dm._fields), findex, cur_typ, type(cur_typ)) + return cur_typ + + +def is_target_arg(name): + return ( + name in ["QUAL.OMP.FIRSTPRIVATE", "QUAL.OMP.TARGET.IMPLICIT"] + or name.startswith("QUAL.OMP.MAP") + or name.startswith("QUAL.OMP.REDUCTION") + ) + + +def is_pointer_target_arg(name, typ): + if name.startswith("QUAL.OMP.MAP"): + if isinstance(typ, types.npytypes.Array): + return True + else: + return True + if name in ["QUAL.OMP.FIRSTPRIVATE", "QUAL.OMP.PRIVATE"]: + return False + if name in ["QUAL.OMP.TARGET.IMPLICIT"]: + if isinstance(typ, types.npytypes.Array): + return True + else: + return False + return False + # print("is_pointer_target_arg:", name, typ, type(typ)) + assert False + + +def is_internal_var(var): + # Determine if a var is a Python var or an internal Numba var. + if var.is_temp: + return True + return var.unversioned_name != var.name + + +def remove_ssa(var_name, scope, loc): + # Get the base name of a variable, removing the SSA extension. 
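+    # For example, a versioned name such as "i.1" maps back to the user-level
+    # name "i"; names without an SSA suffix are returned unchanged.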
+ var = ir.Var(scope, var_name, loc) + return var.unversioned_name + + +def user_defined_var(var): + if not isinstance(var, str): + return False + return not var.startswith("$") + + +def has_user_defined_var(the_set): + for x in the_set: + if user_defined_var(x): + return True + return False + + +def get_user_defined_var(the_set): + ret = set() + for x in the_set: + if user_defined_var(x): + ret.add(x) + return ret + + +unique = 0 + + +def get_unique(): + global unique + ret = unique + unique += 1 + return ret + + +def is_private(x): + return x in [ + "QUAL.OMP.PRIVATE", + "QUAL.OMP.FIRSTPRIVATE", + "QUAL.OMP.LASTPRIVATE", + "QUAL.OMP.TARGET.IMPLICIT", + ] + + +def openmp_copy(a): + pass # should always be called through overload + + +@overload(openmp_copy) +def openmp_copy_overload(a): + if DEBUG_OPENMP >= 1: + print("openmp_copy:", a, type(a)) + if isinstance(a, types.npytypes.Array): + + def cimpl(a): + return np.copy(a) + + return cimpl + else: + + def cimpl(a): + return a + + return cimpl + + +def replace_ssa_var_callback(var, vardict): + assert isinstance(var, ir.Var) + while var.unversioned_name in vardict.keys(): + assert vardict[var.unversioned_name].name != var.unversioned_name + new_var = vardict[var.unversioned_name] + var = ir.Var(new_var.scope, new_var.name, new_var.loc) + return var + + +def replace_ssa_vars(blocks, vardict): + """replace variables (ir.Var to ir.Var) from dictionary (name -> ir.Var)""" + # remove identity values to avoid infinite loop + new_vardict = {} + for l, r in vardict.items(): + if l != r.name: + new_vardict[l] = r + visit_vars(blocks, replace_ssa_var_callback, new_vardict) + + +def get_blocks_between_start_end(blocks, start_block, end_block): + cfg = compute_cfg_from_blocks(blocks) + blocks_in_region = [start_block] + + def add_in_region(cfg, blk, blocks_in_region, end_block): + """For each successor in the CFG of the block we're currently + adding to blocks_in_region, add that successor to + blocks_in_region if it isn't the end_block. Then, + recursively call this routine for the added block to add + its successors. + """ + for out_blk, _ in cfg.successors(blk): + if out_blk != end_block and out_blk not in blocks_in_region: + blocks_in_region.append(out_blk) + add_in_region(cfg, out_blk, blocks_in_region, end_block) + + # Calculate all the Numba IR blocks in the target region. + add_in_region(cfg, start_block, blocks_in_region, end_block) + return blocks_in_region + + +class VarName(str): + pass + + +class OnlyClauseVar(VarName): + pass + + +# This Transformer visitor class just finds the referenced python names +# and puts them in a list of VarName. The default visitor function +# looks for list of VarNames in the args to that tree node and then +# concatenates them all together. The final return value is a list of +# VarName that are variables used in the openmp clauses. 
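+# Illustrative example (the exact clause syntax is only a sketch): for a region
+# entered as
+#
+#     with openmp_context("parallel for num_threads(nt)"):
+#         ...
+#
+# the collector returns ["nt"], and extract_args_from_openmp appends the
+# corresponding ir.Var to the context call's arguments so that usedef analysis
+# keeps nt alive even though it only appears inside the clause string.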
+class VarCollector(Transformer): + def __init__(self): + super(VarCollector, self).__init__() + + def PYTHON_NAME(self, args): + return [VarName(args)] + + def const_num_or_var(self, args): + return args[0] + + def num_threads_clause(self, args): + (_, num_threads) = args + if isinstance(num_threads, list): + assert len(num_threads) == 1 + return [OnlyClauseVar(num_threads[0])] + else: + return None + + def __default__(self, data, children, meta): + ret = [] + for c in children: + if isinstance(c, list) and len(c) > 0: + if isinstance(c[0], OnlyClauseVar): + ret.extend(c) + return ret + + +def add_tags_to_enclosing(func_ir, cur_block, tags): + enclosing_region = get_enclosing_region(func_ir, cur_block) + if enclosing_region: + for region in enclosing_region: + for tag in tags: + region.add_tag(tag) + + +def add_enclosing_region(func_ir, blocks, openmp_node): + if not hasattr(func_ir, "openmp_enclosing"): + func_ir.openmp_enclosing = {} + if not hasattr(func_ir, "openmp_regions"): + func_ir.openmp_regions = {} + func_ir.openmp_regions[openmp_node] = sorted(blocks) + for b in blocks: + if b not in func_ir.openmp_enclosing: + func_ir.openmp_enclosing[b] = [] + func_ir.openmp_enclosing[b].append(openmp_node) + + +def get_enclosing_region(func_ir, cur_block): + if not hasattr(func_ir, "openmp_enclosing"): + func_ir.openmp_enclosing = {} + if cur_block in func_ir.openmp_enclosing: + return func_ir.openmp_enclosing[cur_block] + else: + return None + + +def get_var_from_enclosing(enclosing_regions, var): + if not enclosing_regions: + return None + if len(enclosing_regions) == 0: + return None + return enclosing_regions[-1].get_var_dsa(var) + + +class OpenmpVisitor(Transformer): + target_num = 0 + + def __init__(self, func_ir, blocks, blk_start, blk_end, body_blocks, loc): + self.func_ir = func_ir + self.blocks = blocks + self.blk_start = blk_start + self.blk_end = blk_end + self.body_blocks = body_blocks + self.loc = loc + super(OpenmpVisitor, self).__init__() + + # --------- Non-parser functions -------------------- + + def remove_explicit_from_one( + self, varset, vars_in_explicit_clauses, clauses, scope, loc + ): + """Go through a set of variables and see if their non-SSA form is in an explicitly + provided data clause. If so, remove it from the set and add a clause so that the + SSA form gets the same data clause. + """ + if DEBUG_OPENMP >= 1: + print( + "remove_explicit start:", + sorted(varset), + sorted(vars_in_explicit_clauses), + ) + diff = set() + # For each variable in the set. + for v in sorted(varset): + # Get the non-SSA form. + flat = remove_ssa(v, scope, loc) + # Skip non-SSA introduced variables (i.e., Python vars). + if flat == v: + continue + if DEBUG_OPENMP >= 1: + print("remove_explicit:", v, flat, flat in vars_in_explicit_clauses) + # If we have the non-SSA form in an explicit data clause. + if flat in vars_in_explicit_clauses: + # We will remove it from the set. + diff.add(v) + # Copy the non-SSA variables data clause. + ccopy = copy.copy(vars_in_explicit_clauses[flat]) + # Change the name in the clause to the SSA form. + ccopy.arg = ir.Var(scope, v, loc) + # Add to the clause set. + clauses.append(ccopy) + # Remove the vars from the set that we added a clause for. 
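+        # For instance, if the user wrote private(x) and the region body uses the
+        # SSA name "x.1", then "x.1" was added to diff above and a copy of the
+        # PRIVATE clause retargeted at "x.1" was appended to clauses.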
+ varset.difference_update(diff) + if DEBUG_OPENMP >= 1: + print("remove_explicit end:", sorted(varset)) + + def remove_explicit_from_io_vars( + self, + inputs_to_region, + def_but_live_out, + private_to_region, + vars_in_explicit_clauses, + clauses, + non_user_explicits, + scope, + loc, + ): + """Remove vars in explicit data clauses from the auto-determined vars. + Then call remove_explicit_from_one to take SSA variants out of the auto-determined sets + and to create clauses so that SSA versions get the same clause as the explicit Python non-SSA var. + """ + inputs_to_region.difference_update(vars_in_explicit_clauses.keys()) + def_but_live_out.difference_update(vars_in_explicit_clauses.keys()) + private_to_region.difference_update(vars_in_explicit_clauses.keys()) + inputs_to_region.difference_update(non_user_explicits.keys()) + def_but_live_out.difference_update(non_user_explicits.keys()) + private_to_region.difference_update(non_user_explicits.keys()) + self.remove_explicit_from_one( + inputs_to_region, vars_in_explicit_clauses, clauses, scope, loc + ) + self.remove_explicit_from_one( + def_but_live_out, vars_in_explicit_clauses, clauses, scope, loc + ) + self.remove_explicit_from_one( + private_to_region, vars_in_explicit_clauses, clauses, scope, loc + ) + + def find_io_vars(self, selected_blocks): + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + scope = sblk.scope + + cfg = compute_cfg_from_blocks(self.blocks) + usedefs = compute_use_defs(self.blocks) + if DEBUG_OPENMP >= 1: + print("usedefs:", usedefs) + live_map = compute_live_map(cfg, self.blocks, usedefs.usemap, usedefs.defmap) + # Assumes enter_with is first statement in block. + inputs_to_region = live_map[self.blk_start] + if DEBUG_OPENMP >= 1: + print("live_map:", live_map) + print("inputs_to_region:", sorted(inputs_to_region), type(inputs_to_region)) + print("selected blocks:", sorted(selected_blocks)) + all_uses = set() + all_defs = set() + for label in selected_blocks: + all_uses = all_uses.union(usedefs.usemap[label]) + all_defs = all_defs.union(usedefs.defmap[label]) + # Filter out those vars live to the region but not used within it. + inputs_to_region = inputs_to_region.intersection(all_uses) + def_but_live_out = all_defs.difference(inputs_to_region).intersection( + live_map[self.blk_end] + ) + private_to_region = all_defs.difference(inputs_to_region).difference( + live_map[self.blk_end] + ) + + if DEBUG_OPENMP >= 1: + print("all_uses:", sorted(all_uses)) + print("inputs_to_region:", sorted(inputs_to_region)) + print("private_to_region:", sorted(private_to_region)) + print("def_but_live_out:", sorted(def_but_live_out)) + return inputs_to_region, def_but_live_out, private_to_region, live_map + + def get_explicit_vars(self, clauses): + user_vars = {} + non_user_vars = {} + privates = [] + for c in clauses: + if DEBUG_OPENMP >= 1: + print("get_explicit_vars:", c, type(c)) + if isinstance(c, openmp_tag): + if DEBUG_OPENMP >= 1: + print("arg:", c.arg, type(c.arg)) + if isinstance(c.arg, list): + carglist = c.arg + else: + carglist = [c.arg] + # carglist = c.arg if isinstance(c.arg, list) else [c.arg] + for carg in carglist: + if DEBUG_OPENMP >= 1: + print( + "carg:", + carg, + type(carg), + user_defined_var(carg), + is_dsa(c.name), + ) + # Extract the var name from the NameSlice. 
+ if isinstance(carg, NameSlice): + carg = carg.name + if isinstance(carg, str) and is_dsa(c.name): + if user_defined_var(carg): + user_vars[carg] = c + if is_private(c.name): + privates.append(carg) + else: + non_user_vars[carg] = c + return user_vars, privates, non_user_vars + + def filter_unused_vars(self, clauses, used_vars): + new_clauses = [] + for c in clauses: + if DEBUG_OPENMP >= 1: + print("filter_unused_vars:", c, type(c)) + if isinstance(c, openmp_tag): + if DEBUG_OPENMP >= 1: + print("arg:", c.arg, type(c.arg)) + assert not isinstance(c.arg, list) + if DEBUG_OPENMP >= 1: + print( + "c.arg:", + c.arg, + type(c.arg), + user_defined_var(c.arg), + is_dsa(c.name), + ) + + if ( + isinstance(c.arg, str) + and user_defined_var(c.arg) + and is_dsa(c.name) + ): + if c.arg in used_vars: + new_clauses.append(c) + else: + new_clauses.append(c) + return new_clauses + + def get_clause_privates(self, clauses, def_but_live_out, scope, loc): + # Get all the private clauses from the whole set of clauses. + private_clauses_vars = [ + remove_privatized(x.arg) + for x in clauses + if x.name in ["QUAL.OMP.PRIVATE", "QUAL.OMP.FIRSTPRIVATE"] + ] + # private_clauses_vars = [remove_privatized(x.arg) for x in clauses if x.name in ["QUAL.OMP.PRIVATE", "QUAL.OMP.FIRSTPRIVATE", "QUAL.OMP.LASTPRIVATE"]] + ret = {} + # Get a mapping of vars in private clauses to the SSA version of variable exiting the region. + for lo in def_but_live_out: + without_ssa = remove_ssa(lo, scope, loc) + if without_ssa in private_clauses_vars: + ret[without_ssa] = lo + return ret + + def make_implicit_explicit( + self, + scope, + vars_in_explicit, + explicit_clauses, + gen_shared, + inputs_to_region, + def_but_live_out, + private_to_region, + for_task=False, + ): + if for_task is None: + for_task = [] + if gen_shared: + for var_name in sorted(inputs_to_region): + if ( + for_task != False + and get_var_from_enclosing(for_task, var_name) != "QUAL.OMP.SHARED" + ): + explicit_clauses.append( + openmp_tag("QUAL.OMP.FIRSTPRIVATE", var_name) + ) + else: + explicit_clauses.append(openmp_tag("QUAL.OMP.SHARED", var_name)) + vars_in_explicit[var_name] = explicit_clauses[-1] + + for var_name in sorted(def_but_live_out): + if ( + for_task != False + and get_var_from_enclosing(for_task, var_name) != "QUAL.OMP.SHARED" + ): + explicit_clauses.append( + openmp_tag("QUAL.OMP.FIRSTPRIVATE", var_name) + ) + else: + explicit_clauses.append(openmp_tag("QUAL.OMP.SHARED", var_name)) + vars_in_explicit[var_name] = explicit_clauses[-1] + + # What to do below for task regions? 
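+        # Variables local to the region become PRIVATE; user-defined variables
+        # are added first, followed by compiler-generated temporaries.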
+ for var_name in sorted(private_to_region): + temp_var = ir.Var(scope, var_name, self.loc) + if not is_internal_var(temp_var): + explicit_clauses.append(openmp_tag("QUAL.OMP.PRIVATE", var_name)) + vars_in_explicit[var_name] = explicit_clauses[-1] + + for var_name in sorted(private_to_region): + temp_var = ir.Var(scope, var_name, self.loc) + if is_internal_var(temp_var): + explicit_clauses.append(openmp_tag("QUAL.OMP.PRIVATE", var_name)) + vars_in_explicit[var_name] = explicit_clauses[-1] + + def make_implicit_explicit_target( + self, + scope, + vars_in_explicit, + explicit_clauses, + gen_shared, + inputs_to_region, + def_but_live_out, + private_to_region, + ): + # unversioned_privates = set() # we get rid of SSA on the first openmp region so no SSA forms should be here + if gen_shared: + for var_name in sorted(inputs_to_region): + explicit_clauses.append( + openmp_tag( + "QUAL.OMP.TARGET.IMPLICIT" + if user_defined_var(var_name) + else "QUAL.OMP.PRIVATE", + var_name, + ) + ) + vars_in_explicit[var_name] = explicit_clauses[-1] + for var_name in sorted(def_but_live_out): + explicit_clauses.append( + openmp_tag( + "QUAL.OMP.TARGET.IMPLICIT" + if user_defined_var(var_name) + else "QUAL.OMP.PRIVATE", + var_name, + ) + ) + vars_in_explicit[var_name] = explicit_clauses[-1] + for var_name in sorted(private_to_region): + temp_var = ir.Var(scope, var_name, self.loc) + if not is_internal_var(temp_var): + explicit_clauses.append(openmp_tag("QUAL.OMP.PRIVATE", var_name)) + # explicit_clauses.append(openmp_tag("QUAL.OMP.TARGET.IMPLICIT" if user_defined_var(var_name) else "QUAL.OMP.PRIVATE", var_name)) + vars_in_explicit[var_name] = explicit_clauses[-1] + + for var_name in sorted(private_to_region): + temp_var = ir.Var(scope, var_name, self.loc) + if is_internal_var(temp_var): + explicit_clauses.append( + openmp_tag( + "QUAL.OMP.TARGET.IMPLICIT" + if user_defined_var(var_name) + else "QUAL.OMP.PRIVATE", + var_name, + ) + ) + vars_in_explicit[var_name] = explicit_clauses[-1] + + def add_explicits_to_start( + self, + scope, + vars_in_explicit, + explicit_clauses, + gen_shared, + start_tags, + keep_alive, + ): + start_tags.extend(explicit_clauses) + return [] + # tags_for_enclosing = [] + # for var in vars_in_explicit: + # if not is_private(vars_in_explicit[var].name): + # print("EVAR_COPY FOR", var) + # evar = ir.Var(scope, var, self.loc) + # evar_copy = scope.redefine("evar_copy_aets", self.loc) + # keep_alive.append(ir.Assign(evar, evar_copy, self.loc)) + # #keep_alive.append(ir.Assign(evar, evar, self.loc)) + # tags_for_enclosing.append(openmp_tag("QUAL.OMP.PRIVATE", evar_copy)) + # return tags_for_enclosing + + def flatten(self, all_clauses, start_block): + if DEBUG_OPENMP >= 1: + print("flatten", id(start_block)) + incoming_clauses = [remove_indirections(x) for x in all_clauses] + clauses = [] + default_shared = True + for clause in incoming_clauses: + if DEBUG_OPENMP >= 1: + print("clause:", clause, type(clause)) + if isinstance(clause, openmp_tag): + clauses.append(clause) + elif isinstance(clause, list): + clauses.extend(remove_indirections(clause)) + elif clause == "nowait": + clauses.append(openmp_tag("QUAL.OMP.NOWAIT")) + elif isinstance(clause, default_shared_val): + default_shared = clause.val + if DEBUG_OPENMP >= 1: + print("got new default_shared:", clause.val) + else: + if DEBUG_OPENMP >= 1: + print( + "Unknown clause type in incoming_clauses", clause, type(clause) + ) + assert 0 + + if hasattr(start_block, "openmp_replace_vardict"): + for clause in clauses: + # print("flatten out 
clause:", clause, clause.arg, type(clause.arg)) + for vardict in start_block.openmp_replace_vardict: + if clause.arg in vardict: + # print("clause.arg in vardict:", clause.arg, type(clause.arg), vardict[clause.arg], type(vardict[clause.arg])) + clause.arg = vardict[clause.arg].name + + return clauses, default_shared + + def add_replacement(self, blocks, replace_vardict): + for b in blocks.values(): + if not hasattr(b, "openmp_replace_vardict"): + b.openmp_replace_vardict = [] + b.openmp_replace_vardict.append(replace_vardict) + + def make_consts_unliteral_for_privates(self, privates, blocks): + for blk in blocks.values(): + for stmt in blk.body: + if ( + isinstance(stmt, ir.Assign) + and isinstance(stmt.value, ir.Const) + and stmt.target.name in privates + ): + stmt.value.use_literal_type = False + + def fix_empty_header(self, block, label): + if len(block.body) == 1: + assert isinstance(block.body[0], ir.Jump) + return self.blocks[block.body[0].target], block.body[0].target + return block, label + + def prepare_for_directive( + self, + clauses, + vars_in_explicit_clauses, + before_start, + after_start, + start_tags, + end_tags, + scope, + ): + start_tags = clauses + call_table, _ = get_call_table(self.blocks) + cfg = compute_cfg_from_blocks(self.blocks) + usedefs = compute_use_defs(self.blocks) + live_map = compute_live_map(cfg, self.blocks, usedefs.usemap, usedefs.defmap) + + def get_loops_in_region(all_loops): + loops = {} + for k, v in all_loops.items(): + if v.header >= self.blk_start and v.header <= self.blk_end: + loops[k] = v + return loops + + all_loops = cfg.loops() + if DEBUG_OPENMP >= 1: + print("all_loops:", all_loops) + print("live_map:", live_map) + print("body_blocks:", self.body_blocks) + + loops = get_loops_in_region(all_loops) + # Find the outer-most loop in this OpenMP region. + loops = list(filter_nested_loops(cfg, loops)) + + if DEBUG_OPENMP >= 1: + print("loops:", loops) + if len(loops) != 1: + raise ParallelForWrongLoopCount( + f"OpenMP parallel for regions must contain exactly one range based loop. The parallel for at line {self.loc} contains {len(loops)} loops." + ) + + collapse_tags = get_tags_of_type(clauses, "QUAL.OMP.COLLAPSE") + new_stmts_for_iterspace = [] + collapse_iterspace_block = set() + iterspace_vars = [] + if len(collapse_tags) > 0: + # Limit all_loops to just loops within the openmp region. + all_loops = get_loops_in_region(all_loops) + # In case of multiple collapse tags, use the last one. + collapse_tag = collapse_tags[-1] + # Remove collapse tags from clauses so they don't go to LLVM pass. + clauses[:] = [x for x in clauses if x not in collapse_tags] + # Add top level loop to loop_order list. + loop_order = list(filter_nested_loops(cfg, all_loops)) + if len(loop_order) != 1: + raise ParallelForWrongLoopCount( + f"OpenMP parallel for region must have only one top-level loop at line {self.loc}." + ) + # Determine how many nested loops we need to process. + collapse_value = collapse_tag.arg - 1 + # Make sure initial collapse value was >= 2. + if collapse_value <= 0: + raise ParallelForInvalidCollapseCount( + f"OpenMP parallel for regions with collapse clauses must be greather than or equal to 2 at line {self.loc}." + ) + + # Delete top-level loop from all_loops. + del all_loops[loop_order[-1].header] + # For remaining nested loops... + for _ in range(collapse_value): + # Get the next most top-level loop. + loops = list(filter_nested_loops(cfg, all_loops)) + # Make sure there is only one. 
+ if len(loops) != 1: + raise ParallelForWrongLoopCount( + f"OpenMP parallel for collapse regions must be perfectly nested for the parallel for at line {self.loc}." + ) + # Add this loop to the loops to process in order. + loop_order.append(loops[0]) + # Delete this loop from all_loops. + del all_loops[loop_order[-1].header] + + if DEBUG_OPENMP >= 2: + print("loop_order:", loop_order) + stmts_to_retain = [] + loop_bounds = [] + for loop in loop_order: + loop_entry = list(loop.entries)[0] + loop_exit = list(loop.exits)[0] + loop_header = loop.header + loop_entry_block = self.blocks[loop_entry] + loop_exit_block = self.blocks[loop_exit] + loop_header_block, _ = self.fix_empty_header( + self.blocks[loop_header], loop_header + ) + + # Copy all stmts from the loop entry block up to the ir.Global + # for range. + call_offset = None + for entry_block_index, stmt in enumerate(loop_entry_block.body): + found_range = False + if ( + isinstance(stmt, ir.Assign) + and isinstance(stmt.value, ir.Global) + and stmt.value.name == "range" + ): + found_range = True + range_target = stmt.target + found_call = False + for call_index in range( + entry_block_index + 1, len(loop_entry_block.body) + ): + call_stmt = loop_entry_block.body[call_index] + if ( + isinstance(call_stmt, ir.Assign) + and isinstance(call_stmt.value, ir.Expr) + and call_stmt.value.op == "call" + and call_stmt.value.func == range_target + ): + found_call = True + # Remove stmts that were retained. + loop_entry_block.body = loop_entry_block.body[ + entry_block_index: + ] + call_offset = call_index - entry_block_index + break + assert found_call + break + stmts_to_retain.append(stmt) + assert found_range + for header_block_index, stmt in enumerate(loop_header_block.body): + if ( + isinstance(stmt, ir.Assign) + and isinstance(stmt.value, ir.Expr) + and stmt.value.op == "iternext" + ): + iternext_inst = loop_header_block.body[header_block_index] + pair_first_inst = loop_header_block.body[header_block_index + 1] + pair_second_inst = loop_header_block.body[ + header_block_index + 2 + ] + + assert ( + isinstance(iternext_inst, ir.Assign) + and isinstance(iternext_inst.value, ir.Expr) + and iternext_inst.value.op == "iternext" + ) + assert ( + isinstance(pair_first_inst, ir.Assign) + and isinstance(pair_first_inst.value, ir.Expr) + and pair_first_inst.value.op == "pair_first" + ) + assert ( + isinstance(pair_second_inst, ir.Assign) + and isinstance(pair_second_inst.value, ir.Expr) + and pair_second_inst.value.op == "pair_second" + ) + stmts_to_retain.extend( + loop_header_block.body[header_block_index + 3 : -1] + ) + loop_index = pair_first_inst.target + break + stmts_to_retain.append(stmt) + loop_bounds.append((call_stmt.value.args[0], loop_index)) + if DEBUG_OPENMP >= 1: + print("collapse 1") + dump_blocks(self.blocks) + # For all the loops except the last... + for loop in loop_order[:-1]: + # Change the unneeded headers to just jump to the next block. 
+ loop_header = loop.header + loop_header_block, real_loop_header = self.fix_empty_header( + self.blocks[loop_header], loop_header + ) + collapse_iterspace_block.add(real_loop_header) + loop_header_block.body[-1] = ir.Jump( + loop_header_block.body[-1].truebr, loop_header_block.body[-1].loc + ) + last_eliminated_loop_header_block = loop_header_block + self.body_blocks = [ + x for x in self.body_blocks if x not in loop.entries + ] + self.body_blocks.remove(loop.header) + if DEBUG_OPENMP >= 1: + print("loop order:", loop_order) + print("loop bounds:", loop_bounds) + print("collapse 2") + dump_blocks(self.blocks) + last_loop = loop_order[-1] + last_loop_entry = list(last_loop.entries)[0] + last_loop_exit = list(last_loop.exits)[0] + last_loop_header = last_loop.header + last_loop_entry_block = self.blocks[last_loop_entry] + last_loop_exit_block = self.blocks[last_loop_exit] + last_loop_header_block, _ = self.fix_empty_header( + self.blocks[last_loop_header], loop_header + ) + last_loop_first_body_block = last_loop_header_block.body[-1].truebr + self.blocks[last_loop_first_body_block].body = ( + stmts_to_retain + self.blocks[last_loop_first_body_block].body + ) + last_loop_header_block.body[-1].falsebr = list(loop_order[0].exits)[0] + new_var_scope = last_loop_entry_block.body[0].target.scope + + # -------- Add vars to remember cumulative product of iteration space sizes. + new_iterspace_var = new_var_scope.redefine("new_iterspace0", self.loc) + start_tags.append( + openmp_tag("QUAL.OMP.FIRSTPRIVATE", new_iterspace_var.name) + ) + iterspace_vars.append(new_iterspace_var) + new_stmts_for_iterspace.append( + ir.Assign(loop_bounds[0][0], new_iterspace_var, self.loc) + ) + for lb_num, loop_bound in enumerate(loop_bounds[1:]): + mul_op = ir.Expr.binop( + operator.mul, new_iterspace_var, loop_bound[0], self.loc + ) + new_iterspace_var = new_var_scope.redefine( + "new_iterspace" + str(lb_num + 1), self.loc + ) + start_tags.append( + openmp_tag("QUAL.OMP.FIRSTPRIVATE", new_iterspace_var.name) + ) + iterspace_vars.append(new_iterspace_var) + new_stmts_for_iterspace.append( + ir.Assign(mul_op, new_iterspace_var, self.loc) + ) + # Change iteration space of innermost loop to the product of all the + # loops' iteration spaces. 
+ last_loop_entry_block.body[call_offset].value.args[0] = new_iterspace_var + + last_eliminated_loop_header_block.body = ( + new_stmts_for_iterspace + last_eliminated_loop_header_block.body + ) + + deconstruct_indices = [] + new_deconstruct_var = new_var_scope.redefine("deconstruct", self.loc) + deconstruct_indices.append( + ir.Assign(loop_bounds[-1][1], new_deconstruct_var, self.loc) + ) + for deconstruct_index in range(len(loop_bounds) - 1): + cur_iterspace_var = iterspace_vars[ + len(loop_bounds) - 2 - deconstruct_index + ] + cur_loop_bound = loop_bounds[deconstruct_index][1] + # if DEBUG_OPENMP >= 1: + # print("deconstructing", cur_iterspace_var) + # deconstruct_indices.append(ir.Print([new_deconstruct_var, cur_iterspace_var], None, self.loc)) + deconstruct_div = ir.Expr.binop( + operator.floordiv, new_deconstruct_var, cur_iterspace_var, self.loc + ) + new_deconstruct_var_loop = new_var_scope.redefine( + "deconstruct" + str(deconstruct_index), self.loc + ) + deconstruct_indices.append( + ir.Assign(deconstruct_div, cur_loop_bound, self.loc) + ) + # if DEBUG_OPENMP >= 1: + # deconstruct_indices.append(ir.Print([cur_loop_bound], None, self.loc)) + new_deconstruct_var_mul = new_var_scope.redefine( + "deconstruct_mul" + str(deconstruct_index), self.loc + ) + deconstruct_indices.append( + ir.Assign( + ir.Expr.binop( + operator.mul, cur_loop_bound, cur_iterspace_var, self.loc + ), + new_deconstruct_var_mul, + self.loc, + ) + ) + # if DEBUG_OPENMP >= 1: + # deconstruct_indices.append(ir.Print([new_deconstruct_var_mul], None, self.loc)) + deconstruct_indices.append( + ir.Assign( + ir.Expr.binop( + operator.sub, + new_deconstruct_var, + new_deconstruct_var_mul, + self.loc, + ), + new_deconstruct_var_loop, + self.loc, + ) + ) + # if DEBUG_OPENMP >= 1: + # deconstruct_indices.append(ir.Print([new_deconstruct_var_loop], None, self.loc)) + new_deconstruct_var = new_deconstruct_var_loop + deconstruct_indices.append( + ir.Assign(new_deconstruct_var, loop_bounds[-1][1], self.loc) + ) + + self.blocks[last_loop_first_body_block].body = ( + deconstruct_indices + self.blocks[last_loop_first_body_block].body + ) + + if DEBUG_OPENMP >= 1: + print("collapse 3", self.blk_start, self.blk_end) + dump_blocks(self.blocks) + + cfg = compute_cfg_from_blocks(self.blocks) + live_map = compute_live_map( + cfg, self.blocks, usedefs.usemap, usedefs.defmap + ) + all_loops = cfg.loops() + loops = get_loops_in_region(all_loops) + loops = list(filter_nested_loops(cfg, loops)) + if DEBUG_OPENMP >= 2: + print("loops after collapse:", loops) + if DEBUG_OPENMP >= 1: + print("blocks after collapse", self.blk_start, self.blk_end) + dump_blocks(self.blocks) + + def _get_loop_kind(func_var, call_table): + if func_var not in call_table: + return False + call = call_table[func_var] + if len(call) == 0: + return False + + return call[0] + + loop = loops[0] + entry = list(loop.entries)[0] + header = loop.header + exit = list(loop.exits)[0] + + loop_blocks_for_io = loop.entries.union(loop.body) + loop_blocks_for_io_minus_entry = loop_blocks_for_io - {entry} + non_loop_blocks = set(self.body_blocks) + non_loop_blocks.difference_update(loop_blocks_for_io) + non_loop_blocks.difference_update(collapse_iterspace_block) + # non_loop_blocks.difference_update({exit}) + + if DEBUG_OPENMP >= 1: + print("non_loop_blocks:", non_loop_blocks) + print("entry:", entry) + print("header:", header) + print("exit:", exit) + print("body_blocks:", self.body_blocks) + print("loop:", loop) + + # Find the first statement after any iterspace calculation ones for 
collapse. + first_stmt = self.blocks[entry].body[0] + # first_stmt = self.blocks[entry].body[len(new_stmts_for_iterspace)] + if ( + not isinstance(first_stmt, ir.Assign) + or not isinstance(first_stmt.value, ir.Global) + or first_stmt.value.name != "range" + ): + raise ParallelForExtraCode( + f"Extra code near line {self.loc} is not allowed before or after the loop in an OpenMP parallel for region." + ) + + live_end = live_map[self.blk_end] + for non_loop_block in non_loop_blocks: + nlb = self.blocks[non_loop_block] + if isinstance(nlb.body[0], ir.Jump): + # Non-loop empty blocks are fine. + continue + if ( + isinstance(nlb.body[-1], ir.Jump) + and nlb.body[-1].target == self.blk_end + ): + # Loop through all statements in block that jumps to the end of the region. + # If those are all assignments where the LHS is dead then they are safe. + for nlb_stmt in nlb.body[:-1]: + if isinstance(nlb_stmt, ir.PopBlock): + continue + + break + # if not isinstance(nlb_stmt, ir.Assign): + # break # Non-assignment is not known to be safe...will fallthrough to raise exception. + # if nlb_stmt.target.name in live_end: + # break # Non-dead variables in assignment is not safe...will fallthrough to raise exception. + else: + continue + raise ParallelForExtraCode( + f"Extra code near line {self.loc} is not allowed before or after the loop in an OpenMP parallel for region." + ) + + if DEBUG_OPENMP >= 1: + print("loop_blocks_for_io:", loop_blocks_for_io, entry, exit) + print("non_loop_blocks:", non_loop_blocks) + print("header:", header) + + entry_block = self.blocks[entry] + assert isinstance(entry_block.body[-1], ir.Jump) + assert entry_block.body[-1].target == header + exit_block = self.blocks[exit] + header_block = self.blocks[header] + extra_block = ( + None if len(header_block.body) > 1 else header_block.body[-1].target + ) + + latch_block_num = max(self.blocks.keys()) + 1 + + # We have to reformat the Numba style of loop to the form that the LLVM + # openmp pass supports. + header_preds = [x[0] for x in cfg.predecessors(header)] + entry_preds = list(set(header_preds).difference(loop.body)) + back_blocks = list(set(header_preds).intersection(loop.body)) + if DEBUG_OPENMP >= 1: + print("header_preds:", header_preds) + print("entry_preds:", entry_preds) + print("back_blocks:", back_blocks) + assert len(entry_preds) == 1 + entry_pred_label = entry_preds[0] + entry_pred = self.blocks[entry_pred_label] + if extra_block is not None: + header_block = self.blocks[extra_block] + header = extra_block + header_branch = header_block.body[-1] + post_header = {header_branch.truebr, header_branch.falsebr} + post_header.remove(exit) + if DEBUG_OPENMP >= 1: + print("post_header:", post_header) + post_header = self.blocks[list(post_header)[0]] + if DEBUG_OPENMP >= 1: + print("post_header:", post_header) + + normalized = True + + for inst_num, inst in enumerate(entry_block.body): + if ( + isinstance(inst, ir.Assign) + and isinstance(inst.value, ir.Expr) + and inst.value.op == "call" + ): + loop_kind = _get_loop_kind(inst.value.func.name, call_table) + if DEBUG_OPENMP >= 1: + print("loop_kind:", loop_kind) + if loop_kind != False and loop_kind == range: + range_inst = inst + range_args = inst.value.args + if DEBUG_OPENMP >= 1: + print("found one", loop_kind, inst, range_args) + + # ---------------------------------------------- + # Find getiter instruction for this range. 
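+                    # Save the getiter target; the loop's iteration count is
+                    # computed from it below via get_itercount.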
+ for entry_inst in entry_block.body[inst_num + 1 :]: + if ( + isinstance(entry_inst, ir.Assign) + and isinstance(entry_inst.value, ir.Expr) + and entry_inst.value.op == "getiter" + and entry_inst.value.value == range_inst.target + ): + getiter_inst = entry_inst + break + assert getiter_inst + if DEBUG_OPENMP >= 1: + print("getiter_inst:", getiter_inst) + # ---------------------------------------------- + + assert len(header_block.body) > 3 + if DEBUG_OPENMP >= 1: + print("header block before removing Numba range vars:") + dump_block(header, header_block) + + for ii in range(len(header_block.body)): + ii_inst = header_block.body[ii] + if ( + isinstance(ii_inst, ir.Assign) + and isinstance(ii_inst.value, ir.Expr) + and ii_inst.value.op == "iternext" + ): + iter_num = ii + break + + iternext_inst = header_block.body[iter_num] + pair_first_inst = header_block.body[iter_num + 1] + pair_second_inst = header_block.body[iter_num + 2] + + assert ( + isinstance(iternext_inst, ir.Assign) + and isinstance(iternext_inst.value, ir.Expr) + and iternext_inst.value.op == "iternext" + ) + assert ( + isinstance(pair_first_inst, ir.Assign) + and isinstance(pair_first_inst.value, ir.Expr) + and pair_first_inst.value.op == "pair_first" + ) + assert ( + isinstance(pair_second_inst, ir.Assign) + and isinstance(pair_second_inst.value, ir.Expr) + and pair_second_inst.value.op == "pair_second" + ) + # Remove those nodes from the IR. + header_block.body = ( + header_block.body[:iter_num] + header_block.body[iter_num + 3 :] + ) + if DEBUG_OPENMP >= 1: + print("header block after removing Numba range vars:") + dump_block(header, header_block) + + loop_index = pair_first_inst.target + if DEBUG_OPENMP >= 1: + print("loop_index:", loop_index, type(loop_index)) + # The loop_index from Numba's perspective is not what it is from the + # programmer's perspective. The OpenMP loop index is always private so + # we need to start from Numba's loop index (e.g., $48for_iter.3) and + # trace assignments from that through the header block and then find + # the first such assignment in the first loop block that the header + # branches to. + latest_index = loop_index + for hinst in header_block.body: + if isinstance(hinst, ir.Assign) and isinstance( + hinst.value, ir.Var + ): + if hinst.value.name == latest_index.name: + latest_index = hinst.target + for phinst in post_header.body: + if isinstance(phinst, ir.Assign) and isinstance( + phinst.value, ir.Var + ): + if phinst.value.name == latest_index.name: + latest_index = phinst.target + break + if DEBUG_OPENMP >= 1: + print("latest_index:", latest_index, type(latest_index)) + + if latest_index.name not in vars_in_explicit_clauses: + new_index_clause = openmp_tag( + "QUAL.OMP.PRIVATE", + ir.Var(loop_index.scope, latest_index.name, inst.loc), + ) + clauses.append(new_index_clause) + vars_in_explicit_clauses[latest_index.name] = new_index_clause + else: + if ( + vars_in_explicit_clauses[latest_index.name].name + != "QUAL.OMP.PRIVATE" + ): + pass + # throw error? FIX ME + + if DEBUG_OPENMP >= 1: + for clause in clauses: + print("post-latest_index clauses:", clause) + + start = 0 + step = 1 + size_var = range_args[0] + if len(range_args) == 2: + start = range_args[0] + size_var = range_args[1] + if len(range_args) == 3: + start = range_args[0] + size_var = range_args[1] + try: + step = self.func_ir.get_definition(range_args[2]) + # Only use get_definition to get a const if + # available. Otherwise use the variable. 
+ if not isinstance(step, (int, ir.Const)): + step = range_args[2] + except KeyError: + # If there is more than one definition possible for the + # step variable then just use the variable and don't try + # to convert to a const. + step = range_args[2] + if isinstance(step, ir.Const): + step = step.value + + if DEBUG_OPENMP >= 1: + print("size_var:", size_var, type(size_var)) + + omp_lb_var = loop_index.scope.redefine("$omp_lb", inst.loc) + before_start.append( + ir.Assign(ir.Const(0, inst.loc), omp_lb_var, inst.loc) + ) + + omp_iv_var = loop_index.scope.redefine("$omp_iv", inst.loc) + # before_start.append(ir.Assign(omp_lb_var, omp_iv_var, inst.loc)) + # Don't use omp_lb here because that makes a live-in to the region that + # becomes a parameter to an outlined target region. + after_start.append( + ir.Assign(ir.Const(0, inst.loc), omp_iv_var, inst.loc) + ) + # after_start.append(ir.Assign(omp_lb_var, omp_iv_var, inst.loc)) + + types_mod_var = loop_index.scope.redefine( + "$numba_types_mod", inst.loc + ) + types_mod = ir.Global("types", types, inst.loc) + types_mod_assign = ir.Assign(types_mod, types_mod_var, inst.loc) + before_start.append(types_mod_assign) + + int64_var = loop_index.scope.redefine("$int64_var", inst.loc) + int64_getattr = ir.Expr.getattr(types_mod_var, "int64", inst.loc) + int64_assign = ir.Assign(int64_getattr, int64_var, inst.loc) + before_start.append(int64_assign) + + get_itercount_var = loop_index.scope.redefine( + "$get_itercount", inst.loc + ) + get_itercount_global = ir.Global( + "get_itercount", get_itercount, inst.loc + ) + get_itercount_assign = ir.Assign( + get_itercount_global, get_itercount_var, inst.loc + ) + before_start.append(get_itercount_assign) + + itercount_var = loop_index.scope.redefine("$itercount", inst.loc) + itercount_expr = ir.Expr.call( + get_itercount_var, [getiter_inst.target], (), inst.loc + ) + # itercount_expr = ir.Expr.itercount(getiter_inst.target, inst.loc) + before_start.append( + ir.Assign(itercount_expr, itercount_var, inst.loc) + ) + + omp_ub_var = loop_index.scope.redefine("$omp_ub", inst.loc) + omp_ub_expr = ir.Expr.call(int64_var, [itercount_var], (), inst.loc) + before_start.append(ir.Assign(omp_ub_expr, omp_ub_var, inst.loc)) + + const1_var = loop_index.scope.redefine("$const1", inst.loc) + start_tags.append(openmp_tag("QUAL.OMP.PRIVATE", const1_var)) + const1_assign = ir.Assign( + ir.Const(1, inst.loc), const1_var, inst.loc + ) + before_start.append(const1_assign) + count_add_1 = ir.Expr.binop( + operator.sub, omp_ub_var, const1_var, inst.loc + ) + before_start.append(ir.Assign(count_add_1, omp_ub_var, inst.loc)) + + # before_start.append(ir.Print([omp_ub_var], None, inst.loc)) + + omp_start_var = loop_index.scope.redefine("$omp_start", inst.loc) + if start == 0: + start = ir.Const(start, inst.loc) + before_start.append(ir.Assign(start, omp_start_var, inst.loc)) + + # ---------- Create latch block ------------------------------- + latch_iv = omp_iv_var + + latch_block = ir.Block(scope, inst.loc) + const1_latch_var = loop_index.scope.redefine( + "$const1_latch", inst.loc + ) + start_tags.append(openmp_tag("QUAL.OMP.PRIVATE", const1_latch_var)) + const1_assign = ir.Assign( + ir.Const(1, inst.loc), const1_latch_var, inst.loc + ) + latch_block.body.append(const1_assign) + latch_assign = ir.Assign( + ir.Expr.binop( + operator.add, omp_iv_var, const1_latch_var, inst.loc + ), + latch_iv, + inst.loc, + ) + latch_block.body.append(latch_assign) + latch_block.body.append(ir.Jump(header, inst.loc)) + + self.blocks[latch_block_num] = 
latch_block + for bb in back_blocks: + if False: + str_var = scope.redefine("$str_var", inst.loc) + str_const = ir.Const("mid start:", inst.loc) + str_assign = ir.Assign(str_const, str_var, inst.loc) + str_print = ir.Print([str_var, size_var], None, inst.loc) + # before_start.append(str_assign) + # before_start.append(str_print) + self.blocks[bb].body = self.blocks[bb].body[:-1] + [ + str_assign, + str_print, + ir.Jump(latch_block_num, inst.loc), + ] + else: + self.blocks[bb].body[-1] = ir.Jump( + latch_block_num, inst.loc + ) + # ------------------------------------------------------------- + + # ---------- Header Manipulation ------------------------------ + step_var = loop_index.scope.redefine("$step_var", inst.loc) + detect_step_assign = ir.Assign( + ir.Const(0, inst.loc), step_var, inst.loc + ) + after_start.append(detect_step_assign) + + if isinstance(step, int): + step_assign = ir.Assign( + ir.Const(step, inst.loc), step_var, inst.loc + ) + elif isinstance(step, ir.Var): + step_assign = ir.Assign(step, step_var, inst.loc) + start_tags.append( + openmp_tag("QUAL.OMP.FIRSTPRIVATE", step.name) + ) + else: + print("Unsupported step:", step, type(step)) + raise NotImplementedError( + f"Unknown step type that isn't a constant or variable but {type(step)} instead." + ) + scale_var = loop_index.scope.redefine("$scale", inst.loc) + fake_iternext = ir.Assign( + ir.Const(0, inst.loc), iternext_inst.target, inst.loc + ) + fake_second = ir.Assign( + ir.Const(0, inst.loc), pair_second_inst.target, inst.loc + ) + scale_assign = ir.Assign( + ir.Expr.binop(operator.mul, step_var, omp_iv_var, inst.loc), + scale_var, + inst.loc, + ) + unnormalize_iv = ir.Assign( + ir.Expr.binop(operator.add, omp_start_var, scale_var, inst.loc), + loop_index, + inst.loc, + ) + cmp_var = loop_index.scope.redefine("$cmp", inst.loc) + iv_lte_ub = ir.Assign( + ir.Expr.binop(operator.le, omp_iv_var, omp_ub_var, inst.loc), + cmp_var, + inst.loc, + ) + old_branch = header_block.body[-1] + new_branch = ir.Branch( + cmp_var, old_branch.truebr, old_branch.falsebr, old_branch.loc + ) + body_label = old_branch.truebr + first_body_block = self.blocks[body_label] + new_end = [iv_lte_ub, new_branch] + # Turn this on to add printing to help debug at runtime. 
+ if False: + str_var = loop_index.scope.redefine("$str_var", inst.loc) + str_const = ir.Const("header1:", inst.loc) + str_assign = ir.Assign(str_const, str_var, inst.loc) + new_end.append(str_assign) + str_print = ir.Print( + [str_var, omp_start_var, omp_iv_var], None, inst.loc + ) + new_end.append(str_print) + + # Prepend original contents of header into the first body block minus the comparison + first_body_block.body = ( + [ + fake_iternext, + fake_second, + step_assign, + scale_assign, + unnormalize_iv, + ] + + header_block.body[:-1] + + first_body_block.body + ) + + header_block.body = new_end + # header_block.body = [fake_iternext, fake_second, unnormalize_iv] + header_block.body[:-1] + new_end + + # ------------------------------------------------------------- + + # const_start_var = loop_index.scope.redefine("$const_start", inst.loc) + # before_start.append(ir.Assign(ir.Const(0, inst.loc), const_start_var, inst.loc)) + # start_tags.append(openmp_tag("QUAL.OMP.FIRSTPRIVATE", const_start_var.name)) + start_tags.append( + openmp_tag("QUAL.OMP.NORMALIZED.IV", omp_iv_var.name) + ) + start_tags.append( + openmp_tag("QUAL.OMP.NORMALIZED.START", omp_start_var.name) + ) + start_tags.append( + openmp_tag("QUAL.OMP.NORMALIZED.LB", omp_lb_var.name) + ) + start_tags.append( + openmp_tag("QUAL.OMP.NORMALIZED.UB", omp_ub_var.name) + ) + start_tags.append(openmp_tag("QUAL.OMP.PRIVATE", omp_iv_var.name)) + start_tags.append( + openmp_tag("QUAL.OMP.FIRSTPRIVATE", omp_start_var.name) + ) + start_tags.append( + openmp_tag("QUAL.OMP.FIRSTPRIVATE", omp_lb_var.name) + ) + start_tags.append( + openmp_tag("QUAL.OMP.FIRSTPRIVATE", omp_ub_var.name) + ) + tags_for_enclosing = [ + cmp_var.name, + omp_lb_var.name, + omp_start_var.name, + omp_iv_var.name, + types_mod_var.name, + int64_var.name, + itercount_var.name, + omp_ub_var.name, + const1_var.name, + const1_latch_var.name, + get_itercount_var.name, + ] + [x.name for x in iterspace_vars] + tags_for_enclosing = [ + openmp_tag("QUAL.OMP.PRIVATE", x) for x in tags_for_enclosing + ] + # Don't blindly copy code here...this isn't doing what the other spots are doing with privatization. + add_tags_to_enclosing( + self.func_ir, self.blk_start, tags_for_enclosing + ) + # start_tags.append(openmp_tag("QUAL.OMP.NORMALIZED.IV", loop_index.name)) + # start_tags.append(openmp_tag("QUAL.OMP.NORMALIZED.UB", size_var.name)) + return ( + True, + loop_blocks_for_io, + loop_blocks_for_io_minus_entry, + entry_pred, + exit_block, + inst, + size_var, + step_var, + latest_index, + loop_index, + ) + + return False, None, None, None, None, None, None, None, None, None + + def some_for_directive( + self, args, main_start_tag, main_end_tag, first_clause, gen_shared + ): + if DEBUG_OPENMP >= 1: + print("some_for_directive", self.body_blocks) + start_tags = [openmp_tag(main_start_tag)] + end_tags = [openmp_tag(main_end_tag)] + clauses = self.some_data_clause_directive( + args, start_tags, end_tags, first_clause, has_loop=True + ) + + if "PARALLEL" in main_start_tag: + # ---- Back propagate THREAD_LIMIT to enclosed target region. ---- + self.parallel_back_prop(clauses) + + if len(list(filter(lambda x: x.name == "QUAL.OMP.NUM_THREADS", clauses))) > 1: + raise MultipleNumThreadsClauses( + f"Multiple num_threads clauses near line {self.loc} is not allowed in an OpenMP parallel region." 
+ ) + + # --------- Parser functions ------------------------ + + def barrier_directive(self, args): + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit barrier_directive", args, type(args)) + or_start = openmp_region_start([openmp_tag("DIR.OMP.BARRIER")], 0, self.loc) + or_start.requires_combined_acquire_release() + or_end = openmp_region_end( + or_start, [openmp_tag("DIR.OMP.END.BARRIER")], self.loc + ) + sblk.body = [or_start] + [or_end] + sblk.body[:] + + def taskwait_directive(self, args): + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit taskwait_directive", args, type(args)) + or_start = openmp_region_start([openmp_tag("DIR.OMP.TASKWAIT")], 0, self.loc) + or_start.requires_combined_acquire_release() + or_end = openmp_region_end( + or_start, [openmp_tag("DIR.OMP.END.TASKWAIT")], self.loc + ) + sblk.body = [or_start] + [or_end] + sblk.body[:] + + def taskyield_directive(self, args): + raise NotImplementedError("Taskyield currently unsupported.") + + # Don't need a rule for BARRIER. + # Don't need a rule for TASKWAIT. + # Don't need a rule for TASKYIELD. + + def taskgroup_directive(self, args): + raise NotImplementedError("Taskgroup currently unsupported.") + + # Don't need a rule for taskgroup_construct. + # Don't need a rule for TASKGROUP. + + # Don't need a rule for openmp_construct. + + # def teams_distribute_parallel_for_simd_clause(self, args): + # raise NotImplementedError("""Simd clause for target teams + # distribute parallel loop currently unsupported.""") + # if DEBUG_OPENMP >= 1: + # print("visit device_clause", args, type(args)) + + # Don't need a rule for for_simd_construct. + + def for_simd_directive(self, args): + raise NotImplementedError("For simd currently unsupported.") + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit for_simd_directive", args, type(args)) + + def for_simd_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit for_simd_clause", args, type(args), args[0]) + return args[0] + + def schedule_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit schedule_clause", args, type(args), args[0]) + return args[0] + + def dist_schedule_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit dist_schedule_clause", args, type(args), args[0]) + return args[0] + + # Don't need a rule for parallel_for_simd_construct. + + def parallel_for_simd_directive(self, args): + raise NotImplementedError("Parallel for simd currently unsupported.") + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit parallel_for_simd_directive", args, type(args)) + + def parallel_for_simd_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit parallel_for_simd_clause", args, type(args), args[0]) + return args[0] + + # Don't need a rule for target_data_construct. 
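+    # target_data_directive builds DIR.OMP.TARGET.DATA start/end tags from the
+    # flattened clauses, drops clauses for variables unused in the region, and
+    # brackets the region's body blocks with the resulting region markers.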
+ + def target_data_directive(self, args): + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit target_data_directive", args, type(args)) + + before_start = [] + after_start = [] + + clauses, default_shared = self.flatten(args[2:], sblk) + + if DEBUG_OPENMP >= 1: + for clause in clauses: + print("final clause:", clause) + + inputs_to_region, def_but_live_out, private_to_region, live_map = ( + self.find_io_vars(self.body_blocks) + ) + used_in_region = inputs_to_region | def_but_live_out | private_to_region + clauses = self.filter_unused_vars(clauses, used_in_region) + + start_tags = [openmp_tag("DIR.OMP.TARGET.DATA")] + clauses + end_tags = [openmp_tag("DIR.OMP.END.TARGET.DATA")] + + or_start = openmp_region_start(start_tags, 0, self.loc) + or_end = openmp_region_end(or_start, end_tags, self.loc) + sblk.body = before_start + [or_start] + after_start + sblk.body[:] + eblk.body = [or_end] + eblk.body[:] + + add_enclosing_region(self.func_ir, self.body_blocks, or_start) + + # Don't need a rule for DATA. + + def target_data_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit target_data_clause", args, type(args), args[0]) + (val,) = args + if isinstance(val, openmp_tag): + return [val] + elif isinstance(val, list): + return val + elif val == "nowait": + return openmp_tag("QUAL.OMP.NOWAIT") + else: + return val + + def target_enter_data_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit target_enter_data_clause", args, type(args), args[0]) + (val,) = args + if isinstance(val, openmp_tag): + return [val] + elif isinstance(val, list): + return val + elif val == "nowait": + return openmp_tag("QUAL.OMP.NOWAIT") + else: + return val + + def target_exit_data_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit target_exit_data_clause", args, type(args), args[0]) + (val,) = args + if isinstance(val, openmp_tag): + return [val] + elif isinstance(val, list): + return val + elif val == "nowait": + return openmp_tag("QUAL.OMP.NOWAIT") + else: + return val + + def device_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit device_clause", args, type(args)) + return [openmp_tag("QUAL.OMP.DEVICE", args[0])] + + def map_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit map_clause", args, type(args), args[0]) + if args[0] in ["to", "from", "alloc", "tofrom"]: + map_type = args[0].upper() + var_list = args[1] + assert len(args) == 2 + else: + map_type = "TOFROM" # is this default right? FIX ME + var_list = args[0] + ret = [] + for var in var_list: + ret.append(openmp_tag("QUAL.OMP.MAP." + map_type, var)) + return ret + + def map_enter_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit map_enter_clause", args, type(args), args[0]) + assert args[0] in ["to", "alloc"] + map_type = args[0].upper() + var_list = args[1] + assert len(args) == 2 + ret = [] + for var in var_list: + ret.append(openmp_tag("QUAL.OMP.MAP." + map_type, var)) + return ret + + def map_exit_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit map_exit_clause", args, type(args), args[0]) + assert args[0] in ["from", "release", "delete"] + map_type = args[0].upper() + var_list = args[1] + assert len(args) == 2 + ret = [] + for var in var_list: + ret.append(openmp_tag("QUAL.OMP.MAP." 
+ map_type, var)) + return ret + + def depend_with_modifier_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit depend_with_modifier_clause", args, type(args), args[0]) + dep_type = args[1].upper() + var_list = args[2] + assert len(args) == 3 + ret = [] + for var in var_list: + ret.append(openmp_tag("QUAL.OMP.DEPEND." + dep_type, var)) + return ret + + def map_type(self, args): + if DEBUG_OPENMP >= 1: + print("visit map_type", args, type(args), args[0]) + return str(args[0]) + + def map_enter_type(self, args): + if DEBUG_OPENMP >= 1: + print("visit map_enter_type", args, type(args), args[0]) + return str(args[0]) + + def map_exit_type(self, args): + if DEBUG_OPENMP >= 1: + print("visit map_exit_type", args, type(args), args[0]) + return str(args[0]) + + def update_motion_type(self, args): + if DEBUG_OPENMP >= 1: + print("visit update_motion_type", args, type(args), args[0]) + return str(args[0]) + + # Don't need a rule for TO. + # Don't need a rule for FROM. + # Don't need a rule for ALLOC. + # Don't need a rule for TOFROM. + # Don't need a rule for parallel_sections_construct. + + def parallel_sections_directive(self, args): + raise NotImplementedError("Parallel sections currently unsupported.") + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit parallel_sections_directive", args, type(args)) + + def parallel_sections_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit parallel_sections_clause", args, type(args), args[0]) + return args[0] + + # Don't need a rule for sections_construct. + + def sections_directive(self, args): + raise NotImplementedError("Sections directive currently unsupported.") + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit sections_directive", args, type(args)) + + # Don't need a rule for SECTIONS. + + def sections_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit sections_clause", args, type(args), args[0]) + return args[0] + + # Don't need a rule for section_construct. + + def section_directive(self, args): + raise NotImplementedError("Section directive currently unsupported.") + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit section_directive", args, type(args)) + + # Don't need a rule for SECTION. + # Don't need a rule for atomic_construct. + + def atomic_directive(self, args): + raise NotImplementedError("Atomic currently unsupported.") + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit atomic_directive", args, type(args)) + + # Don't need a rule for ATOMIC. + + def atomic_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit atomic_clause", args, type(args), args[0]) + return args[0] + + # Don't need a rule for READ. + # Don't need a rule for WRITE. + # Don't need a rule for UPDATE. + # Don't need a rule for CAPTURE. + # Don't need a rule for seq_cst_clause. + # Don't need a rule for critical_construct. 
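+    # critical_directive wraps the region in DIR.OMP.CRITICAL start/end markers
+    # with acquire/release semantics and, on exit, copies values defined inside
+    # the region back to the SSA names that were live coming into the region.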
+ + def critical_directive(self, args): + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + scope = sblk.scope + + if DEBUG_OPENMP >= 1: + print("visit critical_directive", args, type(args)) + or_start = openmp_region_start([openmp_tag("DIR.OMP.CRITICAL")], 0, self.loc) + or_start.requires_acquire_release() + or_end = openmp_region_end( + or_start, [openmp_tag("DIR.OMP.END.CRITICAL")], self.loc + ) + + inputs_to_region, def_but_live_out, private_to_region, live_map = ( + self.find_io_vars(self.body_blocks) + ) + inputs_to_region = {remove_ssa(x, scope, self.loc): x for x in inputs_to_region} + def_but_live_out = {remove_ssa(x, scope, self.loc): x for x in def_but_live_out} + common_keys = inputs_to_region.keys() & def_but_live_out.keys() + in_def_live_out = { + inputs_to_region[k]: def_but_live_out[k] for k in common_keys + } + if DEBUG_OPENMP >= 1: + print("inputs_to_region:", sorted(inputs_to_region)) + print("def_but_live_out:", sorted(def_but_live_out)) + print("in_def_live_out:", sorted(in_def_live_out)) + + reset = [] + for k, v in in_def_live_out.items(): + reset.append( + ir.Assign( + ir.Var(scope, v, self.loc), ir.Var(scope, k, self.loc), self.loc + ) + ) + + sblk.body = [or_start] + sblk.body[:] + eblk.body = reset + [or_end] + eblk.body[:] + + # Don't need a rule for CRITICAL. + # Don't need a rule for target_construct. + # Don't need a rule for target_teams_distribute_parallel_for_simd_construct. + + def teams_back_prop(self, clauses): + enclosing_regions = get_enclosing_region(self.func_ir, self.blk_start) + if DEBUG_OPENMP >= 1: + print("teams enclosing_regions:", enclosing_regions) + if not enclosing_regions: + return + + for enclosing_region in enclosing_regions[::-1]: + if not self.get_directive_match(enclosing_region.tags, "DIR.OMP.TARGET"): + continue + + nt_tag = self.get_clauses_by_name( + enclosing_region.tags, "QUAL.OMP.NUM_TEAMS" + ) + assert len(nt_tag) == 1 + cur_num_team_clauses = self.get_clauses_by_name( + clauses, "QUAL.OMP.NUM_TEAMS", remove_from_orig=True + ) + if len(cur_num_team_clauses) >= 1: + nt_tag[-1].arg = cur_num_team_clauses[-1].arg + else: + nt_tag[-1].arg = 0 + + nt_tag = self.get_clauses_by_name( + enclosing_region.tags, "QUAL.OMP.THREAD_LIMIT" + ) + assert len(nt_tag) == 1 + cur_num_team_clauses = self.get_clauses_by_name( + clauses, "QUAL.OMP.THREAD_LIMIT", remove_from_orig=True + ) + if len(cur_num_team_clauses) >= 1: + nt_tag[-1].arg = cur_num_team_clauses[-1].arg + else: + nt_tag[-1].arg = 0 + + return + + def check_distribute_nesting(self, dir_tag): + if "DISTRIBUTE" in dir_tag and "TEAMS" not in dir_tag: + enclosing_regions = get_enclosing_region(self.func_ir, self.blk_start) + if ( + len(enclosing_regions) < 1 + or "TEAMS" not in enclosing_regions[-1].tags[0].name + ): + raise NotImplementedError( + "DISTRIBUTE must be nested under or combined with TEAMS." 
+ ) + + def teams_directive(self, args): + if DEBUG_OPENMP >= 1: + print( + "visit teams_directive", args, type(args), self.blk_start, self.blk_end + ) + start_tags = [openmp_tag("DIR.OMP.TEAMS")] + end_tags = [openmp_tag("DIR.OMP.END.TEAMS")] + clauses = self.some_data_clause_directive(args, start_tags, end_tags, 1) + + self.teams_back_prop(clauses) + + def target_directive(self, args): + if sys.platform.startswith("darwin"): + print("ERROR: OpenMP target offloading is unavailable on Darwin") + sys.exit(-1) + self.some_target_directive(args, "TARGET", 1) + + def target_teams_directive(self, args): + self.some_target_directive(args, "TARGET.TEAMS", 2) + + def target_teams_distribute_directive(self, args): + self.some_target_directive(args, "TARGET.TEAMS.DISTRIBUTE", 3, has_loop=True) + + def target_loop_directive(self, args): + self.some_target_directive( + args, "TARGET.TEAMS.DISTRIBUTE.PARALLEL.LOOP", 2, has_loop=True + ) + + def target_teams_loop_directive(self, args): + self.some_target_directive( + args, "TARGET.TEAMS.DISTRIBUTE.PARALLEL.LOOP", 3, has_loop=True + ) + + def target_teams_distribute_parallel_for_directive(self, args): + self.some_target_directive( + args, "TARGET.TEAMS.DISTRIBUTE.PARALLEL.LOOP", 5, has_loop=True + ) + + def target_teams_distribute_parallel_for_simd_directive(self, args): + # Intentionally dropping "SIMD" from string as that typically isn't implemented on GPU. + self.some_target_directive( + args, "TARGET.TEAMS.DISTRIBUTE.PARALLEL.LOOP", 6, has_loop=True + ) + + def get_clauses_by_name(self, clauses, names, remove_from_orig=False): + if not isinstance(names, list): + names = [names] + + ret = list(filter(lambda x: x.name in names, clauses)) + if remove_from_orig: + clauses[:] = list(filter(lambda x: x.name not in names, clauses)) + return ret + + def get_clauses_by_start(self, clauses, names, remove_from_orig=False): + if not isinstance(names, list): + names = [names] + ret = list( + filter(lambda x: any([x.name.startswith(y) for y in names]), clauses) + ) + if remove_from_orig: + clauses[:] = list( + filter( + lambda x: any([not x.name.startswith(y) for y in names]), clauses + ) + ) + return ret + + def get_clauses_if_contains(self, clauses, names, remove_from_orig=False): + if not isinstance(names, list): + names = [names] + ret = list(filter(lambda x: any([y in x.name for y in names]), clauses)) + if remove_from_orig: + clauses[:] = list( + filter(lambda x: any([not y in x.name for y in names]), clauses) + ) + return ret + + def get_directive_if_contains(self, tags, name): + dir = [x for x in tags if x.name.startswith("DIR")] + assert len(dir) == 1, "Expected one directive tag" + ret = [x for x in dir if name in x.name] + return ret + + def get_directive_match(self, tags, name): + dir = [x for x in tags if x.name.startswith("DIR")] + assert len(dir) == 1, "Expected one directive tag" + ret = [x for x in dir if name == x.name] + return ret + + def target_enter_data_directive(self, args): + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit target_enter_data_directive", args, type(args)) + + clauses, _ = self.flatten(args[3:], sblk) + or_start = openmp_region_start( + [openmp_tag("DIR.OMP.TARGET.ENTER.DATA")] + clauses, 0, self.loc + ) + or_end = openmp_region_end( + or_start, [openmp_tag("DIR.OMP.END.TARGET.ENTER.DATA")], self.loc + ) + sblk.body = [or_start] + [or_end] + sblk.body[:] + + def target_exit_data_directive(self, args): + sblk = self.blocks[self.blk_start] + eblk = 
self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit target_exit_data_directive", args, type(args)) + + clauses, _ = self.flatten(args[3:], sblk) + or_start = openmp_region_start( + [openmp_tag("DIR.OMP.TARGET.EXIT.DATA")] + clauses, 0, self.loc + ) + or_end = openmp_region_end( + or_start, [openmp_tag("DIR.OMP.END.TARGET.EXIT.DATA")], self.loc + ) + sblk.body = [or_start] + [or_end] + sblk.body[:] + + def teams_distribute_parallel_for_simd_directive(self, args): + self.some_distribute_directive( + args, "TEAMS.DISTRIBUTE.PARALLEL.LOOP.SIMD", 5, has_loop=True + ) + + def teams_distribute_parallel_for_directive(self, args): + self.some_distribute_directive( + args, "TEAMS.DISTRIBUTE.PARALLEL.LOOP", 4, has_loop=True + ) + + def teams_distribute_directive(self, args): + self.some_distribute_directive(args, "TEAMS.DISTRIBUTE", 2, has_loop=True) + + def teams_distribute_simd_directive(self, args): + self.some_distribute_directive(args, "TEAMS.DISTRIBUTE.SIMD", 3, has_loop=True) + + def teams_loop_directive(self, args): + self.some_distribute_directive( + args, "TEAMS.DISTRIBUTE.PARALLEL.LOOP", 2, has_loop=True + ) + + def loop_directive(self, args): + # TODO Add error checking that a clause that the parser accepts if we find that + # loop can even take clauses, which we're not sure that it can. + enclosing_regions = get_enclosing_region(self.func_ir, self.blk_start) + if not enclosing_regions or len(enclosing_regions) < 1: + self.some_for_directive( + args, "DIR.OMP.PARALLEL.LOOP", "DIR.OMP.END.PARALLEL.LOOP", 1, True + ) + else: + if "DISTRIBUTE" in enclosing_regions[-1].tags[0].name: + self.some_distribute_directive(args, "PARALLEL.LOOP", 1, has_loop=True) + elif "TEAMS" in enclosing_regions[-1].tags[0].name: + self.some_distribute_directive( + args, "DISTRIBUTE.PARALLEL.LOOP", 1, has_loop=True + ) + else: + if "TARGET" in enclosing_regions[-1].tags[0].name: + self.some_distribute_directive( + args, "TEAMS.DISTRIBUTE.PARALLEL.LOOP", 1, has_loop=True + ) + else: + self.some_for_directive( + args, + "DIR.OMP.PARALLEL.LOOP", + "DIR.OMP.END.PARALLEL.LOOP", + 1, + True, + ) + + def distribute_directive(self, args): + self.some_distribute_directive(args, "DISTRIBUTE", 1, has_loop=True) + + def distribute_simd_directive(self, args): + self.some_distribute_directive(args, "DISTRIBUTE.SIMD", 2, has_loop=True) + + def distribute_parallel_for_directive(self, args): + self.some_distribute_directive( + args, "DISTRIBUTE.PARALLEL.LOOP", 3, has_loop=True + ) + + def distribute_parallel_for_simd_directive(self, args): + self.some_distribute_directive( + args, "DISTRIBUTE.PARALLEL.LOOP.SIMD", 4, has_loop=True + ) + + def some_distribute_directive(self, args, dir_tag, lexer_count, has_loop=False): + if DEBUG_OPENMP >= 1: + print( + "visit some_distribute_directive", + args, + type(args), + self.blk_start, + self.blk_end, + ) + + self.check_distribute_nesting(dir_tag) + + target_num = OpenmpVisitor.target_num + OpenmpVisitor.target_num += 1 + + dir_start_tag = "DIR.OMP." + dir_tag + dir_end_tag = "DIR.OMP.END." + dir_tag + start_tags = [openmp_tag(dir_start_tag, target_num)] + end_tags = [openmp_tag(dir_end_tag, target_num)] + + sblk = self.blocks[self.blk_start] + clauses, _ = self.flatten(args[lexer_count:], sblk) + + if "TEAMS" in dir_tag: + # NUM_TEAMS, THREAD_LIMIT are not in clauses, set them to 0 to + # use runtime defaults in teams, thread launching. 
+ if len(self.get_clauses_by_name(clauses, "QUAL.OMP.NUM_TEAMS")) == 0: + start_tags.append(openmp_tag("QUAL.OMP.NUM_TEAMS", 0)) + if len(self.get_clauses_by_name(clauses, "QUAL.OMP.THREAD_LIMIT")) == 0: + start_tags.append(openmp_tag("QUAL.OMP.THREAD_LIMIT", 0)) + self.teams_back_prop(clauses) + elif "PARALLEL" in dir_tag: + self.parallel_back_prop(clauses) + + if DEBUG_OPENMP >= 1: + for clause in clauses: + print("target clause:", clause) + + self.some_data_clause_directive( + clauses, start_tags, end_tags, 0, has_loop=has_loop, for_target=False + ) + + def some_target_directive(self, args, dir_tag, lexer_count, has_loop=False): + if DEBUG_OPENMP >= 1: + print( + "visit some_target_directive", + args, + type(args), + self.blk_start, + self.blk_end, + ) + + self.check_distribute_nesting(dir_tag) + + target_num = OpenmpVisitor.target_num + OpenmpVisitor.target_num += 1 + + dir_start_tag = "DIR.OMP." + dir_tag + dir_end_tag = "DIR.OMP.END." + dir_tag + start_tags = [openmp_tag(dir_start_tag, target_num)] + end_tags = [openmp_tag(dir_end_tag, target_num)] + + sblk = self.blocks[self.blk_start] + clauses, _ = self.flatten(args[lexer_count:], sblk) + + if "TEAMS" in dir_tag: + # When NUM_TEAMS, THREAD_LIMIT are not in clauses, set them to 0 to + # use runtime defaults in teams, thread launching, otherwise use + # existing clauses. + clause_num_teams = self.get_clauses_by_name(clauses, "QUAL.OMP.NUM_TEAMS") + if not clause_num_teams: + start_tags.append(openmp_tag("QUAL.OMP.NUM_TEAMS", 0)) + + # Use the THREAD_LIMIT clause value if it exists, regardless of a + # combined PARALLEL (see + # https://www.openmp.org/spec-html/5.0/openmpse15.html) since + # THREAD_LIMIT takes precedence. If clause does not exist, set to 0 + # or to NUM_THREADS of the combined PARALLEL (if this exists). + clause_thread_limit = self.get_clauses_by_name( + clauses, "QUAL.OMP.THREAD_LIMIT" + ) + if not clause_thread_limit: + thread_limit = 0 + if "PARALLEL" in dir_tag: + clause_num_threads = self.get_clauses_by_name( + clauses, "QUAL.OMP.NUM_THREADS" + ) + if clause_num_threads: + assert len(clause_num_threads) == 1, ( + "Expected single NUM_THREADS clause" + ) + thread_limit = clause_num_threads[0].arg + start_tags.append(openmp_tag("QUAL.OMP.THREAD_LIMIT", thread_limit)) + elif "PARALLEL" in dir_tag: + # PARALLEL in the directive (without TEAMS), set THREAD_LIMIT to NUM_THREADS clause + # (if NUM_THREADS exists), or 0 (if NUM_THREADS does not exist) + num_threads = 0 + clause_num_threads = self.get_clauses_by_name( + clauses, "QUAL.OMP.NUM_THREADS" + ) + if clause_num_threads: + assert len(clause_num_threads) == 1, ( + "Expected single NUM_THREADS clause" + ) + num_threads = clause_num_threads[0].arg + + # Replace existing THREAD_LIMIT clause. + clause_thread_limit = self.get_clauses_by_name( + clauses, "QUAL.OMP.THREAD_LIMIT", remove_from_orig=True + ) + clauses.append(openmp_tag("QUAL.OMP.THREAD_LIMIT", num_threads)) + else: + # Neither TEAMS or PARALLEL in directive, set teams, threads to 1. 
+ start_tags.append(openmp_tag("QUAL.OMP.NUM_TEAMS", 1)) + start_tags.append(openmp_tag("QUAL.OMP.THREAD_LIMIT", 1)) + + if DEBUG_OPENMP >= 1: + for clause in clauses: + print("target clause:", clause) + + self.some_data_clause_directive( + clauses, start_tags, end_tags, 0, has_loop=has_loop, for_target=True + ) + # self.some_data_clause_directive(args, start_tags, end_tags, lexer_count, has_loop=has_loop) + + def add_to_returns(self, stmts): + for blk in self.blocks.values(): + if isinstance(blk.body[-1], ir.Return): + blk.body = blk.body[:-1] + stmts + [blk.body[-1]] + + def add_block_in_order(self, new_block, insert_after_block): + """Insert a new block after the specified block while maintaining topological order""" + new_blocks = {} + # Copy blocks up to and including insert_after_block + for label, block in self.blocks.items(): + new_blocks[label] = block + if label == insert_after_block: + # Insert new block right after + # We add a fractional to make sure the block is sorted right + # after the insert_after_block and before its successor. + # TODO: Avoid this fractional addition. + new_block_num = label + 0.1 + new_blocks[new_block_num] = new_block + # Copy remaining blocks + for label, block in self.blocks.items(): + if label > insert_after_block: + new_blocks[label] = block + # new_blocks = flatten_labels(new_blocks) + self.blocks.clear() + self.blocks.update(new_blocks) + return new_block_num + + def some_data_clause_directive( + self, + args, + start_tags, + end_tags, + lexer_count, + has_loop=False, + for_target=False, + for_task=False, + ): + if DEBUG_OPENMP >= 1: + print( + "visit some_data_clause_directive", + args, + type(args), + self.blk_start, + self.blk_end, + ) + assert not (for_target and for_task) + + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + scope = sblk.scope + + if DEBUG_OPENMP >= 1: + for clause in args[lexer_count:]: + print("pre clause:", clause) + clauses, default_shared = self.flatten(args[lexer_count:], sblk) + if DEBUG_OPENMP >= 1: + for clause in clauses: + print("final clause:", clause) + + before_start = [] + after_start = [] + for_before_start = [] + for_after_start = [] + + # Get a dict mapping variables explicitly mentioned in the data clauses above to their openmp_tag. + vars_in_explicit_clauses, explicit_privates, non_user_explicits = ( + self.get_explicit_vars(clauses) + ) + if DEBUG_OPENMP >= 1: + print( + "vars_in_explicit_clauses:", + sorted(vars_in_explicit_clauses), + type(vars_in_explicit_clauses), + ) + for v in clauses: + print("vars_in_explicit clauses first:", v) + + if has_loop: + prepare_out = self.prepare_for_directive( + clauses, + vars_in_explicit_clauses, + for_before_start, + for_after_start, + start_tags, + end_tags, + scope, + ) + vars_in_explicit_clauses, explicit_privates, non_user_explicits = ( + self.get_explicit_vars(clauses) + ) + ( + found_loop, + blocks_for_io, + blocks_in_region, + entry_pred, + exit_block, + inst, + size_var, + step_var, + latest_index, + loop_index, + ) = prepare_out + assert found_loop + else: + blocks_for_io = self.body_blocks + blocks_in_region = get_blocks_between_start_end( + self.blocks, self.blk_start, self.blk_end + ) + entry_pred = sblk + exit_block = eblk + + # Do an analysis to get variable use information coming into and out of the region. 
+ inputs_to_region, def_but_live_out, private_to_region, live_map = ( + self.find_io_vars(blocks_for_io) + ) + live_out_copy = copy.copy(def_but_live_out) + + if DEBUG_OPENMP >= 1: + print("inputs_to_region:", sorted(inputs_to_region)) + print("def_but_live_out:", sorted(def_but_live_out)) + print("private_to_region:", sorted(private_to_region)) + for v in clauses: + print("clause after find_io_vars:", v) + + # Remove variables the user explicitly added to a clause from the auto-determined variables. + # This will also treat SSA forms of vars the same as their explicit Python var clauses. + self.remove_explicit_from_io_vars( + inputs_to_region, + def_but_live_out, + private_to_region, + vars_in_explicit_clauses, + clauses, + non_user_explicits, + scope, + self.loc, + ) + + if DEBUG_OPENMP >= 1: + for v in clauses: + print("clause after remove_explicit_from_io_vars:", v) + + if DEBUG_OPENMP >= 1: + for k, v in vars_in_explicit_clauses.items(): + print("vars_in_explicit before:", k, v) + for v in clauses: + print("vars_in_explicit clauses before:", v) + for k, v in non_user_explicits.items(): + print("non_user_explicits before:", k, v) + + if DEBUG_OPENMP >= 1: + print("inputs_to_region after remove_explicit:", sorted(inputs_to_region)) + print("def_but_live_out after remove_explicit:", sorted(def_but_live_out)) + print("private_to_region after remove_explicit:", sorted(private_to_region)) + + if not default_shared and ( + has_user_defined_var(inputs_to_region) + or has_user_defined_var(def_but_live_out) + or has_user_defined_var(private_to_region) + ): + user_defined_inputs = get_user_defined_var(inputs_to_region) + user_defined_def_live = get_user_defined_var(def_but_live_out) + user_defined_private = get_user_defined_var(private_to_region) + if DEBUG_OPENMP >= 1: + print("inputs users:", sorted(user_defined_inputs)) + print("def users:", sorted(user_defined_def_live)) + print("private users:", sorted(user_defined_private)) + raise UnspecifiedVarInDefaultNone( + "Variables with no data env clause in OpenMP region: " + + str( + user_defined_inputs.union(user_defined_def_live).union( + user_defined_private + ) + ) + ) + + if for_target: + self.make_implicit_explicit_target( + scope, + vars_in_explicit_clauses, + clauses, + True, + inputs_to_region, + def_but_live_out, + private_to_region, + ) + elif for_task: + self.make_implicit_explicit( + scope, + vars_in_explicit_clauses, + clauses, + True, + inputs_to_region, + def_but_live_out, + private_to_region, + for_task=get_enclosing_region(self.func_ir, self.blk_start), + ) + else: + self.make_implicit_explicit( + scope, + vars_in_explicit_clauses, + clauses, + True, + inputs_to_region, + def_but_live_out, + private_to_region, + ) + if DEBUG_OPENMP >= 1: + for k, v in vars_in_explicit_clauses.items(): + print("vars_in_explicit after:", k, v) + for v in clauses: + print("vars_in_explicit clauses after:", v) + vars_in_explicit_clauses, explicit_privates, non_user_explicits = ( + self.get_explicit_vars(clauses) + ) + if DEBUG_OPENMP >= 1: + print("post get_explicit_vars:", explicit_privates) + for k, v in vars_in_explicit_clauses.items(): + print("vars_in_explicit post:", k, v) + if DEBUG_OPENMP >= 1: + print("blocks_in_region:", blocks_in_region) + + self.make_consts_unliteral_for_privates(explicit_privates, self.blocks) + + # Returns a dict of private clause variables and their potentially SSA form at the end of the region. 
+ clause_privates = self.get_clause_privates( + clauses, live_out_copy, scope, self.loc + ) + + if DEBUG_OPENMP >= 1: + print("clause_privates:", sorted(clause_privates), type(clause_privates)) + print("inputs_to_region:", sorted(inputs_to_region)) + print("def_but_live_out:", sorted(def_but_live_out)) + print("live_out_copy:", sorted(live_out_copy)) + print("private_to_region:", sorted(private_to_region)) + + keep_alive = [] + tags_for_enclosing = self.add_explicits_to_start( + scope, vars_in_explicit_clauses, clauses, True, start_tags, keep_alive + ) + add_tags_to_enclosing(self.func_ir, self.blk_start, tags_for_enclosing) + + # or_start = openmp_region_start([openmp_tag("DIR.OMP.TARGET", target_num)] + clauses, 0, self.loc) + # or_end = openmp_region_end(or_start, [openmp_tag("DIR.OMP.END.TARGET", target_num)], self.loc) + # new_header_block_num = max(self.blocks.keys()) + 1 + + firstprivate_dead_after = list( + filter( + lambda x: x.name == "QUAL.OMP.FIRSTPRIVATE" + and x.arg not in live_map[self.blk_end], + start_tags, + ) + ) + + or_start = openmp_region_start( + start_tags, 0, self.loc, firstprivate_dead_after=firstprivate_dead_after + ) + or_end = openmp_region_end(or_start, end_tags, self.loc) + + if DEBUG_OPENMP >= 1: + for x in keep_alive: + print("keep_alive:", x) + for x in firstprivate_dead_after: + print("firstprivate_dead_after:", x) + + # Adding the openmp tags in topo order to avoid problems with code + # generation and with_lifting legalization. + # TODO: we should remove the requirement to process in topo order. There + # is state depending on topo order processing. + if has_loop: + new_header_block = ir.Block(scope, self.loc) + new_header_block.body = ( + [or_start] + after_start + for_after_start + [entry_pred.body[-1]] + ) + new_block_num = self.add_block_in_order(new_header_block, self.blk_start) + entry_pred.body = ( + entry_pred.body[:-1] + + before_start + + for_before_start + + [ir.Jump(new_block_num, self.loc)] + ) + + if for_task: + exit_block.body = [or_end] + exit_block.body + self.add_to_returns(keep_alive) + else: + exit_block.body = [or_end] + keep_alive + exit_block.body + else: + new_header_block = ir.Block(scope, self.loc) + new_header_block.body = [or_start] + after_start + sblk.body[:] + new_header_block_num = self.add_block_in_order( + new_header_block, self.blk_start + ) + sblk.body = before_start + [ir.Jump(new_header_block_num, self.loc)] + + # NOTE: or_start could also be inlined for correct codegen as + # follows. Favoring the add_block_in_order method for consistency. 
+ # sblk.body = before_start + [or_start] + after_start + sblk.body[:] + + if for_task: + eblk.body = [or_end] + eblk.body[:] + self.add_to_returns(keep_alive) + else: + eblk.body = [or_end] + keep_alive + eblk.body[:] + + add_enclosing_region(self.func_ir, self.body_blocks, or_start) + return clauses + + def target_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit target_clause", args, type(args), args[0]) + if isinstance(args[0], list): + print(args[0][0]) + (val,) = args + if isinstance(val, openmp_tag): + return [val] + elif isinstance(val, list): + return val + elif val == "nowait": + return openmp_tag("QUAL.OMP.NOWAIT") + else: + return val + # return args[0] + + def target_teams_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit target_teams_clause", args, type(args), args[0]) + if isinstance(args[0], list): + print(args[0][0]) + return args[0] + + def target_teams_distribute_parallel_for_simd_clause(self, args): + if DEBUG_OPENMP >= 1: + print( + "visit target_teams_distribute_parallel_for_simd_clause", + args, + type(args), + args[0], + ) + if isinstance(args[0], list): + print(args[0][0]) + return args[0] + + def teams_distribute_parallel_for_simd_clause(self, args): + if DEBUG_OPENMP >= 1: + print( + "visit teams_distribute_parallel_for_simd_clause", + args, + type(args), + args[0], + ) + if isinstance(args[0], list): + print(args[0][0]) + return args[0] + + def teams_distribute_parallel_for_clause(self, args): + if DEBUG_OPENMP >= 1: + print( + "visit teams_distribute_parallel_for_clause", args, type(args), args[0] + ) + if isinstance(args[0], list): + print(args[0][0]) + return args[0] + + def distribute_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit distribute_clause", args, type(args), args[0]) + if isinstance(args[0], list): + print(args[0][0]) + return args[0] + + def teams_distribute_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit teams_distribute_clause", args, type(args), args[0]) + if isinstance(args[0], list): + print(args[0][0]) + return args[0] + + def teams_distribute_simd_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit teams_distribute_simd_clause", args, type(args), args[0]) + if isinstance(args[0], list): + print(args[0][0]) + return args[0] + + def distribute_parallel_for_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit distribute_parallel_for_clause", args, type(args), args[0]) + if isinstance(args[0], list): + print(args[0][0]) + return args[0] + + def target_teams_distribute_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit target_teams_distribute_clause", args, type(args), args[0]) + if isinstance(args[0], list): + print(args[0][0]) + return args[0] + + def target_teams_distribute_parallel_for_clause(self, args): + if DEBUG_OPENMP >= 1: + print( + "visit target_teams_distribute_parallel_for_clause", + args, + type(args), + args[0], + ) + if isinstance(args[0], list): + print(args[0][0]) + return args[0] + + # Don't need a rule for target_update_construct. 
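+
+    # Sketch of a directive string the update visitors below handle (variable
+    # names are illustrative):
+    #   "target update to(a) from(b[0:n])"
+    # Each motion clause expands to one QUAL.OMP.MAP.TO / QUAL.OMP.MAP.FROM
+    # tag per listed variable or array section.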
+ + def target_update_directive(self, args): + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit target_update_directive", args, type(args)) + clauses, _ = self.flatten(args[2:], sblk) + or_start = openmp_region_start( + [openmp_tag("DIR.OMP.TARGET.UPDATE")] + clauses, 0, self.loc + ) + or_end = openmp_region_end( + or_start, [openmp_tag("DIR.OMP.END.TARGET.UPDATE")], self.loc + ) + sblk.body = [or_start] + [or_end] + sblk.body[:] + + def target_update_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit target_update_clause", args, type(args), args[0]) + # return args[0] + (val,) = args + if isinstance(val, openmp_tag): + return [val] + elif isinstance(val, list): + return val + else: + return val + + def motion_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit motion_clause", args, type(args)) + assert args[0] in ["to", "from"] + map_type = args[0].upper() + var_list = args[1] + assert len(args) == 2 + ret = [] + for var in var_list: + ret.append(openmp_tag("QUAL.OMP.MAP." + map_type, var)) + return ret + + def variable_array_section_list(self, args): + if DEBUG_OPENMP >= 1: + print("visit variable_array_section_list", args, type(args)) + if len(args) == 1: + return args + else: + args[0].append(args[1]) + return args[0] + + """ + def array_section(self, args): + if DEBUG_OPENMP >= 1: + print("visit array_section", args, type(args)) + return args + + def array_section_subscript(self, args): + if DEBUG_OPENMP >= 1: + print("visit array_section_subscript", args, type(args)) + return args + """ + + # Don't need a rule for TARGET. + # Don't need a rule for single_construct. + + def single_directive(self, args): + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit single_directive", args, type(args)) + or_start = openmp_region_start([openmp_tag("DIR.OMP.SINGLE")], 0, self.loc) + or_start.requires_acquire_release() + or_end = openmp_region_end( + or_start, [openmp_tag("DIR.OMP.END.SINGLE")], self.loc + ) + sblk.body = [or_start] + sblk.body[:] + eblk.body = [or_end] + eblk.body[:] + + def single_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit single_clause", args, type(args), args[0]) + return args[0] + + # Don't need a rule for unique_single_clause. + # def NOWAIT(self, args): + # return "nowait" + # Don't need a rule for NOWAIT. + # Don't need a rule for master_construct. + + def master_directive(self, args): + raise NotImplementedError("Master directive currently unsupported.") + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit master_directive", args, type(args)) + + # Don't need a rule for simd_construct. + + def simd_directive(self, args): + raise NotImplementedError("Simd directive currently unsupported.") + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit simd_directive", args, type(args)) + + # Don't need a rule for SIMD. + + def simd_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit simd_clause", args, type(args), args[0]) + return args[0] + + def aligned_clause(self, args): + raise NotImplementedError("Aligned clause currently unsupported.") + if DEBUG_OPENMP >= 1: + print("visit aligned_clause", args, type(args)) + + # Don't need a rule for declare_simd_construct. 
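+
+    # Note on the two emission patterns used by the directive visitors in this
+    # class: standalone directives (e.g. target update above) place both the
+    # region-start and region-end markers at the head of the start block,
+    # while block directives (e.g. single above) put the start marker in the
+    # start block and the end marker in the end block so the region spans the
+    # body of the with statement.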
+ + def declare_simd_directive_seq(self, args): + if DEBUG_OPENMP >= 1: + print("visit declare_simd_directive_seq", args, type(args), args[0]) + return args[0] + + def declare_simd_directive(self, args): + raise NotImplementedError("Declare simd directive currently unsupported.") + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit declare_simd_directive", args, type(args)) + + def declare_simd_clause(self, args): + raise NotImplementedError("Declare simd clauses currently unsupported.") + if DEBUG_OPENMP >= 1: + print("visit declare_simd_clause", args, type(args)) + + # Don't need a rule for ALIGNED. + + def inbranch_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit inbranch_clause", args, type(args), args[0]) + return args[0] + + # Don't need a rule for INBRANCH. + # Don't need a rule for NOTINBRANCH. + + def uniform_clause(self, args): + raise NotImplementedError("Uniform clause currently unsupported.") + if DEBUG_OPENMP >= 1: + print("visit uniform_clause", args, type(args)) + + # Don't need a rule for UNIFORM. + + def collapse_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit collapse_clause", args, type(args)) + return openmp_tag("QUAL.OMP.COLLAPSE", args[1]) + + # Don't need a rule for COLLAPSE. + # Don't need a rule for task_construct. + # Don't need a rule for TASK. + + def task_directive(self, args): + if DEBUG_OPENMP >= 1: + print("visit task_directive", args, type(args)) + + start_tags = [openmp_tag("DIR.OMP.TASK")] + end_tags = [openmp_tag("DIR.OMP.END.TASK")] + self.some_data_clause_directive(args, start_tags, end_tags, 1, for_task=True) + + def task_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit task_clause", args, type(args), args[0]) + return args[0] + + def unique_task_clause(self, args): + raise NotImplementedError("Task-related clauses currently unsupported.") + if DEBUG_OPENMP >= 1: + print("visit unique_task_clause", args, type(args)) + + # Don't need a rule for DEPEND. + # Don't need a rule for FINAL. + # Don't need a rule for UNTIED. + # Don't need a rule for MERGEABLE. + + def dependence_type(self, args): + if DEBUG_OPENMP >= 1: + print("visit dependence_type", args, type(args), args[0]) + return args[0] + + # Don't need a rule for IN. + # Don't need a rule for OUT. + # Don't need a rule for INOUT. + + def data_default_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit data_default_clause", args, type(args), args[0]) + return args[0] + + def data_sharing_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit data_sharing_clause", args, type(args), args[0]) + return args[0] + + def data_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit data_clause", args, type(args), args[0]) + return args[0] + + def private_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit private_clause", args, type(args), args[0]) + (_, var_list) = args + ret = [] + for var in var_list: + ret.append(openmp_tag("QUAL.OMP.PRIVATE", var)) + return ret + + # Don't need a rule for PRIVATE. + + def copyprivate_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit copyprivate_clause", args, type(args), args[0]) + (_, var_list) = args + ret = [] + for var in var_list: + ret.append(openmp_tag("QUAL.OMP.COPYPRIVATE", var)) + return ret + + # Don't need a rule for COPYPRIVATE. 
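+
+    # The data clause visitors (private, copyprivate, firstprivate, shared,
+    # etc.) each expand their variable list into one openmp_tag per variable;
+    # e.g. (sketch) "firstprivate(x, y)" yields
+    # [openmp_tag("QUAL.OMP.FIRSTPRIVATE", "x"),
+    #  openmp_tag("QUAL.OMP.FIRSTPRIVATE", "y")].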
+ + def firstprivate_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit firstprivate_clause", args, type(args), args[0]) + (_, var_list) = args + ret = [] + for var in var_list: + ret.append(openmp_tag("QUAL.OMP.FIRSTPRIVATE", var)) + return ret + + # Don't need a rule for FIRSTPRIVATE. + + def lastprivate_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit lastprivate_clause", args, type(args), args[0]) + (_, var_list) = args + ret = [] + for var in var_list: + ret.append(openmp_tag("QUAL.OMP.LASTPRIVATE", var)) + return ret + + # Don't need a rule for LASTPRIVATE. + + def shared_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit shared_clause", args, type(args), args[0]) + (_, var_list) = args + ret = [] + for var in var_list: + ret.append(openmp_tag("QUAL.OMP.SHARED", var)) + return ret + + # Don't need a rule for SHARED. + + def copyin_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit copyin_clause", args, type(args), args[0]) + (_, var_list) = args + ret = [] + for var in var_list: + ret.append(openmp_tag("QUAL.OMP.COPYIN", var)) + return ret + + # Don't need a rule for COPYIN. + # Don't need a rule for REDUCTION. + + def reduction_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit reduction_clause", args, type(args), args[0]) + + (_, red_op, red_list) = args + ret = [] + for shared in red_list: + ret.append(openmp_tag("QUAL.OMP.REDUCTION." + red_op, shared)) + return ret + + def default_shared_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit default_shared_clause", args, type(args)) + return default_shared_val(True) + + def default_none_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit default_none", args, type(args)) + return default_shared_val(False) + + def const_num_or_var(self, args): + if DEBUG_OPENMP >= 1: + print("visit const_num_or_var", args, type(args)) + return args[0] + + # Don't need a rule for parallel_construct. + + def parallel_back_prop(self, clauses): + enclosing_regions = get_enclosing_region(self.func_ir, self.blk_start) + if DEBUG_OPENMP >= 1: + print("parallel enclosing_regions:", enclosing_regions) + if not enclosing_regions: + return + + for enclosing_region in enclosing_regions[::-1]: + # If there is TEAMS in the enclosing region then THREAD_LIMIT is + # already set, do nothing. + if self.get_directive_if_contains(enclosing_region.tags, "TEAMS"): + return + if not self.get_directive_if_contains(enclosing_region.tags, "TARGET"): + continue + + # Set to 0 means "don't care", use implementation specific number of threads. + num_threads = 0 + num_threads_clause = self.get_clauses_by_name( + clauses, "QUAL.OMP.NUM_THREADS" + ) + if num_threads_clause: + assert len(num_threads_clause) == 1, ( + "Expected num_threads clause defined once" + ) + num_threads = num_threads_clause[0].arg + nt_tag = self.get_clauses_by_name( + enclosing_region.tags, "QUAL.OMP.THREAD_LIMIT" + ) + assert len(nt_tag) > 0 + + # If THREAD_LIMIT is less than requested NUM_THREADS or 1, + # increase it. This is still valid if THREAD_LIMIT is 0, since this + # means there was a parallel region before that did not specify + # NUM_THREADS so we can set to the concrete value of the sibling + # parallel region with the max value of NUM_THREADS. 
+ if num_threads > nt_tag[-1].arg or nt_tag[-1].arg == 1: + nt_tag[-1].arg = num_threads + return + + def parallel_directive(self, args): + if DEBUG_OPENMP >= 1: + print("visit parallel_directive", args, type(args)) + + start_tags = [openmp_tag("DIR.OMP.PARALLEL")] + end_tags = [openmp_tag("DIR.OMP.END.PARALLEL")] + clauses = self.some_data_clause_directive(args, start_tags, end_tags, 1) + + # sblk = self.blocks[self.blk_start] + # eblk = self.blocks[self.blk_end] + # scope = sblk.scope + + # before_start = [] + # after_start = [] + # clauses, default_shared = self.flatten(args[1:], sblk) + + if len(list(filter(lambda x: x.name == "QUAL.OMP.NUM_THREADS", clauses))) > 1: + raise MultipleNumThreadsClauses( + f"Multiple num_threads clauses near line {self.loc} is not allowed in an OpenMP parallel region." + ) + + if DEBUG_OPENMP >= 1: + for clause in clauses: + print("final clause:", clause) + + # ---- Back propagate THREAD_LIMIT to enclosed target region. ---- + self.parallel_back_prop(clauses) + + def parallel_clause(self, args): + (val,) = args + if DEBUG_OPENMP >= 1: + print("visit parallel_clause", args, type(args), args[0]) + return val + + def unique_parallel_clause(self, args): + (val,) = args + if DEBUG_OPENMP >= 1: + print("visit unique_parallel_clause", args, type(args), args[0]) + assert isinstance(val, openmp_tag) + return val + + def teams_clause(self, args): + (val,) = args + if DEBUG_OPENMP >= 1: + print("visit teams_clause", args, type(args), args[0]) + return val + + def num_teams_clause(self, args): + (_, num_teams) = args + if DEBUG_OPENMP >= 1: + print("visit num_teams_clause", args, type(args)) + + return openmp_tag("QUAL.OMP.NUM_TEAMS", num_teams, load=True) + + def thread_limit_clause(self, args): + (_, thread_limit) = args + if DEBUG_OPENMP >= 1: + print("visit thread_limit_clause", args, type(args)) + + return openmp_tag("QUAL.OMP.THREAD_LIMIT", thread_limit, load=True) + + def if_clause(self, args): + (_, if_val) = args + if DEBUG_OPENMP >= 1: + print("visit if_clause", args, type(args)) + + return openmp_tag("QUAL.OMP.IF", if_val, load=True) + + # Don't need a rule for IF. + + def num_threads_clause(self, args): + (_, num_threads) = args + if DEBUG_OPENMP >= 1: + print("visit num_threads_clause", args, type(args)) + + return openmp_tag("QUAL.OMP.NUM_THREADS", num_threads, load=True) + + # Don't need a rule for NUM_THREADS. + # Don't need a rule for PARALLEL. + # Don't need a rule for FOR. + # Don't need a rule for parallel_for_construct. + + def parallel_for_directive(self, args): + return self.some_for_directive( + args, "DIR.OMP.PARALLEL.LOOP", "DIR.OMP.END.PARALLEL.LOOP", 2, True + ) + + def parallel_for_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit parallel_for_clause", args, type(args), args[0]) + return args[0] + + # Don't need a rule for for_construct. + + def for_directive(self, args): + return self.some_for_directive( + args, "DIR.OMP.LOOP", "DIR.OMP.END.LOOP", 1, False + ) + + def for_clause(self, args): + (val,) = args + if DEBUG_OPENMP >= 1: + print("visit for_clause", args, type(args)) + if isinstance(val, openmp_tag): + return [val] + elif isinstance(val, list): + return val + elif val == "nowait": + return openmp_tag("QUAL.OMP.NOWAIT") + + def unique_for_clause(self, args): + (val,) = args + if DEBUG_OPENMP >= 1: + print("visit unique_for_clause", args, type(args)) + if isinstance(val, openmp_tag): + return val + elif val == "ordered": + return openmp_tag("QUAL.OMP.ORDERED", 0) + + # Don't need a rule for LINEAR. 
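+
+    # Example directive strings routed through the loop visitors above (a
+    # sketch; clause values are illustrative):
+    #   "parallel for num_threads(4)" -> DIR.OMP.PARALLEL.LOOP region with a
+    #       QUAL.OMP.NUM_THREADS tag carrying 4
+    #   "for ordered"                 -> DIR.OMP.LOOP region with a
+    #       QUAL.OMP.ORDERED tag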
+ + def linear_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit linear_clause", args, type(args), args[0]) + return args[0] + + """ + Linear_expr not in grammar + def linear_expr(self, args): + (_, var, step) = args + if DEBUG_OPENMP >= 1: + print("visit linear_expr", args, type(args)) + return openmp_tag("QUAL.OMP.LINEAR", [var, step]) + """ + + """ + def ORDERED(self, args): + if DEBUG_OPENMP >= 1: + print("visit ordered", args, type(args)) + return "ordered" + """ + + def sched_no_expr(self, args): + (_, kind) = args + if DEBUG_OPENMP >= 1: + print("visit sched_no_expr", args, type(args)) + if kind == "static": + return openmp_tag("QUAL.OMP.SCHEDULE.STATIC", 0) + elif kind == "dynamic": + return openmp_tag("QUAL.OMP.SCHEDULE.DYNAMIC", 0) + elif kind == "guided": + return openmp_tag("QUAL.OMP.SCHEDULE.GUIDED", 0) + elif kind == "runtime": + return openmp_tag("QUAL.OMP.SCHEDULE.RUNTIME", 0) + + def sched_expr(self, args): + (_, kind, num_or_var) = args + if DEBUG_OPENMP >= 1: + print("visit sched_expr", args, type(args), num_or_var, type(num_or_var)) + if kind == "static": + return openmp_tag("QUAL.OMP.SCHEDULE.STATIC", num_or_var, load=True) + elif kind == "dynamic": + return openmp_tag("QUAL.OMP.SCHEDULE.DYNAMIC", num_or_var, load=True) + elif kind == "guided": + return openmp_tag("QUAL.OMP.SCHEDULE.GUIDED", num_or_var, load=True) + elif kind == "runtime": + return openmp_tag("QUAL.OMP.SCHEDULE.RUNTIME", num_or_var, load=True) + + def SCHEDULE(self, args): + if DEBUG_OPENMP >= 1: + print("visit SCHEDULE", args, type(args)) + return "schedule" + + def schedule_kind(self, args): + (kind,) = args + if DEBUG_OPENMP >= 1: + print("visit schedule_kind", args, type(args)) + return kind + + # Don't need a rule for STATIC. + # Don't need a rule for DYNAMIC. + # Don't need a rule for GUIDED. + # Don't need a rule for RUNTIME. 
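+
+    # Schedule mapping sketch: "schedule(dynamic, 16)" flows through
+    # sched_expr above and becomes
+    # openmp_tag("QUAL.OMP.SCHEDULE.DYNAMIC", 16, load=True), while the
+    # chunk-less "schedule(static)" maps to
+    # openmp_tag("QUAL.OMP.SCHEDULE.STATIC", 0) via sched_no_expr.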
+ + """ + def STATIC(self, args): + if DEBUG_OPENMP >= 1: + print("visit STATIC", args, type(args)) + return "static" + + def DYNAMIC(self, args): + if DEBUG_OPENMP >= 1: + print("visit DYNAMIC", args, type(args)) + return "dynamic" + + def GUIDED(self, args): + if DEBUG_OPENMP >= 1: + print("visit GUIDED", args, type(args)) + return "guided" + + def RUNTIME(self, args): + if DEBUG_OPENMP >= 1: + print("visit RUNTIME", args, type(args)) + return "runtime" + """ + + def COLON(self, args): + if DEBUG_OPENMP >= 1: + print("visit COLON", args, type(args)) + return ":" + + def oslice(self, args): + if DEBUG_OPENMP >= 1: + print("visit oslice", args, type(args)) + start = None + end = None + if args[0] != ":": + start = args[0] + args = args[2:] + else: + args = args[1:] + + if len(args) > 0: + end = args[0] + return slice(start, end) + + def slice_list(self, args): + if DEBUG_OPENMP >= 1: + print("visit slice_list", args, type(args)) + if len(args) == 1: + return args + else: + args[0].append(args[1]) + return args[0] + + def name_slice(self, args): + if DEBUG_OPENMP >= 1: + print("visit name_slice", args, type(args)) + if len(args) == 1 or args[1] is None: + return args[0] + else: + return NameSlice(args[0], args[1:]) + + def var_list(self, args): + if DEBUG_OPENMP >= 1: + print("visit var_list", args, type(args)) + if len(args) == 1: + return args + else: + args[0].append(args[1]) + return args[0] + + def number_list(self, args): + if DEBUG_OPENMP >= 1: + print("visit number_list", args, type(args)) + if len(args) == 1: + return args + else: + args[0].append(args[1]) + return args[0] + + def ompx_attribute(self, args): + if DEBUG_OPENMP >= 1: + print("visit ompx_attribute", args, type(args), args[0]) + (_, attr, number_list) = args + return openmp_tag("QUAL.OMP.OMPX_ATTRIBUTE", (attr, number_list)) + + def PLUS(self, args): + if DEBUG_OPENMP >= 1: + print("visit PLUS", args, type(args)) + return "+" + + def MINUS(self, args): + if DEBUG_OPENMP >= 1: + print("visit MINUS", args, type(args)) + return "-" + + def STAR(self, args): + if DEBUG_OPENMP >= 1: + print("visit STAR", args, type(args)) + return "*" + + def reduction_operator(self, args): + arg = args[0] + if DEBUG_OPENMP >= 1: + print("visit reduction_operator", args, type(args), arg, type(arg)) + if arg == "+": + return "ADD" + elif arg == "-": + return "SUB" + elif arg == "*": + return "MUL" + assert 0 + + def threadprivate_directive(self, args): + raise NotImplementedError("Threadprivate currently unsupported.") + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit threadprivate_directive", args, type(args)) + + def cancellation_point_directive(self, args): + raise NotImplementedError("""Explicit cancellation points + currently unsupported.""") + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit cancellation_point_directive", args, type(args)) + + def construct_type_clause(self, args): + if DEBUG_OPENMP >= 1: + print("visit construct_type_clause", args, type(args), args[0]) + return args[0] + + def cancel_directive(self, args): + raise NotImplementedError("Cancel directive currently unsupported.") + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit cancel_directive", args, type(args)) + + # Don't need a rule for ORDERED. 
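+
+    # Reduction sketch (illustrative variable name): "reduction(+: acc)" maps
+    # "+" to "ADD" via reduction_operator above, and reduction_clause (defined
+    # earlier) then emits openmp_tag("QUAL.OMP.REDUCTION.ADD", "acc").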
+ + def flush_directive(self, args): + raise NotImplementedError("Flush directive currently unsupported.") + sblk = self.blocks[self.blk_start] + eblk = self.blocks[self.blk_end] + + if DEBUG_OPENMP >= 1: + print("visit flush_directive", args, type(args)) + + def region_phrase(self, args): + raise NotImplementedError("No implementation for region phrase.") + if DEBUG_OPENMP >= 1: + print("visit region_phrase", args, type(args)) + + def PYTHON_NAME(self, args): + if DEBUG_OPENMP >= 1: + print("visit PYTHON_NAME", args, type(args), str(args)) + return str(args) + + def NUMBER(self, args): + if DEBUG_OPENMP >= 1: + print("visit NUMBER", args, type(args), str(args)) + return int(args) + + +openmp_grammar = r""" + openmp_statement: openmp_construct + | openmp_directive + openmp_directive: barrier_directive + | taskwait_directive + | taskyield_directive + | flush_directive + barrier_directive: BARRIER + taskwait_directive: TASKWAIT + taskyield_directive: TASKYIELD + BARRIER: "barrier" + TASKWAIT: "taskwait" + TASKYIELD: "taskyield" + taskgroup_directive: TASKGROUP + taskgroup_construct: taskgroup_directive + TASKGROUP: "taskgroup" + openmp_construct: parallel_construct + | parallel_for_construct + | for_construct + | single_construct + | task_construct + | teams_construct + | teams_distribute_construct + | teams_distribute_simd_construct + | teams_distribute_parallel_for_construct + | teams_distribute_parallel_for_simd_construct + | loop_construct + | teams_loop_construct + | target_construct + | target_teams_construct + | target_teams_distribute_construct + | target_teams_distribute_simd_construct + | target_teams_distribute_parallel_for_simd_construct + | target_teams_distribute_parallel_for_construct + | target_loop_construct + | target_teams_loop_construct + | target_enter_data_construct + | target_exit_data_construct + | distribute_construct + | distribute_simd_construct + | distribute_parallel_for_construct + | distribute_parallel_for_simd_construct + | critical_construct + | atomic_construct + | sections_construct + | section_construct + | simd_construct + | for_simd_construct + | parallel_for_simd_construct + | target_data_construct + | target_update_construct + | parallel_sections_construct + | master_construct + | ordered_construct + for_simd_construct: for_simd_directive + for_simd_directive: FOR SIMD [for_simd_clause*] + for_simd_clause: for_clause + | simd_clause + parallel_for_simd_construct: parallel_for_simd_directive + parallel_for_simd_directive: PARALLEL FOR SIMD [parallel_for_simd_clause*] + parallel_for_simd_clause: parallel_for_clause + | simd_clause + distribute_construct: distribute_directive + distribute_simd_construct: distribute_simd_directive + distribute_directive: DISTRIBUTE [distribute_clause*] + distribute_simd_directive: DISTRIBUTE SIMD [distribute_simd_clause*] + distribute_clause: private_clause + | firstprivate_clause + // | lastprivate_distribute_clause + | collapse_clause + | dist_schedule_clause + | allocate_clause + distribute_simd_clause: private_clause + | firstprivate_clause + // | lastprivate_distribute_clause + | collapse_clause + | dist_schedule_clause + | allocate_clause + | if_clause + // | safelen_clause + // | simdlen_clause + | linear_clause + | aligned_clause + // | nontemporal_clause + | reduction_clause + // | order_clause + + teams_distribute_clause: num_teams_clause + | thread_limit_clause + | data_default_clause + | private_clause + | firstprivate_clause + | data_sharing_clause + | reduction_clause + | allocate_clause + // | 
lastprivate_distribute_clause + | collapse_clause + | dist_schedule_clause + | ompx_attribute + + teams_distribute_simd_clause: num_teams_clause + | thread_limit_clause + | data_default_clause + | private_clause + | firstprivate_clause + | data_sharing_clause + | reduction_clause + | allocate_clause + // | lastprivate_distribute_clause + | collapse_clause + | dist_schedule_clause + | if_clause + // | safelen_clause + // | simdlen_clause + | linear_clause + | aligned_clause + // | nontemporal_clause + // | order_clause + | ompx_attribute + + distribute_parallel_for_construct: distribute_parallel_for_directive + distribute_parallel_for_directive: DISTRIBUTE PARALLEL FOR [distribute_parallel_for_clause*] + distribute_parallel_for_clause: if_clause + | num_threads_clause + | data_default_clause + | private_clause + | firstprivate_clause + | data_sharing_clause + | reduction_clause + | copyin_clause + // | proc_bind_clause + | allocate_clause + | lastprivate_clause + | linear_clause + | schedule_clause + | collapse_clause + | ORDERED + | NOWAIT + // | order_clause + | dist_schedule_clause + + distribute_parallel_for_simd_construct: distribute_parallel_for_simd_directive + distribute_parallel_for_simd_directive: DISTRIBUTE PARALLEL FOR SIMD [distribute_parallel_for_simd_clause*] + distribute_parallel_for_simd_clause: if_clause + | num_threads_clause + | data_default_clause + | private_clause + | firstprivate_clause + | data_sharing_clause + | reduction_clause + | copyin_clause + // | proc_bind_clause + | allocate_clause + | lastprivate_clause + | linear_clause + | schedule_clause + | collapse_clause + | ORDERED + | NOWAIT + // | order_clause + | dist_schedule_clause + // | safelen_clause + // | simdlen_clause + | aligned_clause + // | nontemporal_clause + + target_data_construct: target_data_directive + target_data_directive: TARGET DATA [target_data_clause*] + DATA: "data" + ENTER: "enter" + EXIT: "exit" + target_enter_data_construct: target_enter_data_directive + target_enter_data_directive: TARGET ENTER DATA [target_enter_data_clause*] + target_exit_data_construct: target_exit_data_directive + target_exit_data_directive: TARGET EXIT DATA [target_exit_data_clause*] + target_data_clause: device_clause + | map_clause + | if_clause + | NOWAIT + | depend_with_modifier_clause + target_enter_data_clause: device_clause + | map_enter_clause + | if_clause + | NOWAIT + | depend_with_modifier_clause + target_exit_data_clause: device_clause + | map_exit_clause + | if_clause + | NOWAIT + | depend_with_modifier_clause + device_clause: "device" "(" const_num_or_var ")" + map_clause: "map" "(" [map_type ":"] var_list ")" + map_type: ALLOC | TO | FROM | TOFROM + map_enter_clause: "map" "(" map_enter_type ":" var_list ")" + map_enter_type: ALLOC | TO + map_exit_clause: "map" "(" map_exit_type ":" var_list ")" + map_exit_type: FROM | RELEASE | DELETE + update_motion_type: TO | FROM + TO: "to" + FROM: "from" + ALLOC: "alloc" + TOFROM: "tofrom" + RELEASE: "release" + DELETE: "delete" + parallel_sections_construct: parallel_sections_directive + parallel_sections_directive: PARALLEL SECTIONS [parallel_sections_clause*] + parallel_sections_clause: unique_parallel_clause + | data_default_clause + | private_clause + | firstprivate_clause + | lastprivate_clause + | data_sharing_clause + | reduction_clause + sections_construct: sections_directive + sections_directive: SECTIONS [sections_clause*] + SECTIONS: "sections" + sections_clause: private_clause + | firstprivate_clause + | lastprivate_clause + | reduction_clause + 
| NOWAIT + section_construct: section_directive + section_directive: SECTION + SECTION: "section" + atomic_construct: atomic_directive + atomic_directive: ATOMIC [atomic_clause] [seq_cst_clause] + ATOMIC: "atomic" + atomic_clause: READ + | WRITE + | UPDATE + | CAPTURE + READ: "read" + WRITE: "write" + UPDATE: "update" + CAPTURE: "capture" + seq_cst_clause: "seq_cst" + critical_construct: critical_directive + critical_directive: CRITICAL + CRITICAL: "critical" + teams_construct: teams_directive + teams_directive: TEAMS [teams_clause*] + teams_distribute_directive: TEAMS DISTRIBUTE [teams_distribute_clause*] + teams_distribute_simd_directive: TEAMS DISTRIBUTE SIMD [teams_distribute_simd_clause*] + target_construct: target_directive + target_teams_distribute_parallel_for_simd_construct: target_teams_distribute_parallel_for_simd_directive + target_teams_distribute_parallel_for_construct: target_teams_distribute_parallel_for_directive + teams_distribute_parallel_for_construct: teams_distribute_parallel_for_directive + teams_distribute_parallel_for_simd_construct: teams_distribute_parallel_for_simd_directive + loop_construct: loop_directive + teams_loop_construct: teams_loop_directive + target_loop_construct: target_loop_directive + target_teams_loop_construct: target_teams_loop_directive + target_teams_construct: target_teams_directive + target_teams_distribute_construct: target_teams_distribute_directive + target_teams_distribute_simd_construct: target_teams_distribute_simd_directive + teams_distribute_construct: teams_distribute_directive + teams_distribute_simd_construct: teams_distribute_simd_directive + target_directive: TARGET [target_clause*] + HAS_DEVICE_ADDR: "has_device_addr" + has_device_addr_clause: HAS_DEVICE_ADDR "(" var_list ")" + target_clause: if_clause + | device_clause + | thread_limit_clause + | private_clause + | firstprivate_clause + // | in_reduction_clause + | map_clause + | is_device_ptr_clause + | has_device_addr_clause + // | defaultmap_clause + | NOWAIT + | allocate_clause + | depend_with_modifier_clause + // | uses_allocators_clause + | ompx_attribute + teams_clause: num_teams_clause + | thread_limit_clause + | data_default_clause + | private_clause + | firstprivate_clause + | data_sharing_clause + | reduction_clause + | allocate_clause + num_teams_clause: NUM_TEAMS "(" const_num_or_var ")" + NUM_TEAMS: "num_teams" + thread_limit_clause: THREAD_LIMIT "(" const_num_or_var ")" + THREAD_LIMIT: "thread_limit" + + dist_schedule_expr: DIST_SCHEDULE "(" STATIC ")" + dist_schedule_no_expr: DIST_SCHEDULE "(" STATIC "," const_num_or_var ")" + dist_schedule_clause: dist_schedule_expr + | dist_schedule_no_expr + DIST_SCHEDULE: "dist_schedule" + + target_teams_distribute_parallel_for_simd_directive: TARGET TEAMS DISTRIBUTE PARALLEL FOR SIMD [target_teams_distribute_parallel_for_simd_clause*] + target_teams_distribute_parallel_for_simd_clause: if_clause + | device_clause + | private_clause + | firstprivate_clause + // | in_reduction_clause + | map_clause + | is_device_ptr_clause + // | defaultmap_clause + | NOWAIT + | allocate_clause + | depend_with_modifier_clause + // | uses_allocators_clause + | num_teams_clause + | thread_limit_clause + | data_default_clause + | data_sharing_clause + | reduction_clause + | num_threads_clause + | copyin_clause + // | proc_bind_clause + | lastprivate_clause + | linear_clause + | schedule_clause + | collapse_clause + | ORDERED + // | order_clause + | dist_schedule_clause + // | safelen_clause + // | simdlen_clause + | aligned_clause + // | 
nontemporal_clause + | ompx_attribute + + teams_distribute_parallel_for_simd_directive: TEAMS DISTRIBUTE PARALLEL FOR SIMD [teams_distribute_parallel_for_simd_clause*] + teams_distribute_parallel_for_simd_clause: num_teams_clause + | thread_limit_clause + // | default_clause + | private_clause + | firstprivate_clause + | data_sharing_clause + | reduction_clause + | if_clause + | num_threads_clause + | copyin_clause + // | proc_bind_clause + | lastprivate_clause + | linear_clause + | schedule_clause + | collapse_clause + | ORDERED + | NOWAIT + // | order_clause + | dist_schedule_clause + // | safelen_clause + // | simdlen_clause + | aligned_clause + // | nontemporal_clause + // | in_reduction_clause + | map_clause + | is_device_ptr_clause + // | defaultmap_clause + | allocate_clause + | depend_with_modifier_clause + // | uses_allocators_clause + | data_default_clause + | ompx_attribute + + target_teams_distribute_parallel_for_directive: TARGET TEAMS DISTRIBUTE PARALLEL FOR [target_teams_distribute_parallel_for_clause*] + target_teams_distribute_parallel_for_clause: if_clause + | device_clause + | private_clause + | firstprivate_clause + // | in_reduction_clause + | map_clause + | is_device_ptr_clause + // | defaultmap_clause + | NOWAIT + | allocate_clause + | depend_with_modifier_clause + // | uses_allocators_clause + | num_teams_clause + | thread_limit_clause + | data_default_clause + | data_sharing_clause + | reduction_clause + | num_threads_clause + | copyin_clause + // | proc_bind_clause + | lastprivate_clause + | linear_clause + | schedule_clause + | collapse_clause + | ORDERED + // | order_clause + | dist_schedule_clause + | ompx_attribute + + teams_distribute_parallel_for_directive: TEAMS DISTRIBUTE PARALLEL FOR [teams_distribute_parallel_for_clause*] + teams_distribute_parallel_for_clause: num_teams_clause + | thread_limit_clause + | data_default_clause + | private_clause + | firstprivate_clause + | data_sharing_clause + | reduction_clause + | allocate_clause + | if_clause + | num_threads_clause + | copyin_clause + // | proc_bind_clause + | lastprivate_clause + | linear_clause + | schedule_clause + | collapse_clause + | ORDERED + | NOWAIT + // | order_clause + | dist_schedule_clause + | ompx_attribute + + LOOP: "loop" + + ompx_attribute: OMPX_ATTRIBUTE "(" PYTHON_NAME "(" number_list ")" ")" + OMPX_ATTRIBUTE: "ompx_attribute" + loop_directive: LOOP [teams_distribute_parallel_for_clause*] + teams_loop_directive: TEAMS LOOP [teams_distribute_parallel_for_clause*] + target_loop_directive: TARGET LOOP [target_teams_distribute_parallel_for_clause*] + target_teams_loop_directive: TARGET TEAMS LOOP [target_teams_distribute_parallel_for_clause*] + + target_teams_directive: TARGET TEAMS [target_teams_clause*] + target_teams_clause: if_clause + | device_clause + | private_clause + | firstprivate_clause + // | in_reduction_clause + | map_clause + | is_device_ptr_clause + // | defaultmap_clause + | NOWAIT + | allocate_clause + | depend_with_modifier_clause + // | uses_allocators_clause + | num_teams_clause + | thread_limit_clause + | data_default_clause + | data_sharing_clause + // | reduction_default_only_clause + | ompx_attribute + + target_teams_distribute_simd_directive: TARGET TEAMS DISTRIBUTE SIMD [target_teams_distribute_simd_clause*] + target_teams_distribute_simd_clause: if_clause + | device_clause + | private_clause + | firstprivate_clause + // | in_reduction_clause + | map_clause + | is_device_ptr_clause + // | defaultmap_clause + | NOWAIT + | allocate_clause + | 
depend_with_modifier_clause + // | uses_allocators_clause + | num_teams_clause + | thread_limit_clause + | data_default_clause + | data_sharing_clause + | reduction_clause + // | reduction_default_only_clause + | lastprivate_clause + | collapse_clause + | dist_schedule_clause + // | safelen_clause + // | simdlen_clause + | linear_clause + | aligned_clause + // | nontemporal_clause + // | order_clause + | ompx_attribute + + target_teams_distribute_directive: TARGET TEAMS DISTRIBUTE [target_teams_distribute_clause*] + target_teams_distribute_clause: if_clause + | device_clause + | private_clause + | firstprivate_clause + // | in_reduction_clause + | map_clause + | is_device_ptr_clause + // | defaultmap_clause + | NOWAIT + | allocate_clause + | depend_with_modifier_clause + // | uses_allocators_clause + | num_teams_clause + | thread_limit_clause + | data_default_clause + | data_sharing_clause + // | reduction_default_only_clause + | lastprivate_clause + | collapse_clause + | dist_schedule_clause + | ompx_attribute + + IS_DEVICE_PTR: "is_device_ptr" + is_device_ptr_clause: IS_DEVICE_PTR "(" var_list ")" + allocate_clause: ALLOCATE "(" allocate_parameter ")" + ALLOCATE: "allocate" + allocate_parameter: [const_num_or_var] var_list + + target_update_construct: target_update_directive + target_update_directive: TARGET UPDATE target_update_clause* + target_update_clause: motion_clause + | device_clause + | if_clause + motion_clause: update_motion_type "(" variable_array_section_list ")" + variable_array_section_list: PYTHON_NAME + // | array_section + | name_slice + | variable_array_section_list "," PYTHON_NAME + | variable_array_section_list "," name_slice + // | variable_array_section_list "," array_section + //array_section: PYTHON_NAME array_section_subscript + //array_section_subscript: array_section_subscript "[" [const_num_or_var] ":" [const_num_or_var] "]" + // | array_section_subscript "[" const_num_or_var "]" + // | "[" [const_num_or_var] ":" [const_num_or_var] "]" + // | "[" const_num_or_var "]" + TARGET: "target" + TEAMS: "teams" + DISTRIBUTE: "distribute" + single_construct: single_directive + single_directive: SINGLE [single_clause*] + SINGLE: "single" + single_clause: unique_single_clause + | private_clause + | firstprivate_clause + | NOWAIT + unique_single_clause: copyprivate_clause + NOWAIT: "nowait" + master_construct: master_directive + master_directive: "master" + simd_construct: simd_directive + simd_directive: SIMD [simd_clause*] + SIMD: "simd" + simd_clause: collapse_clause + | aligned_clause + | linear_clause + | uniform_clause + | reduction_clause + | inbranch_clause + aligned_clause: ALIGNED "(" var_list ")" + | ALIGNED "(" var_list ":" const_num_or_var ")" + declare_simd_construct: declare_simd_directive_seq + declare_simd_directive_seq: declare_simd_directive + | declare_simd_directive_seq declare_simd_directive + declare_simd_directive: SIMD [declare_simd_clause*] + declare_simd_clause: "simdlen" "(" const_num_or_var ")" + | aligned_clause + | linear_clause + | uniform_clause + | reduction_clause + | inbranch_clause + ALIGNED: "aligned" + inbranch_clause: INBRANCH | NOTINBRANCH + INBRANCH: "inbranch" + NOTINBRANCH: "notinbranch" + uniform_clause: UNIFORM "(" var_list ")" + UNIFORM: "uniform" + collapse_clause: COLLAPSE "(" const_num_or_var ")" + COLLAPSE: "collapse" + task_construct: task_directive + TASK: "task" + task_directive: TASK [task_clause*] + task_clause: unique_task_clause + | data_sharing_clause + | private_clause + | firstprivate_clause + | 
data_default_clause + unique_task_clause: if_clause + | UNTIED + | MERGEABLE + | FINAL "(" const_num_or_var ")" + | depend_with_modifier_clause + DEPEND: "depend" + FINAL: "final" + UNTIED: "untied" + MERGEABLE: "mergeable" + dependence_type: IN + | OUT + | INOUT + depend_with_modifier_clause: DEPEND "(" dependence_type ":" variable_array_section_list ")" + IN: "in" + OUT: "out" + INOUT: "inout" + data_default_clause: default_shared_clause + | default_none_clause + data_sharing_clause: shared_clause + data_clause: private_clause + | copyprivate_clause + | firstprivate_clause + | lastprivate_clause + | data_sharing_clause + | data_default_clause + | copyin_clause + | reduction_clause + private_clause: PRIVATE "(" var_list ")" + PRIVATE: "private" + copyprivate_clause: COPYPRIVATE "(" var_list ")" + COPYPRIVATE: "copyprivate" + firstprivate_clause: FIRSTPRIVATE "(" var_list ")" + FIRSTPRIVATE: "firstprivate" + lastprivate_clause: LASTPRIVATE "(" var_list ")" + LASTPRIVATE: "lastprivate" + shared_clause: SHARED "(" var_list ")" + SHARED: "shared" + copyin_clause: COPYIN "(" var_list ")" + COPYIN: "copyin" + REDUCTION: "reduction" + DEFAULT: "default" + reduction_clause: REDUCTION "(" reduction_operator ":" var_list ")" + default_shared_clause: DEFAULT "(" "shared" ")" + default_none_clause: DEFAULT "(" "none" ")" + const_num_or_var: NUMBER | PYTHON_NAME + parallel_construct: parallel_directive + parallel_directive: PARALLEL [parallel_clause*] + parallel_clause: unique_parallel_clause + | data_default_clause + | private_clause + | firstprivate_clause + | data_sharing_clause + | reduction_clause + unique_parallel_clause: if_clause | num_threads_clause + if_clause: IF "(" const_num_or_var ")" + IF: "if" + num_threads_clause: NUM_THREADS "(" const_num_or_var ")" + NUM_THREADS: "num_threads" + PARALLEL: "parallel" + FOR: "for" + parallel_for_construct: parallel_for_directive + parallel_for_directive: PARALLEL FOR [parallel_for_clause*] + parallel_for_clause: unique_parallel_clause + | unique_for_clause + | data_default_clause + | private_clause + | firstprivate_clause + | lastprivate_clause + | data_sharing_clause + | reduction_clause + for_construct: for_directive + for_directive: FOR [for_clause*] + for_clause: unique_for_clause | data_clause | NOWAIT + unique_for_clause: ORDERED + | schedule_clause + | collapse_clause + LINEAR: "linear" + linear_clause: LINEAR "(" var_list ":" const_num_or_var ")" + | LINEAR "(" var_list ")" + sched_no_expr: SCHEDULE "(" schedule_kind ")" + sched_expr: SCHEDULE "(" schedule_kind "," const_num_or_var ")" + schedule_clause: sched_no_expr + | sched_expr + SCHEDULE: "schedule" + schedule_kind: STATIC | DYNAMIC | GUIDED | RUNTIME | AUTO + STATIC: "static" + DYNAMIC: "dynamic" + GUIDED: "guided" + RUNTIME: "runtime" + AUTO: "auto" + COLON: ":" + oslice: [const_num_or_var] COLON [const_num_or_var] + slice_list: oslice | slice_list "," oslice + name_slice: PYTHON_NAME [ "[" slice_list "]" ] + var_list: name_slice | var_list "," name_slice + number_list: NUMBER | number_list "," NUMBER + PLUS: "+" + MINUS: "-" + STAR: "*" + reduction_operator: PLUS | "\\" | STAR | MINUS | "&" | "^" | "|" | "&&" | "||" + threadprivate_directive: "threadprivate" "(" var_list ")" + cancellation_point_directive: "cancellation point" construct_type_clause + construct_type_clause: PARALLEL + | SECTIONS + | FOR + | TASKGROUP + cancel_directive: "cancel" construct_type_clause [if_clause] + ordered_directive: ORDERED + ordered_construct: ordered_directive + ORDERED: "ordered" + flush_directive: 
"flush" "(" var_list ")" + + region_phrase: "(" PYTHON_NAME ")" + PYTHON_NAME: /[a-zA-Z_]\w*/ + + %import common.NUMBER + %import common.WS + %ignore WS + """ + +""" + name_slice: PYTHON_NAME [ "[" slice ["," slice]* "]" ] +""" + +openmp_parser = Lark(openmp_grammar, start="openmp_statement") +var_collector_parser = Lark(openmp_grammar, start="openmp_statement") + + +def remove_ssa_callback(var, unused): + assert isinstance(var, ir.Var) + new_var = ir.Var(var.scope, var.unversioned_name, var.loc) + return new_var + + +def remove_ssa_from_func_ir(func_ir): + typed_passes.PreLowerStripPhis()._strip_phi_nodes(func_ir) + # new_func_ir = typed_passes.PreLowerStripPhis()._strip_phi_nodes(func_ir) + # func_ir.blocks = new_func_ir.blocks + visit_vars(func_ir.blocks, remove_ssa_callback, None) + func_ir._definitions = build_definitions(func_ir.blocks) + + +def _add_openmp_ir_nodes(func_ir, blocks, blk_start, blk_end, body_blocks, extra): + """Given the starting and ending block of the with-context, + replaces the head block with a new block that has the starting + openmp ir nodes in it and adds the ending openmp ir nodes to + the end block. + """ + sblk = blocks[blk_start] + loc = sblk.loc + sblk.body = sblk.body[1:] + + args = extra["args"] + arg = args[0] + # If OpenMP argument is not a constant or not a string then raise exception + if not isinstance(arg, (ir.Const, ir.FreeVar)): + raise NonconstantOpenmpSpecification( + f"Non-constant OpenMP specification at line {arg.loc}" + ) + if not isinstance(arg.value, str): + raise NonStringOpenmpSpecification( + f"Non-string OpenMP specification at line {arg.loc}" + ) + + if DEBUG_OPENMP >= 1: + print("args:", args, type(args)) + print("arg:", arg, type(arg), arg.value, type(arg.value)) + parse_res = openmp_parser.parse(arg.value) + if DEBUG_OPENMP >= 1: + print(parse_res.pretty()) + visitor = OpenmpVisitor(func_ir, blocks, blk_start, blk_end, body_blocks, loc) + try: + visitor.transform(parse_res) + except VisitError as e: + raise e.__context__ + if isinstance(e.__context__, UnspecifiedVarInDefaultNone): + print(str(e.__context__)) + raise e.__context__ + else: + print( + "Internal error for OpenMp pragma '{}'".format(arg.value), + e.__context__, + type(e.__context__), + ) + sys.exit(-1) + except Exception as f: + print("generic transform exception") + exc_type, exc_obj, exc_tb = sys.exc_info() + fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] + print(exc_type, fname, exc_tb.tb_lineno) + print("Internal error for OpenMp pragma '{}'".format(arg.value)) + sys.exit(-2) + except: + print("fallthrough exception") + print("Internal error for OpenMP pragma '{}'".format(arg.value)) + sys.exit(-3) + assert blocks is visitor.blocks + + +class OpenmpExternalFunction(types.ExternalFunction): + def __call__(self, *args): + import inspect + + frm = inspect.stack()[1] + mod = inspect.getmodule(frm[0]) + if mod.__name__.startswith("numba") and not mod.__name__.startswith( + "numba.openmp.tests" + ): + return super(ExternalFunction, self).__call__(*args) + + ffi = FFI() + fname = self.symbol + ret_typ = str(self.sig.return_type) + + def numba_to_c(ret_typ): + if ret_typ == "int32": + return "int" + elif ret_typ == "none": + return "void" + elif ret_typ == "float64": + return "double" + else: + assert False + + ret_typ = numba_to_c(ret_typ) + arg_str = ",".join([numba_to_c(str(x)) for x in self.sig.args]) + proto = f"{ret_typ} {fname}({arg_str});" + ffi.cdef(proto) + # Should be loaded into the process by the load_library_permanently + # at the top of this 
file. + C = ffi.dlopen(None) + return getattr(C, fname)(*args) + + +model_register(OpenmpExternalFunction)(OpaqueModel) + +omp_set_num_threads = OpenmpExternalFunction( + "omp_set_num_threads", types.void(types.int32) +) +omp_get_thread_num = OpenmpExternalFunction("omp_get_thread_num", types.int32()) +omp_get_num_threads = OpenmpExternalFunction("omp_get_num_threads", types.int32()) +omp_get_wtime = OpenmpExternalFunction("omp_get_wtime", types.float64()) +omp_set_dynamic = OpenmpExternalFunction("omp_set_dynamic", types.void(types.int32)) +omp_set_nested = OpenmpExternalFunction("omp_set_nested", types.void(types.int32)) +omp_set_max_active_levels = OpenmpExternalFunction( + "omp_set_max_active_levels", types.void(types.int32) +) +omp_get_max_active_levels = OpenmpExternalFunction( + "omp_get_max_active_levels", types.int32() +) +omp_get_max_threads = OpenmpExternalFunction("omp_get_max_threads", types.int32()) +omp_get_num_procs = OpenmpExternalFunction("omp_get_num_procs", types.int32()) +omp_in_parallel = OpenmpExternalFunction("omp_in_parallel", types.int32()) +omp_get_thread_limit = OpenmpExternalFunction("omp_get_thread_limit", types.int32()) +omp_get_supported_active_levels = OpenmpExternalFunction( + "omp_get_supported_active_levels", types.int32() +) +omp_get_level = OpenmpExternalFunction("omp_get_level", types.int32()) +omp_get_active_level = OpenmpExternalFunction("omp_get_active_level", types.int32()) +omp_get_ancestor_thread_num = OpenmpExternalFunction( + "omp_get_ancestor_thread_num", types.int32(types.int32) +) +omp_get_team_size = OpenmpExternalFunction( + "omp_get_team_size", types.int32(types.int32) +) +omp_in_final = OpenmpExternalFunction("omp_in_finale", types.int32()) +omp_get_proc_bind = OpenmpExternalFunction("omp_get_proc_bind", types.int32()) +omp_get_num_places = OpenmpExternalFunction("omp_get_num_places", types.int32()) +omp_get_place_num_procs = OpenmpExternalFunction( + "omp_get_place_num_procs", types.int32(types.int32) +) +omp_get_place_num = OpenmpExternalFunction("omp_get_place_num", types.int32()) +omp_set_default_device = OpenmpExternalFunction( + "omp_set_default_device", types.int32(types.int32) +) +omp_get_default_device = OpenmpExternalFunction("omp_get_default_device", types.int32()) +omp_get_num_devices = OpenmpExternalFunction("omp_get_num_devices", types.int32()) +omp_get_device_num = OpenmpExternalFunction("omp_get_device_num", types.int32()) +omp_get_team_num = OpenmpExternalFunction("omp_get_team_num", types.int32()) +omp_get_num_teams = OpenmpExternalFunction("omp_get_num_teams", types.int32()) +omp_is_initial_device = OpenmpExternalFunction("omp_is_initial_device", types.int32()) +omp_get_initial_device = OpenmpExternalFunction("omp_get_initial_device", types.int32()) + + +def copy_np_array(x): + return np.copy(x) + + +# {meminfo, parent, ...} copy_np_array({meminfo, parent, ...}) + + +def create_native_np_copy(arg_typ): + # The cfunc wrapper of this function is what we need. 
+ copy_cres = compiler.compile_isolated(copy_np_array, (arg_typ,), arg_typ) + copy_name = getattr(copy_cres.fndesc, "llvm_cfunc_wrapper_name") + return (copy_name, copy_cres) + + +def omp_shared_array(size, dtype): + return np.empty(size, dtype=dtype) + + +@overload(omp_shared_array, target="cpu", inline="always", prefer_literal=True) +def omp_shared_array_overload(size, dtype): + assert isinstance(size, types.IntegerLiteral) + + def impl(size, dtype): + return np.empty(size, dtype=dtype) + + return impl + + +@overload(omp_shared_array, target="cuda", inline="always", prefer_literal=True) +def omp_shared_array_overload(size, dtype): + assert isinstance(size, types.IntegerLiteral) + + def impl(size, dtype): + return numba_cuda.shared.array(size, dtype) + + return impl diff --git a/numba/openmp/nrt/init.c b/numba/openmp/nrt/init.c new file mode 100644 index 000000000000..8c659aa3a6fe --- /dev/null +++ b/numba/openmp/nrt/init.c @@ -0,0 +1,3 @@ +extern void NRT_MemSys_init(); + +__attribute__((constructor)) static void PyOMP_NRT_Init() { NRT_MemSys_init(); } diff --git a/numba/openmp/pass/CGIntrinsicsOpenMP.cpp b/numba/openmp/pass/CGIntrinsicsOpenMP.cpp new file mode 100644 index 000000000000..861c059c9656 --- /dev/null +++ b/numba/openmp/pass/CGIntrinsicsOpenMP.cpp @@ -0,0 +1,3102 @@ +#include "llvm/Frontend/OpenMP/OMPConstants.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/CodeExtractor.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/IR/CFG.h" +#include +#include +#include + +#include "CGIntrinsicsOpenMP.h" +#include "DebugOpenMP.h" + +#define DEBUG_TYPE "intrinsics-openmp" + +using namespace llvm; +using namespace omp; +using namespace iomp; + +namespace { + +static CallInst *checkCreateCall(IRBuilderBase &Builder, FunctionCallee &Fn, + ArrayRef Args) { + auto PrintDebugOutput = [&]() { + dbgs() << "=== CGOpenMP checkCreateCall\n"; + dbgs() << "FunctionCallee: " << Fn.getCallee()->getName() << "\n"; + dbgs() << "FunctionCalee Type: " << *Fn.getFunctionType() << "\n"; + size_t ArgNo = 0; + for (Value *Arg : Args) { + dbgs() << "Arg " << ArgNo << ": " << *Arg << "\n"; + ArgNo++; + } + dbgs() << "=== End of CGOpenMP checkCreateCall\n"; + }; + DEBUG_ENABLE(PrintDebugOutput()); + + // Check number of parameters only for non-vararg functions. + if (!Fn.getFunctionType()->isVarArg()) + if (Args.size() != Fn.getFunctionType()->getNumParams()) { + DEBUG_ENABLE(dbgs() << "Mismatch argument size " << Args.size() << " != " + << Fn.getFunctionType()->getNumParams() << "\n"); + return nullptr; + } + + // Check argument types up to number params in the callee type to avoid + // checking varargs unknow types. 
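+  // Only the fixed parameters are verified; trailing vararg arguments pass
+  // through unchecked. On any mismatch the helper returns nullptr, and
+  // callers assert on the returned CallInst rather than emitting an
+  // ill-typed call.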
+ for (size_t I = 0; I < Fn.getFunctionType()->getNumParams(); ++I) + if (Args[I]->getType() != Fn.getFunctionType()->getParamType(I)) { + DEBUG_ENABLE(dbgs() << "Mismatch type at " << I << "\n"; + dbgs() << "Arg " << *Args[I] << "\n"; + dbgs() << "Expected type " + << *Fn.getFunctionType()->getParamType(I) << "\n";); + return nullptr; + } + + return Builder.CreateCall(Fn, Args); +} + +} // namespace + +void CGIntrinsicsOpenMP::setDeviceGlobalizedValues( + const ArrayRef GlobalizedValues) { + DeviceGlobalizedValues.clear(); + DeviceGlobalizedValues.insert(GlobalizedValues.begin(), + GlobalizedValues.end()); +} + +Value *CGIntrinsicsOpenMP::createScalarCast(Value *V, Type *DestTy) { + Value *Scalar = nullptr; + assert(V && "Expected non-null value"); + if (V->getType()->isPointerTy()) { + Value *Load = + OMPBuilder.Builder.CreateLoad(V->getType()->getPointerElementType(), V); + Scalar = OMPBuilder.Builder.CreateTruncOrBitCast(Load, DestTy); + } else { + Scalar = OMPBuilder.Builder.CreateTruncOrBitCast(V, DestTy); + } + + return Scalar; +} + +Function *CGIntrinsicsOpenMP::createOutlinedFunction( + DSAValueMapTy &DSAValueMap, ValueToValueMapTy *VMap, Function *OuterFn, + BasicBlock *StartBB, BasicBlock *EndBB, + SmallVectorImpl &CapturedVars, StringRef Suffix) { + SmallVector Privates; + SmallVector CapturedShared; + SmallVector CapturedFirstprivate; + SmallVector Reductions; + + InsertPointTy SavedIP = OMPBuilder.Builder.saveIP(); + + OpenMPIRBuilder::OutlineInfo OI; + OI.EntryBB = StartBB; + OI.ExitBB = EndBB; + SmallPtrSet BlockSet; + SmallVector BlockVector; + OI.collectBlocks(BlockSet, BlockVector); + + CodeExtractorAnalysisCache CEAC(*OuterFn); + CodeExtractor Extractor(BlockVector, /* DominatorTree */ nullptr, + /* AggregateArgs */ false, + /* BlockFrequencyInfo */ nullptr, + /* BranchProbabilityInfo */ nullptr, + /* AssumptionCache */ nullptr, + /* AllowVarArgs */ true, + /* AllowAlloca */ true, + /* Suffix */ "."); + + // Find inputs to, outputs from the code region. + BasicBlock *CommonExit = nullptr; + SetVector Inputs, Outputs, SinkingCands, HoistingCands; + Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit); + Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands); + + assert(Outputs.empty() && "Expected empty outputs from outlined region"); + assert(SinkingCands.empty() && "Expected empty alloca sinking candidates"); + + auto IsTempOrDefaultPrivate = [](Value *V) { + if(V->getName().startswith(".")) + return true; + + if(V->getName().startswith("excinfo")) + return true; + + if(V->getName() == "quot") + return true; + + if(V->getName() == "rem") + return true; + + return false; + }; + + // Scan Inputs and define any missing values as Privates. Those values must + // correspond to Numba-generated temporaries that should be privatized. + for (auto *V : Inputs) { + if (!DSAValueMap.count(V)) { + DEBUG_ENABLE(dbgs() << "Missing V " << *V + << " from DSAValueMap, will privatize\n"); + if (!IsTempOrDefaultPrivate(V)) + FATAL_ERROR("Expected Numba temporary value or default private, named starting " + "with . but got " + + V->getName().str()); + Privates.push_back(V); + continue; + } + + DSAType DSA = DSAValueMap[V].Type; + + DEBUG_ENABLE(dbgs() << "V " << *V << " from DSAValueMap Type " << DSA + << "\n"); + switch (DSA) { + case DSA_PRIVATE: + Privates.push_back(V); + break; + case DSA_FIRSTPRIVATE: + CapturedFirstprivate.push_back(V); + break; + case DSA_SHARED: + // Treat as shared to capture the pointer. 
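+      // Lastprivate and mapped values intentionally fall through to the
+      // shared capture: the outlined body only needs the captured pointer
+      // here, while copy-out (see the lastprivate handling in emitLoop) and
+      // data mapping are emitted elsewhere.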
+ case DSA_LASTPRIVATE: + case DSA_MAP_TO: + case DSA_MAP_FROM: + case DSA_MAP_TOFROM: + case DSA_MAP_STRUCT: + CapturedShared.push_back(V); + break; + case DSA_REDUCTION_ADD: + case DSA_REDUCTION_SUB: + case DSA_REDUCTION_MUL: + Reductions.push_back(V); + break; + default: + FATAL_ERROR("Unexpected DSA type"); + } + } + + SmallVector Params; + // tid + Params.push_back(OMPBuilder.Int32Ptr); + // bound_tid + Params.push_back(OMPBuilder.Int32Ptr); + for (auto *V : CapturedShared) + Params.push_back(V->getType()); + for (auto *V : CapturedFirstprivate) { + Type *VPtrElemTy = V->getType()->getPointerElementType(); + if (VPtrElemTy->isSingleValueType()) + // TODO: The OpenMP runtime expects and propagates arguments + // typed as Int64, thus we cast byval firstprivates to Int64. Using an + // aggregate to store arguments would avoid this peculiarity. + // Params.push_back(VPtrElemTy); + Params.push_back(OMPBuilder.Int64); + else + Params.push_back(V->getType()); + } + for (auto *V : Reductions) + Params.push_back(V->getType()); + + FunctionType *OutlinedFnTy = + FunctionType::get(OMPBuilder.Void, Params, /* isVarArgs */ false); + Function *OutlinedFn = + Function::Create(OutlinedFnTy, GlobalValue::InternalLinkage, + OuterFn->getName() + Suffix, M); + + // Name the parameters and add attributes. Shared are ordered before + // firstprivate in the parameter list. + OutlinedFn->arg_begin()->setName("global_tid"); + std::next(OutlinedFn->arg_begin())->setName("bound_tid"); + Function::arg_iterator AI = std::next(OutlinedFn->arg_begin(), 2); + int arg_no = 2; + for (auto *V : CapturedShared) { + AI->setName(V->getName() + ".shared"); + // Insert pointers in device globalized if they correspond to a device + // globalized pointer. + if (DeviceGlobalizedValues.contains(V)) + DeviceGlobalizedValues.insert(AI); + + OutlinedFn->addParamAttr(arg_no, Attribute::NonNull); + OutlinedFn->addParamAttr( + arg_no, Attribute::get(M.getContext(), Attribute::Dereferenceable, 8)); + ++AI; + ++arg_no; + } + for (auto *V : CapturedFirstprivate) { + Type *VPtrElemTy = V->getType()->getPointerElementType(); + if (VPtrElemTy->isSingleValueType()) { + AI->setName(V->getName() + ".firstprivate.byval"); + } else { + AI->setName(V->getName() + ".firstprivate"); + OutlinedFn->addParamAttr(arg_no, Attribute::NonNull); + OutlinedFn->addParamAttr( + arg_no, + Attribute::get(M.getContext(), Attribute::Dereferenceable, 8)); + } + ++AI; + ++arg_no; + } + for (auto *V : Reductions) { + AI->setName(V->getName() + ".red"); + OutlinedFn->addParamAttr(arg_no, Attribute::NonNull); + OutlinedFn->addParamAttr( + arg_no, Attribute::get(M.getContext(), Attribute::Dereferenceable, 8)); + ++AI; + ++arg_no; + } + + BasicBlock *OutlinedEntryBB = + BasicBlock::Create(M.getContext(), ".outlined.entry", OutlinedFn); + BasicBlock *OutlinedExitBB = + BasicBlock::Create(M.getContext(), ".outlined.exit", OutlinedFn); + + auto CreateAllocaAtEntry = [&](Type *Ty, Value *ArraySize = nullptr, + const Twine &Name = "") { + auto CurIP = OMPBuilder.Builder.saveIP(); + OMPBuilder.Builder.SetInsertPoint(OutlinedEntryBB, + OutlinedEntryBB->getFirstInsertionPt()); + Value *Alloca = OMPBuilder.Builder.CreateAlloca(Ty, nullptr, Name); + OMPBuilder.Builder.restoreIP(CurIP); + return Alloca; + }; + + OMPBuilder.Builder.SetInsertPoint(OutlinedEntryBB); + + OutlinedFn->addParamAttr(0, Attribute::NoAlias); + OutlinedFn->addParamAttr(1, Attribute::NoAlias); + OutlinedFn->addFnAttr(Attribute::NoUnwind); + OutlinedFn->addFnAttr(Attribute::NoRecurse); + + auto CollectUses = 
[&BlockSet](Value *V, SetVector &Uses) { + for (Use &U : V->uses()) + if (auto *UserI = dyn_cast(U.getUser())) + if (BlockSet.count(UserI->getParent())) + Uses.insert(&U); + }; + + auto ReplaceUses = [](SetVector &Uses, Value *ReplacementValue) { + for (Use *UPtr : Uses) + UPtr->set(ReplacementValue); + }; + + for (auto *V : Privates) { + SetVector Uses; + CollectUses(V, Uses); + + Type *VTy = V->getType()->getPointerElementType(); + Value *ReplacementValue = + CreateAllocaAtEntry(VTy, nullptr, V->getName() + ".private"); + // NOTE: We need to zero initialize privates because Numba reference + // counting breaks when those privates correspond to memory-managed + // data structures. + OMPBuilder.Builder.CreateStore(Constant::getNullValue(VTy), + ReplacementValue); + + if (VMap) + (*VMap)[V] = ReplacementValue; + + ReplaceUses(Uses, ReplacementValue); + } + + AI = std::next(OutlinedFn->arg_begin(), 2); + for (auto *V : CapturedShared) { + SetVector Uses; + CollectUses(V, Uses); + + Value *ReplacementValue = AI; + + if (VMap) + (*VMap)[V] = ReplacementValue; + + ReplaceUses(Uses, ReplacementValue); + ++AI; + } + + for (auto *V : CapturedFirstprivate) { + SetVector Uses; + CollectUses(V, Uses); + + Type *VPtrElemTy = V->getType()->getPointerElementType(); + Value *ReplacementValue = + CreateAllocaAtEntry(VPtrElemTy, nullptr, V->getName() + ".copy"); + if (VPtrElemTy->isSingleValueType()) { + // TODO: The OpenMP runtime expects and propagates arguments + // typed as Int64, thus we cast byval firstprivates to Int64. Using an + // aggregate to store arguments would avoid this peculiarity. + // OMPBuilder.Builder.CreateStore(AI, ReplacementValue); + Value *Alloca = CreateAllocaAtEntry(OMPBuilder.Int64); + + OMPBuilder.Builder.CreateStore(AI, Alloca); + Value *BitCast = OMPBuilder.Builder.CreateBitCast(Alloca, V->getType()); + Value *Load = OMPBuilder.Builder.CreateLoad(VPtrElemTy, BitCast); + OMPBuilder.Builder.CreateStore(Load, ReplacementValue); + } else { + Value *Load = OMPBuilder.Builder.CreateLoad(VPtrElemTy, AI, + V->getName() + ".reload"); + FunctionCallee CopyConstructor = DSAValueMap[V].CopyConstructor; + if (CopyConstructor) { + Value *Copy = OMPBuilder.Builder.CreateCall(CopyConstructor, {Load}); + OMPBuilder.Builder.CreateStore(Copy, ReplacementValue); + } else + OMPBuilder.Builder.CreateStore(Load, ReplacementValue); + } + + if (VMap) + (*VMap)[V] = ReplacementValue; + + ReplaceUses(Uses, ReplacementValue); + + ++AI; + } + + SmallVector ReductionInfos; + for (auto *V : Reductions) { + SetVector Uses; + CollectUses(V, Uses); + + if (VMap) + (*VMap)[V] = AI; + + InsertPointTy AllocaIP(OutlinedEntryBB, + OutlinedEntryBB->getFirstInsertionPt()); + + + Value *Priv = nullptr; + switch (DSAValueMap[V].Type) { + case DSA_REDUCTION_ADD: + Priv = CGReduction::emitInitAndAppendInfo( + OMPBuilder.Builder, AllocaIP, AI, ReductionInfos); + break; + case DSA_REDUCTION_SUB: + Priv = CGReduction::emitInitAndAppendInfo( + OMPBuilder.Builder, AllocaIP, AI, ReductionInfos); + break; + case DSA_REDUCTION_MUL: + Priv = CGReduction::emitInitAndAppendInfo( + OMPBuilder.Builder, AllocaIP, AI, ReductionInfos); + break; + default: + FATAL_ERROR("Unsupported reduction"); + } + + assert(Priv && "Expected non-null private reduction variable"); + ReplaceUses(Uses, Priv); + + ++AI; + } + + OMPBuilder.Builder.CreateBr(StartBB); + + EndBB->getTerminator()->setSuccessor(0, OutlinedExitBB); + OMPBuilder.Builder.SetInsertPoint(OutlinedExitBB); + OMPBuilder.Builder.CreateRetVoid(); + if (!ReductionInfos.empty()) + 
OMPBuilder.createReductions( + InsertPointTy(OutlinedExitBB, OutlinedExitBB->begin()), + InsertPointTy(OutlinedEntryBB, OutlinedEntryBB->begin()), + ReductionInfos); + + // Deterministic insertion of BBs, BlockVector needs ExitBB to move to the + // outlined function. + BlockVector.push_back(OI.ExitBB); + for (auto *BB : BlockVector) + BB->moveBefore(OutlinedExitBB); + + DEBUG_ENABLE(dbgs() << "=== Dump OutlinedFn\n" + << *OutlinedFn << "=== End of Dump OutlinedFn\n"); + + if (verifyFunction(*OutlinedFn, &errs())) + FATAL_ERROR("Verification of OutlinedFn failed!"); + + CapturedVars.append(CapturedShared); + CapturedVars.append(CapturedFirstprivate); + CapturedVars.append(Reductions); + + if (SavedIP.isSet()) + OMPBuilder.Builder.restoreIP(SavedIP); + + return OutlinedFn; +} + +CGIntrinsicsOpenMP::CGIntrinsicsOpenMP(Module &M) : OMPBuilder(M), M(M) { + OMPBuilder.initialize(); + + TgtOffloadEntryTy = StructType::create({OMPBuilder.Int8Ptr, + OMPBuilder.Int8Ptr, OMPBuilder.SizeTy, + OMPBuilder.Int32, OMPBuilder.Int32}, + "struct.__tgt_offload_entry"); + // OpenMP device runtime expects this global that controls debugging, default + // to 0 (no debugging enabled). + if (isOpenMPDeviceRuntime()) + OMPBuilder.createGlobalFlag(0, "__omp_rtl_debug_kind"); +} + +void CGIntrinsicsOpenMP::emitOMPParallel( + DSAValueMapTy &DSAValueMap, ValueToValueMapTy *VMap, const DebugLoc &DL, + Function *Fn, BasicBlock *BBEntry, BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB, FinalizeCallbackTy FiniCB, + ParRegionInfoStruct &ParRegionInfo) { + if (isOpenMPDeviceRuntime()) + emitOMPParallelDeviceRuntime(DSAValueMap, VMap, DL, Fn, BBEntry, StartBB, + EndBB, AfterBB, FiniCB, ParRegionInfo); + else + emitOMPParallelHostRuntime(DSAValueMap, VMap, DL, Fn, BBEntry, StartBB, + EndBB, AfterBB, FiniCB, ParRegionInfo); +} + +void CGIntrinsicsOpenMP::emitOMPParallelHostRuntime( + DSAValueMapTy &DSAValueMap, ValueToValueMapTy *VMap, const DebugLoc &DL, + Function *Fn, BasicBlock *BBEntry, BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB, FinalizeCallbackTy FiniCB, + ParRegionInfoStruct &ParRegionInfo) { + + // Set the insertion location at the end of the BBEntry. + BBEntry->getTerminator()->eraseFromParent(); + OMPBuilder.Builder.SetInsertPoint(BBEntry); + OMPBuilder.Builder.CreateBr(AfterBB); + + OMPBuilder.Builder.SetInsertPoint(BBEntry->getTerminator()); + OpenMPIRBuilder::LocationDescription Loc(OMPBuilder.Builder.saveIP(), DL); + OMPBuilder.Builder.SetCurrentDebugLocation(Loc.DL); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize); + Value *ThreadID = OMPBuilder.getOrCreateThreadID(Ident); + + SmallVector CapturedVars; + Function *OutlinedFn = + createOutlinedFunction(DSAValueMap, VMap, Fn, StartBB, EndBB, + CapturedVars, ".omp_outlined_parallel"); + + auto EmitForkCall = [&](InsertPointTy InsertIP) { + OMPBuilder.Builder.restoreIP(InsertIP); + + auto *OutlinedFnCast = OMPBuilder.Builder.CreateBitCast( + OutlinedFn, OMPBuilder.ParallelTaskPtr); + FunctionCallee ForkCall = + OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_fork_call); + SmallVector ForkArgs; + ForkArgs.append({Ident, OMPBuilder.Builder.getInt32(CapturedVars.size()), + OutlinedFnCast}); + + for (size_t Idx = 0; Idx < CapturedVars.size(); ++Idx) { + // Pass firstprivate scalar by value. 
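+      // The scalar is funneled through a temporary Int64 slot because
+      // __kmpc_fork_call forwards its trailing arguments as Int64 values
+      // (see the matching TODO in createOutlinedFunction); the outlined
+      // function reverses the cast when it reloads its .firstprivate.byval
+      // parameter.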
+ if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE && + CapturedVars[Idx] + ->getType() + ->getPointerElementType() + ->isSingleValueType()) { + // TODO: check type conversions. + Value *Alloca = OMPBuilder.Builder.CreateAlloca(OMPBuilder.Int64); + Type *VPtrElemTy = + CapturedVars[Idx]->getType()->getPointerElementType(); + Value *LoadV = + OMPBuilder.Builder.CreateLoad(VPtrElemTy, CapturedVars[Idx]); + Value *BitCast = OMPBuilder.Builder.CreateBitCast( + Alloca, CapturedVars[Idx]->getType()); + OMPBuilder.Builder.CreateStore(LoadV, BitCast); + Value *Load = OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, Alloca); + ForkArgs.push_back(Load); + continue; + } + + ForkArgs.push_back(CapturedVars[Idx]); + } + + OMPBuilder.Builder.CreateCall(ForkCall, ForkArgs); + }; + + auto EmitSerializedParallel = [&](InsertPointTy InsertIP) { + OMPBuilder.Builder.restoreIP(InsertIP); + + // Build calls __kmpc_serialized_parallel(&Ident, GTid); + Value *Args[] = {Ident, ThreadID}; + OMPBuilder.Builder.CreateCall(OMPBuilder.getOrCreateRuntimeFunctionPtr( + OMPRTL___kmpc_serialized_parallel), + Args); + + Value *ZeroAddr = OMPBuilder.Builder.CreateAlloca(OMPBuilder.Int32, nullptr, + ".zero.addr"); + OMPBuilder.Builder.CreateStore(Constant::getNullValue(OMPBuilder.Int32), + ZeroAddr); + // Zero for thread id, bound tid. + SmallVector OutlinedArgs = {ZeroAddr, ZeroAddr}; + for (size_t Idx = 0; Idx < CapturedVars.size(); ++Idx) { + // Pass firstprivate scalar by value. + if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE && + CapturedVars[Idx] + ->getType() + ->getPointerElementType() + ->isSingleValueType()) { + // TODO: check type conversions. + Type *VPtrElemTy = + CapturedVars[Idx]->getType()->getPointerElementType(); + Value *Load = + OMPBuilder.Builder.CreateLoad(VPtrElemTy, CapturedVars[Idx]); + OutlinedArgs.push_back(Load); + continue; + } + + OutlinedArgs.push_back(CapturedVars[Idx]); + } + + OMPBuilder.Builder.CreateCall(OutlinedFn, OutlinedArgs); + + // __kmpc_end_serialized_parallel(&Ident, GTid); + OMPBuilder.Builder.CreateCall(OMPBuilder.getOrCreateRuntimeFunctionPtr( + OMPRTL___kmpc_end_serialized_parallel), + Args); + }; + + if (ParRegionInfo.NumThreads) { + Value *NumThreads = + createScalarCast(ParRegionInfo.NumThreads, OMPBuilder.Int32); + assert(NumThreads && "Expected non-null num threads"); + Value *Args[] = {Ident, ThreadID, NumThreads}; + OMPBuilder.Builder.CreateCall(OMPBuilder.getOrCreateRuntimeFunctionPtr( + OMPRTL___kmpc_push_num_threads), + Args); + } + + if (ParRegionInfo.IfCondition) { + Instruction *ThenTI = nullptr, *ElseTI = nullptr; + Value *IfConditionEval = nullptr; + + if (ParRegionInfo.IfCondition->getType()->isFloatingPointTy()) + IfConditionEval = OMPBuilder.Builder.CreateFCmpUNE( + ParRegionInfo.IfCondition, + ConstantFP::get(ParRegionInfo.IfCondition->getType(), 0)); + else + IfConditionEval = OMPBuilder.Builder.CreateICmpNE( + ParRegionInfo.IfCondition, + ConstantInt::get(ParRegionInfo.IfCondition->getType(), 0)); + + assert(IfConditionEval && "Expected non-null condition"); + SplitBlockAndInsertIfThenElse(IfConditionEval, BBEntry->getTerminator(), + &ThenTI, &ElseTI); + + assert(ThenTI && "Expected non-null ThenTI"); + assert(ElseTI && "Expected non-null ElseTI"); + EmitForkCall(InsertPointTy(ThenTI->getParent(), ThenTI->getIterator())); + EmitSerializedParallel( + InsertPointTy(ElseTI->getParent(), ElseTI->getIterator())); + } else { + EmitForkCall( + InsertPointTy(BBEntry, BBEntry->getTerminator()->getIterator())); + } + + DEBUG_ENABLE(dbgs() << 
"=== Dump OuterFn\n" + << *Fn << "=== End of Dump OuterFn\n"); + + if (verifyFunction(*Fn, &errs())) + FATAL_ERROR("Verification of OuterFn failed!"); +} + +#if 0 +void CGIntrinsicsOpenMP::emitOMPParallelHostRuntimeOMPIRBuilder( + DSAValueMapTy &DSAValueMap, ValueToValueMapTy *VMap, + const DebugLoc &DL, Function *Fn, BasicBlock *BBEntry, BasicBlock *StartBB, + BasicBlock *EndBB, BasicBlock *AfterBB, FinalizeCallbackTy FiniCB, + ParRegionInfoStruct &ParRegionInfo) { + InsertPointTy BodyIP, BodyAllocaIP; + SmallVector ReductionInfos; + + auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, + Value &Orig, Value &Inner, + Value *&ReplacementValue) -> InsertPointTy { + auto It = DSAValueMap.find(&Orig); + DEBUG_ENABLE(dbgs() << "DSAValueMap for Orig " << Orig << " Inner " << Inner); + if (It != DSAValueMap.end()) + DEBUG_ENABLE(dbgs() << It->second.Type); + else + DEBUG_ENABLE(dbgs() << " (null)!"); + DEBUG_ENABLE(dbgs() << "\n "); + + if (It == DSAValueMap.end()) { + DSAValueMap[&Orig] = DSA_PRIVATE; + DEBUG_ENABLE(dbgs() << "Missing V " << Orig << " from DSAValueMap, will privatize\n"); + assert(Orig.getName().startswith(".") && + "Expected Numba temporary value, named starting with ."); + } + assert(It != DSAValueMap.end() && "Expected Value in DSAValueMap"); + + DSAType DSA = It->second.Type; + FunctionCallee CopyConstructor = It->second.CopyConstructor; + + if (DSA == DSA_PRIVATE) { + OMPBuilder.Builder.restoreIP(AllocaIP); + Type *VTy = Inner.getType()->getPointerElementType(); + ReplacementValue = OMPBuilder.Builder.CreateAlloca( + VTy, /*ArraySize */ nullptr, Inner.getName()); + // NOTE: We need to zero-out privates because Numba reference + // counting breaks when those privates correspond to memory-managed + // data structures. + OMPBuilder.Builder.CreateStore(Constant::getNullValue(VTy), + ReplacementValue); + DEBUG_ENABLE(dbgs() << "Privatizing Inner " << Inner << " -> to -> " + << *ReplacementValue << "\n"); + if (VMap) + (*VMap)[&Orig] = ReplacementValue; + } else if (DSA == DSA_FIRSTPRIVATE) { + OMPBuilder.Builder.restoreIP(AllocaIP); + Type *VTy = Inner.getType()->getPointerElementType(); + ReplacementValue = OMPBuilder.Builder.CreateAlloca( + VTy, /*ArraySize */ nullptr, Orig.getName() + ".copy"); + OMPBuilder.Builder.restoreIP(CodeGenIP); + Value *InnerLoad = + OMPBuilder.Builder.CreateLoad(VTy, &Inner, Orig.getName() + ".reload"); + if (CopyConstructor) { + Value *Copy = + OMPBuilder.Builder.CreateCall(CopyConstructor, {InnerLoad}); + OMPBuilder.Builder.CreateStore(Copy, ReplacementValue); + } else + OMPBuilder.Builder.CreateStore(InnerLoad, ReplacementValue); + + DEBUG_ENABLE(dbgs() << "Firstprivatizing Inner " << Inner << " -> to -> " + << *ReplacementValue << "\n"); + if (VMap) + (*VMap)[&Orig] = ReplacementValue; + } else if (DSA == DSA_REDUCTION_ADD) { + OMPBuilder.Builder.restoreIP(AllocaIP); + Type *VTy = Inner.getType()->getPointerElementType(); + Value *V = OMPBuilder.Builder.CreateAlloca(VTy, /* ArraySize */ nullptr, + Orig.getName() + ".red.priv"); + ReplacementValue = V; + if (VMap) + (*VMap)[&Orig] = ReplacementValue; + + OMPBuilder.Builder.restoreIP(CodeGenIP); + // Store idempotent value based on operation and type. 
+ // TODO: use emitInitAndAppendInfo in CGReduction + if (VTy->isIntegerTy()) + OMPBuilder.Builder.CreateStore(ConstantInt::get(VTy, 0), V); + else if (VTy->isFloatTy() || VTy->isDoubleTy()) + OMPBuilder.Builder.CreateStore(ConstantFP::get(VTy, 0.0), V); + else + assert(false && + "Unsupported type to init with idempotent reduction value"); + + ReductionInfos.push_back({VTy, &Orig, V, CGReduction::sumReduction, + CGReduction::sumAtomicReduction}); + + return OMPBuilder.Builder.saveIP(); + } else { + ReplacementValue = &Inner; + DEBUG_ENABLE(dbgs() << "Shared Inner " << Inner << " -> to -> " + << *ReplacementValue << "\n"); + } + + return CodeGenIP; + }; + + auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, + BasicBlock &ContinuationIP) { + BasicBlock *CGStartBB = CodeGenIP.getBlock(); + BasicBlock *CGEndBB = SplitBlock(CGStartBB, &*CodeGenIP.getPoint()); + assert(StartBB != nullptr && "StartBB should not be null"); + CGStartBB->getTerminator()->setSuccessor(0, StartBB); + assert(EndBB != nullptr && "EndBB should not be null"); + EndBB->getTerminator()->setSuccessor(0, CGEndBB); + + BodyIP = InsertPointTy(CGEndBB, CGEndBB->getFirstInsertionPt()); + BodyAllocaIP = AllocaIP; + }; + + IRBuilder<>::InsertPoint AllocaIP(&Fn->getEntryBlock(), + Fn->getEntryBlock().getFirstInsertionPt()); + + // Set the insertion location at the end of the BBEntry. + BBEntry->getTerminator()->eraseFromParent(); + + Value *IfConditionEval = nullptr; + if (ParRegionInfo.IfCondition) { + OMPBuilder.Builder.SetInsertPoint(BBEntry); + if (ParRegionInfo.IfCondition->getType()->isFloatingPointTy()) + IfConditionEval = OMPBuilder.Builder.CreateFCmpUNE( + ParRegionInfo.IfCondition, + ConstantFP::get(ParRegionInfo.IfCondition->getType(), 0)); + else + IfConditionEval = OMPBuilder.Builder.CreateICmpNE( + ParRegionInfo.IfCondition, + ConstantInt::get(ParRegionInfo.IfCondition->getType(), 0)); + } + + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(BBEntry, BBEntry->end()), DL); + + Value *NumThreads = nullptr; + // It is allowed to have a nullptr NumThreads, createParallel handles that. + if (ParRegionInfo.NumThreads) + NumThreads = createScalarCast(ParRegionInfo.NumThreads, OMPBuilder.Int32); + // TODO: support cancellable, binding. + InsertPointTy AfterIP = OMPBuilder.createParallel( + Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, + /* IfCondition */ IfConditionEval, + /* NumThreads */ NumThreads, OMP_PROC_BIND_default, + /* IsCancellable */ false); + + if (!ReductionInfos.empty()) + OMPBuilder.createReductions(BodyIP, BodyAllocaIP, ReductionInfos); + + BranchInst::Create(AfterBB, AfterIP.getBlock()); + + DEBUG_ENABLE(dbgs() << "=== Before Fn\n" << *Fn << "=== End of Before Fn\n"); + OMPBuilder.finalize(Fn); + DEBUG_ENABLE(dbgs() << "=== Finalize Fn\n" + << *Fn << "=== End of Finalize Fn\n"); +} +#endif + +void CGIntrinsicsOpenMP::emitOMPParallelDeviceRuntime( + DSAValueMapTy &DSAValueMap, ValueToValueMapTy *VMap, const DebugLoc &DL, + Function *Fn, BasicBlock *BBEntry, BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB, FinalizeCallbackTy FiniCB, + ParRegionInfoStruct &ParRegionInfo) { + // Extract parallel region + SmallVector CapturedVars; + Function *OutlinedFn = + createOutlinedFunction(DSAValueMap, VMap, Fn, StartBB, EndBB, + CapturedVars, ".omp_outlined_parallel"); + + // Create wrapper for worker threads + SmallVector Params; + // parallelism level, unused? 
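+  // The wrapper takes (int16 parallelism level, int32 thread id), the
+  // signature the device runtime invokes for worker threads from
+  // __kmpc_parallel_51; it recovers the captured variables through
+  // __kmpc_get_shared_variables and forwards them to the real outlined
+  // function.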
+ Params.push_back(OMPBuilder.Int16); + // tid + Params.push_back(OMPBuilder.Int32); + + FunctionType *OutlinedWrapperFnTy = + FunctionType::get(OMPBuilder.Void, Params, /* isVarArgs */ false); + Function *OutlinedWrapperFn = + Function::Create(OutlinedWrapperFnTy, GlobalValue::InternalLinkage, + OutlinedFn->getName() + ".wrapper", M); + BasicBlock *OutlinedWrapperEntryBB = + BasicBlock::Create(M.getContext(), "entry", OutlinedWrapperFn); + + // Code generation for the outlined wrapper function. + OMPBuilder.Builder.SetInsertPoint(OutlinedWrapperEntryBB); + + constexpr const int TIDArgNo = 1; + AllocaInst *TIDAddr = + OMPBuilder.Builder.CreateAlloca(OMPBuilder.Int32, nullptr, ".tid.addr"); + AllocaInst *ZeroAddr = + OMPBuilder.Builder.CreateAlloca(OMPBuilder.Int32, nullptr, ".zero.addr"); + AllocaInst *GlobalArgs = OMPBuilder.Builder.CreateAlloca( + OMPBuilder.Int8PtrPtr, nullptr, "global_args"); + + OMPBuilder.Builder.CreateStore(OutlinedWrapperFn->getArg(TIDArgNo), TIDAddr); + OMPBuilder.Builder.CreateStore(Constant::getNullValue(OMPBuilder.Int32), + ZeroAddr); + FunctionCallee KmpcGetSharedVariables = OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_get_shared_variables); + OMPBuilder.Builder.CreateCall(KmpcGetSharedVariables, {GlobalArgs}); + + SmallVector OutlinedFnArgs; + OutlinedFnArgs.push_back(TIDAddr); + OutlinedFnArgs.push_back(ZeroAddr); + + for (size_t Idx = 0; Idx < CapturedVars.size(); ++Idx) { + Value *LoadGlobalArgs = + OMPBuilder.Builder.CreateLoad(OMPBuilder.Int8PtrPtr, GlobalArgs); + Value *GEP = OMPBuilder.Builder.CreateConstInBoundsGEP1_64( + OMPBuilder.Int8Ptr, LoadGlobalArgs, Idx); + + // Pass firstprivate scalar by value. + if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE && + CapturedVars[Idx] + ->getType() + ->getPointerElementType() + ->isSingleValueType()) { + Type *VPtrElemTy = CapturedVars[Idx]->getType()->getPointerElementType(); + Value *Bitcast = + OMPBuilder.Builder.CreateBitCast(GEP, CapturedVars[Idx]->getType()); + Value *Load = OMPBuilder.Builder.CreateLoad(VPtrElemTy, Bitcast); + // TODO: Runtime expects values in Int64 type, fix with arguments in + // struct. 
+ AllocaInst *TmpInt64 = OMPBuilder.Builder.CreateAlloca( + OMPBuilder.Int64, nullptr, + CapturedVars[Idx]->getName() + "fpriv.byval"); + Value *Cast = OMPBuilder.Builder.CreateBitCast( + TmpInt64, CapturedVars[Idx]->getType()); + OMPBuilder.Builder.CreateStore(Load, Cast); + Value *ConvLoad = + OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, TmpInt64); + OutlinedFnArgs.push_back(ConvLoad); + + continue; + } + + Value *Bitcast = OMPBuilder.Builder.CreateBitCast( + GEP, CapturedVars[Idx]->getType()->getPointerTo()); + Value *Load = + OMPBuilder.Builder.CreateLoad(CapturedVars[Idx]->getType(), Bitcast); + OutlinedFnArgs.push_back(Load); + } + + FunctionCallee OutlinedFnCallee(OutlinedFn->getFunctionType(), OutlinedFn); + + auto *OutlinedCI = + checkCreateCall(OMPBuilder.Builder, OutlinedFnCallee, OutlinedFnArgs); + assert(OutlinedCI && "Expected valid call"); + OMPBuilder.Builder.CreateRetVoid(); + + if (verifyFunction(*OutlinedWrapperFn, &errs())) + FATAL_ERROR("Verification of OutlinedWrapperFn failed!"); + + DEBUG_ENABLE(dbgs() << "=== Dump OutlinedWrapper\n" + << *OutlinedWrapperFn + << "=== End of Dump OutlinedWrapper\n"); + + // Setup the call to kmpc_parallel_51 + BBEntry->getTerminator()->eraseFromParent(); + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(BBEntry, BBEntry->end()), DL); + + uint32_t SrcLocStrSize; + Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize); + OMPBuilder.Builder.restoreIP(Loc.IP); + OMPBuilder.Builder.SetCurrentDebugLocation(Loc.DL); + + // Create the address table of the global data. + // The number of outlined arguments without global_tid, bound_tid. + Value *NumCapturedArgs = + ConstantInt::get(OMPBuilder.SizeTy, CapturedVars.size()); + Type *CapturedVarsAddrsTy = + ArrayType::get(OMPBuilder.Int8Ptr, CapturedVars.size()); + + // TODO: Re-think allocas, move to start of caller. If the caller is outlined + // in an outer OpenMP region, dot naming ensures captured_var_addrs is a + // private value, since it's only used for setting up the call to + // kmpc_parallel_51. + auto PrevIP = OMPBuilder.Builder.saveIP(); + InsertPointTy AllocaIP(&Fn->getEntryBlock(), + Fn->getEntryBlock().getFirstInsertionPt()); + OMPBuilder.Builder.restoreIP(AllocaIP); + Value *CapturedVarsAddrs = OMPBuilder.Builder.CreateAlloca( + CapturedVarsAddrsTy, nullptr, ".captured_var_addrs"); + OMPBuilder.Builder.restoreIP(PrevIP); + + SmallVector GlobalAllocas; + for (size_t Idx = 0; Idx < CapturedVars.size(); ++Idx) { + DEBUG_ENABLE(dbgs() << "CapturedVar " << Idx << " " << *CapturedVars[Idx] + << "\n"); + Value *GEP = OMPBuilder.Builder.CreateConstInBoundsGEP2_64( + CapturedVarsAddrsTy, CapturedVarsAddrs, 0, Idx); + + // Pass firstprivate scalar by value. + if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE && + CapturedVars[Idx] + ->getType() + ->getPointerElementType() + ->isSingleValueType()) { + // TODO: check type conversions. + Value *BitCast = OMPBuilder.Builder.CreateBitCast(CapturedVars[Idx], + OMPBuilder.Int64Ptr); + Value *Load = OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, BitCast); + Value *IntToPtr = + OMPBuilder.Builder.CreateIntToPtr(Load, OMPBuilder.Int8Ptr); + OMPBuilder.Builder.CreateStore(IntToPtr, GEP); + + continue; + } + + // Allocate from global memory if the pointer is not globalized (not in the + // global address space). 
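+    // Non-globalized captures are copied into device shared memory with
+    // __kmpc_alloc_shared so worker threads can address them; the matching
+    // __kmpc_free_shared calls are emitted once __kmpc_parallel_51 returns.
+    // Already-globalized pointers are passed through as-is.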
+ FunctionCallee KmpcAllocShared = + OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_alloc_shared); + assert(CapturedVars[Idx]->getType()->isPointerTy() && + "Expected pointer type"); + + if (DeviceGlobalizedValues.contains(CapturedVars[Idx])) { + Value *Bitcast = OMPBuilder.Builder.CreateBitCast(CapturedVars[Idx], + OMPBuilder.Int8Ptr); + OMPBuilder.Builder.CreateStore(Bitcast, GEP); + } else { + Type *AllocTy = CapturedVars[Idx]->getType()->getPointerElementType(); + Value *Size = ConstantInt::get( + OMPBuilder.SizeTy, M.getDataLayout().getTypeAllocSize(AllocTy)); + CallBase *GlobalAlloc = + OMPBuilder.Builder.CreateCall(KmpcAllocShared, {Size}); + GlobalAlloc->addRetAttr( + llvm::Attribute::get(M.getContext(), llvm::Attribute::Alignment, 16)); + GlobalAllocas.push_back(GlobalAlloc); + // TODO: this assumes the type is trivally copyable, use the copy + // constructor for more complex types. + OMPBuilder.Builder.CreateMemCpy( + GlobalAlloc, GlobalAlloc->getPointerAlignment(M.getDataLayout()), + CapturedVars[Idx], + CapturedVars[Idx]->getPointerAlignment(M.getDataLayout()), Size); + + OMPBuilder.Builder.CreateStore(GlobalAlloc, GEP); + } + } + + Value *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize); + Value *ThreadID = OMPBuilder.getOrCreateThreadID(Ident); + + Value *IfCondition = ParRegionInfo.IfCondition; + Value *NumThreads = ParRegionInfo.NumThreads; + if (!IfCondition) + // Set condition to 1 (execute in parallel) if not set. + IfCondition = ConstantInt::get(OMPBuilder.Int32, 1); + + if (!NumThreads) + NumThreads = ConstantInt::get(OMPBuilder.Int32, -1); + else + NumThreads = + OMPBuilder.Builder.CreateTruncOrBitCast(NumThreads, OMPBuilder.Int32); + + assert(NumThreads && "Expected non-null NumThreads"); + + FunctionCallee KmpcParallel51 = + OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_parallel_51); + + // Set proc_bind to -1 by default as it is unused. 
+ assert(Ident && "Expected non-null Ident"); + assert(ThreadID && "Expected non-null ThreadID"); + assert(IfCondition && "Expected non-null IfCondition"); + assert(NumThreads && "Expected non-null NumThreads"); + assert(OutlinedWrapperFn && "Expected non-null OutlinedWrapperFn"); + assert(CapturedVarsAddrs && "Expected non-null CapturedVarsAddrs"); + assert(NumCapturedArgs && "Expected non-null NumCapturedArgs"); + + Value *ProcBind = OMPBuilder.Builder.getInt32(-1); + Value *OutlinedFnBitcast = + OMPBuilder.Builder.CreateBitCast(OutlinedFn, OMPBuilder.VoidPtr); + Value *OutlinedWrapperFnBitcast = + OMPBuilder.Builder.CreateBitCast(OutlinedWrapperFn, OMPBuilder.VoidPtr); + Value *CapturedVarAddrsBitcast = OMPBuilder.Builder.CreateBitCast( + CapturedVarsAddrs, OMPBuilder.VoidPtrPtr); + + SmallVector Args = {Ident, + ThreadID, + IfCondition, + NumThreads, + ProcBind, + OutlinedFnBitcast, + OutlinedWrapperFnBitcast, + CapturedVarAddrsBitcast, + NumCapturedArgs}; + + auto *CallKmpcParallel51 = + checkCreateCall(OMPBuilder.Builder, KmpcParallel51, Args); + assert(CallKmpcParallel51 && + "Expected non-null call instr from code generation"); + + FunctionCallee KmpcFreeShared = + OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_free_shared); + for (Value *GA : GlobalAllocas) { + Type *AllocTy = GA->getType()->getPointerElementType(); + Value *Size = ConstantInt::get(OMPBuilder.SizeTy, + M.getDataLayout().getTypeAllocSize(AllocTy)); + auto *CI = checkCreateCall(OMPBuilder.Builder, KmpcFreeShared, {GA, Size}); + assert(CI && "Expected valid call"); + } + + OMPBuilder.Builder.CreateBr(AfterBB); + + DEBUG_ENABLE(dbgs() << "=== Dump OuterFn\n" + << *Fn << "=== End of Dump OuterFn\n"); + + if (verifyFunction(*Fn, &errs())) + FATAL_ERROR("Verification of OuterFn failed!"); +} + +FunctionCallee CGIntrinsicsOpenMP::getKmpcForStaticInit(Type *Ty) { + DEBUG_ENABLE(dbgs() << "Type " << *Ty << "\n"); + unsigned Bitwidth = Ty->getIntegerBitWidth(); + DEBUG_ENABLE(dbgs() << "Bitwidth " << Bitwidth << "\n"); + if (Bitwidth == 32) + return OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_for_static_init_4u); + if (Bitwidth == 64) + return OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_for_static_init_8u); + + FATAL_ERROR("unknown OpenMP loop iterator bitwidth"); +} + +FunctionCallee CGIntrinsicsOpenMP::getKmpcDistributeStaticInit(Type *Ty) { + DEBUG_ENABLE(dbgs() << "Type " << *Ty << "\n"); + unsigned Bitwidth = Ty->getIntegerBitWidth(); + DEBUG_ENABLE(dbgs() << "Bitwidth " << Bitwidth << "\n"); + if (Bitwidth == 32) + return OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_distribute_static_init_4u); + if (Bitwidth == 64) + return OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_distribute_static_init_8u); + + FATAL_ERROR("unknown OpenMP loop iterator bitwidth"); +} + +void CGIntrinsicsOpenMP::emitLoop(DSAValueMapTy &DSAValueMap, + OMPLoopInfoStruct &OMPLoopInfo, + BasicBlock *StartBB, BasicBlock *ExitBB, + bool IsStandalone, bool IsDistribute, + bool IsDistributeParallelFor, + OMPDistributeInfoStruct *OMPDistributeInfo) { + DEBUG_ENABLE(dbgs() << "OMPLoopInfo.IV " << *OMPLoopInfo.IV << "\n"); + DEBUG_ENABLE(dbgs() << "OMPLoopInfo.UB " << *OMPLoopInfo.UB << "\n"); + assert(OMPLoopInfo.IV && "Expected non-null IV"); + assert(OMPLoopInfo.UB && "Expected non-null UB"); + + assert(static_cast(OMPLoopInfo.Sched) && + "Expected non-zero loop schedule"); + + BasicBlock *PreHeader = StartBB; + PreHeader->setName("omp.for.preheader"); + BasicBlock *Header = 
PreHeader->getUniqueSuccessor(); + assert(Header && "Expected unique successor header"); + Header->setName("omp.for.cond"); + BasicBlock *Exit = ExitBB; + Exit->setName("omp.for.exit"); + assert(Header && "Expected unique successor from PreHeader to Header"); + DEBUG_ENABLE(dbgs() << "=== PreHeader\n" + << *PreHeader << "=== End of PreHeader\n"); + DEBUG_ENABLE(dbgs() << "=== Header\n" << *Header << "=== End of Header\n"); + assert(Header->getTerminator()->getNumSuccessors() == 2 && + "Expected 2 successors (loopbody, exit)"); + BasicBlock *HeaderSuccBBs[2] = {Header->getTerminator()->getSuccessor(0), + Header->getTerminator()->getSuccessor(1)}; + BasicBlock *LoopBody = + (HeaderSuccBBs[0] == Exit ? HeaderSuccBBs[1] : HeaderSuccBBs[0]); + assert(LoopBody && "Expected non-null loop body basic block\n"); + + assert(Header->hasNPredecessors(2) && + "Expected exactly 2 predecessors to loop header (preheader, latch)"); + BasicBlock *HeaderPredBBs[2] = {*predecessors(Header).begin(), + *std::next(predecessors(Header).begin(), 1)}; + BasicBlock *Latch = + (HeaderPredBBs[0] == PreHeader ? HeaderPredBBs[1] : HeaderPredBBs[0]); + Latch->setName("omp.for.inc"); + assert(Latch && "Expected latch basicblock"); + + auto ClearBlockInstructions = [](BasicBlock *BB) { + // Remove all instructions in the BB, iterate backwards to avoid + // dangling uses for safe deletion. The BB becomes malformed and + // requires a terminator added. + while (!BB->empty()) { + Instruction &I = BB->back(); + assert(I.getNumUses() == 0 && "Expected no uses to delete"); + I.eraseFromParent(); + } + }; + // Clear Latch, Header. + ClearBlockInstructions(Latch); + ClearBlockInstructions(Header); + + DEBUG_ENABLE(dbgs() << "=== Exit\n" << *Exit << "=== End of Exit\n"); + + Type *IVTy = OMPLoopInfo.IV->getType()->getPointerElementType(); + SmallVector ReductionInfos; + + FunctionCallee LoopStaticInit = ((IsDistribute && isOpenMPDeviceRuntime()) + ? getKmpcDistributeStaticInit(IVTy) + : getKmpcForStaticInit(IVTy)); + FunctionCallee LoopStaticFini = + ((IsDistribute && isOpenMPDeviceRuntime()) + ? OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_distribute_static_fini) + : OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___kmpc_for_static_fini)); + + const DebugLoc DL = PreHeader->getTerminator()->getDebugLoc(); + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(PreHeader, PreHeader->getTerminator()->getIterator()), DL); + + uint32_t SrcLocStrSize; + Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *SrcLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize); + Value *ThreadNum = nullptr; + + // Create allocas for static init values. + // TODO: Move the AllocaIP to the start of the containing function. + InsertPointTy AllocaIP(PreHeader, PreHeader->getFirstInsertionPt()); + Type *I32Type = Type::getInt32Ty(M.getContext()); + OMPBuilder.Builder.restoreIP(AllocaIP); + Value *PLastIter = + OMPBuilder.Builder.CreateAlloca(I32Type, nullptr, "omp.for.is_last"); + // Value *PStart = OMPBuilder.Builder.CreateAlloca(IVTy, nullptr, + // "omp.for.start"); + Value *PLowerBound = + OMPBuilder.Builder.CreateAlloca(IVTy, nullptr, "omp.for.lb"); + Value *PStride = + OMPBuilder.Builder.CreateAlloca(IVTy, nullptr, "omp.for.stride"); + Value *PUpperBound = + OMPBuilder.Builder.CreateAlloca(IVTy, nullptr, "omp.for.ub"); + + // Store distribute LB, UB to be used by combined loop constructs. 
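+  // For combined "distribute parallel for" constructs these saved bounds
+  // presumably seed the inner worksharing loop, so each team only iterates
+  // over the chunk assigned to it by the distribute schedule.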
+ if (IsDistribute) + if (OMPDistributeInfo) { + OMPDistributeInfo->LB = PLowerBound; + OMPDistributeInfo->UB = PUpperBound; + } + + // Create BasicBlock structure. + BasicBlock *MinUBBlock = + PreHeader->splitBasicBlock(PreHeader->getTerminator(), "omp.for.min.ub"); + BasicBlock *CapUBBlock = MinUBBlock->splitBasicBlock( + MinUBBlock->getTerminator(), "omp.for.cap.ub"); + BasicBlock *SetupLoopBlock = + CapUBBlock->splitBasicBlock(CapUBBlock->getTerminator(), "omp.for.setup"); + BasicBlock *ForEndBB = + ExitBB->splitBasicBlockBefore(ExitBB->getFirstInsertionPt()); + ForEndBB->setName("omp.for.end"); + + BasicBlock *DispatchCondBB = nullptr; + BasicBlock *DispatchIncBB = nullptr; + BasicBlock *DispatchEndBB = nullptr; + if (OMPLoopInfo.Sched == OMPScheduleType::StaticChunked || + OMPLoopInfo.Sched == OMPScheduleType::DistributeChunked) { + DispatchCondBB = SetupLoopBlock->splitBasicBlock( + SetupLoopBlock->getTerminator(), "omp.dispatch.cond"); + DispatchIncBB = ExitBB->splitBasicBlockBefore(ExitBB->getFirstInsertionPt(), + "omp.dispatch.inc"); + DispatchEndBB = ExitBB->splitBasicBlockBefore(ExitBB->getFirstInsertionPt(), + "omp.dispatch.end"); + } + + Constant *Zero_I32 = ConstantInt::get(I32Type, 0); + Constant *One = ConstantInt::get(IVTy, 1); + + // Extend PreHeader + { + OMPBuilder.Builder.SetInsertPoint(PreHeader->getTerminator()); + // Store the initial normalized upper bound to PUpperBound. + Value *LoadUB = OMPBuilder.Builder.CreateLoad(IVTy, OMPLoopInfo.UB); + OMPBuilder.Builder.CreateStore(LoadUB, PUpperBound); + + Value *LoadLB = OMPBuilder.Builder.CreateLoad(IVTy, OMPLoopInfo.LB); + OMPBuilder.Builder.CreateStore(LoadLB, PLowerBound); + OMPBuilder.Builder.CreateStore(One, PStride); + OMPBuilder.Builder.CreateStore(Zero_I32, PLastIter); + + // If Chunk is not specified (nullptr), default to one, complying with + // the OpenMP specification. + if (!OMPLoopInfo.Chunk) + OMPLoopInfo.Chunk = One; + Value *ChunkCast = OMPBuilder.Builder.CreateIntCast(OMPLoopInfo.Chunk, IVTy, + /*isSigned*/ false); + + Constant *SchedulingType = + ConstantInt::get(I32Type, static_cast(OMPLoopInfo.Sched)); + + ThreadNum = OMPBuilder.getOrCreateThreadID(SrcLoc); + DEBUG_ENABLE(dbgs() << "=== SchedulingType " << *SchedulingType << "\n"); + DEBUG_ENABLE(dbgs() << "=== PLowerBound " << *PLowerBound << "\n"); + DEBUG_ENABLE(dbgs() << "=== PUpperBound " << *PUpperBound << "\n"); + DEBUG_ENABLE(dbgs() << "=== PStride " << *PStride << "\n"); + DEBUG_ENABLE(dbgs() << "=== Incr " << *One << "\n"); + DEBUG_ENABLE(dbgs() << "=== Schedule " + << static_cast(OMPLoopInfo.Sched) << "\n"); + DEBUG_ENABLE(dbgs() << "=== Chunk " << *ChunkCast << "\n"); + OMPBuilder.Builder.CreateCall( + LoopStaticInit, {SrcLoc, ThreadNum, SchedulingType, PLastIter, + PLowerBound, PUpperBound, PStride, One, ChunkCast}); + } + + // Create MinUBBlock. 
+ { + OMPBuilder.Builder.SetInsertPoint(MinUBBlock, + MinUBBlock->getFirstInsertionPt()); + auto *LoadUB = OMPBuilder.Builder.CreateLoad(IVTy, PUpperBound); + auto *LoadGlobalUB = OMPBuilder.Builder.CreateLoad(IVTy, OMPLoopInfo.UB); + auto *Cond = OMPBuilder.Builder.CreateICmpUGT(LoadUB, LoadGlobalUB); + OMPBuilder.Builder.CreateCondBr(Cond, CapUBBlock, SetupLoopBlock); + MinUBBlock->getTerminator()->eraseFromParent(); + } + + // Create CapUBBlock + { + OMPBuilder.Builder.SetInsertPoint(CapUBBlock, + CapUBBlock->getFirstInsertionPt()); + auto *LoadGlobalUB = OMPBuilder.Builder.CreateLoad(IVTy, OMPLoopInfo.UB); + OMPBuilder.Builder.CreateStore(LoadGlobalUB, PUpperBound); + } + + // Create SetupLoopBlock + { + OMPBuilder.Builder.SetInsertPoint(SetupLoopBlock, + SetupLoopBlock->getFirstInsertionPt()); + Value *LoadLB = OMPBuilder.Builder.CreateLoad(IVTy, PLowerBound); + OMPBuilder.Builder.CreateStore(LoadLB, OMPLoopInfo.IV); + } + + // Create Header + { + auto SaveIP = OMPBuilder.Builder.saveIP(); + OMPBuilder.Builder.SetInsertPoint(Header); + auto *LoadIV = OMPBuilder.Builder.CreateLoad(IVTy, OMPLoopInfo.IV); + auto *LoadUB = OMPBuilder.Builder.CreateLoad(IVTy, PUpperBound); + auto *Cond = OMPBuilder.Builder.CreateICmpSLE(LoadIV, LoadUB); + OMPBuilder.Builder.CreateCondBr(Cond, LoopBody, ForEndBB); + OMPBuilder.Builder.restoreIP(SaveIP); + } + + // Create Latch. + { + auto SaveIP = OMPBuilder.Builder.saveIP(); + OMPBuilder.Builder.SetInsertPoint(Latch); + Value *LoadIV = OMPBuilder.Builder.CreateLoad(IVTy, OMPLoopInfo.IV); + if (IsDistribute && IsDistributeParallelFor) { + Value *LoadStride = OMPBuilder.Builder.CreateLoad(IVTy, PStride); + Value *Inc = OMPBuilder.Builder.CreateAdd(LoadIV, LoadStride); + OMPBuilder.Builder.CreateStore(Inc, OMPLoopInfo.IV); + } else { + Value *Inc = OMPBuilder.Builder.CreateAdd(LoadIV, One); + OMPBuilder.Builder.CreateStore(Inc, OMPLoopInfo.IV); + } + + // If it's a combined "distribute parallel for" with static/distribute + // chunked then fall through to the strided dispatch increment. 
+ if (IsDistributeParallelFor && + ((OMPLoopInfo.Sched == OMPScheduleType::StaticChunked) || + (OMPLoopInfo.Sched == OMPScheduleType::DistributeChunked))) + OMPBuilder.Builder.CreateBr(DispatchIncBB); + else + OMPBuilder.Builder.CreateBr(Header); + + OMPBuilder.Builder.restoreIP(SaveIP); + } + + assert(ThreadNum && "Expected non-null threadnum"); + if (OMPLoopInfo.Sched == OMPScheduleType::Static || + OMPLoopInfo.Sched == OMPScheduleType::Distribute) { + OMPBuilder.Builder.SetInsertPoint(ForEndBB, + ForEndBB->getFirstInsertionPt()); + OMPBuilder.Builder.CreateCall(LoopStaticFini, {SrcLoc, ThreadNum}); + } else if (OMPLoopInfo.Sched == OMPScheduleType::StaticChunked || + OMPLoopInfo.Sched == OMPScheduleType::DistributeChunked) { + assert(DispatchCondBB && "Expected non-null dispatch cond bb"); + assert(DispatchIncBB && "Expected non-null dispatch inc bb"); + assert(DispatchEndBB && "Expected non-null dispatch end bb"); + // Create DispatchCond + { + auto SaveIP = OMPBuilder.Builder.saveIP(); + DispatchCondBB->getTerminator()->eraseFromParent(); + OMPBuilder.Builder.SetInsertPoint(DispatchCondBB); + auto *LoadLB = OMPBuilder.Builder.CreateLoad(IVTy, PLowerBound); + OMPBuilder.Builder.CreateStore(LoadLB, OMPLoopInfo.IV); + auto *LoadIV = OMPBuilder.Builder.CreateLoad(IVTy, OMPLoopInfo.IV); + auto *LoadUB = OMPBuilder.Builder.CreateLoad(IVTy, PUpperBound); + auto *Cond = OMPBuilder.Builder.CreateICmpSLE(LoadIV, LoadUB); + OMPBuilder.Builder.CreateCondBr(Cond, Header, DispatchEndBB); + OMPBuilder.Builder.restoreIP(SaveIP); + } + // Create DispatchIncBB. + { + auto SaveIP = OMPBuilder.Builder.saveIP(); + DispatchIncBB->getTerminator()->eraseFromParent(); + OMPBuilder.Builder.SetInsertPoint(DispatchIncBB); + auto *LoadLB = OMPBuilder.Builder.CreateLoad(IVTy, PLowerBound); + auto *LoadStride = OMPBuilder.Builder.CreateLoad(IVTy, PStride); + auto *LBPlusStride = OMPBuilder.Builder.CreateAdd(LoadLB, LoadStride); + OMPBuilder.Builder.CreateStore(LBPlusStride, PLowerBound); + + auto *LoadUB = OMPBuilder.Builder.CreateLoad(IVTy, PUpperBound); + auto *UBPlusStride = OMPBuilder.Builder.CreateAdd(LoadUB, LoadStride); + OMPBuilder.Builder.CreateStore(UBPlusStride, PUpperBound); + + // OMPBuilder.Builder.CreateBr(DispatchCondBB); + OMPBuilder.Builder.CreateBr(MinUBBlock); + OMPBuilder.Builder.restoreIP(SaveIP); + } + // Create ForEndBB + { + ForEndBB->getTerminator()->eraseFromParent(); + OMPBuilder.Builder.SetInsertPoint(ForEndBB); + OMPBuilder.Builder.CreateBr(DispatchIncBB); + } + + // Create DispatchEndBB + { + OMPBuilder.Builder.SetInsertPoint(DispatchEndBB, + DispatchEndBB->getFirstInsertionPt()); + OMPBuilder.Builder.CreateCall(LoopStaticFini, {SrcLoc, ThreadNum}); + } + } else { + FATAL_ERROR("Unknown loop schedule type"); + } + + OpenMPIRBuilder::OutlineInfo OI; + OI.EntryBB = PreHeader; + OI.ExitBB = Exit; + SmallPtrSet BlockSet; + SmallVector BlockVector; + OI.collectBlocks(BlockSet, BlockVector); + + // TODO: De-duplicate privatization code. + auto PrivatizeWithReductions = [&]() { + auto CurrentIP = OMPBuilder.Builder.saveIP(); + for (auto &It : DSAValueMap) { + Value *Orig = It.first; + DSAType DSA = It.second.Type; + FunctionCallee CopyConstructor = It.second.CopyConstructor; + Value *ReplacementValue = nullptr; + Type *VTy = Orig->getType()->getPointerElementType(); + + if (DSA == DSA_SHARED) + continue; + + // Lastprivates are handled later, need elaborate codegen. 
+ if (DSA == DSA_LASTPRIVATE) + continue; + + // Store previous uses to set them to the ReplacementValue after + // privatization codegen. + SetVector Uses; + for (Use &U : Orig->uses()) + if (auto *UserI = dyn_cast(U.getUser())) + if (BlockSet.count(UserI->getParent())) + Uses.insert(&U); + + OMPBuilder.Builder.restoreIP(AllocaIP); + if (DSA == DSA_PRIVATE) { + ReplacementValue = OMPBuilder.Builder.CreateAlloca( + VTy, /*ArraySize */ nullptr, Orig->getName() + ".for.priv"); + OMPBuilder.Builder.CreateStore(Constant::getNullValue(VTy), + ReplacementValue); + } else if (DSA == DSA_FIRSTPRIVATE) { + Value *V = OMPBuilder.Builder.CreateLoad( + VTy, Orig, Orig->getName() + ".for.firstpriv.reload"); + ReplacementValue = OMPBuilder.Builder.CreateAlloca( + VTy, /*ArraySize */ nullptr, + Orig->getName() + ".for.firstpriv.copy"); + if (CopyConstructor) { + Value *Copy = OMPBuilder.Builder.CreateCall(CopyConstructor, {V}); + OMPBuilder.Builder.CreateStore(Copy, ReplacementValue); + } else + OMPBuilder.Builder.CreateStore(V, ReplacementValue); + } else if (DSA == DSA_REDUCTION_ADD) { + ReplacementValue = + CGReduction::emitInitAndAppendInfo( + OMPBuilder.Builder, OMPBuilder.Builder.saveIP(), Orig, + ReductionInfos); + } else if (DSA == DSA_REDUCTION_SUB) { + ReplacementValue = + CGReduction::emitInitAndAppendInfo( + OMPBuilder.Builder, OMPBuilder.Builder.saveIP(), Orig, + ReductionInfos); + } else + FATAL_ERROR("Unsupported privatization"); + + assert(ReplacementValue && "Expected non-null ReplacementValue"); + + for (Use *UPtr : Uses) + UPtr->set(ReplacementValue); + } + + OMPBuilder.Builder.restoreIP(CurrentIP); + }; + + auto EmitLastPrivate = [&](InsertPointTy CodeGenIP) { + auto ShouldReplace = [&BlockSet](Use &U) { + if (auto *UserI = dyn_cast(U.getUser())) + if (BlockSet.count(UserI->getParent())) + return true; + + return false; + }; + + for (auto &It : DSAValueMap) { + Value *Orig = It.first; + DSAType DSA = It.second.Type; + + if (DSA != DSA_LASTPRIVATE) + continue; + + FunctionCallee CopyConstructor = It.second.CopyConstructor; + Value *ReplacementValue = nullptr; + Type *VTy = Orig->getType()->getPointerElementType(); + + OMPBuilder.Builder.restoreIP(AllocaIP); + ReplacementValue = OMPBuilder.Builder.CreateAlloca( + VTy, /*ArraySize */ nullptr, Orig->getName() + ".for.lastpriv"); + OMPBuilder.Builder.CreateStore(Constant::getNullValue(VTy), + ReplacementValue); + Orig->replaceUsesWithIf(ReplacementValue, ShouldReplace); + + BasicBlock *InsertBB = CodeGenIP.getBlock(); + + BasicBlock *LastPrivCond = + SplitBlock(InsertBB, InsertBB->getTerminator()); + LastPrivCond->setName("omp.for.lastpriv.cond"); + BasicBlock *LastPrivThen = + SplitBlock(LastPrivCond, LastPrivCond->getTerminator()); + LastPrivThen->setName("omp.for.lastpriv.then"); + BasicBlock *LastPrivEnd = + SplitBlock(LastPrivThen, LastPrivThen->getTerminator()); + LastPrivEnd->setName("omp.for.lastpriv.end"); + OMPBuilder.Builder.SetInsertPoint(LastPrivThen->getTerminator()); + Value *Load = OMPBuilder.Builder.CreateLoad(VTy, ReplacementValue); + if (CopyConstructor) { + Value *Copy = OMPBuilder.Builder.CreateCall(CopyConstructor, {Load}); + OMPBuilder.Builder.CreateStore(Copy, Orig); + } else + OMPBuilder.Builder.CreateStore(Load, Orig); + + LastPrivCond->getTerminator()->eraseFromParent(); + OMPBuilder.Builder.SetInsertPoint(LastPrivCond); + Value *PLastIterLoad = + OMPBuilder.Builder.CreateLoad(OMPBuilder.Int32, PLastIter); + Value *Cond = OMPBuilder.Builder.CreateICmpNE( + PLastIterLoad, ConstantInt::get(OMPBuilder.Int32, 0)); + 
OMPBuilder.Builder.CreateCondBr(Cond, LastPrivThen, LastPrivEnd); + } + }; + + BasicBlock *FiniBB = + (OMPLoopInfo.Sched == OMPScheduleType::Static) ? ForEndBB : DispatchEndBB; + EmitLastPrivate(InsertPointTy(FiniBB, FiniBB->end())); + + // Emit reductions, barrier, privatize if standalone. + if (IsStandalone) { + PrivatizeWithReductions(); + if (!ReductionInfos.empty()) { + OMPBuilder.Builder.SetInsertPoint(ForEndBB->getTerminator()); + OMPBuilder.createReductions(OpenMPIRBuilder::LocationDescription( + OMPBuilder.Builder.saveIP(), Loc.DL), + AllocaIP, ReductionInfos); + } + + OMPBuilder.Builder.SetInsertPoint(ExitBB->getTerminator()); + OMPBuilder.createBarrier(OpenMPIRBuilder::LocationDescription( + OMPBuilder.Builder.saveIP(), Loc.DL), + omp::Directive::OMPD_for, + /* ForceSimpleCall */ false, + /* CheckCancelFlag */ false); + } + + if (verifyFunction(*PreHeader->getParent(), &errs())) + FATAL_ERROR("Verification of omp for lowering failed!"); +} + +void CGIntrinsicsOpenMP::emitOMPFor(DSAValueMapTy &DSAValueMap, + OMPLoopInfoStruct &OMPLoopInfo, + BasicBlock *StartBB, BasicBlock *ExitBB, + bool IsStandalone, + bool IsDistributeParallelFor) { + // Set default loop schedule. + if (static_cast(OMPLoopInfo.Sched) == 0) + OMPLoopInfo.Sched = + (isOpenMPDeviceRuntime() ? OMPScheduleType::StaticChunked + : OMPScheduleType::Static); + + emitLoop(DSAValueMap, OMPLoopInfo, StartBB, ExitBB, IsStandalone, false, + IsDistributeParallelFor); +} + +void CGIntrinsicsOpenMP::emitOMPTask(DSAValueMapTy &DSAValueMap, Function *Fn, + BasicBlock *BBEntry, BasicBlock *StartBB, + BasicBlock *EndBB, BasicBlock *AfterBB) { + // Define types. + // ************** START TYPE DEFINITION ************** // + enum { + TiedFlag = 0x1, + FinalFlag = 0x2, + DestructorsFlag = 0x8, + PriorityFlag = 0x20, + DetachableFlag = 0x40, + }; + + // This is a union for priority/firstprivate destructors, use the + // routine entry pointer to allocate space since it is larger than + // Int32Ty for priority, see kmp.h. Unused for now. + StructType *KmpCmplrdataTy = + StructType::create({OMPBuilder.TaskRoutineEntryPtr}); + StructType *KmpTaskTTy = + StructType::create({OMPBuilder.VoidPtr, OMPBuilder.TaskRoutineEntryPtr, + OMPBuilder.Int32, KmpCmplrdataTy, KmpCmplrdataTy}, + "struct.kmp_task_t"); + Type *KmpTaskTPtrTy = KmpTaskTTy->getPointerTo(); + + FunctionCallee KmpcOmpTaskAlloc = + OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_omp_task_alloc); + SmallVector SharedsTy; + SmallVector PrivatesTy; + for (auto &It : DSAValueMap) { + Value *OriginalValue = It.first; + if (It.second.Type == DSA_SHARED) + SharedsTy.push_back(OriginalValue->getType()); + else if (It.second.Type == DSA_PRIVATE || + It.second.Type == DSA_FIRSTPRIVATE) { + assert(isa(OriginalValue->getType()) && + "Expected private, firstprivate value with pointer type"); + // Store a copy of the value, thus get the pointer element type. 
+ PrivatesTy.push_back(OriginalValue->getType()->getPointerElementType()); + } else + FATAL_ERROR("Unknown DSA type"); + } + + StructType *KmpSharedsTTy = nullptr; + if (SharedsTy.empty()) + KmpSharedsTTy = StructType::create(M.getContext(), "struct.kmp_shareds"); + else + KmpSharedsTTy = StructType::create(SharedsTy, "struct.kmp_shareds"); + assert(KmpSharedsTTy && "Expected non-null KmpSharedsTTy"); + Type *KmpSharedsTPtrTy = KmpSharedsTTy->getPointerTo(); + StructType *KmpPrivatesTTy = + StructType::create(PrivatesTy, "struct.kmp_privates"); + Type *KmpPrivatesTPtrTy = KmpPrivatesTTy->getPointerTo(); + StructType *KmpTaskTWithPrivatesTy = StructType::create( + {KmpTaskTTy, KmpPrivatesTTy}, "struct.kmp_task_t_with_privates"); + Type *KmpTaskTWithPrivatesPtrTy = KmpTaskTWithPrivatesTy->getPointerTo(); + + // Declare the task entry function. + Function *TaskEntryFn = Function::Create( + OMPBuilder.TaskRoutineEntry, GlobalValue::InternalLinkage, + Fn->getAddressSpace(), Fn->getName() + ".omp_task_entry", &M); + // Name arguments. + TaskEntryFn->getArg(0)->setName(".global_tid"); + TaskEntryFn->getArg(1)->setName(".task_t_with_privates"); + + // Declare the task outlined function. + FunctionType *TaskOutlinedFnTy = + FunctionType::get(OMPBuilder.Void, + {OMPBuilder.Int32, OMPBuilder.Int32Ptr, + OMPBuilder.VoidPtr, KmpTaskTPtrTy, KmpSharedsTPtrTy}, + /*isVarArg=*/false); + Function *TaskOutlinedFn = Function::Create( + TaskOutlinedFnTy, GlobalValue::InternalLinkage, Fn->getAddressSpace(), + Fn->getName() + ".omp_task_outlined", &M); + TaskOutlinedFn->getArg(0)->setName(".global_tid"); + TaskOutlinedFn->getArg(1)->setName(".part_id"); + TaskOutlinedFn->getArg(2)->setName(".privates"); + TaskOutlinedFn->getArg(3)->setName(".task.data"); + TaskOutlinedFn->getArg(4)->setName(".shareds"); + + // ************** END TYPE DEFINITION ************** // + + // Emit kmpc_omp_task_alloc, kmpc_omp_task + { + const DebugLoc DL = BBEntry->getTerminator()->getDebugLoc(); + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(BBEntry, BBEntry->getTerminator()->getIterator()), DL); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *SrcLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize); + // TODO: parse clauses, for now fix flags to tied + unsigned TaskFlags = TiedFlag; + Value *SizeofShareds = nullptr; + if (KmpSharedsTTy->isEmptyTy()) + SizeofShareds = OMPBuilder.Builder.getInt64(0); + else + SizeofShareds = OMPBuilder.Builder.getInt64( + M.getDataLayout().getTypeAllocSize(KmpSharedsTTy)); + Value *SizeofKmpTaskTWithPrivates = OMPBuilder.Builder.getInt64( + M.getDataLayout().getTypeAllocSize(KmpTaskTWithPrivatesTy)); + OMPBuilder.Builder.SetInsertPoint(BBEntry, BBEntry->getFirstInsertionPt()); + Value *ThreadNum = OMPBuilder.getOrCreateThreadID(SrcLoc); + Value *KmpTaskTWithPrivatesVoidPtr = OMPBuilder.Builder.CreateCall( + KmpcOmpTaskAlloc, + {SrcLoc, ThreadNum, OMPBuilder.Builder.getInt32(TaskFlags), + SizeofKmpTaskTWithPrivates, SizeofShareds, TaskEntryFn}, + ".task.data"); + Value *KmpTaskTWithPrivates = OMPBuilder.Builder.CreateBitCast( + KmpTaskTWithPrivatesVoidPtr, KmpTaskTWithPrivatesPtrTy); + + const unsigned KmpTaskTIdx = 0; + const unsigned KmpSharedsIdx = 0; + Value *KmpTaskT = OMPBuilder.Builder.CreateStructGEP( + KmpTaskTWithPrivatesTy, KmpTaskTWithPrivates, KmpTaskTIdx); + Value *KmpSharedsGEP = + OMPBuilder.Builder.CreateStructGEP(KmpTaskTTy, KmpTaskT, KmpSharedsIdx); + Value *KmpSharedsVoidPtr = + 
OMPBuilder.Builder.CreateLoad(OMPBuilder.VoidPtr, KmpSharedsGEP); + Value *KmpShareds = + OMPBuilder.Builder.CreateBitCast(KmpSharedsVoidPtr, KmpSharedsTPtrTy); + const unsigned KmpPrivatesIdx = 1; + Value *KmpPrivates = OMPBuilder.Builder.CreateStructGEP( + KmpTaskTWithPrivatesTy, KmpTaskTWithPrivates, KmpPrivatesIdx); + + // Store shareds by reference, firstprivates by value, in task data + // storage. + unsigned SharedsGEPIdx = 0; + unsigned PrivatesGEPIdx = 0; + for (auto &It : DSAValueMap) { + Value *OriginalValue = It.first; + DSAType DSA = It.second.Type; + FunctionCallee CopyConstructor = It.second.CopyConstructor; + if (DSA == DSA_SHARED) { + Value *SharedGEP = OMPBuilder.Builder.CreateStructGEP( + KmpSharedsTTy, KmpShareds, SharedsGEPIdx, + OriginalValue->getName() + ".task.shared"); + OMPBuilder.Builder.CreateStore(OriginalValue, SharedGEP); + ++SharedsGEPIdx; + } else if (DSA == DSA_FIRSTPRIVATE) { + Value *FirstprivateGEP = OMPBuilder.Builder.CreateStructGEP( + KmpPrivatesTTy, KmpPrivates, PrivatesGEPIdx, + OriginalValue->getName() + ".task.firstprivate"); + Value *Load = OMPBuilder.Builder.CreateLoad( + OriginalValue->getType()->getPointerElementType(), OriginalValue); + if (CopyConstructor) { + Value *Copy = OMPBuilder.Builder.CreateCall(CopyConstructor, {Load}); + OMPBuilder.Builder.CreateStore(Copy, FirstprivateGEP); + } else + OMPBuilder.Builder.CreateStore(Load, FirstprivateGEP); + ++PrivatesGEPIdx; + } else if (DSA == DSA_PRIVATE) + ++PrivatesGEPIdx; + } + + FunctionCallee KmpcOmpTask = + OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_omp_task); + OMPBuilder.Builder.CreateCall( + KmpcOmpTask, {SrcLoc, ThreadNum, KmpTaskTWithPrivatesVoidPtr}); + } + + // Emit task entry function. + { + BasicBlock *TaskEntryBB = + BasicBlock::Create(M.getContext(), "entry", TaskEntryFn); + OMPBuilder.Builder.SetInsertPoint(TaskEntryBB); + const unsigned TaskTIdx = 0; + const unsigned PrivatesIdx = 1; + const unsigned SharedsIdx = 0; + Value *GTId = TaskEntryFn->getArg(0); + Value *KmpTaskTWithPrivates = OMPBuilder.Builder.CreateBitCast( + TaskEntryFn->getArg(1), KmpTaskTWithPrivatesPtrTy); + Value *KmpTaskT = OMPBuilder.Builder.CreateStructGEP( + KmpTaskTWithPrivatesTy, KmpTaskTWithPrivates, TaskTIdx, ".task.data"); + Value *SharedsGEP = OMPBuilder.Builder.CreateStructGEP( + KmpTaskTTy, KmpTaskT, SharedsIdx, ".shareds.gep"); + Value *SharedsVoidPtr = OMPBuilder.Builder.CreateLoad( + OMPBuilder.VoidPtr, SharedsGEP, ".shareds.void.ptr"); + Value *Shareds = OMPBuilder.Builder.CreateBitCast( + SharedsVoidPtr, KmpSharedsTPtrTy, ".shareds"); + + Value *Privates = nullptr; + if (PrivatesTy.empty()) { + Privates = Constant::getNullValue(OMPBuilder.VoidPtr); + } else { + Value *PrivatesTyped = OMPBuilder.Builder.CreateStructGEP( + KmpTaskTWithPrivatesTy, KmpTaskTWithPrivates, PrivatesIdx, + ".privates"); + Privates = OMPBuilder.Builder.CreateBitCast( + PrivatesTyped, OMPBuilder.VoidPtr, ".privates.void.ptr"); + } + assert(Privates && "Expected non-null privates"); + + const unsigned PartIdIdx = 2; + Value *PartId = OMPBuilder.Builder.CreateStructGEP(KmpTaskTTy, KmpTaskT, + PartIdIdx, ".part_id"); + OMPBuilder.Builder.CreateCall(TaskOutlinedFnTy, TaskOutlinedFn, + {GTId, PartId, Privates, KmpTaskT, Shareds}); + OMPBuilder.Builder.CreateRet(OMPBuilder.Builder.getInt32(0)); + } + + // Emit TaskOutlinedFn code. 
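+  // Reading aid: the entry thunk above has the kmp_routine_entry_t shape
+  //   i32 .omp_task_entry(i32 %gtid, kmp_task_t_with_privates* %task)
+  // expected by __kmpc_omp_task_alloc, and simply forwards to the body
+  //   void .omp_task_outlined(i32 %gtid, i32* %part_id, i8* %privates,
+  //                           kmp_task_t* %task_data, kmp_shareds* %shareds)
+  // whose blocks are filled in next.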
+ { + OpenMPIRBuilder::OutlineInfo OI; + OI.EntryBB = StartBB; + OI.ExitBB = EndBB; + SmallPtrSet OutlinedBlockSet; + SmallVector OutlinedBlockVector; + OI.collectBlocks(OutlinedBlockSet, OutlinedBlockVector); + BasicBlock *TaskOutlinedEntryBB = + BasicBlock::Create(M.getContext(), "entry", TaskOutlinedFn); + BasicBlock *TaskOutlinedExitBB = + BasicBlock::Create(M.getContext(), "exit", TaskOutlinedFn); + for (BasicBlock *BB : OutlinedBlockVector) + BB->moveBefore(TaskOutlinedExitBB); + // Explicitly move EndBB to the outlined functions, since OutlineInfo + // does not contain it in the OutlinedBlockVector. + EndBB->moveBefore(TaskOutlinedExitBB); + EndBB->getTerminator()->setSuccessor(0, TaskOutlinedExitBB); + + OMPBuilder.Builder.SetInsertPoint(TaskOutlinedEntryBB); + const unsigned KmpPrivatesArgNo = 2; + const unsigned KmpSharedsArgNo = 4; + Value *KmpPrivatesArgVoidPtr = TaskOutlinedFn->getArg(KmpPrivatesArgNo); + Value *KmpPrivatesArg = OMPBuilder.Builder.CreateBitCast( + KmpPrivatesArgVoidPtr, KmpPrivatesTPtrTy); + Value *KmpSharedsArg = TaskOutlinedFn->getArg(KmpSharedsArgNo); + + // Replace shareds, privates, firstprivates to refer to task data + // storage. + unsigned SharedsGEPIdx = 0; + unsigned PrivatesGEPIdx = 0; + for (auto &It : DSAValueMap) { + Value *OriginalValue = It.first; + Value *ReplacementValue = nullptr; + if (It.second.Type == DSA_SHARED) { + Value *SharedGEP = OMPBuilder.Builder.CreateStructGEP( + KmpSharedsTTy, KmpSharedsArg, SharedsGEPIdx, + OriginalValue->getName() + ".task.shared.gep"); + ReplacementValue = OMPBuilder.Builder.CreateLoad( + OriginalValue->getType(), SharedGEP, + OriginalValue->getName() + ".task.shared"); + ++SharedsGEPIdx; + } else if (It.second.Type == DSA_PRIVATE) { + Value *PrivateGEP = OMPBuilder.Builder.CreateStructGEP( + KmpPrivatesTTy, KmpPrivatesArg, PrivatesGEPIdx, + OriginalValue->getName() + ".task.private.gep"); + ReplacementValue = PrivateGEP; + // NOTE: Zero initialize private to avoid issue with Numba ref counting. 
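+        // (Assumed rationale for the note above: Numba's NRT may decref
+        // whatever sits in a private slot before its first assignment, so a
+        // zeroed slot keeps that decref a harmless no-op.)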
+ OMPBuilder.Builder.CreateStore( + Constant::getNullValue( + OriginalValue->getType()->getPointerElementType()), + ReplacementValue); + ++PrivatesGEPIdx; + } else if (It.second.Type == DSA_FIRSTPRIVATE) { + Value *FirstprivateGEP = OMPBuilder.Builder.CreateStructGEP( + KmpPrivatesTTy, KmpPrivatesArg, PrivatesGEPIdx, + OriginalValue->getName() + ".task.firstprivate.gep"); + ReplacementValue = FirstprivateGEP; + ++PrivatesGEPIdx; + } else + FATAL_ERROR("Unknown DSA type"); + + assert(ReplacementValue && "Expected non-null ReplacementValue"); + SmallVector Users(OriginalValue->users()); + for (User *U : Users) + if (Instruction *I = dyn_cast(U)) + if (OutlinedBlockSet.contains(I->getParent())) + I->replaceUsesOfWith(OriginalValue, ReplacementValue); + } + + OMPBuilder.Builder.CreateBr(StartBB); + OMPBuilder.Builder.SetInsertPoint(TaskOutlinedExitBB); + OMPBuilder.Builder.CreateRetVoid(); + BBEntry->getTerminator()->setSuccessor(0, AfterBB); + } +} + +void CGIntrinsicsOpenMP::emitOMPOffloadingEntry(const Twine &DevFuncName, + Value *EntryPtr, + Constant *&OMPOffloadEntry) { + + Constant *DevFuncNameConstant = + ConstantDataArray::getString(M.getContext(), DevFuncName.str()); + auto *GV = new GlobalVariable( + M, DevFuncNameConstant->getType(), + /* isConstant */ true, GlobalValue::InternalLinkage, DevFuncNameConstant, + ".omp_offloading.entry_name", nullptr, GlobalVariable::NotThreadLocal, + /* AddressSpace */ 0); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + + Constant *EntryConst = dyn_cast(EntryPtr); + assert(EntryConst && "Expected constant entry pointer"); + OMPOffloadEntry = ConstantStruct::get( + TgtOffloadEntryTy, + ConstantExpr::getPointerBitCastOrAddrSpaceCast(EntryConst, + OMPBuilder.VoidPtr), + ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, OMPBuilder.Int8Ptr), + ConstantInt::get(OMPBuilder.SizeTy, 0), + ConstantInt::get(OMPBuilder.Int32, 0), + ConstantInt::get(OMPBuilder.Int32, 0)); + auto *OMPOffloadEntryGV = new GlobalVariable( + M, TgtOffloadEntryTy, + /* isConstant */ true, GlobalValue::WeakAnyLinkage, OMPOffloadEntry, + ".omp_offloading.entry." + DevFuncName); + OMPOffloadEntryGV->setSection("omp_offloading_entries"); + OMPOffloadEntryGV->setAlignment(Align(1)); +} + +void CGIntrinsicsOpenMP::emitOMPOffloadingMappings( + InsertPointTy AllocaIP, DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap, + OffloadingMappingArgsTy &OffloadingMappingArgs, bool IsTargetRegion) { + + struct MapperInfo { + Value *BasePtr; + Value *Ptr; + Value *Size; + }; + + SmallVector MapperInfos; + // SmallVector OffloadSizes; + SmallVector OffloadMapTypes; + SmallVector OffloadMapNames; + + if (DSAValueMap.empty()) { + OffloadingMappingArgs.Size = 0; + OffloadingMappingArgs.BasePtrs = + Constant::getNullValue(OMPBuilder.VoidPtrPtr); + OffloadingMappingArgs.Ptrs = Constant::getNullValue(OMPBuilder.VoidPtrPtr); + OffloadingMappingArgs.Sizes = Constant::getNullValue(OMPBuilder.Int64Ptr); + OffloadingMappingArgs.MapTypes = + Constant::getNullValue(OMPBuilder.Int64Ptr); + OffloadingMappingArgs.MapNames = + Constant::getNullValue(OMPBuilder.VoidPtrPtr); + + return; + } + + auto EmitMappingEntry = [&](Value *Size, uint64_t MapType, Value *BasePtr, + Value *Ptr) { + OffloadMapTypes.push_back(ConstantInt::get(OMPBuilder.SizeTy, MapType)); + // TODO: maybe add debug info. 
+ uint32_t SrcLocStrSize; + OffloadMapNames.push_back(OMPBuilder.getOrCreateSrcLocStr( + BasePtr->getName(), "", 0, 0, SrcLocStrSize)); + DEBUG_ENABLE(dbgs() << "Emit mapping entry BasePtr " << *BasePtr << " Ptr " + << *Ptr << " Size " << *Size << " MapType " << MapType + << "\n"); + MapperInfos.push_back({BasePtr, Ptr, Size}); + }; + + auto GetMapType = [IsTargetRegion](DSAType DSA) { + uint64_t MapType; + // Determine the map type, completely or partly (structs). + switch (DSA) { + case DSA_FIRSTPRIVATE: + MapType = OMP_TGT_MAPTYPE_LITERAL; + if (IsTargetRegion) + MapType |= OMP_TGT_MAPTYPE_TARGET_PARAM; + break; + case DSA_MAP_ALLOC: + // Allocation is the default in the OpenMP runtime, no extra flags. + MapType = OMP_TGT_MAPTYPE_NONE; + if (IsTargetRegion) + MapType |= OMP_TGT_MAPTYPE_TARGET_PARAM; + break; + case DSA_MAP_TO: + MapType = OMP_TGT_MAPTYPE_TO; + if (IsTargetRegion) + MapType |= OMP_TGT_MAPTYPE_TARGET_PARAM; + break; + case DSA_MAP_FROM: + MapType = OMP_TGT_MAPTYPE_FROM; + if (IsTargetRegion) + MapType |= OMP_TGT_MAPTYPE_TARGET_PARAM; + break; + case DSA_MAP_TOFROM: + MapType = OMP_TGT_MAPTYPE_TO | OMP_TGT_MAPTYPE_FROM; + if (IsTargetRegion) + MapType |= OMP_TGT_MAPTYPE_TARGET_PARAM; + break; + case DSA_MAP_STRUCT: + MapType = OMP_TGT_MAPTYPE_NONE; + if (IsTargetRegion) + MapType |= OMP_TGT_MAPTYPE_TARGET_PARAM; + break; + case DSA_MAP_ALLOC_STRUCT: + // Allocation is the default in the OpenMP runtime, no extra flags. + MapType = OMP_TGT_MAPTYPE_NONE; + break; + case DSA_MAP_TO_STRUCT: + MapType = OMP_TGT_MAPTYPE_TO; + break; + case DSA_MAP_FROM_STRUCT: + MapType = OMP_TGT_MAPTYPE_FROM; + break; + case DSA_MAP_TOFROM_STRUCT: + MapType = OMP_TGT_MAPTYPE_TO | OMP_TGT_MAPTYPE_FROM; + break; + case DSA_PRIVATE: + // do nothing + break; + default: + FATAL_ERROR("Unknown mapping type"); + } + + return MapType; + }; + + // Keep track of argument position, needed for struct mappings. + for (auto &It : DSAValueMap) { + Value *V = It.first; + DSAType DSA = It.second.Type; + + // Emit the mapping entry. + Value *Size; + switch (DSA) { + case DSA_MAP_ALLOC: + case DSA_MAP_TO: + case DSA_MAP_FROM: + case DSA_MAP_TOFROM: + Size = ConstantInt::get(OMPBuilder.SizeTy, + M.getDataLayout().getTypeAllocSize(V->getType())); + EmitMappingEntry(Size, GetMapType(DSA), V, V); + break; + case DSA_FIRSTPRIVATE: { + auto *Load = OMPBuilder.Builder.CreateLoad( + V->getType()->getPointerElementType(), V); + // TODO: Runtime expects values in Int64 type, fix with arguments in + // struct. + AllocaInst *TmpInt64 = OMPBuilder.Builder.CreateAlloca( + OMPBuilder.Int64, nullptr, V->getName() + ".casted"); + Value *Cast = OMPBuilder.Builder.CreateBitCast( + TmpInt64, V->getType()); + auto *Store = OMPBuilder.Builder.CreateStore(Load, Cast); + Value *ScalarV= + OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, TmpInt64); + Size = ConstantInt::get(OMPBuilder.SizeTy, + M.getDataLayout().getTypeAllocSize( + V->getType()->getPointerElementType())); + EmitMappingEntry(Size, GetMapType(DSA), ScalarV, ScalarV); + break; + } + case DSA_MAP_STRUCT: { + Size = ConstantInt::get(OMPBuilder.SizeTy, + M.getDataLayout().getTypeAllocSize( + V->getType()->getPointerElementType())); + EmitMappingEntry(Size, GetMapType(DSA), V, V); + // Stores the argument position (starting from 1) of the parent + // struct, to be used to set MEMBER_OF in the map type. 
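+      // MEMBER_OF occupies the 16 most significant bits of the 64-bit map
+      // type (see OMP_TGT_MAPTYPE_MEMBER_OF); shifting ArgPos left by 48
+      // encodes it. For example, if the parent struct is mapping argument 3,
+      // every member carries 0x0003000000000000 on top of its own
+      // TO/FROM/PTR_AND_OBJ bits.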
+ size_t ArgPos = MapperInfos.size(); + + for (auto &FieldInfo : StructMappingInfoMap[V]) { + // MEMBER_OF(Argument Position) + const size_t MemberOfOffset = 48; + uint64_t MemberOfBits = ArgPos << MemberOfOffset; + uint64_t FieldMapType = GetMapType(FieldInfo.MapType) | MemberOfBits; + auto *FieldGEP = OMPBuilder.Builder.CreateInBoundsGEP( + V->getType()->getPointerElementType(), V, + {OMPBuilder.Builder.getInt32(0), FieldInfo.Index}); + + Value *BasePtr = nullptr; + Value *Ptr = nullptr; + + if (FieldGEP->getType()->getPointerElementType()->isPointerTy()) { + FieldMapType |= OMP_TGT_MAPTYPE_PTR_AND_OBJ; + BasePtr = FieldGEP; + auto *Load = OMPBuilder.Builder.CreateLoad( + BasePtr->getType()->getPointerElementType(), BasePtr); + Ptr = OMPBuilder.Builder.CreateInBoundsGEP( + Load->getType()->getPointerElementType(), Load, FieldInfo.Offset); + } else { + BasePtr = V; + Ptr = OMPBuilder.Builder.CreateInBoundsGEP( + FieldGEP->getType()->getPointerElementType(), FieldGEP, + FieldInfo.Offset); + } + + assert(BasePtr && "Expected non-null base pointer"); + assert(Ptr && "Expected non-null pointer"); + + auto ElementSize = ConstantInt::get( + OMPBuilder.SizeTy, M.getDataLayout().getTypeAllocSize( + Ptr->getType()->getPointerElementType())); + Value *NumElements = nullptr; + + // Load the value of NumElements if it is a pointer. + if (FieldInfo.NumElements->getType()->isPointerTy()) + NumElements = OMPBuilder.Builder.CreateLoad(OMPBuilder.SizeTy, + FieldInfo.NumElements); + else + NumElements = FieldInfo.NumElements; + + auto *Size = OMPBuilder.Builder.CreateMul(ElementSize, NumElements); + EmitMappingEntry(Size, FieldMapType, BasePtr, Ptr); + } + break; + } + case DSA_PRIVATE: { + // do nothing + break; + } + default: + FATAL_ERROR("Unknown mapping type"); + } + } + + auto EmitConstantArrayGlobalBitCast = [&](SmallVectorImpl &Vector, + Type *Ty, Type *DestTy, + StringRef Name) { + auto *Init = ConstantArray::get(ArrayType::get(Ty, Vector.size()), Vector); + auto *GV = new GlobalVariable(M, ArrayType::get(Ty, Vector.size()), + /* isConstant */ true, + GlobalVariable::PrivateLinkage, Init, Name); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + + return OMPBuilder.Builder.CreateBitCast(GV, DestTy); + }; + + // TODO: offload_sizes can be a global of constants for optimization if all + // sizes are constants. + // OffloadingMappingArgs.Sizes = + // EmitConstantArrayGlobalBitCast(OffloadSizes, OMPBuilder.SizeTy, + // OMPBuilder.Int64Ptr, ".offload_sizes"); + OffloadingMappingArgs.MapTypes = + EmitConstantArrayGlobalBitCast(OffloadMapTypes, OMPBuilder.SizeTy, + OMPBuilder.Int64Ptr, ".offload_maptypes"); + OffloadingMappingArgs.MapNames = EmitConstantArrayGlobalBitCast( + OffloadMapNames, OMPBuilder.Int8Ptr, OMPBuilder.VoidPtrPtr, + ".offload_mapnames"); + + auto EmitArrayAlloca = [&](size_t Size, Type *Ty, StringRef Name) { + InsertPointTy CodeGenIP = OMPBuilder.Builder.saveIP(); + + OMPBuilder.Builder.restoreIP(AllocaIP); + auto *Alloca = OMPBuilder.Builder.CreateAlloca(ArrayType::get(Ty, Size), + nullptr, Name); + + OMPBuilder.Builder.restoreIP(CodeGenIP); + + return Alloca; + }; + + auto *BasePtrsAlloca = EmitArrayAlloca(MapperInfos.size(), OMPBuilder.VoidPtr, + ".offload_baseptrs"); + auto *PtrsAlloca = + EmitArrayAlloca(MapperInfos.size(), OMPBuilder.VoidPtr, ".offload_ptrs"); + auto *SizesAlloca = + EmitArrayAlloca(MapperInfos.size(), OMPBuilder.SizeTy, ".offload_sizes"); + + size_t Idx = 0; + for (auto &MI : MapperInfos) { + // Store in the base pointers alloca. 
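+    // (The three allocas are parallel arrays: slot Idx in each one describes
+    // the same mapping argument, in lockstep with the constant
+    // .offload_maptypes and .offload_mapnames arrays emitted above.)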
+ auto *GEP = OMPBuilder.Builder.CreateInBoundsGEP( + BasePtrsAlloca->getType()->getPointerElementType(), BasePtrsAlloca, + {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(Idx)}); + auto *Bitcast = OMPBuilder.Builder.CreateBitCast( + GEP, MI.BasePtr->getType()->getPointerTo()); + OMPBuilder.Builder.CreateStore(MI.BasePtr, Bitcast); + + // Store in the pointers alloca. + GEP = OMPBuilder.Builder.CreateInBoundsGEP( + PtrsAlloca->getType()->getPointerElementType(), PtrsAlloca, + {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(Idx)}); + Bitcast = OMPBuilder.Builder.CreateBitCast( + GEP, MI.Ptr->getType()->getPointerTo()); + OMPBuilder.Builder.CreateStore(MI.Ptr, Bitcast); + + // Store in the sizes alloca. + GEP = OMPBuilder.Builder.CreateInBoundsGEP( + SizesAlloca->getType()->getPointerElementType(), SizesAlloca, + {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(Idx)}); + Bitcast = OMPBuilder.Builder.CreateBitCast( + GEP, MI.Size->getType()->getPointerTo()); + OMPBuilder.Builder.CreateStore(MI.Size, Bitcast); + + Idx++; + } + + OffloadingMappingArgs.Size = MapperInfos.size(); + OffloadingMappingArgs.BasePtrs = + OMPBuilder.Builder.CreateBitCast(BasePtrsAlloca, OMPBuilder.VoidPtrPtr); + OffloadingMappingArgs.Ptrs = + OMPBuilder.Builder.CreateBitCast(PtrsAlloca, OMPBuilder.VoidPtrPtr); + OffloadingMappingArgs.Sizes = OMPBuilder.Builder.CreateBitCast( + SizesAlloca, OMPBuilder.SizeTy->getPointerTo()); + + // OffloadingMappingArgs.BasePtrs = OMPBuilder.Builder.CreateInBoundsGEP( + // BasePtrsAlloca->getType()->getPointerElementType(), BasePtrsAlloca, + // {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(0)}); + // OffloadingMappingArgs.Ptrs = OMPBuilder.Builder.CreateInBoundsGEP( + // PtrsAlloca->getType()->getPointerElementType(), PtrsAlloca, + // {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(0)}); + // OffloadingMappingArgs.Sizes = OMPBuilder.Builder.CreateInBoundsGEP( + // SizesAlloca->getType()->getPointerElementType(), SizesAlloca, + // {OMPBuilder.Builder.getInt32(0), OMPBuilder.Builder.getInt32(0)}); +} + +void CGIntrinsicsOpenMP::emitOMPSingle(Function *Fn, BasicBlock *BBEntry, + BasicBlock *AfterBB, + BodyGenCallbackTy BodyGenCB, + FinalizeCallbackTy FiniCB) { + const DebugLoc DL = BBEntry->getTerminator()->getDebugLoc(); + BBEntry->getTerminator()->eraseFromParent(); + // Set the insertion location at the end of the BBEntry. + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(BBEntry, BBEntry->end()), DL); + + InsertPointTy AfterIP = + OMPBuilder.createSingle(Loc, BodyGenCB, FiniCB, /*DidIt*/ nullptr); + BranchInst::Create(AfterBB, AfterIP.getBlock()); + DEBUG_ENABLE(dbgs() << "=== Single Fn\n" << *Fn << "=== End of Single Fn\n"); +} + +void CGIntrinsicsOpenMP::emitOMPCritical(Function *Fn, BasicBlock *BBEntry, + BasicBlock *AfterBB, + BodyGenCallbackTy BodyGenCB, + FinalizeCallbackTy FiniCB) { + if (isOpenMPDeviceRuntime()) + FATAL_ERROR("Critical regions are not (yet) implemented on device"); + + const DebugLoc DL = BBEntry->getTerminator()->getDebugLoc(); + BBEntry->getTerminator()->eraseFromParent(); + // Set the insertion location at the end of the BBEntry. 
+ OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(BBEntry, BBEntry->end()), DL); + + InsertPointTy AfterIP = OMPBuilder.createCritical(Loc, BodyGenCB, FiniCB, "", + /*HintInst*/ nullptr); + BranchInst::Create(AfterBB, AfterIP.getBlock()); + DEBUG_ENABLE(dbgs() << "=== Critical Fn\n" + << *Fn << "=== End of Critical Fn\n"); +} + +void CGIntrinsicsOpenMP::emitOMPBarrier(Function *Fn, BasicBlock *BBEntry, + Directive DK) { + const DebugLoc DL = BBEntry->getTerminator()->getDebugLoc(); + // Set the insertion location at the end of the BBEntry. + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(BBEntry, BBEntry->getTerminator()->getIterator()), DL); + + // TODO: check ForceSimpleCall usage. + OMPBuilder.createBarrier(Loc, DK, + /*ForceSimpleCall*/ false, + /*CheckCancelFlag*/ true); + DEBUG_ENABLE(dbgs() << "=== Barrier Fn\n" << *Fn << "=== End of Barrier Fn\n"); +} + +void CGIntrinsicsOpenMP::emitOMPTaskwait(BasicBlock *BBEntry) { + const DebugLoc DL = BBEntry->getTerminator()->getDebugLoc(); + // Set the insertion location at the end of the BBEntry. + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(BBEntry, BBEntry->getTerminator()->getIterator()), DL); + + OMPBuilder.createTaskwait(Loc); +} + +GlobalVariable * +CGIntrinsicsOpenMP::emitOffloadingGlobals(StringRef DevWrapperFuncName, + ConstantDataArray *ELF) { + GlobalVariable *OMPRegionId = nullptr; + GlobalVariable *OMPOffloadEntries = nullptr; + + // TODO: assumes 1 target region, can we call tgt_register_lib + // multiple times? + OMPRegionId = new GlobalVariable( + M, OMPBuilder.Int8, /* isConstant */ true, GlobalValue::WeakAnyLinkage, + ConstantInt::get(OMPBuilder.Int8, 0), DevWrapperFuncName + ".region_id", + nullptr, GlobalVariable::NotThreadLocal, + /* AddressSpace */ 0); + + Constant *OMPOffloadEntry; + CGIntrinsicsOpenMP::emitOMPOffloadingEntry(DevWrapperFuncName, OMPRegionId, + OMPOffloadEntry); + + // TODO: do this at finalization when all entries have been + // found. + // TODO: assumes 1 device image, can we call tgt_register_lib + // multiple times? 
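+  // Each __tgt_offload_entry is { addr, name, size, flags, reserved }; for a
+  // target region size/flags/reserved stay 0 and addr is the region-id byte
+  // above. The array built below is what the registered binary descriptor
+  // hands to __tgt_register_lib to bind host entries to device symbols.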
+ auto *ArrayTy = ArrayType::get(TgtOffloadEntryTy, 1); + OMPOffloadEntries = + new GlobalVariable(M, ArrayTy, + /* isConstant */ true, GlobalValue::InternalLinkage, + ConstantArray::get(ArrayTy, {OMPOffloadEntry}), + ".omp_offloading.entries"); + + assert(OMPRegionId && "Expected non-null omp region id global"); + assert(OMPOffloadEntries && + "Expected non-null omp offloading entries constant"); + + auto EmitOffloadingBinaryGlobals = [&]() { + auto *GV = new GlobalVariable(M, ELF->getType(), /* isConstant */ true, + GlobalValue::InternalLinkage, ELF, + ".omp_offloading.device_image"); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + + StructType *TgtDeviceImageTy = StructType::create( + {OMPBuilder.Int8Ptr, OMPBuilder.Int8Ptr, + TgtOffloadEntryTy->getPointerTo(), TgtOffloadEntryTy->getPointerTo()}, + "struct.__tgt_device_image"); + + StructType *TgtBinDescTy = StructType::create( + {OMPBuilder.Int32, TgtDeviceImageTy->getPointerTo(), + TgtOffloadEntryTy->getPointerTo(), TgtOffloadEntryTy->getPointerTo()}, + "struct.__tgt_bin_desc"); + + auto *ArrayTy = ArrayType::get(TgtDeviceImageTy, 1); + auto *Zero = ConstantInt::get(OMPBuilder.SizeTy, 0); + auto *One = ConstantInt::get(OMPBuilder.SizeTy, 1); + auto *Size = ConstantInt::get(OMPBuilder.SizeTy, ELF->getNumElements()); + Constant *ZeroZero[] = {Zero, Zero}; + Constant *ZeroOne[] = {Zero, One}; + Constant *ZeroSize[] = {Zero, Size}; + + auto *ImageB = + ConstantExpr::getGetElementPtr(GV->getValueType(), GV, ZeroZero); + auto *ImageE = + ConstantExpr::getGetElementPtr(GV->getValueType(), GV, ZeroSize); + auto *EntriesB = ConstantExpr::getGetElementPtr( + OMPOffloadEntries->getValueType(), OMPOffloadEntries, ZeroZero); + auto *EntriesE = ConstantExpr::getGetElementPtr( + OMPOffloadEntries->getValueType(), OMPOffloadEntries, ZeroOne); + + auto *DeviceImageEntry = ConstantStruct::get(TgtDeviceImageTy, ImageB, + ImageE, EntriesB, EntriesE); + auto *DeviceImages = + new GlobalVariable(M, ArrayTy, + /* isConstant */ true, GlobalValue::InternalLinkage, + ConstantArray::get(ArrayTy, {DeviceImageEntry}), + ".omp_offloading.device_images"); + + auto *ImagesB = ConstantExpr::getGetElementPtr(DeviceImages->getValueType(), + DeviceImages, ZeroZero); + auto *DescInit = + ConstantStruct::get(TgtBinDescTy, + ConstantInt::get(OMPBuilder.Int32, + /* number of images */ 1), + ImagesB, EntriesB, EntriesE); + auto *BinDesc = + new GlobalVariable(M, DescInit->getType(), + /* isConstant */ true, GlobalValue::InternalLinkage, + DescInit, ".omp_offloading.descriptor"); + + // Add tgt_register_requires, tgt_register_lib, + // tgt_unregister_lib. + { + // tgt_register_requires. + auto *FuncTy = FunctionType::get(OMPBuilder.Void, /*isVarArg*/ false); + auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, + ".omp_offloading.requires_reg", &M); + Func->setSection(".text.startup"); + + // Get __tgt_register_lib function declaration. + auto *RegFuncTy = FunctionType::get(OMPBuilder.Void, OMPBuilder.Int64, + /*isVarArg*/ false); + FunctionCallee RegFuncC = + M.getOrInsertFunction("__tgt_register_requires", RegFuncTy); + + // Construct function body + IRBuilder<> Builder(BasicBlock::Create(M.getContext(), "entry", Func)); + // TODO: fix to pass the requirements enum value. + Builder.CreateCall(RegFuncC, ConstantInt::get(OMPBuilder.Int64, 1)); + Builder.CreateRetVoid(); + + // Add this function to constructors. 
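+      // (Note: this requires-registration ctor itself gets priority 0; the
+      // comment below describes the __tgt_register_lib ctor emitted next,
+      // which uses priority 1 so it runs after this one.)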
+ // Set priority to 1 so that __tgt_register_lib is executed + // AFTER + // __tgt_register_requires (we want to know what requirements + // have been asked for before we load a libomptarget plugin so + // that by the time the plugin is loaded it can report how + // many devices there are which can satisfy these + // requirements). + appendToGlobalCtors(M, Func, /*Priority*/ 0); + } + { + // ctor + auto *FuncTy = FunctionType::get(OMPBuilder.Void, /*isVarArg*/ false); + auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, + ".omp_offloading.descriptor_reg", &M); + Func->setSection(".text.startup"); + + // Get __tgt_register_lib function declaration. + auto *RegFuncTy = + FunctionType::get(OMPBuilder.Void, TgtBinDescTy->getPointerTo(), + /*isVarArg*/ false); + FunctionCallee RegFuncC = + M.getOrInsertFunction("__tgt_register_lib", RegFuncTy); + + // Construct function body + IRBuilder<> Builder(BasicBlock::Create(M.getContext(), "entry", Func)); + Builder.CreateCall(RegFuncC, BinDesc); + Builder.CreateRetVoid(); + + // Add this function to constructors. + // Set priority to 1 so that __tgt_register_lib is executed + // AFTER + // __tgt_register_requires (we want to know what requirements + // have been asked for before we load a libomptarget plugin so + // that by the time the plugin is loaded it can report how + // many devices there are which can satisfy these + // requirements). + appendToGlobalCtors(M, Func, /*Priority*/ 1); + } + { + auto *FuncTy = FunctionType::get(OMPBuilder.Void, /*isVarArg*/ false); + auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, + ".omp_offloading.descriptor_unreg", &M); + Func->setSection(".text.startup"); + + // Get __tgt_unregister_lib function declaration. + auto *UnRegFuncTy = + FunctionType::get(OMPBuilder.Void, TgtBinDescTy->getPointerTo(), + /*isVarArg*/ false); + FunctionCallee UnRegFuncC = + M.getOrInsertFunction("__tgt_unregister_lib", UnRegFuncTy); + + // Construct function body + IRBuilder<> Builder(BasicBlock::Create(M.getContext(), "entry", Func)); + Builder.CreateCall(UnRegFuncC, BinDesc); + Builder.CreateRetVoid(); + + // Add this function to global destructors. 
+ // Match priority of __tgt_register_lib + appendToGlobalDtors(M, Func, /*Priority*/ 1); + } + }; + + EmitOffloadingBinaryGlobals(); + + return OMPRegionId; +} + +void CGIntrinsicsOpenMP::emitOMPTarget(Function *Fn, BasicBlock *EntryBB, + BasicBlock *StartBB, BasicBlock *EndBB, + DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap, + TargetInfoStruct &TargetInfo, + OMPLoopInfoStruct *OMPLoopInfo, + bool IsDeviceTargetRegion) { + if (IsDeviceTargetRegion) + emitOMPTargetDevice(Fn, EntryBB, StartBB, EndBB, DSAValueMap, + StructMappingInfoMap, TargetInfo); + else + emitOMPTargetHost(Fn, EntryBB, StartBB, EndBB, DSAValueMap, + StructMappingInfoMap, TargetInfo, OMPLoopInfo); +} + +void CGIntrinsicsOpenMP::emitOMPTargetHost( + Function *Fn, BasicBlock *EntryBB, BasicBlock *StartBB, BasicBlock *EndBB, + DSAValueMapTy &DSAValueMap, StructMapTy &StructMappingInfoMap, + TargetInfoStruct &TargetInfo, OMPLoopInfoStruct *OMPLoopInfo) { + + Twine DevWrapperFuncName = getDevWrapperFuncPrefix() + TargetInfo.DevFuncName; + + GlobalVariable *OMPRegionId = + emitOffloadingGlobals(DevWrapperFuncName.str(), TargetInfo.ELF); + + const DebugLoc DL = EntryBB->getTerminator()->getDebugLoc(); + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(EntryBB, EntryBB->getTerminator()->getIterator()), DL); + + uint32_t SrcLocStrSize; + Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize); + + // TODO: should we use target_mapper without teams or the more general + // target_teams_mapper. Does the former buy us anything (less overhead?) + // FunctionCallee TargetMapper = + // OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_mapper); + // TODO: For nowait we need to enclose the host code in a task for async + // execution. + FunctionCallee TargetMapper = + (TargetInfo.NoWait ? OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___tgt_target_teams_nowait_mapper) + : OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___tgt_target_teams_mapper)); + OMPBuilder.Builder.SetInsertPoint(EntryBB->getTerminator()); + + // Emit mappings. + OffloadingMappingArgsTy OffloadingMappingArgs; + InsertPointTy AllocaIP(&Fn->getEntryBlock(), + Fn->getEntryBlock().getFirstInsertionPt()); + emitOMPOffloadingMappings(AllocaIP, DSAValueMap, StructMappingInfoMap, + OffloadingMappingArgs, /* isTargetRegion */ true); + + // Push the tripcount. 
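+  // When the target region wraps a distributed loop, record its trip count
+  // (computed here as the inclusive upper bound plus one) for the default
+  // device (-1) so the runtime can size teams and threads before launch.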
+ if (OMPLoopInfo) { + FunctionCallee TripcountMapper = OMPBuilder.getOrCreateRuntimeFunction( + M, + llvm::omp::RuntimeFunction::OMPRTL___kmpc_push_target_tripcount_mapper); + Value *Load = + OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, OMPLoopInfo->UB); + Value *Tripcount = OMPBuilder.Builder.CreateAdd( + Load, ConstantInt::get(OMPBuilder.Int64, 1)); + auto *CI = checkCreateCall( + OMPBuilder.Builder, TripcountMapper, + {Ident, ConstantInt::get(OMPBuilder.Int64, -1), Tripcount}); + assert(CI && "Expected valid call"); + } + + Value *NumTeams = createScalarCast(TargetInfo.NumTeams, OMPBuilder.Int32); + Value *ThreadLimit = + createScalarCast(TargetInfo.ThreadLimit, OMPBuilder.Int32); + + assert(NumTeams && "Expected non-null NumTeams"); + assert(ThreadLimit && "Expected non-null ThreadLimit"); + + SmallVector Args = { + Ident, ConstantInt::get(OMPBuilder.Int64, -1), + ConstantExpr::getBitCast(OMPRegionId, OMPBuilder.VoidPtr), + ConstantInt::get(OMPBuilder.Int32, OffloadingMappingArgs.Size), + OffloadingMappingArgs.BasePtrs, OffloadingMappingArgs.Ptrs, + OffloadingMappingArgs.Sizes, OffloadingMappingArgs.MapTypes, + OffloadingMappingArgs.MapNames, + // TODO: offload_mappers is null for now. + Constant::getNullValue(OMPBuilder.VoidPtrPtr), NumTeams, ThreadLimit}; + + if (TargetInfo.NoWait) { + // Add extra dependency information (unused for now). + Args.push_back(Constant::getNullValue(OMPBuilder.Int32)); + Args.push_back(Constant::getNullValue(OMPBuilder.Int8Ptr)); + Args.push_back(Constant::getNullValue(OMPBuilder.Int32)); + Args.push_back(Constant::getNullValue(OMPBuilder.Int8Ptr)); + } + + auto *OffloadResult = checkCreateCall(OMPBuilder.Builder, TargetMapper, Args); + assert(OffloadResult && "Expected non-null call inst from code generation"); + auto *Failed = OMPBuilder.Builder.CreateIsNotNull(OffloadResult); + OMPBuilder.Builder.CreateCondBr(Failed, StartBB, EndBB); + EntryBB->getTerminator()->eraseFromParent(); +} + +void CGIntrinsicsOpenMP::emitOMPTargetDevice(Function *Fn, BasicBlock *EntryBB, + BasicBlock *StartBB, + BasicBlock *EndBB, + DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap, + TargetInfoStruct &TargetInfo) { + // Emit the Numba wrapper offloading function. + SmallVector WrapperArgsTypes; + SmallVector WrapperArgsNames; + for (auto &It : DSAValueMap) { + Value *V = It.first; + DSAType DSA = It.second.Type; + + DEBUG_ENABLE(dbgs() << "V " << *V << " DSA " << DSA << "\n"); + switch (DSA) { + case DSA_FIRSTPRIVATE: + // TODO: Runtime expects firstprivate (scalars) typed as Int64. + WrapperArgsTypes.push_back(OMPBuilder.Int64); + WrapperArgsNames.push_back(V->getName()); + break; + case DSA_PRIVATE: + // do nothing + break; + default: + WrapperArgsTypes.push_back(V->getType()); + WrapperArgsNames.push_back(V->getName()); + } + } + + Twine DevWrapperFuncName = getDevWrapperFuncPrefix() + Fn->getName(); + FunctionType *NumbaWrapperFnTy = + FunctionType::get(OMPBuilder.Void, WrapperArgsTypes, + /* isVarArg */ false); + Function *NumbaWrapperFunc = Function::Create( + NumbaWrapperFnTy, GlobalValue::ExternalLinkage, DevWrapperFuncName, M); + + // Name the wrapper arguments for readability. + for (size_t I = 0; I < NumbaWrapperFunc->arg_size(); ++I) + NumbaWrapperFunc->getArg(I)->setName(WrapperArgsNames[I]); + + IRBuilder<> Builder( + BasicBlock::Create(M.getContext(), "entry", NumbaWrapperFunc)); + // Set up default arguments. Depends on the target architecture. 
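+  // Rough shape of the wrapper built below (illustrative, names made up):
+  // for a region capturing a mapped array `a` and a firstprivate scalar `n`,
+  //   void <DevWrapperFuncName>(double* a, i64 n) {
+  //     [target init;]
+  //     <Fn>(null /*ret*/, [null /*excinfo*/,] a, n reinterpreted to the
+  //          parameter type Fn expects);
+  //     [target deinit;]
+  //   }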
+ FunctionCallee DevFuncCallee(Fn); + // Set the callee device function with internal linkage to enable + // optimization. + Fn->setLinkage(GlobalValue::InternalLinkage); + SmallVector DevFuncArgs; + Triple TargetTriple(M.getTargetTriple()); + + // Adapt arguments to the Numba calling convention depending on target. First + // two arguments are Numba-generated pointers for return value and exceptions + // (if targeting the CPU), which are unused. Init to nullptr. + size_t ArgOffset; + DevFuncArgs.push_back(Constant::getNullValue(Fn->getArg(0)->getType())); + if (!isOpenMPDeviceRuntime()) { + DevFuncArgs.push_back(Constant::getNullValue(Fn->getArg(1)->getType())); + ArgOffset = 2; + } else { + ArgOffset = 1; + } + for (auto &Arg : NumbaWrapperFunc->args()) { + // TODO: Runtime expects all scalars typed as Int64. + if (!Arg.getType()->isPointerTy()) { + auto *ParamType = + DevFuncCallee.getFunctionType()->getParamType(ArgOffset + Arg.getArgNo()); + dbgs() << "ParamType " << *ParamType << "\n"; + AllocaInst *TmpInt64 = Builder.CreateAlloca(OMPBuilder.Int64, nullptr, + Arg.getName() + ".casted"); + Builder.CreateStore(&Arg, TmpInt64); + Value *Cast = Builder.CreateBitCast(TmpInt64, ParamType->getPointerTo()); + Value *ConvLoad = Builder.CreateLoad(ParamType, Cast); + DevFuncArgs.push_back(ConvLoad); + } else + DevFuncArgs.push_back(&Arg); + } + + bool IsSPMD = (TargetInfo.ExecMode == omp::OMP_TGT_EXEC_MODE_SPMD); + if (isOpenMPDeviceRuntime()) { + OpenMPIRBuilder::LocationDescription Loc(Builder); + auto IP = OMPBuilder.createTargetInit(Loc, /* IsSPMD */ IsSPMD, + /* RequiresFullRuntime */ false); + Builder.restoreIP(IP); + } + + auto *CI = checkCreateCall(Builder, DevFuncCallee, DevFuncArgs); + assert(CI && "Expected valid call"); + + if (isOpenMPDeviceRuntime()) { + OpenMPIRBuilder::LocationDescription Loc(Builder); + OMPBuilder.createTargetDeinit(Loc, /* IsSPMD */ IsSPMD, + /* RequiresFullRuntime */ false); + } + + Builder.CreateRetVoid(); + + if (isOpenMPDeviceRuntime()) { + assert(TargetInfo.ExecMode && "Expected non-zero ExecMode"); + // Emit OMP device globals and metadata. + // TODO: Make the exec_mode a parameter and use SPMD when possible. + auto *ExecModeGV = new GlobalVariable( + M, OMPBuilder.Int8, /* isConstant */ false, GlobalValue::WeakAnyLinkage, + Builder.getInt8(TargetInfo.ExecMode), + DevWrapperFuncName + "_exec_mode"); + appendToCompilerUsed(M, {ExecModeGV}); + + // Get "nvvm.annotations" metadata node. + // TODO: may need to adjust for AMD gpus. + NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations"); + + Metadata *MDVals[] = { + ConstantAsMetadata::get(NumbaWrapperFunc), + MDString::get(M.getContext(), "kernel"), + ConstantAsMetadata::get(ConstantInt::get(OMPBuilder.Int32, 1))}; + // Append metadata to nvvm.annotations. + MD->addOperand(MDNode::get(M.getContext(), MDVals)); + + // Add a function attribute for the kernel. + NumbaWrapperFunc->addFnAttr(Attribute::get(M.getContext(), "kernel")); + + } else { + // Generating an offloading entry is required by the x86_64 plugin. + Constant *OMPOffloadEntry; + emitOMPOffloadingEntry(DevWrapperFuncName, NumbaWrapperFunc, + OMPOffloadEntry); + } + // Add llvm.module.flags for "openmp", "openmp-device" to enable + // OpenMPOpt. 
+ M.addModuleFlag(llvm::Module::Max, "openmp", 50); + M.addModuleFlag(llvm::Module::Max, "openmp-device", 50); +} + +void CGIntrinsicsOpenMP::emitOMPTeamsDeviceRuntime( + DSAValueMapTy &DSAValueMap, ValueToValueMapTy *VMap, const DebugLoc &DL, + Function *Fn, BasicBlock *BBEntry, BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB, TeamsInfoStruct &TeamsInfo) { + SmallVector CapturedVars; + Function *OutlinedFn = + createOutlinedFunction(DSAValueMap, VMap, Fn, StartBB, EndBB, + CapturedVars, ".omp_outlined_teams"); + + // Set up the call to the teams outlined function. + BBEntry->getTerminator()->eraseFromParent(); + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(BBEntry, BBEntry->end()), DL); + + uint32_t SrcLocStrSize; + Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize); + OMPBuilder.Builder.restoreIP(Loc.IP); + OMPBuilder.Builder.SetCurrentDebugLocation(Loc.DL); + Value *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize); + Value *ThreadID = OMPBuilder.getOrCreateThreadID(Ident); + + assert(Ident && "Expected non-null Ident"); + assert(ThreadID && "Expected non-null ThreadID"); + + // Create global_tid, bound_tid (zero) to pass to the teams outlined function. + AllocaInst *ThreadIDAddr = OMPBuilder.Builder.CreateAlloca( + OMPBuilder.Int32, nullptr, ".threadid.addr"); + AllocaInst *ZeroAddr = + OMPBuilder.Builder.CreateAlloca(OMPBuilder.Int32, nullptr, ".zero.addr"); + OMPBuilder.Builder.CreateStore(ThreadID, ThreadIDAddr); + OMPBuilder.Builder.CreateStore(Constant::getNullValue(OMPBuilder.Int32), + ZeroAddr); + + FunctionCallee TeamsOutlinedFn(OutlinedFn); + SmallVector Args; + Args.append({ThreadIDAddr, ZeroAddr}); + + for (size_t Idx = 0; Idx < CapturedVars.size(); ++Idx) { + // Pass firstprivate scalar by value. + if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE && + CapturedVars[Idx] + ->getType() + ->getPointerElementType() + ->isSingleValueType()) { + Type *VPtrElemTy = CapturedVars[Idx]->getType()->getPointerElementType(); + Value *Load = + OMPBuilder.Builder.CreateLoad(VPtrElemTy, CapturedVars[Idx]); + // TODO: Runtime expects values in Int64 type, fix with arguments in + // struct. 
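+      // Widen the scalar through an i64-sized stack slot: store it through a
+      // pointer bitcast to its original type, then reload the slot as i64 so
+      // the by-value argument matches the runtime's word-sized convention.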
+ AllocaInst *TmpInt64 = OMPBuilder.Builder.CreateAlloca( + OMPBuilder.Int64, nullptr, + CapturedVars[Idx]->getName() + "fpriv.byval"); + Value *Cast = OMPBuilder.Builder.CreateBitCast( + TmpInt64, CapturedVars[Idx]->getType()); + OMPBuilder.Builder.CreateStore(Load, Cast); + Value *ConvLoad = + OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, TmpInt64); + Args.push_back(ConvLoad); + + continue; + } + Args.push_back(CapturedVars[Idx]); + } + + auto *CI = checkCreateCall(OMPBuilder.Builder, TeamsOutlinedFn, Args); + assert(CI && "Expected valid call"); + + OMPBuilder.Builder.CreateBr(AfterBB); + + DEBUG_ENABLE(dbgs() << "=== Dump OuterFn\n" + << *Fn << "=== End of Dump OuterFn\n"); + + if (verifyFunction(*Fn, &errs())) + FATAL_ERROR("Verification of OuterFn failed!"); +} + +void CGIntrinsicsOpenMP::emitOMPTeams(DSAValueMapTy &DSAValueMap, + ValueToValueMapTy *VMap, + const DebugLoc &DL, Function *Fn, + BasicBlock *BBEntry, BasicBlock *StartBB, + BasicBlock *EndBB, BasicBlock *AfterBB, + TeamsInfoStruct &TeamsInfo) { + if (isOpenMPDeviceRuntime()) + emitOMPTeamsDeviceRuntime(DSAValueMap, VMap, DL, Fn, BBEntry, StartBB, + EndBB, AfterBB, TeamsInfo); + else + emitOMPTeamsHostRuntime(DSAValueMap, VMap, DL, Fn, BBEntry, StartBB, EndBB, + AfterBB, TeamsInfo); +} + +void CGIntrinsicsOpenMP::emitOMPTeamsHostRuntime( + DSAValueMapTy &DSAValueMap, ValueToValueMapTy *VMap, const DebugLoc &DL, + Function *Fn, BasicBlock *BBEntry, BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB, TeamsInfoStruct &TeamsInfo) { + SmallVector CapturedVars; + Function *OutlinedFn = createOutlinedFunction( + DSAValueMap, /*ValueToValueMapTy */ VMap, Fn, StartBB, EndBB, + CapturedVars, ".omp_outlined_teams"); + + // Set up the call to the teams outlined function. + BBEntry->getTerminator()->eraseFromParent(); + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(BBEntry, BBEntry->end()), DL); + + uint32_t SrcLocStrSize; + Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize); + OMPBuilder.Builder.restoreIP(Loc.IP); + OMPBuilder.Builder.SetCurrentDebugLocation(Loc.DL); + Value *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize); + Value *ThreadID = OMPBuilder.getOrCreateThreadID(Ident); + + assert(Ident && "Expected non-null Ident"); + // Emit call to set the number of teams and thread limit. + if (TeamsInfo.NumTeams || TeamsInfo.ThreadLimit) { + Value *NumTeams = + (TeamsInfo.NumTeams + ? createScalarCast(TeamsInfo.NumTeams, OMPBuilder.Int32) + : Constant::getNullValue(OMPBuilder.Int32)); + Value *ThreadLimit = + (TeamsInfo.ThreadLimit + ? createScalarCast(TeamsInfo.ThreadLimit, OMPBuilder.Int32) + : Constant::getNullValue(OMPBuilder.Int32)); + FunctionCallee KmpcPushNumTeams = + OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_push_num_teams); + auto *CI = checkCreateCall(OMPBuilder.Builder, KmpcPushNumTeams, + {Ident, ThreadID, NumTeams, ThreadLimit}); + assert(CI && "Expected valid call"); + } + + FunctionCallee ForkTeams = + OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_fork_teams); + + SmallVector Args; + Value *NumCapturedVars = OMPBuilder.Builder.getInt32(CapturedVars.size()); + Args.append({Ident, NumCapturedVars, + OMPBuilder.Builder.CreateBitCast(OutlinedFn, + OMPBuilder.ParallelTaskPtr)}); + + for (size_t Idx = 0; Idx < CapturedVars.size(); ++Idx) { + // Pass firstprivate scalar by value. 
+ if (DSAValueMap[CapturedVars[Idx]].Type == DSA_FIRSTPRIVATE && + CapturedVars[Idx] + ->getType() + ->getPointerElementType() + ->isSingleValueType()) { + Type *VPtrElemTy = CapturedVars[Idx]->getType()->getPointerElementType(); + Value *Load = + OMPBuilder.Builder.CreateLoad(VPtrElemTy, CapturedVars[Idx]); + // TODO: Runtime expects values in Int64 type, fix with arguments in + // struct. + AllocaInst *TmpInt64 = OMPBuilder.Builder.CreateAlloca( + OMPBuilder.Int64, nullptr, + CapturedVars[Idx]->getName() + ".fpriv.byval"); + Value *Cast = OMPBuilder.Builder.CreateBitCast( + TmpInt64, CapturedVars[Idx]->getType()); + OMPBuilder.Builder.CreateStore(Load, Cast); + Value *ConvLoad = + OMPBuilder.Builder.CreateLoad(OMPBuilder.Int64, TmpInt64); + Args.push_back(ConvLoad); + + continue; + } + Args.push_back(CapturedVars[Idx]); + } + + auto *CI = checkCreateCall(OMPBuilder.Builder, ForkTeams, Args); + assert(CI && "Expected valid call"); + + OMPBuilder.Builder.CreateBr(AfterBB); + + DEBUG_ENABLE(dbgs() << "=== Dump OuterFn\n" + << *Fn << "=== End of Dump OuterFn\n"); + + if (verifyFunction(*Fn, &errs())) + FATAL_ERROR("Verification of OuterFn failed!"); +} + +void CGIntrinsicsOpenMP::emitOMPTargetEnterData( + Function *Fn, BasicBlock *BBEntry, DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap) { + + const DebugLoc DL = BBEntry->getTerminator()->getDebugLoc(); + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(BBEntry, BBEntry->getTerminator()->getIterator()), DL); + + uint32_t SrcLocStrSize; + Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *SrcLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize); + + FunctionCallee TargetDataBeginMapper = OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___tgt_target_data_begin_mapper); + OMPBuilder.Builder.SetInsertPoint(BBEntry->getTerminator()); + + // Emit mappings. + OffloadingMappingArgsTy OffloadingMappingArgs; + InsertPointTy AllocaIP(&Fn->getEntryBlock(), + Fn->getEntryBlock().getFirstInsertionPt()); + emitOMPOffloadingMappings(AllocaIP, DSAValueMap, StructMappingInfoMap, + OffloadingMappingArgs, /* IsTargetRegion */ false); + + OMPBuilder.Builder.CreateCall( + TargetDataBeginMapper, + {SrcLoc, ConstantInt::get(OMPBuilder.Int64, -1), + ConstantInt::get(OMPBuilder.Int32, OffloadingMappingArgs.Size), + OffloadingMappingArgs.BasePtrs, OffloadingMappingArgs.Ptrs, + OffloadingMappingArgs.Sizes, OffloadingMappingArgs.MapTypes, + OffloadingMappingArgs.MapNames, + // TODO: offload_mappers is null for now. + Constant::getNullValue(OMPBuilder.VoidPtrPtr)}); +} + +void CGIntrinsicsOpenMP::emitOMPTargetExitData( + Function *Fn, BasicBlock *BBEntry, DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap) { + + const DebugLoc DL = BBEntry->getTerminator()->getDebugLoc(); + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(BBEntry, BBEntry->getTerminator()->getIterator()), DL); + + uint32_t SrcLocStrSize; + Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *SrcLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize); + + FunctionCallee TargetDataEndMapper = OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___tgt_target_data_end_mapper); + OMPBuilder.Builder.SetInsertPoint(BBEntry->getTerminator()); + + // Emit mappings. 
+ OffloadingMappingArgsTy OffloadingMappingArgs; + InsertPointTy AllocaIP(&Fn->getEntryBlock(), + Fn->getEntryBlock().getFirstInsertionPt()); + emitOMPOffloadingMappings(AllocaIP, DSAValueMap, StructMappingInfoMap, + OffloadingMappingArgs, /* IsTargetRegion */ false); + + OMPBuilder.Builder.CreateCall( + TargetDataEndMapper, + {SrcLoc, ConstantInt::get(OMPBuilder.Int64, -1), + ConstantInt::get(OMPBuilder.Int32, OffloadingMappingArgs.Size), + OffloadingMappingArgs.BasePtrs, OffloadingMappingArgs.Ptrs, + OffloadingMappingArgs.Sizes, OffloadingMappingArgs.MapTypes, + OffloadingMappingArgs.MapNames, + // TODO: offload_mappers is null for now. + Constant::getNullValue(OMPBuilder.VoidPtrPtr)}); +} + +void CGIntrinsicsOpenMP::emitOMPTargetData(Function *Fn, BasicBlock *BBEntry, + BasicBlock *BBExit, + DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap) { + // Re-use codegen from TARGET ENTER/EXIT DATA. + emitOMPTargetEnterData(Fn, BBEntry, DSAValueMap, StructMappingInfoMap); + emitOMPTargetExitData(Fn, BBExit, DSAValueMap, StructMappingInfoMap); +} + +void CGIntrinsicsOpenMP::emitOMPTargetUpdate( + Function *Fn, BasicBlock *BBEntry, DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap) { + const DebugLoc DL = BBEntry->getTerminator()->getDebugLoc(); + OpenMPIRBuilder::LocationDescription Loc( + InsertPointTy(BBEntry, BBEntry->getTerminator()->getIterator()), DL); + + uint32_t SrcLocStrSize; + Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *SrcLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize); + + FunctionCallee TargetDataUpdateMapper = OMPBuilder.getOrCreateRuntimeFunction( + M, OMPRTL___tgt_target_data_update_mapper); + OMPBuilder.Builder.SetInsertPoint(BBEntry->getTerminator()); + + // Emit mappings. + OffloadingMappingArgsTy OffloadingMappingArgs; + InsertPointTy AllocaIP(&Fn->getEntryBlock(), + Fn->getEntryBlock().getFirstInsertionPt()); + emitOMPOffloadingMappings(AllocaIP, DSAValueMap, StructMappingInfoMap, + OffloadingMappingArgs, /* IsTargetRegion */ false); + + OMPBuilder.Builder.CreateCall( + TargetDataUpdateMapper, + {SrcLoc, ConstantInt::get(OMPBuilder.Int64, -1), + ConstantInt::get(OMPBuilder.Int32, OffloadingMappingArgs.Size), + OffloadingMappingArgs.BasePtrs, OffloadingMappingArgs.Ptrs, + OffloadingMappingArgs.Sizes, OffloadingMappingArgs.MapTypes, + OffloadingMappingArgs.MapNames, + // TODO: offload_mappers is null for now. 
+ Constant::getNullValue(OMPBuilder.VoidPtrPtr)}); +} + +void CGIntrinsicsOpenMP::emitOMPDistribute( + DSAValueMapTy &DSAValueMap, OMPLoopInfoStruct &OMPLoopInfo, + BasicBlock *StartBB, BasicBlock *ExitBB, bool IsStandalone, + bool IsDistributeParallelFor, OMPDistributeInfoStruct *DistributeInfo) { + if (static_cast(OMPLoopInfo.Sched) == 0) + OMPLoopInfo.Sched = OMPScheduleType::Distribute; + + emitLoop(DSAValueMap, OMPLoopInfo, StartBB, ExitBB, IsStandalone, true, + IsDistributeParallelFor, DistributeInfo); +} + +void CGIntrinsicsOpenMP::emitOMPDistributeParallelFor( + DSAValueMapTy &DSAValueMap, BasicBlock *StartBB, BasicBlock *ExitBB, + OMPLoopInfoStruct &OMPLoopInfo, ParRegionInfoStruct &ParRegionInfo, + bool IsStandalone) { + + Function *Fn = StartBB->getParent(); + const DebugLoc DL = StartBB->getTerminator()->getDebugLoc(); + + BasicBlock *DistPreheader = + StartBB->splitBasicBlock(StartBB->begin(), "omp.distribute.preheader"); + BasicBlock *DistHeader = DistPreheader->splitBasicBlock( + DistPreheader->begin(), "omp.distribute.header"); + BasicBlock *ForEntry = + DistHeader->splitBasicBlock(DistHeader->begin(), "omp.inner.for.entry"); + BasicBlock *ForBegin = + ForEntry->splitBasicBlock(ForEntry->begin(), "omp.inner.for.begin"); + BasicBlock *ForEnd = splitBlockBefore( + ExitBB, &*ExitBB->getFirstInsertionPt(), /*DomTreeUpdater*/ nullptr, + /*LoopInfo*/ nullptr, /*MemorySSAUpdater*/ nullptr); + ForEnd->setName("omp.inner.for.end"); + BasicBlock *ForExit = SplitBlock(ForEnd, ForEnd->getTerminator()); + ForExit->setName("omp.inner.for.exit"); + BasicBlock *ForExitAfter = SplitBlock(ForExit, ForExit->getTerminator()); + ForExitAfter->setName("omp.inner.for.exit.after"); + BasicBlock *DistInc = ForExitAfter->splitBasicBlock( + ForExitAfter->getTerminator(), "omp.distribute.inc"); + BasicBlock *DistExit = + DistInc->splitBasicBlock(DistInc->getTerminator(), "omp.distribute.exit"); + + // Create skeleton DistHeader + { + // Dummy condition to create the expected structure. + DistHeader->getTerminator()->eraseFromParent(); + OMPBuilder.Builder.SetInsertPoint(DistHeader); + auto *Cond = + OMPBuilder.Builder.CreateICmpSLE(OMPLoopInfo.IV, OMPLoopInfo.UB); + OMPBuilder.Builder.CreateCondBr(Cond, ForEntry, DistExit); + } + // Create skeleton DistInc + { + DistInc->getTerminator()->eraseFromParent(); + OMPBuilder.Builder.SetInsertPoint(DistInc); + OMPBuilder.Builder.CreateBr(DistHeader); + } + + OMPLoopInfo.Sched = (isOpenMPDeviceRuntime() ? OMPScheduleType::StaticChunked + : OMPScheduleType::Static); + emitOMPFor(DSAValueMap, OMPLoopInfo, ForBegin, ForEnd, IsStandalone, true); + BasicBlock *ParEntryBB = ForEntry; + DEBUG_ENABLE(dbgs() << "ParEntryBB " << ParEntryBB->getName() << "\n"); + BasicBlock *ParStartBB = ForBegin; + DEBUG_ENABLE(dbgs() << "ParStartBB " << ParStartBB->getName() << "\n"); + BasicBlock *ParEndBB = ForExit; + DEBUG_ENABLE(dbgs() << "ParEndBB " << ParEndBB->getName() << "\n"); + BasicBlock *ParAfterBB = ForExitAfter; + DEBUG_ENABLE(dbgs() << "ParAfterBB " << ParAfterBB->getName() << "\n"); + + emitOMPParallel( + DSAValueMap, nullptr, DL, Fn, ParEntryBB, ParStartBB, ParEndBB, + ParAfterBB, [](auto) {}, ParRegionInfo); + + // By default, to maximize performance on GPUs, we do static chunked with a + // chunk size equal to the block size when targeting the device runtime. 
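+  // In effect this is dist_schedule(static, <threads-per-block>) on the
+  // distribute loop: each team claims one block-sized chunk per round, so
+  // the threads of a team end up on consecutive iterations of the inner
+  // parallel for. The chunk value is queried from the device runtime below.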
+ if (isOpenMPDeviceRuntime()) { + OMPLoopInfo.Sched = OMPScheduleType::DistributeChunked; + // Extend DistPreheader + { + OMPBuilder.Builder.SetInsertPoint(DistPreheader, + DistPreheader->getFirstInsertionPt()); + + FunctionCallee NumTeamThreadsFn = OMPBuilder.getOrCreateRuntimeFunction( + M, llvm::omp::RuntimeFunction:: + OMPRTL___kmpc_get_hardware_num_threads_in_block); + Value *NumTeamThreads = + OMPBuilder.Builder.CreateCall(NumTeamThreadsFn, {}); + OMPLoopInfo.Chunk = NumTeamThreads; + } + } else { + OMPLoopInfo.Sched = OMPScheduleType::Distribute; + } + + OMPDistributeInfoStruct DistributeInfo; + emitOMPDistribute(DSAValueMap, OMPLoopInfo, DistPreheader, DistExit, + IsStandalone, true, &DistributeInfo); + + // Replace upper bound, lower bound to the "parallel for" with distribute + // bounds. + { + assert(DistributeInfo.LB && "Expected non-null distribute lower bound"); + assert(DistributeInfo.UB && "Expected non-null distribute upper bound"); + auto ShouldReplace = [&](Use &U) { + if (auto *UserI = dyn_cast(U.getUser())) + if (UserI->getParent() == ForEntry) + return true; + + return false; + }; + + // Replace the inner, parallel for loop LB, UB. + OMPLoopInfo.LB->replaceUsesWithIf(DistributeInfo.LB, ShouldReplace); + OMPLoopInfo.UB->replaceUsesWithIf(DistributeInfo.UB, ShouldReplace); + } +} + +void CGIntrinsicsOpenMP::emitOMPTargetTeamsDistributeParallelFor( + DSAValueMapTy &DSAValueMap, const DebugLoc &DL, Function *Fn, + BasicBlock *EntryBB, BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *ExitBB, BasicBlock *AfterBB, OMPLoopInfoStruct &OMPLoopInfo, + ParRegionInfoStruct &ParRegionInfo, TargetInfoStruct &TargetInfo, + StructMapTy &StructMappingInfoMap, bool IsDeviceTargetRegion) { + + emitOMPDistributeParallelFor(DSAValueMap, StartBB, ExitBB, OMPLoopInfo, + ParRegionInfo, + /* isStandalone */ false); + + emitOMPTargetTeams(DSAValueMap, nullptr, DL, Fn, EntryBB, + StartBB, EndBB, AfterBB, + TargetInfo, &OMPLoopInfo, StructMappingInfoMap, + IsDeviceTargetRegion); + + // Alternative codegen, starting from top-down and renaming values using the + // ValueToValueMap. +#if 0 + ValueToValueMapTy VMap; + // Lower target_teams. 
+ emitOMPTargetTeams(DSAValueMap, &VMap, DL, Fn, EntryBB, StartBB, EndBB, AfterBB, + TargetInfo, &OMPLoopInfo, StructMappingInfoMap, + IsDeviceTargetRegion); + + dbgs() << "=== VMap\n"; + for(auto VV : VMap) { + dbgs() << "V " << *VV.first << " -> " << *VV.second << "\n"; + } + dbgs() << "=== End of VMap\n"; + getchar(); + + // Update DSAValueMap + SmallVector ToDelete; + for(auto &It : DSAValueMap) { + Value *V = It.first; + if(!VMap.count(V)) + continue; + + DSAValueMap[VMap[V]] = It.second; + dbgs() << "Update DSAValueMap " << *VMap[V] << " ~> " << It.second.Type << "\n"; + ToDelete.push_back(V); + } + for(auto *V : ToDelete) { + dbgs() << "Update DSAValueMAp delete " << *V << "\n"; + DSAValueMap.erase(V); + } + + // Update OMPLoopInfo + OMPLoopInfo.IV = VMap[OMPLoopInfo.IV]; + OMPLoopInfo.Start = VMap[OMPLoopInfo.Start]; + OMPLoopInfo.LB = VMap[OMPLoopInfo.LB]; + OMPLoopInfo.UB = VMap[OMPLoopInfo.UB]; + + emitOMPDistributeParallelFor(DSAValueMap, StartBB, ExitBB, OMPLoopInfo, + ParRegionInfo, + /* isStandalone */ false); +#endif +} + +void CGIntrinsicsOpenMP::emitOMPTargetTeams( + DSAValueMapTy &DSAValueMap, ValueToValueMapTy *VMap, const DebugLoc &DL, + Function *Fn, BasicBlock *EntryBB, BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB, TargetInfoStruct &TargetInfo, + OMPLoopInfoStruct *OMPLoopInfo, StructMapTy &StructMappingInfoMap, + bool IsDeviceTargetRegion) { + + BasicBlock *TeamsEntryBB = SplitBlock(EntryBB, EntryBB->getTerminator()); + TeamsEntryBB->setName("omp.teams.entry"); + BasicBlock *TeamsStartBB = + splitBlockBefore(StartBB, &*StartBB->getFirstInsertionPt(), nullptr, + nullptr, nullptr, "omp.teams.start"); + BasicBlock *TeamsEndBB = + splitBlockBefore(EndBB, &*EndBB->getFirstInsertionPt(), nullptr, nullptr, + nullptr, "omp.teams.end"); + // TargetInfo contains teams info. + TeamsInfoStruct TeamsInfo; + TeamsInfo.NumTeams = TargetInfo.NumTeams; + TeamsInfo.ThreadLimit = TargetInfo.ThreadLimit; + emitOMPTeams(DSAValueMap, VMap, DL, Fn, TeamsEntryBB, TeamsStartBB, + TeamsEndBB, EndBB, TeamsInfo); + + emitOMPTarget(Fn, EntryBB, TeamsEntryBB, EndBB, DSAValueMap, + StructMappingInfoMap, TargetInfo, OMPLoopInfo, + IsDeviceTargetRegion); +} + +bool CGIntrinsicsOpenMP::isOpenMPDeviceRuntime() { + Triple TargetTriple(M.getTargetTriple()); + + if (TargetTriple.isNVPTX()) + return true; + + return false; +} + +template <> +Value *CGReduction::emitOperation(IRBuilderBase &IRB, + Value *LHS, Value *RHS) { + Type *VTy = RHS->getType(); + if (VTy->isIntegerTy()) + return IRB.CreateAdd(LHS, RHS, "red.add"); + else if (VTy->isFloatTy() || VTy->isDoubleTy()) + return IRB.CreateFAdd(LHS, RHS, "red.add"); + else + FATAL_ERROR("Unsupported type for reduction operation"); +} + +// OpenMP 5.1, 2.21.5, sub is the same as add. 
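+// (Partial results of a reduction(-:x) are still combined with +: each
+// thread starts from the identity 0, so summing the per-thread partials
+// yields the correct overall result.)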
+template <> +Value *CGReduction::emitOperation(IRBuilderBase &IRB, + Value *LHS, Value *RHS) { + return emitOperation(IRB, LHS, RHS); +} + +template <> +Value *CGReduction::emitOperation(IRBuilderBase &IRB, + Value *LHS, Value *RHS) { + Type *VTy = RHS->getType(); + if (VTy->isIntegerTy()) + return IRB.CreateMul(LHS, RHS, "red.mul"); + else if (VTy->isFloatTy() || VTy->isDoubleTy()) + return IRB.CreateFMul(LHS, RHS, "red.mul"); + else + FATAL_ERROR("Unsupported type for reduction operation"); +} + +template <> +InsertPointTy CGReduction::emitAtomicOperationRMW( + IRBuilderBase &IRB, Value *LHS, Value *Partial) { + IRB.CreateAtomicRMW(AtomicRMWInst::Add, LHS, Partial, None, + AtomicOrdering::Monotonic); + return IRB.saveIP(); +} + +// OpenMP 5.1, 2.21.5, sub is the same as add. +template <> +InsertPointTy CGReduction::emitAtomicOperationRMW( + IRBuilderBase &IRB, Value *LHS, Value *Partial) { + return emitAtomicOperationRMW(IRB, LHS, Partial); +} diff --git a/numba/openmp/pass/CGIntrinsicsOpenMP.h b/numba/openmp/pass/CGIntrinsicsOpenMP.h new file mode 100644 index 000000000000..f9b5d29291df --- /dev/null +++ b/numba/openmp/pass/CGIntrinsicsOpenMP.h @@ -0,0 +1,527 @@ +#ifndef LLVM_TRANSFORMS_INTRINSICS_OPENMP_CODEGEN_H +#define LLVM_TRANSFORMS_INTRINSICS_OPENMP_CODEGEN_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/Frontend/OpenMP/OMP.h.inc" +#include "llvm/Frontend/OpenMP/OMPConstants.h" +#include "llvm/Frontend/OpenMP/OMPIRBuilder.h" +#include "llvm/IR/Value.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include +#include +#include +#include +#include +#include + +#include "DebugOpenMP.h" + +using namespace llvm; +using namespace omp; + +using InsertPointTy = OpenMPIRBuilder::InsertPointTy; +using BodyGenCallbackTy = OpenMPIRBuilder::BodyGenCallbackTy; +using FinalizeCallbackTy = OpenMPIRBuilder::FinalizeCallbackTy; + +namespace iomp { +// TODO: expose clauses through namespace omp? 
+enum DSAType { + DSA_NONE, + DSA_PRIVATE, + DSA_FIRSTPRIVATE, + DSA_LASTPRIVATE, + DSA_SHARED, + DSA_REDUCTION_ADD, + DSA_REDUCTION_SUB, + DSA_REDUCTION_MUL, + DSA_MAP_ALLOC, + DSA_MAP_TO, + DSA_MAP_FROM, + DSA_MAP_TOFROM, + DSA_MAP_ALLOC_STRUCT, + DSA_MAP_TO_STRUCT, + DSA_MAP_FROM_STRUCT, + DSA_MAP_TOFROM_STRUCT, + DSA_MAP_STRUCT +}; + +struct DSATypeInfo { + DSAType Type; + FunctionCallee CopyConstructor; + + DSATypeInfo() : Type(DSA_NONE), CopyConstructor(nullptr) {} + DSATypeInfo(DSAType InType) : Type(InType), CopyConstructor(nullptr) {} + DSATypeInfo(DSAType InType, FunctionCallee InCopyConstructor) + : Type(InType), CopyConstructor(InCopyConstructor) {} + DSATypeInfo(const DSATypeInfo &DTI) { + Type = DTI.Type; + CopyConstructor = DTI.CopyConstructor; + } + DSATypeInfo &operator=(const DSATypeInfo &DTI) = default; +}; + +using DSAValueMapTy = MapVector; + +// using DSAValueMapTy = MapVector; + +static const DenseMap StringToDir = { + {"DIR.OMP.PARALLEL", OMPD_parallel}, + {"DIR.OMP.SINGLE", OMPD_single}, + {"DIR.OMP.CRITICAL", OMPD_critical}, + {"DIR.OMP.BARRIER", OMPD_barrier}, + {"DIR.OMP.LOOP", OMPD_for}, + {"DIR.OMP.PARALLEL.LOOP", OMPD_parallel_for}, + {"DIR.OMP.TASK", OMPD_task}, + {"DIR.OMP.TASKWAIT", OMPD_taskwait}, + {"DIR.OMP.TARGET", OMPD_target}, + {"DIR.OMP.TEAMS", OMPD_teams}, + {"DIR.OMP.DISTRIBUTE", OMPD_distribute}, + {"DIR.OMP.TEAMS.DISTRIBUTE", OMPD_teams_distribute}, + {"DIR.OMP.TEAMS.DISTRIBUTE.PARALLEL.LOOP", + OMPD_teams_distribute_parallel_for}, + {"DIR.OMP.TARGET.TEAMS", OMPD_target_teams}, + {"DIR.OMP.TARGET.DATA", OMPD_target_data}, + {"DIR.OMP.TARGET.ENTER.DATA", OMPD_target_enter_data}, + {"DIR.OMP.TARGET.EXIT.DATA", OMPD_target_exit_data}, + {"DIR.OMP.TARGET.UPDATE", OMPD_target_update}, + {"DIR.OMP.TARGET.TEAMS.DISTRIBUTE", OMPD_target_teams_distribute}, + {"DIR.OMP.DISTRIBUTE.PARALLEL.LOOP", OMPD_distribute_parallel_for}, + {"DIR.OMP.TARGET.TEAMS.DISTRIBUTE.PARALLEL.LOOP", + OMPD_target_teams_distribute_parallel_for}}; + +// TODO: add more reduction operators. +static const DenseMap StringToDSA = { + {"QUAL.OMP.PRIVATE", DSA_PRIVATE}, + {"QUAL.OMP.FIRSTPRIVATE", DSA_FIRSTPRIVATE}, + {"QUAL.OMP.LASTPRIVATE", DSA_LASTPRIVATE}, + {"QUAL.OMP.SHARED", DSA_SHARED}, + {"QUAL.OMP.REDUCTION.ADD", DSA_REDUCTION_ADD}, + {"QUAL.OMP.REDUCTION.SUB", DSA_REDUCTION_SUB}, + {"QUAL.OMP.REDUCTION.MUL", DSA_REDUCTION_MUL}, + {"QUAL.OMP.MAP.ALLOC", DSA_MAP_ALLOC}, + {"QUAL.OMP.MAP.TO", DSA_MAP_TO}, + {"QUAL.OMP.MAP.FROM", DSA_MAP_FROM}, + {"QUAL.OMP.MAP.TOFROM", DSA_MAP_TOFROM}, + {"QUAL.OMP.MAP.ALLOC.STRUCT", DSA_MAP_ALLOC_STRUCT}, + {"QUAL.OMP.MAP.TO.STRUCT", DSA_MAP_TO_STRUCT}, + {"QUAL.OMP.MAP.FROM.STRUCT", DSA_MAP_FROM_STRUCT}, + {"QUAL.OMP.MAP.TOFROM.STRUCT", DSA_MAP_TOFROM_STRUCT}}; + +/// Data attributes for each data reference used in an OpenMP target region. 
+enum tgt_map_type { + // No flags + OMP_TGT_MAPTYPE_NONE = 0x000, + // copy data from host to device + OMP_TGT_MAPTYPE_TO = 0x001, + // copy data from device to host + OMP_TGT_MAPTYPE_FROM = 0x002, + // copy regardless of the reference count + OMP_TGT_MAPTYPE_ALWAYS = 0x004, + // force unmapping of data + OMP_TGT_MAPTYPE_DELETE = 0x008, + // map the pointer as well as the pointee + OMP_TGT_MAPTYPE_PTR_AND_OBJ = 0x010, + // pass device base address to kernel + OMP_TGT_MAPTYPE_TARGET_PARAM = 0x020, + // return base device address of mapped data + OMP_TGT_MAPTYPE_RETURN_PARAM = 0x040, + // private variable - not mapped + OMP_TGT_MAPTYPE_PRIVATE = 0x080, + // copy by value - not mapped + OMP_TGT_MAPTYPE_LITERAL = 0x100, + // mapping is implicit + OMP_TGT_MAPTYPE_IMPLICIT = 0x200, + // copy data to device + OMP_TGT_MAPTYPE_CLOSE = 0x400, + // runtime error if not already allocated + OMP_TGT_MAPTYPE_PRESENT = 0x1000, + // descriptor for non-contiguous target-update + OMP_TGT_MAPTYPE_NON_CONTIG = 0x100000000000, + // member of struct, member given by [16 MSBs] - 1 + OMP_TGT_MAPTYPE_MEMBER_OF = 0xffff000000000000 +}; + +struct OffloadingMappingArgsTy { + Value *Sizes; + Value *MapTypes; + Value *MapNames; + Value *BasePtrs; + Value *Ptrs; + size_t Size; +}; + +struct FieldMappingInfo { + Value *Index; + Value *Offset; + Value *NumElements; + DSAType MapType; +}; + +using StructMapTy = MapVector>; + +struct OMPLoopInfoStruct { + Value *IV = nullptr; + Value *Start = nullptr; + Value *LB = nullptr; + Value *UB = nullptr; + // 0 is invalid, schedule will be set by the user or to reasonable defaults + // by the pass. + OMPScheduleType DistSched = static_cast(0); + OMPScheduleType Sched = static_cast(0); + Value *Chunk = nullptr; +}; + +struct OMPDistributeInfoStruct { + Value *UB = nullptr; + Value *LB = nullptr; +}; + +struct TargetInfoStruct { + StringRef DevFuncName; + ConstantDataArray *ELF = nullptr; + Value *NumTeams = nullptr; + Value *ThreadLimit = nullptr; + OMPTgtExecModeFlags ExecMode = (OMPTgtExecModeFlags)0; + bool NoWait = false; +}; + +struct ParRegionInfoStruct { + Value *NumThreads = nullptr; + Value *IfCondition = nullptr; +}; + +struct TeamsInfoStruct { + Value *NumTeams = nullptr; + Value *ThreadLimit = nullptr; +}; + +struct CGReduction { + template + static Value *emitOperation(IRBuilderBase &IRB, Value *LHS, Value *RHS); + + template + static OpenMPIRBuilder::InsertPointTy + reductionNonAtomic(OpenMPIRBuilder::InsertPointTy IP, Value *LHS, Value *RHS, + Value *&Result) { + IRBuilder<> Builder(IP.getBlock(), IP.getPoint()); + Result = emitOperation(Builder, LHS, RHS); + return Builder.saveIP(); + } + + template + static InsertPointTy emitAtomicOperationRMW(IRBuilderBase &IRB, Value *LHS, + Value *Partial); + + template + static InsertPointTy emitAtomicOperationCmpxchg(IRBuilderBase &IRB, + InsertPointTy IP, Type *VTy, + Value *LHS, Value *Partial) { + LLVMContext &Ctx = IRB.getContext(); + unsigned int Bitwidth = VTy->getScalarSizeInBits(); + auto *IntTy = + (Bitwidth == 64 ? Type::getInt64Ty(Ctx) : Type::getInt32Ty(Ctx)); + auto *IntPtrTy = + (Bitwidth == 64 ? Type::getInt64PtrTy(Ctx) : Type::getInt32PtrTy(Ctx)); + + auto SaveIP = IRB.saveIP(); + // TODO: move alloca to function entry point, may be outlined later, e.g., + // for nested under parallel. 
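+    // Sketch of the retry scheme implemented here: the destination is read
+    // through an integer pointer of equal width, the reduction op is applied
+    // on the bitcast value, and a monotonic cmpxchg publishes the result; on
+    // failure the observed value is kept in the temporary alloca below and
+    // the update is retried in the 'Retry' block.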
+    Value *AllocaTemp =
+        IRB.CreateAlloca(IntTy, nullptr, "atomic.alloca.tmp");
+    IRB.restoreIP(SaveIP);
+
+    Value *CastLHS =
+        IRB.CreateBitCast(LHS, IntPtrTy, LHS->getName() + ".cast.int");
+    auto *LoadAtomic =
+        IRB.CreateLoad(IntTy, CastLHS, LHS->getName() + ".load.atomic");
+    LoadAtomic->setAtomic(AtomicOrdering::Monotonic);
+
+    Value *CastFP = IRB.CreateBitCast(LoadAtomic, VTy, "cast.fp");
+    Value *RedOp = emitOperation<ReductionOperator>(IRB, CastFP, Partial);
+    Value *CastFAdd =
+        IRB.CreateBitCast(RedOp, IntTy, RedOp->getName() + ".cast.int");
+
+    auto *CmpXchg = IRB.CreateAtomicCmpXchg(CastLHS, LoadAtomic, CastFAdd,
+                                            None, AtomicOrdering::Monotonic,
+                                            AtomicOrdering::Monotonic);
+
+    auto *Returned = IRB.CreateExtractValue(CmpXchg, 0);
+    auto *StoreTemp = IRB.CreateStore(Returned, AllocaTemp);
+    auto *Cond = IRB.CreateExtractValue(CmpXchg, 1);
+    // Add unreachable as placeholder for splitting.
+    auto *Unreachable = IRB.CreateUnreachable();
+    auto *IfTrueTerm = SplitBlockAndInsertIfThen(Cond, Unreachable, false);
+    auto *ExitBlock = IfTrueTerm->getParent();
+    auto *Retry = ExitBlock->getSingleSuccessor();
+    assert(Retry && "Expected single successor tail block");
+    // Erase the fall-through branch.
+    IfTrueTerm->eraseFromParent();
+
+    SaveIP = IRB.saveIP();
+    IRB.SetInsertPoint(Retry, Retry->getFirstInsertionPt());
+    auto *LoadReturned = IRB.CreateLoad(IntTy, AllocaTemp);
+    auto *CastLoad = IRB.CreateBitCast(LoadReturned, VTy);
+    RedOp = emitOperation<ReductionOperator>(IRB, CastLoad, Partial);
+    CastFAdd =
+        IRB.CreateBitCast(RedOp, IntTy, RedOp->getName() + ".cast.int");
+    CmpXchg = IRB.CreateAtomicCmpXchg(CastLHS, LoadReturned, CastFAdd, None,
+                                      AtomicOrdering::Monotonic,
+                                      AtomicOrdering::Monotonic);
+    Returned = IRB.CreateExtractValue(CmpXchg, 0);
+    StoreTemp = IRB.CreateStore(Returned, AllocaTemp);
+    Cond = IRB.CreateExtractValue(CmpXchg, 1);
+    IRB.CreateCondBr(Cond, ExitBlock, Retry);
+    // Remove unreachable placeholder.
+    Unreachable->eraseFromParent();
+    IRB.restoreIP(SaveIP);
+
+    return InsertPointTy(ExitBlock, ExitBlock->getFirstInsertionPt());
+  }
+
+  template <DSAType ReductionOperator>
+  static OpenMPIRBuilder::InsertPointTy
+  reductionAtomic(OpenMPIRBuilder::InsertPointTy IP, Type *VTy, Value *LHS,
+                  Value *RHS) {
+    IRBuilder<> Builder(IP.getBlock(), IP.getPoint());
+    Value *Partial = Builder.CreateLoad(VTy, RHS, "red.partial");
+    if (VTy->isIntegerTy())
+      switch (ReductionOperator) {
+      case DSA_REDUCTION_ADD:
+      case DSA_REDUCTION_SUB:
+        return emitAtomicOperationRMW<ReductionOperator>(Builder, LHS,
+                                                         Partial);
+      case DSA_REDUCTION_MUL:
+        // RMW does not support mul.
+        return emitAtomicOperationCmpxchg<ReductionOperator>(Builder, IP, VTy,
+                                                             LHS, Partial);
+      default:
+        FATAL_ERROR("Unsupported reduction operation");
+      }
+    else if (VTy->isFloatTy() || VTy->isDoubleTy()) {
+      // NOTE: Using atomicrmw for floats is buggy for aarch64, fall back to
+      // cmpxchg codegen for now similarly to Clang. Revisit with newer LLVM
+      // versions.
+ // Builder.CreateAtomicRMW(AtomicRMWInst::FAdd, LHS, Partial, None, + // AtomicOrdering::Monotonic); + return emitAtomicOperationCmpxchg(Builder, IP, VTy, LHS, + Partial); + } else + FATAL_ERROR("Unsupported type for reductionAtomic"); + } + + template + static Value *emitInitAndAppendInfo( + IRBuilderBase &IRB, InsertPointTy AllocaIP, Value *Orig, + SmallVectorImpl &ReductionInfos) { + auto GetIdentityValue = []() { + switch (ReductionOperator) { + case DSA_REDUCTION_ADD: + case DSA_REDUCTION_SUB: + return 0; + case DSA_REDUCTION_MUL: + return 1; + default: + FATAL_ERROR("Unknown reduction type"); + } + }; + + Type *VTy = Orig->getType()->getPointerElementType(); + auto SaveIP = IRB.saveIP(); + IRB.restoreIP(AllocaIP); + Value *Priv = IRB.CreateAlloca(VTy, /* ArraySize */ nullptr, + Orig->getName() + ".red.priv"); + IRB.restoreIP(SaveIP); + + // Store identity value based on operation and type. + if (VTy->isIntegerTy()) { + IRB.CreateStore(ConstantInt::get(VTy, GetIdentityValue()), + Priv); + } else if (VTy->isFloatTy() || VTy->isDoubleTy()) { + IRB.CreateStore(ConstantFP::get(VTy, GetIdentityValue()), + Priv); + } + else + FATAL_ERROR( + "Unsupported type to init with identity reduction value"); + + ReductionInfos.push_back( + {VTy, Orig, Priv, + CGReduction::reductionNonAtomic, + CGReduction::reductionAtomic}); + + return Priv; + } +}; + +class CGIntrinsicsOpenMP { +public: + CGIntrinsicsOpenMP(Module &M); + + OpenMPIRBuilder OMPBuilder; + Module &M; + StructType *TgtOffloadEntryTy; + + StructType *getTgtOffloadEntryTy() { return TgtOffloadEntryTy; } + + void emitOMPParallel(DSAValueMapTy &DSAValueMap, ValueToValueMapTy *VMap, + const DebugLoc &DL, Function *Fn, BasicBlock *BBEntry, + BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB, FinalizeCallbackTy FiniCB, + ParRegionInfoStruct &ParRegionInfo); + + void emitOMPFor(DSAValueMapTy &DSAValueMap, OMPLoopInfoStruct &OMPLoopInfo, + BasicBlock *StartBB, BasicBlock *ExitBB, bool IsStandalone, + bool IsDistributeParallelFor); + + void emitOMPTask(DSAValueMapTy &DSAValueMap, Function *Fn, + BasicBlock *BBEntry, BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB); + + void emitOMPOffloadingEntry(const Twine &DevFuncName, Value *EntryPtr, + Constant *&OMPOffloadEntry); + + void emitOMPOffloadingMappings(InsertPointTy AllocaIP, + DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap, + OffloadingMappingArgsTy &OffloadingMappingArgs, + bool IsTargetRegion); + + void emitOMPSingle(Function *Fn, BasicBlock *BBEntry, BasicBlock *AfterBB, + BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB); + + void emitOMPCritical(Function *Fn, BasicBlock *BBEntry, BasicBlock *AfterBB, + BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB); + + void emitOMPBarrier(Function *Fn, BasicBlock *BBEntry, Directive DK); + + void emitOMPTaskwait(BasicBlock *BBEntry); + + void emitOMPTarget(Function *Fn, BasicBlock *BBEntry, BasicBlock *StartBB, + BasicBlock *EndBB, DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap, + TargetInfoStruct &TargetInfo, + OMPLoopInfoStruct *OMPLoopInfo, bool IsDeviceTargetRegion); + + void emitOMPTeams(DSAValueMapTy &DSAValueMap, ValueToValueMapTy *VMap, + const DebugLoc &DL, Function *Fn, BasicBlock *BBEntry, + BasicBlock *StartBB, BasicBlock *EndBB, BasicBlock *AfterBB, + TeamsInfoStruct &TeamsInfo); + + void emitOMPTargetData(Function *Fn, BasicBlock *BBEntry, BasicBlock *BBExit, + DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap); + + void emitOMPTargetEnterData(Function 
*Fn, BasicBlock *BBEntry, + DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap); + + void emitOMPTargetExitData(Function *Fn, BasicBlock *BBEntry, + DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap); + + void emitOMPTargetUpdate(Function *Fn, BasicBlock *BBEntry, + DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap); + + void emitOMPDistribute(DSAValueMapTy &DSAValueMap, + OMPLoopInfoStruct &OMPLoopInfo, BasicBlock *StartBB, + BasicBlock *ExitBB, bool IsStandalone, + bool IsDistributeParallelFor, + OMPDistributeInfoStruct *DistributeInfo = nullptr); + + void emitOMPDistributeParallelFor(DSAValueMapTy &DSAValueMap, + BasicBlock *StartBB, BasicBlock *ExitBB, + OMPLoopInfoStruct &OMPLoopInfo, + ParRegionInfoStruct &ParRegionInfo, + bool IsStandalone); + + void emitOMPTargetTeamsDistributeParallelFor( + DSAValueMapTy &DSAValueMap, const DebugLoc &DL, Function *Fn, + BasicBlock *EntryBB, BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *ExitBB, BasicBlock *AfterBB, OMPLoopInfoStruct &OMPLoopInfo, + ParRegionInfoStruct &ParRegionInfo, TargetInfoStruct &TargetInfo, + StructMapTy &StructMappingInfoMap, bool IsDeviceTargetRegion); + + void emitOMPTargetTeams(DSAValueMapTy &DSAValueMap, ValueToValueMapTy *VMap, + const DebugLoc &DL, Function *Fn, BasicBlock *EntryBB, + BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB, TargetInfoStruct &TargetInfo, + OMPLoopInfoStruct *OMPLoopInfo, + StructMapTy &StructMappingInfoMap, + bool IsDeviceTargetRegion); + + GlobalVariable *emitOffloadingGlobals(StringRef DevWrapperFuncName, + ConstantDataArray *ELF); + + Twine getDevWrapperFuncPrefix() { return "__omp_offload_numba_"; } + + Function *createOutlinedFunction(DSAValueMapTy &DSAValueMap, + ValueToValueMapTy *VMap, Function *OuterFn, + BasicBlock *StartBB, BasicBlock *EndBB, + SmallVectorImpl &CapturedVars, + StringRef Suffix); + + void setDeviceGlobalizedValues(const ArrayRef GlobalizedValues); + +private: + void emitOMPParallelDeviceRuntime(DSAValueMapTy &DSAValueMap, + ValueToValueMapTy *VMap, const DebugLoc &DL, + Function *Fn, BasicBlock *BBEntry, + BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB, + FinalizeCallbackTy FiniCB, + ParRegionInfoStruct &ParRegionInfo); + void emitOMPParallelHostRuntime(DSAValueMapTy &DSAValueMap, + ValueToValueMapTy *VMap, const DebugLoc &DL, + Function *Fn, BasicBlock *BBEntry, + BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB, + FinalizeCallbackTy FiniCB, + ParRegionInfoStruct &ParRegionInfo); + void emitOMPParallelHostRuntimeOMPIRBuilder( + DSAValueMapTy &DSAValueMap, ValueToValueMapTy *VMap, const DebugLoc &DL, + Function *Fn, BasicBlock *BBEntry, BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB, FinalizeCallbackTy FiniCB, + ParRegionInfoStruct &ParRegionInfo); + + void emitOMPTeamsDeviceRuntime(DSAValueMapTy &DSAValueMap, + ValueToValueMapTy *VMap, const DebugLoc &DL, + Function *Fn, BasicBlock *BBEntry, + BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB, + TeamsInfoStruct &TeamsInfo); + void emitOMPTeamsHostRuntime(DSAValueMapTy &DSAValueMap, + ValueToValueMapTy *VMap, const DebugLoc &DL, + Function *Fn, BasicBlock *BBEntry, + BasicBlock *StartBB, BasicBlock *EndBB, + BasicBlock *AfterBB, TeamsInfoStruct &TeamsInfo); + + void emitOMPTargetHost(Function *Fn, BasicBlock *BBEntry, BasicBlock *StartBB, + BasicBlock *EndBB, DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap, + TargetInfoStruct &TargetInfo, + OMPLoopInfoStruct *OMPLoopInfo); + + void 
emitOMPTargetDevice(Function *Fn, BasicBlock *BBEntry, + BasicBlock *StartBB, BasicBlock *EndBB, + DSAValueMapTy &DSAValueMap, + StructMapTy &StructMappingInfoMap, + TargetInfoStruct &TargetInfo); + + void emitLoop(DSAValueMapTy &DSAValueMap, OMPLoopInfoStruct &OMPLoopInfo, + BasicBlock *StartBB, BasicBlock *ExitBB, bool IsStandalone, + bool IsDistribute, bool IsDistributeParallelFor, + OMPDistributeInfoStruct *OMPDistributeInfo = nullptr); + + FunctionCallee getKmpcForStaticInit(Type *Ty); + FunctionCallee getKmpcDistributeStaticInit(Type *Ty); + Value *createScalarCast(Value *V, Type *DestTy); + bool isOpenMPDeviceRuntime(); + + SmallPtrSet DeviceGlobalizedValues; +}; + +} // namespace iomp + +#endif \ No newline at end of file diff --git a/numba/openmp/pass/CMakeLists.txt b/numba/openmp/pass/CMakeLists.txt new file mode 100644 index 000000000000..a01fa0a46c71 --- /dev/null +++ b/numba/openmp/pass/CMakeLists.txt @@ -0,0 +1,40 @@ +cmake_minimum_required(VERSION 3.20) +project(pyomp-pass) + +# Set this to a valid LLVM installation dir +set(LT_LLVM_INSTALL_DIR "" CACHE PATH "LLVM installation directory") + +# Add the location of LLVMConfig.cmake to CMake search paths (so that +# find_package can locate it) +list(APPEND CMAKE_PREFIX_PATH "${LT_LLVM_INSTALL_DIR}/lib/cmake/llvm/") + +find_package(LLVM CONFIG) +if("${LLVM_VERSION_MAJOR}" VERSION_LESS 14) + message(FATAL_ERROR "Found LLVM ${LLVM_VERSION_MAJOR}, but need LLVM 14 or above") +endif() + +# HelloWorld includes headers from LLVM - update the include paths accordingly +include_directories(SYSTEM ${LLVM_INCLUDE_DIRS}) + +# Use the same C++ standard as LLVM does +set(CMAKE_CXX_STANDARD 17 CACHE STRING "") + +# LLVM is normally built without RTTI. Be consistent with that. +if(NOT LLVM_ENABLE_RTTI) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti") +endif() + +add_library(IntrinsicsOpenMP SHARED + CGIntrinsicsOpenMP.cpp + DebugOpenMP.cpp + IntrinsicsOpenMP.cpp) + +# Allow undefined symbols in shared objects on Darwin (this is the default +# behaviour on Linux) +target_link_libraries(IntrinsicsOpenMP + "$<$:-undefined dynamic_lookup>") + +install(TARGETS IntrinsicsOpenMP + EXPORT IntrinsicsOpenMP + LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX} +) diff --git a/numba/openmp/pass/DebugOpenMP.cpp b/numba/openmp/pass/DebugOpenMP.cpp new file mode 100644 index 000000000000..d0d01f4f7d7b --- /dev/null +++ b/numba/openmp/pass/DebugOpenMP.cpp @@ -0,0 +1,16 @@ +#include +#include +#include + +bool DebugOpenMPFlag; +void DebugOpenMPInit() { + char *DebugStr = getenv("NUMBA_DEBUG_OPENMP_LLVM_PASS"); + DebugOpenMPFlag = false; + if(DebugStr) + DebugOpenMPFlag = (std::stoi(DebugStr) >= 1); +} + +[[noreturn]] void fatalError(const std::string &msg, const char *file, int line) { + std::cerr << "Fatal error @ " << file << ":" << line << " :: " << msg << "\n"; + std::abort(); +} diff --git a/numba/openmp/pass/DebugOpenMP.h b/numba/openmp/pass/DebugOpenMP.h new file mode 100644 index 000000000000..c9814dc90553 --- /dev/null +++ b/numba/openmp/pass/DebugOpenMP.h @@ -0,0 +1,28 @@ +#ifndef DEBUG_OPENMP_H +#define DEBUG_OPENMP_H + +#include + +#ifdef NDEBUG + +#define DEBUG_ENABLE(X) +#define DebugOpenMPInit() + +#else + +extern bool DebugOpenMPFlag; +void DebugOpenMPInit(); + +#define DEBUG_ENABLE(X) \ + do { \ + if (DebugOpenMPFlag) { \ + X; \ + } \ + } while (false) + +#endif + +[[noreturn]] void fatalError(const std::string &msg, const char *file, int line); +#define FATAL_ERROR(msg) fatalError(msg, __FILE__, __LINE__) + +#endif diff --git 
a/numba/openmp/pass/IntrinsicsOpenMP.cpp b/numba/openmp/pass/IntrinsicsOpenMP.cpp new file mode 100644 index 000000000000..81b4e334cdd7 --- /dev/null +++ b/numba/openmp/pass/IntrinsicsOpenMP.cpp @@ -0,0 +1,732 @@ +//===- IntrinsicsOpenMP.cpp - Codegen OpenMP from IR intrinsics +//--------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements code generation for OpenMP from intrinsics embedded in +// the IR, using the OpenMPIRBuilder +// +//===-------------------------------------------------------------------------===// + +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Frontend/OpenMP/OMP.h.inc" +#include "llvm/Frontend/OpenMP/OMPConstants.h" +#include "llvm/Frontend/OpenMP/OMPIRBuilder.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Pass.h" +#include "llvm/Passes/PassBuilder.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include + +#include "IntrinsicsOpenMP.h" +#include "IntrinsicsOpenMP_CAPI.h" +#include "CGIntrinsicsOpenMP.h" +#include "DebugOpenMP.h" + +#include +#include + +using namespace llvm; +using namespace omp; +using namespace iomp; + +#define DEBUG_TYPE "intrinsics-openmp" + +// TODO: Increment. +STATISTIC(NumOpenMPRegions, "Counts number of OpenMP regions created"); + +namespace { + +class DirectiveRegionAnalysis; + +class DirectiveRegion; +SmallVector, 8> DirectiveRegionStorage; + +class DirectiveRegion { +public: + DirectiveRegion() = delete; + + void addNested(DirectiveRegionAnalysis &DRA, DirectiveRegion *DR); + + const SmallVector &getNested() const { return Nested; } + + CallBase *getEntry() const { return CBEntry; } + + CallBase *getExit() const { return CBExit; } + + void setParent(DirectiveRegion *P) { Parent = P; } + + DirectiveRegion *getParent() const { return Parent; } + + StringRef getTag() const { + return getEntry()->getOperandBundleAt(0).getTagName(); + } + + static DirectiveRegion *create(CallBase *CBEntry, CallBase *CBExit) { + // Use global storage of unique_ptr for auto-cleanup. + DirectiveRegionStorage.push_back( + std::unique_ptr(new DirectiveRegion{CBEntry, CBExit})); + return DirectiveRegionStorage.back().get(); + } + +private: + CallBase *CBEntry; + CallBase *CBExit; + DirectiveRegion *Parent; + SmallVector Nested; + + DirectiveRegion(CallBase *CBEntry, CallBase *CBExit) + : CBEntry(CBEntry), CBExit(CBExit), Parent(nullptr) {} +}; + +class DirectiveRegionAnalysis { +public: + explicit DirectiveRegionAnalysis(Function &F) : DT(F), PDT(F) {} + + bool directiveEncloses(DirectiveRegion *DR, DirectiveRegion *OtherDR) { + // Use DominatorTree for Entry and PostDominatorTree for Exit. + // PostDominator is effective for checking Exit when there are loops in + // the CFG, since dominance does not hold for graphs with cycles, but + // post-dominance does. 
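+    // For example, with a TARGET region enclosing a PARALLEL region, the
+    // TARGET entry dominates the PARALLEL entry and the TARGET exit
+    // post-dominates the PARALLEL exit, so the TARGET directive is reported
+    // as enclosing the PARALLEL one even when the enclosed body contains
+    // loops.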
+ if (DT.dominates(DR->getEntry(), OtherDR->getEntry()) && + PDT.dominates(DR->getExit(), OtherDR->getExit())) + return true; + + return false; + }; + + bool directiveEntryDominates(DirectiveRegion *DR, DirectiveRegion *OtherDR) { + if (DT.dominates(DR->getEntry(), OtherDR->getEntry())) + return true; + + return false; + } + +private: + DominatorTree DT; + PostDominatorTree PDT; +}; + +void DirectiveRegion::addNested(DirectiveRegionAnalysis &DRA, + DirectiveRegion *DR) { + // Insert in topological order. + auto Compare = [&DRA](DirectiveRegion *DR, DirectiveRegion *OtherDR) { + return DRA.directiveEntryDominates(DR, OtherDR); + }; + + Nested.insert(std::upper_bound(Nested.begin(), Nested.end(), DR, Compare), + DR); +} + +static SmallVector +collectGlobalizedValues(DirectiveRegion &Directive) { + + SmallVector GlobalizedValues; + + SmallVector OpBundles; + Directive.getEntry()->getOperandBundlesAsDefs(OpBundles); + for (OperandBundleDef &O : OpBundles) { + StringRef Tag = O.getTag(); + auto It = StringToDSA.find(Tag); + if (It == StringToDSA.end()) + continue; + + const ArrayRef &TagInputs = O.inputs(); + + DSAType DSATy = It->second; + + switch (DSATy) { + case iomp::DSA_FIRSTPRIVATE: + case iomp::DSA_PRIVATE: + continue; + default: + GlobalizedValues.push_back(TagInputs[0]); + } + } + + return GlobalizedValues; +} + +struct IntrinsicsOpenMP { + + IntrinsicsOpenMP() { + DebugOpenMPInit(); + } + + bool runOnModule(Module &M) { + // Codegen for nested or combined constructs assumes code is generated + // bottom-up, that is from the innermost directive to the outermost. This + // simplifies handling of DSA attributes by avoiding renaming values (tags + // contain pre-lowered values when defining the data sharing environment) + // when an outlined function privatizes them in the DSAValueMap. + DEBUG_ENABLE(dbgs() << "=== Start IntrinsicsOpenMPPass v4\n"); + + Function *RegionEntryF = M.getFunction("llvm.directive.region.entry"); + + // Return early for lack of directive intrinsics. + if (!RegionEntryF) { + DEBUG_ENABLE(dbgs() << "No intrinsics directives, exiting...\n"); + return false; + } + + DEBUG_ENABLE(dbgs() << "=== Dump Module\n" + << M << "=== End of Dump Module\n"); + + CGIntrinsicsOpenMP CGIOMP(M); + // Find all calls to directive intrinsics. + SmallMapVector, 8> + FunctionToDirectives; + + for (User *Usr : RegionEntryF->users()) { + CallBase *CBEntry = dyn_cast(Usr); + assert(CBEntry && "Expected call to directive entry"); + assert(CBEntry->getNumUses() == 1 && + "Expected single use of the directive entry"); + Use &U = *CBEntry->use_begin(); + CallBase *CBExit = dyn_cast(U.getUser()); + assert(CBExit && "Expected call to region exit intrinsic"); + Function *F = CBEntry->getFunction(); + assert(F == CBExit->getFunction() && + "Expected directive entry/exit in the same function"); + + DirectiveRegion *DM = DirectiveRegion::create(CBEntry, CBExit); + FunctionToDirectives[F].push_back(DM); + } + + SmallVector, 4> DirectiveListVector; + // Create directive lists per function, building trees of directive nests. + // Each list stores directives outermost to innermost (pre-order). + for (auto &FTD : FunctionToDirectives) { + // Find the dominator tree for the function to find directive lists. + Function &F = *FTD.first; + auto &DirectiveRegions = FTD.second; + DirectiveRegionAnalysis DRA{F}; + + // Construct directive tree nests. First, find immediate parents, then add + // nested children to parents. + + // Find immediate parents. 
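+      // Sketch of the search: every enclosing candidate is considered and
+      // the innermost one wins. E.g. for TARGET { TEAMS { PARALLEL } } the
+      // PARALLEL region's candidates are TARGET and TEAMS, and TEAMS becomes
+      // its immediate parent because TARGET also encloses TEAMS.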
+ for (auto *DR : DirectiveRegions) { + for (auto *OtherDR : DirectiveRegions) { + if (DR == OtherDR) + continue; + + if (!DRA.directiveEncloses(OtherDR, DR)) + continue; + + DirectiveRegion *Parent = DR->getParent(); + if (!Parent) { + DR->setParent(OtherDR); + continue; + } + + // If OtherDR is nested under Parent and encloses DR, then OtherDR is + // the immediate parent of DR. + if (DRA.directiveEncloses(Parent, OtherDR)) { + DR->setParent(OtherDR); + continue; + } + + // Else, OtherDR must be enclosing Parent. It is not OtherDR's + // immediate parent, hence no change to OtherDR. + assert(DRA.directiveEncloses(OtherDR, Parent)); + } + } + // Gather all root directives, add nested children. + SmallVector Roots; + for (auto *DR : DirectiveRegions) { + DirectiveRegion *Parent = DR->getParent(); + if (!Parent) { + Roots.push_back(DR); + continue; + } + + Parent->addNested(DRA, DR); + } + + // Travese the tree and add directives (outermost to innermost) + // in a list. + for (auto *Root : Roots) { + SmallVector DirectiveList; + + auto VisitNode = [&DirectiveList](DirectiveRegion *Node, int Depth, + auto &&VisitNode) -> void { + DirectiveList.push_back(Node); + for (auto *Nested : Node->getNested()) + VisitNode(Nested, Depth + 1, VisitNode); + }; + + VisitNode(Root, 0, VisitNode); + + DirectiveListVector.push_back(DirectiveList); + + auto PrintTree = [&]() { + dbgs() << " === TREE\n"; + auto PrintNode = [](DirectiveRegion *Node, int Depth, + auto &&PrintNode) -> void { + if (Depth) { + for (int I = 0; I < Depth; ++I) + dbgs() << " "; + dbgs() << "|_ "; + } + dbgs() << Node->getTag() << "\n"; + + for (auto *Nested : Node->getNested()) + PrintNode(Nested, Depth + 1, PrintNode); + }; + PrintNode(Root, 0, PrintNode); + dbgs() << " === END OF TREE\n"; + }; + DEBUG_ENABLE(PrintTree()); + + auto PrintList = [&]() { + dbgs() << " === List\n"; + for (auto *DR : DirectiveList) + dbgs() << DR->getTag() << " -> "; + dbgs() << "EOL\n"; + dbgs() << " === End of List\n"; + }; + DEBUG_ENABLE(PrintList()); + } + } + + // Iterate all directive lists and codegen. + for (auto &DirectiveList : DirectiveListVector) { + // If the outermost directive is a TARGET directive, collect globalized + // values to set for codegen. + // TODO: implement Directives as a class, parse each directive before + // codegen, optimize privatization. + auto *Outer = DirectiveList.front(); + if (Outer->getEntry()->getOperandBundleAt(0).getTagName().contains( + "TARGET")) { + auto GlobalizedValues = collectGlobalizedValues(*Outer); + CGIOMP.setDeviceGlobalizedValues(GlobalizedValues); + } + // Iterate post-order, from innermost to outermost to avoid renaming + // values in codegen. + for (auto It = DirectiveList.rbegin(), E = DirectiveList.rend(); It != E; + ++It) { + DirectiveRegion *DR = *It; + DEBUG_ENABLE(dbgs() << "Found Directive " << *DR->getEntry() << "\n"); + // Extract the directive kind and data sharing attributes of values + // from the operand bundles of the intrinsic call. + Directive Dir = OMPD_unknown; + SmallVector OpBundles; + DSAValueMapTy DSAValueMap; + + // RAII for directive metainfo structs. + OMPLoopInfoStruct OMPLoopInfo; + ParRegionInfoStruct ParRegionInfo; + TargetInfoStruct TargetInfo; + TeamsInfoStruct TeamsInfo; + + MapVector> + StructMappingInfoMap; + + bool IsDeviceTargetRegion = false; + + DR->getEntry()->getOperandBundlesAsDefs(OpBundles); + // TODO: parse clauses. 
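+        // For illustration, a combined 'target teams distribute parallel
+        // for' entry typically carries bundles such as
+        //   "DIR.OMP.TARGET.TEAMS.DISTRIBUTE.PARALLEL.LOOP"(),
+        //   "QUAL.OMP.NORMALIZED.IV"(%iv), "QUAL.OMP.NORMALIZED.UB"(%ub),
+        //   "QUAL.OMP.MAP.TOFROM"(%arr)
+        // The loop below dispatches on the tag prefix: DIR.* selects the
+        // directive, QUAL.* fills the loop/target/teams info structs or the
+        // DSAValueMap.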
+ for (OperandBundleDef &O : OpBundles) { + StringRef Tag = O.getTag(); + DEBUG_ENABLE(dbgs() << "OPB " << Tag << "\n"); + + // TODO: check for conflicting DSA, for example reduction variables + // cannot be set private. Should be done in Numba. + if (Tag.startswith("DIR")) { + auto It = StringToDir.find(Tag); + assert(It != StringToDir.end() && "Directive is not supported!"); + Dir = It->second; + } else if (Tag.startswith("QUAL")) { + const ArrayRef &TagInputs = O.inputs(); + if (Tag.startswith("QUAL.OMP.NORMALIZED.IV")) { + assert(O.input_size() == 1 && "Expected single IV value"); + OMPLoopInfo.IV = TagInputs[0]; + } else if (Tag.startswith("QUAL.OMP.NORMALIZED.START")) { + assert(O.input_size() == 1 && "Expected single START value"); + OMPLoopInfo.Start = TagInputs[0]; + } else if (Tag.startswith("QUAL.OMP.NORMALIZED.LB")) { + assert(O.input_size() == 1 && "Expected single LB value"); + OMPLoopInfo.LB = TagInputs[0]; + } else if (Tag.startswith("QUAL.OMP.NORMALIZED.UB")) { + assert(O.input_size() == 1 && "Expected single UB value"); + OMPLoopInfo.UB = TagInputs[0]; + } else if (Tag.startswith("QUAL.OMP.NUM_THREADS")) { + assert(O.input_size() == 1 && "Expected single NumThreads value"); + ParRegionInfo.NumThreads = TagInputs[0]; + } else if (Tag.startswith("QUAL.OMP.SCHEDULE")) { + // TODO: Add DIST_SCHEDULE for distribute loops. + assert(O.input_size() == 1 && + "Expected single chunking scheduling value"); + Constant *Zero = ConstantInt::get(TagInputs[0]->getType(), 0); + OMPLoopInfo.Chunk = TagInputs[0]; + + if (Tag == "QUAL.OMP.SCHEDULE.STATIC") { + if (TagInputs[0] == Zero) + OMPLoopInfo.Sched = OMPScheduleType::Static; + else { + OMPLoopInfo.Sched = OMPScheduleType::StaticChunked; + OMPLoopInfo.Chunk = TagInputs[0]; + } + } else + FATAL_ERROR("Unsupported scheduling type"); + } else if (Tag.startswith("QUAL.OMP.IF")) { + assert(O.input_size() == 1 && + "Expected single if condition value"); + ParRegionInfo.IfCondition = TagInputs[0]; + } else if (Tag.startswith("QUAL.OMP.TARGET.DEV_FUNC")) { + assert(O.input_size() == 1 && + "Expected a single device function name"); + ConstantDataArray *DevFuncArray = + dyn_cast(TagInputs[0]); + assert(DevFuncArray && + "Expected constant string for the device function"); + TargetInfo.DevFuncName = DevFuncArray->getAsString(); + } else if (Tag.startswith("QUAL.OMP.TARGET.ELF")) { + assert(O.input_size() == 1 && + "Expected a single elf image string"); + ConstantDataArray *ELF = + dyn_cast(TagInputs[0]); + assert(ELF && "Expected constant string for ELF"); + TargetInfo.ELF = ELF; + } else if (Tag.startswith("QUAL.OMP.DEVICE")) { + // TODO: Handle device selection for target regions. 
+ } else if (Tag.startswith("QUAL.OMP.NUM_TEAMS")) { + assert(O.input_size() == 1 && "Expected single NumTeams value"); + switch (Dir) { + case OMPD_target: + TargetInfo.NumTeams = TagInputs[0]; + break; + case OMPD_teams: + case OMPD_teams_distribute: + case OMPD_teams_distribute_parallel_for: + TeamsInfo.NumTeams = TagInputs[0]; + break; + case OMPD_target_teams: + case OMPD_target_teams_distribute: + TargetInfo.NumTeams = TagInputs[0]; + TeamsInfo.NumTeams = TagInputs[0]; + break; + case OMPD_target_teams_distribute_parallel_for: + TargetInfo.NumTeams = TagInputs[0]; + TeamsInfo.NumTeams = TagInputs[0]; + break; + default: + FATAL_ERROR("Unsupported qualifier in directive"); + } + } else if (Tag.startswith("QUAL.OMP.THREAD_LIMIT")) { + assert(O.input_size() == 1 && + "Expected single ThreadLimit value"); + switch (Dir) { + case OMPD_target: + TargetInfo.ThreadLimit = TagInputs[0]; + break; + case OMPD_teams: + case OMPD_teams_distribute: + case OMPD_teams_distribute_parallel_for: + TeamsInfo.ThreadLimit = TagInputs[0]; + break; + case OMPD_target_teams: + case OMPD_target_teams_distribute: + case OMPD_target_teams_distribute_parallel_for: + TargetInfo.ThreadLimit = TagInputs[0]; + TeamsInfo.ThreadLimit = TagInputs[0]; + break; + default: + FATAL_ERROR("Unsupported qualifier in directive"); + } + } else if (Tag.startswith("QUAL.OMP.NOWAIT")) { + switch (Dir) { + case OMPD_target: + case OMPD_target_teams: + case OMPD_target_teams_distribute: + case OMPD_target_teams_distribute_parallel_for: + TargetInfo.NoWait = true; + break; + default: + FATAL_ERROR("Unsupported nowait qualifier in directive"); + } + } else /* DSA Qualifiers */ { + auto It = StringToDSA.find(Tag); + assert(It != StringToDSA.end() && "DSA type not found in map"); + if (It->second == DSA_MAP_ALLOC_STRUCT || + It->second == DSA_MAP_TO_STRUCT || + It->second == DSA_MAP_FROM_STRUCT || + It->second == DSA_MAP_TOFROM_STRUCT) { + assert((TagInputs.size() - 1) == 3 && + "Expected input triple for struct mapping"); + Value *Index = TagInputs[1]; + Value *Offset = TagInputs[2]; + Value *NumElements = TagInputs[3]; + StructMappingInfoMap[TagInputs[0]].push_back( + {Index, Offset, NumElements, It->second}); + + DSAValueMap[TagInputs[0]] = DSATypeInfo(DSA_MAP_STRUCT); + } else { + // This firstprivate includes a copy-constructor operand. + if ((It->second == DSA_FIRSTPRIVATE || + It->second == DSA_LASTPRIVATE) && + TagInputs.size() == 2) { + Value *V = TagInputs[0]; + ConstantDataArray *CopyFnNameArray = + dyn_cast(TagInputs[1]); + assert(CopyFnNameArray && "Expected constant string for the " + "copy-constructor function"); + StringRef CopyFnName = CopyFnNameArray->getAsString(); + FunctionCallee CopyConstructor = M.getOrInsertFunction( + CopyFnName, V->getType()->getPointerElementType(), + V->getType()->getPointerElementType()); + DSAValueMap[TagInputs[0]] = + DSATypeInfo(It->second, CopyConstructor); + } else + // Sink for DSA qualifiers that do not require special + // handling. + DSAValueMap[TagInputs[0]] = DSATypeInfo(It->second); + } + } + } else if (Tag == "OMP.DEVICE") + IsDeviceTargetRegion = true; + else + FATAL_ERROR(("Unknown tag " + Tag).str().c_str()); + } + + assert(Dir != OMPD_unknown && "Expected valid OMP directive"); + + // Gather info. + BasicBlock *BBEntry = DR->getEntry()->getParent(); + Function *Fn = BBEntry->getParent(); + const DebugLoc DL = BBEntry->getTerminator()->getDebugLoc(); + + // Create the basic block structure to isolate the outlined region. 
+ // Structure: BBEntry -> StartBB -> BBExit -> EndBB -> AfterBB + // TODO: Reverse naming on BBExit and EndBB? + BasicBlock *StartBB = SplitBlock(BBEntry, DR->getEntry()); + assert(BBEntry->getUniqueSuccessor() == StartBB && + "Expected unique successor at region start BB"); + + BasicBlock *BBExit = DR->getExit()->getParent(); + BasicBlock *EndBB = SplitBlock(BBExit, DR->getExit()->getNextNode()); + assert(BBExit->getUniqueSuccessor() == EndBB && + "Expected unique successor at region end BB"); + BasicBlock *AfterBB = SplitBlock(EndBB, &*EndBB->getFirstInsertionPt()); + + DEBUG_ENABLE(dbgs() << "BBEntry " << BBEntry->getName() << "\n"); + DEBUG_ENABLE(dbgs() << "StartBB " << StartBB->getName() << "\n"); + DEBUG_ENABLE(dbgs() << "BBExit " << BBExit->getName() << "\n"); + DEBUG_ENABLE(dbgs() << "EndBB " << EndBB->getName() << "\n"); + DEBUG_ENABLE(dbgs() << "AfterBB " << AfterBB->getName() << "\n"); + + // Define the default BodyGenCB lambda. + auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, + BasicBlock &ContinuationIP) { + BasicBlock *CGStartBB = CodeGenIP.getBlock(); + BasicBlock *CGEndBB = SplitBlock(CGStartBB, &*CodeGenIP.getPoint()); + assert(StartBB != nullptr && "StartBB should not be null"); + CGStartBB->getTerminator()->setSuccessor(0, StartBB); + assert(EndBB != nullptr && "EndBB should not be null"); + EndBB->getTerminator()->setSuccessor(0, CGEndBB); + }; + + // Define the default FiniCB lambda. + auto FiniCB = [&](InsertPointTy CodeGenIP) {}; + + // Remove intrinsics of OpenMP tags, first CBExit to also remove use + // of CBEntry, then CBEntry. + DR->getExit()->eraseFromParent(); + DR->getEntry()->eraseFromParent(); + + if (Dir == OMPD_parallel) { + CGIOMP.emitOMPParallel(DSAValueMap, nullptr, DL, Fn, BBEntry, StartBB, + EndBB, AfterBB, FiniCB, ParRegionInfo); + } else if (Dir == OMPD_single) { + CGIOMP.emitOMPSingle(Fn, BBEntry, AfterBB, BodyGenCB, FiniCB); + } else if (Dir == OMPD_critical) { + CGIOMP.emitOMPCritical(Fn, BBEntry, AfterBB, BodyGenCB, FiniCB); + } else if (Dir == OMPD_barrier) { + CGIOMP.emitOMPBarrier(Fn, BBEntry, OMPD_barrier); + } else if (Dir == OMPD_for) { + CGIOMP.emitOMPFor(DSAValueMap, OMPLoopInfo, StartBB, BBExit, + /* IsStandalone */ true, false); + } else if (Dir == OMPD_parallel_for) { + CGIOMP.emitOMPFor(DSAValueMap, OMPLoopInfo, StartBB, BBExit, + /* IsStandalone */ false, false); + CGIOMP.emitOMPParallel(DSAValueMap, nullptr, DL, Fn, BBEntry, StartBB, + EndBB, AfterBB, FiniCB, ParRegionInfo); + } else if (Dir == OMPD_task) { + CGIOMP.emitOMPTask(DSAValueMap, Fn, BBEntry, StartBB, EndBB, AfterBB); + } else if (Dir == OMPD_taskwait) { + CGIOMP.emitOMPTaskwait(BBEntry); + } else if (Dir == OMPD_target) { + TargetInfo.ExecMode = OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC; + CGIOMP.emitOMPTarget(Fn, BBEntry, StartBB, EndBB, DSAValueMap, + StructMappingInfoMap, TargetInfo, + /* OMPLoopInfo */ nullptr, IsDeviceTargetRegion); + } else if (Dir == OMPD_teams) { + CGIOMP.emitOMPTeams(DSAValueMap, nullptr, DL, Fn, BBEntry, StartBB, + EndBB, AfterBB, TeamsInfo); + } else if (Dir == OMPD_distribute) { + CGIOMP.emitOMPDistribute(DSAValueMap, OMPLoopInfo, StartBB, BBExit, + /* IsStandalone */ true, false); + } else if (Dir == OMPD_teams_distribute) { + CGIOMP.emitOMPDistribute(DSAValueMap, OMPLoopInfo, StartBB, BBExit, + /* IsStandalone */ false, false); + CGIOMP.emitOMPTeams(DSAValueMap, nullptr, DL, Fn, BBEntry, StartBB, + EndBB, AfterBB, TeamsInfo); + } else if (Dir == OMPD_teams_distribute_parallel_for) { + 
          CGIOMP.emitOMPDistributeParallelFor(DSAValueMap, StartBB, BBExit,
+                                              OMPLoopInfo, ParRegionInfo,
+                                              /* IsStandalone */ false);
+          CGIOMP.emitOMPTeams(DSAValueMap, nullptr, DL, Fn, BBEntry, StartBB,
+                              EndBB, AfterBB, TeamsInfo);
+        } else if (Dir == OMPD_target_teams) {
+          TargetInfo.ExecMode = OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
+          CGIOMP.emitOMPTargetTeams(DSAValueMap, nullptr, DL, Fn, BBEntry,
+                                    StartBB, EndBB, AfterBB, TargetInfo,
+                                    /* OMPLoopInfo */ nullptr,
+                                    StructMappingInfoMap, IsDeviceTargetRegion);
+        } else if (Dir == OMPD_target_data) {
+          if (IsDeviceTargetRegion)
+            FATAL_ERROR("Target data should never appear inside a "
+                        "device target region");
+          CGIOMP.emitOMPTargetData(Fn, BBEntry, BBExit, DSAValueMap,
+                                   StructMappingInfoMap);
+        } else if (Dir == OMPD_target_enter_data) {
+          if (IsDeviceTargetRegion)
+            FATAL_ERROR("Target enter data should never appear inside a "
+                        "device target region");
+
+          CGIOMP.emitOMPTargetEnterData(Fn, BBEntry, DSAValueMap,
+                                        StructMappingInfoMap);
+        } else if (Dir == OMPD_target_exit_data) {
+          if (IsDeviceTargetRegion)
+            FATAL_ERROR("Target exit data should never appear inside a "
+                        "device target region");
+
+          CGIOMP.emitOMPTargetExitData(Fn, BBEntry, DSAValueMap,
+                                       StructMappingInfoMap);
+        } else if (Dir == OMPD_target_update) {
+          if (IsDeviceTargetRegion)
+            FATAL_ERROR("Target update should never appear inside a "
+                        "device target region");
+
+          CGIOMP.emitOMPTargetUpdate(Fn, BBEntry, DSAValueMap,
+                                     StructMappingInfoMap);
+        } else if (Dir == OMPD_target_teams_distribute) {
+          TargetInfo.ExecMode = OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
+          CGIOMP.emitOMPDistribute(DSAValueMap, OMPLoopInfo, StartBB, BBExit,
+                                   /* IsStandalone */ false, false);
+          CGIOMP.emitOMPTargetTeams(DSAValueMap, nullptr, DL, Fn, BBEntry,
+                                    StartBB, EndBB, AfterBB, TargetInfo,
+                                    &OMPLoopInfo, StructMappingInfoMap,
+                                    IsDeviceTargetRegion);
+        } else if (Dir == OMPD_distribute_parallel_for) {
+          CGIOMP.emitOMPDistributeParallelFor(DSAValueMap, StartBB, BBExit,
+                                              OMPLoopInfo, ParRegionInfo,
+                                              /* IsStandalone */ false);
+        } else if (Dir == OMPD_target_teams_distribute_parallel_for) {
+          TargetInfo.ExecMode = OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD;
+          CGIOMP.emitOMPTargetTeamsDistributeParallelFor(
+              DSAValueMap, DL, Fn, BBEntry, StartBB, EndBB, BBExit, AfterBB,
+              OMPLoopInfo, ParRegionInfo, TargetInfo, StructMappingInfoMap,
+              IsDeviceTargetRegion);
+        } else {
+          FATAL_ERROR("Unknown directive");
+        }
+
+        if (verifyFunction(*Fn, &errs()))
+          FATAL_ERROR("Verification of IntrinsicsOpenMP lowering failed!");
+      }
+    }
+
+    DEBUG_ENABLE(dbgs() << "=== Dump Lowered Module\n"
+                        << M << "=== End of Dump Lowered Module\n");
+
+    DEBUG_ENABLE(dbgs() << "=== End of IntrinsicsOpenMP pass\n");
+
+    return true;
+  }
+
+};
+} // namespace
+
+// Legacy PM registration.
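+// The pass is registered with both pass managers. The legacy registration
+// below backs the C API entry point LLVMAddIntrinsicsOpenMPPass() declared in
+// IntrinsicsOpenMP_CAPI.h; the new-PM plugin further down can also be driven
+// standalone. As a usage sketch (exact flags depend on the local LLVM
+// install):
+//   opt -load-pass-plugin=./libIntrinsicsOpenMP.so \
+//       -passes=intrinsics-openmp input.ll -S -o lowered.ll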
+struct LegacyIntrinsicsOpenmMPPass : public ModulePass { + static char ID; // Pass identification, replacement for typeid + LegacyIntrinsicsOpenmMPPass() : ModulePass(ID) {} + + bool runOnModule(Module &M) override { + IntrinsicsOpenMP IOMP; + return IOMP.runOnModule(M); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + ModulePass::getAnalysisUsage(AU); + } +}; + +char LegacyIntrinsicsOpenmMPPass::ID = 0; +static RegisterPass X("intrinsics-openmp", + "Legacy IntrinsicsOpenMP Pass"); + +ModulePass *llvm::createIntrinsicsOpenMPPass() { + return new LegacyIntrinsicsOpenmMPPass(); +} + +void LLVMAddIntrinsicsOpenMPPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createIntrinsicsOpenMPPass()); +} + +// New PM registration. + +class IntrinsicsOpenMPPass : public PassInfoMixin { +public: + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM) { + IntrinsicsOpenMP IOMP; + bool Changed = IOMP.runOnModule(M); + + if (Changed) + return PreservedAnalyses::none(); + + return PreservedAnalyses::all(); + + } + + // Run always to lower OpenMP intrinsics. + static bool isRequired() { return true; } +}; + +llvm::PassPluginLibraryInfo getIntrinsicsOpenMPPluginInfo() { + return {LLVM_PLUGIN_API_VERSION, "IntrinsicsOpenMP", LLVM_VERSION_STRING, + [](PassBuilder &PB) { + PB.registerPipelineParsingCallback( + [](StringRef Name, ModulePassManager &MPM, + ArrayRef) { + if (Name == "intrinsics-openmp") { + MPM.addPass(IntrinsicsOpenMPPass()); + return true; + } + return false; + }); + }}; +} + +extern "C" LLVM_ATTRIBUTE_WEAK ::llvm::PassPluginLibraryInfo +llvmGetPassPluginInfo() { + return getIntrinsicsOpenMPPluginInfo(); +} diff --git a/numba/openmp/pass/IntrinsicsOpenMP.h b/numba/openmp/pass/IntrinsicsOpenMP.h new file mode 100644 index 000000000000..3d44f3f92312 --- /dev/null +++ b/numba/openmp/pass/IntrinsicsOpenMP.h @@ -0,0 +1,14 @@ +#ifndef LLVM_TRANSFORMS_INTRINSICS_OPENMP_H +#define LLVM_TRANSFORMS_INTRINSICS_OPENMP_H + +#include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" + +namespace llvm { + + +ModulePass *createIntrinsicsOpenMPPass(); + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_INTRINSICS_OPENMP_H \ No newline at end of file diff --git a/numba/openmp/pass/IntrinsicsOpenMP_CAPI.h b/numba/openmp/pass/IntrinsicsOpenMP_CAPI.h new file mode 100644 index 000000000000..b0d0b67bca81 --- /dev/null +++ b/numba/openmp/pass/IntrinsicsOpenMP_CAPI.h @@ -0,0 +1,23 @@ +#ifndef LLVM_C_TRANSFORMS_INTRINSICS_OPENMP_H +#define LLVM_C_TRANSFORMS_INTRINSICS_OPENMP_H + +#include "llvm-c/ExternC.h" +#include "llvm-c/Types.h" + +LLVM_C_EXTERN_C_BEGIN + +/** + * @defgroup LLVMCTransformsIntrinsicsOpenMP IntrinsicsOpenMP transformations + * @ingroup LLVMCTransforms + * + * @{ + */ + +/** See llvm::createIntrinsicsOpenMPPass function. 
*/ +void LLVMAddIntrinsicsOpenMPPass(LLVMPassManagerRef PM); + +/** + * @} + */ +LLVM_C_EXTERN_C_END +#endif \ No newline at end of file diff --git a/numba/openmp/tests/test_openmp.py b/numba/openmp/tests/test_openmp.py new file mode 100644 index 000000000000..17aba2cb8b02 --- /dev/null +++ b/numba/openmp/tests/test_openmp.py @@ -0,0 +1,4959 @@ +import contextlib +import math +import time +import dis +import numbers +import os +import platform +import sys +import subprocess +import warnings +from functools import reduce +import numpy as np +from numpy.random import randn +import operator +from collections import defaultdict, namedtuple +import copy +from itertools import cycle, chain +import subprocess as subp + +from numba import typeof +from numba.core import ( + types, + utils, + typing, + errors, + ir, + rewrites, + typed_passes, + inline_closurecall, + config, + compiler, + cpu, +) +from numba.extending import ( + overload_method, + register_model, + typeof_impl, + unbox, + NativeValue, + models, +) +from numba.core.registry import cpu_target +from numba.core.annotations import type_annotations +from numba.core.ir_utils import ( + find_callname, + guard, + build_definitions, + get_definition, + is_getitem, + is_setitem, + index_var_of_get_setitem, +) +from numba.np.unsafe.ndarray import empty_inferred as unsafe_empty +from numba.core.bytecode import ByteCodeIter +from numba.core.compiler import ( + compile_isolated, + Flags, + CompilerBase, + DefaultPassBuilder, +) +from numba.core.compiler_machinery import register_pass, AnalysisPass +from numba.core.typed_passes import IRLegalization +from numba.tests.support import ( + TestCase, + captured_stdout, + MemoryLeakMixin, + override_env_config, + linux_only, + tag, + _32bit, + needs_blas, + needs_lapack, + disabled_test, + skip_unless_scipy, + needs_subprocess, +) +import numba.openmp as openmp +from numba.openmp import njit +from numba.openmp import openmp_context as omp +from numba.openmp import ( + omp_set_num_threads, + omp_get_thread_num, + omp_get_num_threads, + omp_get_wtime, + omp_set_nested, + omp_set_max_active_levels, + omp_set_dynamic, + omp_get_max_active_levels, + omp_get_max_threads, + omp_get_num_procs, + UnspecifiedVarInDefaultNone, + NonconstantOpenmpSpecification, + NonStringOpenmpSpecification, + omp_get_thread_limit, + ParallelForExtraCode, + ParallelForWrongLoopCount, + omp_in_parallel, + omp_get_level, + omp_get_active_level, + omp_get_team_size, + omp_get_ancestor_thread_num, + omp_get_team_num, + omp_get_num_teams, + omp_in_final, + omp_shared_array, +) +import cmath +import unittest + +# NOTE: Each OpenMP test class is run in separate subprocess, this is to reduce +# memory pressure in CI settings. The environment variable "SUBPROC_TEST" is +# used to determine whether a test is skipped or not, such that if you want to +# run any OpenMP test directly this environment variable can be set. The +# subprocesses running the test classes set this environment variable as the new +# process starts which enables the tests within the process. The decorator +# @needs_subprocess is used to ensure the appropriate test skips are made. + +# +# class TestOpenmpRunner(TestCase): +# _numba_parallel_test_ = False +# +# # Each test class can run for 30 minutes before time out. +# _TIMEOUT = 1800 +# +# """This is the test runner for all the OpenMP tests, it runs them in +# subprocesses as described above. The convention for the test method naming +# is: `test_` where is the name of the test class in +# this module. 
+# """ +# def runner(self): +# themod = self.__module__ +# test_clazz_name = self.id().split('.')[-1].split('_')[-1] +# # don't specify a given test, it's an entire class that needs running +# self.subprocess_test_runner(test_module=themod, +# test_class=test_clazz_name, +# timeout=self._TIMEOUT) +# +# """ +# def test_TestOpenmpBasic(self): +# self.runner() +# """ +# +# def test_TestOpenmpRoutinesEnvVariables(self): +# self.runner() +# +# def test_TestOpenmpParallelForResults(self): +# self.runner() +# +# def test_TestOpenmpWorksharingSchedule(self): +# self.runner() +# +# def test_TestOpenmpParallelClauses(self): +# self.runner() +# +# def test_TestOpenmpDataClauses(self): +# self.runner() +# +# def test_TestOpenmpConstraints(self): +# self.runner() +# +# def test_TestOpenmpConcurrency(self): +# self.runner() +# +# def test_TestOpenmpTask(self): +# self.runner() +# +# def test_TestOpenmpTaskloop(self): +# self.runner() +# +# def test_TestOpenmpTarget(self): +# self.runner() +# +# def test_TestOpenmpPi(self): +# self.runner() + + +x86_only = unittest.skipIf( + platform.machine() not in ("i386", "x86_64"), "x86 only test" +) + + +def null_comparer(a, b): + """ + Used with check_arq_equality to indicate that we do not care + whether the value of the parameter at the end of the function + has a particular value. + """ + pass + + +@contextlib.contextmanager +def override_config(name, value): + """ + Return a context manager that temporarily sets an openmp config variable + *name* to *value*. *name* must be the name of an existing variable + in openmp. + """ + old_value = getattr(openmp, name) + setattr(openmp, name, value) + try: + yield + finally: + setattr(openmp, name, old_value) + + +# @needs_subprocess +class TestOpenmpBase(TestCase): + """ + Base class for testing OpenMP. + Provides functions for compilation and three way comparison between + python functions, njit'd functions and njit'd functions with + OpenMP disabled. + + To set a default value or state for all the tests in a class, set + a variable *var* inside the class where *var* is: + + - MAX_THREADS - Thread team size for parallel regions. + - MAX_ACTIVE_LEVELS - Number of nested parallel regions capable of + running in parallel. 
+ """ + + _numba_parallel_test_ = False + + skip_disabled = int(os.environ.get("OVERRIDE_TEST_SKIP", 0)) != 0 + run_target = int(os.environ.get("RUN_TARGET", 0)) != 0 + test_devices = os.environ.get("TEST_DEVICES", "") + + env_vars = { + "OMP_NUM_THREADS": omp_get_num_procs(), + "OMP_MAX_ACTIVE_LEVELS": 1, + "OMP_DYNAMIC": True, + } + + def __init__(self, *args): + # flags for njit() + self.cflags = Flags() + self.cflags.enable_ssa = False + self.cflags.nrt = True + + super(TestOpenmpBase, self).__init__(*args) + + def setUp(self): + omp_set_num_threads( + getattr(self, "MAX_THREADS", TestOpenmpBase.env_vars.get("OMP_NUM_THREADS")) + ) + omp_set_max_active_levels( + getattr( + self, + "MAX_ACTIVE_LEVELS", + TestOpenmpBase.env_vars.get("OMP_MAX_ACTIVE_LEVELS"), + ) + ) + self.beforeThreads = omp_get_max_threads() + self.beforeLevels = omp_get_max_active_levels() + + def tearDown(self): + omp_set_num_threads(self.beforeThreads) + omp_set_max_active_levels(self.beforeLevels) + + def _compile_this(self, func, sig, flags): + return compile_isolated(func, sig, flags=flags) + + def compile_njit_openmp_disabled(self, func, sig): + with override_config("OPENMP_DISABLED", True): + return self._compile_this(func, sig, flags=self.cflags) + + def compile_njit(self, func, sig): + return self._compile_this(func, sig, flags=self.cflags) + + def compile_all(self, pyfunc, *args, **kwargs): + sig = tuple([typeof(x) for x in args]) + + # compile the OpenMP-disabled njit function + cdfunc = self.compile_njit_openmp_disabled(pyfunc, sig) + + # compile a standard njit of the original function + cfunc = self.compile_njit(pyfunc, sig) + + return cfunc, cdfunc + + def assert_outputs_equal(self, *outputs): + assert len(outputs) > 1 + + for op_num in range(len(outputs) - 1): + op1, op2 = outputs[op_num], outputs[op_num + 1] + if isinstance(op1, (bool, np.bool_)): + assert isinstance(op2, (bool, np.bool_)) + elif not isinstance(op1, numbers.Number) or not isinstance( + op2, numbers.Number + ): + self.assertEqual(type(op1), type(op2)) + + if isinstance(op1, np.ndarray): + np.testing.assert_almost_equal(op1, op2) + elif isinstance(op1, (tuple, list)): + assert len(op1) == len(op2) + for i in range(len(op1)): + self.assert_outputs_equal(op1[i], op2[i]) + elif isinstance(op1, (bool, np.bool_, str, type(None))): + assert op1 == op2 + elif isinstance(op1, numbers.Number): + np.testing.assert_approx_equal(op1, op2) + else: + raise ValueError("Unsupported output type encountered") + + def check_openmp_vs_others(self, pyfunc, cfunc, cdfunc, *args, **kwargs): + """ + Checks python, njit and njit without OpenMP impls produce the same result. + + Arguments: + pyfunc - the python function to test + cfunc - CompilerResult from njit of pyfunc + cdfunc - CompilerResult from OpenMP-disabled njit of pyfunc + args - arguments for the function being tested + Keyword Arguments: + scheduler_type - 'signed', 'unsigned' or None, default is None. + Supply in cases where the presence of a specific + scheduler is to be asserted. + fastmath_pcres - a fastmath parallel compile result, if supplied + will be run to make sure the result is correct + check_arg_equality - some functions need to check that a + parameter is modified rather than a certain + value returned. 
If this keyword argument + is supplied, it should be a list of + comparison functions such that the i'th + function in the list is used to compare the + i'th parameter of the njit and OpenMP-disabled + functions against the i'th parameter of the + standard Python function, asserting if they + differ. The length of this list must be equal + to the number of parameters to the function. + The null comparator is available for use + when you do not desire to test if some + particular parameter is changed. + Remaining kwargs are passed to np.testing.assert_almost_equal + """ + check_args_for_equality = kwargs.pop("check_arg_equality", None) + + def copy_args(*args): + if not args: + return tuple() + new_args = [] + for x in args: + if isinstance(x, np.ndarray): + new_args.append(x.copy("k")) + elif isinstance(x, np.number): + new_args.append(x.copy()) + elif isinstance(x, numbers.Number): + new_args.append(x) + elif isinstance(x, tuple): + new_args.append(copy.deepcopy(x)) + elif isinstance(x, list): + new_args.append(x[:]) + elif isinstance(x, str): + new_args.append(x) + else: + raise ValueError("Unsupported argument type encountered") + return tuple(new_args) + + # python result + py_args = copy_args(*args) + py_expected = pyfunc(*py_args) + + # njit result + njit_args = copy_args(*args) + njit_output = cfunc.entry_point(*njit_args) + + # OpenMP-disabled result + openmp_disabled_args = copy_args(*args) + openmp_disabled_output = cdfunc.entry_point(*openmp_disabled_args) + + if check_args_for_equality is None: + self.assert_outputs_equal(py_expected, njit_output, openmp_disabled_output) + else: + assert len(py_args) == len(check_args_for_equality) + for pyarg, njitarg, noomparg, argcomp in zip( + py_args, njit_args, openmp_disabled_args, check_args_for_equality + ): + argcomp(njitarg, pyarg, **kwargs) + argcomp(noomparg, pyarg, **kwargs) + + # TODO: remove this check function and check_openmp_vs_others and check + # directly expected results. + def check(self, pyfunc, *args, **kwargs): + """Checks that pyfunc compiles for *args under njit OpenMP-disabled and + njit and asserts that all version execute and produce the same result + """ + cfunc, cdfunc = self.compile_all(pyfunc, *args) + self.check_openmp_vs_others(pyfunc, cfunc, cdfunc, *args, **kwargs) + + def check_variants(self, impl, arg_gen, **kwargs): + """Run self.check(impl, ...) on array data generated from arg_gen.""" + for args in arg_gen(): + with self.subTest(list(map(typeof, args))): + self.check(impl, *args, **kwargs) + + +class TestPipeline(object): + def __init__(self, typingctx, targetctx, args, test_ir): + self.state = compiler.StateDict() + self.state.typingctx = typingctx + self.state.targetctx = targetctx + self.state.args = args + self.state.func_ir = test_ir + self.state.typemap = None + self.state.return_type = None + self.state.calltypes = None + self.state.metadata = {} + + +# +# class TestOpenmpBasic(TestOpenmpBase): +# """OpenMP smoke tests. 
These tests check the most basic +# functionality""" +# +# def __init__(self, *args): +# TestOpenmpBase.__init__(self, *args) + + +class TestOpenmpRoutinesEnvVariables(TestOpenmpBase): + MAX_THREADS = 5 + + def __init__(self, *args): + TestOpenmpBase.__init__(self, *args) + + """ + def test_func_get_wtime(self): + @njit + def test_impl(t): + start = omp_get_wtime() + time.sleep(t) + return omp_get_wtime() - start + t = 0.5 + np.testing.assert_approx_equal(test_impl(t), t, signifcant=2) + """ + + def test_func_get_max_threads(self): + @njit + def test_impl(): + omp_set_dynamic(0) + o_nt = omp_get_max_threads() + count = 0 + with omp("parallel"): + i_nt = omp_get_max_threads() + with omp("critical"): + count += 1 + return count, i_nt, o_nt + + nt = self.MAX_THREADS + with override_env_config("OMP_NUM_THREADS", str(nt)): + r = test_impl() + assert r[0] == r[1] == r[2] == nt + + def test_func_get_num_threads(self): + @njit + def test_impl(): + omp_set_dynamic(0) + o_nt = omp_get_num_threads() + count = 0 + with omp("parallel"): + i_nt = omp_get_num_threads() + with omp("critical"): + count += 1 + return (count, i_nt), o_nt + + nt = self.MAX_THREADS + with override_env_config("OMP_NUM_THREADS", str(nt)): + r = test_impl() + assert r[0][0] == r[0][1] == nt + assert r[1] == 1 + + def test_func_set_num_threads(self): + @njit + def test_impl(n1, n2): + omp_set_dynamic(0) + omp_set_num_threads(n1) + count1 = 0 + count2 = 0 + with omp("parallel"): + with omp("critical"): + count1 += 1 + omp_set_num_threads(n2) + with omp("parallel"): + with omp("critical"): + count2 += 1 + return count1, count2 + + nt = 32 + with override_env_config("OMP_NUM_THREADS", str(4)): + r = test_impl(nt, 20) + assert r[0] == r[1] == nt + + def test_func_set_max_active_levels(self): + @njit + def test_impl(n1, n2, n3): + omp_set_dynamic(0) + omp_set_max_active_levels(2) + omp_set_num_threads(n2) + count1, count2, count3 = 0, 0, 0 + with omp("parallel num_threads(n1)"): + with omp("single"): + with omp("parallel"): + with omp("single"): + omp_set_num_threads(n3) + with omp("parallel"): + with omp("critical"): + count3 += 1 + with omp("critical"): + count2 += 1 + with omp("critical"): + count1 += 1 + return count1, count2, count3 + + n1, n2 = 3, 4 + r = test_impl(n1, n2, 5) + assert r[0] == n1 + assert r[1] == n2 + assert r[2] == 1 + + def test_func_get_ancestor_thread_num(self): + @njit + def test_impl(): + oa = omp_get_ancestor_thread_num(0) + with omp("parallel"): + with omp("single"): + m1 = omp_get_ancestor_thread_num(0) + f1 = omp_get_ancestor_thread_num(1) + s1 = omp_get_ancestor_thread_num(2) + tn1 = omp_get_thread_num() + with omp("parallel"): + m2 = omp_get_ancestor_thread_num(0) + f2 = omp_get_ancestor_thread_num(1) + s2 = omp_get_ancestor_thread_num(2) + tn2 = omp_get_thread_num() + return oa, (m1, f1, s1, tn1), (m2, f2, s2, tn2) + + oa, r1, r2 = test_impl() + assert oa == r1[0] == r2[0] == 0 + assert r1[1] == r1[3] == r2[1] + assert r1[2] == -1 + assert r2[2] == r2[3] + + def test_func_get_team_size(self): + @njit + def test_impl(n1, n2): + omp_set_max_active_levels(2) + oa = omp_get_team_size(0) + with omp("parallel num_threads(n1)"): + with omp("single"): + m1 = omp_get_team_size(0) + f1 = omp_get_team_size(1) + s1 = omp_get_team_size(2) + nt1 = omp_get_num_threads() + with omp("parallel num_threads(n2)"): + with omp("single"): + m2 = omp_get_team_size(0) + f2 = omp_get_team_size(1) + s2 = omp_get_team_size(2) + nt2 = omp_get_num_threads() + return oa, (m1, f1, s1, nt1), (m2, f2, s2, nt2) + + n1, n2 = 6, 8 + oa, 
r1, r2 = test_impl(n1, n2) + assert oa == r1[0] == r2[0] == 1 + assert r1[1] == r1[3] == r2[1] == n1 + assert r1[2] == -1 + assert r2[2] == r2[3] == n2 + + def test_func_get_level(self): + @njit + def test_impl(): + oa = omp_get_level() + with omp("parallel if(0)"): + f = omp_get_level() + with omp("parallel num_threads(1)"): + s = omp_get_level() + with omp("parallel"): + t = omp_get_level() + return oa, f, s, t + + for i, l in enumerate(test_impl()): + assert i == l + + def test_func_get_active_level(self): + @njit + def test_impl(): + oa = omp_get_active_level() + with omp("parallel if(0)"): + f = omp_get_active_level() + with omp("parallel num_threads(1)"): + s = omp_get_active_level() + with omp("parallel"): + t = omp_get_active_level() + return oa, f, s, t + + r = test_impl() + for i in range(3): + assert r[i] == 0 + assert r[3] == 1 + + def test_func_in_parallel(self): + @njit + def test_impl(): + omp_set_dynamic(0) + omp_set_max_active_levels(1) # 1 because first region is inactive + oa = omp_in_parallel() + with omp("parallel num_threads(1)"): + ia = omp_in_parallel() + with omp("parallel"): + n1a = omp_in_parallel() + with omp("single"): + with omp("parallel"): + n2a = omp_in_parallel() + with omp("parallel if(0)"): + ua = omp_in_parallel() + return oa, ia, n1a, n2a, ua + + r = test_impl() + assert r[0] == False + assert r[1] == False + assert r[2] == True + assert r[3] == True + assert r[4] == False + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_func_in_final(self): + @njit + def test_impl(N, c): + a = np.arange(N)[::-1] + fa = np.zeros(N) + fia = np.zeros(N) + with omp("parallel"): + with omp("single"): + for i in range(len(a)): + e = a[i] + with omp("task final(e >= c)"): + fa[i] = omp_in_final() + with omp("task"): + fia[i] = omp_in_final() + return fa, fia + + N, c = 25, 10 + r = test_impl(N, c) + np.testing.assert_array_equal(r[0], np.concatenate(np.ones(N - c), np.zeros(c))) + np.testing.assert_array_equal(r[0], r[1]) + + +class TestOpenmpParallelForResults(TestOpenmpBase): + def __init__(self, *args): + TestOpenmpBase.__init__(self, *args) + + def test_parallel_for_set_elements(self): + def test_impl(v): + with omp("parallel for"): + for i in range(len(v)): + v[i] = 1.0 + return v + + self.check(test_impl, np.zeros(100)) + + def test_separate_parallel_for_set_elements(self): + def test_impl(v): + with omp("parallel"): + with omp("for"): + for i in range(len(v)): + v[i] = 1.0 + return v + + self.check(test_impl, np.zeros(100)) + + def test_parallel_for_const_var_omp_statement(self): + def test_impl(v): + ovar = "parallel for" + with omp(ovar): + for i in range(len(v)): + v[i] = 1.0 + return v + + self.check(test_impl, np.zeros(100)) + + def test_parallel_for_string_conditional(self): + def test_impl(S): + capitalLetters = 0 + with omp("parallel for reduction(+:capitalLetters)"): + for i in range(len(S)): + if S[i].isupper(): + capitalLetters += 1 + return capitalLetters + + self.check(test_impl, "OpenMPstrTEST") + + def test_parallel_for_tuple(self): + def test_impl(t): + len_total = 0 + with omp("parallel for reduction(+:len_total)"): + for i in range(len(t)): + len_total += len(t[i]) + return len_total + + self.check(test_impl, ("32", "4", "test", "567", "re", "")) + + def test_parallel_for_range_step_2(self): + def test_impl(N): + a = np.zeros(N, dtype=np.int32) + with omp("parallel for"): + for i in range(0, len(a), 2): + a[i] = i + 1 + + return a + + self.check(test_impl, 12) + + def test_parallel_for_range_step_arg(self): + def 
test_impl(N, step): + a = np.zeros(N, dtype=np.int32) + with omp("parallel for"): + for i in range(0, len(a), step): + a[i] = i + 1 + + return a + + self.check(test_impl, 12, 2) + + def test_parallel_for_incremented_step(self): + @njit + def test_impl(v, n): + for i in range(n): + with omp("parallel for"): + for j in range(0, len(v), i + 1): + v[j] = i + 1 + return v + + self.check(test_impl, np.zeros(100), 3) + + def test_parallel_for_range_backward_step(self): + def test_impl(N): + a = np.zeros(N, dtype=np.int32) + with omp("parallel for"): + for i in range(N - 1, -1, -1): + a[i] = i + 1 + + return a + + self.check(test_impl, 12) + + """ + def test_parallel_for_dictionary(self): + def test_impl(N, c): + l = {} + with omp("parallel for"): + for i in range(N): + l[i] = i % c + return l + self.check(test_impl, 32, 5) + """ + + def test_parallel_for_num_threads(self): + def test_impl(nt): + a = np.zeros(nt) + with omp("parallel num_threads(nt)"): + with omp("for"): + for i in range(nt): + a[i] = i + return a + + self.check(test_impl, 15) + + def test_parallel_for_only_inside_var(self): + @njit + def test_impl(nt): + a = np.zeros(nt) + with omp("parallel num_threads(nt) private(x)"): + with omp("for private(x)"): + for i in range(nt): + x = 0 + # print("out:", i, x, i + x, nt) + a[i] = i + x + return a + + nt = 12 + np.testing.assert_array_equal(test_impl(nt), np.arange(nt)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_parallel_for_ordered(self): + @njit + def test_impl(N, c): + a = np.zeros(N) + b = np.zeros(N) + with omp("parallel for ordered"): + for i in range(1, N): + b[i] = b[i - 1] + c + with omp("ordered"): + a[i] = a[i - 1] + c + return a + + N, c = 30, 4 + r = test_impl(N, c) + rc = np.arange(0, N * c, c) + np.testing.assert_array_equal(r[0], rc) + assert not np.array_equal(r[1], rc) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_parallel_for_collapse(self): + @njit + def test_impl(n1, n2, n3): + ia = np.zeros(n1) + ja = np.zeros((n1, n2)) + ka = np.zeros((n1, n2, n3)) + with omp("parallel for collapse(2)"): + for i in range(n1): + ia[i] = omp_get_thread_num() + for j in range(n2): + ja[i][j] = omp_get_thread_num() + for k in range(n3): + ka[i][j][k] = omp_get_thread_num() + return ia, ja, ka + + ia, ja, ka = test_impl(5, 3, 2) + print(ia) + print(ja) + for a1i in range(len(ja)): + with self.assertRaises(AssertionError) as raises: + np.testing.assert_equal(ia[a1i], ja[a1i]) # Scalar to array + for a1i in range(len(ka)): + for a2i in range(a1i): + # Scalar to array + np.testing.assert_equal(ja[a1i][a2i], ka[a1i][a2i]) + + +class TestOpenmpWorksharingSchedule(TestOpenmpBase): + def __init__(self, *args): + TestOpenmpBase.__init__(self, *args) + + """ + def test_static_work_calculation(self): + def test_impl(N, nt): + v = np.zeros(N) + step = -2 + omp_set_num_threads(nt) + with omp("parallel private(thread_num)"): + running_omp = omp_in_parallel() + thread_num = omp_get_thread_num() + if not running_omp: + iters = N // abs(step) + itersPerThread = iters // nt + finishToThread = {} + for t in range(N): + f = itersPerThread*(t+1)-1 + min(iters%itersPerThread, t+1) + finishToThread[f] = t + with omp("for schedule(static)"): + for index, i in enumerate(range(N-1, N%2 - 1, -2)): + if not running_omp: + for finish in finishToThread.keys(): + if index <= finish: + thread_num = finishToThread[finish] + if i % (thread_num+1) == 0: + v[i] = i/(thread_num+1) + print(v) + return v + self.check(test_impl, 100, 8) + """ + + # 
Giorgis pass doesn't support static with chunksize yet? + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Abort - unimplemented") + def test_avg_sched_const(self): + def test_impl(n, a): + b = np.zeros(n) + nt = 5 + with omp("parallel for num_threads(nt) schedule(static, 4)"): + for i in range(1, n): + b[i] = (a[i] + a[i - 1]) / 2.0 + + return b + + self.check(test_impl, 10, np.ones(10)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Abort - unimplemented") + def test_avg_sched_var(self): + def test_impl(n, a): + b = np.zeros(n) + nt = 5 + ss = 4 + with omp("parallel for num_threads(nt) schedule(static, ss)"): + for i in range(1, n): + b[i] = (a[i] + a[i - 1]) / 2.0 + + return b + + self.check(test_impl, 10, np.ones(10)) + + def test_static_distribution(self): + @njit + def test_impl(nt, c): + a = np.empty(nt * c) + with omp("parallel for num_threads(nt) schedule(static)"): + for i in range(nt * c): + a[i] = omp_get_thread_num() + return a + + nt, c = 8, 3 + r = test_impl(nt, c) + for tn in range(nt): + indices = np.sort(np.where(r == tn)[0]) + si = indices[0] + np.testing.assert_array_equal(indices, np.arange(si, si + c)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_static_chunk_distribution(self): + @njit + def test_impl(nt, c, cs): + a = np.empty(nt * c) + with omp("parallel for num_threads(nt) schedule(static, cs)"): + for i in range(nt * c): + a[i] = omp_get_thread_num() + return a + + nt, c, cs = 8, 6, 3 + r = test_impl(nt, c, cs) + for tn in range(nt): + indices = np.sort(np.where(r == tn)[0]) + for i in range(c // cs): + si = indices[i * cs] + np.testing.assert_array_equal( + indices, np.arange(si, min(len(r), si + cs)) + ) + + def test_static_consistency(self): + @njit + def test_impl(nt, c, cs): + a = np.empty(nt * c) + b = np.empty(nt * c) + with omp("parallel num_threads(8)"): + with omp("for schedule(static)"): + for i in range(nt * c): + a[i] = omp_get_thread_num() + with omp("for schedule(static)"): + for i in range(nt * c): + b[i] = omp_get_thread_num() + return a, b + + r = test_impl(8, 7, 5) + np.testing.assert_array_equal(r[0], r[1]) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_dynamic_distribution(self): + @njit + def test_impl(nt, c, cs): + a = np.empty(nt * c) + with omp("parallel for num_threads(nt) schedule(dynamic)"): + for i in range(nt * c): + a[i] = omp_get_thread_num() + return a + + nt, c, cs = 10, 2, 1 + r = test_impl(nt, c, cs) + a = np.zeros(nt) + for tn in range(nt): + indices = np.sort(np.where(r == tn)[0]) + if len(indices > 0): + for i in range(c // cs): + si = indices[i * cs] + np.testing.assert_array_equal( + indices, np.arange(si, min(len(r), si + cs)) + ) + else: + a[tn] = 1 + assert np.any(a) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_guided_distribution(self): + @njit + def test_impl(nt, c, cs): + a = np.empty(nt * c) + with omp("parallel for num_threads(nt) schedule(guided, cs)"): + for i in range(nt * c): + a[i] = omp_get_thread_num() + return a + + nt, c, cs = 8, 6, 3 + r = test_impl(nt, c, cs) + chunksizes = [] + cur_tn = r[0] + cur_chunk = 0 + for e in r: + if e == cur_tn: + cur_chunk += 1 + else: + chunksizes.append(cur_chunk) + cur_chunk = 1 + chunksizes.append(cur_chunk) + ca = np.array(chunksizes) + np.testing.assert_array_equal(ca, np.sort(ca)[::-1]) + assert ca[-2] >= cs + + +class TestOpenmpParallelClauses(TestOpenmpBase): + def __init__(self, *args): + TestOpenmpBase.__init__(self, *args) + + def 
test_num_threads_clause(self): + @njit + def test_impl(N, c1, c2): + omp_set_dynamic(0) + omp_set_max_active_levels(2) + omp_set_num_threads(N + c1) + d_count = 0 + n_count = 0 + nc_count = 0 + a_count = 0 + with omp("parallel num_threads(N) shared(c2)"): + with omp("critical"): + d_count += 1 + with omp("parallel"): + with omp("critical"): + n_count += 1 + with omp("single"): + with omp("parallel num_threads(6)"): + with omp("critical"): + nc_count += 1 + with omp("parallel"): + with omp("critical"): + a_count += 1 + return d_count, a_count, n_count, nc_count + + a, b, c = 13, 3, 6 + r = test_impl(a, b, c) + assert r[0] == a + assert r[1] == a + b + assert r[2] == a * (a + b) + assert r[3] == c + + def test_if_clause(self): + @njit + def test_impl(s): + rp = 2 # Should also work with anything non-zero + drp = 0 + ar = np.zeros(s, dtype=np.int32) + adr = np.zeros(s, dtype=np.int32) + par = np.full(s, 2, dtype=np.int32) + padr = np.full(s, 2, dtype=np.int32) + + omp_set_num_threads(s) + omp_set_dynamic(0) + with omp("parallel for if(rp)"): + for i in range(s): + ar[omp_get_thread_num()] = 1 + par[i] = omp_in_parallel() + with omp("parallel for if(drp)"): + for i in range(s): + adr[omp_get_thread_num()] = 1 + padr[i] = omp_in_parallel() + return ar, adr, par, padr + + size = 20 + r = test_impl(size) + np.testing.assert_array_equal(r[0], np.ones(size)) + rc = np.zeros(size) + rc[0] = 1 + np.testing.assert_array_equal(r[1], rc) + np.testing.assert_array_equal(r[2], np.ones(size)) + np.testing.assert_array_equal(r[3], np.zeros(size)) + + def test_avg_arr_prev_two_elements_base(self): + def test_impl(n, a): + b = np.zeros(n) + omp_set_num_threads(5) + + with omp("parallel for"): + for i in range(1, n): + b[i] = (a[i] + a[i - 1]) / 2.0 + return b + + self.check(test_impl, 10, np.ones(10)) + + def test_avg_num_threads_clause(self): + def test_impl(n, a): + b = np.zeros(n) + with omp("parallel for num_threads(5)"): + for i in range(1, n): + b[i] = (a[i] + a[i - 1]) / 2.0 + + return b + + self.check(test_impl, 10, np.ones(10)) + + def test_avg_num_threads_clause_var(self): + def test_impl(n, a): + b = np.zeros(n) + nt = 5 + with omp("parallel for num_threads(nt)"): + for i in range(1, n): + b[i] = (a[i] + a[i - 1]) / 2.0 + + return b + + self.check(test_impl, 10, np.ones(10)) + + # Uses apparently unsupported chunking. 
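+    # The two tests below combine an if clause with schedule(static, <chunk>);
+    # the explicit chunk size is what is believed to be unsupported here.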
+ @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Abort - unimplemented") + def test_avg_if_const(self): + def test_impl(n, a): + b = np.zeros(n) + nt = 5 + with omp("parallel for if(1) num_threads(nt) schedule(static, 4)"): + for i in range(1, n): + b[i] = (a[i] + a[i - 1]) / 2.0 + + return b + + self.check(test_impl, 10, np.ones(10)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Abort - unimplemented") + def test_avg_if_var(self): + def test_impl(n, a): + b = np.zeros(n) + nt = 5 + ss = 4 + do_if = 1 + with omp("parallel for if(do_if) num_threads(nt) schedule(static, ss)"): + for i in range(1, n): + b[i] = (a[i] + a[i - 1]) / 2.0 + + return b + + self.check(test_impl, 10, np.ones(10)) + + def test_teams1(self): + def test_impl(): + a = 1 + with omp("teams"): + with omp("parallel"): + a = 123 + return a + + self.check(test_impl) + + +class TestReductions(TestOpenmpBase): + def __init__(self, *args): + TestOpenmpBase.__init__(self, *args) + + def test_parallel_reduction_add_int(self): + @njit + def test_impl(): + redux = 0 + nthreads = 0 + with omp("parallel reduction(+:redux)"): + thread_id = omp_get_thread_num() + if thread_id == 0: + nthreads = omp_get_num_threads() + redux = 1 + return redux, nthreads + + redux, nthreads = test_impl() + self.assertGreater(nthreads, 1) + self.assertEqual(redux, nthreads) + + def test_parallel_reduction_sub_int(self): + @njit + def test_impl(): + redux = 0 + nthreads = 0 + with omp("parallel reduction(-:redux)"): + thread_id = omp_get_thread_num() + if thread_id == 0: + nthreads = omp_get_num_threads() + redux = 1 + return redux, nthreads + + redux, nthreads = test_impl() + self.assertGreater(nthreads, 1) + self.assertEqual(redux, nthreads) + + def test_parallel_reduction_mul_int(self): + @njit + def test_impl(): + redux = 1 + nthreads = 0 + with omp("parallel reduction(*:redux) num_threads(8)"): + thread_id = omp_get_thread_num() + if thread_id == 0: + nthreads = omp_get_num_threads() + redux = 2 + return redux, nthreads + + redux, nthreads = test_impl() + self.assertGreater(nthreads, 1) + self.assertEqual(redux, 2**nthreads) + + def test_parallel_reduction_add_fp64(self): + @njit + def test_impl(): + redux = np.float64(0.0) + nthreads = np.float64(0.0) + with omp("parallel reduction(+:redux)"): + thread_id = omp_get_thread_num() + if thread_id == 0: + nthreads = omp_get_num_threads() + redux = np.float64(1.0) + return redux, nthreads + + redux, nthreads = test_impl() + self.assertGreater(nthreads, 1) + self.assertEqual(redux, 1.0 * nthreads) + + def test_parallel_reduction_sub_fp64(self): + @njit + def test_impl(): + redux = np.float64(0.0) + nthreads = np.float64(0.0) + with omp("parallel reduction(-:redux)"): + thread_id = omp_get_thread_num() + if thread_id == 0: + nthreads = omp_get_num_threads() + redux = np.float64(1.0) + return redux, nthreads + + redux, nthreads = test_impl() + self.assertGreater(nthreads, 1) + self.assertEqual(redux, 1.0 * nthreads) + + def test_parallel_reduction_mul_fp64(self): + @njit + def test_impl(): + redux = np.float64(1.0) + nthreads = np.float64(0.0) + with omp("parallel reduction(*:redux) num_threads(8)"): + thread_id = omp_get_thread_num() + if thread_id == 0: + nthreads = omp_get_num_threads() + redux = np.float64(2.0) + return redux, nthreads + + redux, nthreads = test_impl() + self.assertGreater(nthreads, 1) + self.assertEqual(redux, 2.0**nthreads) + + def test_parallel_reduction_add_fp32(self): + @njit + def test_impl(): + redux = np.float32(0.0) + nthreads = np.float32(0.0) + with omp("parallel 
reduction(+:redux)"): + thread_id = omp_get_thread_num() + if thread_id == 0: + nthreads = omp_get_num_threads() + redux = np.float32(1.0) + return redux, nthreads + + redux, nthreads = test_impl() + self.assertGreater(nthreads, 1) + self.assertEqual(redux, 1.0 * nthreads) + + def test_parallel_reduction_sub_fp32(self): + @njit + def test_impl(): + redux = np.float32(0.0) + nthreads = np.float32(0.0) + with omp("parallel reduction(-:redux)"): + thread_id = omp_get_thread_num() + if thread_id == 0: + nthreads = omp_get_num_threads() + redux = np.float32(1.0) + return redux, nthreads + + redux, nthreads = test_impl() + self.assertGreater(nthreads, 1) + self.assertEqual(redux, 1.0 * nthreads) + + def test_parallel_reduction_mul_fp32(self): + @njit + def test_impl(): + redux = np.float32(1.0) + nthreads = np.float32(0.0) + with omp("parallel reduction(*:redux) num_threads(8)"): + thread_id = omp_get_thread_num() + if thread_id == 0: + nthreads = omp_get_num_threads() + redux = np.float32(2.0) + return redux, nthreads + + redux, nthreads = test_impl() + self.assertGreater(nthreads, 1) + self.assertEqual(redux, 2.0**nthreads) + + def test_parallel_for_reduction_add_int(self): + @njit + def test_impl(): + redux = 0 + with omp("parallel for reduction(+:redux)"): + for i in range(10): + redux += 1 + return redux + + redux = test_impl() + self.assertEqual(redux, 10) + + def test_parallel_for_reduction_sub_int(self): + @njit + def test_impl(): + redux = 0 + with omp("parallel for reduction(-:redux)"): + for i in range(10): + redux += 1 + return redux + + redux = test_impl() + self.assertEqual(redux, 10) + + def test_parallel_for_reduction_mul_int(self): + @njit + def test_impl(): + redux = 1 + with omp("parallel for reduction(*:redux)"): + for i in range(10): + redux *= 2 + return redux + + redux = test_impl() + self.assertEqual(redux, 2**10) + + def test_parallel_for_reduction_add_fp64(self): + @njit + def test_impl(): + redux = np.float64(0.0) + with omp("parallel for reduction(+:redux)"): + for i in range(10): + redux += np.float64(1.0) + return redux + + redux = test_impl() + self.assertEqual(redux, 10.0) + + def test_parallel_for_reduction_sub_fp64(self): + @njit + def test_impl(): + redux = np.float64(0.0) + with omp("parallel for reduction(-:redux)"): + for i in range(10): + redux += np.float64(1.0) + return redux + + redux = test_impl() + self.assertEqual(redux, 10.0) + + def test_parallel_for_reduction_mul_fp64(self): + @njit + def test_impl(): + redux = np.float64(1.0) + with omp("parallel for reduction(*:redux)"): + for i in range(10): + redux *= np.float64(2.0) + return redux + + redux = test_impl() + self.assertEqual(redux, 2.0**10) + + def test_parallel_for_reduction_add_fp32(self): + @njit + def test_impl(): + redux = np.float32(0.0) + with omp("parallel for reduction(+:redux)"): + for i in range(10): + redux += np.float32(1.0) + return redux + + redux = test_impl() + self.assertEqual(redux, 10.0) + + def test_parallel_for_reduction_sub_fp32(self): + @njit + def test_impl(): + redux = np.float32(0.0) + with omp("parallel for reduction(-:redux)"): + for i in range(10): + redux += np.float32(1.0) + return redux + + redux = test_impl() + self.assertEqual(redux, 10.0) + + def test_parallel_for_reduction_mul_fp32(self): + @njit + def test_impl(): + redux = np.float32(1.0) + with omp("parallel for reduction(*:redux)"): + for i in range(10): + redux *= np.float32(2.0) + return redux + + redux = test_impl() + self.assertEqual(redux, 2.0**10) + + def test_parallel_reduction_add_int_10(self): + 
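+        # This test and the other *_10 variants start the reduction variable
+        # at 10 to check that its initial value is combined into the final
+        # result (OpenMP folds the original value into the reduction).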
@njit + def test_impl(): + redux = 10 + nthreads = 0 + with omp("parallel reduction(+:redux)"): + thread_id = omp_get_thread_num() + if thread_id == 0: + nthreads = omp_get_num_threads() + redux = 1 + return redux, nthreads + + redux, nthreads = test_impl() + self.assertGreater(nthreads, 1) + self.assertEqual(redux, nthreads + 10) + + def test_parallel_reduction_add_fp32_10(self): + @njit + def test_impl(): + redux = np.float32(10.0) + nthreads = np.float32(0.0) + with omp("parallel reduction(+:redux)"): + thread_id = omp_get_thread_num() + if thread_id == 0: + nthreads = omp_get_num_threads() + redux = np.float32(1.0) + return redux, nthreads + + redux, nthreads = test_impl() + self.assertGreater(nthreads, 1) + self.assertEqual(redux, 1.0 * nthreads + 10.0) + + def test_parallel_reduction_add_fp64_10(self): + @njit + def test_impl(): + redux = np.float64(10.0) + nthreads = np.float64(0.0) + with omp("parallel reduction(+:redux)"): + thread_id = omp_get_thread_num() + if thread_id == 0: + nthreads = omp_get_num_threads() + redux = np.float64(1.0) + return redux, nthreads + + redux, nthreads = test_impl() + self.assertGreater(nthreads, 1) + self.assertEqual(redux, 1.0 * nthreads + 10.0) + + def test_parallel_for_reduction_add_int_10(self): + @njit + def test_impl(): + redux = 10 + with omp("parallel for reduction(+:redux)"): + for i in range(10): + redux += 1 + return redux + + redux = test_impl() + self.assertEqual(redux, 10 + 10) + + def test_parallel_for_reduction_add_fp32(self): + @njit + def test_impl(): + redux = np.float32(0.0) + with omp("parallel for reduction(+:redux)"): + for i in range(10): + redux += np.float32(1.0) + return redux + + redux = test_impl() + self.assertEqual(redux, 10.0) + + def test_parallel_for_reduction_add_fp64_10(self): + @njit + def test_impl(): + redux = np.float64(10.0) + with omp("parallel for reduction(+:redux)"): + for i in range(10): + redux += np.float64(1.0) + return redux + + redux = test_impl() + self.assertEqual(redux, 10.0 + 10.0) + + +class TestOpenmpDataClauses(TestOpenmpBase): + def __init__(self, *args): + TestOpenmpBase.__init__(self, *args) + + def test_default_none(self): + @njit + def test_impl(N): + a = np.zeros(N, dtype=np.int32) + x = 7 + with omp("parallel for default(none)"): + for i in range(N): + y = i + x + a[i] = y + z = i + + return a, z + + with self.assertRaises(UnspecifiedVarInDefaultNone) as raises: + test_impl(100) + self.assertIn("Variables with no data env clause", str(raises.exception)) + + def test_data_sharing_default(self): + @njit + def test_impl(N, M): + x = np.zeros(N) + y = np.zeros(N) + z = 3.14 + i = 7 + with omp("parallel private(i)"): + yn = M + 1 + zs = z + with omp("for"): + for i in range(N): + y[i] = yn + 2 * (i + 1) + with omp("for"): + for i in range(N): + x[i] = y[i] - i + with omp("critical"): + z += 3 + return x, y, zs, z, i + + N, M = 10, 5 + r = test_impl(N, M) + np.testing.assert_array_equal(r[0], np.arange(M + 3, M + N + 3)) + np.testing.assert_array_equal(r[1], np.arange(M + 3, M + 2 * N + 2, 2)) + assert r[2] == 3.14 + assert r[3] == 3.14 + 3 * N + assert r[4] == 7 + + def test_variables(self): + @njit + def test_impl(): + x = 5 + y = 3 + zfp = 2 + zsh = 7 + nerr = 0 + nsing = 0 + NTHREADS = 4 + numthrds = 0 + omp_set_num_threads(NTHREADS) + vals = np.zeros(NTHREADS) + valsfp = np.zeros(NTHREADS) + + with omp("""parallel private(x) shared(zsh) + firstprivate(zfp) private(ID)"""): + ID = omp_get_thread_num() + with omp("single"): + nsing = nsing + 1 + numthrds = omp_get_num_threads() + if y != 
3: + nerr = nerr + 1 + print( + "Shared Default status failure y = ", + y, + " It should equal 3", + ) + + # verify each thread sees the same variable vsh + with omp("critical"): + zsh = zsh + ID + + # test first private + zfp = zfp + ID + valsfp[ID] = zfp + + # setup test to see if each thread got its own x value + x = ID + vals[ID] = x + + # Shared clause test: assumes zsh starts at 7 and we add up IDs from 4 threads + if zsh != 13: + print("Shared clause or critical failed", zsh) + nerr = nerr + 1 + + # Single Test: How many threads updated nsing? + if nsing != 1: + print(" Single test failed", nsing) + nerr = nerr + 1 + + # Private clause test: did each thread get its own x variable? + for i in range(numthrds): + if int(vals[i]) != i: + print("Private clause failed", numthrds, i, vals[i]) + nerr = nerr + 1 + + # First private clause test: each thread should get 2 + ID for up to 4 threads + for i in range(numthrds): + if int(valsfp[i]) != 2 + i: + print("Firstprivate clause failed", numthrds, i, valsfp[i]) + nerr = nerr + 1 + + # Test number of threads + if numthrds > NTHREADS: + print("Number of threads error: too many threads", numthrds, NTHREADS) + nerr = nerr + 1 + + if nerr > 0: + print( + nerr, + """ errors when testing parallel, private, shared, + firstprivate, critical and single""", + ) + + return nerr + + assert test_impl() == 0 + + def test_privates(self): + def test_impl(N): + a = np.zeros(N, dtype=np.int32) + x = 7 + with omp("""parallel for firstprivate(x) private(y) + lastprivate(zzzz) private(private_index) shared(a) + firstprivate(N) default(none)"""): + for private_index in range(N): + y = private_index + x + a[private_index] = y + zzzz = private_index + + return a, zzzz + + self.check(test_impl, 100) + + def test_private_retain_value(self): + @njit + def test_impl(): + x = 5 + with omp("parallel private(x)"): + x = 13 + return x + + assert test_impl() == 5 + + def test_private_retain_value_param(self): + @njit + def test_impl(x): + with omp("parallel private(x)"): + x = 13 + return x + + assert test_impl(5) == 5 + + def test_private_retain_value_for(self): + @njit + def test_impl(): + x = 5 + with omp("parallel private(x)"): + with omp("for"): + for i in range(10): + x = i + return x + + assert test_impl() == 5 + + def test_private_retain_value_for_param(self): + @njit + def test_impl(x): + with omp("parallel private(x)"): + with omp("for"): + for i in range(10): + x = i + return x + + assert test_impl(5) == 5 + + def test_private_retain_value_combined_for(self): + @njit + def test_impl(): + x = 5 + with omp("parallel for private(x)"): + for i in range(10): + x = i + return x + + assert test_impl() == 5 + + def test_private_retain_value_combined_for_param(self): + @njit + def test_impl(x): + with omp("parallel for private(x)"): + for i in range(10): + x = i + return x + + assert test_impl(5) == 5 + + def test_private_retain_two_values(self): + @njit + def test_impl(): + x = 5 + y = 7 + with omp("parallel private(x,y)"): + x = 13 + y = 40 + return x, y + + assert test_impl() == (5, 7) + + def test_private_retain_array(self): + @njit + def test_impl(N, x): + a = np.ones(N) + with omp("parallel private(a)"): + with omp("single"): + sa = a + a = np.zeros(N) + with omp("for"): + for i in range(N): + a[i] = x + return a, sa + + r = test_impl(10, 3) + np.testing.assert_array_equal(r[0], np.ones(r[0].shape)) + with self.assertRaises(AssertionError) as raises: + np.testing.assert_array_equal(r[1], np.ones(r[0].shape)) + + def test_private_divide_work(self): + def test_impl(v, 
npoints): + omp_set_num_threads(3) + + with omp("""parallel default(shared) + private(iam,nt,ipoints,istart)"""): + iam = omp_get_thread_num() + nt = omp_get_num_threads() + ipoints = npoints // nt + istart = iam * ipoints + if iam == nt - 1: + ipoints = npoints - istart + for i in range(ipoints): + v[istart + i] = 123.456 + return v + + self.check(test_impl, np.zeros(12), 12) + + def test_firstprivate(self): + @njit + def test_impl(x, y): + with omp("parallel firstprivate(x)"): + xs = x + x = y + return xs, x + + x, y = 5, 3 + self.assert_outputs_equal(test_impl(x, y), (x, x)) + + def test_lastprivate_for(self): + @njit + def test_impl(N): + a = np.zeros(N) + si = 0 + with omp("parallel for lastprivate(si)"): + for i in range(N): + si = i + 1 + a[i] = si + return si, a + + N = 10 + r = test_impl(N) + assert r[0] == N + np.testing.assert_array_equal(r[1], np.arange(1, N + 1)) + + def test_lastprivate_non_one_step(self): + @njit + def test_impl(n1, n2, s): + a = np.zeros(math.ceil((n2 - n1) / s)) + rl = np.arange(n1, n2, s) + with omp("parallel for lastprivate(si)"): + for i in range(len(rl)): + si = rl[i] + 1 + a[i] = si + return si, a + + n1, n2, s = 4, 26, 3 + r = test_impl(n1, n2, s) + ra = np.arange(n1, n2, s) + 1 + assert r[0] == ra[-1] + np.testing.assert_array_equal(r[1], ra) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_lastprivate_sections(self): + @njit + def test_impl(N2, si): + a = np.zeros(N2) + with omp("parallel shared(sis1)"): + with omp("sections lastprivate(si)"): + sis1 = si + # N1 = number of sections + with omp("section"): + si = 0 + with omp("section"): + si = 1 + with omp("section"): + si = 2 + sis2 = si + with omp("sections lastprivate(si)"): + # N2 = number of sections + with omp("section"): + i = 0 + si = N2 - i + a[i] = si + with omp("section"): + i = 1 + si = N2 - i + a[i] = si + with omp("section"): + i = 2 + si = N2 - i + a[i] = si + with omp("section"): + i = 3 + si = N2 - i + a[i] = si + return si, sis1, sis2, a + + N1, N2, d = 3, 4, 5 + r = test_impl(N2, d) + assert r[0] == 1 + assert r[1] != d + assert r[2] == N1 - 1 + np.testing.assert_array_equal(r[3], np.arange(N2, 0, -1)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_lastprivate_conditional(self): + @njit + def test_impl(N, c1, c2): + a = np.arange(0, N * 2, c2) + num = 0 + with omp("parallel"): + with omp("for lastprivate(conditional: num)"): + for i in range(N): + if i < c1: + num = a[i] + c2 + return num + + c1, c2 = 11, 3 + assert test_impl(15, c1, c2) == c1 * c2 + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_threadprivate(self): + @njit + def test_impl(N, c): + omp_set_num_threads(N) + a = np.zeros(N) + ra = np.zeros(N) + val = 0 + with omp("threadprivate(val)"): + pass + with omp("parallel private(tn, sn)"): + tn = omp_get_thread_num() + sn = c + tn + val = sn + a[tn] = sn + with omp("parallel private(tn)"): + tn = omp_get_thread_num() + ra[tn] = 1 if val == a[tn] else 0 + return ra + + nt = 8 + np.testing.assert_array_equal(test_impl(nt, 5), np.ones(nt)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_copyin(self): + @njit + def test_impl(nt, n1, n2, n3): + xsa1 = np.zeros(nt) + xsa2 = np.zeros(nt) + x = n1 + with omp("threadprivate(x)"): + pass + x = n2 + with omp("parallel num_threads(nt) copyin(x) private(tn)"): + tn = omp_get_thread_num() + xsa1[tn] = x + if tn == 0: + x = n3 + with omp("parallel copyin(x)"): + xsa2[omp_get_thread_num()] = x + 
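+            # xsa1 holds each thread's copy of the threadprivate x after
+            # copyin from the initial value n2; xsa2 holds it after the second
+            # region copies in thread 0's updated value n3.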
return xsa1, xsa2 + + nt, n2, n3 = 10, 12.5, 7.1 + r = test_impl(nt, 4.3, n2, n3) + np.testing.assert_array_equal(r[0], np.full(nt, n2)) + np.testing.assert_array_equal(r[1], np.full(nt, n3)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_copyin_nested(self): + def test_impl(nt1, nt2, mt, n1, n2, n3): + omp_set_nested(1) + omp_set_dynamic(0) + xsa1 = np.zeros(nt1) + xsa2 = np.zeros(nt2) + x = n1 + with omp("threadprivate(x)"): + pass + x = n2 + with omp("parallel num_threads(nt1) copyin(x) private(tn)"): + tn = omp_get_thread_num() + xsa1[tn] = x + if tn == mt: + x = n3 + with omp("parallel num_threads(nt2) copyin(x)"): + xsa2[omp_get_thread_num()] = x + return xsa1, xsa2 + + nt1, nt2, n2, n3 = 10, 4, 12.5, 7.1 + r = test_impl(nt1, nt2, 2, 4.3, n2, n3) + np.testing.assert_array_equal(r[0], np.full(nt1, n2)) + np.testing.assert_array_equal(r[1], np.full(nt2, n3)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_copyprivate(self): + @njit + def test_impl(nt, n1, n2, n3): + x = n1 + a = np.zeros(nt) + xsa = np.zeros(nt) + ar = np.zeros(nt) + omp_set_num_threads(nt) + with omp("parallel firstprivate(x, a) private(tn)"): + with omp("single copyprivate(x, a)"): + x = n2 + a = np.full(nt, n3) + tn = omp_get_thread_num() + xsa[tn] = x + ar[tn] = a[tn] + return xsa, a, ar + + nt, n2, n3 = 16, 12, 3 + r = test_impl(nt, 5, n2, n3) + np.testing.assert_array_equal(r[0], np.full(nt, n2)) + self.assert_outputs_equal(r[1], r[2], np.full(nt, n3)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_linear_clause(self): + @njit + def test_impl(N): + a = np.arange(N) + 1 + b = np.zeros(N // 2) + + linearj = 0 + with omp("parallel for linear(linearj:1)"): + for i in range(0, N, 2): + b[linearj] = a[i] * 2 + + return b, linearj + + N = 50 + r = test_impl(N) + np.testing.assert_array_equal(r[0], np.arange(2, N * 2 - 1, 4)) + assert r[1] == N // 2 - 1 + + +class TestOpenmpConstraints(TestOpenmpBase): + """Tests designed to confirm that errors occur when expected, or + to see how OpenMP behaves in various circumstances""" + + def __init__(self, *args): + TestOpenmpBase.__init__(self, *args) + + def test_parallel_for_no_for_loop(self): + @njit + def test_impl(): + with omp("parallel for"): + pass + + with self.assertRaises(ParallelForWrongLoopCount) as raises: + test_impl() + self.assertIn( + "OpenMP parallel for regions must contain exactly one", + str(raises.exception), + ) + + def test_parallel_for_multiple_for_loops(self): + @njit + def test_impl(): + a = np.zeros(4) + with omp("parallel for"): + for i in range(2): + a[i] = 1 + for i in range(2, 4): + a[i] = 1 + + with self.assertRaises(ParallelForWrongLoopCount) as raises: + test_impl() + self.assertIn( + "OpenMP parallel for regions must contain exactly one", + str(raises.exception), + ) + + def test_statement_before_parallel_for(self): + @njit + def test_impl(): + a = np.zeros(4) + with omp("parallel for"): + print("Fail") + for i in range(4): + a[i] = i + return a + + with self.assertRaises(ParallelForExtraCode) as raises: + test_impl() + self.assertIn("Extra code near line", str(raises.exception)) + + def test_statement_after_parallel_for(self): + @njit + def test_impl(): + a = np.zeros(4) + with omp("parallel for"): + for i in range(4): + a[i] = i + print("Fail") + return a + + with self.assertRaises(ParallelForExtraCode) as raises: + a = test_impl() + print("a", a) + self.assertIn("Extra code near line", str(raises.exception)) + + def 
test_nonstring_var_omp_statement(self): + @njit + def test_impl(v): + ovar = 7 + with omp(ovar): + for i in range(len(v)): + v[i] = 1.0 + return v + + with self.assertRaises(NonStringOpenmpSpecification) as raises: + test_impl(np.zeros(100)) + self.assertIn("Non-string OpenMP specification at line", str(raises.exception)) + + def test_parallel_for_nonconst_var_omp_statement(self): + @njit + def test_impl(v): + ovar = "parallel " + ovar += "for" + with omp(ovar): + for i in range(len(v)): + v[i] = 1.0 + return v + + with self.assertRaises(NonconstantOpenmpSpecification) as raises: + test_impl(np.zeros(100)) + self.assertIn( + "Non-constant OpenMP specification at line", str(raises.exception) + ) + + # def test_parallel_for_blocking_if(self): + # @njit + # def test_impl(): + # n = 0 + # with omp("parallel"): + # half_threads = omp_get_num_threads()//2 + # if omp_get_thread_num() < half_threads: + # with omp("for reduction(+:n)"): + # for _ in range(half_threads): + # n += 1 + # return n + + # #with self.assertRaises(AssertionError) as raises: + # # njit(test_impl) + # test_impl() + # #print(str(raises.exception)) + + def test_parallel_for_delaying_condition(self): + @njit + def test_impl(): + n = 0 + with omp("parallel private(lc)"): + lc = 0 + while lc < omp_get_thread_num(): + lc += 1 + with omp("for reduction(+:n)"): + for _ in range(omp_get_num_threads()): + n += 1 + return n + + test_impl() + + def test_parallel_for_nowait(self): + @njit + def test_impl(nt): + a = np.zeros(nt) + with omp("parallel for num_threads(nt) nowait"): + for i in range(nt): + a[omp_get_thread_num] = i + return a + + with self.assertRaises(Exception) as raises: + test_impl(12) + self.assertIn("No terminal matches", str(raises.exception)) + + def test_parallel_double_num_threads(self): + @njit + def test_impl(nt1, nt2): + count = 0 + with omp("parallel num_threads(nt1) num_threads(nt2)"): + with omp("critical"): + count += 1 + print(count) + return count + + with self.assertRaises(Exception) as raises: + test_impl(5, 7) + + def test_conditional_barrier(self): + @njit + def test_impl(nt): + hp = nt // 2 + a = np.zeros(hp) + b = np.zeros(nt - hp) + with omp("parallel num_threads(nt) private(tn)"): + tn = omp_get_thread_num() + if tn < hp: + with omp("barrier"): + pass + a[tn] = 1 + else: + with omp("barrier"): + pass + b[tn - hp] = 1 + return a, b + + # The spec seems to say this should be an error but in practice maybe not? 
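+        # (Threads take different branches and therefore encounter two
+        # different barrier statements; OpenMP expects a barrier to be
+        # encountered by all threads of a team or by none, hence the
+        # uncertainty above.)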
+ # with self.assertRaises(Exception) as raises: + test_impl(12) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Hangs") + def test_closely_nested_for_loops(self): + @njit + def test_impl(N): + a = np.zeros((N, N)) + with omp("parallel"): + with omp("for"): + for i in range(N): + with omp("for"): + for j in range(N): + a[i][j] = 1 + return a + + with self.assertRaises(Exception) as raises: + test_impl(4) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Hangs") + def test_nested_critical(self): + @njit + def test_impl(): + num = 0 + with omp("parallel"): + with omp("critical"): + num += 1 + with omp("critical"): + num -= 1 + return num + + with self.assertRaises(Exception) as raises: + test_impl() + + +class TestOpenmpConcurrency(TestOpenmpBase): + def __init__(self, *args): + TestOpenmpBase.__init__(self, *args) + + def test_parallel_region(self): + @njit + def test_impl(): + a = 1 + with omp("parallel"): + a += 1 + + test_impl() + + def test_single(self): + @njit + def test_impl(nt): + omp_set_num_threads(nt) + a = np.zeros(4, dtype=np.int64) + with omp("parallel"): + with omp("single"): + a[0] += 1 + return a + + np.testing.assert_array_equal(test_impl(4), np.array([1, 0, 0, 0])) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_master(self): + @njit + def test_impl(nt): + omp_set_num_threads(nt) + a = np.ones(4, dtype=np.int64) + with omp("parallel"): + with omp("master"): + a[0] += omp_get_thread_num() + return a + + np.testing.assert_array_equal(test_impl(4), np.array([0, 1, 1, 1])) + + def test_critical_threads1(self): + @njit + def test_impl(N, iters): + omp_set_num_threads(N) + count = 0 + p = 0 + sum = 0 + with omp("parallel"): + with omp("barrier"): + pass + with omp("for private(p, sum)"): + for _ in range(iters): + with omp("critical"): + p = count + sum = 0 + for i in range(10000): + if i % 2 == 0: + sum += 1 + else: + sum -= 1 + p += 1 + sum + count = p + return count + + iters = 1000 + self.check(test_impl, 2, iters) + + def test_critical_threads2(self): + @njit + def test_impl(N): + omp_set_num_threads(N) + ca = np.zeros(N) + sum = 0 + with omp("parallel private(sum) shared(c)"): + c = N + with omp("barrier"): + pass + with omp("critical"): + ca[omp_get_thread_num()] = c - 1 + # Sleep + sum = 0 + for i in range(10000): + if i % 2 == 0: + sum += 1 + else: + sum -= 1 + c -= 1 + sum + return np.sort(ca) + + nt = 16 + np.testing.assert_array_equal(test_impl(nt), np.arange(nt)) + + def test_critical_result(self): + @njit + def test_impl(N): + omp_set_num_threads(N) + count = 0 + with omp("parallel"): + if omp_get_thread_num() < N // 2: + with omp("critical"): + count += 1 + else: + with omp("critical"): + count += 1 + return count + + nt = 16 + assert test_impl(nt) == nt + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_named_critical(self): + @njit + def test_impl(N): + omp_set_num_threads(N) + a = np.zeros((2, N)) + sa = np.zeros(N) + with omp("parallel private(a0c, sum, tn)"): + tn = omp_get_thread_num() + with omp("barrier"): + pass + with omp("critical (a)"): + # Sleep + sum = 0 + for j in range(1000): + if j % 2 == 0: + sum += 1 + else: + sum -= 1 + a[0][tn] = 1 + sum + with omp("critical (b)"): + a0c = np.copy(a[0]) + # Sleep + sum = 0 + for j in range(10000): + if j % 2 == 0: + sum += 1 + else: + sum -= 1 + a[1][tn] = 1 + sum + sa[tn] = 1 if a[0] != a0c else 0 + return a, sa + + nt = 16 + r = test_impl(nt) + np.testing.assert_array_equal(r[0], np.ones((2, nt))) + assert np.any(r[1]) + + # 
Revisit - how to prove atomic works without a race condition? + # def test_atomic_threads(self): + # def test_impl(N, iters): + # omp_set_num_threads(N) + # count = 0 + # p = 0 + # sum = 0 + # with omp("parallel"): + # with omp("barrier"): + # pass + # with omp("for private(p, sum)"): + # for _ in range(iters): + # with omp("atomic"): + # p = count + # sum = 0 + # for i in range(10000): + # if i % 2 == 0: + # sum += 1 + # else: + # sum -= 1 + # p += 1 + sum + # count = p + # return count + # iters = 1000 + # self.check(test_impl, 2, iters) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_atomic(self): + @njit + def test_impl(nt, N, c): + omp_set_num_threads(nt) + a = np.zeros(N) + with omp("parallel for private(b, index)"): + for i in range(nt): + b = 0 + index = i % N + with omp("atomic write"): + a[index] = nt % c + with omp("barrier"): + pass + with omp("atomic read"): + b = a[index - 1] + index + with omp("barrier"): + pass + with omp("atomic update"): + a[index] += b + return a + + nt, N, c = 27, 8, 6 + rc = np.zeros(N) + # ba = np.zeros(nt) + # for i in range(nt): + # index = i % N + # rc[index] = nt % c + # print("rc1:", rc) + + # for i in range(nt): + # index = i % N + # ba[i] = rc[index-1] + index + + # for i in range(nt): + # index = i % N + # rc[index] += ba[i] + # print("rc2:", rc) + + for i in range(nt): + index = i % N + ts = nt // N + ts += 1 if index < nt % N else 0 + rc[index] = nt % c + (nt % c + index) * ts + + np.testing.assert_array_equal(test_impl(nt, N, c), rc) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_atomic_capture(self): + @njit + def test_impl(nt, N, c): + s = math.ceil(N // 2) + a = np.zeros(s) + sva = np.zeros(N) + tns = np.zeros(N) + with omp("parallel for num_threads(nt) private(sv, index)"): + for i in range(N): + index = i % s + tns[i] = omp_get_thread_num() + with omp("atomic write"): + a[index] = index * c + 1 + with omp("barrier"): + pass + with omp("atomic capture"): + sv = a[index - 1] + a[index - 1] += sv + (tns[i] % c + 1) + # sva[index] = sv + return a, sva, tns + + nt, N, c = 16, 30, 7 + r1, r2, tns = test_impl(nt, N, c) + size = math.ceil(N // 2) + rc = np.arange(1, (size - 1) * c + 2, c) + # np.testing.assert_array_equal(r2, np.roll(rc, 1)) + for i in range(N): + index = i % size + rc[index - 1] += rc[index - 1] + (tns[i] % c + 1) + np.testing.assert_array_equal(r1, rc) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_parallel_sections(self): + @njit + def test_impl(nt): + ta0 = np.zeros(nt) + ta1 = np.zeros(nt) + secpa = np.zeros(nt) + + with omp("parallel sections num_threads(nt)"): + with omp("section"): + ta0[omp_get_thread_num()] += 1 + secpa[0] = omp_in_parallel() + with omp("section"): + ta1[omp_get_thread_num()] += 1 + secpa[1] = omp_in_parallel() + print(ta0, ta1) + return ta0, ta0, secpa + + NT = 2 # Must equal the number of section directives in the test + r = test_impl(NT) + assert np.sum(r[0]) == 1 + assert np.sum(r[1]) == 1 + assert np.sum(r[2]) == NT + np.testing.assert_array_equal(r[0] + r[1], np.ones(NT)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Abort - needs fix") + def test_barrier(self): + @njit + def test_impl(nt, iters, c): + a = np.zeros(nt) + ac = np.zeros((nt, nt)) + x = iters // c + iters = x * c + sum = 0 + with omp("parallel num_threads(nt) private(tn, sum)"): + tn = omp_get_thread_num() + with omp("critical"): + sum = 0 + for i in range(iters): + if i % x == 0: + sum += 1 + a[tn] = sum + with 
omp("barrier"): + pass + for j in range(nt): + ac[tn][j] = a[j] + return ac + + nt, c = 15, 12 + r = test_impl(nt, 10000, c) + a = np.full(nt, c) + for i in range(nt): + np.testing.assert_array_equal(r[i], a) + + # def test_for_nowait(self): + # @njit + # def test_impl(nt, n, c1, c2): + # a = np.zeros(n) + # b = np.zeros(n) + # ac = np.zeros((nt, n)) + # sum = 0 + # with omp("parallel num_threads(nt) private(tn)"): + # tn = omp_get_thread_num() + # with omp("for nowait schedule(static) private(sum)"): + # for i in range(n): + # # Sleep + # sum = 0 + # for j in range(i * 1000): + # if j % 2 == 0: + # sum += 1 + # else: + # sum -= 1 + # a[i] = i * c1 + sum + # for j in range(nt): + # ac[tn][j] = a[j] + # with omp("for schedule(static)"): + # for i in range(n): + # b[i] = a[i] + c2 + # return b, ac + # nt, n, c1, c2 = 8, 30, 5, -7 + # r = test_impl(nt, n, c1, c2) + # a = np.arange(n) * c1 + # np.testing.assert_array_equal(r[0], a + c2) + # arc = [np.array_equal(r[1][i], a) for i in range(nt)] + # assert(not np.all(arc)) + # + # def test_nowait_result(self): + # def test_impl(n, m, a, b, y, z): + # omp_set_num_threads(5) + # + # with omp("parallel"): + # with omp("for nowait"): + # for i in range(1, n): + # b[i] = (a[i] + a[i-1]) / 2.0 + # with omp("for nowait"): + # for i in range(m): + # y[i] = math.sqrt(z[i]) + # + # return b, y + # n, m = 10, 20 + # self.check(test_impl, n, m, np.ones(n), np.zeros(n), + # np.zeros(m), np.full(m, 13)) + + def test_nested_parallel_for(self): + @njit + def test_impl(nt): + omp_set_num_threads(nt) + omp_set_nested(1) + omp_set_dynamic(0) + a = np.zeros((nt, nt), dtype=np.int32) + with omp("parallel for"): + for i in range(nt): + with omp("parallel for"): + for j in range(nt): + a[i][j] = omp_get_thread_num() + return a + + nt = 8 + r = test_impl(nt) + for i in range(len(r)): + np.testing.assert_array_equal(np.sort(r[i]), np.arange(nt)) + + def test_nested_parallel_regions_1(self): + @njit + def test_impl(nt1, nt2): + omp_set_dynamic(0) + omp_set_max_active_levels(2) + ca = np.zeros(nt1) + omp_set_num_threads(nt1) + with omp("parallel private(tn)"): + tn = omp_get_thread_num() + with omp("parallel num_threads(3)"): + with omp("critical"): + ca[tn] += 1 + with omp("single"): + ats = omp_get_ancestor_thread_num(1) == tn + ts = omp_get_team_size(1) + return ca, ats, ts + + nt1, nt2 = 6, 3 + r = test_impl(nt1, nt2) + np.testing.assert_array_equal(r[0], np.full(nt1, nt2)) + assert r[1] == True + assert r[2] == nt1 + + def test_nested_parallel_regions_2(self): + @njit + def set_array(a): + tn = omp_get_thread_num() + a[tn][0] = omp_get_max_active_levels() + a[tn][1] = omp_get_num_threads() + a[tn][2] = omp_get_max_threads() + a[tn][3] = omp_get_level() + a[tn][4] = omp_get_team_size(1) + a[tn][5] = omp_in_parallel() + + @njit + def test_impl(mal, n1, n2, n3): + omp_set_max_active_levels(mal) + omp_set_dynamic(0) + omp_set_num_threads(n1) + a = np.zeros((n2, 6), dtype=np.int32) + b = np.zeros((n1, 6), dtype=np.int32) + with omp("parallel"): + omp_set_num_threads(n2) + with omp("single"): + with omp("parallel"): + omp_set_num_threads(n3) + set_array(a) + set_array(b) + + return a, b + + mal, n1, n2, n3 = 8, 2, 4, 5 + a, b = test_impl(mal, n1, n2, n3) + for i in range(n2): + np.testing.assert_array_equal(a[i], np.array([8, n2, n3, 2, n1, 1])) + for i in range(n1): + np.testing.assert_array_equal(b[i], np.array([8, n1, n2, 1, n1, 1])) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Abort / Segmentation Fault") + def test_parallel_two_dimensional_array(self): + 
@njit + def test_impl(N): + omp_set_dynamic(0) + omp_set_num_threads(N) + a = np.zeros((N, 2), dtype=np.int32) + with omp("parallel private(tn)"): + tn = omp_get_thread_num() + a[tn][0] = 1 + a[tn][1] = 2 + return a + + N = 5 + r = test_impl(N) + for i in range(N): + np.testing.assert_array_equal(r[i], np.array([1, 2])) + + +class TestOpenmpTask(TestOpenmpBase): + def __init__(self, *args): + TestOpenmpBase.__init__(self, *args) + + def test_task_basic(self): + def test_impl(ntsks): + a = np.zeros(ntsks) + with omp("parallel"): + with omp("single"): + for i in range(ntsks): + with omp("task"): + a[i] = 1 + return a + + self.check(test_impl, 15) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Sometimes segmentation fault") + def test_task_thread_assignment(self): + @njit + def test_impl(ntsks): + a = np.empty(ntsks) + with omp("parallel"): + with omp("single"): + for i in range(ntsks): + with omp("task"): + a[i] = omp_get_thread_num() + return a + + with self.assertRaises(AssertionError) as raises: + v = test_impl(15) + np.testing.assert_equal(v[0], v) + + def test_task_data_sharing_default(self): + @njit + def test_impl(n1, n2): + x = n1 + with omp("parallel private(y)"): + y = n1 + with omp("single"): + with omp("task"): + xa = x == n1 + ya = y == n1 + x, y = n2, n2 + with omp("taskwait"): + ysave = y + return (x, ysave), (xa, ya) + + n1, n2 = 1, 2 + r = test_impl(n1, n2) + self.assert_outputs_equal(r[1], (True, True)) + self.assert_outputs_equal(r[0], (n2, n1)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Segmentation fault") + def test_task_single_implicit_barrier(self): + @njit + def test_impl(ntsks): + a = np.zeros(ntsks) + with omp("parallel"): + with omp("single"): + for i in range(ntsks): + with omp("task private(sum)"): + # Sleep + sum = 0 + for j in range(10000): + if j % 2 == 0: + sum += 1 + else: + sum -= 1 + a[i] = 1 + sum + # with omp("barrier"): + # pass + sa = np.copy(a) + return sa + + ntsks = 15 + r = test_impl(ntsks) + np.testing.assert_array_equal(r, np.ones(ntsks)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Segmentation fault") + def test_task_single_nowait(self): + @njit + def test_impl(ntsks): + a = np.zeros(ntsks) + with omp("parallel"): + with omp("single nowait"): + for i in range(ntsks): + with omp("task private(sum)"): + sum = 0 + for j in range(10000): + if j % 2 == 0: + sum += 1 + else: + sum -= 1 + a[i] = 1 + sum + sa = np.copy(a) + return sa + + with self.assertRaises(AssertionError) as raises: + ntsks = 15 + r = test_impl(ntsks) + np.testing.assert_array_equal(r, np.ones(ntsks)) + + # Error with commented out code, other version never finished running + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Error") + def test_task_barrier(self): + @njit + def test_impl(nt): + omp_set_num_threads(nt) + a = np.zeros((nt + 1) * nt / 2) + # a = np.zeros(10) + with omp("parallel"): + with omp("single"): + for tn in range(nt): + with omp("task"): + for i in range(tn + 1): + with omp("task"): + a[i] = omp_get_thread_num() + 1 + with omp("barrier"): + ret = np.all(a) + return ret + + assert test_impl(4) + + def test_taskwait(self): + def test_impl(ntsks): + a = np.zeros(ntsks) + with omp("parallel private(i)"): + with omp("single"): + for i in range(ntsks): + with omp("task private(sum) private(j)"): + sum = 0 + for j in range(10000): + if j % 2 == 0: + sum += 1 + else: + sum -= 1 + a[i] = 1 + sum + with omp("taskwait"): + ret = np.all(a) + return ret + + self.check(test_impl, 15) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, 
"Sometimes segmentation fault") + def test_taskwait_descendants(self): + @njit + def test_impl(ntsks, dtsks): + a = np.zeros(ntsks) + da = np.zeros((ntsks, dtsks)) + with omp("parallel"): + with omp("single"): + for i in range(ntsks): + with omp("task"): + a[i] = 1 + for j in range(dtsks): + with omp("task private(sum)"): + sum = 0 + for k in range(10000): + if k % 2 == 0: + sum += 1 + else: + sum -= 1 + da[i][j] = 1 + sum + with omp("taskwait"): + ac = np.copy(a) + dac = np.copy(da) + with omp("barrier"): + pass + return ac, dac + + r = test_impl(15, 10) + np.testing.assert_array_equal(r[0], np.ones(r[0].shape)) + with self.assertRaises(AssertionError) as raises: + np.testing.assert_array_equal(r[1], np.ones(r[1].shape)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_undeferred_task(self): + @njit + def test_impl(): + with omp("parallel"): + flag = 1 + with omp("single"): + with omp("task if(1) private(sum)"): + sum = 0 + for i in range(10000): + if i % 2 == 0: + sum += 1 + else: + sum -= 1 + r = flag + sum + flag = 0 + return r + + assert test_impl() + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_untied_task_thread_assignment(self): + @njit + def test_impl(ntsks): + start_nums = np.zeros(ntsks) + current_nums = np.zeros(ntsks) + with omp("parallel"): + with omp("single"): + for i in range(ntsks): + with omp("task untied private(sum)"): + start_nums[i] = omp_get_thread_num() + with omp("task if(0) shared(sum)"): + # Sleep + sum = 0 + for j in range(10000): + if j % 2 == 0: + sum += 1 + else: + sum -= 1 + current_nums[i] = omp_get_thread_num() + sum + with omp("barrier"): + pass + return start_nums, current_nums + + with self.assertRaises(AssertionError) as raises: + sids, cids = test_impl(15) + np.testing.assert_array_equal(sids, cids) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_taskyield_thread_assignment(self): + @njit + def test_impl(ntsks): + start_nums = np.zeros(ntsks) + finish_nums = np.zeros(ntsks) + yielded_tasks = np.zeros(ntsks) + with omp("parallel"): + with omp("single"): + for i in range(ntsks): + with omp("task private(stn, start_i, finish_i, diff)"): + stn = omp_get_thread_num() + start_i = np.where(start_nums == stn)[0] + finish_i = np.where(finish_nums == stn)[0] + diff = np.zeros(len(start_i), dtype=np.int64) + for sindex in range(len(start_i)): + for findex in range(len(finish_i)): + if start_i[sindex] == finish_i[findex]: + break + else: + diff[sindex] = start_i[sindex] + for dindex in diff[diff != 0]: + yielded_tasks[dindex] = 1 + start_nums[i] = stn + with omp("taskyield"): + pass + finish_nums[i] = omp_get_thread_num() + with omp("barrier"): + pass + return yielded_tasks + + yt = test_impl(50) + assert np.any(yt) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_final_task_thread_assignment(self): + @njit + def test_impl(ntsks, c): + final_nums = np.zeros(ntsks) + included_nums = np.zeros(ntsks) + da = np.zeros(ntsks) + with omp("parallel"): + with omp("single"): + for i in range(ntsks): + with omp("task final(i>c) private(sum, d)"): + ftask_num = i + final_nums[ftask_num] = omp_get_thread_num() + # If it is a final task, generate an included task + if ftask_num > c: + d = 1 + with omp("task private(sum)"): + itask_num = ftask_num + # Sleep + sum = 0 + for j in range(10000): + if j % 2 == 0: + sum += 1 + else: + sum -= 1 + included_nums[itask_num] = omp_get_thread_num() + da[itask_num] = d + sum + d = 0 + + return 
final_nums, included_nums, da + + ntsks, c = 15, 5 + fns, ins, da = test_impl(ntsks, c) + np.testing.assert_array_equal(fns[c:], ins[c:]) + np.testing.assert_array_equal(da, np.ones(ntsks)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_taskgroup(self): + @njit + def test_impl(ntsks, dtsks): + a = np.zeros(ntsks) + with omp("parallel"): + with omp("single"): + with omp("taskgroup"): + for i in range(ntsks): + with omp("task"): + for _ in range(dtsks): + with omp("task"): + # Sleep + sum = 0 + for j in range(10000): + if j % 2 == 0: + sum += 1 + else: + sum -= 1 + a[i] = 1 + sum + sa = np.copy(a) + return a, sa + + ntsks = 15 + r = test_impl(ntsks, 10) + np.testing.assert_array_equal(r[0], np.ones(ntsks)) + np.testing.assert_array_equal(r[1], np.ones(ntsks)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_task_priority(self): + @njit + def test_impl(ntsks): + a = np.zeros(ntsks) + count = 0 + with omp("parallel"): + with omp("single"): + for i in range(ntsks): + with omp("task priority(i)"): + count += i + 1 + a[i] = count + return a + + ntsks = 15 + r = test_impl(ntsks) + rc = np.zeros(ntsks) + for i in range(ntsks): + rc[i] = sum(range(i + 1, ntsks + 1)) + np.testing.assert_array_equal(r, rc) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_task_mergeable(self): + @njit + def test_impl(ntsks, c1, c2): + a = np.zeros(ntsks) + with omp("parallel"): + with omp("single"): + for i in range(ntsks): + with omp("task private(x)"): + x = c1 + with omp("task mergeable if(0)"): + x = c2 + a[i] = x + return a + + ntsks, c1, c2 = 75, 2, 3 + assert c2 in test_impl(ntsks, c1, c2) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_task_depend(self): + def test_impl(ntsks): + a = np.zeros(ntsks) + da = np.zeros(ntsks) + with omp("parallel"): + with omp("single"): + for i in range(ntsks): + with omp("task private(x, done)"): + x = 1 + done = False + with omp("task shared(x) depend(out: x)"): + x = 5 + with omp("""task shared(done, x) + depend(out: done) depend(inout: x)"""): + x += i + done = True + with omp("""task shared(done, x) + depend(in: done) depend(inout: x)"""): + x *= i + da[i] = 1 if done else 0 + with omp("task shared(x) depend(in: x)"): + a[i] = x + return a, da + + self.check(test_impl, 15) + + # Affinity clause should not affect result + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") + def test_task_affinity(self): + def test_impl(ntsks, const): + a = np.zeros(ntsks) + with omp("parallel"): + with omp("single"): + for i in range(ntsks): + with omp("task firstprivate(i)"): + with omp("""task shared(b) depend(out: b) + affinity(a)"""): + b = np.full(i, const) + with omp("""task shared(b) depend(in: b) + affinity(a)"""): + a[i] = np.sum(b) + return a + + self.check(test_impl, 15, 4) + + def test_shared_array(self): + def test_impl(mode): + if mode == 0: + return + + b = np.zeros(100) + with omp("parallel"): + with omp("single"): + a = np.ones(100) + c = 0 + d = 0 + if mode > 1: + with omp("task shared(a, c)"): + c = a.sum() + with omp("task shared(a, d)"): + d = a.sum() + with omp("taskwait"): + b[:] = c + d + + return b + + self.check(test_impl, 0) + self.check(test_impl, 1) + self.check(test_impl, 2) + + +@unittest.skipUnless(TestOpenmpBase.skip_disabled, "Unimplemented") +class TestOpenmpTaskloop(TestOpenmpBase): + def __init__(self, *args): + TestOpenmpBase.__init__(self, *args) + + def test_taskloop_basic(self): + def 
test_impl(ntsks): + a = np.zeros(ntsks) + with omp("parallel"): + with omp("single"): + with omp("taskloop"): + for i in range(ntsks): + a[i] = 1 + return a + + self.check(test_impl, 15) + + def test_taskloop_num_tasks(self): + @njit + def test_impl(nt, iters, ntsks): + a = np.zeros(ntsks) + with omp("parallel num_threads(nt)"): + with omp("single"): + with omp("taskloop num_tasks(ntsks)"): + for i in range(iters): + a[i] = omp_get_thread_num() + return a + + nt, iters, ntsks = 8, 10, 4 + assert len(np.unique(test_impl(nt, iters, ntsks))) <= ntsks + + def test_taskloop_grainsize(self): + @njit + def test_impl(nt, iters, ntsks): + a = np.zeros(ntsks) + with omp("parallel num_threads(nt)"): + with omp("single"): + iters_per_task = iters // ntsks + with omp("taskloop grainsize(iters_per_task)"): + for i in range(iters): + a[i] = omp_get_thread_num() + return a + + nt, iters, ntsks = 8, 10, 4 + assert len(np.unique(test_impl(nt, iters, ntsks))) <= ntsks + + def test_taskloop_nogroup(self): + @njit + def test_impl(ntsks): + a = np.zeros(ntsks) + sa = np.zeros(ntsks) + with omp("parallel"): + with omp("single"): + s = 0 + with omp("taskloop nogroup num_tasks(ntsks)"): + for i in range(ntsks): + a[i] = 1 + sa[i] = s + with omp("task priority(1)"): + s = 1 + return a, sa + + ntsks = 15 + r = test_impl(ntsks) + np.testing.assert_array_equal(r[0], np.ones(ntsks)) + np.testing.assert_array_equal(r[1], np.ones(ntsks)) + + def test_taskloop_collapse(self): + @njit + def test_impl(ntsks, nt): + fl = np.zeros(ntsks) + sl = np.zeros(ntsks) + tl = np.zeros(ntsks) + omp_set_num_threads(nt) + with omp("parallel"): + with omp("single"): + with omp("taskloop collapse(2) num_tasks(ntsks)"): + for i in range(ntsks): + fl[i] = omp_get_thread_num() + for j in range(1): + sl[i] = omp_get_thread_num() + for k in range(1): + tl[i] = omp_get_thread_num() + + return fl, sl, tl + + r = test_impl(25, 4) + with self.assertRaises(AssertionError) as raises: + np.testing.assert_array_equal(r[0], r[1]) + np.testing.assert_array_equal(r[1], r[2]) + + +@linux_only +@unittest.skipUnless( + TestOpenmpBase.skip_disabled or TestOpenmpBase.run_target, "Unimplemented" +) +class TestOpenmpTarget(TestOpenmpBase): + """ + OpenMP target offloading tests. TEST_DEVICES is a required env var to + specify the device numbers to run the tests on: 0 for host backend, 1 for + CUDA backend. It is expected to be a comma-separated list of integer values. + """ + + devices = [] + assert TestOpenmpBase.test_devices, ( + "Expected env var TEST_DEVICES (comma-separated list of device numbers)" + ) + devices = [int(devno) for devno in TestOpenmpBase.test_devices.split(",")] + assert devices, "Expected non-empty test devices list" + + def __init__(self, *args): + TestOpenmpBase.__init__(self, *args) + + @classmethod + def is_testing_cpu(cls): + return 1 in cls.devices + + # How to check for nowait? + # Currently checks only compilation. + # Numba optimizes the whole target away? This runs too fast. 
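+    # NOTE: methods prefixed with "target_" are not collected by unittest
+    # directly; the loop that follows this class wraps each of them into a
+    # "test_" method which runs once per device listed in
+    # TestOpenmpTarget.devices, as a subtest.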
+ def target_nowait(self, device): + target_pragma = f"target nowait device({device})" + + @njit + def test_impl(): + with omp(target_pragma): + a = 0 + for i in range(1000000): + for j in range(1000000): + for k in range(1000000): + a += math.sqrt(i) + math.sqrt(j) + math.sqrt(k) + + test_impl() + + def target_nest_parallel_default_threadlimit(self, device): + target_pragma = f"target device({device}) map(from: teams, threads)" + + @njit + def test_impl(): + teams = 0 + threads = 0 + with omp(target_pragma): + with omp("parallel"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams = omp_get_num_teams() + threads = omp_get_num_threads() + return teams, threads + + teams, threads = test_impl() + np.testing.assert_equal(teams, 1) + self.assertGreater(threads, 1) + + def target_nest_parallel_set_numthreads(self, device): + target_pragma = f"target device({device}) map(from: teams, threads)" + + @njit + def test_impl(): + teams = 0 + threads = 0 + with omp(target_pragma): + with omp("parallel num_threads(32)"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams = omp_get_num_teams() + threads = omp_get_num_threads() + return teams, threads + + teams, threads = test_impl() + np.testing.assert_equal(teams, 1) + np.testing.assert_equal(threads, 32) + + def target_nest_teams_default_numteams(self, device): + target_pragma = f"target device({device}) map(from: teams, threads)" + + @njit + def test_impl(): + teams = 0 + threads = 0 + with omp(target_pragma): + with omp("teams"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams = omp_get_num_teams() + threads = omp_get_num_threads() + return teams, threads + + teams, threads = test_impl() + # GPU device(0) starts >1 teams each with 1 thread. + if device == 0: + self.assertGreater(teams, 1) + self.assertEqual(threads, 1) + # CPU device(1) starts 1 team with >1 threads. + elif device == 1: + self.assertEqual(teams, 1) + self.assertGreater(threads, 1) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_nest_teams_set_numteams(self, device): + target_pragma = f"target device({device}) map(from: teams, threads)" + + @njit + def test_impl(): + teams = 0 + threads = 0 + with omp(target_pragma): + with omp("teams num_teams(32)"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams = omp_get_num_teams() + threads = omp_get_num_threads() + return teams, threads + + teams, threads = test_impl() + if device == 0: + self.assertEqual(teams, 32) + elif device == 1: + self.assertLessEqual(teams, 32) + else: + raise ValueError(f"Device {device} must be 0 or 1") + self.assertGreaterEqual(threads, 1) + + def target_nest_teams_nest_parallel_default_numteams_threadlimit(self, device): + target_pragma = f"target device({device}) map(from: teams, threads)" + + @njit + def test_impl(): + teams = 0 + threads = 0 + with omp(target_pragma): + with omp("teams"): + with omp("parallel"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams = omp_get_num_teams() + threads = omp_get_num_threads() + return teams, threads + + teams, threads = test_impl() + # For GPU, impl. creates multiple threads and teams. + if device == 0: + self.assertGreater(teams, 1) + self.assertGreater(threads, 1) + # For CPU, impl. creates 1 teams with multiple threads. 
+ elif device == 1: + self.assertEqual(teams, 1) + self.assertGreater(threads, 1) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_nest_teams_nest_parallel_set_numteams(self, device): + target_pragma = f"target device({device}) map(from: teams, threads)" + + @njit + def test_impl(): + teams = 0 + threads = 0 + with omp(target_pragma): + with omp("teams num_teams(32)"): + with omp("parallel"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams = omp_get_num_teams() + threads = omp_get_num_threads() + return teams, threads + + teams, threads = test_impl() + if device == 0: + self.assertEqual(teams, 32) + elif device == 1: + self.assertGreaterEqual(teams, 1) + else: + raise ValueError(f"Device {device} must be 0 or 1") + self.assertGreaterEqual(threads, 1) + + def target_nest_teams_nest_parallel_set_threadlimit(self, device): + target_pragma = f"target device({device}) map(from: teams, threads)" + + @njit + def test_impl(): + teams = 0 + threads = 0 + with omp(target_pragma): + with omp("teams thread_limit(32)"): + with omp("parallel"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams = omp_get_num_teams() + threads = omp_get_num_threads() + return teams, threads + + teams, threads = test_impl() + # For GPU, impl. creates > 1 teams. + if device == 0: + self.assertGreater(teams, 1) + self.assertEqual(threads, 32) + # For CPU, impl. creates exactly 1 team. + elif device == 1: + self.assertEqual(teams, 1) + self.assertLessEqual(threads, 32) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_nest_teams_nest_parallel_set_numteams_threadlimit(self, device): + target_pragma = f"target device({device}) map(from: teams, threads)" + + @njit + def test_impl(): + teams = 0 + threads = 0 + with omp(target_pragma): + with omp("teams num_teams(32) thread_limit(32)"): + with omp("parallel"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams = omp_get_num_teams() + threads = omp_get_num_threads() + return teams, threads + + teams, threads = test_impl() + self.assertGreaterEqual(teams, 1) + if device == 0: + self.assertEqual(teams, 32) + self.assertEqual(threads, 32) + elif device == 1: + self.assertLessEqual(teams, 32) + self.assertLessEqual(threads, 32) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_nest_teams_nest_parallel_set_numteams_threadlimit_gt_numthreads( + self, device + ): + target_pragma = f"target device({device}) map(from: teams, threads)" + + @njit + def test_impl(): + teams = 0 + threads = 0 + with omp(target_pragma): + with omp("teams num_teams(32) thread_limit(64)"): + with omp("parallel num_threads(32)"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams = omp_get_num_teams() + threads = omp_get_num_threads() + return teams, threads + + teams, threads = test_impl() + self.assertGreaterEqual(teams, 1) + if device == 0: + self.assertEqual(teams, 32) + self.assertEqual(threads, 32) + elif device == 1: + self.assertLessEqual(teams, 32) + self.assertLessEqual(threads, 32) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_nest_teams_nest_parallel_set_numteams_threadlimit_lt_numthreads( + self, device + ): + target_pragma = f"target device({device}) map(from: teams, threads)" + + @njit + def test_impl(): + teams = 0 + threads = 0 + with omp(target_pragma): + # 
THREAD_LIMIT takes precedence over NUM_THREADS. + with omp("teams num_teams(32) thread_limit(64)"): + with omp("parallel num_threads(128)"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams = omp_get_num_teams() + threads = omp_get_num_threads() + return teams, threads + + teams, threads = test_impl() + self.assertGreaterEqual(teams, 1) + if device == 0: + self.assertEqual(teams, 32) + self.assertEqual(threads, 64) + elif device == 1: + self.assertLessEqual(teams, 32) + self.assertLessEqual(threads, 64) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_nest_parallel_multiple_set_numthreads(self, device): + target_pragma = ( + f"target device({device}) map(from: teams1, threads1, teams2, threads2)" + ) + + @njit + def test_impl(): + teams1 = 0 + threads1 = 0 + teams2 = 0 + threads2 = 0 + with omp(target_pragma): + with omp("parallel num_threads(32)"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams1 = omp_get_num_teams() + threads1 = omp_get_num_threads() + with omp("parallel num_threads(256)"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams2 = omp_get_num_teams() + threads2 = omp_get_num_threads() + return teams1, threads1, teams2, threads2 + + teams1, threads1, teams2, threads2 = test_impl() + np.testing.assert_equal(teams1, 1) + np.testing.assert_equal(threads1, 32) + np.testing.assert_equal(teams2, 1) + np.testing.assert_equal(threads2, 256) + + def target_nest_parallel_multiple_default_numthreads(self, device): + target_pragma = ( + f"target device({device}) map(from: teams1, threads1, teams2, threads2)" + ) + + @njit + def test_impl(): + teams1 = 0 + threads1 = 0 + teams2 = 0 + threads2 = 0 + with omp(target_pragma): + with omp("parallel"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams1 = omp_get_num_teams() + threads1 = omp_get_num_threads() + with omp("parallel"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams2 = omp_get_num_teams() + threads2 = omp_get_num_threads() + return teams1, threads1, teams2, threads2 + + teams1, threads1, teams2, threads2 = test_impl() + np.testing.assert_equal(teams1, 1) + self.assertGreater(threads1, 1) + np.testing.assert_equal(teams2, 1) + self.assertGreater(threads2, 1) + + def target_nest_parallel_multiple_set_numthreads_byone(self, device): + target_pragma = f"target device({device}) map(from: max_threads, teams1, threads1, teams2, threads2)" + + @njit + def test_impl(): + max_threads = 0 + teams1 = 0 + threads1 = 0 + teams2 = 0 + threads2 = 0 + with omp(target_pragma): + max_threads = omp_get_max_threads() + with omp("parallel"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams1 = omp_get_num_teams() + threads1 = omp_get_num_threads() + with omp("parallel num_threads(256)"): + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + teams2 = omp_get_num_teams() + threads2 = omp_get_num_threads() + return max_threads, teams1, threads1, teams2, threads2 + + # NOTE: max_threads for device(0) is the number of threads set by the + # sibling parallel legion with the highest num_threads clause. + # For device(1), is the number of max threads as determined by the host + # runtime. 
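+        # Concretely: the largest sibling num_threads clause below is 256, so
+        # on device(0) omp_get_max_threads() is expected to report 256; the
+        # default parallel region then matches it (threads1 == max_threads)
+        # and the explicit num_threads(256) region reports threads2 == 256.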
+ max_threads, teams1, threads1, teams2, threads2 = test_impl() + np.testing.assert_equal(teams1, 1) + np.testing.assert_equal(threads1, max_threads) + np.testing.assert_equal(teams2, 1) + np.testing.assert_equal(threads2, 256) + + def target_nest_parallel(self, device): + # TODO: map should be "from" instead of "tofrom" once this is fixed. + target_pragma = f"target device({device}) map(from: a)" + # NOTE: num_threads should be a multiple of warp size, e.g. for NVIDIA + # V100 it is 32, the OpenMP runtime floors non-multiple of warp size. + # TODO: Newer LLVM versions should not have this restriction. + parallel_pragma = ( + "parallel num_threads(32)" # + (" shared(a)" if explicit else "") + ) + + @njit + def test_impl(): + a = np.zeros(32, dtype=np.int64) + with omp(target_pragma): + with omp(parallel_pragma): + thread_id = omp_get_thread_num() + a[thread_id] = 1 + return a + + r = test_impl() + np.testing.assert_equal(r, np.full(32, 1)) + + def target_parallel_for_range_step_arg(self, device): + target_pragma = f"target device({device}) map(tofrom: a)" + parallel_pragma = "parallel for" + N = 10 + step = 2 + + @njit + def test_impl(): + a = np.zeros(N, dtype=np.int32) + with omp(target_pragma): + with omp(parallel_pragma): + for i in range(0, len(a), step): + a[i] = i + 1 + + return a + + r = test_impl() + np.testing.assert_equal(r, np.array([1, 0, 3, 0, 5, 0, 7, 0, 9, 0])) + + def target_parallel_for_incremented_step(self, device): + target_pragma = f"target device({device}) map(tofrom: a)" + parallel_pragma = "parallel for" + N = 10 + step_range = 3 + + @njit + def test_impl(): + a = np.zeros(N, dtype=np.int32) + for i in range(step_range): + with omp(target_pragma): + with omp(parallel_pragma): + for j in range(0, len(a), i + 1): + a[j] = i + 1 + return a + + r = test_impl() + np.testing.assert_equal(r, np.array([3, 1, 2, 3, 2, 1, 3, 1, 2, 3])) + + def target_teams(self, device): + target_pragma = ( + f"target teams num_teams(100) device({device}) map(from: a, nteams)" + ) + + @njit + def test_impl(): + a = np.zeros(100, dtype=np.int64) + nteams = 0 + with omp(target_pragma): + team_id = omp_get_team_num() + if team_id == 0: + nteams = omp_get_num_teams() + a[team_id] = 1 + return a, nteams + + r, nteams = test_impl() + if device == 0: + np.testing.assert_equal(r, np.full(100, 1)) + elif device == 1: + np.testing.assert_equal(r[:nteams], np.full(nteams, 1)) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_nest_teams(self, device): + target_pragma = f"target device({device}) map(from: a, nteams)" + + @njit + def test_impl(): + a = np.zeros(100, dtype=np.int64) + nteams = 0 + with omp(target_pragma): + with omp("teams num_teams(100)"): + team_id = omp_get_team_num() + if team_id == 0: + nteams = omp_get_num_teams() + a[team_id] = 1 + return a, nteams + + r, nteams = test_impl() + if device == 0: + np.testing.assert_equal(r, np.full(100, 1)) + elif device == 1: + np.testing.assert_equal(r[:nteams], np.full(nteams, 1)) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_nest_teams_from_shared_expl_scalar(self, device): + target_pragma = f"target device({device}) map(from: s)" + + @njit + def test_impl(): + s = 0 + with omp(target_pragma): + with omp("teams num_teams(100) shared(s)"): + team_id = omp_get_team_num() + if team_id == 0: + s = 1 + return s + + s = test_impl() + np.testing.assert_equal(s, 1) + + def target_nest_teams_from_shared_impl_scalar(self, device): + target_pragma = f"target device({device}) map(from: s)" + + @njit + 
def test_impl(): + s = 0 + with omp(target_pragma): + with omp("teams num_teams(100)"): + team_id = omp_get_team_num() + if team_id == 0: + s = 1 + return s + + s = test_impl() + np.testing.assert_equal(s, 1) + + def target_nest_teams_tofrom_shared_expl_scalar(self, device): + target_pragma = f"target device({device}) map(tofrom: s)" + + @njit + def test_impl(): + s = 0 + with omp(target_pragma): + with omp("teams num_teams(100) shared(s)"): + team_id = omp_get_team_num() + if team_id == 0: + s = 1 + return s + + s = test_impl() + np.testing.assert_equal(s, 1) + + def target_nest_teams_tofrom_shared_impl_scalar(self, device): + target_pragma = f"target device({device}) map(tofrom: s)" + + @njit + def test_impl(): + s = 0 + ss = np.zeros(1) + with omp(target_pragma): + with omp("teams num_teams(100)"): + team_id = omp_get_team_num() + if team_id == 0: + s = 1 + ss[0] = 1 + return s, ss + + s, ss = test_impl() + np.testing.assert_equal(s, 1) + np.testing.assert_equal(ss, 1) + + def target_teams_nest_parallel(self, device): + target_pragma = f"target teams device({device}) num_teams(10) thread_limit(32) map(tofrom: teams, threads)" + + @njit + def test_impl(): + teams = 0 + threads = 0 + with omp(target_pragma): + with omp("parallel"): + team_id = omp_get_team_num() + thread_id = omp_get_thread_num() + if team_id == 0 and thread_id == 0: + teams = omp_get_num_teams() + threads = omp_get_num_threads() + return teams, threads + + teams, threads = test_impl() + if device == 0: + self.assertEqual(teams, 10) + self.assertEqual(threads, 32) + elif device == 1: + self.assertLessEqual(teams, 10) + self.assertLessEqual(threads, 32) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_teams_nest_parallel_set_thread_limit(self, device): + target_pragma = f"target device({device}) map(tofrom: teams, threads)" + + @njit + def test_impl(): + teams = 0 + threads = 0 + with omp(target_pragma): + with omp("teams num_teams(10) thread_limit(32)"): + with omp("parallel"): + team_id = omp_get_team_num() + thread_id = omp_get_thread_num() + if team_id == 0 and thread_id == 0: + teams = omp_get_num_teams() + threads = omp_get_num_threads() + return teams, threads + + teams, threads = test_impl() + if device == 0: + self.assertEqual(teams, 10) + self.assertEqual(threads, 32) + elif device == 1: + self.assertLessEqual(teams, 10) + self.assertLessEqual(threads, 32) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_map_to_scalar(self, device): + target_pragma = f"target device({device}) map(to: x) map(from: r)" + + @njit + def test_impl(x): + with omp(target_pragma): + x += 1 + r = x + return r + + x = 42 + r = test_impl(x) + np.testing.assert_equal(r, 43) + + def target_map_to_array(self, device): + target_pragma = f"target device({device}) map(to: a) map(from: r)" + + @njit + def test_impl(a): + with omp(target_pragma): + r = 0 + for i in range(len(a)): + r += a[i] + return r + + n = 10 + a = np.ones(n) + r = test_impl(a) + # r is the sum of array elements (ones-array), thus must equal s. 
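+        # (the expected value here is n, the length of the ones-array).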
+ np.testing.assert_equal(r, n) + + def target_map_from_scalar(self, device): + target_pragma = f"target device({device}) map(from: x)" + + @njit + def test_impl(x): + with omp(target_pragma): + x = 43 + return x + + x = 42 + r = test_impl(x) + np.testing.assert_equal(r, 43) + + def target_map_tofrom_scalar(self, device): + target_pragma = f"target device({device}) map(tofrom: x)" + + @njit + def test_impl(x): + with omp(target_pragma): + x += 1 + return x + + x = 42 + r = test_impl(x) + np.testing.assert_equal(r, 43) + + def target_multiple_map_tofrom_scalar(self, device): + target_pragma = f"target device({device}) map(tofrom: x)" + + @njit + def test_impl(x): + with omp(target_pragma): + x += 1 + with omp(target_pragma): + x += 1 + return x + + x = 42 + r = test_impl(x) + np.testing.assert_equal(r, 44) + + def target_map_from_array(self, device): + target_pragma = f"target device({device}) map(from: a)" + + @njit + def test_impl(n): + a = np.zeros(n, dtype=np.int64) + with omp(target_pragma): + for i in range(len(a)): + a[i] = 42 + return a + + n = 10 + r = test_impl(n) + np.testing.assert_array_equal(r, np.full(n, 42)) + + def target_map_slice_in_mapping(self, device): + target_pragma = f"target device({device}) map(a[50:100]) map(to: b[100:150])" + + @njit + def test_impl(n): + a = np.zeros(n) + b = np.arange(n) + with omp(target_pragma): + for i in range(50): + # These b accesses are within the transferred region. + a[i + 50] = b[i + 100] + return a + + n = 200 + r = test_impl(n) + np.testing.assert_array_equal(r[0:50], np.zeros(50)) + np.testing.assert_array_equal(r[50:100], np.arange(n)[100:150]) + np.testing.assert_array_equal(r[100:200], np.zeros(100)) + + def target_map_slice_read_out_mapping(self, device): + target_pragma = f"target device({device}) map(a[50:100]) map(to: b[100:150])" + + @njit + def test_impl(n): + a = np.zeros(n) + b = np.arange(n) + with omp(target_pragma): + for i in range(50): + # These b accesses are outside the transferred region. + # Should get whatever happens to be in memory at that point. + # We assume that isn't arange(50:100). + a[i + 50] = b[i + 50] + return a + + n = 200 + r = test_impl(n) + np.testing.assert_array_equal(r[0:50], np.zeros(50)) + # Make sure that the range 50-100 was not transferred. + assert not np.array_equal(r[50:100], np.arange(n)[50:100]) + np.testing.assert_array_equal(r[100:200], np.zeros(100)) + + def target_map_tofrom_array(self, device): + target_pragma = f"target device({device}) map(tofrom: a)" + + @njit + def test_impl(a): + with omp(target_pragma): + for i in range(len(a)): + a[i] += 1 + return a + + n = 10 + a = np.full(n, 42) + r = test_impl(a) + np.testing.assert_array_equal(r, np.full(n, 43)) + + def target_nest_parallel_for(self, device): + target_pragma = f"target device({device}) map(tofrom: a, sched)" + + @njit + def test_impl(a, sched): + with omp(target_pragma): + with omp("parallel for num_threads(256)"): + for i in range(len(a)): + a[i] = 1 + thread_id = omp_get_thread_num() + sched[i] = thread_id + return a, sched + + n = 1000 + a = np.zeros(n) + sched = np.zeros(n) + r, sched = test_impl(a, sched) + np.testing.assert_array_equal(r, np.ones(n)) + # u = unique thread ids that processed the array, c = number of iters + # each unique thread id has processed. + u, c = np.unique(sched, return_counts=True) + # test that 256 threads executed. + np.testing.assert_equal(len(u), 256) + # test that each thread executed more than 1 iteration. 
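+        # (the loop below only requires that every one of those thread ids
+        # processed at least one iteration).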
+ for ci in c: + self.assertGreater(ci, 0) + + def target_nest_teams_distribute(self, device): + target_pragma = f"target device({device}) map(tofrom: a, sched)" + + @njit + def test_impl(a, sched): + with omp(target_pragma): + with omp("teams distribute"): + for i in range(len(a)): + a[i] = 1 + team_id = omp_get_team_num() + sched[i] = team_id + return a, sched + + n = 100 + a = np.zeros(n) + sched = np.zeros(n) + r, sched = test_impl(a, sched) + np.testing.assert_array_equal(r, np.ones(n)) + # u = unique teams ids that processed the array, c = number of iters + # each unique team id has processed. + u, c = np.unique(sched, return_counts=True) + if device == 0: + # For GPU, OpenMP creates as many teams as the number of iterations, + # where each team leader executes one iteration. + np.testing.assert_equal(len(u), n) + np.testing.assert_array_equal(c, np.ones(n)) + elif device == 1: + # For CPU, OpenMP creates 1 teams with 1 thread processing all n + # iterations. + np.testing.assert_equal(len(u), 1) + np.testing.assert_array_equal(c, [100]) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_teams_distribute(self, device): + target_pragma = ( + f"target teams distribute device({device}) map(tofrom: a, sched)" + ) + + @njit + def test_impl(a, sched): + with omp(target_pragma): + for i in range(len(a)): + a[i] = 1 + team_id = omp_get_team_num() + sched[i] = team_id + return a, sched + + n = 1000 + a = np.zeros(n) + sched = np.zeros(n) + r, sched = test_impl(a, sched) + np.testing.assert_array_equal(r, np.ones(n)) + # u = unique teams ids that processed the array, c = number of iters + # each unique team id has processed. + u, c = np.unique(sched, return_counts=True) + if device == 0: + # For GPU, impl. creates as many teams as the number of iterations, + # where each team leader executes one iteration. + np.testing.assert_equal(len(u), n) + np.testing.assert_array_equal(c, np.ones(n)) + elif device == 1: + # For CPU, impl. creates 1 team which processes all iterations. + np.testing.assert_equal(len(u), 1) + np.testing.assert_array_equal(c, [1000]) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_teams_distribute_set_num_teams(self, device): + target_pragma = ( + f"target teams distribute device({device}) map(tofrom: a) num_teams(4)" + ) + + @njit + def test_impl(a, sched): + with omp(target_pragma): + for i in range(len(a)): + a[i] = 1 + team_id = omp_get_team_num() + sched[i] = team_id + return a, sched + + n = 1000 + a = np.zeros(n) + sched = np.zeros(n) + r, sched = test_impl(a, sched) + np.testing.assert_array_equal(r, np.ones(n)) + # u = unique teams ids that processed the array, c = number of iters + # each unique team id has processed. 
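+        # With num_teams(4) and n = 1000 iterations, the distribute schedule
+        # is expected to hand each of the 4 teams exactly 250 iterations.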
+ u, c = np.unique(sched, return_counts=True) + np.testing.assert_equal(len(u), 4) + np.testing.assert_array_equal(c, np.full(4, 250)) + + def target_firstprivate_scalar_explicit(self, device): + target_pragma = f"target device({device}) firstprivate(s)" + + @njit + def test_impl(s): + with omp(target_pragma): + s = 43 + return s + + s = 42 + r = test_impl(s) + np.testing.assert_equal(r, 42) + + def target_firstprivate_scalar_implicit(self, device): + target_pragma = f"target device({device})" + + @njit + def test_impl(s): + with omp(target_pragma): + s = 43 + return s + + s = 42 + r = test_impl(s) + np.testing.assert_equal(r, 42) + + def target_data_from(self, device): + target_data_pragma = f"""target data device({device}) + map(from: a)""" + target_pragma = f"target device({device})" + + @njit + def test_impl(): + a = np.ones(10) + with omp(target_data_pragma): + with omp(target_pragma): + for i in range(len(a)): + a[i] = 42 + return a + + a = test_impl() + np.testing.assert_array_equal(a, np.full(10, 42)) + + def target_data_to(self, device): + target_data_pragma = f"""target data device({device}) + map(to: a) map(from: b)""" + target_pragma = f"target device({device})" + + @njit + def test_impl(): + a = np.ones(10) + b = np.zeros(10) + with omp(target_data_pragma): + with omp(target_pragma): + for i in range(len(a)): + a[i] = 42 + b[i] = a[i] + return a, b + + a, b = test_impl() + np.testing.assert_array_equal(a, np.ones(10)) + np.testing.assert_array_equal(b, np.full(10, 42)) + + def target_data_tofrom(self, device): + target_data_pragma = f"""target data device({device}) + map(tofrom: s, a)""" + target_pragma = f"target device({device})" + + @njit + def test_impl(): + s = 0 + a = np.ones(10) + with omp(target_data_pragma): + with omp(target_pragma): + for i in range(len(a)): + a[i] += 41 + s = 42 + return s, a + + s, a = test_impl() + # s is a FIRSTPRIVATE in the target region, so changes do not affect + # host s despite FROM mapping. 
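+        # The array a, by contrast, is mapped and does come back updated:
+        # each element starts at 1 and has 41 added in the target region,
+        # giving 42.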
+ np.testing.assert_equal(s, 0) + np.testing.assert_array_equal(a, np.full(10, 42)) + + def target_data_alloc_from(self, device): + target_data_pragma = f"""target data device({device}) + map(alloc: a) map(from: b)""" + target_pragma = f"target device({device})" + + @njit + def test_impl(): + a = np.ones(10) + b = np.zeros(10) + with omp(target_data_pragma): + with omp(target_pragma): + for i in range(len(a)): + a[i] = 42 + b[i] = a[i] + return a, b + + a, b = test_impl() + np.testing.assert_array_equal(a, np.ones(10)) + np.testing.assert_array_equal(b, np.full(10, 42)) + + def target_data_mix_to_from(self, device): + target_data_pragma = f"""target data device({device}) + map(to: a) map(from: b)""" + target_pragma = f"target device({device})" + + @njit + def test_impl(): + a = np.ones(10) + b = np.ones(10) + with omp(target_data_pragma): + with omp(target_pragma): + for i in range(len(a)): + a[i] = 42 + b[i] = 42 + return a, b + + a, b = test_impl() + np.testing.assert_array_equal(a, np.ones(10)) + np.testing.assert_array_equal(b, np.full(10, 42)) + + def target_update_from(self, device): + target_data_pragma = f"""target data device({device}) + map(to: a)""" + target_pragma = f"target device({device})" + target_update_pragma = f"target update from(a) device({device})" + + @njit + def test_impl(): + a = np.ones(10) + with omp(target_data_pragma): + with omp(target_pragma): + for i in range(len(a)): + a[i] = 42 + with omp(target_update_pragma): + pass + return a + + a = test_impl() + np.testing.assert_array_equal(a, np.full(10, 42)) + + def target_update_to(self, device): + target_data_pragma = f"""target data device({device}) + map(from: a)""" + target_pragma = f"target device({device})" + target_update_pragma = f"target update to(a) device({device})" + + @njit + def test_impl(): + a = np.ones(10) + with omp(target_data_pragma): + a += 1 + + with omp(target_update_pragma): + pass + + with omp(target_pragma): + for i in range(len(a)): + a[i] += 1 + return a + + a = test_impl() + np.testing.assert_array_equal(a, np.full(10, 3)) + + def target_update_to_from(self, device): + target_data_pragma = f"""target data device({device}) + map(to: a)""" + target_pragma = f"target device({device})" + target_update_to_pragma = f"target update to(a) device({device})" + target_update_from_pragma = f"target update from(a) device({device})" + + @njit + def test_impl(): + a = np.ones(10) + with omp(target_data_pragma): + a += 1 + + with omp(target_update_to_pragma): + pass + + with omp(target_pragma): + for i in range(len(a)): + a[i] += 1 + + with omp(target_update_from_pragma): + pass + + a += 1 + return a + + a = test_impl() + np.testing.assert_array_equal(a, np.full(10, 4)) + + # WEIRD: breaks when runs alone, passes if runs with all tests. + def target_enter_exit_data_to_from_hostonly(self, device): + target_enter = f"""target enter data device({device}) + map(to: a)""" + + target_exit = f"""target exit data device({device}) + map(from: a)""" + + @njit + def test_impl(): + a = np.ones(10) + with omp(target_enter): + pass + + a += 1 + + # XXX: Test passes if uncommented! + # with omp("target device(1)"): + # pass + + with omp(target_exit): + pass + + return a + + a = test_impl() + np.testing.assert_array_equal(a, np.full(10, 1)) + + # WEIRD: breaks when runs alone, passes if runs with all tests. 
+ def target_data_tofrom_hostonly(self, device): + target_data = f"""target data device({device}) + map(tofrom: a)""" + + @njit + def test_impl(): + a = np.ones(10) + with omp(target_data): + a += 1 + + # XXX: Test passes if uncommented! + # with omp("target device(1)"): + # pass + + return a + + a = test_impl() + np.testing.assert_array_equal(a, np.full(10, 1)) + + def target_data_update(self, device): + target_pragma = f"target teams distribute parallel for device({device})" + target_data = f"target data map(from:a) device({device})" + target_update = f"target update to(a) device({device})" + + @njit + def test_impl(a): + with omp(target_data): + for rep in range(10): + # Target update resets a to ones. + with omp(target_update): + pass + with omp(target_pragma): + for i in range(len(a)): + a[i] += 1 + + a = np.ones(4) + test_impl(a) + np.testing.assert_array_equal(a, np.full(4, 2)) + + @unittest.skipUnless(TestOpenmpBase.skip_disabled, "Abort - unimplemented") + def target_data_nest_multiple_target(self, device): + target_data_pragma = f"""target data device({device}) map(to: a) + map(tofrom: b) map(from: as1, as2, bs1, bs2)""" + target_pragma = f"target device({device})" + + @njit + def test_impl(s, n1, n2): + a = np.full(s, n1) + as1 = np.empty(s, dtype=a.dtype) + as2 = np.empty(s, dtype=a.dtype) + b = n1 + with omp(target_data_pragma): + with omp(target_pragma): + as1[:] = a + bs1 = b + with omp(target_pragma): + for i in range(s): + a[i] = n2 + b = n2 + with omp(target_pragma): + as2[:] = a + bs2 = b + return a, as1, as2, b, bs1, bs2 + + s, n1, n2 = 50, 1, 2 + ao, a1, a2, bo, b1, b2 = test_impl(s, n1, n2) + np.testing.assert_array_equal(ao, np.full(s, n1)) + np.testing.assert_array_equal(a1, np.full(s, n1)) + np.testing.assert_array_equal(a2, np.full(s, n2)) + assert bo == n2 + assert b1 == n1 + assert b2 == n2 + + @unittest.skip("Creates map entries that aren't cleared.") + def target_enter_exit_data_array_sections(self, device): + target_enter_pragma = ( + f"target enter data map(to: a[0:3], b[bstart:bstop]) device({device})" + ) + target_exit_pragma = f"target exit data map(from: a[0:3]) device({device})" + target_pragma = f"target teams distribute parallel for device({device})" + + @njit + def test_impl(): + bstart = 0 + bstop = 3 + a = np.array([1, 2, 3]) + b = np.array([3, 2, 1]) + with omp(target_enter_pragma): + with omp(target_pragma): + for i in range(1): + a[0] = 42 + b[0] = 42 + + with omp(target_exit_pragma): + pass + + return a, b + + a, b = test_impl() + np.testing.assert_array_equal(a, [42, 2, 3]) + np.testing.assert_array_equal(b, [3, 2, 1]) + + def target_enter_exit_data(self, device): + target_enter_pragma = f"""target enter data device({device}) + map(to: scalar) map(to: array)""" + target_exit_pragma = f"""target exit data device({device}) + map(from: scalar, array)""" + target_pragma = f"target device({device})" + + @njit + def test_impl(scalar, array): + with omp(target_enter_pragma): + pass + + with omp(target_pragma): + scalar += 1 + for i in range(len(array)): + array[i] += 1 + + with omp(target_exit_pragma): + pass + + return scalar, array + + n = 10 + s = 42 + a = np.full(n, 42) + r_s, r_a = test_impl(s, a) + # NOTE: This is confusing but spec compliant and matches OpenMP target + # offloading of the C/C++ version: scalar is implicitly a firstprivate + # thus it does not copy back to the host although it is in a "from" map + # of the target exit data directive. + + # TODO: we may want to revise Python behavior and copy back scalar too. 
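+        # Hence the checks below: the device-side increment of the scalar is
+        # discarded (the host still sees 42), while the mapped array comes
+        # back incremented to 43.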
+ np.testing.assert_equal(r_s, 42) + np.testing.assert_array_equal(r_a, np.full(n, 43)) + + def target_enter_exit_data_alloc(self, device): + target_enter_pragma = f"""target enter data device({device}) + map(alloc: a)""" + target_exit_pragma = f"target exit data device({device}) map(from: a)" + target_pragma = f"target device({device})" + + @njit + def test_impl(a): + with omp(target_enter_pragma): + pass + with omp(target_pragma): + for i in range(len(a)): + a[i] = 1 + with omp(target_exit_pragma): + pass + + return a + + n = 100 + a = np.zeros(n) + r = test_impl(a) + np.testing.assert_array_equal(r, np.ones(n)) + + def target_teams_distribute_parallel_for(self, device): + target_pragma = f"""target teams distribute parallel for + device({device}) num_teams(4) + map(tofrom: s, a, sched_team, sched_thread)""" + + @njit + def test_impl(a, sched_team, sched_thread): + s = 42 + with omp(target_pragma): + for i in range(len(a)): + a[i] = 1 + team_id = omp_get_team_num() + sched_team[i] = team_id + thread_id = omp_get_thread_num() + sched_thread[i] = thread_id + if i == 0 and team_id == 0 and thread_id == 0: + s += 1 + return s, a, sched_team, sched_thread + + n = 1024 + a = np.zeros(n) + sched_team = np.zeros(n) + sched_thread = np.zeros(n) + s, r, sched_team, sched_thread = test_impl(a, sched_team, sched_thread) + self.assertEqual(s, 43) + np.testing.assert_array_equal(r, np.ones(n)) + # u_team stores unique ids of teams, c_team stores how many iterations + # each time executed. + u_team, c_team = np.unique(sched_team, return_counts=True) + # u_thread stores unique ids of threads (regardless of team), c_thread + # stores how many iterations threads of the same unique id executed. + u_thread, c_thread = np.unique(sched_thread, return_counts=True) + if device == 0: + # there are 4 teams each with a unique id starting from 0. + self.assertEqual(len(u_team), 4) + np.testing.assert_array_equal(u_team, np.arange(0, len(u_team))) + # each team should execute 1024/4 = 256 iterations. + np.testing.assert_array_equal(c_team, np.full(len(c_team), n / len(u_team))) + # Expect equal number of iterations per thread id across teams. + np.testing.assert_array_equal( + c_thread, np.full(len(u_thread), n / len(u_thread)) + ) + elif device == 1: + self.assertLessEqual(len(u_team), 4) + np.testing.assert_array_equal(u_team, np.arange(0, len(u_team))) + # Divide (integer) n iterations by number of teams and add the + # remainder. + chunk = n // len(u_team) + rem = n % len(u_team) + chunks = np.full(len(u_team), chunk) + chunks[:rem] += 1 + np.testing.assert_array_equal(c_team, chunks) + + # Divide (integer) per team iterations by number of threads and add the + # remainder. 
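+            # For example, a team that got 342 iterations split over 4 thread
+            # ids yields [86, 86, 85, 85]: 342 // 4 = 85 per thread, with the
+            # remainder of 2 added to the first two threads.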
+ chunks_thread = np.zeros(len(u_thread)) + for i in range(len(u_team)): + chunk = chunks[i] // len(u_thread) + rem = chunks[i] % len(u_thread) + chunk_thread = np.full(len(u_thread), chunk) + chunk_thread[:rem] += 1 + chunks_thread += chunk_thread + + np.testing.assert_array_equal(c_thread, chunks_thread) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + @unittest.skip("Fix unexpected QUAL.OMP.THREAD_LIMIT") + def target_teams_nest_distribute_parallel_for(self, device): + target_pragma = f"""target teams device({device}) num_teams(4) + map(tofrom: s, a, sched_team, sched_thread)""" + dist_parfor_pragma = "distribute parallel for num_threads(256)" + + @njit + def test_impl(a, sched_team, sched_thread): + s = 42 + with omp(target_pragma): + with omp(dist_parfor_pragma): + for i in range(len(a)): + a[i] = 1 + team_id = omp_get_team_num() + sched_team[i] = team_id + thread_id = omp_get_thread_num() + sched_thread[i] = thread_id + if i == 0 and team_id == 0 and thread_id == 0: + s += 1 + return s, a, sched_team, sched_thread + + n = 1024 + a = np.zeros(n) + sched_team = np.zeros(n) + sched_thread = np.zeros(n) + s, r, sched_team, sched_thread = test_impl(a, sched_team, sched_thread) + np.testing.assert_equal(s, 43) + np.testing.assert_array_equal(r, np.ones(n)) + u_team, c_team = np.unique(sched_team, return_counts=True) + # there are 4 teams each with a unique id starting from 0. + np.testing.assert_equal(len(u_team), 4) + np.testing.assert_array_equal(u_team, np.arange(0, len(u_team))) + # each team should execute 1024/4 = 256 iterations. + np.testing.assert_array_equal(c_team, np.full(len(c_team), n / len(u_team))) + u_thread, c_thread = np.unique(sched_thread, return_counts=True) + # testing thread scheduling is tricky: OpenMP runtime sets aside a warp + # for the "sequential" target region execution. + # TODO: update tests as newer LLVM version lift the above limitations. + self.assertGreaterEqual(len(u_thread), n / len(u_team) - 32) + for c_thread_i in c_thread: + # threads from team 0 will execute more iterations (see above + # comment on removed warp). 
+ self.assertGreaterEqual(c_thread_i, 4) + + def target_teams_nest_parallel_fpriv_shared_scalar(self, device): + target_pragma = f"target teams num_teams(1) thread_limit(32) device({device}) map(from: threads)" + + @njit + def test_impl(): + s = 42 + r = np.zeros(32) + threads = 0 + with omp(target_pragma): + with omp("parallel firstprivate(s)"): + threadno = omp_get_thread_num() + if threadno == 0: + threads = omp_get_num_threads() + s += 1 + r[threadno] = s + return s, r, threads + + s, r, threads = test_impl() + self.assertEqual(s, 42) + self.assertLessEqual(threads, 32) + np.testing.assert_array_equal(r[:threads], np.full(threads, 43)) + + def target_nest_parallel_float_fpriv(self, device): + target_pragma = f"target device({device}) map(from: r)" + + @njit + def test_impl(): + s = np.float32(42.0) + r = np.float32(0.0) + with omp(target_pragma): + with omp("parallel firstprivate(s)"): + threadno = omp_get_thread_num() + if threadno == 0: + r = s + 1 + return r + + r = test_impl() + np.testing.assert_equal(r, 43.0) + + def target_nest_teams_float_fpriv(self, device): + target_pragma = f"target device({device}) map(from: r)" + + @njit + def test_impl(): + s = np.float32(42.0) + r = np.float32(0.0) + with omp(target_pragma): + with omp("teams firstprivate(s)"): + teamno = omp_get_thread_num() + if teamno == 0: + r = s + 1 + return r + + r = test_impl() + np.testing.assert_equal(r, 43.0) + + @unittest.skip("Frontend codegen error") + def target_teams_nest_parallel_fpriv_shared_array(self, device): + target_pragma = f"target teams num_teams(1) thread_limit(32) device({device})" + + # FIX: frontend fails to emit copy constructor, error: + # add_llvm_module is not supported on the CUDACodelibrary + # QUESTION: in which address space does the copy constructor create the copy on the GPU? 
+ @njit + def test_impl(): + s = np.zeros(32) + with omp(target_pragma): + with omp("parallel firstprivate(s)"): + print("parallel s", s[0]) + teams = omp_get_num_teams() + threads = omp_get_num_threads() + teamno = omp_get_team_num() + threadno = omp_get_thread_num() + if teamno == 0 and threadno == 0: + print("teams", teams, "threads", threads) + + test_impl() + input("ok?") + + def target_teams_shared_array(self, device): + target_pragma = f"target teams num_teams(10) map(tofrom: a) map(from: nteams) device({device})" + + @njit + def test_impl(): + a = np.zeros(10, dtype=np.int32) + nteams = 0 + + with omp(target_pragma): + team_shared_array = np.empty(10, dtype=np.int32) + team_id = omp_get_team_num() + + if team_id == 0: + nteams = omp_get_num_teams() + + for i in range(10): + team_shared_array[i] = team_id + + lasum = 0 + for i in range(10): + lasum += team_shared_array[i] + a[team_id] = lasum + + return a, nteams + + r, nteams = test_impl() + expected = np.arange(10) * 10 + if device == 0: + np.testing.assert_array_equal(r, expected) + elif device == 1: + np.testing.assert_array_equal(r[:nteams], expected[:nteams]) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_teams_shared_array_2d(self, device): + target_pragma = f"target teams num_teams(10) map(tofrom: a) map(from: nteams) device({device})" + + @njit + def test_impl(): + a = np.zeros((10, 2, 2), dtype=np.int32) + nteams = 0 + + with omp(target_pragma): + team_shared_array = np.empty((2, 2), dtype=np.int32) + team_id = omp_get_team_num() + + if team_id == 0: + nteams = omp_get_num_teams() + + for i in range(2): + for j in range(2): + team_shared_array[i, j] = team_id + + for i in range(2): + for j in range(2): + a[team_id, i, j] = team_shared_array[i, j] + return a, nteams + + a, nteams = test_impl() + expected = np.empty((10, 2, 2)) + for i in range(10): + expected[i] = np.full((2, 2), i) + if device == 0: + np.testing.assert_array_equal(a, expected) + elif device == 1: + np.testing.assert_array_equal(a[:nteams], expected[:nteams]) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_local_array(self, device): + target_pragma = f"target teams num_teams(1) map(tofrom: a) map(from: nthreads) device({device})" + + @njit + def test_impl(): + a = np.zeros((32, 10), dtype=np.int32) + nthreads = 0 + with omp(target_pragma): + with omp("parallel num_threads(32)"): + local_array = np.empty(10, dtype=np.int32) + tid = omp_get_thread_num() + if tid == 0: + nthreads = omp_get_num_threads() + for i in range(10): + local_array[i] = tid + for i in range(10): + a[tid, i] = local_array[i] + return a, nthreads + + a, nthreads = test_impl() + expected = np.empty((32, 10), dtype=np.int32) + for i in range(32): + expected[i] = [i] * 10 + if device == 0: + self.assertEqual(nthreads, 32) + np.testing.assert_array_equal(a, expected) + elif device == 1: + # CPU num_threads are capped by number of cores, which can be less + # than the provided value. + self.assertLessEqual(nthreads, 32) + np.testing.assert_array_equal(a[:nthreads], expected[:nthreads]) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_teams_parallel_shared_array(self, device): + target_pragma = f"target teams num_teams(10) map(tofrom: a) map(from: nteams, nthreads) device({device})" + + @njit + def test_impl(): + # save data from 10 teams each of 32 threads (maximally). 
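+            # Row index is the team id, column index is the thread id; each
+            # entry ends up holding the thread id (cf. the np.tile
+            # expectation below).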
+ a = np.zeros((10, 32), dtype=np.int32) + nteams = 0 + nthreads = 0 + + with omp(target_pragma): + team_shared_array = np.empty(32, dtype=np.int32) + team_id = omp_get_team_num() + if team_id == 0: + nteams = omp_get_num_teams() + nthreads = omp_get_num_threads() + + with omp("parallel num_threads(32)"): + thread_local_array = np.empty(10, dtype=np.int32) + for i in range(10): + thread_local_array[i] = omp_get_thread_num() + + lasum = 0 + for i in range(10): + lasum += thread_local_array[i] + team_shared_array[omp_get_thread_num()] = lasum / 10 + + for i in range(32): + a[team_id, i] = team_shared_array[i] + + return a, nteams, nthreads + + r, nteams, nthreads = test_impl() + expected = np.tile(np.arange(32), (10, 1)) + if device == 0: + np.testing.assert_array_equal(r, expected) + elif device == 1: + np.testing.assert_array_equal( + r[:nteams, :nthreads], expected[:nteams, :nthreads] + ) + else: + raise ValueError(f"Device {device} must be 0 or 1") + + def target_teams_loop_collapse(self, device): + target_pragma = f"""target teams loop collapse(2) + device({device}) + map(tofrom: a, b, c)""" + + @njit + def test_impl(n): + a = np.ones((n, n)) + b = np.ones((n, n)) + c = np.zeros((n, n)) + with omp(target_pragma): + for i in range(n): + for j in range(n): + c[i, j] = a[i, j] + b[i, j] + return c + + n = 10 + c = test_impl(n) + np.testing.assert_array_equal(c, np.full((n, n), 2)) + + def target_nest_teams_nest_loop_collapse(self, device): + target_pragma = f"""target device({device}) map(tofrom: a, b, c)""" + + @njit + def test_impl(n): + a = np.ones((n, n)) + b = np.ones((n, n)) + c = np.zeros((n, n)) + with omp(target_pragma): + with omp("teams"): + with omp("loop collapse(2)"): + for i in range(n): + for j in range(n): + c[i, j] = a[i, j] + b[i, j] + return c + + n = 10 + c = test_impl(n) + np.testing.assert_array_equal(c, np.full((n, n), 2)) + + +for memberName in dir(TestOpenmpTarget): + if memberName.startswith("target"): + test_func = getattr(TestOpenmpTarget, memberName) + + def make_func_with_subtest(func): + def func_with_subtest(self): + for device in TestOpenmpTarget.devices: + with self.subTest(device=device): + func(self, device) + + return func_with_subtest + + setattr( + TestOpenmpTarget, + "test_" + test_func.__name__, + make_func_with_subtest(test_func), + ) + + +class TestOpenmpPi(TestOpenmpBase): + def __init__(self, *args): + TestOpenmpBase.__init__(self, *args) + + def test_pi_loop(self): + def test_impl(num_steps): + step = 1.0 / num_steps + + the_sum = 0.0 + omp_set_num_threads(4) + + with omp("parallel"): + with omp("for reduction(+:the_sum) schedule(static)"): + for j in range(num_steps): + x = ((j - 1) - 0.5) * step + the_sum += 4.0 / (1.0 + x * x) + + pi = step * the_sum + return pi + + self.check(test_impl, 100000) + + def test_pi_loop_combined(self): + def test_impl(num_steps): + step = 1.0 / num_steps + + the_sum = 0.0 + omp_set_num_threads(4) + + with omp("parallel for reduction(+:the_sum) schedule(static)"): + for j in range(num_steps): + x = ((j - 1) - 0.5) * step + the_sum += 4.0 / (1.0 + x * x) + + pi = step * the_sum + return pi + + self.check(test_impl, 100000) + + def test_pi_loop_directive(self): + def test_impl(num_steps): + step = 1.0 / num_steps + + the_sum = 0.0 + omp_set_num_threads(4) + + with omp("loop reduction(+:the_sum) schedule(static)"): + for j in range(num_steps): + x = ((j - 1) - 0.5) * step + the_sum += 4.0 / (1.0 + x * x) + + pi = step * the_sum + return pi + + self.check(test_impl, 100000) + + def test_pi_spmd(self): + def 
test_impl(num_steps): + step = 1.0 / num_steps + MAX_THREADS = 8 + tsum = np.zeros(MAX_THREADS) + + j = 4 + omp_set_num_threads(j) + full_sum = 0.0 + + with omp("parallel private(tid, numthreads, local_sum, x)"): + tid = omp_get_thread_num() + numthreads = omp_get_num_threads() + local_sum = 0.0 + + for i in range(tid, num_steps, numthreads): + x = (i + 0.5) * step + local_sum += 4.0 / (1.0 + x * x) + + tsum[tid] = local_sum + + for k in range(j): + full_sum += tsum[k] + + pi = step * full_sum + return pi + + self.check(test_impl, 10000000) + + def test_pi_task(self): + def test_pi_comp(Nstart, Nfinish, step): + MIN_BLK = 256 + pi_sum = 0.0 + if Nfinish - Nstart < MIN_BLK: + for i in range(Nstart, Nfinish): + x = (i + 0.5) * step + pi_sum += 4.0 / (1.0 + x * x) + else: + iblk = Nfinish - Nstart + pi_sum1 = 0.0 + pi_sum2 = 0.0 + cut = Nfinish - (iblk // 2) + with omp("task shared(pi_sum1)"): + pi_sum1 = test_pi_comp(Nstart, cut, step) + with omp("task shared(pi_sum2)"): + pi_sum2 = test_pi_comp(cut, Nfinish, step) + with omp("taskwait"): + pi_sum = pi_sum1 + pi_sum2 + return pi_sum + + @njit + def test_pi_comp_njit(Nstart, Nfinish, step): + MIN_BLK = 256 + pi_sum = 0.0 + if Nfinish - Nstart < MIN_BLK: + for i in range(Nstart, Nfinish): + x = (i + 0.5) * step + pi_sum += 4.0 / (1.0 + x * x) + else: + iblk = Nfinish - Nstart + pi_sum1 = 0.0 + pi_sum2 = 0.0 + cut = Nfinish - (iblk // 2) + with omp("task shared(pi_sum1)"): + pi_sum1 = test_pi_comp_njit(Nstart, cut, step) + with omp("task shared(pi_sum2)"): + pi_sum2 = test_pi_comp_njit(cut, Nfinish, step) + with omp("taskwait"): + pi_sum = pi_sum1 + pi_sum2 + return pi_sum + + def test_impl(lb, num_steps, pi_comp_func): + step = 1.0 / num_steps + + j = 4 + omp_set_num_threads(j) + full_sum = 0.0 + + with omp("parallel"): + with omp("single"): + full_sum = pi_comp_func(lb, num_steps, step) + + pi = step * full_sum + return pi + + py_output = test_impl(0, 1024, test_pi_comp) + njit_output = njit(test_impl)(0, 1024, test_pi_comp_njit) + self.assert_outputs_equal(py_output, njit_output) + + +if __name__ == "__main__": + unittest.main() From 7f3ba0a3f9dc921674d55566fd4f4c7e3644fd3a Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Tue, 22 Apr 2025 03:51:10 -0700 Subject: [PATCH 03/28] Cleanup openmp testing - Fix override_config - Rename openmp_context alias to openmp instead of omp to align with examples and documentation --- numba/openmp/tests/test_openmp.py | 971 +++++++++++++++--------------- 1 file changed, 486 insertions(+), 485 deletions(-) diff --git a/numba/openmp/tests/test_openmp.py b/numba/openmp/tests/test_openmp.py index 17aba2cb8b02..37ff05930ea3 100644 --- a/numba/openmp/tests/test_openmp.py +++ b/numba/openmp/tests/test_openmp.py @@ -74,9 +74,10 @@ skip_unless_scipy, needs_subprocess, ) -import numba.openmp as openmp + +import numba.openmp from numba.openmp import njit -from numba.openmp import openmp_context as omp +from numba.openmp import openmp_context as openmp from numba.openmp import ( omp_set_num_threads, omp_get_thread_num, @@ -195,12 +196,12 @@ def override_config(name, value): *name* to *value*. *name* must be the name of an existing variable in openmp. 
""" - old_value = getattr(openmp, name) - setattr(openmp, name, value) + old_value = getattr(numba.openmp, name) + setattr(numba.openmp, name, value) try: yield finally: - setattr(openmp, name, old_value) + setattr(numba.openmp, name, old_value) # @needs_subprocess @@ -440,9 +441,9 @@ def test_impl(): omp_set_dynamic(0) o_nt = omp_get_max_threads() count = 0 - with omp("parallel"): + with openmp("parallel"): i_nt = omp_get_max_threads() - with omp("critical"): + with openmp("critical"): count += 1 return count, i_nt, o_nt @@ -457,9 +458,9 @@ def test_impl(): omp_set_dynamic(0) o_nt = omp_get_num_threads() count = 0 - with omp("parallel"): + with openmp("parallel"): i_nt = omp_get_num_threads() - with omp("critical"): + with openmp("critical"): count += 1 return (count, i_nt), o_nt @@ -476,12 +477,12 @@ def test_impl(n1, n2): omp_set_num_threads(n1) count1 = 0 count2 = 0 - with omp("parallel"): - with omp("critical"): + with openmp("parallel"): + with openmp("critical"): count1 += 1 omp_set_num_threads(n2) - with omp("parallel"): - with omp("critical"): + with openmp("parallel"): + with openmp("critical"): count2 += 1 return count1, count2 @@ -497,17 +498,17 @@ def test_impl(n1, n2, n3): omp_set_max_active_levels(2) omp_set_num_threads(n2) count1, count2, count3 = 0, 0, 0 - with omp("parallel num_threads(n1)"): - with omp("single"): - with omp("parallel"): - with omp("single"): + with openmp("parallel num_threads(n1)"): + with openmp("single"): + with openmp("parallel"): + with openmp("single"): omp_set_num_threads(n3) - with omp("parallel"): - with omp("critical"): + with openmp("parallel"): + with openmp("critical"): count3 += 1 - with omp("critical"): + with openmp("critical"): count2 += 1 - with omp("critical"): + with openmp("critical"): count1 += 1 return count1, count2, count3 @@ -521,13 +522,13 @@ def test_func_get_ancestor_thread_num(self): @njit def test_impl(): oa = omp_get_ancestor_thread_num(0) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): m1 = omp_get_ancestor_thread_num(0) f1 = omp_get_ancestor_thread_num(1) s1 = omp_get_ancestor_thread_num(2) tn1 = omp_get_thread_num() - with omp("parallel"): + with openmp("parallel"): m2 = omp_get_ancestor_thread_num(0) f2 = omp_get_ancestor_thread_num(1) s2 = omp_get_ancestor_thread_num(2) @@ -545,14 +546,14 @@ def test_func_get_team_size(self): def test_impl(n1, n2): omp_set_max_active_levels(2) oa = omp_get_team_size(0) - with omp("parallel num_threads(n1)"): - with omp("single"): + with openmp("parallel num_threads(n1)"): + with openmp("single"): m1 = omp_get_team_size(0) f1 = omp_get_team_size(1) s1 = omp_get_team_size(2) nt1 = omp_get_num_threads() - with omp("parallel num_threads(n2)"): - with omp("single"): + with openmp("parallel num_threads(n2)"): + with openmp("single"): m2 = omp_get_team_size(0) f2 = omp_get_team_size(1) s2 = omp_get_team_size(2) @@ -570,11 +571,11 @@ def test_func_get_level(self): @njit def test_impl(): oa = omp_get_level() - with omp("parallel if(0)"): + with openmp("parallel if(0)"): f = omp_get_level() - with omp("parallel num_threads(1)"): + with openmp("parallel num_threads(1)"): s = omp_get_level() - with omp("parallel"): + with openmp("parallel"): t = omp_get_level() return oa, f, s, t @@ -585,11 +586,11 @@ def test_func_get_active_level(self): @njit def test_impl(): oa = omp_get_active_level() - with omp("parallel if(0)"): + with openmp("parallel if(0)"): f = omp_get_active_level() - with omp("parallel num_threads(1)"): + with openmp("parallel 
num_threads(1)"): s = omp_get_active_level() - with omp("parallel"): + with openmp("parallel"): t = omp_get_active_level() return oa, f, s, t @@ -604,14 +605,14 @@ def test_impl(): omp_set_dynamic(0) omp_set_max_active_levels(1) # 1 because first region is inactive oa = omp_in_parallel() - with omp("parallel num_threads(1)"): + with openmp("parallel num_threads(1)"): ia = omp_in_parallel() - with omp("parallel"): + with openmp("parallel"): n1a = omp_in_parallel() - with omp("single"): - with omp("parallel"): + with openmp("single"): + with openmp("parallel"): n2a = omp_in_parallel() - with omp("parallel if(0)"): + with openmp("parallel if(0)"): ua = omp_in_parallel() return oa, ia, n1a, n2a, ua @@ -629,13 +630,13 @@ def test_impl(N, c): a = np.arange(N)[::-1] fa = np.zeros(N) fia = np.zeros(N) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): for i in range(len(a)): e = a[i] - with omp("task final(e >= c)"): + with openmp("task final(e >= c)"): fa[i] = omp_in_final() - with omp("task"): + with openmp("task"): fia[i] = omp_in_final() return fa, fia @@ -651,7 +652,7 @@ def __init__(self, *args): def test_parallel_for_set_elements(self): def test_impl(v): - with omp("parallel for"): + with openmp("parallel for"): for i in range(len(v)): v[i] = 1.0 return v @@ -660,8 +661,8 @@ def test_impl(v): def test_separate_parallel_for_set_elements(self): def test_impl(v): - with omp("parallel"): - with omp("for"): + with openmp("parallel"): + with openmp("for"): for i in range(len(v)): v[i] = 1.0 return v @@ -671,7 +672,7 @@ def test_impl(v): def test_parallel_for_const_var_omp_statement(self): def test_impl(v): ovar = "parallel for" - with omp(ovar): + with openmp(ovar): for i in range(len(v)): v[i] = 1.0 return v @@ -681,7 +682,7 @@ def test_impl(v): def test_parallel_for_string_conditional(self): def test_impl(S): capitalLetters = 0 - with omp("parallel for reduction(+:capitalLetters)"): + with openmp("parallel for reduction(+:capitalLetters)"): for i in range(len(S)): if S[i].isupper(): capitalLetters += 1 @@ -692,7 +693,7 @@ def test_impl(S): def test_parallel_for_tuple(self): def test_impl(t): len_total = 0 - with omp("parallel for reduction(+:len_total)"): + with openmp("parallel for reduction(+:len_total)"): for i in range(len(t)): len_total += len(t[i]) return len_total @@ -702,7 +703,7 @@ def test_impl(t): def test_parallel_for_range_step_2(self): def test_impl(N): a = np.zeros(N, dtype=np.int32) - with omp("parallel for"): + with openmp("parallel for"): for i in range(0, len(a), 2): a[i] = i + 1 @@ -713,7 +714,7 @@ def test_impl(N): def test_parallel_for_range_step_arg(self): def test_impl(N, step): a = np.zeros(N, dtype=np.int32) - with omp("parallel for"): + with openmp("parallel for"): for i in range(0, len(a), step): a[i] = i + 1 @@ -725,7 +726,7 @@ def test_parallel_for_incremented_step(self): @njit def test_impl(v, n): for i in range(n): - with omp("parallel for"): + with openmp("parallel for"): for j in range(0, len(v), i + 1): v[j] = i + 1 return v @@ -735,7 +736,7 @@ def test_impl(v, n): def test_parallel_for_range_backward_step(self): def test_impl(N): a = np.zeros(N, dtype=np.int32) - with omp("parallel for"): + with openmp("parallel for"): for i in range(N - 1, -1, -1): a[i] = i + 1 @@ -747,7 +748,7 @@ def test_impl(N): def test_parallel_for_dictionary(self): def test_impl(N, c): l = {} - with omp("parallel for"): + with openmp("parallel for"): for i in range(N): l[i] = i % c return l @@ -757,8 +758,8 @@ def test_impl(N, c): def 
test_parallel_for_num_threads(self): def test_impl(nt): a = np.zeros(nt) - with omp("parallel num_threads(nt)"): - with omp("for"): + with openmp("parallel num_threads(nt)"): + with openmp("for"): for i in range(nt): a[i] = i return a @@ -769,8 +770,8 @@ def test_parallel_for_only_inside_var(self): @njit def test_impl(nt): a = np.zeros(nt) - with omp("parallel num_threads(nt) private(x)"): - with omp("for private(x)"): + with openmp("parallel num_threads(nt) private(x)"): + with openmp("for private(x)"): for i in range(nt): x = 0 # print("out:", i, x, i + x, nt) @@ -786,10 +787,10 @@ def test_parallel_for_ordered(self): def test_impl(N, c): a = np.zeros(N) b = np.zeros(N) - with omp("parallel for ordered"): + with openmp("parallel for ordered"): for i in range(1, N): b[i] = b[i - 1] + c - with omp("ordered"): + with openmp("ordered"): a[i] = a[i - 1] + c return a @@ -806,7 +807,7 @@ def test_impl(n1, n2, n3): ia = np.zeros(n1) ja = np.zeros((n1, n2)) ka = np.zeros((n1, n2, n3)) - with omp("parallel for collapse(2)"): + with openmp("parallel for collapse(2)"): for i in range(n1): ia[i] = omp_get_thread_num() for j in range(n2): @@ -837,7 +838,7 @@ def test_impl(N, nt): v = np.zeros(N) step = -2 omp_set_num_threads(nt) - with omp("parallel private(thread_num)"): + with openmp("parallel private(thread_num)"): running_omp = omp_in_parallel() thread_num = omp_get_thread_num() if not running_omp: @@ -847,7 +848,7 @@ def test_impl(N, nt): for t in range(N): f = itersPerThread*(t+1)-1 + min(iters%itersPerThread, t+1) finishToThread[f] = t - with omp("for schedule(static)"): + with openmp("for schedule(static)"): for index, i in enumerate(range(N-1, N%2 - 1, -2)): if not running_omp: for finish in finishToThread.keys(): @@ -866,7 +867,7 @@ def test_avg_sched_const(self): def test_impl(n, a): b = np.zeros(n) nt = 5 - with omp("parallel for num_threads(nt) schedule(static, 4)"): + with openmp("parallel for num_threads(nt) schedule(static, 4)"): for i in range(1, n): b[i] = (a[i] + a[i - 1]) / 2.0 @@ -880,7 +881,7 @@ def test_impl(n, a): b = np.zeros(n) nt = 5 ss = 4 - with omp("parallel for num_threads(nt) schedule(static, ss)"): + with openmp("parallel for num_threads(nt) schedule(static, ss)"): for i in range(1, n): b[i] = (a[i] + a[i - 1]) / 2.0 @@ -892,7 +893,7 @@ def test_static_distribution(self): @njit def test_impl(nt, c): a = np.empty(nt * c) - with omp("parallel for num_threads(nt) schedule(static)"): + with openmp("parallel for num_threads(nt) schedule(static)"): for i in range(nt * c): a[i] = omp_get_thread_num() return a @@ -909,7 +910,7 @@ def test_static_chunk_distribution(self): @njit def test_impl(nt, c, cs): a = np.empty(nt * c) - with omp("parallel for num_threads(nt) schedule(static, cs)"): + with openmp("parallel for num_threads(nt) schedule(static, cs)"): for i in range(nt * c): a[i] = omp_get_thread_num() return a @@ -929,11 +930,11 @@ def test_static_consistency(self): def test_impl(nt, c, cs): a = np.empty(nt * c) b = np.empty(nt * c) - with omp("parallel num_threads(8)"): - with omp("for schedule(static)"): + with openmp("parallel num_threads(8)"): + with openmp("for schedule(static)"): for i in range(nt * c): a[i] = omp_get_thread_num() - with omp("for schedule(static)"): + with openmp("for schedule(static)"): for i in range(nt * c): b[i] = omp_get_thread_num() return a, b @@ -946,7 +947,7 @@ def test_dynamic_distribution(self): @njit def test_impl(nt, c, cs): a = np.empty(nt * c) - with omp("parallel for num_threads(nt) schedule(dynamic)"): + with openmp("parallel for 
num_threads(nt) schedule(dynamic)"): for i in range(nt * c): a[i] = omp_get_thread_num() return a @@ -971,7 +972,7 @@ def test_guided_distribution(self): @njit def test_impl(nt, c, cs): a = np.empty(nt * c) - with omp("parallel for num_threads(nt) schedule(guided, cs)"): + with openmp("parallel for num_threads(nt) schedule(guided, cs)"): for i in range(nt * c): a[i] = omp_get_thread_num() return a @@ -1007,18 +1008,18 @@ def test_impl(N, c1, c2): n_count = 0 nc_count = 0 a_count = 0 - with omp("parallel num_threads(N) shared(c2)"): - with omp("critical"): + with openmp("parallel num_threads(N) shared(c2)"): + with openmp("critical"): d_count += 1 - with omp("parallel"): - with omp("critical"): + with openmp("parallel"): + with openmp("critical"): n_count += 1 - with omp("single"): - with omp("parallel num_threads(6)"): - with omp("critical"): + with openmp("single"): + with openmp("parallel num_threads(6)"): + with openmp("critical"): nc_count += 1 - with omp("parallel"): - with omp("critical"): + with openmp("parallel"): + with openmp("critical"): a_count += 1 return d_count, a_count, n_count, nc_count @@ -1041,11 +1042,11 @@ def test_impl(s): omp_set_num_threads(s) omp_set_dynamic(0) - with omp("parallel for if(rp)"): + with openmp("parallel for if(rp)"): for i in range(s): ar[omp_get_thread_num()] = 1 par[i] = omp_in_parallel() - with omp("parallel for if(drp)"): + with openmp("parallel for if(drp)"): for i in range(s): adr[omp_get_thread_num()] = 1 padr[i] = omp_in_parallel() @@ -1065,7 +1066,7 @@ def test_impl(n, a): b = np.zeros(n) omp_set_num_threads(5) - with omp("parallel for"): + with openmp("parallel for"): for i in range(1, n): b[i] = (a[i] + a[i - 1]) / 2.0 return b @@ -1075,7 +1076,7 @@ def test_impl(n, a): def test_avg_num_threads_clause(self): def test_impl(n, a): b = np.zeros(n) - with omp("parallel for num_threads(5)"): + with openmp("parallel for num_threads(5)"): for i in range(1, n): b[i] = (a[i] + a[i - 1]) / 2.0 @@ -1087,7 +1088,7 @@ def test_avg_num_threads_clause_var(self): def test_impl(n, a): b = np.zeros(n) nt = 5 - with omp("parallel for num_threads(nt)"): + with openmp("parallel for num_threads(nt)"): for i in range(1, n): b[i] = (a[i] + a[i - 1]) / 2.0 @@ -1101,7 +1102,7 @@ def test_avg_if_const(self): def test_impl(n, a): b = np.zeros(n) nt = 5 - with omp("parallel for if(1) num_threads(nt) schedule(static, 4)"): + with openmp("parallel for if(1) num_threads(nt) schedule(static, 4)"): for i in range(1, n): b[i] = (a[i] + a[i - 1]) / 2.0 @@ -1116,7 +1117,7 @@ def test_impl(n, a): nt = 5 ss = 4 do_if = 1 - with omp("parallel for if(do_if) num_threads(nt) schedule(static, ss)"): + with openmp("parallel for if(do_if) num_threads(nt) schedule(static, ss)"): for i in range(1, n): b[i] = (a[i] + a[i - 1]) / 2.0 @@ -1127,8 +1128,8 @@ def test_impl(n, a): def test_teams1(self): def test_impl(): a = 1 - with omp("teams"): - with omp("parallel"): + with openmp("teams"): + with openmp("parallel"): a = 123 return a @@ -1144,7 +1145,7 @@ def test_parallel_reduction_add_int(self): def test_impl(): redux = 0 nthreads = 0 - with omp("parallel reduction(+:redux)"): + with openmp("parallel reduction(+:redux)"): thread_id = omp_get_thread_num() if thread_id == 0: nthreads = omp_get_num_threads() @@ -1160,7 +1161,7 @@ def test_parallel_reduction_sub_int(self): def test_impl(): redux = 0 nthreads = 0 - with omp("parallel reduction(-:redux)"): + with openmp("parallel reduction(-:redux)"): thread_id = omp_get_thread_num() if thread_id == 0: nthreads = omp_get_num_threads() @@ 
-1176,7 +1177,7 @@ def test_parallel_reduction_mul_int(self): def test_impl(): redux = 1 nthreads = 0 - with omp("parallel reduction(*:redux) num_threads(8)"): + with openmp("parallel reduction(*:redux) num_threads(8)"): thread_id = omp_get_thread_num() if thread_id == 0: nthreads = omp_get_num_threads() @@ -1192,7 +1193,7 @@ def test_parallel_reduction_add_fp64(self): def test_impl(): redux = np.float64(0.0) nthreads = np.float64(0.0) - with omp("parallel reduction(+:redux)"): + with openmp("parallel reduction(+:redux)"): thread_id = omp_get_thread_num() if thread_id == 0: nthreads = omp_get_num_threads() @@ -1208,7 +1209,7 @@ def test_parallel_reduction_sub_fp64(self): def test_impl(): redux = np.float64(0.0) nthreads = np.float64(0.0) - with omp("parallel reduction(-:redux)"): + with openmp("parallel reduction(-:redux)"): thread_id = omp_get_thread_num() if thread_id == 0: nthreads = omp_get_num_threads() @@ -1224,7 +1225,7 @@ def test_parallel_reduction_mul_fp64(self): def test_impl(): redux = np.float64(1.0) nthreads = np.float64(0.0) - with omp("parallel reduction(*:redux) num_threads(8)"): + with openmp("parallel reduction(*:redux) num_threads(8)"): thread_id = omp_get_thread_num() if thread_id == 0: nthreads = omp_get_num_threads() @@ -1240,7 +1241,7 @@ def test_parallel_reduction_add_fp32(self): def test_impl(): redux = np.float32(0.0) nthreads = np.float32(0.0) - with omp("parallel reduction(+:redux)"): + with openmp("parallel reduction(+:redux)"): thread_id = omp_get_thread_num() if thread_id == 0: nthreads = omp_get_num_threads() @@ -1256,7 +1257,7 @@ def test_parallel_reduction_sub_fp32(self): def test_impl(): redux = np.float32(0.0) nthreads = np.float32(0.0) - with omp("parallel reduction(-:redux)"): + with openmp("parallel reduction(-:redux)"): thread_id = omp_get_thread_num() if thread_id == 0: nthreads = omp_get_num_threads() @@ -1272,7 +1273,7 @@ def test_parallel_reduction_mul_fp32(self): def test_impl(): redux = np.float32(1.0) nthreads = np.float32(0.0) - with omp("parallel reduction(*:redux) num_threads(8)"): + with openmp("parallel reduction(*:redux) num_threads(8)"): thread_id = omp_get_thread_num() if thread_id == 0: nthreads = omp_get_num_threads() @@ -1287,7 +1288,7 @@ def test_parallel_for_reduction_add_int(self): @njit def test_impl(): redux = 0 - with omp("parallel for reduction(+:redux)"): + with openmp("parallel for reduction(+:redux)"): for i in range(10): redux += 1 return redux @@ -1299,7 +1300,7 @@ def test_parallel_for_reduction_sub_int(self): @njit def test_impl(): redux = 0 - with omp("parallel for reduction(-:redux)"): + with openmp("parallel for reduction(-:redux)"): for i in range(10): redux += 1 return redux @@ -1311,7 +1312,7 @@ def test_parallel_for_reduction_mul_int(self): @njit def test_impl(): redux = 1 - with omp("parallel for reduction(*:redux)"): + with openmp("parallel for reduction(*:redux)"): for i in range(10): redux *= 2 return redux @@ -1323,7 +1324,7 @@ def test_parallel_for_reduction_add_fp64(self): @njit def test_impl(): redux = np.float64(0.0) - with omp("parallel for reduction(+:redux)"): + with openmp("parallel for reduction(+:redux)"): for i in range(10): redux += np.float64(1.0) return redux @@ -1335,7 +1336,7 @@ def test_parallel_for_reduction_sub_fp64(self): @njit def test_impl(): redux = np.float64(0.0) - with omp("parallel for reduction(-:redux)"): + with openmp("parallel for reduction(-:redux)"): for i in range(10): redux += np.float64(1.0) return redux @@ -1347,7 +1348,7 @@ def 
test_parallel_for_reduction_mul_fp64(self): @njit def test_impl(): redux = np.float64(1.0) - with omp("parallel for reduction(*:redux)"): + with openmp("parallel for reduction(*:redux)"): for i in range(10): redux *= np.float64(2.0) return redux @@ -1359,7 +1360,7 @@ def test_parallel_for_reduction_add_fp32(self): @njit def test_impl(): redux = np.float32(0.0) - with omp("parallel for reduction(+:redux)"): + with openmp("parallel for reduction(+:redux)"): for i in range(10): redux += np.float32(1.0) return redux @@ -1371,7 +1372,7 @@ def test_parallel_for_reduction_sub_fp32(self): @njit def test_impl(): redux = np.float32(0.0) - with omp("parallel for reduction(-:redux)"): + with openmp("parallel for reduction(-:redux)"): for i in range(10): redux += np.float32(1.0) return redux @@ -1383,7 +1384,7 @@ def test_parallel_for_reduction_mul_fp32(self): @njit def test_impl(): redux = np.float32(1.0) - with omp("parallel for reduction(*:redux)"): + with openmp("parallel for reduction(*:redux)"): for i in range(10): redux *= np.float32(2.0) return redux @@ -1396,7 +1397,7 @@ def test_parallel_reduction_add_int_10(self): def test_impl(): redux = 10 nthreads = 0 - with omp("parallel reduction(+:redux)"): + with openmp("parallel reduction(+:redux)"): thread_id = omp_get_thread_num() if thread_id == 0: nthreads = omp_get_num_threads() @@ -1412,7 +1413,7 @@ def test_parallel_reduction_add_fp32_10(self): def test_impl(): redux = np.float32(10.0) nthreads = np.float32(0.0) - with omp("parallel reduction(+:redux)"): + with openmp("parallel reduction(+:redux)"): thread_id = omp_get_thread_num() if thread_id == 0: nthreads = omp_get_num_threads() @@ -1428,7 +1429,7 @@ def test_parallel_reduction_add_fp64_10(self): def test_impl(): redux = np.float64(10.0) nthreads = np.float64(0.0) - with omp("parallel reduction(+:redux)"): + with openmp("parallel reduction(+:redux)"): thread_id = omp_get_thread_num() if thread_id == 0: nthreads = omp_get_num_threads() @@ -1443,7 +1444,7 @@ def test_parallel_for_reduction_add_int_10(self): @njit def test_impl(): redux = 10 - with omp("parallel for reduction(+:redux)"): + with openmp("parallel for reduction(+:redux)"): for i in range(10): redux += 1 return redux @@ -1455,7 +1456,7 @@ def test_parallel_for_reduction_add_fp32(self): @njit def test_impl(): redux = np.float32(0.0) - with omp("parallel for reduction(+:redux)"): + with openmp("parallel for reduction(+:redux)"): for i in range(10): redux += np.float32(1.0) return redux @@ -1467,7 +1468,7 @@ def test_parallel_for_reduction_add_fp64_10(self): @njit def test_impl(): redux = np.float64(10.0) - with omp("parallel for reduction(+:redux)"): + with openmp("parallel for reduction(+:redux)"): for i in range(10): redux += np.float64(1.0) return redux @@ -1485,7 +1486,7 @@ def test_default_none(self): def test_impl(N): a = np.zeros(N, dtype=np.int32) x = 7 - with omp("parallel for default(none)"): + with openmp("parallel for default(none)"): for i in range(N): y = i + x a[i] = y @@ -1504,16 +1505,16 @@ def test_impl(N, M): y = np.zeros(N) z = 3.14 i = 7 - with omp("parallel private(i)"): + with openmp("parallel private(i)"): yn = M + 1 zs = z - with omp("for"): + with openmp("for"): for i in range(N): y[i] = yn + 2 * (i + 1) - with omp("for"): + with openmp("for"): for i in range(N): x[i] = y[i] - i - with omp("critical"): + with openmp("critical"): z += 3 return x, y, zs, z, i @@ -1540,10 +1541,10 @@ def test_impl(): vals = np.zeros(NTHREADS) valsfp = np.zeros(NTHREADS) - with omp("""parallel private(x) shared(zsh) + with 
openmp("""parallel private(x) shared(zsh) firstprivate(zfp) private(ID)"""): ID = omp_get_thread_num() - with omp("single"): + with openmp("single"): nsing = nsing + 1 numthrds = omp_get_num_threads() if y != 3: @@ -1555,7 +1556,7 @@ def test_impl(): ) # verify each thread sees the same variable vsh - with omp("critical"): + with openmp("critical"): zsh = zsh + ID # test first private @@ -1608,7 +1609,7 @@ def test_privates(self): def test_impl(N): a = np.zeros(N, dtype=np.int32) x = 7 - with omp("""parallel for firstprivate(x) private(y) + with openmp("""parallel for firstprivate(x) private(y) lastprivate(zzzz) private(private_index) shared(a) firstprivate(N) default(none)"""): for private_index in range(N): @@ -1624,7 +1625,7 @@ def test_private_retain_value(self): @njit def test_impl(): x = 5 - with omp("parallel private(x)"): + with openmp("parallel private(x)"): x = 13 return x @@ -1633,7 +1634,7 @@ def test_impl(): def test_private_retain_value_param(self): @njit def test_impl(x): - with omp("parallel private(x)"): + with openmp("parallel private(x)"): x = 13 return x @@ -1643,8 +1644,8 @@ def test_private_retain_value_for(self): @njit def test_impl(): x = 5 - with omp("parallel private(x)"): - with omp("for"): + with openmp("parallel private(x)"): + with openmp("for"): for i in range(10): x = i return x @@ -1654,8 +1655,8 @@ def test_impl(): def test_private_retain_value_for_param(self): @njit def test_impl(x): - with omp("parallel private(x)"): - with omp("for"): + with openmp("parallel private(x)"): + with openmp("for"): for i in range(10): x = i return x @@ -1666,7 +1667,7 @@ def test_private_retain_value_combined_for(self): @njit def test_impl(): x = 5 - with omp("parallel for private(x)"): + with openmp("parallel for private(x)"): for i in range(10): x = i return x @@ -1676,7 +1677,7 @@ def test_impl(): def test_private_retain_value_combined_for_param(self): @njit def test_impl(x): - with omp("parallel for private(x)"): + with openmp("parallel for private(x)"): for i in range(10): x = i return x @@ -1688,7 +1689,7 @@ def test_private_retain_two_values(self): def test_impl(): x = 5 y = 7 - with omp("parallel private(x,y)"): + with openmp("parallel private(x,y)"): x = 13 y = 40 return x, y @@ -1699,11 +1700,11 @@ def test_private_retain_array(self): @njit def test_impl(N, x): a = np.ones(N) - with omp("parallel private(a)"): - with omp("single"): + with openmp("parallel private(a)"): + with openmp("single"): sa = a a = np.zeros(N) - with omp("for"): + with openmp("for"): for i in range(N): a[i] = x return a, sa @@ -1717,7 +1718,7 @@ def test_private_divide_work(self): def test_impl(v, npoints): omp_set_num_threads(3) - with omp("""parallel default(shared) + with openmp("""parallel default(shared) private(iam,nt,ipoints,istart)"""): iam = omp_get_thread_num() nt = omp_get_num_threads() @@ -1734,7 +1735,7 @@ def test_impl(v, npoints): def test_firstprivate(self): @njit def test_impl(x, y): - with omp("parallel firstprivate(x)"): + with openmp("parallel firstprivate(x)"): xs = x x = y return xs, x @@ -1747,7 +1748,7 @@ def test_lastprivate_for(self): def test_impl(N): a = np.zeros(N) si = 0 - with omp("parallel for lastprivate(si)"): + with openmp("parallel for lastprivate(si)"): for i in range(N): si = i + 1 a[i] = si @@ -1763,7 +1764,7 @@ def test_lastprivate_non_one_step(self): def test_impl(n1, n2, s): a = np.zeros(math.ceil((n2 - n1) / s)) rl = np.arange(n1, n2, s) - with omp("parallel for lastprivate(si)"): + with openmp("parallel for lastprivate(si)"): for i in 
range(len(rl)): si = rl[i] + 1 a[i] = si @@ -1780,32 +1781,32 @@ def test_lastprivate_sections(self): @njit def test_impl(N2, si): a = np.zeros(N2) - with omp("parallel shared(sis1)"): - with omp("sections lastprivate(si)"): + with openmp("parallel shared(sis1)"): + with openmp("sections lastprivate(si)"): sis1 = si # N1 = number of sections - with omp("section"): + with openmp("section"): si = 0 - with omp("section"): + with openmp("section"): si = 1 - with omp("section"): + with openmp("section"): si = 2 sis2 = si - with omp("sections lastprivate(si)"): + with openmp("sections lastprivate(si)"): # N2 = number of sections - with omp("section"): + with openmp("section"): i = 0 si = N2 - i a[i] = si - with omp("section"): + with openmp("section"): i = 1 si = N2 - i a[i] = si - with omp("section"): + with openmp("section"): i = 2 si = N2 - i a[i] = si - with omp("section"): + with openmp("section"): i = 3 si = N2 - i a[i] = si @@ -1824,8 +1825,8 @@ def test_lastprivate_conditional(self): def test_impl(N, c1, c2): a = np.arange(0, N * 2, c2) num = 0 - with omp("parallel"): - with omp("for lastprivate(conditional: num)"): + with openmp("parallel"): + with openmp("for lastprivate(conditional: num)"): for i in range(N): if i < c1: num = a[i] + c2 @@ -1842,14 +1843,14 @@ def test_impl(N, c): a = np.zeros(N) ra = np.zeros(N) val = 0 - with omp("threadprivate(val)"): + with openmp("threadprivate(val)"): pass - with omp("parallel private(tn, sn)"): + with openmp("parallel private(tn, sn)"): tn = omp_get_thread_num() sn = c + tn val = sn a[tn] = sn - with omp("parallel private(tn)"): + with openmp("parallel private(tn)"): tn = omp_get_thread_num() ra[tn] = 1 if val == a[tn] else 0 return ra @@ -1864,15 +1865,15 @@ def test_impl(nt, n1, n2, n3): xsa1 = np.zeros(nt) xsa2 = np.zeros(nt) x = n1 - with omp("threadprivate(x)"): + with openmp("threadprivate(x)"): pass x = n2 - with omp("parallel num_threads(nt) copyin(x) private(tn)"): + with openmp("parallel num_threads(nt) copyin(x) private(tn)"): tn = omp_get_thread_num() xsa1[tn] = x if tn == 0: x = n3 - with omp("parallel copyin(x)"): + with openmp("parallel copyin(x)"): xsa2[omp_get_thread_num()] = x return xsa1, xsa2 @@ -1889,15 +1890,15 @@ def test_impl(nt1, nt2, mt, n1, n2, n3): xsa1 = np.zeros(nt1) xsa2 = np.zeros(nt2) x = n1 - with omp("threadprivate(x)"): + with openmp("threadprivate(x)"): pass x = n2 - with omp("parallel num_threads(nt1) copyin(x) private(tn)"): + with openmp("parallel num_threads(nt1) copyin(x) private(tn)"): tn = omp_get_thread_num() xsa1[tn] = x if tn == mt: x = n3 - with omp("parallel num_threads(nt2) copyin(x)"): + with openmp("parallel num_threads(nt2) copyin(x)"): xsa2[omp_get_thread_num()] = x return xsa1, xsa2 @@ -1915,8 +1916,8 @@ def test_impl(nt, n1, n2, n3): xsa = np.zeros(nt) ar = np.zeros(nt) omp_set_num_threads(nt) - with omp("parallel firstprivate(x, a) private(tn)"): - with omp("single copyprivate(x, a)"): + with openmp("parallel firstprivate(x, a) private(tn)"): + with openmp("single copyprivate(x, a)"): x = n2 a = np.full(nt, n3) tn = omp_get_thread_num() @@ -1937,7 +1938,7 @@ def test_impl(N): b = np.zeros(N // 2) linearj = 0 - with omp("parallel for linear(linearj:1)"): + with openmp("parallel for linear(linearj:1)"): for i in range(0, N, 2): b[linearj] = a[i] * 2 @@ -1959,7 +1960,7 @@ def __init__(self, *args): def test_parallel_for_no_for_loop(self): @njit def test_impl(): - with omp("parallel for"): + with openmp("parallel for"): pass with self.assertRaises(ParallelForWrongLoopCount) as raises: @@ 
-1973,7 +1974,7 @@ def test_parallel_for_multiple_for_loops(self): @njit def test_impl(): a = np.zeros(4) - with omp("parallel for"): + with openmp("parallel for"): for i in range(2): a[i] = 1 for i in range(2, 4): @@ -1990,7 +1991,7 @@ def test_statement_before_parallel_for(self): @njit def test_impl(): a = np.zeros(4) - with omp("parallel for"): + with openmp("parallel for"): print("Fail") for i in range(4): a[i] = i @@ -2004,7 +2005,7 @@ def test_statement_after_parallel_for(self): @njit def test_impl(): a = np.zeros(4) - with omp("parallel for"): + with openmp("parallel for"): for i in range(4): a[i] = i print("Fail") @@ -2019,7 +2020,7 @@ def test_nonstring_var_omp_statement(self): @njit def test_impl(v): ovar = 7 - with omp(ovar): + with openmp(ovar): for i in range(len(v)): v[i] = 1.0 return v @@ -2033,7 +2034,7 @@ def test_parallel_for_nonconst_var_omp_statement(self): def test_impl(v): ovar = "parallel " ovar += "for" - with omp(ovar): + with openmp(ovar): for i in range(len(v)): v[i] = 1.0 return v @@ -2048,10 +2049,10 @@ def test_impl(v): # @njit # def test_impl(): # n = 0 - # with omp("parallel"): + # with openmp("parallel"): # half_threads = omp_get_num_threads()//2 # if omp_get_thread_num() < half_threads: - # with omp("for reduction(+:n)"): + # with openmp("for reduction(+:n)"): # for _ in range(half_threads): # n += 1 # return n @@ -2065,11 +2066,11 @@ def test_parallel_for_delaying_condition(self): @njit def test_impl(): n = 0 - with omp("parallel private(lc)"): + with openmp("parallel private(lc)"): lc = 0 while lc < omp_get_thread_num(): lc += 1 - with omp("for reduction(+:n)"): + with openmp("for reduction(+:n)"): for _ in range(omp_get_num_threads()): n += 1 return n @@ -2080,7 +2081,7 @@ def test_parallel_for_nowait(self): @njit def test_impl(nt): a = np.zeros(nt) - with omp("parallel for num_threads(nt) nowait"): + with openmp("parallel for num_threads(nt) nowait"): for i in range(nt): a[omp_get_thread_num] = i return a @@ -2093,8 +2094,8 @@ def test_parallel_double_num_threads(self): @njit def test_impl(nt1, nt2): count = 0 - with omp("parallel num_threads(nt1) num_threads(nt2)"): - with omp("critical"): + with openmp("parallel num_threads(nt1) num_threads(nt2)"): + with openmp("critical"): count += 1 print(count) return count @@ -2108,14 +2109,14 @@ def test_impl(nt): hp = nt // 2 a = np.zeros(hp) b = np.zeros(nt - hp) - with omp("parallel num_threads(nt) private(tn)"): + with openmp("parallel num_threads(nt) private(tn)"): tn = omp_get_thread_num() if tn < hp: - with omp("barrier"): + with openmp("barrier"): pass a[tn] = 1 else: - with omp("barrier"): + with openmp("barrier"): pass b[tn - hp] = 1 return a, b @@ -2129,10 +2130,10 @@ def test_closely_nested_for_loops(self): @njit def test_impl(N): a = np.zeros((N, N)) - with omp("parallel"): - with omp("for"): + with openmp("parallel"): + with openmp("for"): for i in range(N): - with omp("for"): + with openmp("for"): for j in range(N): a[i][j] = 1 return a @@ -2145,10 +2146,10 @@ def test_nested_critical(self): @njit def test_impl(): num = 0 - with omp("parallel"): - with omp("critical"): + with openmp("parallel"): + with openmp("critical"): num += 1 - with omp("critical"): + with openmp("critical"): num -= 1 return num @@ -2164,7 +2165,7 @@ def test_parallel_region(self): @njit def test_impl(): a = 1 - with omp("parallel"): + with openmp("parallel"): a += 1 test_impl() @@ -2174,8 +2175,8 @@ def test_single(self): def test_impl(nt): omp_set_num_threads(nt) a = np.zeros(4, dtype=np.int64) - with omp("parallel"): - 
with omp("single"): + with openmp("parallel"): + with openmp("single"): a[0] += 1 return a @@ -2187,8 +2188,8 @@ def test_master(self): def test_impl(nt): omp_set_num_threads(nt) a = np.ones(4, dtype=np.int64) - with omp("parallel"): - with omp("master"): + with openmp("parallel"): + with openmp("master"): a[0] += omp_get_thread_num() return a @@ -2201,12 +2202,12 @@ def test_impl(N, iters): count = 0 p = 0 sum = 0 - with omp("parallel"): - with omp("barrier"): + with openmp("parallel"): + with openmp("barrier"): pass - with omp("for private(p, sum)"): + with openmp("for private(p, sum)"): for _ in range(iters): - with omp("critical"): + with openmp("critical"): p = count sum = 0 for i in range(10000): @@ -2227,11 +2228,11 @@ def test_impl(N): omp_set_num_threads(N) ca = np.zeros(N) sum = 0 - with omp("parallel private(sum) shared(c)"): + with openmp("parallel private(sum) shared(c)"): c = N - with omp("barrier"): + with openmp("barrier"): pass - with omp("critical"): + with openmp("critical"): ca[omp_get_thread_num()] = c - 1 # Sleep sum = 0 @@ -2251,12 +2252,12 @@ def test_critical_result(self): def test_impl(N): omp_set_num_threads(N) count = 0 - with omp("parallel"): + with openmp("parallel"): if omp_get_thread_num() < N // 2: - with omp("critical"): + with openmp("critical"): count += 1 else: - with omp("critical"): + with openmp("critical"): count += 1 return count @@ -2270,11 +2271,11 @@ def test_impl(N): omp_set_num_threads(N) a = np.zeros((2, N)) sa = np.zeros(N) - with omp("parallel private(a0c, sum, tn)"): + with openmp("parallel private(a0c, sum, tn)"): tn = omp_get_thread_num() - with omp("barrier"): + with openmp("barrier"): pass - with omp("critical (a)"): + with openmp("critical (a)"): # Sleep sum = 0 for j in range(1000): @@ -2283,7 +2284,7 @@ def test_impl(N): else: sum -= 1 a[0][tn] = 1 + sum - with omp("critical (b)"): + with openmp("critical (b)"): a0c = np.copy(a[0]) # Sleep sum = 0 @@ -2308,12 +2309,12 @@ def test_impl(N): # count = 0 # p = 0 # sum = 0 - # with omp("parallel"): - # with omp("barrier"): + # with openmp("parallel"): + # with openmp("barrier"): # pass - # with omp("for private(p, sum)"): + # with openmp("for private(p, sum)"): # for _ in range(iters): - # with omp("atomic"): + # with openmp("atomic"): # p = count # sum = 0 # for i in range(10000): @@ -2333,19 +2334,19 @@ def test_atomic(self): def test_impl(nt, N, c): omp_set_num_threads(nt) a = np.zeros(N) - with omp("parallel for private(b, index)"): + with openmp("parallel for private(b, index)"): for i in range(nt): b = 0 index = i % N - with omp("atomic write"): + with openmp("atomic write"): a[index] = nt % c - with omp("barrier"): + with openmp("barrier"): pass - with omp("atomic read"): + with openmp("atomic read"): b = a[index - 1] + index - with omp("barrier"): + with openmp("barrier"): pass - with omp("atomic update"): + with openmp("atomic update"): a[index] += b return a @@ -2382,15 +2383,15 @@ def test_impl(nt, N, c): a = np.zeros(s) sva = np.zeros(N) tns = np.zeros(N) - with omp("parallel for num_threads(nt) private(sv, index)"): + with openmp("parallel for num_threads(nt) private(sv, index)"): for i in range(N): index = i % s tns[i] = omp_get_thread_num() - with omp("atomic write"): + with openmp("atomic write"): a[index] = index * c + 1 - with omp("barrier"): + with openmp("barrier"): pass - with omp("atomic capture"): + with openmp("atomic capture"): sv = a[index - 1] a[index - 1] += sv + (tns[i] % c + 1) # sva[index] = sv @@ -2414,11 +2415,11 @@ def test_impl(nt): ta1 = np.zeros(nt) 
secpa = np.zeros(nt) - with omp("parallel sections num_threads(nt)"): - with omp("section"): + with openmp("parallel sections num_threads(nt)"): + with openmp("section"): ta0[omp_get_thread_num()] += 1 secpa[0] = omp_in_parallel() - with omp("section"): + with openmp("section"): ta1[omp_get_thread_num()] += 1 secpa[1] = omp_in_parallel() print(ta0, ta1) @@ -2440,15 +2441,15 @@ def test_impl(nt, iters, c): x = iters // c iters = x * c sum = 0 - with omp("parallel num_threads(nt) private(tn, sum)"): + with openmp("parallel num_threads(nt) private(tn, sum)"): tn = omp_get_thread_num() - with omp("critical"): + with openmp("critical"): sum = 0 for i in range(iters): if i % x == 0: sum += 1 a[tn] = sum - with omp("barrier"): + with openmp("barrier"): pass for j in range(nt): ac[tn][j] = a[j] @@ -2467,9 +2468,9 @@ def test_impl(nt, iters, c): # b = np.zeros(n) # ac = np.zeros((nt, n)) # sum = 0 - # with omp("parallel num_threads(nt) private(tn)"): + # with openmp("parallel num_threads(nt) private(tn)"): # tn = omp_get_thread_num() - # with omp("for nowait schedule(static) private(sum)"): + # with openmp("for nowait schedule(static) private(sum)"): # for i in range(n): # # Sleep # sum = 0 @@ -2481,7 +2482,7 @@ def test_impl(nt, iters, c): # a[i] = i * c1 + sum # for j in range(nt): # ac[tn][j] = a[j] - # with omp("for schedule(static)"): + # with openmp("for schedule(static)"): # for i in range(n): # b[i] = a[i] + c2 # return b, ac @@ -2496,11 +2497,11 @@ def test_impl(nt, iters, c): # def test_impl(n, m, a, b, y, z): # omp_set_num_threads(5) # - # with omp("parallel"): - # with omp("for nowait"): + # with openmp("parallel"): + # with openmp("for nowait"): # for i in range(1, n): # b[i] = (a[i] + a[i-1]) / 2.0 - # with omp("for nowait"): + # with openmp("for nowait"): # for i in range(m): # y[i] = math.sqrt(z[i]) # @@ -2516,9 +2517,9 @@ def test_impl(nt): omp_set_nested(1) omp_set_dynamic(0) a = np.zeros((nt, nt), dtype=np.int32) - with omp("parallel for"): + with openmp("parallel for"): for i in range(nt): - with omp("parallel for"): + with openmp("parallel for"): for j in range(nt): a[i][j] = omp_get_thread_num() return a @@ -2535,12 +2536,12 @@ def test_impl(nt1, nt2): omp_set_max_active_levels(2) ca = np.zeros(nt1) omp_set_num_threads(nt1) - with omp("parallel private(tn)"): + with openmp("parallel private(tn)"): tn = omp_get_thread_num() - with omp("parallel num_threads(3)"): - with omp("critical"): + with openmp("parallel num_threads(3)"): + with openmp("critical"): ca[tn] += 1 - with omp("single"): + with openmp("single"): ats = omp_get_ancestor_thread_num(1) == tn ts = omp_get_team_size(1) return ca, ats, ts @@ -2569,10 +2570,10 @@ def test_impl(mal, n1, n2, n3): omp_set_num_threads(n1) a = np.zeros((n2, 6), dtype=np.int32) b = np.zeros((n1, 6), dtype=np.int32) - with omp("parallel"): + with openmp("parallel"): omp_set_num_threads(n2) - with omp("single"): - with omp("parallel"): + with openmp("single"): + with openmp("parallel"): omp_set_num_threads(n3) set_array(a) set_array(b) @@ -2593,7 +2594,7 @@ def test_impl(N): omp_set_dynamic(0) omp_set_num_threads(N) a = np.zeros((N, 2), dtype=np.int32) - with omp("parallel private(tn)"): + with openmp("parallel private(tn)"): tn = omp_get_thread_num() a[tn][0] = 1 a[tn][1] = 2 @@ -2612,10 +2613,10 @@ def __init__(self, *args): def test_task_basic(self): def test_impl(ntsks): a = np.zeros(ntsks) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): for i in range(ntsks): - with omp("task"): + with 
openmp("task"): a[i] = 1 return a @@ -2626,10 +2627,10 @@ def test_task_thread_assignment(self): @njit def test_impl(ntsks): a = np.empty(ntsks) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): for i in range(ntsks): - with omp("task"): + with openmp("task"): a[i] = omp_get_thread_num() return a @@ -2641,14 +2642,14 @@ def test_task_data_sharing_default(self): @njit def test_impl(n1, n2): x = n1 - with omp("parallel private(y)"): + with openmp("parallel private(y)"): y = n1 - with omp("single"): - with omp("task"): + with openmp("single"): + with openmp("task"): xa = x == n1 ya = y == n1 x, y = n2, n2 - with omp("taskwait"): + with openmp("taskwait"): ysave = y return (x, ysave), (xa, ya) @@ -2662,10 +2663,10 @@ def test_task_single_implicit_barrier(self): @njit def test_impl(ntsks): a = np.zeros(ntsks) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): for i in range(ntsks): - with omp("task private(sum)"): + with openmp("task private(sum)"): # Sleep sum = 0 for j in range(10000): @@ -2674,7 +2675,7 @@ def test_impl(ntsks): else: sum -= 1 a[i] = 1 + sum - # with omp("barrier"): + # with openmp("barrier"): # pass sa = np.copy(a) return sa @@ -2688,10 +2689,10 @@ def test_task_single_nowait(self): @njit def test_impl(ntsks): a = np.zeros(ntsks) - with omp("parallel"): - with omp("single nowait"): + with openmp("parallel"): + with openmp("single nowait"): for i in range(ntsks): - with omp("task private(sum)"): + with openmp("task private(sum)"): sum = 0 for j in range(10000): if j % 2 == 0: @@ -2715,14 +2716,14 @@ def test_impl(nt): omp_set_num_threads(nt) a = np.zeros((nt + 1) * nt / 2) # a = np.zeros(10) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): for tn in range(nt): - with omp("task"): + with openmp("task"): for i in range(tn + 1): - with omp("task"): + with openmp("task"): a[i] = omp_get_thread_num() + 1 - with omp("barrier"): + with openmp("barrier"): ret = np.all(a) return ret @@ -2731,10 +2732,10 @@ def test_impl(nt): def test_taskwait(self): def test_impl(ntsks): a = np.zeros(ntsks) - with omp("parallel private(i)"): - with omp("single"): + with openmp("parallel private(i)"): + with openmp("single"): for i in range(ntsks): - with omp("task private(sum) private(j)"): + with openmp("task private(sum) private(j)"): sum = 0 for j in range(10000): if j % 2 == 0: @@ -2742,7 +2743,7 @@ def test_impl(ntsks): else: sum -= 1 a[i] = 1 + sum - with omp("taskwait"): + with openmp("taskwait"): ret = np.all(a) return ret @@ -2754,13 +2755,13 @@ def test_taskwait_descendants(self): def test_impl(ntsks, dtsks): a = np.zeros(ntsks) da = np.zeros((ntsks, dtsks)) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): for i in range(ntsks): - with omp("task"): + with openmp("task"): a[i] = 1 for j in range(dtsks): - with omp("task private(sum)"): + with openmp("task private(sum)"): sum = 0 for k in range(10000): if k % 2 == 0: @@ -2768,10 +2769,10 @@ def test_impl(ntsks, dtsks): else: sum -= 1 da[i][j] = 1 + sum - with omp("taskwait"): + with openmp("taskwait"): ac = np.copy(a) dac = np.copy(da) - with omp("barrier"): + with openmp("barrier"): pass return ac, dac @@ -2784,10 +2785,10 @@ def test_impl(ntsks, dtsks): def test_undeferred_task(self): @njit def test_impl(): - with omp("parallel"): + with openmp("parallel"): flag = 1 - with omp("single"): - with omp("task if(1) private(sum)"): + with openmp("single"): + 
with openmp("task if(1) private(sum)"): sum = 0 for i in range(10000): if i % 2 == 0: @@ -2806,12 +2807,12 @@ def test_untied_task_thread_assignment(self): def test_impl(ntsks): start_nums = np.zeros(ntsks) current_nums = np.zeros(ntsks) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): for i in range(ntsks): - with omp("task untied private(sum)"): + with openmp("task untied private(sum)"): start_nums[i] = omp_get_thread_num() - with omp("task if(0) shared(sum)"): + with openmp("task if(0) shared(sum)"): # Sleep sum = 0 for j in range(10000): @@ -2820,7 +2821,7 @@ def test_impl(ntsks): else: sum -= 1 current_nums[i] = omp_get_thread_num() + sum - with omp("barrier"): + with openmp("barrier"): pass return start_nums, current_nums @@ -2835,10 +2836,10 @@ def test_impl(ntsks): start_nums = np.zeros(ntsks) finish_nums = np.zeros(ntsks) yielded_tasks = np.zeros(ntsks) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): for i in range(ntsks): - with omp("task private(stn, start_i, finish_i, diff)"): + with openmp("task private(stn, start_i, finish_i, diff)"): stn = omp_get_thread_num() start_i = np.where(start_nums == stn)[0] finish_i = np.where(finish_nums == stn)[0] @@ -2852,10 +2853,10 @@ def test_impl(ntsks): for dindex in diff[diff != 0]: yielded_tasks[dindex] = 1 start_nums[i] = stn - with omp("taskyield"): + with openmp("taskyield"): pass finish_nums[i] = omp_get_thread_num() - with omp("barrier"): + with openmp("barrier"): pass return yielded_tasks @@ -2869,16 +2870,16 @@ def test_impl(ntsks, c): final_nums = np.zeros(ntsks) included_nums = np.zeros(ntsks) da = np.zeros(ntsks) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): for i in range(ntsks): - with omp("task final(i>c) private(sum, d)"): + with openmp("task final(i>c) private(sum, d)"): ftask_num = i final_nums[ftask_num] = omp_get_thread_num() # If it is a final task, generate an included task if ftask_num > c: d = 1 - with omp("task private(sum)"): + with openmp("task private(sum)"): itask_num = ftask_num # Sleep sum = 0 @@ -2903,13 +2904,13 @@ def test_taskgroup(self): @njit def test_impl(ntsks, dtsks): a = np.zeros(ntsks) - with omp("parallel"): - with omp("single"): - with omp("taskgroup"): + with openmp("parallel"): + with openmp("single"): + with openmp("taskgroup"): for i in range(ntsks): - with omp("task"): + with openmp("task"): for _ in range(dtsks): - with omp("task"): + with openmp("task"): # Sleep sum = 0 for j in range(10000): @@ -2932,10 +2933,10 @@ def test_task_priority(self): def test_impl(ntsks): a = np.zeros(ntsks) count = 0 - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): for i in range(ntsks): - with omp("task priority(i)"): + with openmp("task priority(i)"): count += i + 1 a[i] = count return a @@ -2952,12 +2953,12 @@ def test_task_mergeable(self): @njit def test_impl(ntsks, c1, c2): a = np.zeros(ntsks) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): for i in range(ntsks): - with omp("task private(x)"): + with openmp("task private(x)"): x = c1 - with omp("task mergeable if(0)"): + with openmp("task mergeable if(0)"): x = c2 a[i] = x return a @@ -2970,23 +2971,23 @@ def test_task_depend(self): def test_impl(ntsks): a = np.zeros(ntsks) da = np.zeros(ntsks) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): for i in 
range(ntsks): - with omp("task private(x, done)"): + with openmp("task private(x, done)"): x = 1 done = False - with omp("task shared(x) depend(out: x)"): + with openmp("task shared(x) depend(out: x)"): x = 5 - with omp("""task shared(done, x) + with openmp("""task shared(done, x) depend(out: done) depend(inout: x)"""): x += i done = True - with omp("""task shared(done, x) + with openmp("""task shared(done, x) depend(in: done) depend(inout: x)"""): x *= i da[i] = 1 if done else 0 - with omp("task shared(x) depend(in: x)"): + with openmp("task shared(x) depend(in: x)"): a[i] = x return a, da @@ -2997,14 +2998,14 @@ def test_impl(ntsks): def test_task_affinity(self): def test_impl(ntsks, const): a = np.zeros(ntsks) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): for i in range(ntsks): - with omp("task firstprivate(i)"): - with omp("""task shared(b) depend(out: b) + with openmp("task firstprivate(i)"): + with openmp("""task shared(b) depend(out: b) affinity(a)"""): b = np.full(i, const) - with omp("""task shared(b) depend(in: b) + with openmp("""task shared(b) depend(in: b) affinity(a)"""): a[i] = np.sum(b) return a @@ -3017,17 +3018,17 @@ def test_impl(mode): return b = np.zeros(100) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): a = np.ones(100) c = 0 d = 0 if mode > 1: - with omp("task shared(a, c)"): + with openmp("task shared(a, c)"): c = a.sum() - with omp("task shared(a, d)"): + with openmp("task shared(a, d)"): d = a.sum() - with omp("taskwait"): + with openmp("taskwait"): b[:] = c + d return b @@ -3045,9 +3046,9 @@ def __init__(self, *args): def test_taskloop_basic(self): def test_impl(ntsks): a = np.zeros(ntsks) - with omp("parallel"): - with omp("single"): - with omp("taskloop"): + with openmp("parallel"): + with openmp("single"): + with openmp("taskloop"): for i in range(ntsks): a[i] = 1 return a @@ -3058,9 +3059,9 @@ def test_taskloop_num_tasks(self): @njit def test_impl(nt, iters, ntsks): a = np.zeros(ntsks) - with omp("parallel num_threads(nt)"): - with omp("single"): - with omp("taskloop num_tasks(ntsks)"): + with openmp("parallel num_threads(nt)"): + with openmp("single"): + with openmp("taskloop num_tasks(ntsks)"): for i in range(iters): a[i] = omp_get_thread_num() return a @@ -3072,10 +3073,10 @@ def test_taskloop_grainsize(self): @njit def test_impl(nt, iters, ntsks): a = np.zeros(ntsks) - with omp("parallel num_threads(nt)"): - with omp("single"): + with openmp("parallel num_threads(nt)"): + with openmp("single"): iters_per_task = iters // ntsks - with omp("taskloop grainsize(iters_per_task)"): + with openmp("taskloop grainsize(iters_per_task)"): for i in range(iters): a[i] = omp_get_thread_num() return a @@ -3088,14 +3089,14 @@ def test_taskloop_nogroup(self): def test_impl(ntsks): a = np.zeros(ntsks) sa = np.zeros(ntsks) - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): s = 0 - with omp("taskloop nogroup num_tasks(ntsks)"): + with openmp("taskloop nogroup num_tasks(ntsks)"): for i in range(ntsks): a[i] = 1 sa[i] = s - with omp("task priority(1)"): + with openmp("task priority(1)"): s = 1 return a, sa @@ -3111,9 +3112,9 @@ def test_impl(ntsks, nt): sl = np.zeros(ntsks) tl = np.zeros(ntsks) omp_set_num_threads(nt) - with omp("parallel"): - with omp("single"): - with omp("taskloop collapse(2) num_tasks(ntsks)"): + with openmp("parallel"): + with openmp("single"): + with openmp("taskloop collapse(2) num_tasks(ntsks)"): for i in 
range(ntsks): fl[i] = omp_get_thread_num() for j in range(1): @@ -3162,7 +3163,7 @@ def target_nowait(self, device): @njit def test_impl(): - with omp(target_pragma): + with openmp(target_pragma): a = 0 for i in range(1000000): for j in range(1000000): @@ -3178,8 +3179,8 @@ def target_nest_parallel_default_threadlimit(self, device): def test_impl(): teams = 0 threads = 0 - with omp(target_pragma): - with omp("parallel"): + with openmp(target_pragma): + with openmp("parallel"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: @@ -3198,8 +3199,8 @@ def target_nest_parallel_set_numthreads(self, device): def test_impl(): teams = 0 threads = 0 - with omp(target_pragma): - with omp("parallel num_threads(32)"): + with openmp(target_pragma): + with openmp("parallel num_threads(32)"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: @@ -3218,8 +3219,8 @@ def target_nest_teams_default_numteams(self, device): def test_impl(): teams = 0 threads = 0 - with omp(target_pragma): - with omp("teams"): + with openmp(target_pragma): + with openmp("teams"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: @@ -3246,8 +3247,8 @@ def target_nest_teams_set_numteams(self, device): def test_impl(): teams = 0 threads = 0 - with omp(target_pragma): - with omp("teams num_teams(32)"): + with openmp(target_pragma): + with openmp("teams num_teams(32)"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: @@ -3271,9 +3272,9 @@ def target_nest_teams_nest_parallel_default_numteams_threadlimit(self, device): def test_impl(): teams = 0 threads = 0 - with omp(target_pragma): - with omp("teams"): - with omp("parallel"): + with openmp(target_pragma): + with openmp("teams"): + with openmp("parallel"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: @@ -3300,9 +3301,9 @@ def target_nest_teams_nest_parallel_set_numteams(self, device): def test_impl(): teams = 0 threads = 0 - with omp(target_pragma): - with omp("teams num_teams(32)"): - with omp("parallel"): + with openmp(target_pragma): + with openmp("teams num_teams(32)"): + with openmp("parallel"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: @@ -3326,9 +3327,9 @@ def target_nest_teams_nest_parallel_set_threadlimit(self, device): def test_impl(): teams = 0 threads = 0 - with omp(target_pragma): - with omp("teams thread_limit(32)"): - with omp("parallel"): + with openmp(target_pragma): + with openmp("teams thread_limit(32)"): + with openmp("parallel"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: @@ -3355,9 +3356,9 @@ def target_nest_teams_nest_parallel_set_numteams_threadlimit(self, device): def test_impl(): teams = 0 threads = 0 - with omp(target_pragma): - with omp("teams num_teams(32) thread_limit(32)"): - with omp("parallel"): + with openmp(target_pragma): + with openmp("teams num_teams(32) thread_limit(32)"): + with openmp("parallel"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: @@ -3385,9 +3386,9 @@ def target_nest_teams_nest_parallel_set_numteams_threadlimit_gt_numthreads( def test_impl(): teams = 0 threads = 0 - with omp(target_pragma): - with omp("teams num_teams(32) thread_limit(64)"): - with omp("parallel num_threads(32)"): + with openmp(target_pragma): + with openmp("teams num_teams(32) 
thread_limit(64)"): + with openmp("parallel num_threads(32)"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: @@ -3415,10 +3416,10 @@ def target_nest_teams_nest_parallel_set_numteams_threadlimit_lt_numthreads( def test_impl(): teams = 0 threads = 0 - with omp(target_pragma): + with openmp(target_pragma): # THREAD_LIMIT takes precedence over NUM_THREADS. - with omp("teams num_teams(32) thread_limit(64)"): - with omp("parallel num_threads(128)"): + with openmp("teams num_teams(32) thread_limit(64)"): + with openmp("parallel num_threads(128)"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: @@ -3448,14 +3449,14 @@ def test_impl(): threads1 = 0 teams2 = 0 threads2 = 0 - with omp(target_pragma): - with omp("parallel num_threads(32)"): + with openmp(target_pragma): + with openmp("parallel num_threads(32)"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: teams1 = omp_get_num_teams() threads1 = omp_get_num_threads() - with omp("parallel num_threads(256)"): + with openmp("parallel num_threads(256)"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: @@ -3480,14 +3481,14 @@ def test_impl(): threads1 = 0 teams2 = 0 threads2 = 0 - with omp(target_pragma): - with omp("parallel"): + with openmp(target_pragma): + with openmp("parallel"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: teams1 = omp_get_num_teams() threads1 = omp_get_num_threads() - with omp("parallel"): + with openmp("parallel"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: @@ -3511,15 +3512,15 @@ def test_impl(): threads1 = 0 teams2 = 0 threads2 = 0 - with omp(target_pragma): + with openmp(target_pragma): max_threads = omp_get_max_threads() - with omp("parallel"): + with openmp("parallel"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: teams1 = omp_get_num_teams() threads1 = omp_get_num_threads() - with omp("parallel num_threads(256)"): + with openmp("parallel num_threads(256)"): teamno = omp_get_team_num() threadno = omp_get_thread_num() if teamno == 0 and threadno == 0: @@ -3550,8 +3551,8 @@ def target_nest_parallel(self, device): @njit def test_impl(): a = np.zeros(32, dtype=np.int64) - with omp(target_pragma): - with omp(parallel_pragma): + with openmp(target_pragma): + with openmp(parallel_pragma): thread_id = omp_get_thread_num() a[thread_id] = 1 return a @@ -3568,8 +3569,8 @@ def target_parallel_for_range_step_arg(self, device): @njit def test_impl(): a = np.zeros(N, dtype=np.int32) - with omp(target_pragma): - with omp(parallel_pragma): + with openmp(target_pragma): + with openmp(parallel_pragma): for i in range(0, len(a), step): a[i] = i + 1 @@ -3588,8 +3589,8 @@ def target_parallel_for_incremented_step(self, device): def test_impl(): a = np.zeros(N, dtype=np.int32) for i in range(step_range): - with omp(target_pragma): - with omp(parallel_pragma): + with openmp(target_pragma): + with openmp(parallel_pragma): for j in range(0, len(a), i + 1): a[j] = i + 1 return a @@ -3606,7 +3607,7 @@ def target_teams(self, device): def test_impl(): a = np.zeros(100, dtype=np.int64) nteams = 0 - with omp(target_pragma): + with openmp(target_pragma): team_id = omp_get_team_num() if team_id == 0: nteams = omp_get_num_teams() @@ -3628,8 +3629,8 @@ def target_nest_teams(self, device): def test_impl(): a = np.zeros(100, 
dtype=np.int64) nteams = 0 - with omp(target_pragma): - with omp("teams num_teams(100)"): + with openmp(target_pragma): + with openmp("teams num_teams(100)"): team_id = omp_get_team_num() if team_id == 0: nteams = omp_get_num_teams() @@ -3650,8 +3651,8 @@ def target_nest_teams_from_shared_expl_scalar(self, device): @njit def test_impl(): s = 0 - with omp(target_pragma): - with omp("teams num_teams(100) shared(s)"): + with openmp(target_pragma): + with openmp("teams num_teams(100) shared(s)"): team_id = omp_get_team_num() if team_id == 0: s = 1 @@ -3666,8 +3667,8 @@ def target_nest_teams_from_shared_impl_scalar(self, device): @njit def test_impl(): s = 0 - with omp(target_pragma): - with omp("teams num_teams(100)"): + with openmp(target_pragma): + with openmp("teams num_teams(100)"): team_id = omp_get_team_num() if team_id == 0: s = 1 @@ -3682,8 +3683,8 @@ def target_nest_teams_tofrom_shared_expl_scalar(self, device): @njit def test_impl(): s = 0 - with omp(target_pragma): - with omp("teams num_teams(100) shared(s)"): + with openmp(target_pragma): + with openmp("teams num_teams(100) shared(s)"): team_id = omp_get_team_num() if team_id == 0: s = 1 @@ -3699,8 +3700,8 @@ def target_nest_teams_tofrom_shared_impl_scalar(self, device): def test_impl(): s = 0 ss = np.zeros(1) - with omp(target_pragma): - with omp("teams num_teams(100)"): + with openmp(target_pragma): + with openmp("teams num_teams(100)"): team_id = omp_get_team_num() if team_id == 0: s = 1 @@ -3718,8 +3719,8 @@ def target_teams_nest_parallel(self, device): def test_impl(): teams = 0 threads = 0 - with omp(target_pragma): - with omp("parallel"): + with openmp(target_pragma): + with openmp("parallel"): team_id = omp_get_team_num() thread_id = omp_get_thread_num() if team_id == 0 and thread_id == 0: @@ -3744,9 +3745,9 @@ def target_teams_nest_parallel_set_thread_limit(self, device): def test_impl(): teams = 0 threads = 0 - with omp(target_pragma): - with omp("teams num_teams(10) thread_limit(32)"): - with omp("parallel"): + with openmp(target_pragma): + with openmp("teams num_teams(10) thread_limit(32)"): + with openmp("parallel"): team_id = omp_get_team_num() thread_id = omp_get_thread_num() if team_id == 0 and thread_id == 0: @@ -3769,7 +3770,7 @@ def target_map_to_scalar(self, device): @njit def test_impl(x): - with omp(target_pragma): + with openmp(target_pragma): x += 1 r = x return r @@ -3783,7 +3784,7 @@ def target_map_to_array(self, device): @njit def test_impl(a): - with omp(target_pragma): + with openmp(target_pragma): r = 0 for i in range(len(a)): r += a[i] @@ -3800,7 +3801,7 @@ def target_map_from_scalar(self, device): @njit def test_impl(x): - with omp(target_pragma): + with openmp(target_pragma): x = 43 return x @@ -3813,7 +3814,7 @@ def target_map_tofrom_scalar(self, device): @njit def test_impl(x): - with omp(target_pragma): + with openmp(target_pragma): x += 1 return x @@ -3826,9 +3827,9 @@ def target_multiple_map_tofrom_scalar(self, device): @njit def test_impl(x): - with omp(target_pragma): + with openmp(target_pragma): x += 1 - with omp(target_pragma): + with openmp(target_pragma): x += 1 return x @@ -3842,7 +3843,7 @@ def target_map_from_array(self, device): @njit def test_impl(n): a = np.zeros(n, dtype=np.int64) - with omp(target_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] = 42 return a @@ -3858,7 +3859,7 @@ def target_map_slice_in_mapping(self, device): def test_impl(n): a = np.zeros(n) b = np.arange(n) - with omp(target_pragma): + with openmp(target_pragma): for i in range(50): # These 
b accesses are within the transferred region. a[i + 50] = b[i + 100] @@ -3877,7 +3878,7 @@ def target_map_slice_read_out_mapping(self, device): def test_impl(n): a = np.zeros(n) b = np.arange(n) - with omp(target_pragma): + with openmp(target_pragma): for i in range(50): # These b accesses are outside the transferred region. # Should get whatever happens to be in memory at that point. @@ -3897,7 +3898,7 @@ def target_map_tofrom_array(self, device): @njit def test_impl(a): - with omp(target_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] += 1 return a @@ -3912,8 +3913,8 @@ def target_nest_parallel_for(self, device): @njit def test_impl(a, sched): - with omp(target_pragma): - with omp("parallel for num_threads(256)"): + with openmp(target_pragma): + with openmp("parallel for num_threads(256)"): for i in range(len(a)): a[i] = 1 thread_id = omp_get_thread_num() @@ -3939,8 +3940,8 @@ def target_nest_teams_distribute(self, device): @njit def test_impl(a, sched): - with omp(target_pragma): - with omp("teams distribute"): + with openmp(target_pragma): + with openmp("teams distribute"): for i in range(len(a)): a[i] = 1 team_id = omp_get_team_num() @@ -3975,7 +3976,7 @@ def target_teams_distribute(self, device): @njit def test_impl(a, sched): - with omp(target_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] = 1 team_id = omp_get_team_num() @@ -4009,7 +4010,7 @@ def target_teams_distribute_set_num_teams(self, device): @njit def test_impl(a, sched): - with omp(target_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] = 1 team_id = omp_get_team_num() @@ -4032,7 +4033,7 @@ def target_firstprivate_scalar_explicit(self, device): @njit def test_impl(s): - with omp(target_pragma): + with openmp(target_pragma): s = 43 return s @@ -4045,7 +4046,7 @@ def target_firstprivate_scalar_implicit(self, device): @njit def test_impl(s): - with omp(target_pragma): + with openmp(target_pragma): s = 43 return s @@ -4061,8 +4062,8 @@ def target_data_from(self, device): @njit def test_impl(): a = np.ones(10) - with omp(target_data_pragma): - with omp(target_pragma): + with openmp(target_data_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] = 42 return a @@ -4079,8 +4080,8 @@ def target_data_to(self, device): def test_impl(): a = np.ones(10) b = np.zeros(10) - with omp(target_data_pragma): - with omp(target_pragma): + with openmp(target_data_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] = 42 b[i] = a[i] @@ -4099,8 +4100,8 @@ def target_data_tofrom(self, device): def test_impl(): s = 0 a = np.ones(10) - with omp(target_data_pragma): - with omp(target_pragma): + with openmp(target_data_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] += 41 s = 42 @@ -4121,8 +4122,8 @@ def target_data_alloc_from(self, device): def test_impl(): a = np.ones(10) b = np.zeros(10) - with omp(target_data_pragma): - with omp(target_pragma): + with openmp(target_data_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] = 42 b[i] = a[i] @@ -4141,8 +4142,8 @@ def target_data_mix_to_from(self, device): def test_impl(): a = np.ones(10) b = np.ones(10) - with omp(target_data_pragma): - with omp(target_pragma): + with openmp(target_data_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] = 42 b[i] = 42 @@ -4161,11 +4162,11 @@ def target_update_from(self, device): @njit def test_impl(): a = np.ones(10) - with omp(target_data_pragma): - with omp(target_pragma): + with openmp(target_data_pragma): + with 
openmp(target_pragma): for i in range(len(a)): a[i] = 42 - with omp(target_update_pragma): + with openmp(target_update_pragma): pass return a @@ -4181,13 +4182,13 @@ def target_update_to(self, device): @njit def test_impl(): a = np.ones(10) - with omp(target_data_pragma): + with openmp(target_data_pragma): a += 1 - with omp(target_update_pragma): + with openmp(target_update_pragma): pass - with omp(target_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] += 1 return a @@ -4205,17 +4206,17 @@ def target_update_to_from(self, device): @njit def test_impl(): a = np.ones(10) - with omp(target_data_pragma): + with openmp(target_data_pragma): a += 1 - with omp(target_update_to_pragma): + with openmp(target_update_to_pragma): pass - with omp(target_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] += 1 - with omp(target_update_from_pragma): + with openmp(target_update_from_pragma): pass a += 1 @@ -4235,16 +4236,16 @@ def target_enter_exit_data_to_from_hostonly(self, device): @njit def test_impl(): a = np.ones(10) - with omp(target_enter): + with openmp(target_enter): pass a += 1 # XXX: Test passes if uncommented! - # with omp("target device(1)"): + # with openmp("target device(1)"): # pass - with omp(target_exit): + with openmp(target_exit): pass return a @@ -4260,11 +4261,11 @@ def target_data_tofrom_hostonly(self, device): @njit def test_impl(): a = np.ones(10) - with omp(target_data): + with openmp(target_data): a += 1 # XXX: Test passes if uncommented! - # with omp("target device(1)"): + # with openmp("target device(1)"): # pass return a @@ -4279,12 +4280,12 @@ def target_data_update(self, device): @njit def test_impl(a): - with omp(target_data): + with openmp(target_data): for rep in range(10): # Target update resets a to ones. 
- with omp(target_update): + with openmp(target_update): pass - with omp(target_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] += 1 @@ -4304,15 +4305,15 @@ def test_impl(s, n1, n2): as1 = np.empty(s, dtype=a.dtype) as2 = np.empty(s, dtype=a.dtype) b = n1 - with omp(target_data_pragma): - with omp(target_pragma): + with openmp(target_data_pragma): + with openmp(target_pragma): as1[:] = a bs1 = b - with omp(target_pragma): + with openmp(target_pragma): for i in range(s): a[i] = n2 b = n2 - with omp(target_pragma): + with openmp(target_pragma): as2[:] = a bs2 = b return a, as1, as2, b, bs1, bs2 @@ -4340,13 +4341,13 @@ def test_impl(): bstop = 3 a = np.array([1, 2, 3]) b = np.array([3, 2, 1]) - with omp(target_enter_pragma): - with omp(target_pragma): + with openmp(target_enter_pragma): + with openmp(target_pragma): for i in range(1): a[0] = 42 b[0] = 42 - with omp(target_exit_pragma): + with openmp(target_exit_pragma): pass return a, b @@ -4364,15 +4365,15 @@ def target_enter_exit_data(self, device): @njit def test_impl(scalar, array): - with omp(target_enter_pragma): + with openmp(target_enter_pragma): pass - with omp(target_pragma): + with openmp(target_pragma): scalar += 1 for i in range(len(array)): array[i] += 1 - with omp(target_exit_pragma): + with openmp(target_exit_pragma): pass return scalar, array @@ -4398,12 +4399,12 @@ def target_enter_exit_data_alloc(self, device): @njit def test_impl(a): - with omp(target_enter_pragma): + with openmp(target_enter_pragma): pass - with omp(target_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] = 1 - with omp(target_exit_pragma): + with openmp(target_exit_pragma): pass return a @@ -4421,7 +4422,7 @@ def target_teams_distribute_parallel_for(self, device): @njit def test_impl(a, sched_team, sched_thread): s = 42 - with omp(target_pragma): + with openmp(target_pragma): for i in range(len(a)): a[i] = 1 team_id = omp_get_team_num() @@ -4489,8 +4490,8 @@ def target_teams_nest_distribute_parallel_for(self, device): @njit def test_impl(a, sched_team, sched_thread): s = 42 - with omp(target_pragma): - with omp(dist_parfor_pragma): + with openmp(target_pragma): + with openmp(dist_parfor_pragma): for i in range(len(a)): a[i] = 1 team_id = omp_get_team_num() @@ -4532,8 +4533,8 @@ def test_impl(): s = 42 r = np.zeros(32) threads = 0 - with omp(target_pragma): - with omp("parallel firstprivate(s)"): + with openmp(target_pragma): + with openmp("parallel firstprivate(s)"): threadno = omp_get_thread_num() if threadno == 0: threads = omp_get_num_threads() @@ -4553,8 +4554,8 @@ def target_nest_parallel_float_fpriv(self, device): def test_impl(): s = np.float32(42.0) r = np.float32(0.0) - with omp(target_pragma): - with omp("parallel firstprivate(s)"): + with openmp(target_pragma): + with openmp("parallel firstprivate(s)"): threadno = omp_get_thread_num() if threadno == 0: r = s + 1 @@ -4570,8 +4571,8 @@ def target_nest_teams_float_fpriv(self, device): def test_impl(): s = np.float32(42.0) r = np.float32(0.0) - with omp(target_pragma): - with omp("teams firstprivate(s)"): + with openmp(target_pragma): + with openmp("teams firstprivate(s)"): teamno = omp_get_thread_num() if teamno == 0: r = s + 1 @@ -4590,8 +4591,8 @@ def target_teams_nest_parallel_fpriv_shared_array(self, device): @njit def test_impl(): s = np.zeros(32) - with omp(target_pragma): - with omp("parallel firstprivate(s)"): + with openmp(target_pragma): + with openmp("parallel firstprivate(s)"): print("parallel s", s[0]) teams = omp_get_num_teams() threads = 
omp_get_num_threads() @@ -4611,7 +4612,7 @@ def test_impl(): a = np.zeros(10, dtype=np.int32) nteams = 0 - with omp(target_pragma): + with openmp(target_pragma): team_shared_array = np.empty(10, dtype=np.int32) team_id = omp_get_team_num() @@ -4645,7 +4646,7 @@ def test_impl(): a = np.zeros((10, 2, 2), dtype=np.int32) nteams = 0 - with omp(target_pragma): + with openmp(target_pragma): team_shared_array = np.empty((2, 2), dtype=np.int32) team_id = omp_get_team_num() @@ -4679,8 +4680,8 @@ def target_local_array(self, device): def test_impl(): a = np.zeros((32, 10), dtype=np.int32) nthreads = 0 - with omp(target_pragma): - with omp("parallel num_threads(32)"): + with openmp(target_pragma): + with openmp("parallel num_threads(32)"): local_array = np.empty(10, dtype=np.int32) tid = omp_get_thread_num() if tid == 0: @@ -4716,14 +4717,14 @@ def test_impl(): nteams = 0 nthreads = 0 - with omp(target_pragma): + with openmp(target_pragma): team_shared_array = np.empty(32, dtype=np.int32) team_id = omp_get_team_num() if team_id == 0: nteams = omp_get_num_teams() nthreads = omp_get_num_threads() - with omp("parallel num_threads(32)"): + with openmp("parallel num_threads(32)"): thread_local_array = np.empty(10, dtype=np.int32) for i in range(10): thread_local_array[i] = omp_get_thread_num() @@ -4759,7 +4760,7 @@ def test_impl(n): a = np.ones((n, n)) b = np.ones((n, n)) c = np.zeros((n, n)) - with omp(target_pragma): + with openmp(target_pragma): for i in range(n): for j in range(n): c[i, j] = a[i, j] + b[i, j] @@ -4777,9 +4778,9 @@ def test_impl(n): a = np.ones((n, n)) b = np.ones((n, n)) c = np.zeros((n, n)) - with omp(target_pragma): - with omp("teams"): - with omp("loop collapse(2)"): + with openmp(target_pragma): + with openmp("teams"): + with openmp("loop collapse(2)"): for i in range(n): for j in range(n): c[i, j] = a[i, j] + b[i, j] @@ -4820,8 +4821,8 @@ def test_impl(num_steps): the_sum = 0.0 omp_set_num_threads(4) - with omp("parallel"): - with omp("for reduction(+:the_sum) schedule(static)"): + with openmp("parallel"): + with openmp("for reduction(+:the_sum) schedule(static)"): for j in range(num_steps): x = ((j - 1) - 0.5) * step the_sum += 4.0 / (1.0 + x * x) @@ -4838,7 +4839,7 @@ def test_impl(num_steps): the_sum = 0.0 omp_set_num_threads(4) - with omp("parallel for reduction(+:the_sum) schedule(static)"): + with openmp("parallel for reduction(+:the_sum) schedule(static)"): for j in range(num_steps): x = ((j - 1) - 0.5) * step the_sum += 4.0 / (1.0 + x * x) @@ -4855,7 +4856,7 @@ def test_impl(num_steps): the_sum = 0.0 omp_set_num_threads(4) - with omp("loop reduction(+:the_sum) schedule(static)"): + with openmp("loop reduction(+:the_sum) schedule(static)"): for j in range(num_steps): x = ((j - 1) - 0.5) * step the_sum += 4.0 / (1.0 + x * x) @@ -4875,7 +4876,7 @@ def test_impl(num_steps): omp_set_num_threads(j) full_sum = 0.0 - with omp("parallel private(tid, numthreads, local_sum, x)"): + with openmp("parallel private(tid, numthreads, local_sum, x)"): tid = omp_get_thread_num() numthreads = omp_get_num_threads() local_sum = 0.0 @@ -4907,11 +4908,11 @@ def test_pi_comp(Nstart, Nfinish, step): pi_sum1 = 0.0 pi_sum2 = 0.0 cut = Nfinish - (iblk // 2) - with omp("task shared(pi_sum1)"): + with openmp("task shared(pi_sum1)"): pi_sum1 = test_pi_comp(Nstart, cut, step) - with omp("task shared(pi_sum2)"): + with openmp("task shared(pi_sum2)"): pi_sum2 = test_pi_comp(cut, Nfinish, step) - with omp("taskwait"): + with openmp("taskwait"): pi_sum = pi_sum1 + pi_sum2 return pi_sum @@ -4928,11 +4929,11 
@@ def test_pi_comp_njit(Nstart, Nfinish, step): pi_sum1 = 0.0 pi_sum2 = 0.0 cut = Nfinish - (iblk // 2) - with omp("task shared(pi_sum1)"): + with openmp("task shared(pi_sum1)"): pi_sum1 = test_pi_comp_njit(Nstart, cut, step) - with omp("task shared(pi_sum2)"): + with openmp("task shared(pi_sum2)"): pi_sum2 = test_pi_comp_njit(cut, Nfinish, step) - with omp("taskwait"): + with openmp("taskwait"): pi_sum = pi_sum1 + pi_sum2 return pi_sum @@ -4943,8 +4944,8 @@ def test_impl(lb, num_steps, pi_comp_func): omp_set_num_threads(j) full_sum = 0.0 - with omp("parallel"): - with omp("single"): + with openmp("parallel"): + with openmp("single"): full_sum = pi_comp_func(lb, num_steps, step) pi = step * full_sum From c392b45c6c76db47e239f8ec071aa80f37c87dff Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Tue, 22 Apr 2025 19:55:56 -0700 Subject: [PATCH 04/28] Update pyproject.toml, setup.py and clean up --- buildscripts/conda-recipes/llvmlite/bld.bat | 25 ----- buildscripts/conda-recipes/llvmlite/build.sh | 14 --- .../llvmlite/conda_build_config.yaml | 12 --- buildscripts/conda-recipes/llvmlite/meta.yaml | 53 ---------- .../conda-recipes/llvmlite/run_test.py | 6 -- buildscripts/conda-recipes/numba/bld.bat | 3 - buildscripts/conda-recipes/numba/build.sh | 17 ---- buildscripts/conda-recipes/numba/license.txt | 24 ----- buildscripts/conda-recipes/numba/meta.yaml | 99 ------------------- buildscripts/conda-recipes/numba/run_test.bat | 23 ----- buildscripts/conda-recipes/numba/run_test.sh | 84 ---------------- .../{numba => pyomp}/conda_build_config.yaml | 0 buildscripts/conda-recipes/pyomp/meta.yaml | 35 +++++-- pyproject.toml | 15 +-- setup.py | 70 ++++++++----- 15 files changed, 80 insertions(+), 400 deletions(-) delete mode 100755 buildscripts/conda-recipes/llvmlite/bld.bat delete mode 100644 buildscripts/conda-recipes/llvmlite/build.sh delete mode 100644 buildscripts/conda-recipes/llvmlite/conda_build_config.yaml delete mode 100644 buildscripts/conda-recipes/llvmlite/meta.yaml delete mode 100644 buildscripts/conda-recipes/llvmlite/run_test.py delete mode 100644 buildscripts/conda-recipes/numba/bld.bat delete mode 100755 buildscripts/conda-recipes/numba/build.sh delete mode 100644 buildscripts/conda-recipes/numba/license.txt delete mode 100644 buildscripts/conda-recipes/numba/meta.yaml delete mode 100644 buildscripts/conda-recipes/numba/run_test.bat delete mode 100644 buildscripts/conda-recipes/numba/run_test.sh rename buildscripts/conda-recipes/{numba => pyomp}/conda_build_config.yaml (100%) diff --git a/buildscripts/conda-recipes/llvmlite/bld.bat b/buildscripts/conda-recipes/llvmlite/bld.bat deleted file mode 100755 index d7342e249d5f..000000000000 --- a/buildscripts/conda-recipes/llvmlite/bld.bat +++ /dev/null @@ -1,25 +0,0 @@ - -@rem Let CMake know about the LLVM install path, for find_package() -set CMAKE_PREFIX_PATH=%LIBRARY_PREFIX% - -@rem VS2019 uses a different naming convention for platforms than older version -if "%ARCH%"=="32" ( - @rem VS2017: - @rem set CMAKE_GENERATOR_ARCH= - set CMAKE_GENERATOR_ARCH=Win32 -) else ( - @rem VS2017 - @rem set CMAKE_GENERATOR_ARCH=Win64 - set CMAKE_GENERATOR_ARCH=x64 -) -set CMAKE_GENERATOR=Visual Studio 16 2019 -set CMAKE_GENERATOR_TOOLKIT=v142 - -@rem Ensure there are no build leftovers (CMake can complain) -if exist ffi\build rmdir /S /Q ffi\build - -%PYTHON% -S setup.py install -if errorlevel 1 exit 1 - -%PYTHON% runtests.py -if errorlevel 1 exit 1 diff --git a/buildscripts/conda-recipes/llvmlite/build.sh 
b/buildscripts/conda-recipes/llvmlite/build.sh deleted file mode 100644 index 9f3d87cc3ea0..000000000000 --- a/buildscripts/conda-recipes/llvmlite/build.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -set -x - -export PYTHONNOUSERSITE=1 - -# Enables static linking of stdlibc++ -export LLVMLITE_CXX_STATIC_LINK=1 -# cmake is broken for osx builds. -#export LLVMLITE_USE_CMAKE=1 -export LLVMLITE_SHARED=1 - -$PYTHON setup.py build --force -$PYTHON setup.py install diff --git a/buildscripts/conda-recipes/llvmlite/conda_build_config.yaml b/buildscripts/conda-recipes/llvmlite/conda_build_config.yaml deleted file mode 100644 index 81b7d08c3d19..000000000000 --- a/buildscripts/conda-recipes/llvmlite/conda_build_config.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Numba/llvmlite stack needs an older compiler for backwards compatability. -c_compiler_version: # [linux] - - 7 # [linux and (x86_64 or ppc64le)] - - 9 # [linux and aarch64] - -cxx_compiler_version: # [linux] - - 7 # [linux and (x86_64 or ppc64le)] - - 9 # [linux and aarch64] - -fortran_compiler_version: # [linux] - - 7 # [linux and (x86_64 or ppc64le)] - - 9 # [linux and aarch64] diff --git a/buildscripts/conda-recipes/llvmlite/meta.yaml b/buildscripts/conda-recipes/llvmlite/meta.yaml deleted file mode 100644 index 0a15258e499e..000000000000 --- a/buildscripts/conda-recipes/llvmlite/meta.yaml +++ /dev/null @@ -1,53 +0,0 @@ -package: - name: llvmlite - version: pyomp_0.40 - -source: - git_url: https://github.com/Python-for-HPC/llvmliteWithOpenmp.git - git_rev: ce7b659c6a62aa4466d6b3894573f9900f8a1451 - git_depth: 1 - -build: - string: py{{ PY_VER }}h{{ PKG_HASH }}_{{ (GITHUB_HEAD_SHA | default(''))[:7] ~ (CI_COMMIT_SHA | default(''))[:7] }} - script_env: - - PY_VCRUNTIME_REDIST - - GITHUB_HEAD_SHA - - CI_COMMIT_SHA - -requirements: - build: - # We cannot do this on macOS as the llvm-config from the - # toolchain conflicts with the same from llvmdev, the - # build.sh deals with it! 
- - {{ compiler('c') }} # [not (osx or armv6l or armv7l or win)] - - {{ compiler('cxx') }} # [not (osx or armv6l or armv7l or win)] - # The DLL build uses cmake on Windows - - cmake # [win] - - make # [unix] - host: - - python - # On channel https://anaconda.org/numba/ - - llvmdev 14.0.6 - - llvm 14.0.6 - - vs2015_runtime # [win] - # llvmdev is built with libz compression support - - zlib # [unix and not (armv6l or armv7l)] - # requires libxml2 - - libxml2 # [win] - run: - - python >=3.8,<=3.10 - - vs2015_runtime # [win] - # osx has dynamically linked libstdc++ - - libcxx >=4.0.1 # [osx] - -test: - imports: - - llvmlite - - llvmlite.binding - commands: - - python -m llvmlite.tests - -about: - home: https://github.com/numba/llvmlite - license: New BSD License - summary: A lightweight LLVM python binding for writing JIT compilers diff --git a/buildscripts/conda-recipes/llvmlite/run_test.py b/buildscripts/conda-recipes/llvmlite/run_test.py deleted file mode 100644 index 5591c0533f36..000000000000 --- a/buildscripts/conda-recipes/llvmlite/run_test.py +++ /dev/null @@ -1,6 +0,0 @@ -import os -from llvmlite.tests import main - -# Enable tests for distribution only -os.environ['LLVMLITE_DIST_TEST'] = '' -main() diff --git a/buildscripts/conda-recipes/numba/bld.bat b/buildscripts/conda-recipes/numba/bld.bat deleted file mode 100644 index 6372f3a4d2e9..000000000000 --- a/buildscripts/conda-recipes/numba/bld.bat +++ /dev/null @@ -1,3 +0,0 @@ -%PYTHON% setup.py build install --single-version-externally-managed --record=record.txt - -exit /b %errorlevel% diff --git a/buildscripts/conda-recipes/numba/build.sh b/buildscripts/conda-recipes/numba/build.sh deleted file mode 100755 index 4ae35afafb09..000000000000 --- a/buildscripts/conda-recipes/numba/build.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -if [[ "$(uname -s)" == *"Linux"* ]] && [[ "$(uname -p)" == *"86"* ]]; then - EXTRA_BUILD_EXT_FLAGS="--werror --wall" -else - EXTRA_BUILD_EXT_FLAGS="" -fi - -if [[ "$(uname -s)" == *"Linux"* ]] && ([[ "$(uname -p)" == *"ppc64le"* ]] || [[ "$(uname -p)" == *"aarch64"* ]]); then - # To workaround https://github.com/numba/numba/issues/7302 - # because of a python build problem that the -pthread could be stripped. - export CC="$CC -pthread" - export CXX="$CXX -pthread" -fi - -MACOSX_DEPLOYMENT_TARGET=10.10 $PYTHON setup.py \ - build_static build_ext $EXTRA_BUILD_EXT_FLAGS build install --single-version-externally-managed --record=record.txt diff --git a/buildscripts/conda-recipes/numba/license.txt b/buildscripts/conda-recipes/numba/license.txt deleted file mode 100644 index 7d19426e7a09..000000000000 --- a/buildscripts/conda-recipes/numba/license.txt +++ /dev/null @@ -1,24 +0,0 @@ -Copyright (c) 2012, Anaconda, Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - -Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - -Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/buildscripts/conda-recipes/numba/meta.yaml b/buildscripts/conda-recipes/numba/meta.yaml deleted file mode 100644 index 947d49816e4b..000000000000 --- a/buildscripts/conda-recipes/numba/meta.yaml +++ /dev/null @@ -1,99 +0,0 @@ -{% set version = "pyomp_0.57" %} - -package: - name: numba - #version: {{ GIT_DESCRIBE_TAG }} - version: {{ version }} - -source: - git_url: https://github.com/Python-for-HPC/numbaWithOpenmp.git - git_rev: f7a481ef05bcc2a11dfead708854530800b534c8 - git_depth: 1 - -build: - string: np{{ NPY_VER }}py{{ PY_VER }}h{{ PKG_HASH }}_{{ version }}_{{ (GITHUB_HEAD_SHA | default(''))[:7] ~ (CI_COMMIT_SHA | default(''))[:7] }} - entry_points: - - numba = numba.misc.numba_entry:main - script_env: - - PY_VCRUNTIME_REDIST - - GITHUB_HEAD_SHA - - CI_COMMIT_SHA - missing_dso_whitelist: # [osx] - # optional dependency: required only when omp is chosen as the backend for - # the threading layer - - lib/libiomp5.dylib # [osx] - #ignore_run_exports: - # tbb-devel triggers hard dependency on tbb, this is not the case. - - tbb # [not (aarch64 or ppc64le or win32)] - -requirements: - # build and run dependencies are duplicated to avoid setuptools issues - # when we also set install_requires in setup.py - build: - - {{ compiler('c') }} - - {{ compiler('cxx') }} - # OpenMP headers from llvm needed for OSX. - host: - - python - - numpy >=1.22.3, <1.25 - - llvm-openmp-dev - - setuptools - - importlib_metadata # [py<39] - - llvmlite pyomp_0.40.* - # TBB devel version is to match TBB libs. - # NOTE: ppc64le and aarch64 are pending testing so excluded for now, win32 - # is not a supported parallel target. - - tbb-devel >=2021.6 # [not (aarch64 or ppc64le or win32)] - run: - - python - # NumPy 1.22.0, 1.22.1, 1.22.2 are all broken for ufuncs, see #7756 - - numpy >=1.22.3, <1.25 - - importlib_metadata # [py<39] - # On channel https://anaconda.org/numba/ - - llvmlite pyomp_0.40.* - - lark-parser - - cffi - - llvm-openmp-dev - run_constrained: - # If TBB is present it must be at least version 2021.6 - - tbb >=2021.6 # [not (aarch64 or ppc64le or win32)] - # avoid confusion from openblas bugs - - libopenblas !=0.3.6 # [x86_64] - # 0.3.17 buggy on M1 silicon - # https://github.com/xianyi/OpenBLAS/blob/v0.3.20/Changelog.txt#L118 - # https://github.com/numba/numba/issues/7822#issuecomment-1063229855 - # Exclude 0.3.20 too - # https://github.com/numba/numba/issues/8096 - - libopenblas >=0.3.18, !=0.3.20 # [arm64] - # CUDA 10.2 or later is required for CUDA support - - cudatoolkit >=10.2 - # scipy 1.0 or later - - scipy >=1.0 - # CUDA Python 11.6 or later - - cuda-python >=11.6 - -test: - requires: - - jinja2 - # Required to test optional Numba features - - cffi - - scipy - - ipython # [not aarch64] - # for pycc - - setuptools - - tbb >=2021.6 # [not (aarch64 or ppc64le or win32)] - # this is clobbering as run depends on llvm-openmp-dev - #- llvm-openmp # [osx] - # This is for driving gdb tests - - pexpect # [linux64] - # For testing ipython - - ipykernel - # Need these for AOT. 
Do not init msvc as it may not be present - - {{ compiler('c') }} # [not (win or aarch64)] - - {{ compiler('cxx') }} # [not (win or aarch64)] - -about: - home: https://numba.pydata.org/ - license: BSD - license_file: LICENSE - summary: a just-in-time Python function compiler based on LLVM diff --git a/buildscripts/conda-recipes/numba/run_test.bat b/buildscripts/conda-recipes/numba/run_test.bat deleted file mode 100644 index a62aac845713..000000000000 --- a/buildscripts/conda-recipes/numba/run_test.bat +++ /dev/null @@ -1,23 +0,0 @@ -set NUMBA_DEVELOPER_MODE=1 -set NUMBA_DISABLE_ERROR_MESSAGE_HIGHLIGHTING=1 -set NUMBA_CAPTURED_ERRORS=new_style -set PYTHONFAULTHANDLER=1 - -@rem no parallel target support for 32 bit windows and no TBB packages -if "%ARCH%"=="32" ( - set NUMBA_DISABLE_TBB=1 -) - -@rem Check Numba executable is there -numba -h - -@rem Run system info tool -numba -s - -@rem Check test discovery works -python -m numba.tests.test_runtests - -@rem Run the whole test suite -python -m numba.runtests -b -m -- %TESTS_TO_RUN% - -if errorlevel 1 exit 1 diff --git a/buildscripts/conda-recipes/numba/run_test.sh b/buildscripts/conda-recipes/numba/run_test.sh deleted file mode 100644 index b6caec9ac810..000000000000 --- a/buildscripts/conda-recipes/numba/run_test.sh +++ /dev/null @@ -1,84 +0,0 @@ -#!/bin/bash - -set -e - -export NUMBA_DEVELOPER_MODE=1 -export NUMBA_DISABLE_ERROR_MESSAGE_HIGHLIGHTING=1 -export NUMBA_CAPTURED_ERRORS="new_style" -export PYTHONFAULTHANDLER=1 -# Required OpenMP test env var (for offloading). -export TEST_DEVICES=0 - -# Disable NumPy dispatching to AVX512_SKX feature extensions if the chip is -# reported to support the feature and NumPy >= 1.22 as this results in the use -# of low accuracy SVML libm replacements in ufunc loops. -_NPY_CMD='from numba.misc import numba_sysinfo;\ - sysinfo=numba_sysinfo.get_sysinfo();\ - print(sysinfo["NumPy AVX512_SKX detected"] and - sysinfo["NumPy Version"]>="1.22")' -NUMPY_DETECTS_AVX512_SKX_NP_GT_122=$(python -c "$_NPY_CMD") -echo "NumPy >= 1.22 with AVX512_SKX detected: $NUMPY_DETECTS_AVX512_SKX_NP_GT_122" - -if [[ "$NUMPY_DETECTS_AVX512_SKX_NP_GT_122" == "True" ]]; then - export NPY_DISABLE_CPU_FEATURES="AVX512_SKX" -fi - - -unamestr=`uname` -if [[ "$unamestr" == 'Linux' ]]; then - # Test if catchsegv exists, not by default in recent libc. - if catchsegv --version; then - SEGVCATCH=catchsegv - else - SEGVCATCH="" - fi -elif [[ "$unamestr" == 'Darwin' ]]; then - SEGVCATCH="" -else - echo Error -fi - -# limit CPUs in use on PPC64LE, fork() issues -# occur on high core count systems -archstr=`uname -m` -if [[ "$archstr" == 'ppc64le' ]]; then - TEST_NPROCS=16 -fi - -# Check Numba executable is there -numba -h - -# run system info tool -numba -s - -# Check test discovery works -python -m numba.tests.test_runtests - -# Disable tests for package building. -exit 0 - -if nvidia-smi --list-gpus; then - echo "Found NVIDIA GPU, enable OpenMP offloading tests" - export RUN_TARGET=1 -else - echo "Missing NVIDIA GPU, disable OpenMP offloading tests" - export RUN_TARGET=0 -fi - -# Run the whole test suite -# Test only openmp for brevity. We may want to enable the full numba tests, -# which include openmp, on larger runners. -TESTS_TO_RUN="numba.tests.test_openmp" -# Run OpenMP tests in a single-process since they use multiple cores by -# multi-threading. Using multiple processes for testing will very probably slow -# things down. 
-# XXX: Using -m $TEST_NPROCS, even if with 1 process, hangs on github runners -# when running the full testsuite, while individual tests pass. This requires -# more investigation. Some observations: 1) running the full test suite creates -# new threads for each region, the old ones are blocked in a futex for -# destruction, 2) it is possible that in small github runners this starves cpu -# time, 3) there may be implications with "-m 1" vs. no flag on how the runtime -# library is inited/de-inited. - -echo "Running: $SEGVCATCH python -m numba.runtests -v -- $TESTS_TO_RUN" -$SEGVCATCH python -m numba.runtests -v -- $TESTS_TO_RUN diff --git a/buildscripts/conda-recipes/numba/conda_build_config.yaml b/buildscripts/conda-recipes/pyomp/conda_build_config.yaml similarity index 100% rename from buildscripts/conda-recipes/numba/conda_build_config.yaml rename to buildscripts/conda-recipes/pyomp/conda_build_config.yaml diff --git a/buildscripts/conda-recipes/pyomp/meta.yaml b/buildscripts/conda-recipes/pyomp/meta.yaml index f2bd067b96fc..4ee7193abc7e 100644 --- a/buildscripts/conda-recipes/pyomp/meta.yaml +++ b/buildscripts/conda-recipes/pyomp/meta.yaml @@ -1,17 +1,36 @@ +{% set version = "0.2.0" %} + package: name: pyomp - version: 0.1.3 + version: {{ version }} + +source: + path: ../../.. build: - string: {{ (GITHUB_HEAD_SHA | default(''))[:7] ~ (CI_COMMIT_SHA | default(''))[:7] }} + string: py{{ PY_VER }}h{{ PKG_HASH }}_{{GIT_DESCRIBE_HASH}}_{{ GIT_DESCRIBE_NUMBER }} script_env: - - PY_VCRUNTIME_REDIST - - GITHUB_HEAD_SHA - - CI_COMMIT_SHA + - PY_VCRUNTIME_REDIST # [win] + script: + - {{ PYTHON }} -m pip install . -vv requirements: + build: + - {{ compiler('c') }} + - {{ compiler('cxx') }} + host: + - python + - pip + - setuptools + - numba >=0.57, <0.58 run: - - python >=3.8, <=3.10 - - numba pyomp_0.57.* + - numba >=0.57, <0.58 + - lark + - cffi + - llvm-openmp-dev + about: - summary: "PyOMP metapackage, OpenMP for portable CPU/GPU parallel programming in Python." + home: https://github.com/Python-for-HPC/PyOMP + license: BSD-2-Clause + license_file: LICENSE + summary: "PyOMP: OpenMP for portable CPU/GPU parallel programming in Python using Numba." 
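For orientation, the recipe above now builds the package with pip against the pyproject.toml and setup.py changes that follow, and the installed code is imported as the numba.openmp package. The sketch below mirrors the context-manager style exercised by the renamed tests earlier in the series; it is an illustration only, and the exact export names (openmp_context aliased to openmp, plus the omp_* query helpers) are assumptions rather than a confirmed public API.

# Minimal usage sketch, assuming numba.openmp exports openmp_context and the
# omp_* helpers that the test suite above calls.
from numba import njit
from numba.openmp import openmp_context as openmp
from numba.openmp import omp_get_thread_num, omp_get_num_threads

@njit
def hello_openmp():
    # Open a parallel region with four threads; each thread reports its id.
    with openmp("parallel num_threads(4)"):
        print("thread", omp_get_thread_num(), "of", omp_get_num_threads())

hello_openmp()
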
diff --git a/pyproject.toml b/pyproject.toml index 46b76e28f22c..af4b697973bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools>=61.0", "wheel", "numba>=0.57, <0.58", "cmake>=3.20"] +requires = ["setuptools>=77.0", "wheel", "numba>=0.57, <0.58", "cmake>=3.20"] build-backend = "setuptools.build_meta" [project] @@ -7,11 +7,10 @@ name = "pyomp" version = "0.2.0" description = "Python OpenMP library based on Numba" readme = "README.md" -requires-python = ">=3.8, <=3.12" -license = { text = "BSD 2-Clause License" } +requires-python = ">=3.8, <3.12" +license = "BSD-2-Clause" classifiers = [ "Programming Language :: Python :: 3", - "License :: OSI Approved :: BSD License", "Operating System :: OS Independent", "Development Status :: 4 - Beta", "Intended Audience :: Developers", @@ -23,9 +22,5 @@ dependencies = ["numba>=0.57, <0.58", "lark", "cffi"] Homepage = "https://github.com/Python-for-HPC/PyOMP" Issues = "https://github.com/Python-for-HPC/PyOMP/issues" -[tool.setuptools.packages.find] -where = ["."] -include = ["numba.openmp"] - -[tool.setuptools.package-data] -"numba.openmp" = ["libs/*"] +[tool.setuptools] +packages = ["numba.openmp"] diff --git a/setup.py b/setup.py index 487343fcc112..d854b277c4c0 100644 --- a/setup.py +++ b/setup.py @@ -2,26 +2,28 @@ import os import numba import sysconfig -import numpy as np import subprocess +import shutil +import numpy as np from pathlib import Path from setuptools import setup, Extension from setuptools.command.build_ext import build_ext from setuptools.command.build_clib import build_clib -numba_dir = os.path.dirname(numba.__file__) +temp_dir = Path("numba/openmp/nrt/numba_src") + bundle_lib = ( "bundle", { "sources": [ "numba/openmp/nrt/init.c", - f"{numba_dir}/_helpermod.c", - f"{numba_dir}/cext/utils.c", - f"{numba_dir}/cext/dictobject.c", - f"{numba_dir}/cext/listobject.c", - f"{numba_dir}/core/runtime/_nrt_pythonmod.c", - f"{numba_dir}/core/runtime/nrt.cpp", + f"{temp_dir}/_helpermod.c", + f"{temp_dir}/cext/utils.c", + f"{temp_dir}/cext/dictobject.c", + f"{temp_dir}/cext/listobject.c", + f"{temp_dir}/core/runtime/_nrt_pythonmod.c", + f"{temp_dir}/core/runtime/nrt.cpp", ], "include_dirs": [ sysconfig.get_paths()["include"], @@ -31,23 +33,42 @@ ) -class BuildStaticBundle(build_clib): +class BuildStaticNRTBundle(build_clib): def finalize_options(self): super().finalize_options() - self.build_temp = (Path("numba/openmp/nrt") / self.build_temp).absolute() - self.build_temp.mkdir(parents=True, exist_ok=True) - self.build_temp = str(self.build_temp) - self.build_clib = str(Path("numba/openmp/libs").absolute()) + # Copy numba tree installation to the build directory for building the + # static library using relative paths. + numba_dir = numba.__path__[0] + shutil.copytree( + numba_dir, + temp_dir, + ignore=shutil.ignore_patterns( + "*.py", + "*.pyc", + "*.so", + "*.dylib", + "__pycache__", + ), + dirs_exist_ok=True, + ) + + self.build_clib = "numba/openmp/libs" + + def run(self): + super().run() + + # Clean up files after build is completed. + shutil.rmtree(temp_dir, ignore_errors=True) class CMakeExtension(Extension): def __init__(self, name, sourcedir): - # don't invoke the original build_ext for this special extension + # Don't invoke the original build_ext for this special extension. 
super().__init__(name, sources=[]) self.sourcedir = sourcedir -class BuildPass(build_ext): +class BuildIntrinsicsOpenMPPass(build_ext): def run(self): for ext in self.extensions: if isinstance(ext, CMakeExtension): @@ -56,24 +77,26 @@ def run(self): super().run() def build_cmake(self, ext): - build_dir = (Path(ext.sourcedir) / self.build_temp).absolute() + # Delete build directory if it exists to avoid errors with stale + # CMakeCache.txt leftovers. + shutil.rmtree(self.build_temp, ignore_errors=True) + subprocess.run( [ "cmake", "-S", ext.sourcedir, "-B", - build_dir, - "--install-prefix", - Path("numba/openmp/libs").absolute(), + self.build_temp, "-DCMAKE_BUILD_TYPE=Release", + "-DCMAKE_INSTALL_PREFIX=numba/openmp/libs", ], check=True, ) - subprocess.run(["cmake", "--build", build_dir, "-j"], check=True) + subprocess.run(["cmake", "--build", self.build_temp, "-j"], check=True) subprocess.run( - ["cmake", "--install", build_dir], + ["cmake", "--install", self.build_temp], check=True, ) @@ -81,5 +104,8 @@ def build_cmake(self, ext): setup( libraries=[bundle_lib], ext_modules=[CMakeExtension("libIntrinsicsOpenMP", "numba/openmp/pass")], - cmdclass={"build_clib": BuildStaticBundle, "build_ext": BuildPass}, + cmdclass={ + "build_clib": BuildStaticNRTBundle, + "build_ext": BuildIntrinsicsOpenMPPass, + }, ) From 57ac3d82c2a3c5d92ae9b3b9e9b9a19cc8604d0b Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Tue, 22 Apr 2025 23:03:37 -0700 Subject: [PATCH 05/28] Update conda builders and simplify llvm-openmp-dev meta.yaml --- .github/workflows/build-upload-conda-base.yml | 131 +----------------- .gitlab/jobs/lassen.yml | 15 +- .../conda-recipes/llvm-openmp-dev/meta.yaml | 6 +- buildscripts/gitlab/create-conda-pkgs.sh | 8 -- 4 files changed, 10 insertions(+), 150 deletions(-) diff --git a/.github/workflows/build-upload-conda-base.yml b/.github/workflows/build-upload-conda-base.yml index 6a9186b1e737..e5713bd04580 100644 --- a/.github/workflows/build-upload-conda-base.yml +++ b/.github/workflows/build-upload-conda-base.yml @@ -34,7 +34,6 @@ env: git clone https://github.com/Python-for-HPC/PyOMP; cd PyOMP; git checkout ${{ inputs.commit }}; - export GITHUB_HEAD_SHA=${{ inputs.commit }}; jobs: # Job to deploy llvm-openmp-dev, runs once as it is independent of the python @@ -48,9 +47,6 @@ jobs: os: [ubuntu-latest, macOS-latest] steps: - uses: actions/checkout@v4 - - name: Set env for HEAD SHA - run: - echo "GITHUB_HEAD_SHA=${{ inputs.commit }}" >> $GITHUB_ENV - name: Create and activate conda env uses: conda-incubator/setup-miniconda@v3 with: @@ -69,43 +65,10 @@ jobs: --token ${{ secrets.ANACONDA_TOKEN }} \ buildscripts/conda-recipes/llvm-openmp-dev; - # Job to deploy llvmlite and numba, matrixed on os and python version. - conda-deploy-llvmlite: + # Job to deploy the pyomp metapackage matrixed on the python version. 
+ conda-deploy-pyomp: needs: conda-deploy-llvm-openmp-dev - name: llvmlite ${{ matrix.os }} ${{ matrix.python-version }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - #os: [ubuntu-latest, macOS-latest, windows-latest] - os: [ubuntu-latest, macOS-latest] - python-version: ["3.8", "3.9", "3.10"] - steps: - - uses: actions/checkout@v4 - - name: Set env for HEAD SHA - run: - echo "GITHUB_HEAD_SHA=${{ inputs.commit }}" >> $GITHUB_ENV - - name: Create and activate conda env - uses: conda-incubator/setup-miniconda@v3 - with: - python-version: ${{ matrix.python-version }} - environment-file: ${{ inputs.env }} - auto-update-conda: false - auto-activate-base: false - show-channel-urls: true - - name: Build and upload llvmlite - run: | - conda remove --name base conda-anaconda-telemetry - conda install -q -y -c conda-forge conda-build conda-verify anaconda-client; - conda config --set anaconda_upload yes; - conda build --user python-for-hpc --label ${{ inputs.label }} \ - -c python-for-hpc -c conda-forge \ - --python ${{ matrix.python-version }} --token ${{ secrets.ANACONDA_TOKEN }} \ - buildscripts/conda-recipes/llvmlite; - - # Job to deploy numba, matrixed on os and python version. - conda-deploy-numba: - needs: conda-deploy-llvmlite - name: numba ${{ matrix.os }} ${{ matrix.python-version }} + name: pyomp ${{ matrix.os }} ${{ matrix.python-version }} runs-on: ${{ matrix.os }} strategy: matrix: @@ -114,42 +77,6 @@ jobs: python-version: ["3.8", "3.9", "3.10"] steps: - uses: actions/checkout@v4 - - name: Set env for HEAD SHA - run: - echo "GITHUB_HEAD_SHA=${{ inputs.commit }}" >> $GITHUB_ENV - - name: Create and activate conda env - uses: conda-incubator/setup-miniconda@v3 - with: - python-version: ${{ matrix.python-version }} - environment-file: ${{ inputs.env }} - auto-update-conda: false - auto-activate-base: false - show-channel-urls: true - - name: Build and upload numba - run: | - conda remove --name base conda-anaconda-telemetry - conda install -q -y -c conda-forge conda-build conda-verify anaconda-client; - conda config --set anaconda_upload yes; - conda build --user python-for-hpc --label ${{ inputs.label }} \ - -c python-for-hpc -c conda-forge \ - --python ${{ matrix.python-version }} --token ${{ secrets.ANACONDA_TOKEN }} \ - buildscripts/conda-recipes/numba; - - # Job to deploy the pyomp metapackage, runs once as it is independent of the - # python version. 
- conda-deploy-pyomp: - needs: conda-deploy-numba - name: pyomp ${{ matrix.os }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - #os: [ubuntu-latest, macOS-latest, windows-latest] - os: [ubuntu-latest, macOS-latest] - steps: - - uses: actions/checkout@v4 - - name: Set env for HEAD SHA - run: - echo "GITHUB_HEAD_SHA=${{ inputs.commit }}" >> $GITHUB_ENV - name: Create and activate conda env uses: conda-incubator/setup-miniconda@v3 with: @@ -164,7 +91,7 @@ jobs: conda install -q -y -c conda-forge conda-build conda-verify anaconda-client; conda config --set anaconda_upload yes; conda build --user python-for-hpc --label ${{ inputs.label }} \ - -c python-for-hpc -c conda-forge \ + -c python-for-hpc -c conda-forge --python ${{ matrix.python-version }} \ --token ${{ secrets.ANACONDA_TOKEN }} \ buildscripts/conda-recipes/pyomp; @@ -191,58 +118,13 @@ jobs: buildscripts/conda-recipes/llvm-openmp-dev; " - conda-deploy-llvmlite-linux-arm64: + conda-deploy-pyomp-linux-arm64: needs: conda-deploy-llvm-openmp-dev-linux-arm64 - name: llvmlite linux-arm64 - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.8", "3.9", "3.10"] - steps: - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - with: - platforms: linux/arm64 - - name: Deploy llvmlite - run: | - docker run --platform linux/arm64 ubuntu:22.04 /bin/bash -el -c " - ${{ env.SETUP_BASE }} - ${{ env.SETUP_MINICONDA }} - ${{ env.SETUP_REPO }} - conda build --user python-for-hpc --label ${{ inputs.label }} \ - -c python-for-hpc -c conda-forge \ - --python ${{ matrix.python-version}} --token ${{ secrets.ANACONDA_TOKEN }} \ - buildscripts/conda-recipes/llvmlite - " - - conda-deploy-numba-linux-arm64: - needs: conda-deploy-llvmlite-linux-arm64 - name: numba linux-arm64 + name: pyomp linux-arm64 runs-on: ubuntu-latest strategy: matrix: python-version: ["3.8", "3.9", "3.10"] - steps: - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - with: - platforms: linux/arm64 - - name: Deploy numba - run: | - docker run --platform linux/arm64 ubuntu:22.04 /bin/bash -el -c " - ${{ env.SETUP_BASE }} - ${{ env.SETUP_MINICONDA }} - ${{ env.SETUP_REPO }} - conda build --user python-for-hpc --label ${{ inputs.label }} \ - -c python-for-hpc -c conda-forge \ - --python ${{ matrix.python-version}} --token ${{ secrets.ANACONDA_TOKEN }} \ - buildscripts/conda-recipes/numba; - " - - conda-deploy-pyomp-linux-arm64: - needs: conda-deploy-numba-linux-arm64 - name: pyomp linux-arm64 - runs-on: ubuntu-latest steps: - name: Set up QEMU uses: docker/setup-qemu-action@v3 @@ -256,6 +138,7 @@ jobs: ${{ env.SETUP_REPO }} conda build --user python-for-hpc --label ${{ inputs.label }} \ -c python-for-hpc -c conda-forge \ + --python ${{ matrix.python-version}} \ --token ${{ secrets.ANACONDA_TOKEN }} \ buildscripts/conda-recipes/pyomp; " \ No newline at end of file diff --git a/.gitlab/jobs/lassen.yml b/.gitlab/jobs/lassen.yml index b394c9a55898..05e1db7bd6ea 100644 --- a/.gitlab/jobs/lassen.yml +++ b/.gitlab/jobs/lassen.yml @@ -66,21 +66,8 @@ build-llvm-openmp-dev-lassen: variables: PYOMP_CI_BUILD_PKG: "llvm-openmp-dev" -build-llvmlite-lassen: +build-pyomp-lassen: extends: [.base-job, .python-variants] needs: ["build-llvm-openmp-dev-lassen"] - variables: - PYOMP_CI_BUILD_PKG: "llvmlite" - -build-numba-lassen: - extends: [.base-job, .python-variants] - needs: ["build-llvmlite-lassen"] - variables: - PYOMP_CI_BUILD_PKG: "numba" - -build-pyomp-lassen: - extends: .base-job - needs: ["build-numba-lassen"] variables: PYOMP_CI_BUILD_PKG: "pyomp" - diff --git 
a/buildscripts/conda-recipes/llvm-openmp-dev/meta.yaml b/buildscripts/conda-recipes/llvm-openmp-dev/meta.yaml index 7b440465ead0..93df663256ce 100644 --- a/buildscripts/conda-recipes/llvm-openmp-dev/meta.yaml +++ b/buildscripts/conda-recipes/llvm-openmp-dev/meta.yaml @@ -10,11 +10,9 @@ source: build: merge_build_host: False - string: h{{ PKG_HASH }}_{{ (GITHUB_HEAD_SHA | default(''))[:7] ~ (CI_COMMIT_SHA | default(''))[:7] }} + string: h{{ PKG_HASH }} script_env: - - PY_VCRUNTIME_REDIST - - GITHUB_HEAD_SHA - - CI_COMMIT_SHA + - PY_VCRUNTIME_REDIST # [win] requirements: build: diff --git a/buildscripts/gitlab/create-conda-pkgs.sh b/buildscripts/gitlab/create-conda-pkgs.sh index 5c7f30b66cbc..7fe406cc45d1 100755 --- a/buildscripts/gitlab/create-conda-pkgs.sh +++ b/buildscripts/gitlab/create-conda-pkgs.sh @@ -40,14 +40,6 @@ case ${PYOMP_CI_BUILD_PKG} in deploy_conda "llvm-openmp-dev" ;; - "llvmlite") - deploy_conda "llvmlite" - ;; - - "numba") - deploy_conda "numba" - ;; - "pyomp") deploy_conda "pyomp" ;; From d2d888dd199fea957a20902c3eaf356d4d2e0af1 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Tue, 22 Apr 2025 23:31:38 -0700 Subject: [PATCH 06/28] Fix to get commit hash for the pyomp build string --- .github/workflows/build-upload-conda-base.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/build-upload-conda-base.yml b/.github/workflows/build-upload-conda-base.yml index e5713bd04580..3e5119e386ac 100644 --- a/.github/workflows/build-upload-conda-base.yml +++ b/.github/workflows/build-upload-conda-base.yml @@ -77,6 +77,10 @@ jobs: python-version: ["3.8", "3.9", "3.10"] steps: - uses: actions/checkout@v4 + # Checkout the repo with history to get the commit hash for the build + # string. + with: + fetch-depth: 0 - name: Create and activate conda env uses: conda-incubator/setup-miniconda@v3 with: From 21bedc076d7ecece49d3dbdec200d03ac9d53781 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 23 Apr 2025 00:43:30 -0700 Subject: [PATCH 07/28] Add llvmdev dependency in pyomp meta.yaml --- buildscripts/conda-recipes/pyomp/meta.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/buildscripts/conda-recipes/pyomp/meta.yaml b/buildscripts/conda-recipes/pyomp/meta.yaml index 4ee7193abc7e..e1d548a30725 100644 --- a/buildscripts/conda-recipes/pyomp/meta.yaml +++ b/buildscripts/conda-recipes/pyomp/meta.yaml @@ -23,11 +23,13 @@ requirements: - pip - setuptools - numba >=0.57, <0.58 + - llvmdev 14.* run: - numba >=0.57, <0.58 - lark - cffi - llvm-openmp-dev + - llvmdev 14.* about: home: https://github.com/Python-for-HPC/PyOMP From 4233b5a5f0e4b5fb2f1d86eb07b25072b771461b Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 23 Apr 2025 00:59:36 -0700 Subject: [PATCH 08/28] Revert license format in pyomp meta.yaml for backwards compat --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index af4b697973bb..113a1bc2e3f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ version = "0.2.0" description = "Python OpenMP library based on Numba" readme = "README.md" requires-python = ">=3.8, <3.12" -license = "BSD-2-Clause" +license = { text = "BSD 2-Clause" } classifiers = [ "Programming Language :: Python :: 3", "Operating System :: OS Independent", From a02c13522e51327a794d209fc2fb62360fa1fcee Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 23 Apr 2025 08:46:58 -0700 Subject: [PATCH 09/28] Update pyomp meta.yaml for cmake as build dep --- 
buildscripts/conda-recipes/pyomp/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/buildscripts/conda-recipes/pyomp/meta.yaml b/buildscripts/conda-recipes/pyomp/meta.yaml index e1d548a30725..81e93ed4d90a 100644 --- a/buildscripts/conda-recipes/pyomp/meta.yaml +++ b/buildscripts/conda-recipes/pyomp/meta.yaml @@ -18,6 +18,7 @@ requirements: build: - {{ compiler('c') }} - {{ compiler('cxx') }} + - cmake host: - python - pip From 314a5a6d2a3cf9d5b4e90a2efa711318f5564d9b Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 23 Apr 2025 08:47:37 -0700 Subject: [PATCH 10/28] Use the arm64 github runner --- .github/workflows/build-upload-conda-base.yml | 89 ++++++++++--------- 1 file changed, 45 insertions(+), 44 deletions(-) diff --git a/.github/workflows/build-upload-conda-base.yml b/.github/workflows/build-upload-conda-base.yml index 3e5119e386ac..9028d73302a0 100644 --- a/.github/workflows/build-upload-conda-base.yml +++ b/.github/workflows/build-upload-conda-base.yml @@ -44,7 +44,7 @@ jobs: strategy: matrix: #os: [ubuntu-latest, macOS-latest, windows-latest] - os: [ubuntu-latest, macOS-latest] + os: [ubuntu-latest, macOS-latest, ubuntu-24.04-arm] steps: - uses: actions/checkout@v4 - name: Create and activate conda env @@ -102,47 +102,48 @@ jobs: ################################################################ # Deploy on ARM64 using QEMU and a docker container for building. ################################################################ - conda-deploy-llvm-openmp-dev-linux-arm64: - name: llvm-openmp-dev linux-arm64 - runs-on: ubuntu-latest - steps: - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - with: - platforms: linux/arm64 - - name: Deploy llvm-openmp-dev - run: | - docker run --platform linux/arm64 ubuntu:22.04 /bin/bash -xel -c " - ${{ env.SETUP_BASE }} - ${{ env.SETUP_MINICONDA }} - ${{ env.SETUP_REPO }} - conda build --user python-for-hpc --label ${{ inputs.label }} \ - -c python-for-hpc -c conda-forge \ - --token ${{ secrets.ANACONDA_TOKEN }} \ - buildscripts/conda-recipes/llvm-openmp-dev; - " + #conda-deploy-llvm-openmp-dev-linux-arm64: + # name: llvm-openmp-dev linux-arm64 + # runs-on: ubuntu-latest + # steps: + # - name: Set up QEMU + # uses: docker/setup-qemu-action@v3 + # with: + # platforms: linux/arm64 + # - name: Deploy llvm-openmp-dev + # run: | + # docker run --platform linux/arm64 ubuntu:22.04 /bin/bash -xel -c " + # ${{ env.SETUP_BASE }} + # ${{ env.SETUP_MINICONDA }} + # ${{ env.SETUP_REPO }} + # conda build --user python-for-hpc --label ${{ inputs.label }} \ + # -c python-for-hpc -c conda-forge \ + # --token ${{ secrets.ANACONDA_TOKEN }} \ + # buildscripts/conda-recipes/llvm-openmp-dev; + # " - conda-deploy-pyomp-linux-arm64: - needs: conda-deploy-llvm-openmp-dev-linux-arm64 - name: pyomp linux-arm64 - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.8", "3.9", "3.10"] - steps: - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - with: - platforms: linux/arm64 - - name: Deploy pyomp - run: | - docker run --platform linux/arm64 ubuntu:22.04 /bin/bash -el -c " - ${{ env.SETUP_BASE }} - ${{ env.SETUP_MINICONDA }} - ${{ env.SETUP_REPO }} - conda build --user python-for-hpc --label ${{ inputs.label }} \ - -c python-for-hpc -c conda-forge \ - --python ${{ matrix.python-version}} \ - --token ${{ secrets.ANACONDA_TOKEN }} \ - buildscripts/conda-recipes/pyomp; - " \ No newline at end of file + #conda-deploy-pyomp-linux-arm64: + # needs: conda-deploy-llvm-openmp-dev-linux-arm64 + # name: pyomp linux-arm64 + # runs-on: 
ubuntu-latest + # strategy: + # matrix: + # python-version: ["3.8", "3.9", "3.10"] + # steps: + # - name: Set up QEMU + # uses: docker/setup-qemu-action@v3 + # with: + # platforms: linux/arm64 + # - name: Deploy pyomp + # run: | + # docker run --platform linux/arm64 ubuntu:22.04 /bin/bash -el -c " + # ${{ env.SETUP_BASE }} + # ${{ env.SETUP_MINICONDA }} + # ${{ env.SETUP_REPO }} + # conda build --user python-for-hpc --label ${{ inputs.label }} \ + # -c python-for-hpc -c conda-forge \ + # --python ${{ matrix.python-version}} \ + # --token ${{ secrets.ANACONDA_TOKEN }} \ + # buildscripts/conda-recipes/pyomp; + # " + \ No newline at end of file From 5d761345913fc440e2ad55b73fa23bef7a553ba1 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 23 Apr 2025 09:01:51 -0700 Subject: [PATCH 11/28] Trigger CI [run gitlab ci] From daebe46c59717ddc56b2a37a69ea49a7f4822ad2 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 23 Apr 2025 10:01:53 -0700 Subject: [PATCH 12/28] Cleanup github workflow for conda deployment --- .github/workflows/build-upload-conda-base.yml | 80 ++----------------- 1 file changed, 5 insertions(+), 75 deletions(-) diff --git a/.github/workflows/build-upload-conda-base.yml b/.github/workflows/build-upload-conda-base.yml index 9028d73302a0..4b1a5f43b454 100644 --- a/.github/workflows/build-upload-conda-base.yml +++ b/.github/workflows/build-upload-conda-base.yml @@ -13,28 +13,6 @@ on: required: true type: string -# Rembember: you have to escape characters like $ with \$. -env: - SETUP_BASE: | - apt-get -qq update > /dev/null; - apt-get -qq upgrade > /dev/null; - apt-get -qq install wget git > /dev/null; - useradd -ms /bin/bash pyompuser; - su pyompuser; - cd /home/pyompuser; - SETUP_MINICONDA: | - wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-\$(uname -m).sh -O miniconda.sh; - bash miniconda.sh -b -u -p ./miniconda3; - rm -rf miniconda.sh; - export PATH=/home/pyompuser/miniconda3/bin:\${PATH}; - conda remove --name base conda-anaconda-telemetry - conda install -q -y -c conda-forge conda-build conda-verify anaconda-client; - conda config --set anaconda_upload yes; - SETUP_REPO: | - git clone https://github.com/Python-for-HPC/PyOMP; - cd PyOMP; - git checkout ${{ inputs.commit }}; - jobs: # Job to deploy llvm-openmp-dev, runs once as it is independent of the python # version. @@ -43,7 +21,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - #os: [ubuntu-latest, macOS-latest, windows-latest] + # TODO: Add windows. os: [ubuntu-latest, macOS-latest, ubuntu-24.04-arm] steps: - uses: actions/checkout@v4 @@ -72,8 +50,8 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - #os: [ubuntu-latest, macOS-latest, windows-latest] - os: [ubuntu-latest, macOS-latest] + # TODO: Add windows. + os: [ubuntu-latest, macOS-latest, ubuntu-24.04-arm] python-version: ["3.8", "3.9", "3.10"] steps: - uses: actions/checkout@v4 @@ -95,55 +73,7 @@ jobs: conda install -q -y -c conda-forge conda-build conda-verify anaconda-client; conda config --set anaconda_upload yes; conda build --user python-for-hpc --label ${{ inputs.label }} \ - -c python-for-hpc -c conda-forge --python ${{ matrix.python-version }} \ + -c python-for-hpc -c conda-forge \ + --python ${{ matrix.python-version }} \ --token ${{ secrets.ANACONDA_TOKEN }} \ buildscripts/conda-recipes/pyomp; - - ################################################################ - # Deploy on ARM64 using QEMU and a docker container for building. 
- ################################################################ - #conda-deploy-llvm-openmp-dev-linux-arm64: - # name: llvm-openmp-dev linux-arm64 - # runs-on: ubuntu-latest - # steps: - # - name: Set up QEMU - # uses: docker/setup-qemu-action@v3 - # with: - # platforms: linux/arm64 - # - name: Deploy llvm-openmp-dev - # run: | - # docker run --platform linux/arm64 ubuntu:22.04 /bin/bash -xel -c " - # ${{ env.SETUP_BASE }} - # ${{ env.SETUP_MINICONDA }} - # ${{ env.SETUP_REPO }} - # conda build --user python-for-hpc --label ${{ inputs.label }} \ - # -c python-for-hpc -c conda-forge \ - # --token ${{ secrets.ANACONDA_TOKEN }} \ - # buildscripts/conda-recipes/llvm-openmp-dev; - # " - - #conda-deploy-pyomp-linux-arm64: - # needs: conda-deploy-llvm-openmp-dev-linux-arm64 - # name: pyomp linux-arm64 - # runs-on: ubuntu-latest - # strategy: - # matrix: - # python-version: ["3.8", "3.9", "3.10"] - # steps: - # - name: Set up QEMU - # uses: docker/setup-qemu-action@v3 - # with: - # platforms: linux/arm64 - # - name: Deploy pyomp - # run: | - # docker run --platform linux/arm64 ubuntu:22.04 /bin/bash -el -c " - # ${{ env.SETUP_BASE }} - # ${{ env.SETUP_MINICONDA }} - # ${{ env.SETUP_REPO }} - # conda build --user python-for-hpc --label ${{ inputs.label }} \ - # -c python-for-hpc -c conda-forge \ - # --python ${{ matrix.python-version}} \ - # --token ${{ secrets.ANACONDA_TOKEN }} \ - # buildscripts/conda-recipes/pyomp; - # " - \ No newline at end of file From ce9f26f59493395593f1cb5e74d21e6580ee0786 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 23 Apr 2025 10:02:46 -0700 Subject: [PATCH 13/28] Update gitlab ci - Remove need for trigger ([run gitlab ci] to run - Use per job conda pkgs dir to avoid conflicts with multiple runners --- .gitlab-ci.yml | 2 -- .gitlab/subscribed-pipelines.yml | 2 -- buildscripts/gitlab/create-conda-pkgs.sh | 4 ++++ 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c73c68fc9558..851416009774 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -85,8 +85,6 @@ stages: strategy: depend forward: pipeline_variables: true - rules: - - if: ($CI_COMMIT_MESSAGE =~ /\[run gitlab ci\]/ || $CI_COMMIT_TAG) include: # Sets ID tokens for every job using `default:` diff --git a/.gitlab/subscribed-pipelines.yml b/.gitlab/subscribed-pipelines.yml index 714f1d78af49..265a344ba850 100644 --- a/.gitlab/subscribed-pipelines.yml +++ b/.gitlab/subscribed-pipelines.yml @@ -23,8 +23,6 @@ --data "{ \"state\": \"failure\", \"target_url\": \"${CI_PIPELINE_URL}\", \"description\": \"GitLab ${CI_MACHINE} down\", \"context\": \"ci/gitlab/${CI_MACHINE}\" }" exit 1 fi - rules: - - if: ($CI_COMMIT_MESSAGE =~ /\[run gitlab ci\]/ || $CI_COMMIT_TAG) ### # Trigger a build-and-test pipeline for a machine. diff --git a/buildscripts/gitlab/create-conda-pkgs.sh b/buildscripts/gitlab/create-conda-pkgs.sh index 7fe406cc45d1..a143f7e6d8a9 100755 --- a/buildscripts/gitlab/create-conda-pkgs.sh +++ b/buildscripts/gitlab/create-conda-pkgs.sh @@ -7,6 +7,10 @@ else LABEL="test" fi +# Set pkg dir per job to avoid conflicts. 
+export CONDA_PKGS_DIRS=/tmp/ggeorgak/conda-pkgs-${CI_JOB_ID} +mkdir -p "$CONDA_PKGS_DIRS" + function deploy_conda() { PKG="${1}" From ebad638f757205676e58c94cd5b7a6970fc871f1 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 23 Apr 2025 11:19:54 -0700 Subject: [PATCH 14/28] Add tests --- buildscripts/conda-recipes/pyomp/run_test.sh | 61 ++++++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 buildscripts/conda-recipes/pyomp/run_test.sh diff --git a/buildscripts/conda-recipes/pyomp/run_test.sh b/buildscripts/conda-recipes/pyomp/run_test.sh new file mode 100644 index 000000000000..d14a16db5ce1 --- /dev/null +++ b/buildscripts/conda-recipes/pyomp/run_test.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +set -e + +export NUMBA_DEVELOPER_MODE=1 +export NUMBA_DISABLE_ERROR_MESSAGE_HIGHLIGHTING=1 +export NUMBA_CAPTURED_ERRORS="new_style" +export PYTHONFAULTHANDLER=1 + +# Disable NumPy dispatching to AVX512_SKX feature extensions if the chip is +# reported to support the feature and NumPy >= 1.22 as this results in the use +# of low accuracy SVML libm replacements in ufunc loops. +_NPY_CMD='from numba.misc import numba_sysinfo;\ + sysinfo=numba_sysinfo.get_sysinfo();\ + print(sysinfo["NumPy AVX512_SKX detected"] and + sysinfo["NumPy Version"]>="1.22")' +NUMPY_DETECTS_AVX512_SKX_NP_GT_122=$(python -c "$_NPY_CMD") +echo "NumPy >= 1.22 with AVX512_SKX detected: $NUMPY_DETECTS_AVX512_SKX_NP_GT_122" + +if [[ "$NUMPY_DETECTS_AVX512_SKX_NP_GT_122" == "True" ]]; then + export NPY_DISABLE_CPU_FEATURES="AVX512_SKX" +fi + +unamestr=`uname` +if [[ "$unamestr" == 'Linux' ]]; then + # Test if catchsegv exists, not by default in recent libc. + if catchsegv --version; then + SEGVCATCH=catchsegv + else + SEGVCATCH="" + fi +elif [[ "$unamestr" == 'Darwin' ]]; then + SEGVCATCH="" +else + echo Error +fi + +# Run OpenMP tests in a single-process since they use multiple cores by +# multi-threading. Using multiple processes for testing will very probably slow +# things down. +# XXX: Using -m $TEST_NPROCS, even if with 1 process, hangs on github runners +# when running the full testsuite, while individual tests pass. This requires +# more investigation. Some observations: 1) running the full test suite creates +# new threads for each region, the old ones are blocked in a futex for +# destruction, 2) it is possible that in small github runners this starves cpu +# time, 3) there may be implications with "-m 1" vs. no flag on how the runtime +# library is inited/de-inited. + +echo "=> Run OpenMP CPU parallelism tests" +echo "=> Running: TEST_DEVICES=0 RUN_TARGET=0 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp" +# TODO: remove requiring the unused TEST_DEVICES. 
+TEST_DEVICES=0 RUN_TARGET=0 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp 2>&1 + +echo "=> Run OpenMP offloading tests on CPU (device 1)" +echo "=> Running: TEST_DEVICES=1 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget" +TEST_DEVICES=1 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget 2>&1 +if nvidia-smi --list-gpus; then + echo "=> Found NVIDIA GPU, Run OpenMP offloading tests on GPU (device 0)" + echo "=> Running: TEST_DEVICES=0 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget" + TEST_DEVICES=0 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget 2>&1 +fi diff --git a/pyproject.toml b/pyproject.toml index 113a1bc2e3f4..66178c9b56a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,4 +23,4 @@ Homepage = "https://github.com/Python-for-HPC/PyOMP" Issues = "https://github.com/Python-for-HPC/PyOMP/issues" [tool.setuptools] -packages = ["numba.openmp"] +packages = ["numba.openmp", "numba.openmp.libs", "numba.openmp.tests"] From 76b66ce1a531b22ad4cd5f287bf0c31abd58e947 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 23 Apr 2025 11:37:19 -0700 Subject: [PATCH 15/28] Remove numba.openmp.libs - It is created at build time, hence breaks the toml unless removed --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 66178c9b56a4..7316153ba0e8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,4 +23,4 @@ Homepage = "https://github.com/Python-for-HPC/PyOMP" Issues = "https://github.com/Python-for-HPC/PyOMP/issues" [tool.setuptools] -packages = ["numba.openmp", "numba.openmp.libs", "numba.openmp.tests"] +packages = ["numba.openmp", "numba.openmp.tests"] From f3b25b84b15e29e2f4de4126ea0e5627ac1cc7e9 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 23 Apr 2025 19:13:48 -0700 Subject: [PATCH 16/28] Debug --- buildscripts/conda-recipes/pyomp/run_test.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/buildscripts/conda-recipes/pyomp/run_test.sh b/buildscripts/conda-recipes/pyomp/run_test.sh index d14a16db5ce1..3adc734f119e 100644 --- a/buildscripts/conda-recipes/pyomp/run_test.sh +++ b/buildscripts/conda-recipes/pyomp/run_test.sh @@ -59,3 +59,7 @@ if nvidia-smi --list-gpus; then echo "=> Running: TEST_DEVICES=0 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget" TEST_DEVICES=0 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget 2>&1 fi + +opt_path=$(which opt) +echo "OPT_PATH ${opt_path}" +opt --version From 9f21ac2326824f64ac78ad056267ddd21f04432d Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 23 Apr 2025 19:23:31 -0700 Subject: [PATCH 17/28] Debug --- buildscripts/conda-recipes/pyomp/run_test.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/buildscripts/conda-recipes/pyomp/run_test.sh b/buildscripts/conda-recipes/pyomp/run_test.sh index 3adc734f119e..190fd359596e 100644 --- a/buildscripts/conda-recipes/pyomp/run_test.sh +++ b/buildscripts/conda-recipes/pyomp/run_test.sh @@ -35,6 +35,10 @@ else echo Error fi +opt_path=$(which opt) +echo "OPT_PATH ${opt_path}" +opt --version + # Run OpenMP tests in a single-process since they use multiple cores by # multi-threading. 
Using multiple processes for testing will very probably slow # things down. @@ -60,6 +64,3 @@ if nvidia-smi --list-gpus; then TEST_DEVICES=0 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget 2>&1 fi -opt_path=$(which opt) -echo "OPT_PATH ${opt_path}" -opt --version From e7e31ec301d3130cea4ed193d9f9bfe80b8cd3df Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Thu, 24 Apr 2025 09:17:01 -0700 Subject: [PATCH 18/28] Refactor to fix packaging and cleanup - Move nrt static and the llvm plugin pass under libs subdirectory - Rename libbundle to libnrt_static - Fix setup.py to move build libraries in wheel-used build directory --- buildscripts/conda-recipes/pyomp/run_test.sh | 5 -- numba/openmp/__init__.py | 2 +- numba/openmp/{ => libs}/nrt/init.c | 0 .../{ => libs}/pass/CGIntrinsicsOpenMP.cpp | 0 .../{ => libs}/pass/CGIntrinsicsOpenMP.h | 0 numba/openmp/{ => libs}/pass/CMakeLists.txt | 0 numba/openmp/{ => libs}/pass/DebugOpenMP.cpp | 0 numba/openmp/{ => libs}/pass/DebugOpenMP.h | 0 .../{ => libs}/pass/IntrinsicsOpenMP.cpp | 0 .../openmp/{ => libs}/pass/IntrinsicsOpenMP.h | 0 .../{ => libs}/pass/IntrinsicsOpenMP_CAPI.h | 0 pyproject.toml | 4 +- setup.py | 57 ++++++++++--------- 13 files changed, 34 insertions(+), 34 deletions(-) rename numba/openmp/{ => libs}/nrt/init.c (100%) rename numba/openmp/{ => libs}/pass/CGIntrinsicsOpenMP.cpp (100%) rename numba/openmp/{ => libs}/pass/CGIntrinsicsOpenMP.h (100%) rename numba/openmp/{ => libs}/pass/CMakeLists.txt (100%) rename numba/openmp/{ => libs}/pass/DebugOpenMP.cpp (100%) rename numba/openmp/{ => libs}/pass/DebugOpenMP.h (100%) rename numba/openmp/{ => libs}/pass/IntrinsicsOpenMP.cpp (100%) rename numba/openmp/{ => libs}/pass/IntrinsicsOpenMP.h (100%) rename numba/openmp/{ => libs}/pass/IntrinsicsOpenMP_CAPI.h (100%) diff --git a/buildscripts/conda-recipes/pyomp/run_test.sh b/buildscripts/conda-recipes/pyomp/run_test.sh index 190fd359596e..d14a16db5ce1 100644 --- a/buildscripts/conda-recipes/pyomp/run_test.sh +++ b/buildscripts/conda-recipes/pyomp/run_test.sh @@ -35,10 +35,6 @@ else echo Error fi -opt_path=$(which opt) -echo "OPT_PATH ${opt_path}" -opt --version - # Run OpenMP tests in a single-process since they use multiple cores by # multi-threading. Using multiple processes for testing will very probably slow # things down. @@ -63,4 +59,3 @@ if nvidia-smi --list-gpus; then echo "=> Running: TEST_DEVICES=0 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget" TEST_DEVICES=0 RUN_TARGET=1 $SEGVCATCH python -m numba.runtests -v -- numba.openmp.tests.test_openmp.TestOpenmpTarget 2>&1 fi - diff --git a/numba/openmp/__init__.py b/numba/openmp/__init__.py index 52e31d4a45c4..1ceeb87bef08 100644 --- a/numba/openmp/__init__.py +++ b/numba/openmp/__init__.py @@ -2628,7 +2628,7 @@ def prepend_device_to_func_name(outlined_ir): # Do whole archive to include all symbols, esp. for the # PyOMP_NRT_Init constructor. 
"--whole-archive", - libpath / "libbundle.a", + libpath / "libnrt_static.a", "--no-whole-archive", "-o", filename_so, diff --git a/numba/openmp/nrt/init.c b/numba/openmp/libs/nrt/init.c similarity index 100% rename from numba/openmp/nrt/init.c rename to numba/openmp/libs/nrt/init.c diff --git a/numba/openmp/pass/CGIntrinsicsOpenMP.cpp b/numba/openmp/libs/pass/CGIntrinsicsOpenMP.cpp similarity index 100% rename from numba/openmp/pass/CGIntrinsicsOpenMP.cpp rename to numba/openmp/libs/pass/CGIntrinsicsOpenMP.cpp diff --git a/numba/openmp/pass/CGIntrinsicsOpenMP.h b/numba/openmp/libs/pass/CGIntrinsicsOpenMP.h similarity index 100% rename from numba/openmp/pass/CGIntrinsicsOpenMP.h rename to numba/openmp/libs/pass/CGIntrinsicsOpenMP.h diff --git a/numba/openmp/pass/CMakeLists.txt b/numba/openmp/libs/pass/CMakeLists.txt similarity index 100% rename from numba/openmp/pass/CMakeLists.txt rename to numba/openmp/libs/pass/CMakeLists.txt diff --git a/numba/openmp/pass/DebugOpenMP.cpp b/numba/openmp/libs/pass/DebugOpenMP.cpp similarity index 100% rename from numba/openmp/pass/DebugOpenMP.cpp rename to numba/openmp/libs/pass/DebugOpenMP.cpp diff --git a/numba/openmp/pass/DebugOpenMP.h b/numba/openmp/libs/pass/DebugOpenMP.h similarity index 100% rename from numba/openmp/pass/DebugOpenMP.h rename to numba/openmp/libs/pass/DebugOpenMP.h diff --git a/numba/openmp/pass/IntrinsicsOpenMP.cpp b/numba/openmp/libs/pass/IntrinsicsOpenMP.cpp similarity index 100% rename from numba/openmp/pass/IntrinsicsOpenMP.cpp rename to numba/openmp/libs/pass/IntrinsicsOpenMP.cpp diff --git a/numba/openmp/pass/IntrinsicsOpenMP.h b/numba/openmp/libs/pass/IntrinsicsOpenMP.h similarity index 100% rename from numba/openmp/pass/IntrinsicsOpenMP.h rename to numba/openmp/libs/pass/IntrinsicsOpenMP.h diff --git a/numba/openmp/pass/IntrinsicsOpenMP_CAPI.h b/numba/openmp/libs/pass/IntrinsicsOpenMP_CAPI.h similarity index 100% rename from numba/openmp/pass/IntrinsicsOpenMP_CAPI.h rename to numba/openmp/libs/pass/IntrinsicsOpenMP_CAPI.h diff --git a/pyproject.toml b/pyproject.toml index 7316153ba0e8..9842df1f018c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools>=77.0", "wheel", "numba>=0.57, <0.58", "cmake>=3.20"] +requires = ["setuptools>=77.0.3", "wheel", "numba>=0.57, <0.58", "cmake>=3.20"] build-backend = "setuptools.build_meta" [project] @@ -8,7 +8,7 @@ version = "0.2.0" description = "Python OpenMP library based on Numba" readme = "README.md" requires-python = ">=3.8, <3.12" -license = { text = "BSD 2-Clause" } +license = "BSD-2-Clause" classifiers = [ "Programming Language :: Python :: 3", "Operating System :: OS Independent", diff --git a/setup.py b/setup.py index d854b277c4c0..d7fa0025120d 100644 --- a/setup.py +++ b/setup.py @@ -1,29 +1,19 @@ -# setup.py -import os import numba import sysconfig import subprocess import shutil import numpy as np -from pathlib import Path from setuptools import setup, Extension from setuptools.command.build_ext import build_ext from setuptools.command.build_clib import build_clib -temp_dir = Path("numba/openmp/nrt/numba_src") - -bundle_lib = ( - "bundle", +nrt_static = ( + "nrt_static", { + # We extend those sources with the ones from the numba tree. 
"sources": [ - "numba/openmp/nrt/init.c", - f"{temp_dir}/_helpermod.c", - f"{temp_dir}/cext/utils.c", - f"{temp_dir}/cext/dictobject.c", - f"{temp_dir}/cext/listobject.c", - f"{temp_dir}/core/runtime/_nrt_pythonmod.c", - f"{temp_dir}/core/runtime/nrt.cpp", + "numba/openmp/libs/nrt/init.c", ], "include_dirs": [ sysconfig.get_paths()["include"], @@ -33,15 +23,15 @@ ) -class BuildStaticNRTBundle(build_clib): +class BuildStaticNRT(build_clib): def finalize_options(self): super().finalize_options() - # Copy numba tree installation to the build directory for building the + # Copy numba tree installation to the temp directory for building the # static library using relative paths. numba_dir = numba.__path__[0] shutil.copytree( numba_dir, - temp_dir, + f"{self.build_temp}/numba_src", ignore=shutil.ignore_patterns( "*.py", "*.pyc", @@ -52,13 +42,28 @@ def finalize_options(self): dirs_exist_ok=True, ) - self.build_clib = "numba/openmp/libs" + libname, build_info = self.libraries[0] + if libname != "nrt_static": + raise Exception("Expected library name 'nrt_static'") + if len(self.libraries) != 1: + raise Exception("Expected only the `nrt_static' library in the list") - def run(self): - super().run() + sources = build_info["sources"] + sources.extend( + [ + f"{self.build_temp}/numba_src/_helpermod.c", + f"{self.build_temp}/numba_src/cext/utils.c", + f"{self.build_temp}/numba_src/cext/dictobject.c", + f"{self.build_temp}/numba_src/cext/listobject.c", + f"{self.build_temp}/numba_src/core/runtime/_nrt_pythonmod.c", + f"{self.build_temp}/numba_src/core/runtime/nrt.cpp", + ] + ) - # Clean up files after build is completed. - shutil.rmtree(temp_dir, ignore_errors=True) + # Get build_lib directory from the 'build' command. + build_cmd = self.get_finalized_command("build") + # Build the static library in the wheel output build directory. 
+ self.build_clib = f"{build_cmd.build_lib}/numba/openmp/libs" class CMakeExtension(Extension): @@ -89,7 +94,7 @@ def build_cmake(self, ext): "-B", self.build_temp, "-DCMAKE_BUILD_TYPE=Release", - "-DCMAKE_INSTALL_PREFIX=numba/openmp/libs", + f"-DCMAKE_INSTALL_PREFIX={self.build_lib}/numba/openmp/libs", ], check=True, ) @@ -102,10 +107,10 @@ def build_cmake(self, ext): setup( - libraries=[bundle_lib], - ext_modules=[CMakeExtension("libIntrinsicsOpenMP", "numba/openmp/pass")], + libraries=[nrt_static], + ext_modules=[CMakeExtension("libIntrinsicsOpenMP", "numba/openmp/libs/pass")], cmdclass={ - "build_clib": BuildStaticNRTBundle, + "build_clib": BuildStaticNRT, "build_ext": BuildIntrinsicsOpenMPPass, }, ) From 6aeecfded82fe6f9bd167b175a1ddac525f45a34 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Thu, 24 Apr 2025 09:46:18 -0700 Subject: [PATCH 19/28] Change again pyproject license for old format --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9842df1f018c..fef5f6064cd9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ version = "0.2.0" description = "Python OpenMP library based on Numba" readme = "README.md" requires-python = ">=3.8, <3.12" -license = "BSD-2-Clause" +license = { text = "BSD-2-Clause" } classifiers = [ "Programming Language :: Python :: 3", "Operating System :: OS Independent", From 2c455b75da6a9021a0c04a3fdd42ac7409088404 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Thu, 24 Apr 2025 10:09:01 -0700 Subject: [PATCH 20/28] Do not load libomptarget if there's no gpu to avoid error --- numba/openmp/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/numba/openmp/__init__.py b/numba/openmp/__init__.py index 1ceeb87bef08..9613f78bd58b 100644 --- a/numba/openmp/__init__.py +++ b/numba/openmp/__init__.py @@ -480,6 +480,10 @@ def _init(): if sys_platform.startswith("darwin") or sys_platform.startswith("win32"): return + # libomptarget errors out if gpu is not available. + if not numba_cuda.is_available(): + return + omptargetlib = llvm_libpath + "/libomptarget.so" if DEBUG_OPENMP >= 1: print("Found OpenMP target runtime library at", omptargetlib) From c44a9d423ed14ef531cd3b958c406528bd485b74 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Thu, 24 Apr 2025 11:08:37 -0700 Subject: [PATCH 21/28] Fix map_clause parsing --- numba/openmp/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/numba/openmp/__init__.py b/numba/openmp/__init__.py index 9613f78bd58b..7baefacb8ecb 100644 --- a/numba/openmp/__init__.py +++ b/numba/openmp/__init__.py @@ -5248,8 +5248,9 @@ def map_clause(self, args): var_list = args[1] assert len(args) == 2 else: - map_type = "TOFROM" # is this default right? FIX ME - var_list = args[0] + # TODO: is this default right? + map_type = "TOFROM" + var_list = args[1] ret = [] for var in var_list: ret.append(openmp_tag("QUAL.OMP.MAP." + map_type, var)) From c0f5def1fb22ae047a999af2e8d6614e42824e7f Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Thu, 24 Apr 2025 12:25:53 -0700 Subject: [PATCH 22/28] Revert "Do not load libomptarget if there's no gpu to avoid error" This reverts commit 2c455b75da6a9021a0c04a3fdd42ac7409088404. 
--- numba/openmp/__init__.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/numba/openmp/__init__.py b/numba/openmp/__init__.py index 7baefacb8ecb..bd94bcd09913 100644 --- a/numba/openmp/__init__.py +++ b/numba/openmp/__init__.py @@ -480,10 +480,6 @@ def _init(): if sys_platform.startswith("darwin") or sys_platform.startswith("win32"): return - # libomptarget errors out if gpu is not available. - if not numba_cuda.is_available(): - return - omptargetlib = llvm_libpath + "/libomptarget.so" if DEBUG_OPENMP >= 1: print("Found OpenMP target runtime library at", omptargetlib) From b85eab28d2bc0b4dbf82a280b504ed411ab619b5 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Thu, 24 Apr 2025 14:42:07 -0700 Subject: [PATCH 23/28] Use the compiler driver to create shared lib of host offload targets - Avoids missing symbols errors that libomptarget host RTL expects --- numba/openmp/__init__.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/numba/openmp/__init__.py b/numba/openmp/__init__.py index bd94bcd09913..cfc7e57a40ed 100644 --- a/numba/openmp/__init__.py +++ b/numba/openmp/__init__.py @@ -2622,14 +2622,16 @@ def prepend_device_to_func_name(outlined_ir): subprocess.run( [ - "ld", + # Use the compiler driver to create the shared library + # and avoid missing symbols. + "c++", "-shared", filename_o, # Do whole archive to include all symbols, esp. for the # PyOMP_NRT_Init constructor. - "--whole-archive", + "-Wl,--whole-archive", libpath / "libnrt_static.a", - "--no-whole-archive", + "-Wl,--no-whole-archive", "-o", filename_so, ], From d64b9c9317ad2b67638a17d9e474ff9c1ebc6f87 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Thu, 24 Apr 2025 17:29:53 -0700 Subject: [PATCH 24/28] Clone with full history for versioning in gitlab ci --- buildscripts/gitlab/create-conda-pkgs.sh | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/buildscripts/gitlab/create-conda-pkgs.sh b/buildscripts/gitlab/create-conda-pkgs.sh index a143f7e6d8a9..71088faed009 100755 --- a/buildscripts/gitlab/create-conda-pkgs.sh +++ b/buildscripts/gitlab/create-conda-pkgs.sh @@ -7,6 +7,18 @@ else LABEL="test" fi +# Create a temporary directory for the build to clone the full repo for package +# versioning. +TMPDIR=/tmp/ggeorgak/${CI_JOB_ID} +mkdir -p ${TMPDIR} +pushd ${TMPDIR} + +# Clone and fetch the commit with history for package versioning. +git clone https://github.com/${GITHUB_PROJECT_ORG}/${GITHUB_PROJECT_NAME}.git --single-branch +cd ${GITHUB_PROJECT_NAME} +git fetch origin ${CI_COMMIT_SHA} +git checkout ${CI_COMMIT_SHA} + # Set pkg dir per job to avoid conflicts. 
export CONDA_PKGS_DIRS=/tmp/ggeorgak/conda-pkgs-${CI_JOB_ID} mkdir -p "$CONDA_PKGS_DIRS" @@ -22,13 +34,13 @@ function deploy_conda() { export CONDA_BLD_PATH="/tmp/ggeorgak/conda-build-${PYOMP_CI_BUILD_PKG}-noarch" conda build --no-lock --no-locking --user python-for-hpc --label ${LABEL} \ -c python-for-hpc/label/${LABEL} -c conda-forge \ - ${CI_PROJECT_DIR}/buildscripts/conda-recipes/${PKG} + buildscripts/conda-recipes/${PKG} else export CONDA_BLD_PATH="/tmp/ggeorgak/conda-build-${PYOMP_CI_BUILD_PKG}-${PYOMP_CI_PYTHON_VERSION}" conda build --no-lock --no-locking --user python-for-hpc --label ${LABEL} \ -c python-for-hpc/label/${LABEL} -c conda-forge \ --python ${PYOMP_CI_PYTHON_VERSION} \ - ${CI_PROJECT_DIR}/buildscripts/conda-recipes/${PKG} + buildscripts/conda-recipes/${PKG} fi rm -rf ${CONDA_BLD_PATH} @@ -55,3 +67,4 @@ case ${PYOMP_CI_BUILD_PKG} in esac +popd From 5873d1540b114dd4913bc8f6d8e043137fce1d85 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Thu, 24 Apr 2025 18:09:04 -0700 Subject: [PATCH 25/28] Simplify github workflows - Remove unused commit input - Add paths on pull_request event --- .github/workflows/build-upload-conda-base.yml | 3 --- .github/workflows/build-upload-conda-test.yml | 8 ++++++-- .github/workflows/build-upload-conda.yml | 3 +-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build-upload-conda-base.yml b/.github/workflows/build-upload-conda-base.yml index 4b1a5f43b454..e66950aa1bf1 100644 --- a/.github/workflows/build-upload-conda-base.yml +++ b/.github/workflows/build-upload-conda-base.yml @@ -6,9 +6,6 @@ on: label: required: true type: string - commit: - required: true - type: string env: required: true type: string diff --git a/.github/workflows/build-upload-conda-test.yml b/.github/workflows/build-upload-conda-test.yml index e4f17f9d0d8d..9f26990014e4 100644 --- a/.github/workflows/build-upload-conda-test.yml +++ b/.github/workflows/build-upload-conda-test.yml @@ -2,6 +2,11 @@ name: Deploy conda pkgs (test) on: pull_request: + paths: + - "buildscripts/conda-recipes/**" + - ".github/workflows/build-upload-conda-test.yml" + - ".github/workflows/build-upload-conda-base.yml" + - "numba/**" workflow_dispatch: jobs: @@ -9,6 +14,5 @@ jobs: uses: ./.github/workflows/build-upload-conda-base.yml with: label: test - commit: ${{ github.event.pull_request.head.sha }} env: .github/workflows/envs/env-test.yml - secrets: inherit \ No newline at end of file + secrets: inherit diff --git a/.github/workflows/build-upload-conda.yml b/.github/workflows/build-upload-conda.yml index 565be3389f25..96e69538316b 100644 --- a/.github/workflows/build-upload-conda.yml +++ b/.github/workflows/build-upload-conda.yml @@ -10,10 +10,9 @@ jobs: uses: ./.github/workflows/build-upload-conda-base.yml with: label: main - commit: ${{ github.sha }} env: .github/workflows/envs/env.yml secrets: inherit deploy-containers: needs: deploy-conda uses: ./.github/workflows/build-containers.yml - secrets: inherit \ No newline at end of file + secrets: inherit From fd9df32ae9e8ddfacc45ca8b9b3415114f5c3e36 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Fri, 25 Apr 2025 06:12:03 -0700 Subject: [PATCH 26/28] Update examples and code --- buildscripts/containers/examples/hello.py | 5 +- buildscripts/containers/examples/pi_loop.py | 6 +- buildscripts/containers/examples/pi_spmd.py | 19 ++- buildscripts/containers/examples/pi_task.py | 32 +++-- docs/source/usage.rst | 4 +- examples/TestDataEnv.py | 112 ++++++++-------- examples/dgemm_ompGPU.py | 137 
+++++++++++--------- examples/piParLoopGPU_BUD.py | 18 +-- examples/piParLoopGPU_BUD_combined.py | 16 ++- examples/piParLoopGPU_loop.py | 16 ++- examples/pi_loop.py | 6 +- examples/pi_spmd.py | 19 ++- examples/pi_task.py | 30 +++-- 13 files changed, 243 insertions(+), 177 deletions(-) diff --git a/buildscripts/containers/examples/hello.py b/buildscripts/containers/examples/hello.py index 11de2aa5f76e..e4f2f93f3c14 100644 --- a/buildscripts/containers/examples/hello.py +++ b/buildscripts/containers/examples/hello.py @@ -1,9 +1,12 @@ -from numba import njit +from numba.openmp import njit from numba.openmp import openmp_context as openmp from numba.openmp import omp_get_thread_num, omp_get_num_threads + @njit def hello(): with openmp("parallel"): print("Hello thread", omp_get_thread_num(), " of ", omp_get_num_threads()) + + hello() diff --git a/buildscripts/containers/examples/pi_loop.py b/buildscripts/containers/examples/pi_loop.py index 39f52d18db3a..356f2a1d2d69 100644 --- a/buildscripts/containers/examples/pi_loop.py +++ b/buildscripts/containers/examples/pi_loop.py @@ -1,6 +1,7 @@ -from numba import njit +from numba.openmp import njit from numba.openmp import openmp_context as openmp + @njit def calc_pi(): num_steps = 100000 @@ -10,10 +11,11 @@ def calc_pi(): with openmp("parallel for reduction(+:the_sum) schedule(static)"): for j in range(num_steps): c = step - x = ((j-1) - 0.5) * step + x = ((j - 1) - 0.5) * step the_sum += 4.0 / (1.0 + x * x) pi = step * the_sum return pi + print("pi =", calc_pi()) diff --git a/buildscripts/containers/examples/pi_spmd.py b/buildscripts/containers/examples/pi_spmd.py index 38e582b81967..9a6775c33cdc 100644 --- a/buildscripts/containers/examples/pi_spmd.py +++ b/buildscripts/containers/examples/pi_spmd.py @@ -1,14 +1,20 @@ -from numba import njit +from numba.openmp import njit from numba.openmp import openmp_context as openmp -from numba.openmp import omp_set_num_threads, omp_get_thread_num, omp_get_num_threads, omp_get_wtime +from numba.openmp import ( + omp_set_num_threads, + omp_get_thread_num, + omp_get_num_threads, + omp_get_wtime, +) import numpy as np + @njit def f1(): num_steps = 100000000 step = 1.0 / num_steps - MAX_THREADS=8 - for j in range(1,MAX_THREADS+1): + MAX_THREADS = 8 + for j in range(1, MAX_THREADS + 1): tsum = np.zeros(j) omp_set_num_threads(j) @@ -25,15 +31,16 @@ def f1(): x = (i + 0.5) * step local_sum += 4.0 / (1.0 + x * x) -# print("foo:", j, tid, local_sum) + # print("foo:", j, tid, local_sum) tsum[tid] = local_sum -# print("tsum:", tsum) + # print("tsum:", tsum) full_sum = np.sum(tsum) pi = step * full_sum runtime = omp_get_wtime() - start_time print("pi = ", pi, "runtime = ", runtime, j) + f1() print("DONE") diff --git a/buildscripts/containers/examples/pi_task.py b/buildscripts/containers/examples/pi_task.py index fe793c25fc5a..e8f1edb58c10 100644 --- a/buildscripts/containers/examples/pi_task.py +++ b/buildscripts/containers/examples/pi_task.py @@ -1,12 +1,18 @@ -from numba import njit +from numba.openmp import njit from numba.openmp import openmp_context as openmp -from numba.openmp import omp_set_num_threads, omp_get_thread_num, omp_get_num_threads, omp_get_wtime +from numba.openmp import ( + omp_set_num_threads, + omp_get_thread_num, + omp_get_num_threads, + omp_get_wtime, +) import numpy as np + @njit def pi_comp(Nstart, Nfinish, step): MIN_BLK = 256 - #MIN_BLK = 1024*1024*256 + # MIN_BLK = 1024*1024*256 pi_sum = 0.0 if Nfinish - Nstart < MIN_BLK: for i in range(Nstart, Nfinish): @@ -16,25 +22,26 @@ def pi_comp(Nstart, 
Nfinish, step): iblk = Nfinish - Nstart pi_sum1 = 0.0 pi_sum2 = 0.0 - cut = Nfinish-(iblk // 2) + cut = Nfinish - (iblk // 2) with openmp("task shared(pi_sum1)"): pi_sum1 = pi_comp(Nstart, cut, step) with openmp("task shared(pi_sum2)"): pi_sum2 = pi_comp(cut, Nfinish, step) with openmp("taskwait"): pi_sum = pi_sum1 + pi_sum2 -# pi_sum1 = pi_comp(Nstart, cut, step) -# pi_sum2 = pi_comp(cut, Nfinish, step) -# pi_sum = pi_sum1 + pi_sum2 - return pi_sum + # pi_sum1 = pi_comp(Nstart, cut, step) + # pi_sum2 = pi_comp(cut, Nfinish, step) + # pi_sum = pi_sum1 + pi_sum2 + return pi_sum + @njit def f1(lb, num_steps): step = 1.0 / num_steps - MAX_THREADS=4 + MAX_THREADS = 4 tsum = np.zeros(MAX_THREADS) - for j in range(1,MAX_THREADS+1): + for j in range(1, MAX_THREADS + 1): omp_set_num_threads(j) full_sum = 0.0 start_time = omp_get_wtime() @@ -48,9 +55,10 @@ def f1(lb, num_steps): runtime = omp_get_wtime() - start_time print("pi = ", pi, "runtime = ", runtime, j) + lb = 0 num_steps = 1024 -#num_steps = 1024*1024*1024 -#num_steps = 1000000000 +# num_steps = 1024*1024*1024 +# num_steps = 1000000000 f1(lb, num_steps) print("DONE") diff --git a/docs/source/usage.rst b/docs/source/usage.rst index c4fc6cd353ab..e1f289088def 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -15,7 +15,7 @@ Diving right in, this is a minimal, parallel `hello world` example: .. code-block:: python :linenos: - from numba import njit + from numba.openmp import njit from numba.openmp import openmp_context as openmp from numba.openmp import omp_get_thread_num @@ -69,7 +69,7 @@ thread-blocks on the GPU device: .. code-block:: python :linenos: - from numba import njit + from numba.openmp import njit from numba.openmp import openmp_context as openmp from numba.openmp import omp_get_thread_num import numpy as np diff --git a/examples/TestDataEnv.py b/examples/TestDataEnv.py index 7c05e7d43ecb..311e596bac3b 100755 --- a/examples/TestDataEnv.py +++ b/examples/TestDataEnv.py @@ -1,81 +1,91 @@ # # Test individual constructs from OpenMP # -from numba import njit +from numba.openmp import njit import numpy as np from numba.openmp import openmp_context as openmp -from numba.openmp import omp_get_wtime, omp_get_thread_num, omp_get_num_threads,omp_set_num_threads +from numba.openmp import ( + omp_get_wtime, + omp_get_thread_num, + omp_get_num_threads, + omp_set_num_threads, +) + ############################################################################## @njit def testOMP(): - x = 5 - y = 3 - zfp = 2 - zsh = 7 + x = 5 + y = 3 + zfp = 2 + zsh = 7 nerr = 0 nsing = 0 NTHREADS = 4 numthrds = 0 omp_set_num_threads(NTHREADS) - vals = np.zeros(NTHREADS) - valsfp = np.zeros(NTHREADS) + vals = np.zeros(NTHREADS) + valsfp = np.zeros(NTHREADS) - with openmp ("parallel private(x) shared(zsh) firstprivate(zfp) private(ID)"): - ID = omp_get_thread_num() - with openmp("single"): - nsing = nsing+1 + with openmp("parallel private(x) shared(zsh) firstprivate(zfp) private(ID)"): + ID = omp_get_thread_num() + with openmp("single"): + nsing = nsing + 1 numthrds = omp_get_num_threads() - if (y != 3): - nerr = nerr+1 - print("Shared Default status failure y = ",y," It should equal 3"); - with openmp("single"): - if(x == 5): - pass -# nerr = nerr+1 -# print("Private clause failed, variable x = original variable ",x," it should be undefined") + if y != 3: + nerr = nerr + 1 + print("Shared Default status failure y = ", y, " It should equal 3") + with openmp("single"): + if x == 5: + pass + # nerr = nerr+1 + # print("Private clause failed, variable x = 
original variable ",x," it should be undefined") - # verify each thread sees the same variable vsh - with openmp("critical"): - zsh = zsh+ID + # verify each thread sees the same variable vsh + with openmp("critical"): + zsh = zsh + ID - # test first private - zfp = zfp+ID - valsfp[ID] = zfp + # test first private + zfp = zfp + ID + valsfp[ID] = zfp - # setup test to see if each thread got it's own x value - x = ID - vals[ID] = x + # setup test to see if each thread got it's own x value + x = ID + vals[ID] = x -# Shared clause test: assumes zsh starts at 7 and we add up IDs from 4 threads - if(zsh != 13): - print("Shared clause or critical failed",zsh) - nerr = nerr+1 + # Shared clause test: assumes zsh starts at 7 and we add up IDs from 4 threads + if zsh != 13: + print("Shared clause or critical failed", zsh) + nerr = nerr + 1 -# Single Test: How many threads updated nsing? - if(nsing!=1): - print(" Single test failed",nsing) - nerr = nerr+1 + # Single Test: How many threads updated nsing? + if nsing != 1: + print(" Single test failed", nsing) + nerr = nerr + 1 -# Private clause test: did each thread get its own x variable? + # Private clause test: did each thread get its own x variable? for i in range(numthrds): - if(int(vals[i]) != i): - print("Private clause failed",numthrds,i,vals[i]) - nerr = nerr+1 + if int(vals[i]) != i: + print("Private clause failed", numthrds, i, vals[i]) + nerr = nerr + 1 -# First private clause test: each thread should get 2 + ID for up to 4 threads + # First private clause test: each thread should get 2 + ID for up to 4 threads for i in range(numthrds): - if(int(valsfp[i]) != 2+i): - print("Firstprivate clause failed",numthrds,i,valsfp[i]) - nerr = nerr+1 + if int(valsfp[i]) != 2 + i: + print("Firstprivate clause failed", numthrds, i, valsfp[i]) + nerr = nerr + 1 + + # Test number of threads + if numthrds > NTHREADS: + print("Number of threads error: too many threads", numthrds, NTHREADS) + nerr = nerr + 1 -# Test number of threads - if(numthrds > NTHREADS): - print("Number of threads error: too many threads",numthrds,NTHREADS) - nerr = nerr+1 - - print(nerr," errors when testing parallel, private, shared, firstprivate, critical and single") + print( + nerr, + " errors when testing parallel, private, shared, firstprivate, critical and single", + ) return nerr + errors = testOMP() diff --git a/examples/dgemm_ompGPU.py b/examples/dgemm_ompGPU.py index 20a30590e41c..c3e8131a58b5 100644 --- a/examples/dgemm_ompGPU.py +++ b/examples/dgemm_ompGPU.py @@ -30,7 +30,7 @@ # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. -#******************************************************************* +# ******************************************************************* # # NAME: dgemm # @@ -38,12 +38,12 @@ # dense multiplication is carried out # # USAGE: The program takes as input the matrix order, -# the number of times the matrix-matrix multiplication +# the number of times the matrix-matrix multiplication # is carried out. # # <# iterations> # -# The output consists of diagnostics to make sure the +# The output consists of diagnostics to make sure the # algorithm worked, and of timing statistics. # # HISTORY: Written by Rob Van der Wijngaart, February 2009. 
@@ -52,75 +52,89 @@ # ******************************************************************* import sys -from numba import njit +from numba.openmp import njit from numba.openmp import openmp_context as openmp -from numba.openmp import omp_set_num_threads, omp_get_thread_num, omp_get_num_threads, omp_get_wtime +from numba.openmp import ( + omp_set_num_threads, + omp_get_thread_num, + omp_get_num_threads, + omp_get_wtime, +) import numpy as np -#from time import process_time as timer +# from time import process_time as timer + -#@njit(enable_ssa=False, cache=True) What does "enable_ssa" mean? +# @njit(enable_ssa=False, cache=True) What does "enable_ssa" mean? @njit(fastmath=True) -def dgemm(iters,order): +def dgemm(iters, order): # ******************************************************************** # ** Allocate space for the input and transpose matrix # ******************************************************************** - print('inside dgemm') - A = np.zeros((order,order)) - B = np.zeros((order,order)) - C = np.zeros((order,order)) + print("inside dgemm") + A = np.zeros((order, order)) + B = np.zeros((order, order)) + C = np.zeros((order, order)) -# It can be very important to initialize data with the same threads -# as you will use when computing. + # It can be very important to initialize data with the same threads + # as you will use when computing. with openmp("parallel for schedule(static)"): - for i in range(order): - A[:,i] = float(i) - B[:,i] = float(i) - -# print(omp_get_num_threads()) - for kiter in range(0,iters+1): - if kiter==1: - t0 = omp_get_wtime() - tSum=0.0 - tsqSum=0.0 - with openmp("target teams distribute parallel for private(j,k)"): - for i in range(order): - for k in range(order): - for j in range(order): - C[i][j] += A[i][k] * B[k][j] - if kiter>0: - tkiter = omp_get_wtime() - t = tkiter - t0 - tSum = tSum + t - tsqSum = tsqSum+t*t - t0 = tkiter - - dgemmAve = tSum/iters - dgemmStdDev = ((tsqSum-iters*dgemmAve*dgemmAve)/(iters-1))**0.5 - print('finished with computations') + for i in range(order): + A[:, i] = float(i) + B[:, i] = float(i) + + # print(omp_get_num_threads()) + for kiter in range(0, iters + 1): + if kiter == 1: + t0 = omp_get_wtime() + tSum = 0.0 + tsqSum = 0.0 + with openmp("target teams distribute parallel for private(j,k)"): + for i in range(order): + for k in range(order): + for j in range(order): + C[i][j] += A[i][k] * B[k][j] + if kiter > 0: + tkiter = omp_get_wtime() + t = tkiter - t0 + tSum = tSum + t + tsqSum = tsqSum + t * t + t0 = tkiter + + dgemmAve = tSum / iters + dgemmStdDev = ((tsqSum - iters * dgemmAve * dgemmAve) / (iters - 1)) ** 0.5 + print("finished with computations") # ******************************************************************** # ** Analyze and output results. 
# ******************************************************************** - checksum = 0.0; + checksum = 0.0 for i in range(order): for j in range(order): - checksum += C[i][j]; - - ref_checksum = order*order*order - ref_checksum *= 0.25*(order-1.0)*(order-1.0) - ref_checksum *= (iters+1) - epsilon=1.e-8 - if abs((checksum - ref_checksum)/ref_checksum) < epsilon: - print('Solution validates') - nflops = 2.0*order*order*order - recipDiff = (1.0/(dgemmAve-dgemmStdDev) - 1.0/(dgemmAve+dgemmStdDev)) - GfStdDev = 1.e-6*nflops*recipDiff/2.0 - print('nflops: ',nflops) - print('Rate: ',1.e-6*nflops/dgemmAve,' +/- (MF/s): ',GfStdDev) + checksum += C[i][j] + + ref_checksum = order * order * order + ref_checksum *= 0.25 * (order - 1.0) * (order - 1.0) + ref_checksum *= iters + 1 + epsilon = 1.0e-8 + if abs((checksum - ref_checksum) / ref_checksum) < epsilon: + print("Solution validates") + nflops = 2.0 * order * order * order + recipDiff = 1.0 / (dgemmAve - dgemmStdDev) - 1.0 / (dgemmAve + dgemmStdDev) + GfStdDev = 1.0e-6 * nflops * recipDiff / 2.0 + print("nflops: ", nflops) + print("Rate: ", 1.0e-6 * nflops / dgemmAve, " +/- (MF/s): ", GfStdDev) else: - print('ERROR: Checksum = ', checksum,', Reference checksum = ', ref_checksum,'\n') + print( + "ERROR: Checksum = ", + checksum, + ", Reference checksum = ", + ref_checksum, + "\n", + ) + + # sys.exit("ERROR: solution did not validate") @@ -128,23 +142,22 @@ def dgemm(iters,order): # read and test input parameters # ******************************************************************** -print('Parallel Research Kernels version ') #, PRKVERSION -print('Python Dense matrix-matrix multiplication: C = A x B') +print("Parallel Research Kernels version ") # , PRKVERSION +print("Python Dense matrix-matrix multiplication: C = A x B") if len(sys.argv) != 3: - print('argument count = ', len(sys.argv)) - sys.exit("Usage: ./dgemm <# iterations> ") + print("argument count = ", len(sys.argv)) + sys.exit("Usage: ./dgemm <# iterations> ") itersIn = int(sys.argv[1]) if itersIn < 1: - sys.exit("ERROR: iterations must be >= 1") + sys.exit("ERROR: iterations must be >= 1") orderIn = int(sys.argv[2]) if orderIn < 1: sys.exit("ERROR: order must be >= 1") -print('Number of iterations = ', itersIn) -print('Matrix order = ', orderIn) +print("Number of iterations = ", itersIn) +print("Matrix order = ", orderIn) dgemm(itersIn, orderIn) - diff --git a/examples/piParLoopGPU_BUD.py b/examples/piParLoopGPU_BUD.py index 0fc5f950b243..9c01d81c1283 100644 --- a/examples/piParLoopGPU_BUD.py +++ b/examples/piParLoopGPU_BUD.py @@ -1,23 +1,25 @@ -from numba import njit +from numba.openmp import njit from numba.openmp import openmp_context as openmp from numba.openmp import omp_get_num_threads from numba.openmp import omp_get_thread_num + @njit def piFunc(NumSteps): - step = 1.0/NumSteps - sum = 0.0 - start_time = omp_get_wtime() + step = 1.0 / NumSteps + sum = 0.0 + start_time = omp_get_wtime() with openmp("target"): - with openmp("teams distribute parallel for private(x) reduction(+:sum)"): - for i in range(NumSteps): - x = (i+0.5)*step - sum += 4.0/(1.0 + x*x) + with openmp("teams distribute parallel for private(x) reduction(+:sum)"): + for i in range(NumSteps): + x = (i + 0.5) * step + sum += 4.0 / (1.0 + x * x) pi = step * sum runtime = omp_get_wtime() - start_time print("pi = ", pi, "runtime = ", runtime) return pi + pi = piFunc(10000000) print(pi) diff --git a/examples/piParLoopGPU_BUD_combined.py b/examples/piParLoopGPU_BUD_combined.py index a4ab403295bc..399f89fd3c49 100644 --- 
a/examples/piParLoopGPU_BUD_combined.py +++ b/examples/piParLoopGPU_BUD_combined.py @@ -1,21 +1,23 @@ -from numba import njit +from numba.openmp import njit from numba.openmp import openmp_context as openmp from numba.openmp import omp_get_wtime + @njit def piFunc(NumSteps): - step = 1.0/NumSteps - sum = 0.0 - start_time = omp_get_wtime() + step = 1.0 / NumSteps + sum = 0.0 + start_time = omp_get_wtime() with openmp("target teams distribute parallel for private(x) reduction(+:sum)"): - for i in range(NumSteps): - x = (i+0.5)*step - sum += 4.0/(1.0 + x*x) + for i in range(NumSteps): + x = (i + 0.5) * step + sum += 4.0 / (1.0 + x * x) pi = step * sum runtime = omp_get_wtime() - start_time print("pi = ", pi, "runtime = ", runtime) return pi + pi = piFunc(10000000) print(pi) diff --git a/examples/piParLoopGPU_loop.py b/examples/piParLoopGPU_loop.py index 68ba5c939b2f..d74219eb221a 100644 --- a/examples/piParLoopGPU_loop.py +++ b/examples/piParLoopGPU_loop.py @@ -1,23 +1,25 @@ -from numba import njit +from numba.openmp import njit from numba.openmp import openmp_context as openmp from numba.openmp import omp_get_num_threads from numba.openmp import omp_get_thread_num + @njit def piFunc(NumSteps): - step = 1.0/NumSteps - sum = 0.0 + step = 1.0 / NumSteps + sum = 0.0 start_time = omp_get_wtime() with openmp("target "): - with openmp("loop private(x) reduction(+:sum)"): - for i in range(NumSteps): - x = (i+0.5)*step - sum += 4.0/(1.0 + x*x) + with openmp("loop private(x) reduction(+:sum)"): + for i in range(NumSteps): + x = (i + 0.5) * step + sum += 4.0 / (1.0 + x * x) pi = step * sum runtime = omp_get_wtime() - start_time print("pi = ", pi, "runtime = ", runtime) return pi + pi = piFunc(10000000) print(pi) diff --git a/examples/pi_loop.py b/examples/pi_loop.py index e1168f0670b6..5da5025eb69c 100644 --- a/examples/pi_loop.py +++ b/examples/pi_loop.py @@ -1,6 +1,7 @@ -from numba import njit +from numba.openmp import njit from numba.openmp import openmp_context as openmp + @njit def calc_pi(): num_steps = 100000 @@ -9,10 +10,11 @@ def calc_pi(): the_sum = 0.0 with openmp("parallel for reduction(+:the_sum) schedule(static)"): for j in range(num_steps): - x = ((j-1) - 0.5) * step + x = ((j - 1) - 0.5) * step the_sum += 4.0 / (1.0 + x * x) pi = step * the_sum return pi + print("pi =", calc_pi()) diff --git a/examples/pi_spmd.py b/examples/pi_spmd.py index 38e582b81967..9a6775c33cdc 100644 --- a/examples/pi_spmd.py +++ b/examples/pi_spmd.py @@ -1,14 +1,20 @@ -from numba import njit +from numba.openmp import njit from numba.openmp import openmp_context as openmp -from numba.openmp import omp_set_num_threads, omp_get_thread_num, omp_get_num_threads, omp_get_wtime +from numba.openmp import ( + omp_set_num_threads, + omp_get_thread_num, + omp_get_num_threads, + omp_get_wtime, +) import numpy as np + @njit def f1(): num_steps = 100000000 step = 1.0 / num_steps - MAX_THREADS=8 - for j in range(1,MAX_THREADS+1): + MAX_THREADS = 8 + for j in range(1, MAX_THREADS + 1): tsum = np.zeros(j) omp_set_num_threads(j) @@ -25,15 +31,16 @@ def f1(): x = (i + 0.5) * step local_sum += 4.0 / (1.0 + x * x) -# print("foo:", j, tid, local_sum) + # print("foo:", j, tid, local_sum) tsum[tid] = local_sum -# print("tsum:", tsum) + # print("tsum:", tsum) full_sum = np.sum(tsum) pi = step * full_sum runtime = omp_get_wtime() - start_time print("pi = ", pi, "runtime = ", runtime, j) + f1() print("DONE") diff --git a/examples/pi_task.py b/examples/pi_task.py index b6470935cd4b..e8f1edb58c10 100644 --- a/examples/pi_task.py +++ 
b/examples/pi_task.py @@ -1,12 +1,18 @@ -from numba import njit +from numba.openmp import njit from numba.openmp import openmp_context as openmp -from numba.openmp import omp_set_num_threads, omp_get_thread_num, omp_get_num_threads, omp_get_wtime +from numba.openmp import ( + omp_set_num_threads, + omp_get_thread_num, + omp_get_num_threads, + omp_get_wtime, +) import numpy as np + @njit def pi_comp(Nstart, Nfinish, step): MIN_BLK = 256 - #MIN_BLK = 1024*1024*256 + # MIN_BLK = 1024*1024*256 pi_sum = 0.0 if Nfinish - Nstart < MIN_BLK: for i in range(Nstart, Nfinish): @@ -16,25 +22,26 @@ def pi_comp(Nstart, Nfinish, step): iblk = Nfinish - Nstart pi_sum1 = 0.0 pi_sum2 = 0.0 - cut = Nfinish-(iblk // 2) + cut = Nfinish - (iblk // 2) with openmp("task shared(pi_sum1)"): pi_sum1 = pi_comp(Nstart, cut, step) with openmp("task shared(pi_sum2)"): pi_sum2 = pi_comp(cut, Nfinish, step) with openmp("taskwait"): pi_sum = pi_sum1 + pi_sum2 -# pi_sum1 = pi_comp(Nstart, cut, step) -# pi_sum2 = pi_comp(cut, Nfinish, step) -# pi_sum = pi_sum1 + pi_sum2 + # pi_sum1 = pi_comp(Nstart, cut, step) + # pi_sum2 = pi_comp(cut, Nfinish, step) + # pi_sum = pi_sum1 + pi_sum2 return pi_sum + @njit def f1(lb, num_steps): step = 1.0 / num_steps - MAX_THREADS=4 + MAX_THREADS = 4 tsum = np.zeros(MAX_THREADS) - for j in range(1,MAX_THREADS+1): + for j in range(1, MAX_THREADS + 1): omp_set_num_threads(j) full_sum = 0.0 start_time = omp_get_wtime() @@ -48,9 +55,10 @@ def f1(lb, num_steps): runtime = omp_get_wtime() - start_time print("pi = ", pi, "runtime = ", runtime, j) + lb = 0 num_steps = 1024 -#num_steps = 1024*1024*1024 -#num_steps = 1000000000 +# num_steps = 1024*1024*1024 +# num_steps = 1000000000 f1(lb, num_steps) print("DONE") From 1548c4ba76ae559fdadb50d1d4414bf86e1ae7f1 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Fri, 25 Apr 2025 06:42:46 -0700 Subject: [PATCH 27/28] Update README and RTD conf.py --- README.md | 116 ++++++++++++++++++++++++++++---------------- docs/source/conf.py | 41 ++++++++++------ 2 files changed, 100 insertions(+), 57 deletions(-) diff --git a/README.md b/README.md index 64dc24bfbb91..68e3ebb650f6 100644 --- a/README.md +++ b/README.md @@ -3,64 +3,66 @@ [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Python-for-HPC/binder/HEAD) # PyOMP -OpenMP for Python in Numba for CPU/GPU parallel programming. +OpenMP for Python CPU/GPU parallel programming, powered by Numba. -Currently, PyOMP is distributed as a full version of Numba which is based on a -Numba version a few versions behind mainline. -Since Numba is available for every combination of the past few Python versions -and the past few NumPy versions and various operating systems and architectures, -there is quite an extensive build infrastructure required to get all these -combinations and recently we have sorted out some of these combinations. -The architecture and operating system combinations that currently work are: -linux-64 (x86_64), osx-arm64 (mac), linux-arm64, and linux-ppc64le. -These distributions are available with the `conda` command in the next section. +PyOMP provides a familiar interface for CPU/GPU programming using OpenMP +abstractions adapted for Python. +Besides effortless programmability, PyOMP generates fast code using Numba's JIT +compiler based on LLVM, which is competitive with equivalent C/C++ implementations. -Due to PyOMP using the LLVM OpenMP infrastructure, we also inherit its -limitations which means that GPU support is only available on Linux. 
+PyOMP is developed and distributed as an *extension* to Numba, so it uses +Numba as a dependency. +It is currently tested with Numba versions 0.57.x, 0.58.x on the following +architecture and operating system combinations: linux-64 (x86_64), osx-arm64 +(mac), linux-arm64, and linux-ppc64le. +Installation is possible through `conda`, detailed in the next section. -In the future, we plan on converting PyOMP to a Numba extension which should eliminate the Python and NumPy versioning issues. +As PyOMP builds on to of the LLVM OpenMP infrastructure, it also inherits its +limitations: GPU support is only available on Linux. +Also, PyOMP currently supports only NVIDIA GPUs with AMD GPU support planned for. ## Installation ### Conda -PyOMP is distributed as a package through Conda, currently supporting linux-64 -(x86_64), osx-arm64 (mac), linux-arm64, and linux-ppc64le architectures. +PyOMP is distributed through Conda, easily installable using the following command: ```bash conda install -c python-for-hpc -c conda-forge pyomp ``` +Besides a standard installation, we also provide the following options to +quickly try out PyOMP online or through a container. -## Trying it out +### Trying it out -### Binder +#### Binder You can try it out for free on a multi-core CPU in JupyterLab at the following link: https://mybinder.org/v2/gh/Python-for-HPC/binder/HEAD -### Docker +#### Docker We also provide pre-built containers for arm64 and amd64 architectures with PyOMP and Jupyter pre-installed. The following show how to access the container through the terminal or using -jupyter. +Jupyter. First pull the container -``` +```bash docker pull ghcr.io/python-for-hpc/pyomp:latest ``` To use the terminal, run a shell on the container -``` +```bash docker run -it ghcr.io/python-for-hpc/pyomp:latest /bin/bash ``` To use Jupyter, run without arguments and forward port 8888. -``` +```bash docker run -it -p 8888:8888 ghcr.io/python-for-hpc/pyomp:latest ``` Jupyter will start as a service on localhost with token authentication by default. Grep the url with the token from the output and copy it to the browser. -``` +```bash ... [I 2024-09-15 17:24:47.912 ServerApp] http://127.0.0.1:8888/tree?token= ... @@ -68,40 +70,72 @@ Grep the url with the token from the output and copy it to the browser. ## Usage -Import Numba and add the `@njit` decorator to the function in which you want to use OpenMP. -Add `with` contexts for each OpenMP region you want to have, importing the -context `openmp_context` from the `numba.openmp` module. +From `numba.openmp` import the `@njit` decorator and the `openmp_context` to +create OpenMP regions using `with` contexts. +Decorate with `njit` the function you want to parallelize with OpenMP and +describe parallelism in OpenMP directives using `with` contexts. +Enjoy the simplicity of OpenMP with Python syntax and parallel performance. For a list of supported OpenMP directives and more detailed information, check out the [Documentation](https://pyomp.readthedocs.io). -PyOMP supports both CPU and GPU programming for NVIDIA GPUs through the `target` + +PyOMP supports both CPU and GPU programming implementing OpenMP's `target` directive for offloading. -For GPU programming, PyOMP supports the `device` clause and by convention the -default without using the clause or providing `device(0)` always refers to the -accelerator GPU device. -It is also possible to use the host as a multi-core CPU target device setting `device(1)`. 
+For GPU programming, PyOMP supports the `device` clause, with `device(0)` by +convention offloading to a GPU device. +It is also possible to use the host as a multi-core CPU target device (mainly +for testing purposes) by setting `device(1)`. ### Example -This is an example of calculating $\pi$ with PyOMP with a `parallel for` loop. +This is an example of calculating $\pi$ with PyOMP with a `parallel for` loop +using CPU parallelism: ```python -from numba import njit +from numba.openmp import njit from numba.openmp import openmp_context as openmp @njit -def calc_pi(): - num_steps = 100000 +def calc_pi(num_steps): step = 1.0 / num_steps - - the_sum = 0.0 - with openmp("parallel for reduction(+:the_sum) schedule(static)"): + red_sum = 0.0 + with openmp("parallel for reduction(+:red_sum) schedule(static)"): for j in range(num_steps): x = ((j-1) - 0.5) * step - the_sum += 4.0 / (1.0 + x * x) + red_sum += 4.0 / (1.0 + x * x) - pi = step * the_sum + pi = step * red_sum return pi -print("pi =", calc_pi()) +print("pi =", calc_pi(1000000)) ``` + +and this is the same example using GPU offloading: + +```python +from numba.openmp import njit +from numba.openmp import openmp_context as openmp +from numba.openmp import omp_get_thread_num + +@njit +def calc_pi(num_steps): + step = 1.0/num_steps + red_sum = 0.0 + with openmp("target map(tofrom: red_sum)"): + with openmp("loop private(x) reduction(+:red_sum)"): + for i in range(num_steps): + tid = omp_get_thread_num() + x = (i+0.5)*step + red_sum += 4.0 / (1.0 + x*x) + + pi = step * red_sum + print("pi=", pi) + +print("pi =", calc_pi(1000000)) +``` + +## Support + +We welcome any feedback, bug reports, or feature requests. +Please open an [Issue](https://github.com/Python-for-HPC/PyOMP/issues) or post +in [Discussions](https://github.com/Python-for-HPC/PyOMP/discussions). \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index 7a30782a0ade..2750a959b4cf 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,35 +1,44 @@ # Configuration file for the Sphinx documentation builder. 
+import subprocess + # -- Project information -project = 'PyOMP' -copyright = '2024, PyOMP developers' -author = 'Giorgis Georgakoudis' +project = "PyOMP" +copyright = "2024, PyOMP developers" +author = "Giorgis Georgakoudis" -release = '0.1' -version = '0.1.0' +try: + release = ( + subprocess.check_output(["git", "describe", "--tags", "--abbrev=0"]) + .strip() + .decode() + ) +except subprocess.CalledProcessError: + release = "latest" +version = release # -- General configuration extensions = [ - 'sphinx.ext.duration', - 'sphinx.ext.doctest', - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.intersphinx', + "sphinx.ext.duration", + "sphinx.ext.doctest", + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.intersphinx", ] intersphinx_mapping = { - 'python': ('https://docs.python.org/3/', None), - 'sphinx': ('https://www.sphinx-doc.org/en/master/', None), + "python": ("https://docs.python.org/3/", None), + "sphinx": ("https://www.sphinx-doc.org/en/master/", None), } -intersphinx_disabled_domains = ['std'] +intersphinx_disabled_domains = ["std"] -templates_path = ['_templates'] +templates_path = ["_templates"] # -- Options for HTML output -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" # -- Options for EPUB output -epub_show_urls = 'footnote' \ No newline at end of file +epub_show_urls = "footnote" From 8cb57b818af0fef25309d4702ef4061b94d5f3cc Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Fri, 25 Apr 2025 07:09:46 -0700 Subject: [PATCH 28/28] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 68e3ebb650f6..eb8853f6ab09 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ architecture and operating system combinations: linux-64 (x86_64), osx-arm64 (mac), linux-arm64, and linux-ppc64le. Installation is possible through `conda`, detailed in the next section. -As PyOMP builds on to of the LLVM OpenMP infrastructure, it also inherits its +As PyOMP builds on top of the LLVM OpenMP infrastructure, it also inherits its limitations: GPU support is only available on Linux. Also, PyOMP currently supports only NVIDIA GPUs with AMD GPU support planned for.