Initial try at calling a c++ function while lowering a function

reazulhoque · reazulhoque · commit 8ccfd360117f · 2020-08-13T15:28:05.000-05:00
diff --git a/numba/dppl/compiler.py b/numba/dppl/compiler.py
@@ -47,6 +47,7 @@ def get_ordered_arg_access_types(pyfunc, access_types):
 
     return ordered_arg_access_types
 
+
 class DPPLCompiler(CompilerBase):
     """ DPPL Compiler """
 
diff --git a/numba/dppl/dppl_passes.py b/numba/dppl/dppl_passes.py
@@ -25,7 +25,7 @@
 
 from .dppl_lowerer import DPPLLower
 
-from numba.parfors.parfor import PreParforPass as _parfor_PreParforPass
+from numba.parfors.parfor import PreParforPass as _parfor_PreParforPass, replace_functions_map
 from numba.parfors.parfor import ParforPass as _parfor_ParforPass
 from numba.parfors.parfor import Parfor
 
@@ -119,13 +119,16 @@ def run_pass(self, state):
         """
         # Ensure we have an IR and type information.
         assert state.func_ir
+        functions_map = replace_functions_map.copy()
+        functions_map.pop(('dot', 'numpy'), None)
 
         preparfor_pass = _parfor_PreParforPass(
             state.func_ir,
             state.type_annotation.typemap,
             state.type_annotation.calltypes, state.typingctx,
             state.flags.auto_parallel,
-            state.parfor_diagnostics.replaced_fns
+            state.parfor_diagnostics.replaced_fns,
+            replace_functions_map=functions_map
         )
 
         preparfor_pass.run()
@@ -216,7 +219,19 @@ def run_pass(self, state):
             # be later serialized.
             state.library.enable_object_caching()
 
+
         targetctx = state.targetctx
+
+        # This should not happen here. Once Numba has a notion of context, we
+        # should have a specialized dispatcher for the dppl context: a cpu
+        # dispatcher (dppl.cpu_dispatcher) that overloads the linalg lowering
+        # functions, while dppl.gpu_dispatcher remains the current target
+        # context we use to launch kernels.
+        # This is broken: the import merely appends the new lowering to a list,
+        # so the existing lowering builtins are not replaced by the new ones.
+        from . import experimental_linalg_lowering_overload
+        targetctx.refresh()
+
         library   = state.library
         interp    = state.func_ir  # why is it called this?!
         typemap   = state.typemap
@@ -273,6 +288,7 @@ def run_pass(self, state):
         """
         Back-end: Generate LLVM IR from Numba IR, compile to machine code
         """
+
         lowered = state['cr']
         signature = typing.signature(state.return_type, *state.args)
 
diff --git a/numba/dppl/experimental_linalg_lowering_overload.py b/numba/dppl/experimental_linalg_lowering_overload.py
@@ -0,0 +1,135 @@
+import numpy as np
+from numba.core import types, cgutils
+from numba.core.imputils import (lower_builtin)
+from numba.core.typing import signature
+from numba.np.arrayobj import make_array, _empty_nd_impl, array_copy
+from numba.core import itanium_mangler
+from llvmlite import ir
+import contextlib
+
+from numba import int32, int64, uint32, uint64, float32, float64
+
+
+@contextlib.contextmanager
+def make_contiguous(context, builder, sig, args):
+    """
+    Yield a new (sig, args) pair in which every array argument is
+    contiguous; arrays with any other layout are replaced by C-ordered
+    copies, which are decref'ed after the ``with`` body completes.
+    """
+    out_types, out_values, temporaries = [], [], []
+    for arg_type, arg_value in zip(sig.args, args):
+        needs_copy = (isinstance(arg_type, types.Array)
+                      and arg_type.layout not in 'CF')
+        if needs_copy:
+            # Materialise a C-contiguous copy and remember it for cleanup.
+            c_type = arg_type.copy(layout='C')
+            c_value = array_copy(context, builder,
+                                 signature(c_type, arg_type), (arg_value,))
+            temporaries.append((c_type, c_value))
+            arg_type, arg_value = c_type, c_value
+        out_types.append(arg_type)
+        out_values.append(arg_value)
+    yield signature(sig.return_type, *out_types), tuple(out_values)
+    for c_type, c_value in temporaries:
+        context.nrt.decref(builder, c_type, c_value)
+
+def check_c_int(context, builder, n):
+    """
+    Emit a check raising OverflowError when *n* is above the C `int` max.
+    """
+    c_int_max = (1 << 31) - 1
+
+    def _guard(size):
+        if size > c_int_max:
+            raise OverflowError("array size too large to fit in C int")
+
+    guard_sig = signature(types.none, types.intp)
+    context.compile_internal(builder, _guard, guard_sig, (n,))
+
+
+ll_char = ir.IntType(8)  # 8-bit integer LLVM type
+ll_char_p = ll_char.as_pointer()  # pointer to an 8-bit integer
+ll_void_p = ll_char_p  # alias of ll_char_p, used where a generic pointer is passed
+ll_intc = ir.IntType(32)  # 32-bit integer LLVM type
+ll_intc_p = ll_intc.as_pointer()  # pointer to a 32-bit integer
+intp_t = cgutils.intp_t  # local alias for cgutils.intp_t
+ll_intp_p = intp_t.as_pointer()  # pointer to intp_t
+
+def call_experimental_dot(context, builder, conjugate, dtype,
+                          n, a_data, b_data, out_data):
+    """
+    Emit a call to the external C++ ``inumpy_dot<float>`` routine with the
+    three data pointers cast to ``void *`` and *n*; its result is ignored.
+    """
+    # The mangled symbol below names the float instantiation only.
+    if dtype != types.float32:
+        raise NotImplementedError("experimental dot only supports float32")
+    fnty = ir.FunctionType(ir.IntType(32),
+                           [ll_void_p, ll_void_p, ll_void_p, ir.IntType(64)])
+    fn = builder.module.get_or_insert_function(fnty, name="_Z10inumpy_dotIfEiPvS0_S0_m")
+    builder.call(fn, (builder.bitcast(a_data, ll_void_p),
+                      builder.bitcast(b_data, ll_void_p),
+                      builder.bitcast(out_data, ll_void_p), n))
+
+def dot_2_vv(context, builder, sig, args, conjugate=False):
+    """
+    Lower np.dot / np.vdot of two vectors via the external dot routine.
+    """
+    from llvmlite.binding import load_library_permanently
+    load_library_permanently('libinumpy.so')
+
+    lhs_type, rhs_type = sig.args
+    ret_type = sig.return_type
+    lhs = make_array(lhs_type)(context, builder, args[0])
+    rhs = make_array(rhs_type)(context, builder, args[1])
+    (length,) = cgutils.unpack_tuple(builder, lhs.shape)
+
+    def check_args(a, b):
+        size_a, = a.shape
+        size_b, = b.shape
+        if size_a != size_b:
+            raise ValueError("incompatible array sizes for np.dot(a, b) "
+                             "(vector * vector)")
+
+    checker_sig = signature(types.none, *sig.args)
+    context.compile_internal(builder, check_args, checker_sig, args)
+    check_c_int(context, builder, length)
+
+    out_ptr = cgutils.alloca_once(builder, context.get_value_type(ret_type))
+    call_experimental_dot(context, builder, conjugate, ret_type,
+                          length, lhs.data, rhs.data, out_ptr)
+    return builder.load(out_ptr)
+
+
+@lower_builtin(np.dot, types.Array, types.Array)
+def dot_dppl(context, builder, sig, args):
+    """
+    Lower ``np.dot(a, b)`` / ``a @ b`` for two array arguments.
+
+    Only the vector * vector case is implemented; it is delegated to
+    ``dot_2_vv`` after the arguments have been made contiguous.
+
+    Raises NotImplementedError (at lowering time) for every other
+    combination of argument dimensions.
+    """
+    import dppl.ocldrv as driver
+
+    # The device env should come from the context, but the current context
+    # is a cpu context and not a dppl_gpu_context.  The device is not used
+    # yet; the lookup is kept so this step of the original flow still runs.
+    driver.runtime.get_current_device()
+
+    # The number of dimensions is not changed by make_contiguous, so
+    # unsupported shapes can be rejected before any code is emitted.
+    ndims = [ty.ndim for ty in sig.args[:2]]
+    if ndims != [1, 1]:
+        # gemm ([2, 2]) and gemv ([2, 1] / [1, 2]) are not lowered yet.
+        raise NotImplementedError(
+            "np.dot() lowering for dppl only supports vector * vector, "
+            "got arguments with %d and %d dimensions" % tuple(ndims))
+
+    # Non-contiguous inputs are replaced by C-ordered copies, which
+    # make_contiguous decref's once the block exits.
+    with make_contiguous(context, builder, sig, args) as (sig, args):
+        return dot_2_vv(context, builder, sig, args)
diff --git a/numba/parfors/parfor.py b/numba/parfors/parfor.py
@@ -1350,14 +1350,15 @@ class PreParforPass(object):
     implementations of numpy functions if available.
     """
     def __init__(self, func_ir, typemap, calltypes, typingctx, options,
-                 swapped={}):
+                 swapped={}, replace_functions_map=replace_functions_map):
         self.func_ir = func_ir
         self.typemap = typemap
         self.calltypes = calltypes
         self.typingctx = typingctx
         self.options = options
         # diagnostics
         self.swapped = swapped
+        self.replace_functions_map = replace_functions_map
         self.stats = {
             'replaced_func': 0,
             'replaced_dtype': 0,
@@ -1394,7 +1395,7 @@ def _replace_parallel_functions(self, blocks):
                         def replace_func():
                             func_def = get_definition(self.func_ir, expr.func)
                             callname = find_callname(self.func_ir, expr)
-                            repl_func = replace_functions_map.get(callname, None)
+                            repl_func = self.replace_functions_map.get(callname, None)
                             # Handle method on array type
                             if (repl_func is None and
                                 len(callname) == 2 and
diff --git a/setup.py b/setup.py
@@ -213,6 +213,7 @@ def check_file_at_path(path2file):
         else:
             omplinkflags = ['-fopenmp']
 
+    tbb_root = False
     if tbb_root:
         print("Using Intel TBB from:", tbb_root)
         ext_np_ufunc_tbb_backend = Extension(