diff --git a/compyle/array.py b/compyle/array.py index 172add4..496eda8 100644 --- a/compyle/array.py +++ b/compyle/array.py @@ -1,11 +1,10 @@ import numpy as np import math import mako.template as mkt -import time from pytools import memoize, memoize_method from .config import get_config -from .types import (annotate, dtype_to_ctype, ctype_to_dtype, declare, +from .types import (annotate, dtype_to_ctype, declare, dtype_to_knowntype, knowntype_to_ctype) from .template import Template from .sort import radix_sort @@ -394,7 +393,6 @@ def linspace(start, stop, num, dtype=np.float64, backend='opencl', out = out * delta+start elif backend == 'cuda': import pycuda.gpuarray as gpuarray - import pycuda.autoinit if endpoint: delta = (stop-start)/(num-1) else: @@ -445,7 +443,6 @@ def diff(a, n, backend=None): backend = a.backend if backend == 'opencl' or backend == 'cuda': - from compyle.api import Elementwise binom_coeff = np.zeros(n+1) sign_fac = 1 if (n % 2 == 0) else -1 for i in range(n+1): @@ -526,6 +523,7 @@ def trapz(y, x=None, dx=1.0, backend=None): out = dot(d, sum_ar) * 0.5 return out + @annotate def where_elwise(i, condition, x, y, ans): if condition[i]: @@ -872,7 +870,6 @@ def comparison_kernel(func, backend, ary_type, other_type): def comparison_template(func, other, arr, backend=None): if backend is None: backend = arr.backend - from compyle.parallel import Elementwise other_type = dtype_to_ctype(type(other)) ary_type = dtype_to_ctype(arr.dtype) + 'p' ans = empty(arr.length, dtype=np.int32, backend=arr.backend) @@ -1023,7 +1020,7 @@ def get_buff(self, offset=0, length=0): return cu_bufint(self._data, nbytes, int(offset)) def get(self): - if self.backend == 'cython': + if self.backend == 'cython' or self.backend == 'c': return self.dev elif self.backend == 'opencl' or self.backend == 'cuda': return self.dev.get() diff --git a/compyle/autodiff_tapenade.py b/compyle/autodiff_tapenade.py new file mode 100644 index 0000000..f57e6a0 --- /dev/null +++ b/compyle/autodiff_tapenade.py @@ -0,0 +1,590 @@ +import pybind11 +import inspect +import os +import subprocess +import functools +import re +import tempfile +import shutil +import numpy as np +from mako.template import Template +from distutils.errors import CompileError + +from .profile import profile +from .translator import CConverter +from .transpiler import convert_to_float_if_needed +from . import array +from compyle.api import get_config + +from .ext_module import get_md5 +from .cimport import Cmodule, get_tpnd_obj_dir, compile_tapenade_source +from .transpiler import Transpiler + + +pyb11_setup_header = ''' + +// c code for with PyBind11 binding +#include +#include +namespace py = pybind11; + + +''' + +pyb11_setup_header_rev = ''' + +// c code for with PyBind11 binding +#include +#include +#include +namespace py = pybind11; + +\n +''' + +pyb11_wrap_template = ''' +PYBIND11_MODULE(m_${hash_fn}, m) { + m.def("m_${hash_fn}", [](${pyb_call}){ + return ${name}${FUNC_SUFFIX_F}(${c_call}); + }); +} +''' + +pyb11_wrap_template_rev = ''' +PYBIND11_MODULE(${modname}, m) { + m.def("${modname}", [](${pyb_call}){ + return ${name}${FUNC_SUFFIX_F}(${c_call}); + }); +} +''' + +c_backend_template = ''' +${c_kernel_defn} + +void elementwise_${fn_name}(long int SIZE, ${fn_args}) +{ + %if openmp: + #pragma omp parallel for + %endif + for(long int iter = (SIZE - 1); iter >= 0; iter--) + { + ${fn_name}(iter, ${fn_call}) ; + } +} +''' +VAR_SUFFIX_F = '__d' +FUNC_SUFFIX_F = '_d' + +VAR_SUFFIX_R = '__b' +FUNC_SUFFIX_R = '_b' + + +def get_source(f): + c = CConverter() + source = c.parse(f) + return source + + +def sig_to_pyb_call(par, typ): + if typ[-1] == "*": + call = "py::array_t<{}> {}".format(typ[:-1], par) + else: + call = typ + " " + str(par) + return call + + +def sig_to_c_call(par, typ): + if typ[-1] == '*': + call = "({ctype}) {arg}.request().ptr".format(ctype=typ, arg=par) + else: + call = "{arg}".format(arg=par) + return call + + +def get_diff_signature(f, active, mode='forward'): + if mode == 'forward': + VAR_SUFFIX = VAR_SUFFIX_F + elif mode == 'reverse': + VAR_SUFFIX = VAR_SUFFIX_R + + sig = inspect.signature(f) + pyb_c = [] + pyb_py = [] + pure_c = [] + pure_py = [] + for s in sig.parameters: + typ = str(sig.parameters[s].annotation.type) + if s not in active: + pyb_py.append([sig_to_pyb_call(s, typ)]) + pyb_c.append([sig_to_c_call(s, typ)]) + pure_c.append(["{typ} {i}".format(typ=typ, i=s)]) + pure_py.append([s]) + else: + typstar = typ if typ[-1] == '*' else typ + '*' + pyb_py.append([ + sig_to_pyb_call(s, typ), + sig_to_pyb_call(s + VAR_SUFFIX, typstar) + ]) + pyb_c.append([ + sig_to_c_call(s, typ), + sig_to_c_call(s + VAR_SUFFIX, typstar) + ]) + pure_c.append([ + "{typ} {i}".format(typ=typ, i=s), + "{typ} {i}".format(typ=typstar, i=s + VAR_SUFFIX) + ]) + pure_py.append([s, s + VAR_SUFFIX]) + + pyb_py_all = functools.reduce(lambda x, y: x + y, pyb_py) + pyb_c_all = functools.reduce(lambda x, y: x + y, pyb_c) + pure_c = functools.reduce(lambda x, y: x + y, pure_c) + pure_py = functools.reduce(lambda x, y: x + y, pure_py) + + return pyb_py_all, pyb_c_all, pure_py, pure_c + + +class GradBase: + def __init__(self, func, wrt, gradof, mode='forward', backend='tapenade'): + self.backend = backend + self.func = func + self.args = list(inspect.signature(self.func).parameters.keys()) + self.wrt = wrt + self.gradof = gradof + self.active = [] + self.mode = mode + self.name = func.__name__ + self._config = get_config() + self.source = 'Not yet generated' + self.grad_source = 'Not yet generated' + self.grad_all_source = 'Not yet generated' + self.tapenade_op = 'Not yet generated' + self.message = "" + self.c_func = self.c_gen_error + self.tp = Transpiler(backend='c') + self._get_sources() + self.grad_args, self.grad_types = self._get_grad_def(self.grad_source) + self._get_active_vars() + + def _get_sources(self): + self.tp.add(self.func) + self.source = self.tp.get_code(incl_header=False) + + temp_dir = tempfile.mkdtemp() + + with open(os.path.join(temp_dir, self.name + '.c'), 'w') as f: + f.write(self.source) + + if self.mode == 'forward': + command = [ + "tapenade", f"{self.name}.c", "-d", "-o", + f"{self.name}_forward_diff", "-tgtvarname", + f"{VAR_SUFFIX_F}", "-tgtfuncname", + f"{FUNC_SUFFIX_F}", + "-head", + f'{self.name}({" ".join(self.wrt)})\({" ".join(self.gradof)})' + "-nooptim", "recomputeintermediates", + "-nooptim", "spareinit", + ] + elif self.mode == 'reverse': + command = [ + "tapenade", f"{self.name}.c", "-b", "-o", + f"{self.name}_reverse_diff", "-adjvarname", + f"{VAR_SUFFIX_R}", "-adjfuncname", + f"{FUNC_SUFFIX_R}", + "-head", + f'{self.name}({" ".join(self.wrt)})\({" ".join(self.gradof)})', + "-nooptim", "adjointliveness", + ] + # "-nooptim", "diffliveness", + # "-fixinterface" + # "-nooptim", "recomputeintermediates", + else: + raise ValueError(f"supported modes are 'forward' and 'reverse', got {self.mode}") + + if self.mode == 'forward': + f_extn = "_forward_diff_d.c" + elif self.mode == 'reverse': + f_extn = "_reverse_diff_b.c" + + op_tapenade = "" + try: + proc = subprocess.run(command, capture_output=True, text=True, cwd=temp_dir) + op_tapenade += proc.stdout + except subprocess.CalledProcessError as e: + self.read_msg(temp_dir, f_extn) + print(e) + print("*"*80) + print(self.message) + print("*"*80) + raise RuntimeError( + "Encountered errors while differentiating through Tapenade.") + self.tapenade_op = op_tapenade + self.read_msg(temp_dir, f_extn) + + with open(os.path.join(temp_dir, self.name + f_extn), 'r') as f: + self.grad_source = f.read() + + def read_msg(self, temp_dir, f_extn): + try: + with open(os.path.join(temp_dir, self.name + f_extn[:-1] + "msg"), 'r') as f: + self.message = f.read() + except: + try: + with open(os.path.join(temp_dir, self.name + f_extn[:-1] + "msg~"), 'r') as f: + self.message = f.read() + except: + pass + + def c_gen_error(*args): + raise RuntimeError("Differentiated function not yet generated") + + def _massage_arg(self, x): + if isinstance(x, array.Array): + return x.dev + elif self.backend != 'cuda' or isinstance(x, np.ndarray): + return x + else: + return np.asarray(x) + + def _get_grad_def(self, src): + lines_src = src.split("\n") + n_lines = len(lines_src) + i = 0 + start = 0 + while i < n_lines: + # if lines_src[i].strip().startswith(f'void {self.name}'): + if f'void {self.name}' in lines_src[i].strip(): + start = i + break + i += 1 + if i == n_lines: + raise CompileError('could not find fn definition for derivative') + + end = 0 + + while i < n_lines: + if lines_src[i].strip().endswith("{"): + end = i + break + i += 1 + if i == n_lines: + raise CompileError('could not find fn definition for derivative') + + src_def = " ".join([i.strip() for i in lines_src[start:end + 1]]) + args_type = re.search(r"\((.*?)\)", src_def).group(1).split(",") + args = [] + types = [] + for val in args_type: + temp = val.split() + if len(temp) == 2: + t1 = temp[0] + t2 = temp[1] + elif len(temp) == 3: + t1 = temp[0] + t2 = temp[1] + temp[2] + else: + raise CompileError('could not get arguments from generated fn') + if t2.startswith("*"): + t1 += "*" + t2 = t2[1:] + types.append(t1) + args.append(t2) + return args, types + + def _get_active_vars(self): + if self.mode == 'forward': + suff = VAR_SUFFIX_F + elif self.mode == 'reverse': + suff = VAR_SUFFIX_R + + for i, var in enumerate(self.grad_args): + if var.endswith(suff): + self.active.append(self.grad_args[i-1]) + + @profile + def __call__(self, *args, **kw): + c_args = [self._massage_arg(x) for x in args] + + if self.backend == 'cuda': + import pycuda.driver as drv + event = drv.Event() + self.c_func(*c_args, **kw) + event.record() + event.synchronize() + + elif self.backend == 'c': + self.c_func(*c_args) + + else: + raise RuntimeError("Given backend not supported, got '{}'".format( + self.backend)) + + +class ForwardGrad(GradBase): + def __init__(self, func, wrt, gradof): + super(ForwardGrad, self).__init__(func, + wrt, + gradof, + mode='forward', + backend='tapenade') + self.c_func = self.get_c_forward_diff() + + def get_c_forward_diff(self): + self.grad_source = pyb11_setup_header + self.grad_source + hash_fn = get_md5(self.grad_source) + modname = f'm_{hash_fn}' + + pyb_all, c_all, _, _ = get_diff_signature(self.func, + self.active, + mode='forward') + pyb_call = ", ".join(pyb_all) + c_call = ", ".join(c_all) + + pyb_temp = Template(pyb11_wrap_template) + pyb_bind = pyb_temp.render(name=self.name, + hash_fn=hash_fn, + FUNC_SUFFIX_F=FUNC_SUFFIX_F, + pyb_call=pyb_call, + c_call=c_call) + + self.grad_all_source = self.grad_source + pyb_bind + mod = Cmodule(self.grad_all_source, hash_fn) + module = mod.load() + return getattr(module, modname) + + def _get_len_wrt_args(self, args): + len_wrt_args = [] + for i in range(len(self.args)): + if self.args[i] in self.wrt: + len_wrt_args.append(len(args[i])) + return len_wrt_args + + def _add_wrt_args_fwd(self, args, wrt_var, len_wrt_arg): + final_args = [] + gradof_args = [] + wrt_arg = None + is_grad_var = [] + for i in range(len(args)): + final_args.append(args[i]) + is_grad_var.append(0) + if self.args[i] in self.active: + temp = np.zeros((len(args[i]), len_wrt_arg)) + final_args.append(temp) + is_grad_var.append(1) + if self.args[i] == wrt_var: + wrt_arg = temp + if self.args[i] in self.gradof: + gradof_args.append(temp) + return final_args, gradof_args, wrt_arg, is_grad_var + + def _call_multi_fwd(self, final_args, wrt_arg, is_grad_var): + for i in range(len(wrt_arg)): + wrt_arg[i][i] = 1.0 + + for i in range(len(wrt_arg)): + temp_args = [] + for j, arg in enumerate(final_args): + if not is_grad_var[j]: + temp_args.append(arg) + else: + temp_args.append(arg[:, i]) + self.c_func(*temp_args) + + @profile + def __call__(self, *args): + c_args = [self._massage_arg(x) for x in args] + + len_g_args = self._get_len_wrt_args(c_args) + + ans = [] + for i, grad_var in enumerate(self.wrt): + final_args, gradof_args, wrt_arg, is_grad_var = self._add_wrt_args_fwd(c_args, grad_var, len_g_args[i]) + + self._call_multi_fwd(final_args, wrt_arg, is_grad_var) + ans.append(gradof_args) + return ans + + +class ElementwiseGrad(GradBase): + def __init__(self, func, wrt, gradof, backend='c'): + super(ElementwiseGrad, self).__init__(func, + wrt, + gradof, + mode='forward', + backend=backend) + self._config = get_config() + self.c_func = self._generate() + + def _generate(self): + if self.backend == 'c': + return self._c_gen() + elif self.backend == 'cuda': + return self._cuda_gen() + + def correct_initialization(self): + for var in self.gradof: + grad_var = var + VAR_SUFFIX_F + prev = f"*{grad_var} = 0" + after = f"{grad_var}[i] = 0" + self.grad_all_source = self.grad_all_source.replace(prev, after) + + def _c_gen(self): + pyb_args, pyb_c_args, py_args, c_args = get_diff_signature( + self.func, self.active) + + c_templt = Template(c_backend_template) + c_code = c_templt.render(c_kernel_defn=self.grad_source, + fn_name='{fname}{suff}'.format( + fname=self.name, suff=FUNC_SUFFIX_F), + fn_args=", ".join(c_args[1:]), + fn_call=", ".join(py_args[1:]), + openmp=self._config.use_openmp) + + self.grad_all_source = pyb11_setup_header + c_code + + hash_fn = get_md5(self.grad_all_source) + modname = f'm_{hash_fn}' + + pyb_templt = Template(pyb11_wrap_template) + elwise_name = 'elementwise_' + self.name + size = "{}.request().size".format(py_args[1]) + pyb_code = pyb_templt.render(name=elwise_name, + hash_fn=hash_fn, + FUNC_SUFFIX_F=FUNC_SUFFIX_F, + pyb_call=", ".join(pyb_args[1:]), + c_call=", ".join([size] + pyb_c_args[1:])) + self.grad_all_source += pyb_code + mod = Cmodule(self.grad_all_source, hash_fn) + module = mod.load() + return getattr(module, modname) + + def _cuda_gen(self): + from .cuda import set_context + set_context() + from pycuda.elementwise import ElementwiseKernel + from pycuda._cluda import CLUDA_PREAMBLE + + _, _, py_args, c_args = get_diff_signature(self.func, self.active) + + self.grad_source = self.convert_to_device_code(self.grad_source) + expr = '{func}({args})'.format(func=self.name + FUNC_SUFFIX_F, + args=", ".join(py_args)) + + arguments = convert_to_float_if_needed(", ".join(c_args[1:])) + preamble = convert_to_float_if_needed(self.grad_source) + + cluda_preamble = Template(text=CLUDA_PREAMBLE).render( + double_support=True) + self.grad_all_source = cluda_preamble + preamble + self.correct_initialization() + knl = ElementwiseKernel(name=self.name, + arguments=arguments, + operation=expr, + preamble="\n".join([cluda_preamble, preamble])) + return knl + + def convert_to_device_code(self, code): + code = re.sub(r'\bvoid\b', 'WITHIN_KERNEL void', code) + code = re.sub(r'\bfloat\b', 'GLOBAL_MEM float ', code) + code = re.sub(r'\bdouble\b', 'GLOBAL_MEM double ', code) + return code + + +class ReverseGrad(GradBase): + def __init__(self, func, wrt, gradof, backend='tapenade'): + super().__init__(func, wrt, gradof, mode='reverse', backend=backend) + self.c_func = self._c_reverse_diff() + + def _c_reverse_diff(self): + self.grad_source = pyb11_setup_header_rev + self.grad_source + hash_fn = get_md5(self.grad_source) + modname = f'm_{hash_fn}' + + pyb_all, c_all, _, _ = get_diff_signature(self.func, + self.active, + mode=self.mode) + pyb_call = ", ".join(pyb_all) + c_call = ", ".join(c_all) + + pyb_temp = Template(pyb11_wrap_template_rev) + pyb_bind = pyb_temp.render(name=self.name, + modname=modname, + FUNC_SUFFIX_F=FUNC_SUFFIX_R, + pyb_call=pyb_call, + c_call=c_call) + + self.grad_all_source = self.grad_source + pyb_bind + tpnd_obj_dir = get_tpnd_obj_dir() + + if self.req_recomp_tpnd(): + compile_tapenade_source() + extra_inc_dir = [pybind11.get_include(), tpnd_obj_dir] + extra_link_args = [os.path.join(tpnd_obj_dir, 'adBuffer.o'), + os.path.join(tpnd_obj_dir, 'adStack.o')] + mod = Cmodule(self.grad_all_source, hash_fn, + extra_inc_dir=extra_inc_dir, + extra_link_args=extra_link_args, + extra_compile_args=['-ftemplate-depth=1024', "-O3"]) + module = mod.load() + return getattr(module, modname) + + def req_recomp_tpnd(self): + tpnd_obj_dir = get_tpnd_obj_dir() + cond1 = not os.path.exists(os.path.join(tpnd_obj_dir, 'adBuffer.o')) + cond2 = not os.path.exists(os.path.join(tpnd_obj_dir, 'adStack.o')) + return cond1 or cond2 + + def _get_len_gradof_args(self, args): + len_gradof_args = [] + for i in range(len(self.args)): + if self.args[i] in self.gradof: + len_gradof_args.append(len(args[i])) + return len_gradof_args + + def _add_grad_args_rev(self, args, gradof_var, len_gradof_arg): + final_args = [] + wrt_args = [] + gradof_arg = None + is_grad_var = [] + for i in range(len(args)): + final_args.append(args[i]) + is_grad_var.append(0) + if self.args[i] in self.active: + temp = np.zeros((len_gradof_arg, len(args[i]))) + final_args.append(temp) + is_grad_var.append(1) + if self.args[i] == gradof_var: + gradof_arg = temp + if self.args[i] in self.wrt: + wrt_args.append(temp) + return final_args, wrt_args, gradof_arg, is_grad_var + + def _call_multi_rev(self, final_args, gradof_arg, is_grad_var, len_gradof_args): + for i in range(len(gradof_arg)): + gradof_arg[i][i] = 1.0 + + for i in range(len(gradof_arg)): + temp_args = [] + for j, arg in enumerate(final_args): + if not is_grad_var[j]: + temp_args.append(arg) + else: + temp_args.append(arg[i]) + self.c_func(*temp_args) + + def _get_grad_dict(self, wrt_args, len_gradof_args): + grads = {} + for varname, arg in zip(self.wrt, wrt_args): + if len_gradof_args > 1: + grads[varname] = arg + else: + grads[varname] = arg[0] + return grads + + def _update_cache(self, wrt_args, gradof_arg, gradof_var): + self.cache[gradof_var] = [wrt_args, gradof_arg] + + @profile + def __call__(self, *args): + c_args = [self._massage_arg(x) for x in args] + self.c_func(*c_args) diff --git a/compyle/c_backend.py b/compyle/c_backend.py new file mode 100644 index 0000000..1fbe42d --- /dev/null +++ b/compyle/c_backend.py @@ -0,0 +1,321 @@ +from compyle.profile import profile +from .translator import ocl_detect_type, KnownType +from .cython_generator import CythonGenerator, get_func_definition +from .cython_generator import getsourcelines +from mako.template import Template +from .ext_module import get_md5 +from .cimport import Cmodule +from .transpiler import Transpiler +from . import array + +import pybind11 +import numpy as np + + +elwise_c_pybind = ''' + +PYBIND11_MODULE(${modname}, m) { + + m.def("${modname}", [](${pyb11_args}){ + return ${name}(${pyb11_call}); + }); +} + +''' + + +class CBackend(CythonGenerator): + def __init__(self, detect_type=ocl_detect_type, known_types=None): + super(CBackend, self).__init__() + # self.function_address_space = 'WITHIN_KERNEL ' + + def get_func_signature_pyb11(self, func): + sourcelines = getsourcelines(func)[0] + defn, lines = get_func_definition(sourcelines) + f_name, returns, args = self._analyze_method(func, lines) + pyb11_args = [] + pyb11_call = [] + c_args = [] + c_call = [] + for arg, value in args: + c_type = self.detect_type(arg, value) + c_args.append('{type} {arg}'.format(type=c_type, arg=arg)) + + c_call.append(arg) + pyb11_type = self.ctype_to_pyb11(c_type) + pyb11_args.append('{type} {arg}'.format(type=pyb11_type, arg=arg)) + if c_type.endswith('*'): + pyb11_call.append( + '({ctype}){arg}.request().ptr' + .format(arg=arg, ctype=c_type)) + else: + pyb11_call.append('{arg}'.format(arg=arg)) + + return (pyb11_args, pyb11_call), (c_args, c_call) + + def ctype_to_pyb11(self, c_type): + if c_type[-1] == '*': + return 'py::array_t<{}>'.format(c_type[:-1]) + else: + return c_type + + def _get_self_type(self): + return KnownType('GLOBAL_MEM %s*' % self._class_name) + +class CCompile(CBackend): + def __init__(self, func): + super(CCompile, self).__init__() + self.func = func + self.src = "not yet generated" + self.tp = Transpiler(backend='c') + self.c_func = self._compile() + + def _compile(self): + self.tp.add(self.func) + self.src = self.tp.get_code() + + py_data, c_data = self.get_func_signature_pyb11(self.func) + + pyb11_args = ', '.join(py_data[0][:]) + pyb11_call = ', '.join(py_data[1][:]) + hash_fn = get_md5(self.src) + modname = f'm_{hash_fn}' + template = Template(elwise_c_pybind) + src_bind = template.render( + name=self.func.__name__, + modname=modname, + pyb11_args=pyb11_args, + pyb11_call=pyb11_call + ) + self.src += src_bind + + mod = Cmodule(self.src, hash_fn, openmp=False, + extra_inc_dir=[pybind11.get_include()]) + module = mod.load() + return getattr(module, modname) + + def _massage_arg(self, x): + if isinstance(x, array.Array): + return x.dev + elif isinstance(x, np.ndarray): + return x + else: + return np.asarray(x) + + @profile + def __call__(self, *args, **kwargs): + c_args = [self._massage_arg(x) for x in args] + self.c_func(*c_args) + +elwise_c_template = ''' + +void ${name}(${arguments}){ + %if openmp: + #pragma omp parallel for + %endif + for(size_t i = 0; i < SIZE; i++){ + ${operations}; + } +} + +''' + +reduction_c_template = ''' +template +T combine(T a, T b){ + return ${red_expr}; +} + +template +T reduce_one_ar(int offset, int n, T initial_val, T* ary){ + T a, b, temp; + temp = initial_val; + + for (int i = offset; i < (n + offset); i++){ + a = temp; + b = ary[i]; + + temp = combine(a, b); + } + return temp; +} + +template +T reduce(int offset, int n, T initial_val${args_extra}){ + T a, b, temp; + temp = initial_val; + + for (int i = offset; i < (n + offset); i++){ + a = temp; + b = ${map_expr}; + + temp = combine(a, b); + } + return temp; +} + + +template +T reduce_all(long N, T initial_val${args_extra}){ + T ans = initial_val; + if (N > 0){ + %if openmp: + int ntiles = omp_get_max_threads(); + %else: + int ntiles = 1; + %endif + T* stage1_res = new T[ntiles]; + %if openmp: + #pragma omp parallel for + %endif + { + // Step 1 - reducing each tile + %if openmp: + int itile = omp_get_thread_num(); + %else: + int itile = 0; + %endif + int last_tile = ntiles - 1; + int tile_size = (N / ntiles); + int last_tile_sz = N - tile_size * last_tile; + int cur_tile_size = itile == ntiles - 1 ? last_tile_sz : tile_size; + int cur_start_idx = itile * tile_size; + + stage1_res[itile] = reduce(cur_start_idx, cur_tile_size, + initial_val${call_extra}); + %if openmp: + #pragma omp barrier + + #pragma omp single + %endif + ans = reduce_one_ar(0, ntiles, initial_val, stage1_res); + } + delete[] stage1_res; + } + return ans; +} +''' + +reduction_c_pybind = ''' + +PYBIND11_MODULE(${name}, m) { + m.def("${name}", [](long n${pyb_args}){ + return reduce_all(n, (${type})${neutral}${pyb_call}); + }); +} + +''' + +scan_c_template = ''' + +template +T combine(T a, T b){ + return ${scan_expr}; +} + + +template +T reduce( T* ary, int offset, int n, T initial_val${args_in_extra}){ + T a, b, temp; + temp = initial_val; + + for (int i = offset; i < (n + offset); i++){ + a = temp; + b = ${scan_input_expr_call}; + + temp = combine(a, b); + } + return temp; +} + +template +void excl_scan_wo_ip_exp( T* ary, T* out, int N, T initial_val){ + if (N > 0){ + T a, b, temp; + temp = initial_val; + + for (int i = 0; i < N; i++){ + a = temp; + b = ary[i]; + out[i] = temp; + temp = combine(a, b); + } + out[N] = temp; + } +} + + +template +void incl_scan( T* ary, int offset, int cur_buf_size, int N, + T initial_val, T last_item${args_extra}) +{ + if (N > 0){ + T a, b, carry, prev_item, item; + carry = initial_val; + + for (int i = offset; i < (cur_buf_size + offset); i++){ + a = carry; + b = ${scan_input_expr_call}; + prev_item = carry; + carry = combine(a, b); + item = carry; + + ${scan_output_expr_call}; + } + } +} + + +template +void scan( T* ary, long N, T initial_val${args_extra}){ + if (N > 0){ + %if openmp: + int ntiles = omp_get_max_threads(); + %else: + int ntiles = 1; + %endif + T* stage1_res = new T[ntiles]; + T* stage2_res = new T[ntiles + 1]; + %if openmp: + #pragma omp parallel + %endif + { + // Step 1 - reducing each tile + %if openmp: + int itile = omp_get_thread_num(); + %else: + int itile = 0; + %endif + int last_tile = ntiles - 1; + int tile_size = (N / ntiles); + int last_tile_sz = N - tile_size * last_tile; + int cur_tile_size = itile == ntiles - 1 ? last_tile_sz : tile_size; + int cur_start_idx = itile * tile_size; + + stage1_res[itile] = reduce(ary, cur_start_idx, cur_tile_size, + initial_val${call_in_extra}); + %if openmp: + #pragma omp barrier + + #pragma omp single + %endif + excl_scan_wo_ip_exp(stage1_res, stage2_res, + ntiles, initial_val); + + incl_scan(ary, cur_start_idx, cur_tile_size, N, + stage2_res[itile],stage2_res[ntiles]${call_extra}); + } + delete[] stage1_res; + delete[] stage2_res; + } +} +''' +scan_c_pybind = ''' + +PYBIND11_MODULE(${name}, m) { + m.def("${name}", [](py::array_t<${type}> x, long n${pyb_args}){ + return scan((${type}*) x.request().ptr, n, + (${type})${neutral}${pyb_call}); + }); +} +''' diff --git a/compyle/cimport.py b/compyle/cimport.py new file mode 100644 index 0000000..131d0d0 --- /dev/null +++ b/compyle/cimport.py @@ -0,0 +1,186 @@ +import os +import io +import importlib +import shutil +import sys +from filelock import FileLock + +from os.path import exists, expanduser, isdir, join + +import pybind11 +from distutils.extension import Extension +from distutils.command import build_ext +from distutils.core import setup +from distutils.errors import CompileError, LinkError +from distutils.sysconfig import customize_compiler + +from .ext_module import get_platform_dir, get_ext_extension, get_openmp_flags +from .capture_stream import CaptureMultipleStreams # noqa: 402 +from distutils.ccompiler import new_compiler + + +class Cmodule: + def __init__(self, src, hash_fn, root=None, verbose=False, openmp=False, + extra_inc_dir=[pybind11.get_include()], + extra_link_args=[], extra_compile_args=[]): + self.src = src + self.hash = hash_fn + self.name = f'm_{self.hash}' + self.verbose = verbose + self.openmp = openmp + self.extra_inc_dir = extra_inc_dir + self.extra_link_args = extra_link_args + self.extra_compile_args = extra_compile_args + self._use_cpp11() + + self._setup_root(root) + self._setup_filenames() + self.lock = FileLock(self.lock_path, timeout=120) + + def _setup_root(self, root): + if root is None: + plat_dir = get_platform_dir() + self.root = expanduser(join('~', '.compyle', 'source', plat_dir)) + else: + self.root = root + + self.build_dir = join(self.root, 'build') + + if not isdir(self.build_dir): + try: + os.makedirs(self.build_dir) + except OSError: + pass + + def _write_source(self): + if not exists(self.src_path): + with io.open(self.src_path, 'w', encoding='utf-8') as f: + f.write(self.src) + + def _setup_filenames(self): + self.src_path = join(self.root, self.name + '.cpp') + self.ext_path = join(self.root, self.name + get_ext_extension()) + self.lock_path = join(self.root, self.name + '.lock') + + def is_build_needed(self): + return not exists(self.ext_path) + + def build(self): + self._include_openmp() + ext = Extension(name=self.name, + sources=[self.src_path], + language='c++', + include_dirs=self.extra_inc_dir, + extra_link_args=self.extra_link_args, + extra_compile_args=self.extra_compile_args) + args = [ + "build_ext", + "--build-lib=" + self.build_dir, + "--build-temp=" + self.build_dir, + "-v", + ] + + try: + with CaptureMultipleStreams() as stream: + setup(name=self.name, + ext_modules=[ext], + script_args=args, + cmdclass={"build_ext": build_ext.build_ext}) + shutil.move(join(self.build_dir, self.name + + get_ext_extension()), self.ext_path) + + except(CompileError, LinkError, SystemExit): + hline = "*"*80 + print(hline + "\nERROR") + s_out = stream.get_output() + print(s_out[0]) + print(s_out[1]) + msg = "Compilation of code failed, please check "\ + "error messages above." + print(hline + "\n" + msg) + sys.exit(1) + + def write_and_build(self): + """Write source and build the extension module""" + if self.is_build_needed(): + with self.lock: + self._write_source() + self.build() + else: + self._message("Precompiled code from:", self.src_path) + + def load(self): + self.write_and_build() + spec = importlib.util.spec_from_file_location(self.name, self.ext_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + def _include_openmp(self): + if self.openmp: + ec, el = get_openmp_flags() + self.extra_compile_args += ec + self.extra_link_args += el + + def _use_cpp11(self): + self.extra_compile_args += ['-std=c++11'] + + def _message(self, *args): + msg = ' '.join(args) + if self.verbose: + print(msg) + + +def wget_tpnd_headers(): + import requests + "https://gitlab.inria.fr/tapenade/tapenade/-/raw/3.16-v2/ADFirstAidKit/adBuffer.c?inline=false" + "https://gitlab.inria.fr/tapenade/tapenade/-/raw/3.16-v2/ADFirstAidKit/adBuffer.h?inline=false" + baseurl = 'https://gitlab.inria.fr/tapenade/tapenade/-/raw/3.16-v2/ADFirstAidKit/{}?inline=false' + files = ['adBuffer.c', 'adBuffer.h', 'adStack.c', 'adStack.h'] + reqs = [requests.get(baseurl.format(file)) for file in files] + saveloc = get_tpnd_obj_dir() + if not os.path.exists(saveloc): + os.mkdir(saveloc) + + for file, r in zip(files, reqs): + with open(join(saveloc, file), 'wb') as f: + f.write(r.content) + + +def get_tpnd_obj_dir(): + plat_dir = get_platform_dir() + root = expanduser(join('~', '.compyle', 'source', plat_dir)) + tpnd_dir = join(root, 'tapenade_src') + return tpnd_dir + + +def compile_tapenade_source(verbose=0): + print("Setting up Tapenade source code...") + try: + obj_dir_tpnd = get_tpnd_obj_dir() + with CaptureMultipleStreams() as stream: + wget_tpnd_headers() + os.environ["CC"] = 'g++' + compiler = new_compiler(verbose=1) + customize_compiler(compiler) + compiler.compile([join(obj_dir_tpnd, 'adBuffer.c')], + output_dir=obj_dir_tpnd, + extra_preargs=['-c', '-fPIC']) + compiler.compile([join(obj_dir_tpnd, 'adStack.c')], + output_dir=obj_dir_tpnd, + extra_preargs=['-c', '-fPIC']) + objdir = join(obj_dir_tpnd, obj_dir_tpnd[1:]) + shutil.move(join(objdir, 'adBuffer.o'), + join(obj_dir_tpnd, 'adBuffer.o')) + shutil.move(join(objdir, 'adStack.o'), + join(obj_dir_tpnd, 'adStack.o')) + except (CompileError, LinkError): + hline = "*"*80 + print(hline + "\nERROR") + s_out = stream.get_output() + print(s_out[0]) + print(s_out[1]) + msg = "Compilation of tapenade source failed, please check "\ + "error messages above." + print(hline + "\n" + msg) + sys.exit(1) diff --git a/compyle/jit.py b/compyle/jit.py index 080fd42..7c5a739 100644 --- a/compyle/jit.py +++ b/compyle/jit.py @@ -5,7 +5,6 @@ import ast import importlib import warnings -import time from pytools import memoize from .config import get_config from .cython_generator import CythonGenerator @@ -368,6 +367,8 @@ def __call__(self, *args, **kw): c_func(*c_args, **kw) event.record() event.synchronize() + elif self.backend == 'c': + c_func(*c_args) class ReductionJIT(parallel.ReductionBase): @@ -448,6 +449,10 @@ def __call__(self, *args, **kw): event.record() event.synchronize() return result.get() + elif self.backend == 'c': + size = len(c_args[0]) + c_args.insert(0, size) + return c_func(*c_args) class ScanJIT(parallel.ScanBase): @@ -567,3 +572,7 @@ def __call__(self, **kwargs): c_func(*[c_args_dict[k] for k in output_arg_keys]) event.record() event.synchronize() + elif self.backend == 'c': + size = len(c_args_dict[output_arg_keys[0]]) + c_args_dict['N'] = size + c_func(*[c_args_dict[k] for k in output_arg_keys]) diff --git a/compyle/parallel.py b/compyle/parallel.py index b124735..e66a9ef 100644 --- a/compyle/parallel.py +++ b/compyle/parallel.py @@ -6,17 +6,21 @@ """ +from compyle import c_backend from functools import wraps from textwrap import wrap from mako.template import Template import numpy as np +import pybind11 +from .cimport import Cmodule from .config import get_config from .profile import profile from .cython_generator import get_parallel_range, CythonGenerator from .transpiler import Transpiler, convert_to_float_if_needed -from .types import dtype_to_ctype +from .types import TYPES, annotate, dtype_to_ctype +from .ext_module import get_md5 from . import array @@ -503,6 +507,52 @@ def _generate(self, declarations=None): # FIXME: it is difficult to get the sources from pycuda. self.all_source = self.source return knl + elif self.backend == 'c': + self.pyb11_backend = c_backend.CBackend() + py_data, c_data = self.pyb11_backend.get_func_signature_pyb11( + self.func) + pyb11_args = ', '.join(py_data[0][1:]) + size = '{arg}.request().size'.format(arg=c_data[1][1]) + pyb11_call = ', '.join([size] + py_data[1][1:]) + c_defn = ['size_t SIZE'] + c_data[0][1:] + arguments = ', '.join(c_defn) + name = self.func.__name__ + expr = '{func}({args})'.format( + func=name, + args=', '.join(c_data[1]) + ) + + openmp = self._config.use_openmp + + templete_elwise = Template(c_backend.elwise_c_template) + src_elwise = templete_elwise.render( + name=self.name, + arguments=arguments, + openmp=openmp, + operations=expr + ) + + self.source = self.tp.get_code() + if openmp: + self.source = '#include \n' + self.source + self.all_source = self.source + '\n' + src_elwise + hash_fn = get_md5(self.all_source) + modname = f'm_{hash_fn}' + + template = Template(c_backend.elwise_c_pybind) + src_bind = template.render( + name=self.name, + modname=modname, + pyb11_args=pyb11_args, + pyb11_call=pyb11_call + ) + + self.all_source += src_bind + + mod = Cmodule(self.all_source, hash_fn, openmp=openmp, + extra_inc_dir=[pybind11.get_include()]) + module = mod.load() + return getattr(module, modname) def _correct_opencl_address_space(self, c_data): code = self.tp.blocks[-1].code.splitlines() @@ -552,6 +602,8 @@ def __call__(self, *args, **kw): self.c_func(*c_args, **kw) event.record() event.synchronize() + elif self.backend == 'c': + self.c_func(*c_args) class Elementwise(object): @@ -662,6 +714,71 @@ def _generate(self, declarations=None): self.tp.compile() self.all_source = self.tp.source return getattr(self.tp.mod, 'py_' + self.name) + elif self.backend == 'c': + self.pyb11_backend = c_backend.CBackend() + if self.func is not None: + self.func.__annotations__['return'] = TYPES[self.type] + self.tp.add(self.func, declarations=declarations) + pyb_data, c_data = self.pyb11_backend.get_func_signature_pyb11( + self.func) + c_call = c_data[1] + + c_call_default = ['N', 'neutral'] + predefined_vars = ['i'] + c_call_default + c_args_extra = [[], []] + pyb_args_extra = [[], []] + for i, var in enumerate(c_call[1:]): + if var not in predefined_vars: + c_args_extra[0].append(c_data[0][i + 1]) + c_args_extra[1].append(var) + pyb_args_extra[0].append(pyb_data[0][i + 1]) + pyb_args_extra[1].append(pyb_data[1][i + 1]) + c_args_extra_str = ", " + ', '.join(c_args_extra[0]) + c_call_extra_str = ", " + ', '.join(c_args_extra[1]) + pyb_args_extra_str = ", " + ', '.join(pyb_args_extra[0]) + pyb_call_extra_str = ", " + ', '.join(pyb_args_extra[1]) + map_expr = f"{self.func.__name__}({', '.join(c_call)})" + else: + c_args_extra_str = f", {self.type + '*'} in" + c_call_extra_str = ", in" + arg_typ = self.pyb11_backend.ctype_to_pyb11(self.type + '*') + pyb_args_extra_str = f", {arg_typ} in" + pyb_call_extra_str = f", ({self.type}*) in.request().ptr" + map_expr = "in[i]" + self.source = self.tp.get_code() + openmp = self._config.use_openmp + if openmp: + self.source = '#include \n' + self.source + + template_red = Template(c_backend.reduction_c_template) + src_red = template_red.render( + args_extra=c_args_extra_str, + call_extra=c_call_extra_str, + map_expr=map_expr, + red_expr=self.reduce_expr, + name=self.name, + type=self.type, + openmp=openmp + ) + self.all_source = self.source + src_red + hash_fn = get_md5(self.all_source) + modname = f'm_{hash_fn}' + + template_pybind = Template(c_backend.reduction_c_pybind) + src_pybind = template_pybind.render( + name=modname, + type=self.type, + pyb_args=pyb_args_extra_str, + pyb_call=pyb_call_extra_str, + neutral=self.neutral, + ) + self.all_source += src_pybind + + mod = Cmodule(self.all_source, hash_fn, openmp=openmp, + extra_inc_dir=[pybind11.get_include()]) + module = mod.load() + return getattr(module, modname) + elif self.backend == 'opencl': if self.func is not None: self.tp.add(self.func, declarations=declarations) @@ -815,6 +932,10 @@ def __call__(self, *args): event.record() event.synchronize() return result.get() + elif self.backend == 'c': + size = len(c_args[0]) + c_args.insert(0, size) + return self.c_func(*c_args) class Reduction(object): @@ -939,6 +1060,8 @@ def _generate(self, declarations=None): return self._generate_cuda_kernel(declarations=declarations) elif self.backend == 'cython': return self._generate_cython_code(declarations=declarations) + elif self.backend == 'c': + return self._generate_c_code(declarations=declarations) def _default_cython_input_function(self): py_data = (['int i', '{type}[:] input'.format(type=self.type)], @@ -1052,6 +1175,114 @@ def _generate_cython_code(self, declarations=None): self.all_source = self.tp.source return getattr(self.tp.mod, 'py_' + self.name) + def _generate_c_code(self, declarations=None): + self.pyb11_backend = c_backend.CBackend() + if not self.input_func: + @annotate(i='int', ary=f'{self.type}p', return_=f'{self.type}') + def input_expr(i, ary): + return ary[i] + self.input_func = input_expr + + self.tp.add(self.input_func, declarations=declarations) + pyb_data_in, c_data_in = self.pyb11_backend.get_func_signature_pyb11( + self.input_func) + self.tp.add(self.output_func, declarations=declarations) + self.source = self.tp.get_code() + openmp = self._config.use_openmp + if openmp: + self.source = '#include \n' + self.source + c_call_in = c_data_in[1] + pyb_data_out, c_data_out = self.pyb11_backend.get_func_signature_pyb11( + self.output_func) + c_call_out = c_data_out[1] + c_call_default = ['ary', 'N'] + c_internal_var = ['item', 'prev_item', 'last_item'] + predefined_vars = c_call_default + c_internal_var + + c_args_in_extra = [[], []] + pyb_args_in_extra = [[], []] + for i, var in enumerate(c_call_in[1:]): + if var not in predefined_vars: + c_args_in_extra[0].append(c_data_in[0][i + 1]) + c_args_in_extra[1].append(var) + pyb_args_in_extra[0].append(pyb_data_in[0][i + 1]) + pyb_args_in_extra[1].append(pyb_data_in[1][i + 1]) + c_args_out_extra = [[], []] + pyb_args_extra = [[], []] + for i, var in enumerate(c_call_out[1:]): + if var not in predefined_vars: + c_args_out_extra[0].append(c_data_out[0][i + 1]) + c_args_out_extra[1].append(var) + pyb_args_extra[0].append(pyb_data_out[0][i + 1]) + pyb_args_extra[1].append(pyb_data_out[1][i + 1]) + + c0 = c_args_in_extra[0] + c1 = c_args_in_extra[1] + c_args_in_extra_str = f", {','.join(c0)}" if c1 else "" + c_call_in_extra_str = f", {','.join(c1)}" if c1 else "" + + c_args_extra = c_args_out_extra.copy() + for i, var in enumerate(c_args_in_extra[1]): + if var not in c_args_extra[1]: + c_args_extra[0].append(c_args_in_extra[0][i]) + c_args_extra[1].append(var) + pyb_args_extra[0].append(pyb_args_in_extra[0][i]) + pyb_args_extra[1].append(pyb_args_in_extra[1][i]) + + if not hasattr(self.output_func, 'arg_keys'): + self.output_func.arg_keys = {} + self.output_func.arg_keys[self._get_backend_key( + )] = c_call_default + c_args_extra[1] + + c0 = c_args_extra[0] + c1 = c_args_extra[1] + p0 = pyb_args_extra[0] + p1 = pyb_args_extra[1] + c_args_extra_str = f", {', '.join(c0)}" if c1 else "" + c_call_extra_str = f", {', '.join(c1)}" if c1 else "" + pyb_args_extra_str = f", {', '.join(p0)}" if p1 else "" + pyb_call_extra_str = f", {', '.join(p1)}" if p1 else "" + + ip_fname = self.input_func.__name__ + op_fname = self.output_func.__name__ + c_call_in_str = f"{ip_fname}({', '.join(c_call_in)})" + c_call_out_str = f"{op_fname}({', '.join(c_call_out)})" + + template_scan = Template(c_backend.scan_c_template) + src_scan = template_scan.render( + scan_expr=self.scan_expr, + scan_input_expr_call=c_call_in_str, + scan_output_expr_call=c_call_out_str, + args_extra=c_args_extra_str, + args_in_extra=c_args_in_extra_str, + call_extra=c_call_extra_str, + call_in_extra=c_call_in_extra_str, + name=self.name, + type=self.type, + pyb_args=pyb_args_extra_str, + pyb_call=pyb_call_extra_str, + openmp=openmp + ) + self.all_source = self.source + src_scan + hash_fn = get_md5(self.all_source) + modname = f'm_{hash_fn}' + + pybind_template = Template(c_backend.scan_c_pybind) + src_pybind = pybind_template.render( + name=modname, + type=self.type, + pyb_args=pyb_args_extra_str, + pyb_call=pyb_call_extra_str, + neutral=self.neutral + ) + + self.all_source += src_pybind + + mod = Cmodule(self.all_source, hash_fn, openmp=openmp, + extra_inc_dir=[pybind11.get_include()]) + module = mod.load() + return getattr(module, modname) + def _wrap_ocl_function(self, func, func_type=None, declarations=None): if func is not None: self.tp.add(func, declarations=declarations) @@ -1234,6 +1465,10 @@ def __call__(self, **kwargs): self.c_func(*[c_args_dict[k] for k in output_arg_keys]) event.record() event.synchronize() + elif self.backend == 'c': + size = len(c_args_dict[output_arg_keys[0]]) + c_args_dict['N'] = size + self.c_func(*[c_args_dict[k] for k in output_arg_keys]) class Scan(object): diff --git a/compyle/tests/test_autodiff_tapenade.py b/compyle/tests/test_autodiff_tapenade.py new file mode 100644 index 0000000..44021e5 --- /dev/null +++ b/compyle/tests/test_autodiff_tapenade.py @@ -0,0 +1,99 @@ +import pytest +import numpy as np +from math import exp, log + +from ..autodiff_tapenade import ForwardGrad, ReverseGrad, ElementwiseGrad +from ..types import annotate +from ..array import wrap, declare + + + +@annotate(n_1='int', doublep='x, y') +def simple_pow(x, y, n_1): + y[0] = 1 + for i in range(n_1): + y[0] *= x[0] + +@annotate(doublep='x, y', n_1='int') +def ifelse(x, y, n_1): + if n_1 < 0: + x[0] = 1 / x[0] + n_1 = -n_1 + y[0] = 1 + for i in range(n_1): + y[0] *= x[0] + + +@annotate(doublep='ip, W, loss', double='b, label', n_1='int') +def log_reg(ip, W, b, loss, label, n_1): + h = declare('double') + pred = declare('double') + h = 0 + for i in range(n_1): + h += ip[i] * W[i] + h += b + pred = 1 / (1 + exp(-h)) + los_prob = pred * label + (1 - pred) * (1 - label) + loss[0] = -log(los_prob) + + +def grad_log_reg(ip, W, label): + h = np.dot(ip, W) + pred = 1 / (1 + exp(-h)) + return ip * (pred - label) + + +def test_simple_pow(): + grad_pow = ForwardGrad(simple_pow, ['x'], ['y']) + + x = np.array([2]) + y = np.empty([1]) + [[yfd]] = grad_pow(x, y, 5) + assert yfd == 80 + + grad_pow = ReverseGrad(simple_pow, ['x'], ['y']) + + x = np.array([2]) + y = np.empty(1) + xd = np.zeros(1) + yd = np.array([1]) + + grad_pow(x, xd, y, yd, 5) + assert xd[0] == 80 + + +def test_if_else(): + grad_pow = ForwardGrad(ifelse, ['x'], ['y']) + + x = np.array([2]) + y = np.empty(1) + + [[yfd]] = grad_pow(x, y, -5) + assert yfd == (-5 * (2 ** -6)) + + grad_pow = ReverseGrad(ifelse, ['x'], ['y']) + + x = np.array([2]) + y = np.empty(1) + xd = np.zeros(1) + yd = np.array([1]) + + grad_pow(x, xd, y, yd, -5) + assert xd[0] == (-5 * (2 ** -6)) + + +def t_log_red(): + g_log_reg = ReverseGrad(log_reg, ['W'], ['loss']) + n = 5 + ip = np.linspace(0, 1, n) + W = np.random.randn(n) + b = 1 + loss = np.array([0]) + label = 1 + + loss_d = np.ones(1) + W_d = np.empty_like(W) + + g_log_reg(ip, W, W_d, b, loss, loss_d, label, n) + + assert np.allclose(W_d, grad_log_reg(ip, W, label)) \ No newline at end of file diff --git a/compyle/tests/test_c_backend.py b/compyle/tests/test_c_backend.py new file mode 100644 index 0000000..8b1c82d --- /dev/null +++ b/compyle/tests/test_c_backend.py @@ -0,0 +1,43 @@ +import unittest +from unittest import TestCase +from ..c_backend import CBackend, CCompile +from ..types import annotate +import numpy as np + + +class TestCBackend(TestCase): + def test_get_func_signature(self): + cbackend = CBackend() + + @annotate(x='int', y='intp', z='int', w='double') + def test_fn(x, y, z=2, w=3.0): + return x+y+z+w + temp = cbackend.get_func_signature(test_fn) + (pyb11_args, pyb11_call), (c_args, c_call) = temp + exp_pyb11_args = ['int x', 'int[:] y', 'int z', 'double w'] + exp_pyb11_call = ['x', '&y[0]', 'z', 'w'] + exp_c_args = ['int x', 'int* y', 'int z', 'double w'] + exp_c_call = ['x', 'y', 'z', 'w'] + + self.assertListEqual(pyb11_args, exp_pyb11_args) + self.assertListEqual(pyb11_call, exp_pyb11_call) + self.assertListEqual(c_args, exp_c_args) + self.assertListEqual(c_call, exp_c_call) + +class TestCCompile(TestCase): + def test_compile(self): + @annotate(int='n, p', intp='x, y') + def get_pow(n, p, x, y): + for i in range(n): + y[i] = x[i]**p + c_get_pow = CCompile(get_pow) + n = 5 + p = 5 + x = np.ones(n, dtype=np.int32) * 2 + y = np.zeros(n, dtype=np.int32) + y_exp = np.ones(n, dtype=np.int32) * 32 + c_get_pow(n, p, x, y) + assert(np.all(y == y_exp)) + +if __name__ == '__main__': + unittest.main() diff --git a/compyle/tests/test_cimport.py b/compyle/tests/test_cimport.py new file mode 100644 index 0000000..9d0d5b9 --- /dev/null +++ b/compyle/tests/test_cimport.py @@ -0,0 +1,65 @@ +from genericpath import exists +from ntpath import join +import tempfile +import unittest +from unittest import TestCase +import numpy as np +from os.path import exists, expanduser, isdir, join +import sys +import os +from mako.template import Template + + +from compyle.cimport import Cmodule +from compyle.types import annotate +from compyle.ext_module import get_platform_dir, get_md5, get_ext_extension + +dummy_module = ''' +#include +#include +namespace py = pybind11; + +void f(int n, int* x, int* y) +{ + for(int i = 0; i < n; i++){ + y[i] = (2 * x[i]); + } +} +''' +pybind = """ + +PYBIND11_MODULE(${name}, m) { + + m.def("${name}", [](py::array_t x, py::array_t y){ + return f(x.request().size, (int*)x.request().ptr, + (int*)y.request().ptr); + }); +} +""" + + +class TestCmodule(TestCase): + def setUp(self): + self.root = tempfile.mkdtemp() + + def test_build(self): + hash_fn = get_md5(dummy_module) + name = f'm_{hash_fn}' + pyb_template = Template(pybind) + src_pybind = pyb_template.render(name=name) + + all_src = dummy_module + src_pybind + mod = Cmodule(all_src, hash_fn=hash_fn, root=self.root) + checksum = get_md5(dummy_module) + self.assertTrue(mod.is_build_needed()) + + mod.load() + self.assertTrue(exists(join(self.root, 'build'))) + self.assertTrue(exists(join(self.root, 'm_' + checksum + '.cpp'))) + self.assertTrue( + exists(join(self.root, f'{name}' + get_ext_extension()))) + self.assertFalse(mod.is_build_needed()) + + +if __name__ == '__main__': + unittest.main() diff --git a/compyle/tests/test_parallel.py b/compyle/tests/test_parallel.py index 8fed025..ce1667a 100644 --- a/compyle/tests/test_parallel.py +++ b/compyle/tests/test_parallel.py @@ -19,6 +19,65 @@ def external(x): return x +class ParallelUtilsBaseC(object): + def test_elementwise_works_with_c(self): + self._check_simple_elementwise(backend='c') + + def test_elementwise_works_with_global_constant_c(self): + self._check_elementwise_with_constant(backend='c') + + def test_reduction_works_without_map_c(self): + self._check_simple_reduction(backend='c') + + def test_reduction_works_with_map_c(self): + self._check_reduction_with_map(backend='c') + + def test_reduction_works_with_external_func_c(self): + self._check_reduction_with_external_func(backend='c') + + def test_reduction_works_neutral_c(self): + self._check_reduction_min(backend='c') + + def test_scan_works_c(self): + self._test_scan(backend='c') + + def test_scan_works_c_parallel(self): + with use_config(use_openmp=True): + self._test_scan(backend='c') + + def test_large_scan_works_c_parallel(self): + with use_config(use_openmp=True): + self._test_large_scan(backend='c') + + def test_scan_works_with_external_func_c(self): + self._test_scan_with_external_func(backend='c') + + def test_scan_works_with_external_func_c_parallel(self): + with use_config(use_openmp=True): + self._test_scan_with_external_func(backend='c') + + def test_scan_last_item_c_parallel(self): + with use_config(use_openmp=True): + self._test_scan_last_item(backend='c') + + def test_scan_last_item_c_serial(self): + self._test_scan_last_item(backend='c') + + def test_unique_scan_c(self): + self._test_unique_scan(backend='c') + + def test_unique_scan_c_parallel(self): + with use_config(use_openmp=True): + self._test_unique_scan(backend='c') + + def test_repeated_scans_with_different_settings_c(self): + with use_config(use_openmp=False): + self._test_unique_scan(backend='c') + + with use_config(use_openmp=True): + self._test_unique_scan(backend='c') + + class ParallelUtilsBase(object): def test_elementwise_works_with_cython(self): self._check_simple_elementwise(backend='cython') @@ -221,7 +280,9 @@ def test_repeated_scans_with_different_settings(self): self._test_unique_scan(backend='cython') -class TestParallelUtils(ParallelUtilsBase, unittest.TestCase): +class TestParallelUtils(ParallelUtilsBase, + ParallelUtilsBaseC, + unittest.TestCase): def setUp(self): cfg = get_config() self._use_double = cfg.use_double diff --git a/compyle/translator.py b/compyle/translator.py index 9d3bcfe..0175d9d 100644 --- a/compyle/translator.py +++ b/compyle/translator.py @@ -13,6 +13,7 @@ import ast import re +from subprocess import call import sys from textwrap import dedent, wrap import types @@ -163,7 +164,7 @@ def _get_self_type(self): def _get_local_arg(self, arg, type): return arg, type - def _get_function_args(self, node): + def _get_function_args(self, node, convert_array_args=False): node_args = node.args.args if PY_VER == 2: args = [x.id for x in node_args] @@ -196,8 +197,10 @@ def _get_function_args(self, node): type = self._detect_type(arg, value) if 'LOCAL_MEM' in type: arg, type = self._get_local_arg(arg, type) - call_sig.append('{type} {arg}'.format(type=type, arg=arg)) - + if convert_array_args and type.endswith('*'): + call_sig.append('{type} {arg}[]'.format(type=type[:-1], arg=arg)) + else: + call_sig.append('{type} {arg}'.format(type=type, arg=arg)) return ', '.join(call_sig) def _get_variable_declaration(self, type_str, names): diff --git a/compyle/transpiler.py b/compyle/transpiler.py index 46663e6..9be8886 100644 --- a/compyle/transpiler.py +++ b/compyle/transpiler.py @@ -8,7 +8,7 @@ from .config import get_config from .ast_utils import get_unknown_names_and_calls from .cython_generator import CythonGenerator, CodeGenerationError -from .translator import OpenCLConverter, CUDAConverter +from .translator import OpenCLConverter, CUDAConverter, CConverter from .ext_module import ExtModule from .extern import Extern, get_extern_code from .utils import getsourcelines @@ -187,6 +187,15 @@ def __init__(self, backend='cython', incl_cluda=True): #define max(x, y) fmax((double)(x), (double)(y)) ''') + elif backend == 'c': + self._cgen = CConverter() + self.header = dedent(''' + // c code for with PyBind11 binding + #include + #include + namespace py = pybind11; + using namespace std; + ''') def _handle_symbol(self, name, value): backend = self.backend @@ -216,6 +225,8 @@ def _handle_symbol(self, name, value): return '#define {name} {value}'.format( name=name, value=value ) + elif self.backend == 'c': + return f"{ctype} {name} = {value};" def _get_comment(self): return '#' if self.backend == 'cython' else '//' @@ -278,6 +289,10 @@ def add(self, obj, declarations=None): code = self._cgen.parse( obj, declarations=declarations.get(obj.__name__) if declarations else None) + elif self.backend == 'c': + code = self._cgen.parse( + obj, declarations=declarations.get(obj.__name__) + if declarations else None) cb = CodeBlock(obj, code) self.blocks.append(cb) @@ -286,8 +301,11 @@ def add_code(self, code): cb = CodeBlock(code, code) self.blocks.append(cb) - def get_code(self): - code = [self.header] + [x.code for x in self.blocks] + def get_code(self, incl_header=True): + if incl_header: + code = [self.header] + [x.code for x in self.blocks] + else: + code = [x.code for x in self.blocks] return '\n'.join(code) def compile(self): diff --git a/examples/autodiff/billiards.py b/examples/autodiff/billiards.py new file mode 100644 index 0000000..b1e7aad --- /dev/null +++ b/examples/autodiff/billiards.py @@ -0,0 +1,193 @@ +import numpy as np +from compyle.api import annotate, Elementwise, get_config, declare +from compyle.autodiff_tapenade import ReverseGrad +import taichi as ti + +from time import time + +# This problem aims at solving the billiards problem using autodiff. +# Aim is to determine initial location and velocity of a cue ball to hit the +# target location with one of the decided balls on the table. + +get_config().use_openmp = True +gui = ti.GUI("Billiards", (1024, 1024), background_color=0x3C733F) + +def visualise(x, y, n_balls, goalx, goaly, pixel_radius, steps): + gui.clear() + for t in range(1, steps): + gui.circle((goalx, goaly), 0x00000, pixel_radius // 2) + for i in range(n_balls): + idxi = t * n_balls + i + if i == 0: + color = 0xCCCCCC + elif i == n_balls - 1: + color = 0x3344cc + else: + color = 0xF20530 + + gui.circle((x[idxi], y[idxi]), color, pixel_radius) + + gui.show() + +# forward kernel to simulate billiards for given initial conditions and steps +@annotate(int='n_balls, target_ball, billiard_layers, steps', + float='dt, elasticity, goalx, goaly, radius', + floatp='x, y, vx, vy, init_x, init_y, init_vx, init_vy, impulse_x, impulse_y, x_inc, y_inc, loss') +def forward(x, y, vx, vy, init_x, init_y, init_vx, init_vy, + impulse_x, impulse_y, x_inc, y_inc, n_balls, dt, + elasticity, target_ball, goalx, goaly, loss, + billiard_layers, radius, steps): + #initialize + x[0] = init_x[0] + y[0] = init_y[0] + vx[0] = init_vx[0] + vy[0] = init_vy[0] + count = declare('int') + idxi = declare('int') + idxj = declare('int') + idxip = declare('int') + idxtg = declare('int') + i, j, t = declare('int') + count = 0 + for i in range(billiard_layers): + for j in range(i + 1): + count += 1 + x[count] = i * 2 * radius + 0.5 + y[count] = j * 2 * radius + 0.5 - i * radius * 0.7 + + for t in range(1, steps): + # collide balls + for i in range(n_balls): + x_inc[i] = 0 + y_inc[i] = 0 + impulse_x[i] = 0 + impulse_y[i] = 0 + for j in range(n_balls): + if i != j: + x_inc_contrib = 0.0 + y_inc_contrib = 0.0 + impx = 0 + impy = 0 + + idxi = (t - 1) * n_balls + i + idxj = (t - 1) * n_balls + j + distx = (x[idxi] + dt * vx[idxi]) - (x[idxj] + dt * vx[idxj]) + disty = (y[idxi] + dt * vy[idxi]) - (y[idxj] + dt * vy[idxj]) + dist_norm = ((distx * distx) + (disty * disty)) ** 0.5 + rela_vx = vx[idxi] - vx[idxj] + rela_vy = vy[idxi] - vy[idxj] + + if dist_norm < 2 * radius: + dirx = distx / dist_norm + diry = disty / dist_norm + projected_v = (dirx * rela_vx) + (diry * rela_vy) + + if projected_v < 0: + impx = -(1 + elasticity) * 0.5 * projected_v * dirx + impy = -(1 + elasticity) * 0.5 * projected_v * diry + + toi = (dist_norm - 2 * radius) / min( + -1e-3, projected_v) # Time of impact + x_inc_contrib = min(toi - dt, 0.0) * impx + y_inc_contrib = min(toi - dt, 0.0) * impy + + x_inc[i] += x_inc_contrib + y_inc[i] += y_inc_contrib + impulse_x[i] += impx + impulse_y[i] += impy + + # end collide balls + + # update speed and position + for i in range(n_balls): + idxi = t * n_balls + i + idxip = (t - 1) * n_balls + i + vx[idxi] = vx[idxip] + impulse_x[i] + vy[idxi] = vy[idxip] + impulse_y[i] + x[idxi] = x[idxip] + dt * vx[idxi] + x_inc[i] + y[idxi] = y[idxip] + dt * vy[idxi] + y_inc[i] + + # compute loss + idxtg = (steps - 1) * n_balls + target_ball + loss[0] = (x[idxtg] - goalx) ** 2 + (y[idxtg] - goaly) ** 2 + + +# generate a gradient function for the forward function +grad_forward = ReverseGrad(forward, wrt=['init_x', 'init_y', 'init_vx', 'init_vy', 'y_inc', 'x', 'y', 'vx', 'x_inc', 'vy', 'impulse_x', 'impulse_y'], gradof=['loss']) + + +def optimize(): + for iter in range(200): + lossb[0] = 1.0 + + grad_forward(x, xb, y, yb, vx, vxb, vy, vyb, init_x, init_xb, init_y, init_yb, + init_vx, init_vxb, init_vy, init_vyb, impulse_x, impulse_xb, + impulse_y, impulse_yb, x_inc, x_incb, y_inc, y_incb, + n_balls, dt, elasticity, target_ball, goalx, goaly, loss, lossb, + billiard_layers, radius, steps) + init_x[0] -= learning_rate * init_xb[0] + init_y[0] -= learning_rate * init_yb[0] + init_vx[0] -= learning_rate * init_vxb[0] + init_vy[0] -= learning_rate * init_vyb[0] + if iter % 20 == 0: + print(f"iter: {iter} \t loss: {loss[0]}") +if __name__ == '__main__': + # setup parameters + dtype = np.float32 + billiard_layers = 4 + n_balls = 1 + (1 + billiard_layers) * billiard_layers // 2 + + vis_interval = 64 + output_vis_interval = 16 + steps = 1024 + max_steps = 1024 + + vis_resolution = 1024 + + loss = np.zeros(1, dtype=dtype) + + x = np.zeros(max_steps * n_balls, dtype=dtype) + y = np.zeros(max_steps * n_balls, dtype=dtype) + x_inc = np.zeros(n_balls, dtype=dtype) + y_inc = np.zeros(n_balls, dtype=dtype) + vx = np.zeros(max_steps * n_balls, dtype=dtype) + vy = np.zeros(max_steps * n_balls, dtype=dtype) + impulse_x = np.zeros(n_balls, dtype=dtype) + impulse_y = np.zeros(n_balls, dtype=dtype) + + init_x = np.array([0.1], dtype=dtype) + init_y = np.array([0.5], dtype=dtype) + init_vx = np.array([0.3], dtype=dtype) + init_vy = np.array([0.0], dtype=dtype) + + xb = np.zeros_like(x, dtype=dtype) + yb = np.zeros_like(y, dtype=dtype) + vxb = np.zeros_like(vx, dtype=dtype) + vyb = np.zeros_like(vy, dtype=dtype) + init_xb = np.zeros_like(init_x, dtype=dtype) + init_yb = np.zeros_like(init_y, dtype=dtype) + init_vxb = np.zeros_like(init_vx, dtype=dtype) + init_vyb = np.zeros_like(init_vy, dtype=dtype) + impulse_xb = np.zeros_like(impulse_x, dtype=dtype) + impulse_yb = np.zeros_like(impulse_y, dtype=dtype) + x_incb = np.zeros_like(x_inc, dtype=dtype) + y_incb = np.zeros_like(y_inc, dtype=dtype) + lossb = np.ones_like(loss, dtype=dtype) + + target_ball = n_balls - 1 + goalx = 0.9 + goaly = 0.75 + radius = 0.03 + elasticity = 0.8 + + + dt = 0.003 + learning_rate = 0.01 + + begin = time() + optimize() + end = time() + print(f"took: {end - begin} seconds to simulate {billiard_layers} layers of billiard balls") + + pixel_radius = (int(radius * 1024) + 1) + visualise(x, y, n_balls, goalx, goaly, pixel_radius, steps) \ No newline at end of file diff --git a/examples/autodiff/nn_mnist.py b/examples/autodiff/nn_mnist.py new file mode 100644 index 0000000..4dfacf8 --- /dev/null +++ b/examples/autodiff/nn_mnist.py @@ -0,0 +1,201 @@ +import optax +import jax.numpy as jnp +import jax +import numpy as np +from optax import softmax_cross_entropy +from mnist import MNIST + +import numpy as np +from math import exp, log +import matplotlib.pyplot as plt + +from compyle.autodiff_tapenade import ReverseGrad +from compyle.api import annotate, Elementwise, declare, get_config +from compyle.array import empty, ones +from pytools import argmax + +np.random.seed(2) +# get_config().use_openmp = True + +BATCH_SIZE = 200 +from mnist import MNIST + +mndata = MNIST('/home/rohit/Documents/Pypr/examples/samples') + +images, labels = mndata.load_training() +TRAIN = np.array(images, dtype=np.float32) / 255 +TRAIN_LABELS = np.array(labels) + +imt, lt = mndata.load_testing() +TEST = np.array(imt, dtype=np.float32) / 255 +TEST_FLAT = TEST.flatten() +TEST_LABELS = np.array(lt, dtype=np.int32) + +def initialise(n_0, n_1, n_2): + w_01 = np.random.random(n_0 * n_1).astype(np.float32) * 0.01 + w_12 = np.random.random(n_1 * n_2).astype(np.float32) * 0.01 + b_1 = np.random.random(n_1).astype(np.float32) * 0.01 + b_2 = np.random.random(n_2).astype(np.float32) * 0.01 + v_1 = np.zeros(n_1, dtype=np.float32) + v_2 = np.zeros(n_2, dtype=np.float32) + + g_w_01 = np.zeros_like(w_01, dtype=np.float32) + g_w_12 = np.zeros_like(w_12, dtype=np.float32) + + g_b_1 = np.zeros_like(b_1, dtype=np.float32) + g_b_2 = np.zeros_like(b_2, dtype=np.float32) + + return w_01, b_1, v_1, w_12, b_2, v_2, g_w_01, g_w_12, g_b_1, g_b_2 + + + +n_0 = 784 +n_1 = 128 +n_2 = 10 +alpha = 0.001 +n_train = 60000 + +w_01, b_1, v_1, w_12, b_2, v_2, g_w_01, g_w_12, g_b_1, g_b_2 = initialise(n_0, n_1,n_2) +g_v_1 = np.zeros_like(v_1) +g_v_2 = np.zeros_like(v_2) +g_ip = np.zeros(n_0) +loss = np.zeros(1, dtype=np.float32) +loss_b = np.ones(1, dtype=np.float32) + +initial_params = {} +initial_params['w01'] = w_01 +initial_params['w12'] = w_12 +initial_params['b1'] = b_1 +initial_params['b2'] = b_2 + +grads = {} +grads['w01'] = g_w_01 +grads['w12'] = g_w_12 +grads['b1'] = g_b_1 +grads['b2'] = g_b_2 + +############################################################################### +############################################################################### +############################################################################### + +@annotate(i='int', v='floatp') +def reset(i, v): + v[i] = 0.0 +reset_all = Elementwise(reset, backend='c') + +@annotate(int='i, batch_size', floatp='g, gsum') +def addgrad(i, g, gsum): + gsum[i] += g[i] +addgrad_elwise = Elementwise(addgrad, backend='c') + +def reset_grads(grads): + for key in grads: + reset_all(grads[key]) + +def add_grads(grads, gradsums): + for key in grads: + addgrad_elwise(grads[key], gradsums[key]) +def avg_grads(gradsums, batch_size): + for key in gradsums: + gradsums[key] /= batch_size + +@annotate(int='n_0, n_1, n_2, n_3, expected', floatp='input, w_01, b_1, v_1, w_12, b_2, v_2, loss') +def fwd_pass_final(n_0, input, w_01, n_1, b_1, v_1, w_12, n_2, b_2, v_2, loss, expected): + i, j = declare('int') + + for i in range(n_1): + v_1[i] = b_1[i] + for j in range(n_0): + v_1[i] += w_01[i * n_0 + j] * input[j] + + for i in range(n_1): + if v_1[i] < 0: + v_1[i] = 0 + + for i in range(n_2): + v_2[i] = b_2[i] + for j in range(n_1): + v_2[i] += w_12[i * n_1 + j] * v_1[j] + + den = declare('float') + den = 0 + + for i in range(n_2): + v_2[i] = exp(v_2[i]) + den += v_2[i] + for i in range(n_2): + v_2[i] = v_2[i] / den + + loss[0] = -log(v_2[expected]) + +grad_forward = ReverseGrad(fwd_pass_final, + ['w_01', 'b_1', 'w_12', 'b_2'], ['loss']) + +def fit(optimizer, params, grads, n_train, ip_ar, op_ar): + opt_state = optimizer.init(params) + + gradsums = {} + gradsums['w01'] = np.zeros_like(w_01) + gradsums['w12'] = np.zeros_like(w_12) + gradsums['b1'] = np.zeros_like(b_1) + gradsums['b2'] = np.zeros_like(b_2) + + losssum = 0.0 + + + def step(opt_state, params, grads): + updates, opt_state = optimizer.update(grads, opt_state, params) + params = optax.apply_updates(params, updates) + return opt_state, params + + for _ in range(1): + reset_grads(grads) + reset_grads(gradsums) + for i in range(n_train): + loss_b[0] = 1.0 + grad_forward(n_0, ip_ar[i, :], params['w01'], grads['w01'], n_1, params['b1'], grads['b1'], v_1, g_v_1, + params['w12'], grads['w12'], n_2, params['b2'], grads['b2'], v_2, g_v_2, + loss, loss_b, op_ar[i]) + add_grads(grads, gradsums) + losssum += loss[0] + reset_grads(grads) + + if i!= 0 and i % BATCH_SIZE == 0: + print(i, losssum / BATCH_SIZE) + avg_grads(gradsums, BATCH_SIZE) + opt_state, params = step(opt_state, params, gradsums) + reset_grads(gradsums) + losssum = 0.0 + + return params + + +def net(params, x): + # l1 + x = jnp.dot(x, params['w01']) + params['b1'] + x = jax.nn.relu(x) + #l2 + x = jnp.dot(x, params['w12']) + params['b2'] + x = jax.nn.softmax(x) + return x + +def test(params, test_images, test_labels): + correct = 0 + for i, (batch, labels) in enumerate(zip(TEST, TEST_LABELS)): + y_hat = net(params, batch) + if argmax(y_hat) == labels: + correct += 1 + + return correct / len(TEST) +############################################################################### +############################################################################### +############################################################################### + +optimizer = optax.adam(learning_rate=1e-3) +params = fit(optimizer, initial_params, grads, n_train, TRAIN, TRAIN_LABELS) + +params['w01'] = params['w01'].reshape(n_1, n_0).T +params['w12'] = params['w12'].reshape(n_2, n_1).T + +op = test(params, TEST, TEST_LABELS) +print("Accuracy: ", op) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index db7fea3..ae429d5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,5 @@ pytools cython numpy pytest +pybind11 +filelock \ No newline at end of file