diff --git a/pycuda/cumath.py b/pycuda/cumath.py index dbae5bd6..1a8dcb5b 100644 --- a/pycuda/cumath.py +++ b/pycuda/cumath.py @@ -42,7 +42,7 @@ def f(array, stream_or_out=None, **kwargs): func = elementwise.get_unary_func_kernel(func_name, array.dtype) func.prepared_async_call(array._grid, array._block, stream, - array.gpudata, out.gpudata, array.mem_size) + array, out, array.mem_size) return out return f @@ -77,7 +77,7 @@ def fmod(arg, mod, stream=None): func = elementwise.get_fmod_kernel() func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, mod.gpudata, result.gpudata, arg.mem_size) + arg, mod, result, arg.mem_size) return result @@ -94,7 +94,7 @@ def frexp(arg, stream=None): func = elementwise.get_frexp_kernel() func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, sig.gpudata, expt.gpudata, arg.mem_size) + arg, sig, expt, arg.mem_size) return sig, expt @@ -111,7 +111,7 @@ def ldexp(significand, exponent, stream=None): func = elementwise.get_ldexp_kernel() func.prepared_async_call(significand._grid, significand._block, stream, - significand.gpudata, exponent.gpudata, result.gpudata, + significand, exponent, result, significand.mem_size) return result @@ -129,7 +129,7 @@ def modf(arg, stream=None): func = elementwise.get_modf_kernel() func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, intpart.gpudata, fracpart.gpudata, + arg, intpart, fracpart, arg.mem_size) return fracpart, intpart diff --git a/pycuda/curandom.py b/pycuda/curandom.py index e1c68428..d31ee01a 100644 --- a/pycuda/curandom.py +++ b/pycuda/curandom.py @@ -240,7 +240,7 @@ def rand(shape, dtype=np.float32, stream=None): raise NotImplementedError; func.prepared_async_call(result._grid, result._block, stream, - result.gpudata, np.random.randint(2**31-1), result.size) + result, np.random.randint(2**31-1), result.size) return result diff --git a/pycuda/deferred.py b/pycuda/deferred.py new file mode 100644 index 00000000..a843322e --- /dev/null +++ b/pycuda/deferred.py @@ -0,0 +1,553 @@ +""" +This exports a "deferred" implementation of SourceModule, where compilation +is delayed until call-time. Several methods, like get_function(), return +"deferred" values that are also only evaluated at call-time. +""" + +from pycuda.tools import context_dependent_memoize +from pycuda.compiler import compile, SourceModule +import pycuda.driver + +import re + +class DeferredSource(object): + ''' + Source generator that supports user-directed indentation, nesting + ``DeferredSource`` objects, indentation-aware string interpolation, + and deferred generation. + Use ``+=`` or ``add()`` to add source fragments as strings or + other ``DeferredSource`` objects, ``indent()`` or ``dedent()`` to + change base indentation, and ``__call__`` or ``generate()`` to + generate source. 
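+    A small usage sketch (illustrative only; the C fragment and the
+    ``expr`` placeholder are arbitrary, not taken from this file)::
+
+        src = DeferredSource()
+        src += "for (unsigned i = 0; i < n; ++i) {"
+        src.indent()
+        src.add("z[i] = %(expr)s;", format_dict={"expr": "x[i] + y[i]"})
+        src.dedent()
+        src += "}"
+        print(src.generate())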
+ + ''' + def __init__(self, str=None, subsources=None, base_indent=0, indent_step=2): + self.base_indent = base_indent + self.indent_step = indent_step + if subsources is None: + subsources = [] + self.subsources = subsources + if str is not None: + self.add(str) + self._regex_space = re.compile(r"^(\s*)(.*?)(\s*)$") + self._regex_format = re.compile(r"%\(([^\)]*)\)([a-zA-Z])") + + def __str__(self): + return self.generate() + + def __repr__(self): + return repr(self.__str__()) + + def __call__(self, indent=0, indent_first=True): + return self.generate(indent, indent_first) + + def generate(self, indent=0, indent_first=True, get_list=False): + if get_list: + retval = [] + else: + retval = '' + do_indent = not indent_first + for subindent, strip_space, subsource, format_dict in self.subsources: + if do_indent: + newindent = self.base_indent + indent + subindent + else: + newindent = 0 + do_indent = True + if isinstance(subsource, DeferredSource): + retval = retval + subsource.generate(indent=(indent + subindent), get_list=get_list) + continue + lines = subsource.split("\n") + minstrip = None + newlines = [] + for line in lines: + linelen = len(line) + space_match = self._regex_space.match(line) + space = space_match.group(1) + end_leading_space = space_match.end(1) + begin_trailing_space = space_match.start(3) + if strip_space: + if linelen == end_leading_space: + # all space, ignore + continue + if minstrip is None or end_leading_space < minstrip: + minstrip = end_leading_space + if not format_dict: + newlines.append(line) + continue + newlinelist = None + newline = '' + curpos = 0 + matches = list(self._regex_format.finditer(line, end_leading_space)) + nummatches = len(matches) + for match in matches: + formatchar = match.group(2) + name = match.group(1) + matchstart = match.start() + matchend = match.end() + repl = format_dict.get(name, None) + if repl is None: + continue + if (isinstance(repl, DeferredSource) and + nummatches == 1 and + matchstart == end_leading_space): + # only one replacement, and only spaces preceding + newlinelist = [ space + x + for x in repl.generate(get_list=True) ] + else: + newline = newline + line[curpos:matchstart] + newline = newline + (('%' + formatchar) % (repl,)) + curpos = matchend + if newlinelist: + newlines.extend(newlinelist) + else: + newlines.append(newline) + # add remaining unprocessed part of line to end of last + # replacement + if newlinelist is not None and len(newlinelist) > 1: + newlines.append(space + line[curpos:]) + else: + newlines[-1] = newlines[-1] + line[curpos:] + indentstr = ' ' * (indent + subindent) + for i, line in enumerate(newlines): + line = indentstr + line[minstrip:] + newlines[i] = line + if get_list: + retval += newlines + else: + retval = retval + "\n".join(newlines) + "\n" + return retval + + def indent(self, indent_step=None): + if indent_step is None: + indent_step = self.indent_step + self.base_indent += indent_step + return self + + def dedent(self, indent_step=None): + if indent_step is None: + indent_step = self.indent_step + self.base_indent -= indent_step + return self + + def format_dict(self, format_dict): + for subsource in self.subsources: + subsource[3] = format_dict + return self + + def add(self, other, strip_space=True, format_dict=None): + self.subsources.append([self.base_indent, strip_space, other, format_dict]) + return self + + def __iadd__(self, other): + self.add(other) + return self + + def __add__(self, other): + newgen = DeferredSource(subsources=self.subsources, + 
base_indent=self.base_indent, + indent_step=self.indent_step) + newgen.add(other) + return newgen + +class DeferredVal(object): + ''' + This is an object that serves as a wrapper to an object that may not + even exist yet (perhaps this should have been called FutureVal). All + methods provided by ``DeferredVal`` start with underscores so as not to + interfere with common attribute names. + + The life of a ``DeferredVal`` can be divided into two phases: before + and after the wrapped object is "completed", i.e. once the wrapped + object is specified (see below for how/when this happens). + + After it is completed, any attempt to access attributes on this + ``DeferredVal`` will be simply redirected to the wrapped object. + + Before the wrapped object is completed, the only attributes allowed are + methods named in the class variable ``_deferred_method_dict`` (which + must be set in a subclass). ``_deferred_method_dict`` specifies the + names of methods that can be deferred to a later time when the current + ``DeferredVal`` is completed. It maps method names to a result class + that will be instantiated and used to represent the return value of the + deferred method call. The result class can be DeferredVal, a subclass + of ``DeferredVal``, or None (same as DeferredVal). Any attempts to + call one of these defer-able methods will return an instance of the + result class, which will be tied to a future call of the method once + the current ``DeferredVal`` object is completed. + + A ``DeferredVal`` can be completed by specifying the wrapped object in + one of two ways. One way is to set it explicitly with + ``_set_val(val)``. The other is to call ``_get()``, which requires + that ``_eval()`` be defined in a subclass to return the actual object. + Both ways will trigger the deferred method calls (described above) that + depend on this ``DeferredVal`` and the completion of their own + ``DeferredVal`` return values. + + Once completed, the wrapped object can be retrieved by calling + ``_get()``. 
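+    A minimal sketch (the subclass and the string value are hypothetical,
+    purely to show the deferral mechanics)::
+
+        class DeferredThing(DeferredVal):
+            _deferred_method_dict = {"lower": None}
+
+        dv = DeferredThing()
+        res = dv.lower()        # queued; returns another DeferredVal
+        dv._set_val("ABC")      # completing dv runs the queued call
+        assert res._get() == "abc"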
+ ''' + __unimpl = object() + _deferred_method_dict = None # must be set by subclass + + def __init__(self): + self._val_available = False + self._val = None + self._deferred_method_calls = [] + + def _copy(self, retval=None): + if retval is None: + retval = self.__class__() + retval._val_available = self._val_available + retval._val = self._val + retval._deferred_method_calls = self._deferred_method_calls + return retval + + def __repr__(self): + return self._repr(0) + + def _repr(self, indent): + indentstr = " " * indent + retstrs = [] + retstrs.append("") + retstrs.append("%s [" % (self.__class__,)) + for dmc in self._deferred_method_calls: + (name, args, kwargs, retval) = dmc + retstrs.append(" method %s" % (repr(name),)) + for arg in args: + if isinstance(arg, DeferredVal): + retstrs.append(" deferred arg (id=%s)" % (id(arg),)) + retstrs.append(arg._repr(indent + 6)) + else: + retstrs.append(" arg %s" % (repr(arg),)) + for kwname, arg in kwargs.items(): + if isinstance(arg, DeferredVal): + retstrs.append(" deferred kwarg %s (id=%s)" % (repr(kwname), id(arg))) + retstrs.append(arg._repr(indent + 6)) + else: + retstrs.append(" kwarg %s=%s" % (kwname, repr(arg),)) + retstrs.append(" deferred retval (id=%s)" % (id(retval),)) + retstrs.append("]") + return "\n".join([(indentstr + retstr) for retstr in retstrs]) + + def _set_val(self, val): + self._val = val + self._val_available = True + self._eval_methods() + return val + + def _eval(self): + raise NotImplementedError() + + def _eval_list(self, vals): + newvals = [] + for val in vals: + if isinstance(val, DeferredVal): + val = val._get() + newvals.append(val) + return newvals + + def _eval_dict(self, valsdict): + newvalsdict = {} + for name, val in valsdict.items(): + if isinstance(val, DeferredVal): + val = val._get() + newvalsdict[name] = val + return newvalsdict + + def _eval_methods(self): + assert(self._val_available) + val = self._val + for op in self._deferred_method_calls: + (methodname, methodargs, methodkwargs, deferredretval) = op + methodargs = self._eval_list(methodargs) + methodkwargs = self._eval_dict(methodkwargs) + retval = getattr(val, methodname)(*methodargs, **methodkwargs) + deferredretval._set_val(retval) + self._deferred_method_calls = [] + + def _get(self): + if not self._val_available: + self._val = self._eval() + self._val_available = True + self._eval_methods() + return self._val + + def _get_deferred_func(self, _name, _retval): + def _deferred_func(*args, **kwargs): + """ + When this function is called from an "uncompleted" DeferredVal, + the name of the deferred method, arguments, and DeferredVal/wrapper + object for the eventual return value are stored for future + evaluation when the current wrapped object is completed, and + the DeferredVal return object is returned. + Otherwise, if the DeferredVal is already completed, then return + the result of calling the method directly. 
+ """ + if not self._val_available: + self._deferred_method_calls.append((_name, args, kwargs, _retval)) + return _retval + args = self._eval_list(args) + kwargs = self._eval_dict(kwargs) + return getattr(self._val, _name)(*args, **kwargs) + _deferred_func.__name__ = _name + ".deferred" + return _deferred_func + + def __getattr__(self, name): + if self.__class__._deferred_method_dict is None: + raise Exception("DeferredVal must be subclassed and the class attribute _deferred_method_dict must be set to a valid dictionary!") + if self._val_available: + return getattr(self._val, name) + deferredclass = self.__class__._deferred_method_dict.get(name, self.__unimpl) + if deferredclass is not self.__unimpl: + if deferredclass is None: + deferredclass = DeferredVal + retval = deferredclass() + return self._get_deferred_func(name, retval) + raise AttributeError("no such attribute (yet): '%s'" % (name,)) + +class DeferredNumeric(DeferredVal): + """ + This is a DeferredVal that allows the deferral of all math operations. + """ + pass +_mathops = ( + '__add__', '__sub__', '__mul__', '__floordiv__', '__mod__', + '__divmod__', '__pow__', '__lshift__', '__rshift__', '__and__', + '__xor__', '__or__', '__div__', '__truediv__', '__radd__', '__rsub__', + '__rmul__', '__rdiv__', '__rtruediv__', '__rfloordiv__', '__rmod__', + '__rdivmod__', '__rpow__', '__rlshift__', '__rrshift__', '__rand__', + '__rxor__', '__ror__', '__iadd__', '__isub__', '__imul__', '__idiv__', + '__itruediv__', '__ifloordiv__', '__imod__', '__ipow__', '__ilshift__', + '__irshift__', '__iand__', '__ixor__', '__ior__', '__pos__', '__abs__', + '__invert__', '__complex__', '__int__', '__long__', '__float__', + '__oct__', '__hex__', '__index__', '__coerce__') +DeferredNumeric._deferred_method_dict = dict((x, DeferredNumeric) + for x in _mathops) +def _get_deferred_attr_func(_name): + def _deferred_func(self, *args, **kwargs): + return self.__getattr__(_name)(*args, **kwargs) + _deferred_func.__name__ = _name + return _deferred_func +for name in _mathops: + setattr(DeferredNumeric, name, _get_deferred_attr_func(name)) + +class DeferredModuleCall(DeferredVal): + _deferred_method_dict = {} + def __init__(self, methodstr, *args, **kwargs): + super(DeferredModuleCall, self).__init__() + self._methodstr = methodstr + self._args = args + self._kwargs = kwargs + self._mod = None + + def _set_mod(self, mod): + self._mod = mod + + def _copy(self, retval=None): + if retval is None: + retval = self.__class__(self._methodstr, *self._args, **self._kwargs) + return super(DeferredModuleCall, self)._copy(retval=retval) + + def _eval(self): + return getattr(self._mod, self._methodstr)(*self._args, **self._kwargs) + +class DeferredTexRef(DeferredModuleCall): + _deferred_method_dict = { + "set_array": None, + "set_address": DeferredNumeric, + "set_address_2d": None, + "set_format": None, + "set_address_mode": None, + "set_flags": None, + "get_address": DeferredNumeric, + "get_flags": DeferredNumeric, + } + +class DeferredFunction(object): + ''' + This class is a pseudo-replacement of ``pycuda.driver.Function``, + but takes a ``DeferredSourceModule`` and a function name as an argument, + and queues any call to ``prepare()`` until call-time, at which time it + calls out to the ``DeferredSourceModule`` object do create the actual + Function before preparing (if necessary) and calling the underlying + kernel. NOTE: you may now send the actual ``GPUArrays`` as arguments, + rather than their ``.gpudata`` members; this can be helpful to + dynamically create kernels. 
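+    A call-side sketch (illustrative; ``MyModule``, ``x_gpu`` and the
+    prepare() format are placeholder assumptions, and ``np`` stands for
+    ``numpy``)::
+
+        mod = MyModule()                   # some DeferredSourceModule subclass
+        func = mod.get_function("twice")   # nothing is compiled yet
+        func.prepare("P" + np.dtype(np.uintp).char)   # queued until call-time
+        # GPUArrays may be passed directly instead of their .gpudata:
+        func.prepared_async_call(x_gpu._grid, x_gpu._block, None,
+                                 x_gpu, x_gpu.mem_size)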
+ ''' + def __init__(self, deferredmod, funcname): + self._deferredmod = deferredmod + self._funcname = funcname + self._prepare_args = None + + def get_unimplemented(_methodname): + def _unimplemented(self, _methodname=_methodname, *args, **kwargs): + raise NotImplementedError("%s does not implement method '%s'" % (type(self), _methodname,)) + return _unimplemented + + for meth_name in ["set_block_shape", "set_shared_size", + "param_set_size", "param_set", "param_seti", "param_setf", + "param_setv", "param_set_texref", + "launch", "launch_grid", "launch_grid_async"]: + setattr(self, meth_name, get_unimplemented(meth_name)) + + def _fix_texrefs(self, kwargs, mod): + texrefs = kwargs.get('texrefs', None) + if texrefs is not None: + kwargs = kwargs.copy() + newtexrefs = [] + for texref in texrefs: + if isinstance(texref, DeferredVal): + if isinstance(texref, DeferredModuleCall): + texref = texref._copy() # future calls may use different modules/functions + texref._set_mod(mod) + texref = texref._get() + elif isinstance(texref, DeferredVal): + texref = texref._get() + newtexrefs.append(texref) + kwargs['texrefs'] = newtexrefs + return kwargs + + def __call__(self, *args, **kwargs): + block = kwargs.get('block', None) + if block is None or not isinstance(block, tuple) or len(block) != 3: + raise ValueError("keyword argument 'block' is required, and must be a 3-tuple of integers") + grid = kwargs.get('grid', (1,1)) + mod, func = self._deferredmod._delayed_get_function(self._funcname, args, grid, block) + kwargs = self._fix_texrefs(kwargs, mod) + return func.__call__(*args, **kwargs) + + def param_set_texref(self, *args, **kwargs): + raise NotImplementedError() + + def prepare(self, *args, **kwargs): + self._prepare_args = (args, kwargs) + return self + + def _do_delayed_prepare(self, mod, func): + if self._prepare_args is None: + raise Exception("prepared_*_call() requires that prepare() be called first") + (prepare_args, prepare_kwargs) = self._prepare_args + prepare_kwargs = self._fix_texrefs(prepare_kwargs, mod) + func.prepare(*prepare_args, **prepare_kwargs) + + def _generic_prepared_call(self, funcmethodstr, funcmethodargs, funcargs, funckwargs): + grid = funcmethodargs[0] + block = funcmethodargs[1] + mod, func = self._deferredmod._delayed_get_function(self._funcname, funcargs, grid, block) + self._do_delayed_prepare(mod, func) + newfuncargs = [ getattr(arg, 'gpudata', arg) for arg in funcargs ] + fullargs = list(funcmethodargs) + fullargs.extend(newfuncargs) + return getattr(func, funcmethodstr)(*fullargs, **funckwargs) + + def prepared_call(self, grid, block, *args, **kwargs): + return self._generic_prepared_call('prepared_call', (grid, block), args, kwargs) + + def prepared_timed_call(self, grid, block, *args, **kwargs): + return self._generic_prepared_call('prepared_timed_call', (grid, block), args, kwargs) + + def prepared_async_call(self, grid, block, stream, *args, **kwargs): + return self._generic_prepared_call('prepared_async_call', (grid, block, stream), args, kwargs) + +@context_dependent_memoize +def _delayed_compile_aux(source, compileargs): + # re-convert any tuples to lists + newcompileargs = [] + for i, arg in enumerate(compileargs): + if isinstance(arg, tuple): + arg = list(arg) + newcompileargs.append(arg) + cubin = compile(source, *newcompileargs) + + from pycuda.driver import module_from_buffer + return module_from_buffer(cubin) + +class DeferredSourceModule(SourceModule): + ''' + This is an abstract specialization of SourceModule which allows the + delay of creating 
the actual kernel source until call-time, at which + point the actual arguments are available and their characteristics can + be used to create specific kernels. + To support this, ``get_function()`` returns a ``DeferredFunction`` + object which queues any calls to ``DeferredFunction.prepare()`` and + saves them until future calls to ``DeferredFunction.__call__()`` or + ``DeferredFunction.prepared_*_call()``. NOTE: you may now send actual + ``GPUArrays`` to these functions rather their ``.gpudata`` members; + this can be helpful when creating dynamic kernels. + Likewise, ``get_global()``, ``get_texref()`` and ``get_surfref()`` + return proxy objects that can be stored by ``DeferredFunction.prepare()`` + and will only be evaluated at call-time. + This class must be subclassed and the function ``create_source(self, + grid, block, *args)`` must be overridden, returning the kernel source + (or ``DeferredSource`` object) that should be compiled. ``grid``, + ``block``, and ``*args`` are the same arguments that were sent to the + ``DeferredFunction`` call functions above. + The function ``create_key(self, grid, block, *args)`` returns a tuple + ``(key, precalc)``. ``create_key`` is always called before + ``create_source`` and any pre-calculated info in ``precalc`` is sent back + to ``create_source``; ``key`` is used to cache any compiled functions. + Default return value of ``create_key()`` is (None, None), which means to + use the function name and generated source as the key. The key returned + by ``create_key()`` must be usable as a hash key. + ''' + _cache = {} + + def __init__(self, nvcc="nvcc", options=None, keep=False, + no_extern_c=False, arch=None, code=None, cache_dir=None, + include_dirs=[]): + self._arch = arch + # tuples below are so _compileargs can be used as a hash key + if options is not None: + options = tuple(options) + include_dirs = tuple(include_dirs) + self._compileargs = (nvcc, options, keep, no_extern_c, + arch, code, cache_dir, include_dirs) + + def _delayed_compile(self, source): + self._check_arch(self._arch) + + return _delayed_compile_aux(source, self._compileargs) + + def create_key(self, grid, block, *funcargs): + return (None, None) + + def create_source(self, precalc, grid, block, *funcargs): + raise NotImplementedError("create_source must be overridden!") + + def _delayed_get_function(self, funcname, funcargs, grid, block): + ''' + If the first element of the tuple returned by ``create_key()`` is + not None, then it is used as the key to cache compiled functions. + Otherwise the return value of ``create_source()`` is used as the key. 
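+        For example, a (hypothetical) subclass could key the cache on the
+        argument dtypes alone, so that one kernel is compiled per dtype
+        combination (``dtype_to_ctype`` is assumed to come from
+        ``pycuda.tools``)::
+
+            class DtypeKeyedModule(DeferredSourceModule):
+                def create_key(self, grid, block, *args):
+                    key = tuple(getattr(a, "dtype", type(a)) for a in args)
+                    return key, None
+
+                def create_source(self, precalc, grid, block, *args):
+                    tp = dtype_to_ctype(args[0].dtype)
+                    return """
+                    __global__ void twice(%s *x, unsigned n)
+                    {
+                        unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
+                        if (i < n) x[i] *= 2;
+                    }
+                    """ % (tp,)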
+ ''' + context = pycuda.driver.Context.get_current() + funccache = DeferredSourceModule._cache.get(context, None) + if funccache is None: + funccache = self._cache[context] = {} + (funckey, precalc) = self.create_key(grid, block, *funcargs) + modfunc = funccache.get(funckey, None) + if modfunc is None: + source = self.create_source(precalc, grid, block, *funcargs) + if isinstance(source, DeferredSource): + source = source.generate() + if funckey is None: + funckey = (funcname, source) + modfunc = funccache.get(funckey, None) + if modfunc is None: + module = self._delayed_compile(source) + func = module.get_function(funcname) + modfunc = funccache[funckey] = (module, func) + return modfunc + + def get_function(self, name): + return DeferredFunction(self, name) + + def get_global(self, name): + raise NotImplementedError("Deferred globals in element-wise kernels not supported yet") + + def get_texref(self, name): + return DeferredTexRef('get_texref', name) + + def get_surfref(self, name): + raise NotImplementedError("Deferred surfaces in element-wise kernels not supported yet") + diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 695817fa..c804bd7c 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -36,92 +36,528 @@ import numpy as np from pycuda.tools import dtype_to_ctype, VectorArg, ScalarArg from pytools import memoize_method +from pycuda.deferred import DeferredSourceModule, DeferredSource +import pycuda._driver as _drv + +class ElementwiseSourceModule(DeferredSourceModule): + ''' + This is a ``DeferredSourceModule`` which is backwards-compatible with the + original ``get_elwise_module`` and ``get_elwise_range_module`` (using + ``do_range=True``). However, this class delays the compilation of + kernels until call-time. If you send actual ``GPUArray`` arguments + (instead of their ``.gpudata`` members) when calling the methods + supported by the return value of ``get_function()``, then you get: + * support for array-specific flat indices (i.e. for input array ``z``, + you can index it as ``z[z_i]`` in addition to the old-style ``z[i]``) + * support for non-contiguous (and arbitrarily-strided) arrays, but + only if you use the array-specific indices above. + * if do_indices=True, the N-D index of the current data item in all + participating arrays is available in INDEX[dim] where dim goes from + 0 to NDIM - 1. + ``elwise_arg_inds`` specifies the kernel arguments (as indices into + ``arguments``) that will be treated element-wise and need array-specific + index calculations. If ``elwise_arg_inds`` is not specified, it is + constructed by finding all arguments that are VectorArg objects. If only + some of the VectorArgs will be traversed element-wise, specifying + ``elwise_arg_inds`` ensures that the kernel doesn't wastes time + calculating unneeded indices. When the kernel is eventually called, + all elementwise array arguments specified in ``elwise_arg_inds`` must be + the same shape, which is used to optimize index calculations. 
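+    A sketch of the new calling convention through the module-level helpers
+    (illustrative; assumes an active CUDA context, e.g. via
+    ``pycuda.autoinit``)::
+
+        import numpy as np
+        import pycuda.gpuarray as gpuarray
+        from pycuda.elementwise import get_elwise_kernel
+
+        func = get_elwise_kernel(
+            "float *x, float *z",
+            "z[z_i] = 2.0f * x[x_i]",   # array-specific flat indices
+            "twice_kernel")
+
+        x = gpuarray.to_gpu(np.random.rand(4, 6).astype(np.float32))
+        z = gpuarray.empty_like(x)
+        # GPUArrays (even non-contiguous views) may be passed directly:
+        func.prepared_async_call(x._grid, x._block, None, x, z, x.mem_size)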
+ ''' + def __init__(self, arguments, operation, + name="kernel", preamble="", loop_prep="", after_loop="", + do_range=False, + elwise_arg_inds=None, + do_indices=False, + order='K', + debug=False, + **compilekwargs): + super(ElementwiseSourceModule, self).__init__(**compilekwargs) + self._do_range = do_range + self._do_indices = do_indices + assert(order in 'CFK', "order must be 'F', 'C', or 'K'") + self._order = order + self._arg_decls = ", ".join(arg.declarator() for arg in arguments) + if elwise_arg_inds is None: + elwise_arg_inds = [i for i in range(len(arguments)) if isinstance(arguments[i], VectorArg)] + self._elwise_arg_inds = elwise_arg_inds + self._arg_names = tuple(argument.name for argument in arguments) + self._elwise_arg_names = tuple(self._arg_names[i] for i in self._elwise_arg_inds) + self._init_args = (self._arg_names, self._arg_decls, operation, + name, preamble, loop_prep, after_loop) + self._init_args_repr = repr(self._init_args) + self._debug = debug + + def _precalc_array_info(self, args): + # 'args' is the list of actual parameters being sent to the kernel + + contigmatch = True + arrayspecificinds = True + shape = None + order = None + elwise_arg_inds = self._elwise_arg_inds + for i in elwise_arg_inds: + # is a GPUArray/DeviceAllocation + arg = args[i] + if not arrayspecificinds: + continue + if not hasattr(arg, 'shape'): + # At least one array argument is probably sent as a + # GPUArray.gpudata rather than the GPUArray itself, + # so disable array-specific indices -- caller is on + # their own. + arrayspecificinds = False + continue + curshape = arg.shape + # make strides == 0 when shape == 1 + arg.strides = tuple((np.array(curshape) > 1) * arg.strides) + curorder = 'N' + if contigmatch: + if arg.flags.f_contiguous: + curorder = 'F' + elif arg.flags.c_contiguous: + curorder = 'C' + if curorder == 'N': + contigmatch = False + if shape is None: + shape = curshape + order = curorder + elif order != curorder: + contigmatch = False + if shape != curshape: + raise Exception("All input arrays to elementwise kernels must have the same shape, or you must specify the arrays to be traversed element-wise explicitly with elwise_arg_inds; found shapes %s and %s" % (shape, curshape)) + + arrayitemstrides = tuple((arg.itemsize, arg.strides) for arg in (args[i] for i in elwise_arg_inds)) + + return (contigmatch, arrayspecificinds, shape, arrayitemstrides) + + + def create_key(self, grid, block, *args): + #print "Calling _precalc_array_info(args=%s, elwise_arg_inds=%s)\n" % (args, self._elwise_arg_inds) + #precalc_init = self._precalc_array_info(args) + elwise_arg_inds = self._elwise_arg_inds + precalc_init = _drv._precalc_array_info(args, elwise_arg_inds) + (contigmatch, arrayspecificinds, shape, arrayitemstrides) = precalc_init + + if (contigmatch or not arrayspecificinds) and not self._do_indices: + key = self._init_args_repr + return key, (self._init_args_repr, shape, contigmatch, arrayspecificinds, None, None, None, None, None) + + # Arrays are not contiguous or different order or need calculated indices + + if grid[1] != 1 or block[1] != 1 or block[2] != 1: + raise Exception("Grid (%s) and block (%s) specifications should have all '1' except in the first element" % (grid, block)) + + ndim = len(shape) + arg_names = self._arg_names + + # Kernel traverses in Fortran-order by default, but if order == 'C', + # we can just reverse the shape and strides, since the kernel is + # elementwise. 
If order == 'K', use index of minimum stride in first + # elementwise array as a hint on how to + # order the traversal of dimensions. We could probably do something + # smarter, like tranposing/reshaping arrays if possible to maximize + # performance, but that is probably best done in a pre-processing step. + # Note that this could mess up custom indexing that assumes a + # particular traversal order, but in that case one should explicitly + # set order to 'C' or 'F'. + do_reverse = False + if self._order == 'C': + do_reverse = True + elif self._order == 'K': + if np.argmin(np.abs(args[self._elwise_arg_inds[0]].strides)) > ndim // 2: + # traverse dimensions in reverse order + do_reverse = True + + # need to include grid and block in key because kernel will change + # depending on number of threads + key = (self._init_args_repr, shape, contigmatch, arrayspecificinds, arrayitemstrides, self._arg_decls, do_reverse, grid, block) + + return key, key + + def create_source(self, precalc, grid, block, *args): + (_, shape, contigmatch, arrayspecificinds, arrayitemstrides, _, do_reverse, _, _) = precalc + + (arg_names, arg_decls, operation, + funcname, preamble, loop_prep, after_loop) = self._init_args + + elwise_arg_inds = self._elwise_arg_inds + elwise_arg_names = self._elwise_arg_names + + if contigmatch and not self._do_indices: + indtype = 'unsigned' + if self._do_range: + indtype = 'long' + + # All arrays are contiguous and same order (or we don't know and + # it's up to the caller to make sure it works) + if arrayspecificinds: + for arg_name in elwise_arg_names: + preamble = preamble + """ + #define %s_i i + """ % (arg_name,) + if self._do_range: + loop_body = """ + if (step < 0) + { + for (i = start + (_CTA_START + _TID)*step; + i > stop; i += total_threads*step) + { + %(operation)s; + } + } + else + { + for (i = start + (_CTA_START + _TID)*step; + i < stop; i += total_threads*step) + { + %(operation)s; + } + } + """ % { + "operation": operation, + } + else: + loop_body = """ + for (i = _CTA_START + _TID; i < n; i += total_threads) + { + %(operation)s; + } + """ % { + "operation": operation, + } + + return """ + #include + + %(preamble)s + + __global__ void %(name)s(%(arg_decls)s) + { + unsigned _TID = threadIdx.x; + unsigned total_threads = gridDim.x*blockDim.x; + unsigned _CTA_START = blockDim.x*blockIdx.x; + + %(indtype)s i; + + %(loop_prep)s; + + %(loop_body)s; + + %(after_loop)s; + } + """ % { + "arg_decls": arg_decls, + "name": funcname, + "preamble": preamble, + "loop_prep": loop_prep, + "after_loop": after_loop, + "loop_body": loop_body, + "indtype": indtype, + } + + # Arrays are not contiguous or different order or we need N-D indices + + # these are the arrays that need calculated indices + arrayindnames = [] + + # these are the arrays that match another array and can use the + # same indices + arrayrefnames = [] + + argnamestrides = ((arg_name, itemstride[0], itemstride[1]) for (arg_name, itemstride) in zip(elwise_arg_names, arrayitemstrides)) + + ndim = len(shape) + numthreads = block[0] * grid[0] + if do_reverse: + shape = shape[::-1] + shapearr = np.array(shape) + block_step = np.array(shapearr) + tmpnumthreads = numthreads + for dimnum in range(ndim): + newstep = tmpnumthreads % block_step[dimnum] + tmpnumthreads = tmpnumthreads // block_step[dimnum] + block_step[dimnum] = newstep + arrayarginfos = [] + for (arg_name, arg_itemsize, arg_strides) in argnamestrides: + strides = [0] * ndim + if do_reverse: + strides[0:len(arg_strides)] = arg_strides[::-1] + else: + 
strides[0:len(arg_strides)] = arg_strides + elemstrides = np.array(strides) // arg_itemsize + dimelemstrides = tuple(elemstrides * shapearr) + blockelemstrides = tuple(elemstrides * block_step) + elemstrides = tuple(elemstrides) + inforef = None + for i, arrayarginfo in enumerate(arrayarginfos): + if elemstrides == arrayarginfo[1]: + inforef = i + if inforef is None: + arrayindnames.append(arg_name) + else: + arrayrefnames.append((arg_name, arrayarginfos[inforef][0])) + arrayarginfos.append( + (arg_name, elemstrides, dimelemstrides, blockelemstrides, inforef) + ) + + preamble = DeferredSource(preamble) + operation = DeferredSource(operation) + loop_prep = DeferredSource(loop_prep) + after_loop = DeferredSource(after_loop) + defines = DeferredSource() + decls = DeferredSource() + loop_preop = DeferredSource() + loop_inds_calc = DeferredSource() + loop_inds_inc = DeferredSource() + loop_body = DeferredSource() + + if self._do_indices: + defines += """#define NDIM %d""" % (ndim,) + + for definename, vals in ( + ('SHAPE', shape), + ('BLOCK_STEP', block_step), + ): + for dimnum in range(ndim): + defines += """ + #define %s_%d %d + """ % (definename, dimnum, vals[dimnum]) + for name, elemstrides, dimelemstrides, blockelemstrides, inforef in arrayarginfos: + for definename, vals in ( + ('ELEMSTRIDE', elemstrides), + ('DIMELEMSTRIDE', dimelemstrides), + ('BLOCKELEMSTRIDE', blockelemstrides), + ): + for dimnum in range(ndim): + defines += """ + #define %s_%s_%d %d + """ % (definename, name, dimnum, vals[dimnum]) + + decls += """ + unsigned _GLOBAL_i = _CTA_START + _TID; + """ + for name in arrayindnames: + decls += """ + long %s_i = 0; + """ % (name,) + + for (name, refname) in arrayrefnames: + defines += """ + #define %s_i %s_i + """ % (name, refname) + + if self._do_indices: + decls += """ + long INDEX[%d]; + """ % (ndim,) + + for dimnum in range(ndim): + defines += """ + #define INDEX_%d (INDEX[%d]) + """ % (dimnum, dimnum) + else: + for dimnum in range(ndim): + decls += """ + long INDEX_%d; + """ % (dimnum,) + + loop_inds_calc += """ + unsigned int _TMP_GLOBAL_i = _GLOBAL_i; + """ + for dimnum in range(ndim): + loop_inds_calc += """ + INDEX_%d = _TMP_GLOBAL_i %% SHAPE_%d; + _TMP_GLOBAL_i = _TMP_GLOBAL_i / SHAPE_%d; + """ % (dimnum, dimnum, + dimnum) + + for name in arrayindnames: + loop_inds_calc += """ + %s_i += INDEX_%d * ELEMSTRIDE_%s_%d; + """ % (name, dimnum, name, dimnum) + + for name in arrayindnames: + loop_inds_inc += """ + %s_i += %s; + """ % (name, + " + ".join( + "BLOCKELEMSTRIDE_%s_%d" % (name, dimnum) + for dimnum in range(ndim))) + for dimnum in range(ndim): + loop_inds_inc += """ + INDEX_%d += BLOCK_STEP_%d; + """ % (dimnum, dimnum) + if dimnum < ndim - 1: + loop_inds_inc += """ + if (INDEX_%d >= SHAPE_%d) { + """ % (dimnum, dimnum) + loop_inds_inc.indent() + loop_inds_inc += """ + INDEX_%d -= SHAPE_%d; + INDEX_%d ++; + """ % (dimnum, dimnum, + dimnum + 1) + for name in arrayindnames: + loop_inds_inc += """ + %s_i += ELEMSTRIDE_%s_%d - DIMELEMSTRIDE_%s_%d; + """ % (name, name, dimnum + 1, name, dimnum) + loop_inds_inc.dedent() + loop_inds_inc += """ + } + """ + + if self._debug: + preamble += """ + #include + """ + loop_inds_calc += """ + if (_CTA_START == 0 && _TID == 0) { + """ + loop_inds_calc.indent() + loop_inds_calc += r""" + printf("=======================\n"); + printf("CALLING FUNC %s\n"); + printf("N=%%u\n", (unsigned int)n); + """ % (funcname,) + for name, elemstrides, dimelemstrides, blockelemstrides, inforef in arrayarginfos: + loop_inds_calc += r""" + printf("(%s) 
%s: ptr=0x%%lx maxoffset(elems)=%s\n", (unsigned long)%s); + """ % (funcname, name, np.sum((np.array(shape) - 1) * np.array(elemstrides)), name) + loop_inds_calc.dedent() + loop_inds_calc += """ + } + """ + indtest = DeferredSource() + for name, elemstrides, dimelemstrides, blockelemstrides, inforef in arrayarginfos: + if inforef is not None: + continue + indtest += r""" + if (%s_i > %s || %s_i < 0) + """ % (name, np.sum((np.array(shape) - 1) * np.array(elemstrides)), name) + indtest += r"""{""" + indtest.indent() + indtest += r""" + printf("_CTA_START=%%d _TID=%%d _GLOBAL_i=%%u %s_i=%%ld %s\n", _CTA_START, _TID, _GLOBAL_i, %s_i, %s); + """ % (name, ' '.join("INDEX_%d=%%ld" % (dimnum,) for dimnum in range(ndim)), name, ', '.join("INDEX_%d" % (dimnum,) for dimnum in range(ndim))) + indtest += """break;""" + indtest.dedent() + indtest += """}""" + loop_preop = indtest + loop_preop + after_loop += r""" + if (_CTA_START == 0 && _TID == 0) { + printf("DONE CALLING FUNC %s\n"); + printf("-----------------------\n"); + } + """ % (funcname,) + + if self._do_range: + loop_body.add(""" + if (step < 0) + { + for (/*void*/; _GLOBAL_i > stop; _GLOBAL_i += total_threads*step) + { + %(loop_preop)s; + + %(operation)s; + + %(loop_inds_inc)s; + } + } + else + { + for (/*void*/; _GLOBAL_i < stop; _GLOBAL_i += total_threads*step) + { + %(loop_preop)s; + + %(operation)s; + + %(loop_inds_inc)s; + } + } + """, format_dict={ + "loop_preop": loop_preop, + "operation": operation, + "loop_inds_inc": loop_inds_inc, + }) + else: + loop_body.add(""" + for (/*void*/; _GLOBAL_i < n; _GLOBAL_i += total_threads) + { + %(loop_preop)s; + %(operation)s; -def get_elwise_module(arguments, operation, - name="kernel", keep=False, options=None, - preamble="", loop_prep="", after_loop=""): - from pycuda.compiler import SourceModule - return SourceModule(""" - #include + %(loop_inds_inc)s; + } + """, format_dict={ + "loop_preop": loop_preop, + "operation": operation, + "loop_inds_inc": loop_inds_inc, + }) - %(preamble)s + source = DeferredSource() - __global__ void %(name)s(%(arguments)s) - { + source.add(""" + #include + #include - unsigned tid = threadIdx.x; - unsigned total_threads = gridDim.x*blockDim.x; - unsigned cta_start = blockDim.x*blockIdx.x; - unsigned i; + %(defines)s - %(loop_prep)s; + %(preamble)s - for (i = cta_start + tid; i < n; i += total_threads) - { - %(operation)s; - } + __global__ void %(name)s(%(arg_decls)s) + { - %(after_loop)s; - } - """ % { - "arguments": ", ".join(arg.declarator() for arg in arguments), - "operation": operation, - "name": name, - "preamble": preamble, - "loop_prep": loop_prep, - "after_loop": after_loop, - }, - options=options, keep=keep) + const unsigned int _TID = threadIdx.x; + const unsigned int total_threads = gridDim.x*blockDim.x; + const unsigned int _CTA_START = blockDim.x*blockIdx.x; + %(decls)s -def get_elwise_range_module(arguments, operation, - name="kernel", keep=False, options=None, - preamble="", loop_prep="", after_loop=""): - from pycuda.compiler import SourceModule - return SourceModule(""" - #include - - %(preamble)s - - __global__ void %(name)s(%(arguments)s) - { - unsigned tid = threadIdx.x; - unsigned total_threads = gridDim.x*blockDim.x; - unsigned cta_start = blockDim.x*blockIdx.x; - long i; - - %(loop_prep)s; - - if (step < 0) - { - for (i = start + (cta_start + tid)*step; - i > stop; i += total_threads*step) - { - %(operation)s; - } - } - else - { - for (i = start + (cta_start + tid)*step; - i < stop; i += total_threads*step) - { - %(operation)s; + %(loop_prep)s; 
+ + %(loop_inds_calc)s; + + %(loop_body)s; + + %(after_loop)s; } - } + """, format_dict={ + "arg_decls": arg_decls, + "name": funcname, + "preamble": preamble, + "loop_prep": loop_prep, + "after_loop": after_loop, + "defines": defines, + "decls": decls, + "loop_inds_calc": loop_inds_calc, + "loop_body": loop_body, + }) + + return source - %(after_loop)s; - } - """ % { - "arguments": ", ".join(arg.declarator() for arg in arguments), - "operation": operation, - "name": name, - "preamble": preamble, - "loop_prep": loop_prep, - "after_loop": after_loop, - }, - options=options, keep=keep) +def get_elwise_module(arguments, operation, + name="kernel", keep=False, options=None, + preamble="", loop_prep="", after_loop="", + **kwargs): + return ElementwiseSourceModule(arguments, operation, + name=name, preamble=preamble, + loop_prep=loop_prep, after_loop=after_loop, + keep=keep, options=options, + **kwargs) + +def get_elwise_range_module(arguments, operation, + name="kernel", keep=False, options=None, + preamble="", loop_prep="", after_loop="", + **kwargs): + return ElementwiseSourceModule(arguments, operation, + name=name, preamble=preamble, + loop_prep=loop_prep, after_loop=after_loop, + keep=keep, options=options, + do_range=True, + **kwargs) def get_elwise_kernel_and_types(arguments, operation, name="kernel", keep=False, options=None, use_range=False, **kwargs): @@ -204,12 +640,8 @@ def __call__(self, *args, **kwargs): for arg, arg_descr in zip(args, arguments): if isinstance(arg_descr, VectorArg): - if not arg.flags.forc: - raise RuntimeError("elementwise kernel cannot " - "deal with non-contiguous arrays") - vectors.append(arg) - invocation_args.append(arg.gpudata) + invocation_args.append(arg) else: invocation_args.append(arg) @@ -256,12 +688,13 @@ def get_take_kernel(dtype, idx_dtype, vec_count=1): "texture <%s, 1, cudaReadModeElementType> tex_src%d;" % (ctx["tex_tp"], i) for i in range(vec_count)) body = ( - ("%(idx_tp)s src_idx = idx[i];\n" % ctx) + ("%(idx_tp)s src_idx = idx[idx_i];\n" % ctx) + "\n".join( - "dest%d[i] = fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i) + "dest%d[dest%d_i] = fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i, i) for i in range(vec_count))) - mod = get_elwise_module(args, body, "take", preamble=preamble) + mod = get_elwise_module(args, body, "take", preamble=preamble, + elwise_arg_inds=(0, 1)) func = mod.get_function("take") tex_src = [mod.get_texref("tex_src%d" % i) for i in range(vec_count)] func.prepare("P"+(vec_count*"P")+np.dtype(np.uintp).char, texrefs=tex_src) @@ -301,11 +734,12 @@ def get_copy_insn(i): return ("dest%d[dest_idx] = " "fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i)) - body = (("%(idx_tp)s src_idx = gmem_src_idx[i];\n" - "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx) + body = (("%(idx_tp)s src_idx = gmem_src_idx[gmem_src_idx_i];\n" + "%(idx_tp)s dest_idx = gmem_dest_idx[gmem_dest_idx_i];\n" % ctx) + "\n".join(get_copy_insn(i) for i in range(vec_count))) - mod = get_elwise_module(args, body, "take_put", preamble=preamble) + mod = get_elwise_module(args, body, "take_put", + preamble=preamble, elwise_arg_inds=(0,1)) func = mod.get_function("take_put") tex_src = [mod.get_texref("tex_src%d" % i) for i in range(vec_count)] @@ -335,11 +769,15 @@ def get_put_kernel(dtype, idx_dtype, vec_count=1): ] + [ScalarArg(np.intp, "n")] body = ( - "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx - + "\n".join("dest%d[dest_idx] = src%d[i];" % (i, i) + "%(idx_tp)s dest_idx = gmem_dest_idx[gmem_dest_idx_i];\n" % ctx + + "\n".join("dest%d[dest_idx] = src%d[src%d_i];" 
% (i, i, i) for i in range(vec_count))) - func = get_elwise_module(args, body, "put").get_function("put") + func = get_elwise_module(args, body, "put", + elwise_arg_inds=( + [0] + range(1+vec_count, 1+(2*vec_count)) + ) + ).get_function("put") func.prepare("P"+(2*vec_count*"P")+np.dtype(np.uintp).char) return func @@ -351,7 +789,7 @@ def get_copy_kernel(dtype_dest, dtype_src): "tp_dest": dtype_to_ctype(dtype_dest), "tp_src": dtype_to_ctype(dtype_src), }, - "dest[i] = src[i]", + "dest[dest_i] = src[src_i]", "copy") @@ -383,13 +821,13 @@ def get_linear_combination_kernel(summand_descriptors, args.append(ScalarArg(scalar_dtype, "a%d" % i)) args.append(VectorArg(vector_dtype, "x%d" % i)) - summands.append("a%d*x%d[i]" % (i, i)) + summands.append("a%d*x%d[x%d_i]" % (i, i, i)) args.append(VectorArg(dtype_z, "z")) args.append(ScalarArg(np.uintp, "n")) mod = get_elwise_module(args, - "z[i] = " + " + ".join(summands), + "z[z_i] = " + " + ".join(summands), "linear_combination", preamble="\n".join(preamble), loop_prep=";\n".join(loop_prep)) @@ -410,7 +848,7 @@ def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z): "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = a*x[i] + b*y[i]", + "z[z_i] = a*x[x_i] + b*y[y_i]", "axpbyz") @@ -421,7 +859,7 @@ def get_axpbz_kernel(dtype_x, dtype_z): "tp_x": dtype_to_ctype(dtype_x), "tp_z": dtype_to_ctype(dtype_z) }, - "z[i] = a * x[i] + b", + "z[z_i] = a * x[x_i] + b", "axpb") @@ -433,8 +871,8 @@ def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator): "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = x[i] %s y[i]" % operator, - "multiply") + "z[z_i] = x[x_i] %s y[y_i]" % operator, + "binary_op") @context_dependent_memoize @@ -444,7 +882,7 @@ def get_rdivide_elwise_kernel(dtype_x, dtype_z): "tp_x": dtype_to_ctype(dtype_x), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = y / x[i]", + "z[z_i] = y / x[x_i]", "divide_r") @@ -456,7 +894,7 @@ def get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z): "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = %s(x[i], y[i])" % func, + "z[z_i] = %s(x[x_i], y[y_i])" % func, func+"_kernel") @@ -468,7 +906,7 @@ def get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z): "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = %s(x[i], y)" % func, + "z[z_i] = %s(x[x_i], y)" % func, func+"_kernel") @@ -492,7 +930,7 @@ def get_fill_kernel(dtype): "%(tp)s a, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, - "z[i] = a", + "z[z_i] = a", "fill") @@ -502,7 +940,7 @@ def get_reverse_kernel(dtype): "%(tp)s *y, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, - "z[i] = y[n-1-i]", + "z[z_i] = y[n-1-y_i]", "reverse") @@ -513,7 +951,7 @@ def get_real_kernel(dtype, real_dtype): "tp": dtype_to_ctype(dtype), "real_tp": dtype_to_ctype(real_dtype), }, - "z[i] = real(y[i])", + "z[z_i] = real(y[y_i])", "real") @@ -524,7 +962,7 @@ def get_imag_kernel(dtype, real_dtype): "tp": dtype_to_ctype(dtype), "real_tp": dtype_to_ctype(real_dtype), }, - "z[i] = imag(y[i])", + "z[z_i] = imag(y[y_i])", "imag") @@ -534,7 +972,7 @@ def get_conj_kernel(dtype): "%(tp)s *y, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, - "z[i] = pycuda::conj(y[i])", + "z[z_i] = pycuda::conj(y[y_i])", "conj") @@ -544,7 +982,9 @@ def get_arange_kernel(dtype): "%(tp)s *z, %(tp)s start, %(tp)s step" % { "tp": dtype_to_ctype(dtype), }, - "z[i] = start + i*step", + "z[z_i] = start + ((%(tp)s)(z_i))*step" % { + "tp": dtype_to_ctype(dtype), + }, "arange") @@ -559,7 +999,7 @@ def 
get_pow_kernel(dtype): "%(tp)s value, %(tp)s *y, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, - "z[i] = %s(y[i], value)" % func, + "z[z_i] = %s(y[y_i], value)" % func, "pow_method") @@ -576,7 +1016,7 @@ def get_pow_array_kernel(dtype_x, dtype_y, dtype_z): "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = %s(x[i], y[i])" % func, + "z[z_i] = %s(x[x_i], y[y_i])" % func, "pow_method") @@ -584,7 +1024,7 @@ def get_pow_array_kernel(dtype_x, dtype_y, dtype_z): def get_fmod_kernel(): return get_elwise_kernel( "float *arg, float *mod, float *z", - "z[i] = fmod(arg[i], mod[i])", + "z[z_i] = fmod(arg[arg_i], mod[mod_i])", "fmod_kernel") @@ -592,7 +1032,7 @@ def get_fmod_kernel(): def get_modf_kernel(): return get_elwise_kernel( "float *x, float *intpart ,float *fracpart", - "fracpart[i] = modf(x[i], &intpart[i])", + "fracpart[fracpart_i] = modf(x[x_i], &intpart[intpart_i])", "modf_kernel") @@ -602,8 +1042,8 @@ def get_frexp_kernel(): "float *x, float *significand, float *exponent", """ int expt = 0; - significand[i] = frexp(x[i], &expt); - exponent[i] = expt; + significand[significand_i] = frexp(x[x_i], &expt); + exponent[exponent_i] = expt; """, "frexp_kernel") @@ -612,7 +1052,7 @@ def get_frexp_kernel(): def get_ldexp_kernel(): return get_elwise_kernel( "float *sig, float *expt, float *z", - "z[i] = ldexp(sig[i], int(expt[i]))", + "z[z_i] = ldexp(sig[sig_i], int(expt[expt_i]))", "ldexp_kernel") @@ -626,7 +1066,7 @@ def get_unary_func_kernel(func_name, in_dtype, out_dtype=None): "tp_in": dtype_to_ctype(in_dtype), "tp_out": dtype_to_ctype(out_dtype), }, - "z[i] = %s(y[i])" % func_name, + "z[z_i] = %s(y[y_i])" % func_name, "%s_kernel" % func_name) @@ -638,7 +1078,7 @@ def get_if_positive_kernel(crit_dtype, dtype): VectorArg(dtype, "else_"), VectorArg(dtype, "result"), ], - "result[i] = crit[i] > 0 ? then_[i] : else_[i]", + "result[result_i] = crit[crit_i] > 0 ? 
then_[then__i] : else_[else__i]", "if_positive") @@ -650,5 +1090,5 @@ def get_scalar_op_kernel(dtype_x, dtype_y, operator): "tp_y": dtype_to_ctype(dtype_y), "tp_a": dtype_to_ctype(dtype_x), }, - "y[i] = x[i] %s a" % operator, + "y[y_i] = x[x_i] %s a" % operator, "scalarop_kernel") diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 10c687d0..2a9e4ba9 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -118,23 +118,15 @@ def splay(n, dev=None): def _make_binary_op(operator): def func(self, other): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - if isinstance(other, GPUArray): assert self.shape == other.shape - if not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - result = self._new_like_me() func = elementwise.get_binary_op_kernel( self.dtype, other.dtype, result.dtype, operator) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other.gpudata, result.gpudata, + self, other, result, self.mem_size) return result @@ -143,7 +135,7 @@ def func(self, other): func = elementwise.get_scalar_op_kernel( self.dtype, result.dtype, operator) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other, result.gpudata, + self, other, result, self.mem_size) return result @@ -246,13 +238,14 @@ def set_async(self, ary, stream=None): return self.set(ary, async=True, stream=stream) def get(self, ary=None, pagelocked=False, async=False, stream=None): + flipper = None if ary is None: if pagelocked: ary = drv.pagelocked_empty(self.shape, self.dtype) else: ary = np.empty(self.shape, self.dtype) - strides = _compact_strides(self) + self, strides, flipper = _compact_positive_strides(self) ary = _as_strided(ary, strides=strides) else: if self.size != ary.size: @@ -269,13 +262,15 @@ def get(self, ary=None, pagelocked=False, async=False, stream=None): if self.size: _memcpy_discontig(ary, self, async=async, stream=stream) + if flipper is not None: + ary = ary[flipper] return ary def get_async(self, stream=None, ary=None): return self.get(ary=ary, async=True, stream=stream) - def copy(self): - new = GPUArray(self.shape, self.dtype, self.allocator) + def copy(self, order="K"): + new = self._new_like_me(order=order) _memcpy_discontig(new, self) return new @@ -297,47 +292,36 @@ def _axpbyz(self, selffac, other, otherfac, out, add_timer=None, stream=None): """Compute ``out = selffac * self + otherfac*other``, where `other` is a vector..""" assert self.shape == other.shape - if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") func = elementwise.get_axpbyz_kernel(self.dtype, other.dtype, out.dtype) if add_timer is not None: add_timer(3*self.size, func.prepared_timed_call(self._grid, - selffac, self.gpudata, otherfac, other.gpudata, - out.gpudata, self.mem_size)) + selffac, self, otherfac, other, + out, self.mem_size)) else: func.prepared_async_call(self._grid, self._block, stream, - selffac, self.gpudata, otherfac, other.gpudata, - out.gpudata, self.mem_size) + selffac, self, otherfac, other, + out, self.mem_size) return out def _axpbz(self, selffac, other, out, stream=None): """Compute ``out = selffac * self + other``, where `other` is a scalar.""" - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - func = elementwise.get_axpbz_kernel(self.dtype, out.dtype) 
func.prepared_async_call(self._grid, self._block, stream, - selffac, self.gpudata, - other, out.gpudata, self.mem_size) + selffac, self, + other, out, self.mem_size) return out def _elwise_multiply(self, other, out, stream=None): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, out.dtype, "*") func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, other.gpudata, - out.gpudata, self.mem_size) + self, other, + out, self.mem_size) return out @@ -347,43 +331,72 @@ def _rdiv_scalar(self, other, out, stream=None): y = n / self """ - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - func = elementwise.get_rdivide_elwise_kernel(self.dtype, out.dtype) func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, other, - out.gpudata, self.mem_size) + self, other, + out, self.mem_size) return out def _div(self, other, out, stream=None): """Divides an array by another array.""" - if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - assert self.shape == other.shape func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, out.dtype, "/") func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, other.gpudata, - out.gpudata, self.mem_size) + self, other, + out, self.mem_size) return out - def _new_like_me(self, dtype=None, order="C"): - strides = None + def _new_like_me(self, dtype=None, order="K"): + flipper, selflist = _flip_negative_strides((self,)) + self = selflist[0] + ndim = self.ndim + shape = self.shape + mystrides = self.strides + mydtype = self.dtype + myitemsize = mydtype.itemsize if dtype is None: - dtype = self.dtype - if dtype == self.dtype: - strides = self.strides - - return self.__class__(self.shape, dtype, - allocator=self.allocator, strides=strides, order=order) + dtype = mydtype + else: + dtype = np.dtype(dtype) + itemsize = dtype.itemsize + if order == "K": + if self.flags.c_contiguous: + order = "C" + elif self.flags.f_contiguous: + order = "F" + if order == "C": + newstrides = _c_contiguous_strides(itemsize, shape) + elif order == "F": + newstrides = _f_contiguous_strides(itemsize, shape) + else: + maxstride = mystrides[0] + maxstrideind = 0 + for i in range(1, ndim): + curstride = mystrides[i] + if curstride > maxstride: + maxstrideind = i + maxstride = curstride + mymaxoffset = (maxstride / myitemsize) * shape[maxstrideind] + if mymaxoffset <= self.size: + # it's probably safe to just allocate and pass strides + # XXX (do we need to calculate full offset for [-1,-1,-1...]?) 
+ newstrides = tuple((x // myitemsize) * itemsize for x in mystrides) + else: + # just punt and choose something close + if ndim > 1 and maxstrideind == 0: + newstrides = _c_contiguous_strides(itemsize, shape) + else: + newstrides = _f_contiguous_strides(itemsize, shape) + retval = self.__class__(shape, dtype, + allocator=self.allocator, strides=newstrides) + if flipper is not None: + retval = retval[flipper] + return retval # operators --------------------------------------------------------------- def mul_add(self, selffac, other, otherfac, add_timer=None, stream=None): @@ -514,7 +527,7 @@ def fill(self, value, stream=None): """fills the array with the specified value""" func = elementwise.get_fill_kernel(self.dtype) func.prepared_async_call(self._grid, self._block, stream, - value, self.gpudata, self.mem_size) + value, self, self.mem_size) return self @@ -600,7 +613,7 @@ def __abs__(self): out_dtype=out_dtype) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, self.mem_size) + self, result, self.mem_size) return result @@ -611,10 +624,6 @@ def _pow(self, other, new): """ if isinstance(other, GPUArray): - if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - assert self.shape == other.shape if new: @@ -626,22 +635,18 @@ def _pow(self, other, new): self.dtype, other.dtype, result.dtype) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other.gpudata, result.gpudata, + self, other, result, self.mem_size) return result else: - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - if new: result = self._new_like_me() else: result = self func = elementwise.get_pow_kernel(self.dtype) func.prepared_async_call(self._grid, self._block, None, - other, self.gpudata, result.gpudata, + other, self, result, self.mem_size) return result @@ -673,24 +678,16 @@ def reverse(self, stream=None): as one-dimensional. 
""" - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - result = self._new_like_me() func = elementwise.get_reverse_kernel(self.dtype) func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, result.gpudata, + self, result, self.mem_size) return result def astype(self, dtype, stream=None): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - if dtype == self.dtype: return self.copy() @@ -698,7 +695,7 @@ def astype(self, dtype, stream=None): func = elementwise.get_copy_kernel(dtype, self.dtype) func.prepared_async_call(self._grid, self._block, stream, - result.gpudata, self.gpudata, + result, self, self.mem_size) return result @@ -710,9 +707,6 @@ def reshape(self, *shape, **kwargs): order = kwargs.pop("order", "C") # TODO: add more error-checking, perhaps - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") if isinstance(shape[0], tuple) or isinstance(shape[0], list): shape = tuple(shape[0]) @@ -918,7 +912,11 @@ def __getitem__(self, index): strides=tuple(new_strides)) def __setitem__(self, index, value): - _memcpy_discontig(self[index], value) + if isinstance(value, GPUArray) or isinstance(value, np.ndarray): + return _memcpy_discontig(self[index], value) + + # Let's assume it's a scalar + self[index].fill(value) # }}} @@ -930,15 +928,11 @@ def real(self): if issubclass(dtype.type, np.complexfloating): from pytools import match_precision real_dtype = match_precision(np.dtype(np.float64), dtype) - if self.flags.f_contiguous: - order = "F" - else: - order = "C" - result = self._new_like_me(dtype=real_dtype, order=order) + result = self._new_like_me(dtype=real_dtype) func = elementwise.get_real_kernel(dtype, real_dtype) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, + self, result, self.mem_size) return result @@ -949,21 +943,13 @@ def real(self): def imag(self): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - from pytools import match_precision real_dtype = match_precision(np.dtype(np.float64), dtype) - if self.flags.f_contiguous: - order = "F" - else: - order = "C" - result = self._new_like_me(dtype=real_dtype, order=order) + result = self._new_like_me(dtype=real_dtype) func = elementwise.get_imag_kernel(dtype, real_dtype) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, + self, result, self.mem_size) return result @@ -973,19 +959,11 @@ def imag(self): def conj(self): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - - if self.flags.f_contiguous: - order = "F" - else: - order = "C" - result = self._new_like_me(order=order) + result = self._new_like_me() func = elementwise.get_conj_kernel(dtype) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, + self, result, self.mem_size) return result @@ -1012,15 +990,21 @@ def conj(self): def to_gpu(ary, allocator=drv.mem_alloc): """converts a numpy array to a GPUArray""" - result = GPUArray(ary.shape, ary.dtype, allocator, strides=_compact_strides(ary)) + ary, newstrides, flipper = _compact_positive_strides(ary) + result = 
GPUArray(ary.shape, ary.dtype, allocator, strides=newstrides) result.set(ary) + if flipper is not None: + result = result[flipper] return result def to_gpu_async(ary, allocator=drv.mem_alloc, stream=None): """converts a numpy array to a GPUArray""" - result = GPUArray(ary.shape, ary.dtype, allocator, strides=_compact_strides(ary)) + ary, newstrides, flipper = _compact_positive_strides(ary) + result = GPUArray(ary.shape, ary.dtype, allocator, strides=newstrides) result.set_async(ary, stream) + if flipper: + result = result[flipper] return result @@ -1050,10 +1034,10 @@ def _array_like_helper(other_ary, dtype, order): order = "F" else: # array_like routines only return positive strides - strides = [np.abs(s) for s in other_ary.strides] + _, strides, _ = _compact_positive_strides(other_ary) if dtype is not None and dtype != other_ary.dtype: # scale strides by itemsize when dtype is not the same - itemsize = other_ary.nbytes // other_ary.size + itemsize = other_ary.dtype.itemsize itemsize_ratio = np.dtype(dtype).itemsize / itemsize strides = [int(s*itemsize_ratio) for s in strides] elif order not in ["C", "F"]: @@ -1173,15 +1157,54 @@ class Info(Record): func = elementwise.get_arange_kernel(dtype) func.prepared_async_call(result._grid, result._block, kwargs.get("stream"), - result.gpudata, start, step, size) + result, start, step, size) return result # }}} -def _compact_strides(a): - # Compute strides to have same order as self, but packed +def _flip_negative_strides(arrays): + # If arrays have negative strides, flip them. Return a list + # ``(flipper, arrays)`` where ``flipper`` is a tuple of slice + # objects that can used to unflip the returned (flipped) arrays + # (or ``None`` if there was no flipping). + # NOTE: Every input array A must have the same value for the following + # expression: np.sign(A.strides) + # NOTE: ``flipper`` is its own inverse, so ``A[flipper][flipper] == A`` + if isinstance(arrays, GPUArray): + raise TypeError("_flip_negative_strides expects a list of GPUArrays") + if not any(any(s < 0 for s in x.strides) for x in arrays): + return None, arrays + strides = np.vstack(x.strides for x in arrays) + shapes = np.vstack(x.shape for x in arrays) + signs = np.sign(strides) + max_signs = np.max(signs, axis=0) + min_signs = np.min(signs, axis=0) + # Stride signs are -1 or 1, or 0 if stride is 0. + # Zero strides don't matter, but fail if there are both -1 and 1. + if _builtin_min(min_signs) != -1: + # no flips needed + return None, arrays + flips = (min_signs == -1)[None, :] + if any(max_signs * min_signs == -1): + raise ValueError("found differing signs in strides (strides for all arrays: %s)" % ([x.strides for x in arrays],)) + offsets = np.sum(strides * shapes * flips, axis=1) + strides = strides * (flips * -1) + sliceboth = (slice(None), slice(None, None, -1)) + flipper = tuple(sliceboth[flip] for flip in flips[0]) + arrays = [ + arr[flipper] + for arr in arrays + ] + return flipper, arrays + +def _compact_positive_strides(a): + # Flip ``a``'s axes if there are any negative strides, then compute + # strides to have same order as a, but packed. Return flipped ``a`` + # and packed strides. 
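``_flip_negative_strides`` relies on two properties spelled out in its comments: the flipper has one ``slice(None, None, -1)`` per negatively-strided axis, and it is its own inverse. Both can be checked with plain NumPy; the sketch below is illustrative only and uses none of the patch's code:

import numpy as np

a = np.arange(24).reshape(4, 6)[::-1, ::2]    # stride signs: (-, +)
flips = [s < 0 for s in a.strides]
flipper = tuple(slice(None, None, -1) if f else slice(None)
                for f in flips)

flipped = a[flipper]                        # all strides now positive
assert all(s > 0 for s in flipped.strides)
assert np.array_equal(flipped[flipper], a)  # flipper is its own inverse

``_compact_positive_strides`` then packs the (now positive) strides of the flipped array and returns the flipper so callers such as ``to_gpu`` can restore the original orientation afterwards.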
+ flipper, alist = _flip_negative_strides((a,)) + a = alist[0] info = sorted( (a.strides[axis], a.shape[axis], axis) for axis in range(len(a.shape))) @@ -1191,13 +1214,33 @@ def _compact_strides(a): for _, dim, axis in info: strides[axis] = stride stride *= dim - return strides - + return a, strides, flipper + +def _memcpy_discontig_slow(dst, src): + # This is a generic memcpy using a no-op "assignment" kernel. + # NOTE: no need to use this if both src and dst are on host. + func = elementwise.get_unary_func_kernel("", src.dtype) + + # if one array is on host, need to make an intermediate device array + src_gpu = src + dst_gpu = dst + if isinstance(src, np.ndarray): + dtype, order, strides = _array_like_helper(src, None, "K") + src_gpu = GPUArray(src.shape, dtype, order=order, strides=strides) + src_gpu.set(src) + if isinstance(dst, np.ndarray): + dtype, order, strides = _array_like_helper(dst, None, "K") + dst_gpu = GPUArray(dst.shape, dtype, order=order, strides=strides) + func.prepared_async_call(src_gpu._grid, src_gpu._block, None, + src_gpu, dst_gpu, src_gpu.mem_size) + if dst is not dst_gpu: + dst_gpu.get(dst) + return def _memcpy_discontig(dst, src, async=False, stream=None): """Copy the contents of src into dst. - The two arrays should have the same dtype, shape, and order, but + The two arrays should have the same dtype, and shape, but not necessarily the same strides. There may be up to _two_ axes along which either `src` or `dst` is not contiguous. """ @@ -1216,7 +1259,26 @@ def _memcpy_discontig(dst, src, async=False, stream=None): dst[...] = src return - if src.flags.forc and dst.flags.forc: + # check for stride tricks (like stride == 0 with shape > 1) + shape = src.shape + doslow = (any(sh > 1 for sh, st in zip(shape, src.strides) if st == 0) + or + any(sh > 1 for sh, st in zip(shape, dst.strides) if st == 0)) + + try: + src, dst = _flip_negative_strides((src, dst))[1] + except ValueError: + # probably different signs in strides -- will use slow version below + doslow = True + pass + + if doslow or any(np.argsort(src.strides) != np.argsort(dst.strides)): + # order is not the same, use generic slow version + _memcpy_discontig_slow(dst, src) + return + + if ((src.flags.f_contiguous and dst.flags.f_contiguous) or + (src.flags.c_contiguous and dst.flags.c_contiguous)): shape = [src.size] src_strides = dst_strides = [src.dtype.itemsize] else: @@ -1238,11 +1300,8 @@ def _memcpy_discontig(dst, src, async=False, stream=None): axes[0:0] = [np.newaxis] # collapse contiguous dimensions - # and check that dst is in same order as src i = 1 while i < len(shape): - if dst_strides[i] < dst_strides[i-1]: - raise ValueError("src and dst must have same order") if (src_strides[i-1] * shape[i-1] == src_strides[i] and dst_strides[i-1] * shape[i-1] == dst_strides[i]): shape[i-1:i+1] = [shape[i-1] * shape[i]] @@ -1282,7 +1341,9 @@ def _memcpy_discontig(dst, src, async=False, stream=None): elif len(shape) == 3: copy = drv.Memcpy3D() else: - raise ValueError("more than 2 discontiguous axes not supported %s" % (tuple(sorted(axes)),)) + # can't use MemcpyXD, use generic slow version + _memcpy_discontig_slow(dst, src) + return if isinstance(src, GPUArray): copy.set_src_device(src.gpudata) @@ -1344,7 +1405,7 @@ def take(a, indices, out=None, stream=None): a.bind_to_texref_ext(tex_src[0], allow_double_hack=True, allow_complex_hack=True) func.prepared_async_call(out._grid, out._block, stream, - indices.gpudata, out.gpudata, indices.size) + indices, out, indices.size) return out @@ -1386,8 +1447,8 @@ def 
make_func_for_chunk_size(chunk_size): a.bind_to_texref_ext(tex_src[i], allow_double_hack=True) func.prepared_async_call(indices._grid, indices._block, stream, - indices.gpudata, - *([o.gpudata for o in out[chunk_slice]] + indices, + *([o for o in out[chunk_slice]] + [indices.size])) return out @@ -1451,8 +1512,8 @@ def make_func_for_chunk_size(chunk_size): a.bind_to_texref_ext(src_tr, allow_double_hack=True) func.prepared_async_call(src_indices._grid, src_indices._block, stream, - dest_indices.gpudata, src_indices.gpudata, - *([o.gpudata for o in out[chunk_slice]] + dest_indices, src_indices, + *([o for o in out[chunk_slice]] + src_offsets_list[chunk_slice] + [src_indices.size])) @@ -1496,9 +1557,9 @@ def make_func_for_chunk_size(chunk_size): func = make_func_for_chunk_size(vec_count-start_i) func.prepared_async_call(dest_indices._grid, dest_indices._block, stream, - dest_indices.gpudata, - *([o.gpudata for o in out[chunk_slice]] - + [i.gpudata for i in arrays[chunk_slice]] + dest_indices, + *([o for o in out[chunk_slice]] + + [i for i in arrays[chunk_slice]] + [dest_indices.size])) return out @@ -1550,7 +1611,7 @@ def if_positive(criterion, then_, else_, out=None, stream=None): out = empty_like(then_) func.prepared_async_call(criterion._grid, criterion._block, stream, - criterion.gpudata, then_.gpudata, else_.gpudata, out.gpudata, + criterion, then_, else_, out, criterion.size) return out @@ -1565,14 +1626,14 @@ def f(a, b, out=None, stream=None): a.dtype, b.dtype, out.dtype, use_scalar=False) func.prepared_async_call(a._grid, a._block, stream, - a.gpudata, b.gpudata, out.gpudata, a.size) + a, b, out, a.size) elif isinstance(a, GPUArray): if out is None: out = empty_like(a) func = elementwise.get_binary_minmax_kernel(which, a.dtype, a.dtype, out.dtype, use_scalar=True) func.prepared_async_call(a._grid, a._block, stream, - a.gpudata, b, out.gpudata, a.size) + a, b, out, a.size) else: # assuming b is a GPUArray if out is None: out = empty_like(b) @@ -1580,7 +1641,7 @@ def f(a, b, out=None, stream=None): b.dtype, b.dtype, out.dtype, use_scalar=True) # NOTE: we switch the order of a and b here! func.prepared_async_call(b._grid, b._block, stream, - b.gpudata, a, out.gpudata, b.size) + b, a, out, b.size) return out return f diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 8a2488de..fa08f5a1 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -17,6 +17,123 @@ +namespace precalc { + namespace py = boost::python; + py::tuple _precalc_array_info(const py::object & args, const py::object & elwisearginds) { + bool contigmatch = true; + bool arrayspecificinds = true; + std::vector shape; + py::object shape_obj; + char order = 'N'; + int numarrays = py::len(elwisearginds); + PyObject * arrayitemstrides = PyTuple_New(numarrays); + + for (int aind = 0; aind < numarrays; aind++) { + int elwiseargind = py::extract(elwisearginds[aind]); + // is a GPUArray/DeviceAllocation + //py::object arg = py::object(args[i]); + py::object arg(args[elwiseargind]); + if (!arrayspecificinds) { + continue; + } + py::object curshape_obj; + py::object curstrides_obj; + py::object itemsize_obj; + int itemsize; + int ndim = 0; + try { + curshape_obj = arg.attr("shape"); + curstrides_obj = arg.attr("strides"); + itemsize_obj = arg.attr("itemsize"); + itemsize = py::extract(itemsize_obj); + ndim = py::extract(arg.attr("ndim")); + } catch (...) 
{ + // At least one array argument is probably sent as a + // GPUArray.gpudata rather than the GPUArray itself, + // so disable array-specific indices -- caller is on + // their own. + arrayspecificinds = false; + continue; + } + if (ndim == 0) { + // probably a scalar + continue; + } + std::vector curshape(ndim); + std::vector curstrides(ndim); + for (int i = 0; i < ndim; i++) { + int tmp = py::extract(curshape_obj[i]); + curshape[i] = tmp; + if (tmp > 1) { + curstrides[i] = py::extract(curstrides_obj[i]); + } else { + curstrides[i] = 0; + } + } + PyObject * newstrides = PyTuple_New(ndim); + for (int i = 0; i < ndim; i++) { + // PyTuple_SetItem steals the reference of the new PyInt + PyTuple_SetItem(newstrides, i, PyInt_FromLong(curstrides[i])); + } + PyObject * sizestrides = PyTuple_New(2); + PyTuple_SetItem(sizestrides, 0, py::incref(itemsize_obj.ptr())); + // PyTuple_SetItem steals the reference of newstrides + PyTuple_SetItem(sizestrides, 1, newstrides); + PyTuple_SetItem(arrayitemstrides, aind, sizestrides); + if (contigmatch) { + char curorder = 'N'; + int tmpaccum = itemsize; + bool is_f = (curstrides[0] == tmpaccum); + tmpaccum *= curshape[0]; + for (int i = 1; i < ndim; i++) { + if (curshape[i] == 1) { + continue; + } + if (curstrides[i] != tmpaccum) { + is_f = false; + break; + } + tmpaccum *= curshape[i]; + } + tmpaccum = itemsize; + bool is_c = (curstrides[ndim - 1] == tmpaccum); + tmpaccum *= curshape[ndim - 1]; + for (int i = ndim - 2; i >= 0; i--) { + if (curshape[i] == 1) { + continue; + } + if (curstrides[i] != tmpaccum) { + is_c = false; + break; + } + tmpaccum *= curshape[i]; + } + if (is_f) { + curorder = 'F'; + } + if (is_c) { + curorder = 'C'; + } + if (curorder == 'N') { + contigmatch = false; + } + if (shape.size() == 0) { + shape = curshape; + shape_obj = curshape_obj; + order = curorder; + } else if (order != curorder) { + contigmatch = false; + } + } + if (shape != curshape) { + PyErr_SetString(PyExc_RuntimeError, "All input arrays to elementwise kernels must have the same shape, or you must specify the arrays to be traversed element-wise explicitly with elwise_arg_inds"); + py::throw_error_already_set(); + } + } + + return py::make_tuple(contigmatch, arrayspecificinds, shape_obj, py::object(py::handle<>(arrayitemstrides))); + } +} using namespace pycuda; using boost::shared_ptr; @@ -1657,6 +1774,11 @@ BOOST_PYTHON_MODULE(_driver) #ifdef HAVE_CURAND pycuda_expose_curand(); #endif + + { + using namespace precalc; + DEF_SIMPLE_FUNCTION_WITH_ARGS(_precalc_array_info, ("args", "elwisearginds")); + } } // vim: foldmethod=marker diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index e38e2fda..f00a4e66 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -1082,6 +1082,12 @@ def test_copy(self): for start, stop, step in [(0,3,1), (1,2,1), (0,3,3)]: assert np.allclose(a_gpu[start:stop:step,:,start:stop:step].get(), a_gpu.get()[start:stop:step,:,start:stop:step]) + # try 3 axes discontiguous + a_gpu = curand((10,10,10,10)) + a = a_gpu.get() + for start, stop, step in [(0,3,1), (1,2,1), (0,3,3)]: + assert np.allclose(a_gpu[start:stop:step,start:stop:step,start:stop:step].get(), a_gpu.get()[start:stop:step,start:stop:step,start:stop:step]) + @mark_cuda_test def test_get_set(self): import pycuda.gpuarray as gpuarray @@ -1145,6 +1151,121 @@ def test_zeros_like_etc(self): assert new_z.dtype == np.complex64 assert new_z.shape == arr.shape + @mark_cuda_test + def test_noncontig_get(self): + from pycuda.curandom import rand as curand + a_gpu = 
curand((200,200)) + a = a_gpu.get() + + ret_a = a[2::3, 1::4] + + ret_a_gpu = (a_gpu[2::3, 1::4]).get() + + assert np.allclose(ret_a, ret_a_gpu) + + @mark_cuda_test + def test_noncontig_get2(self): + from pycuda.curandom import rand as curand + a_gpu = curand((200,200)) + a = a_gpu.get() + + ret_a = a[2::3,::-1] + + ret_a_gpu = (a_gpu[2::3,::-1]).get() + + assert np.allclose(ret_a, ret_a_gpu) + + @mark_cuda_test + def test_noncontig_set(self): + import pycuda.gpuarray as gpuarray + + # test different orders, transpositions, offsets, and sizes of both + # src (host) and dst (device) + for host_order in ('C', 'F'): + for dev_order in ('C', 'F'): + for transpose in ((0,1,2), (0,2,1), (1,2,0), (1,0,2), (2,0,1), (2,1,0)): + for start, stop, step in [(0, 50, 2), (1, 50, 2), (49, None, -2), (48, None, -2)]: + print("host_order=%s dev_order=%s transpose=%s start=%s stop=%s step=%s" % (host_order, dev_order, transpose, start, stop, step)) + a = np.array(np.random.normal(0., 1., (25,25,25)), order=host_order) + a_gpu = gpuarray.zeros((50, 50, 50), np.float64, order=dev_order) + a_gpu_sub = a_gpu[start:stop:step,start:stop:step,start:stop:step] + a_gpu_sub.set(a) + assert np.allclose(a_gpu_sub.get(), a) + assert np.allclose(a_gpu.get()[start:stop:step,start:stop:step,start:stop:step], a) + + + @mark_cuda_test + def test_noncontig_unary(self): + from pycuda.curandom import rand as curand + a_gpu = curand((200,200))[1::4, ::-2] + a = a_gpu.get() + + ret_a = a ** 2 + + ret_a_gpu = (a_gpu ** 2).get() + + assert np.allclose(ret_a, ret_a_gpu) + + @mark_cuda_test + def test_noncontig_different_strides(self): + x = 200; y = 200 + + a_gpu = gpuarray.arange(0, x*y, 1, dtype=np.float32).reshape((x, y), order='C') + a = a_gpu.get() + assert a_gpu.strides[0] > a_gpu.strides[1], "a_gpu not C-order" + + b_gpu = gpuarray.zeros_like(a_gpu, order='F') + b_gpu += a_gpu + b = b_gpu.get() + assert b_gpu.strides[1] > b_gpu.strides[0], "b_gpu not F-order" + + assert np.allclose(a, b) + + @mark_cuda_test + def test_noncontig_stride_tricky(self): + from pycuda.curandom import rand as curand + a_gpu = curand((200,200)) + a = a_gpu.get() + + a_gpu = a_gpu[1::4, ::-2] + a = a[1::4, ::-2] + + a_gpu = gpuarray.GPUArray(a_gpu.shape, a_gpu.dtype, + base=a_gpu, + gpudata=a_gpu.gpudata, + strides=(0,0)) + a.strides = (0, 0) + + ret_a = a + + ret_a_gpu = a_gpu.get() + + assert np.allclose(ret_a, ret_a_gpu) + + @mark_cuda_test + def test_noncontig_stride_tricky2(self): + # sliding window using tricks + from numpy.lib.stride_tricks import as_strided + from pycuda.curandom import rand as curand + a_gpu = curand((200000)) + a = a_gpu.get() + + a_gpu = gpuarray.GPUArray((199998, 3), a_gpu.dtype, + base=a_gpu, + gpudata=a_gpu.gpudata, + strides=(a_gpu.itemsize, a_gpu.itemsize)) + a = as_strided(a, shape=(199998, 3), strides=(a.itemsize, a.itemsize)) + + ret_a = a + ret_a_gpu = a_gpu.get() + + assert np.allclose(ret_a, ret_a_gpu) + + ret_a = a + 2 + ret_a_gpu = (a_gpu + 2).get() + + assert np.allclose(ret_a, ret_a_gpu) + if __name__ == "__main__": # make sure that import failures get reported, instead of skipping the tests.
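For reference, the per-argument order detection that the new ``_precalc_array_info`` wrapper helper performs in C++ can be mirrored in a few lines of Python: axes of length 1 are skipped, and 'C' takes precedence when an array is both C- and F-contiguous. This is a Python sketch for clarity under those assumptions, not code from the patch:

import numpy as np

def detect_order(shape, strides, itemsize):
    # Return 'C', 'F', or 'N' (neither) for a given shape/strides pair,
    # ignoring axes of length 1, as the wrapper's loops do.
    def packed_along(axes):
        accum = itemsize
        for ax in axes:
            if shape[ax] == 1:
                continue
            if strides[ax] != accum:
                return False
            accum *= shape[ax]
        return True

    ndim = len(shape)
    if packed_along(reversed(range(ndim))):  # last axis tightest: C order
        return "C"
    if packed_along(range(ndim)):            # first axis tightest: F order
        return "F"
    return "N"

a = np.zeros((3, 1, 4))
assert detect_order(a.shape, a.strides, a.itemsize) == "C"
assert detect_order(a.T.shape, a.T.strides, a.itemsize) == "F"
b = a[:, :, ::2]                             # strided view: neither order
assert detect_order(b.shape, b.strides, a.itemsize) == "N"

In the wrapper itself, ``contigmatch`` records whether every array argument shares one contiguous order, and the per-array ``(itemsize, strides)`` tuples are returned alongside it for the cases where they do not.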