diff --git a/pycuda/cumath.py b/pycuda/cumath.py index dbae5bd6..1a8dcb5b 100644 --- a/pycuda/cumath.py +++ b/pycuda/cumath.py @@ -42,7 +42,7 @@ def f(array, stream_or_out=None, **kwargs): func = elementwise.get_unary_func_kernel(func_name, array.dtype) func.prepared_async_call(array._grid, array._block, stream, - array.gpudata, out.gpudata, array.mem_size) + array, out, array.mem_size) return out return f @@ -77,7 +77,7 @@ def fmod(arg, mod, stream=None): func = elementwise.get_fmod_kernel() func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, mod.gpudata, result.gpudata, arg.mem_size) + arg, mod, result, arg.mem_size) return result @@ -94,7 +94,7 @@ def frexp(arg, stream=None): func = elementwise.get_frexp_kernel() func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, sig.gpudata, expt.gpudata, arg.mem_size) + arg, sig, expt, arg.mem_size) return sig, expt @@ -111,7 +111,7 @@ def ldexp(significand, exponent, stream=None): func = elementwise.get_ldexp_kernel() func.prepared_async_call(significand._grid, significand._block, stream, - significand.gpudata, exponent.gpudata, result.gpudata, + significand, exponent, result, significand.mem_size) return result @@ -129,7 +129,7 @@ def modf(arg, stream=None): func = elementwise.get_modf_kernel() func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, intpart.gpudata, fracpart.gpudata, + arg, intpart, fracpart, arg.mem_size) return fracpart, intpart diff --git a/pycuda/curandom.py b/pycuda/curandom.py index e1c68428..d31ee01a 100644 --- a/pycuda/curandom.py +++ b/pycuda/curandom.py @@ -240,7 +240,7 @@ def rand(shape, dtype=np.float32, stream=None): raise NotImplementedError; func.prepared_async_call(result._grid, result._block, stream, - result.gpudata, np.random.randint(2**31-1), result.size) + result, np.random.randint(2**31-1), result.size) return result diff --git a/pycuda/deferred.py b/pycuda/deferred.py new file mode 100644 index 00000000..a843322e --- /dev/null +++ b/pycuda/deferred.py @@ -0,0 +1,553 @@ +""" +This exports a "deferred" implementation of SourceModule, where compilation +is delayed until call-time. Several methods, like get_function(), return +"deferred" values that are also only evaluated at call-time. +""" + +from pycuda.tools import context_dependent_memoize +from pycuda.compiler import compile, SourceModule +import pycuda.driver + +import re + +class DeferredSource(object): + ''' + Source generator that supports user-directed indentation, nesting + ``DeferredSource`` objects, indentation-aware string interpolation, + and deferred generation. + Use ``+=`` or ``add()`` to add source fragments as strings or + other ``DeferredSource`` objects, ``indent()`` or ``dedent()`` to + change base indentation, and ``__call__`` or ``generate()`` to + generate source. 
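+    A small usage sketch (illustrative only; the C fragment and the
+    ``expr`` placeholder are arbitrary, not taken from this file)::
+
+        src = DeferredSource()
+        src += "for (unsigned i = 0; i < n; ++i) {"
+        src.indent()
+        src.add("z[i] = %(expr)s;", format_dict={"expr": "x[i] + y[i]"})
+        src.dedent()
+        src += "}"
+        print(src.generate())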
+ + ''' + def __init__(self, str=None, subsources=None, base_indent=0, indent_step=2): + self.base_indent = base_indent + self.indent_step = indent_step + if subsources is None: + subsources = [] + self.subsources = subsources + if str is not None: + self.add(str) + self._regex_space = re.compile(r"^(\s*)(.*?)(\s*)$") + self._regex_format = re.compile(r"%\(([^\)]*)\)([a-zA-Z])") + + def __str__(self): + return self.generate() + + def __repr__(self): + return repr(self.__str__()) + + def __call__(self, indent=0, indent_first=True): + return self.generate(indent, indent_first) + + def generate(self, indent=0, indent_first=True, get_list=False): + if get_list: + retval = [] + else: + retval = '' + do_indent = not indent_first + for subindent, strip_space, subsource, format_dict in self.subsources: + if do_indent: + newindent = self.base_indent + indent + subindent + else: + newindent = 0 + do_indent = True + if isinstance(subsource, DeferredSource): + retval = retval + subsource.generate(indent=(indent + subindent), get_list=get_list) + continue + lines = subsource.split("\n") + minstrip = None + newlines = [] + for line in lines: + linelen = len(line) + space_match = self._regex_space.match(line) + space = space_match.group(1) + end_leading_space = space_match.end(1) + begin_trailing_space = space_match.start(3) + if strip_space: + if linelen == end_leading_space: + # all space, ignore + continue + if minstrip is None or end_leading_space < minstrip: + minstrip = end_leading_space + if not format_dict: + newlines.append(line) + continue + newlinelist = None + newline = '' + curpos = 0 + matches = list(self._regex_format.finditer(line, end_leading_space)) + nummatches = len(matches) + for match in matches: + formatchar = match.group(2) + name = match.group(1) + matchstart = match.start() + matchend = match.end() + repl = format_dict.get(name, None) + if repl is None: + continue + if (isinstance(repl, DeferredSource) and + nummatches == 1 and + matchstart == end_leading_space): + # only one replacement, and only spaces preceding + newlinelist = [ space + x + for x in repl.generate(get_list=True) ] + else: + newline = newline + line[curpos:matchstart] + newline = newline + (('%' + formatchar) % (repl,)) + curpos = matchend + if newlinelist: + newlines.extend(newlinelist) + else: + newlines.append(newline) + # add remaining unprocessed part of line to end of last + # replacement + if newlinelist is not None and len(newlinelist) > 1: + newlines.append(space + line[curpos:]) + else: + newlines[-1] = newlines[-1] + line[curpos:] + indentstr = ' ' * (indent + subindent) + for i, line in enumerate(newlines): + line = indentstr + line[minstrip:] + newlines[i] = line + if get_list: + retval += newlines + else: + retval = retval + "\n".join(newlines) + "\n" + return retval + + def indent(self, indent_step=None): + if indent_step is None: + indent_step = self.indent_step + self.base_indent += indent_step + return self + + def dedent(self, indent_step=None): + if indent_step is None: + indent_step = self.indent_step + self.base_indent -= indent_step + return self + + def format_dict(self, format_dict): + for subsource in self.subsources: + subsource[3] = format_dict + return self + + def add(self, other, strip_space=True, format_dict=None): + self.subsources.append([self.base_indent, strip_space, other, format_dict]) + return self + + def __iadd__(self, other): + self.add(other) + return self + + def __add__(self, other): + newgen = DeferredSource(subsources=self.subsources, + 
base_indent=self.base_indent, + indent_step=self.indent_step) + newgen.add(other) + return newgen + +class DeferredVal(object): + ''' + This is an object that serves as a wrapper to an object that may not + even exist yet (perhaps this should have been called FutureVal). All + methods provided by ``DeferredVal`` start with underscores so as not to + interfere with common attribute names. + + The life of a ``DeferredVal`` can be divided into two phases: before + and after the wrapped object is "completed", i.e. once the wrapped + object is specified (see below for how/when this happens). + + After it is completed, any attempt to access attributes on this + ``DeferredVal`` will be simply redirected to the wrapped object. + + Before the wrapped object is completed, the only attributes allowed are + methods named in the class variable ``_deferred_method_dict`` (which + must be set in a subclass). ``_deferred_method_dict`` specifies the + names of methods that can be deferred to a later time when the current + ``DeferredVal`` is completed. It maps method names to a result class + that will be instantiated and used to represent the return value of the + deferred method call. The result class can be DeferredVal, a subclass + of ``DeferredVal``, or None (same as DeferredVal). Any attempts to + call one of these defer-able methods will return an instance of the + result class, which will be tied to a future call of the method once + the current ``DeferredVal`` object is completed. + + A ``DeferredVal`` can be completed by specifying the wrapped object in + one of two ways. One way is to set it explicitly with + ``_set_val(val)``. The other is to call ``_get()``, which requires + that ``_eval()`` be defined in a subclass to return the actual object. + Both ways will trigger the deferred method calls (described above) that + depend on this ``DeferredVal`` and the completion of their own + ``DeferredVal`` return values. + + Once completed, the wrapped object can be retrieved by calling + ``_get()``. 
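+    A minimal sketch (the subclass and the string value are hypothetical,
+    purely to show the deferral mechanics)::
+
+        class DeferredThing(DeferredVal):
+            _deferred_method_dict = {"lower": None}
+
+        dv = DeferredThing()
+        res = dv.lower()        # queued; returns another DeferredVal
+        dv._set_val("ABC")      # completing dv runs the queued call
+        assert res._get() == "abc"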
+ ''' + __unimpl = object() + _deferred_method_dict = None # must be set by subclass + + def __init__(self): + self._val_available = False + self._val = None + self._deferred_method_calls = [] + + def _copy(self, retval=None): + if retval is None: + retval = self.__class__() + retval._val_available = self._val_available + retval._val = self._val + retval._deferred_method_calls = self._deferred_method_calls + return retval + + def __repr__(self): + return self._repr(0) + + def _repr(self, indent): + indentstr = " " * indent + retstrs = [] + retstrs.append("") + retstrs.append("%s [" % (self.__class__,)) + for dmc in self._deferred_method_calls: + (name, args, kwargs, retval) = dmc + retstrs.append(" method %s" % (repr(name),)) + for arg in args: + if isinstance(arg, DeferredVal): + retstrs.append(" deferred arg (id=%s)" % (id(arg),)) + retstrs.append(arg._repr(indent + 6)) + else: + retstrs.append(" arg %s" % (repr(arg),)) + for kwname, arg in kwargs.items(): + if isinstance(arg, DeferredVal): + retstrs.append(" deferred kwarg %s (id=%s)" % (repr(kwname), id(arg))) + retstrs.append(arg._repr(indent + 6)) + else: + retstrs.append(" kwarg %s=%s" % (kwname, repr(arg),)) + retstrs.append(" deferred retval (id=%s)" % (id(retval),)) + retstrs.append("]") + return "\n".join([(indentstr + retstr) for retstr in retstrs]) + + def _set_val(self, val): + self._val = val + self._val_available = True + self._eval_methods() + return val + + def _eval(self): + raise NotImplementedError() + + def _eval_list(self, vals): + newvals = [] + for val in vals: + if isinstance(val, DeferredVal): + val = val._get() + newvals.append(val) + return newvals + + def _eval_dict(self, valsdict): + newvalsdict = {} + for name, val in valsdict.items(): + if isinstance(val, DeferredVal): + val = val._get() + newvalsdict[name] = val + return newvalsdict + + def _eval_methods(self): + assert(self._val_available) + val = self._val + for op in self._deferred_method_calls: + (methodname, methodargs, methodkwargs, deferredretval) = op + methodargs = self._eval_list(methodargs) + methodkwargs = self._eval_dict(methodkwargs) + retval = getattr(val, methodname)(*methodargs, **methodkwargs) + deferredretval._set_val(retval) + self._deferred_method_calls = [] + + def _get(self): + if not self._val_available: + self._val = self._eval() + self._val_available = True + self._eval_methods() + return self._val + + def _get_deferred_func(self, _name, _retval): + def _deferred_func(*args, **kwargs): + """ + When this function is called from an "uncompleted" DeferredVal, + the name of the deferred method, arguments, and DeferredVal/wrapper + object for the eventual return value are stored for future + evaluation when the current wrapped object is completed, and + the DeferredVal return object is returned. + Otherwise, if the DeferredVal is already completed, then return + the result of calling the method directly. 
+ """ + if not self._val_available: + self._deferred_method_calls.append((_name, args, kwargs, _retval)) + return _retval + args = self._eval_list(args) + kwargs = self._eval_dict(kwargs) + return getattr(self._val, _name)(*args, **kwargs) + _deferred_func.__name__ = _name + ".deferred" + return _deferred_func + + def __getattr__(self, name): + if self.__class__._deferred_method_dict is None: + raise Exception("DeferredVal must be subclassed and the class attribute _deferred_method_dict must be set to a valid dictionary!") + if self._val_available: + return getattr(self._val, name) + deferredclass = self.__class__._deferred_method_dict.get(name, self.__unimpl) + if deferredclass is not self.__unimpl: + if deferredclass is None: + deferredclass = DeferredVal + retval = deferredclass() + return self._get_deferred_func(name, retval) + raise AttributeError("no such attribute (yet): '%s'" % (name,)) + +class DeferredNumeric(DeferredVal): + """ + This is a DeferredVal that allows the deferral of all math operations. + """ + pass +_mathops = ( + '__add__', '__sub__', '__mul__', '__floordiv__', '__mod__', + '__divmod__', '__pow__', '__lshift__', '__rshift__', '__and__', + '__xor__', '__or__', '__div__', '__truediv__', '__radd__', '__rsub__', + '__rmul__', '__rdiv__', '__rtruediv__', '__rfloordiv__', '__rmod__', + '__rdivmod__', '__rpow__', '__rlshift__', '__rrshift__', '__rand__', + '__rxor__', '__ror__', '__iadd__', '__isub__', '__imul__', '__idiv__', + '__itruediv__', '__ifloordiv__', '__imod__', '__ipow__', '__ilshift__', + '__irshift__', '__iand__', '__ixor__', '__ior__', '__pos__', '__abs__', + '__invert__', '__complex__', '__int__', '__long__', '__float__', + '__oct__', '__hex__', '__index__', '__coerce__') +DeferredNumeric._deferred_method_dict = dict((x, DeferredNumeric) + for x in _mathops) +def _get_deferred_attr_func(_name): + def _deferred_func(self, *args, **kwargs): + return self.__getattr__(_name)(*args, **kwargs) + _deferred_func.__name__ = _name + return _deferred_func +for name in _mathops: + setattr(DeferredNumeric, name, _get_deferred_attr_func(name)) + +class DeferredModuleCall(DeferredVal): + _deferred_method_dict = {} + def __init__(self, methodstr, *args, **kwargs): + super(DeferredModuleCall, self).__init__() + self._methodstr = methodstr + self._args = args + self._kwargs = kwargs + self._mod = None + + def _set_mod(self, mod): + self._mod = mod + + def _copy(self, retval=None): + if retval is None: + retval = self.__class__(self._methodstr, *self._args, **self._kwargs) + return super(DeferredModuleCall, self)._copy(retval=retval) + + def _eval(self): + return getattr(self._mod, self._methodstr)(*self._args, **self._kwargs) + +class DeferredTexRef(DeferredModuleCall): + _deferred_method_dict = { + "set_array": None, + "set_address": DeferredNumeric, + "set_address_2d": None, + "set_format": None, + "set_address_mode": None, + "set_flags": None, + "get_address": DeferredNumeric, + "get_flags": DeferredNumeric, + } + +class DeferredFunction(object): + ''' + This class is a pseudo-replacement of ``pycuda.driver.Function``, + but takes a ``DeferredSourceModule`` and a function name as an argument, + and queues any call to ``prepare()`` until call-time, at which time it + calls out to the ``DeferredSourceModule`` object do create the actual + Function before preparing (if necessary) and calling the underlying + kernel. NOTE: you may now send the actual ``GPUArrays`` as arguments, + rather than their ``.gpudata`` members; this can be helpful to + dynamically create kernels. 
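+    A call-side sketch (illustrative; ``MyModule``, ``x_gpu`` and the
+    prepare() format are placeholder assumptions, and ``np`` stands for
+    ``numpy``)::
+
+        mod = MyModule()                   # some DeferredSourceModule subclass
+        func = mod.get_function("twice")   # nothing is compiled yet
+        func.prepare("P" + np.dtype(np.uintp).char)   # queued until call-time
+        # GPUArrays may be passed directly instead of their .gpudata:
+        func.prepared_async_call(x_gpu._grid, x_gpu._block, None,
+                                 x_gpu, x_gpu.mem_size)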
+ ''' + def __init__(self, deferredmod, funcname): + self._deferredmod = deferredmod + self._funcname = funcname + self._prepare_args = None + + def get_unimplemented(_methodname): + def _unimplemented(self, _methodname=_methodname, *args, **kwargs): + raise NotImplementedError("%s does not implement method '%s'" % (type(self), _methodname,)) + return _unimplemented + + for meth_name in ["set_block_shape", "set_shared_size", + "param_set_size", "param_set", "param_seti", "param_setf", + "param_setv", "param_set_texref", + "launch", "launch_grid", "launch_grid_async"]: + setattr(self, meth_name, get_unimplemented(meth_name)) + + def _fix_texrefs(self, kwargs, mod): + texrefs = kwargs.get('texrefs', None) + if texrefs is not None: + kwargs = kwargs.copy() + newtexrefs = [] + for texref in texrefs: + if isinstance(texref, DeferredVal): + if isinstance(texref, DeferredModuleCall): + texref = texref._copy() # future calls may use different modules/functions + texref._set_mod(mod) + texref = texref._get() + elif isinstance(texref, DeferredVal): + texref = texref._get() + newtexrefs.append(texref) + kwargs['texrefs'] = newtexrefs + return kwargs + + def __call__(self, *args, **kwargs): + block = kwargs.get('block', None) + if block is None or not isinstance(block, tuple) or len(block) != 3: + raise ValueError("keyword argument 'block' is required, and must be a 3-tuple of integers") + grid = kwargs.get('grid', (1,1)) + mod, func = self._deferredmod._delayed_get_function(self._funcname, args, grid, block) + kwargs = self._fix_texrefs(kwargs, mod) + return func.__call__(*args, **kwargs) + + def param_set_texref(self, *args, **kwargs): + raise NotImplementedError() + + def prepare(self, *args, **kwargs): + self._prepare_args = (args, kwargs) + return self + + def _do_delayed_prepare(self, mod, func): + if self._prepare_args is None: + raise Exception("prepared_*_call() requires that prepare() be called first") + (prepare_args, prepare_kwargs) = self._prepare_args + prepare_kwargs = self._fix_texrefs(prepare_kwargs, mod) + func.prepare(*prepare_args, **prepare_kwargs) + + def _generic_prepared_call(self, funcmethodstr, funcmethodargs, funcargs, funckwargs): + grid = funcmethodargs[0] + block = funcmethodargs[1] + mod, func = self._deferredmod._delayed_get_function(self._funcname, funcargs, grid, block) + self._do_delayed_prepare(mod, func) + newfuncargs = [ getattr(arg, 'gpudata', arg) for arg in funcargs ] + fullargs = list(funcmethodargs) + fullargs.extend(newfuncargs) + return getattr(func, funcmethodstr)(*fullargs, **funckwargs) + + def prepared_call(self, grid, block, *args, **kwargs): + return self._generic_prepared_call('prepared_call', (grid, block), args, kwargs) + + def prepared_timed_call(self, grid, block, *args, **kwargs): + return self._generic_prepared_call('prepared_timed_call', (grid, block), args, kwargs) + + def prepared_async_call(self, grid, block, stream, *args, **kwargs): + return self._generic_prepared_call('prepared_async_call', (grid, block, stream), args, kwargs) + +@context_dependent_memoize +def _delayed_compile_aux(source, compileargs): + # re-convert any tuples to lists + newcompileargs = [] + for i, arg in enumerate(compileargs): + if isinstance(arg, tuple): + arg = list(arg) + newcompileargs.append(arg) + cubin = compile(source, *newcompileargs) + + from pycuda.driver import module_from_buffer + return module_from_buffer(cubin) + +class DeferredSourceModule(SourceModule): + ''' + This is an abstract specialization of SourceModule which allows the + delay of creating 
the actual kernel source until call-time, at which + point the actual arguments are available and their characteristics can + be used to create specific kernels. + To support this, ``get_function()`` returns a ``DeferredFunction`` + object which queues any calls to ``DeferredFunction.prepare()`` and + saves them until future calls to ``DeferredFunction.__call__()`` or + ``DeferredFunction.prepared_*_call()``. NOTE: you may now send actual + ``GPUArrays`` to these functions rather their ``.gpudata`` members; + this can be helpful when creating dynamic kernels. + Likewise, ``get_global()``, ``get_texref()`` and ``get_surfref()`` + return proxy objects that can be stored by ``DeferredFunction.prepare()`` + and will only be evaluated at call-time. + This class must be subclassed and the function ``create_source(self, + grid, block, *args)`` must be overridden, returning the kernel source + (or ``DeferredSource`` object) that should be compiled. ``grid``, + ``block``, and ``*args`` are the same arguments that were sent to the + ``DeferredFunction`` call functions above. + The function ``create_key(self, grid, block, *args)`` returns a tuple + ``(key, precalc)``. ``create_key`` is always called before + ``create_source`` and any pre-calculated info in ``precalc`` is sent back + to ``create_source``; ``key`` is used to cache any compiled functions. + Default return value of ``create_key()`` is (None, None), which means to + use the function name and generated source as the key. The key returned + by ``create_key()`` must be usable as a hash key. + ''' + _cache = {} + + def __init__(self, nvcc="nvcc", options=None, keep=False, + no_extern_c=False, arch=None, code=None, cache_dir=None, + include_dirs=[]): + self._arch = arch + # tuples below are so _compileargs can be used as a hash key + if options is not None: + options = tuple(options) + include_dirs = tuple(include_dirs) + self._compileargs = (nvcc, options, keep, no_extern_c, + arch, code, cache_dir, include_dirs) + + def _delayed_compile(self, source): + self._check_arch(self._arch) + + return _delayed_compile_aux(source, self._compileargs) + + def create_key(self, grid, block, *funcargs): + return (None, None) + + def create_source(self, precalc, grid, block, *funcargs): + raise NotImplementedError("create_source must be overridden!") + + def _delayed_get_function(self, funcname, funcargs, grid, block): + ''' + If the first element of the tuple returned by ``create_key()`` is + not None, then it is used as the key to cache compiled functions. + Otherwise the return value of ``create_source()`` is used as the key. 
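+        For example, a (hypothetical) subclass could key the cache on the
+        argument dtypes alone, so that one kernel is compiled per dtype
+        combination (``dtype_to_ctype`` is assumed to come from
+        ``pycuda.tools``)::
+
+            class DtypeKeyedModule(DeferredSourceModule):
+                def create_key(self, grid, block, *args):
+                    key = tuple(getattr(a, "dtype", type(a)) for a in args)
+                    return key, None
+
+                def create_source(self, precalc, grid, block, *args):
+                    tp = dtype_to_ctype(args[0].dtype)
+                    return """
+                    __global__ void twice(%s *x, unsigned n)
+                    {
+                        unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
+                        if (i < n) x[i] *= 2;
+                    }
+                    """ % (tp,)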
+ ''' + context = pycuda.driver.Context.get_current() + funccache = DeferredSourceModule._cache.get(context, None) + if funccache is None: + funccache = self._cache[context] = {} + (funckey, precalc) = self.create_key(grid, block, *funcargs) + modfunc = funccache.get(funckey, None) + if modfunc is None: + source = self.create_source(precalc, grid, block, *funcargs) + if isinstance(source, DeferredSource): + source = source.generate() + if funckey is None: + funckey = (funcname, source) + modfunc = funccache.get(funckey, None) + if modfunc is None: + module = self._delayed_compile(source) + func = module.get_function(funcname) + modfunc = funccache[funckey] = (module, func) + return modfunc + + def get_function(self, name): + return DeferredFunction(self, name) + + def get_global(self, name): + raise NotImplementedError("Deferred globals in element-wise kernels not supported yet") + + def get_texref(self, name): + return DeferredTexRef('get_texref', name) + + def get_surfref(self, name): + raise NotImplementedError("Deferred surfaces in element-wise kernels not supported yet") + diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 695817fa..c804bd7c 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -36,92 +36,528 @@ import numpy as np from pycuda.tools import dtype_to_ctype, VectorArg, ScalarArg from pytools import memoize_method +from pycuda.deferred import DeferredSourceModule, DeferredSource +import pycuda._driver as _drv + +class ElementwiseSourceModule(DeferredSourceModule): + ''' + This is a ``DeferredSourceModule`` which is backwards-compatible with the + original ``get_elwise_module`` and ``get_elwise_range_module`` (using + ``do_range=True``). However, this class delays the compilation of + kernels until call-time. If you send actual ``GPUArray`` arguments + (instead of their ``.gpudata`` members) when calling the methods + supported by the return value of ``get_function()``, then you get: + * support for array-specific flat indices (i.e. for input array ``z``, + you can index it as ``z[z_i]`` in addition to the old-style ``z[i]``) + * support for non-contiguous (and arbitrarily-strided) arrays, but + only if you use the array-specific indices above. + * if do_indices=True, the N-D index of the current data item in all + participating arrays is available in INDEX[dim] where dim goes from + 0 to NDIM - 1. + ``elwise_arg_inds`` specifies the kernel arguments (as indices into + ``arguments``) that will be treated element-wise and need array-specific + index calculations. If ``elwise_arg_inds`` is not specified, it is + constructed by finding all arguments that are VectorArg objects. If only + some of the VectorArgs will be traversed element-wise, specifying + ``elwise_arg_inds`` ensures that the kernel doesn't wastes time + calculating unneeded indices. When the kernel is eventually called, + all elementwise array arguments specified in ``elwise_arg_inds`` must be + the same shape, which is used to optimize index calculations. 
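+    A sketch of the new calling convention through the module-level helpers
+    (illustrative; assumes an active CUDA context, e.g. via
+    ``pycuda.autoinit``)::
+
+        import numpy as np
+        import pycuda.gpuarray as gpuarray
+        from pycuda.elementwise import get_elwise_kernel
+
+        func = get_elwise_kernel(
+            "float *x, float *z",
+            "z[z_i] = 2.0f * x[x_i]",   # array-specific flat indices
+            "twice_kernel")
+
+        x = gpuarray.to_gpu(np.random.rand(4, 6).astype(np.float32))
+        z = gpuarray.empty_like(x)
+        # GPUArrays (even non-contiguous views) may be passed directly:
+        func.prepared_async_call(x._grid, x._block, None, x, z, x.mem_size)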
+ ''' + def __init__(self, arguments, operation, + name="kernel", preamble="", loop_prep="", after_loop="", + do_range=False, + elwise_arg_inds=None, + do_indices=False, + order='K', + debug=False, + **compilekwargs): + super(ElementwiseSourceModule, self).__init__(**compilekwargs) + self._do_range = do_range + self._do_indices = do_indices + assert(order in 'CFK', "order must be 'F', 'C', or 'K'") + self._order = order + self._arg_decls = ", ".join(arg.declarator() for arg in arguments) + if elwise_arg_inds is None: + elwise_arg_inds = [i for i in range(len(arguments)) if isinstance(arguments[i], VectorArg)] + self._elwise_arg_inds = elwise_arg_inds + self._arg_names = tuple(argument.name for argument in arguments) + self._elwise_arg_names = tuple(self._arg_names[i] for i in self._elwise_arg_inds) + self._init_args = (self._arg_names, self._arg_decls, operation, + name, preamble, loop_prep, after_loop) + self._init_args_repr = repr(self._init_args) + self._debug = debug + + def _precalc_array_info(self, args): + # 'args' is the list of actual parameters being sent to the kernel + + contigmatch = True + arrayspecificinds = True + shape = None + order = None + elwise_arg_inds = self._elwise_arg_inds + for i in elwise_arg_inds: + # is a GPUArray/DeviceAllocation + arg = args[i] + if not arrayspecificinds: + continue + if not hasattr(arg, 'shape'): + # At least one array argument is probably sent as a + # GPUArray.gpudata rather than the GPUArray itself, + # so disable array-specific indices -- caller is on + # their own. + arrayspecificinds = False + continue + curshape = arg.shape + # make strides == 0 when shape == 1 + arg.strides = tuple((np.array(curshape) > 1) * arg.strides) + curorder = 'N' + if contigmatch: + if arg.flags.f_contiguous: + curorder = 'F' + elif arg.flags.c_contiguous: + curorder = 'C' + if curorder == 'N': + contigmatch = False + if shape is None: + shape = curshape + order = curorder + elif order != curorder: + contigmatch = False + if shape != curshape: + raise Exception("All input arrays to elementwise kernels must have the same shape, or you must specify the arrays to be traversed element-wise explicitly with elwise_arg_inds; found shapes %s and %s" % (shape, curshape)) + + arrayitemstrides = tuple((arg.itemsize, arg.strides) for arg in (args[i] for i in elwise_arg_inds)) + + return (contigmatch, arrayspecificinds, shape, arrayitemstrides) + + + def create_key(self, grid, block, *args): + #print "Calling _precalc_array_info(args=%s, elwise_arg_inds=%s)\n" % (args, self._elwise_arg_inds) + #precalc_init = self._precalc_array_info(args) + elwise_arg_inds = self._elwise_arg_inds + precalc_init = _drv._precalc_array_info(args, elwise_arg_inds) + (contigmatch, arrayspecificinds, shape, arrayitemstrides) = precalc_init + + if (contigmatch or not arrayspecificinds) and not self._do_indices: + key = self._init_args_repr + return key, (self._init_args_repr, shape, contigmatch, arrayspecificinds, None, None, None, None, None) + + # Arrays are not contiguous or different order or need calculated indices + + if grid[1] != 1 or block[1] != 1 or block[2] != 1: + raise Exception("Grid (%s) and block (%s) specifications should have all '1' except in the first element" % (grid, block)) + + ndim = len(shape) + arg_names = self._arg_names + + # Kernel traverses in Fortran-order by default, but if order == 'C', + # we can just reverse the shape and strides, since the kernel is + # elementwise. 
If order == 'K', use index of minimum stride in first + # elementwise array as a hint on how to + # order the traversal of dimensions. We could probably do something + # smarter, like tranposing/reshaping arrays if possible to maximize + # performance, but that is probably best done in a pre-processing step. + # Note that this could mess up custom indexing that assumes a + # particular traversal order, but in that case one should explicitly + # set order to 'C' or 'F'. + do_reverse = False + if self._order == 'C': + do_reverse = True + elif self._order == 'K': + if np.argmin(np.abs(args[self._elwise_arg_inds[0]].strides)) > ndim // 2: + # traverse dimensions in reverse order + do_reverse = True + + # need to include grid and block in key because kernel will change + # depending on number of threads + key = (self._init_args_repr, shape, contigmatch, arrayspecificinds, arrayitemstrides, self._arg_decls, do_reverse, grid, block) + + return key, key + + def create_source(self, precalc, grid, block, *args): + (_, shape, contigmatch, arrayspecificinds, arrayitemstrides, _, do_reverse, _, _) = precalc + + (arg_names, arg_decls, operation, + funcname, preamble, loop_prep, after_loop) = self._init_args + + elwise_arg_inds = self._elwise_arg_inds + elwise_arg_names = self._elwise_arg_names + + if contigmatch and not self._do_indices: + indtype = 'unsigned' + if self._do_range: + indtype = 'long' + + # All arrays are contiguous and same order (or we don't know and + # it's up to the caller to make sure it works) + if arrayspecificinds: + for arg_name in elwise_arg_names: + preamble = preamble + """ + #define %s_i i + """ % (arg_name,) + if self._do_range: + loop_body = """ + if (step < 0) + { + for (i = start + (_CTA_START + _TID)*step; + i > stop; i += total_threads*step) + { + %(operation)s; + } + } + else + { + for (i = start + (_CTA_START + _TID)*step; + i < stop; i += total_threads*step) + { + %(operation)s; + } + } + """ % { + "operation": operation, + } + else: + loop_body = """ + for (i = _CTA_START + _TID; i < n; i += total_threads) + { + %(operation)s; + } + """ % { + "operation": operation, + } + + return """ + #include + + %(preamble)s + + __global__ void %(name)s(%(arg_decls)s) + { + unsigned _TID = threadIdx.x; + unsigned total_threads = gridDim.x*blockDim.x; + unsigned _CTA_START = blockDim.x*blockIdx.x; + + %(indtype)s i; + + %(loop_prep)s; + + %(loop_body)s; + + %(after_loop)s; + } + """ % { + "arg_decls": arg_decls, + "name": funcname, + "preamble": preamble, + "loop_prep": loop_prep, + "after_loop": after_loop, + "loop_body": loop_body, + "indtype": indtype, + } + + # Arrays are not contiguous or different order or we need N-D indices + + # these are the arrays that need calculated indices + arrayindnames = [] + + # these are the arrays that match another array and can use the + # same indices + arrayrefnames = [] + + argnamestrides = ((arg_name, itemstride[0], itemstride[1]) for (arg_name, itemstride) in zip(elwise_arg_names, arrayitemstrides)) + + ndim = len(shape) + numthreads = block[0] * grid[0] + if do_reverse: + shape = shape[::-1] + shapearr = np.array(shape) + block_step = np.array(shapearr) + tmpnumthreads = numthreads + for dimnum in range(ndim): + newstep = tmpnumthreads % block_step[dimnum] + tmpnumthreads = tmpnumthreads // block_step[dimnum] + block_step[dimnum] = newstep + arrayarginfos = [] + for (arg_name, arg_itemsize, arg_strides) in argnamestrides: + strides = [0] * ndim + if do_reverse: + strides[0:len(arg_strides)] = arg_strides[::-1] + else: + 
strides[0:len(arg_strides)] = arg_strides + elemstrides = np.array(strides) // arg_itemsize + dimelemstrides = tuple(elemstrides * shapearr) + blockelemstrides = tuple(elemstrides * block_step) + elemstrides = tuple(elemstrides) + inforef = None + for i, arrayarginfo in enumerate(arrayarginfos): + if elemstrides == arrayarginfo[1]: + inforef = i + if inforef is None: + arrayindnames.append(arg_name) + else: + arrayrefnames.append((arg_name, arrayarginfos[inforef][0])) + arrayarginfos.append( + (arg_name, elemstrides, dimelemstrides, blockelemstrides, inforef) + ) + + preamble = DeferredSource(preamble) + operation = DeferredSource(operation) + loop_prep = DeferredSource(loop_prep) + after_loop = DeferredSource(after_loop) + defines = DeferredSource() + decls = DeferredSource() + loop_preop = DeferredSource() + loop_inds_calc = DeferredSource() + loop_inds_inc = DeferredSource() + loop_body = DeferredSource() + + if self._do_indices: + defines += """#define NDIM %d""" % (ndim,) + + for definename, vals in ( + ('SHAPE', shape), + ('BLOCK_STEP', block_step), + ): + for dimnum in range(ndim): + defines += """ + #define %s_%d %d + """ % (definename, dimnum, vals[dimnum]) + for name, elemstrides, dimelemstrides, blockelemstrides, inforef in arrayarginfos: + for definename, vals in ( + ('ELEMSTRIDE', elemstrides), + ('DIMELEMSTRIDE', dimelemstrides), + ('BLOCKELEMSTRIDE', blockelemstrides), + ): + for dimnum in range(ndim): + defines += """ + #define %s_%s_%d %d + """ % (definename, name, dimnum, vals[dimnum]) + + decls += """ + unsigned _GLOBAL_i = _CTA_START + _TID; + """ + for name in arrayindnames: + decls += """ + long %s_i = 0; + """ % (name,) + + for (name, refname) in arrayrefnames: + defines += """ + #define %s_i %s_i + """ % (name, refname) + + if self._do_indices: + decls += """ + long INDEX[%d]; + """ % (ndim,) + + for dimnum in range(ndim): + defines += """ + #define INDEX_%d (INDEX[%d]) + """ % (dimnum, dimnum) + else: + for dimnum in range(ndim): + decls += """ + long INDEX_%d; + """ % (dimnum,) + + loop_inds_calc += """ + unsigned int _TMP_GLOBAL_i = _GLOBAL_i; + """ + for dimnum in range(ndim): + loop_inds_calc += """ + INDEX_%d = _TMP_GLOBAL_i %% SHAPE_%d; + _TMP_GLOBAL_i = _TMP_GLOBAL_i / SHAPE_%d; + """ % (dimnum, dimnum, + dimnum) + + for name in arrayindnames: + loop_inds_calc += """ + %s_i += INDEX_%d * ELEMSTRIDE_%s_%d; + """ % (name, dimnum, name, dimnum) + + for name in arrayindnames: + loop_inds_inc += """ + %s_i += %s; + """ % (name, + " + ".join( + "BLOCKELEMSTRIDE_%s_%d" % (name, dimnum) + for dimnum in range(ndim))) + for dimnum in range(ndim): + loop_inds_inc += """ + INDEX_%d += BLOCK_STEP_%d; + """ % (dimnum, dimnum) + if dimnum < ndim - 1: + loop_inds_inc += """ + if (INDEX_%d >= SHAPE_%d) { + """ % (dimnum, dimnum) + loop_inds_inc.indent() + loop_inds_inc += """ + INDEX_%d -= SHAPE_%d; + INDEX_%d ++; + """ % (dimnum, dimnum, + dimnum + 1) + for name in arrayindnames: + loop_inds_inc += """ + %s_i += ELEMSTRIDE_%s_%d - DIMELEMSTRIDE_%s_%d; + """ % (name, name, dimnum + 1, name, dimnum) + loop_inds_inc.dedent() + loop_inds_inc += """ + } + """ + + if self._debug: + preamble += """ + #include + """ + loop_inds_calc += """ + if (_CTA_START == 0 && _TID == 0) { + """ + loop_inds_calc.indent() + loop_inds_calc += r""" + printf("=======================\n"); + printf("CALLING FUNC %s\n"); + printf("N=%%u\n", (unsigned int)n); + """ % (funcname,) + for name, elemstrides, dimelemstrides, blockelemstrides, inforef in arrayarginfos: + loop_inds_calc += r""" + printf("(%s) 
%s: ptr=0x%%lx maxoffset(elems)=%s\n", (unsigned long)%s); + """ % (funcname, name, np.sum((np.array(shape) - 1) * np.array(elemstrides)), name) + loop_inds_calc.dedent() + loop_inds_calc += """ + } + """ + indtest = DeferredSource() + for name, elemstrides, dimelemstrides, blockelemstrides, inforef in arrayarginfos: + if inforef is not None: + continue + indtest += r""" + if (%s_i > %s || %s_i < 0) + """ % (name, np.sum((np.array(shape) - 1) * np.array(elemstrides)), name) + indtest += r"""{""" + indtest.indent() + indtest += r""" + printf("_CTA_START=%%d _TID=%%d _GLOBAL_i=%%u %s_i=%%ld %s\n", _CTA_START, _TID, _GLOBAL_i, %s_i, %s); + """ % (name, ' '.join("INDEX_%d=%%ld" % (dimnum,) for dimnum in range(ndim)), name, ', '.join("INDEX_%d" % (dimnum,) for dimnum in range(ndim))) + indtest += """break;""" + indtest.dedent() + indtest += """}""" + loop_preop = indtest + loop_preop + after_loop += r""" + if (_CTA_START == 0 && _TID == 0) { + printf("DONE CALLING FUNC %s\n"); + printf("-----------------------\n"); + } + """ % (funcname,) + + if self._do_range: + loop_body.add(""" + if (step < 0) + { + for (/*void*/; _GLOBAL_i > stop; _GLOBAL_i += total_threads*step) + { + %(loop_preop)s; + + %(operation)s; + + %(loop_inds_inc)s; + } + } + else + { + for (/*void*/; _GLOBAL_i < stop; _GLOBAL_i += total_threads*step) + { + %(loop_preop)s; + + %(operation)s; + + %(loop_inds_inc)s; + } + } + """, format_dict={ + "loop_preop": loop_preop, + "operation": operation, + "loop_inds_inc": loop_inds_inc, + }) + else: + loop_body.add(""" + for (/*void*/; _GLOBAL_i < n; _GLOBAL_i += total_threads) + { + %(loop_preop)s; + %(operation)s; -def get_elwise_module(arguments, operation, - name="kernel", keep=False, options=None, - preamble="", loop_prep="", after_loop=""): - from pycuda.compiler import SourceModule - return SourceModule(""" - #include + %(loop_inds_inc)s; + } + """, format_dict={ + "loop_preop": loop_preop, + "operation": operation, + "loop_inds_inc": loop_inds_inc, + }) - %(preamble)s + source = DeferredSource() - __global__ void %(name)s(%(arguments)s) - { + source.add(""" + #include + #include - unsigned tid = threadIdx.x; - unsigned total_threads = gridDim.x*blockDim.x; - unsigned cta_start = blockDim.x*blockIdx.x; - unsigned i; + %(defines)s - %(loop_prep)s; + %(preamble)s - for (i = cta_start + tid; i < n; i += total_threads) - { - %(operation)s; - } + __global__ void %(name)s(%(arg_decls)s) + { - %(after_loop)s; - } - """ % { - "arguments": ", ".join(arg.declarator() for arg in arguments), - "operation": operation, - "name": name, - "preamble": preamble, - "loop_prep": loop_prep, - "after_loop": after_loop, - }, - options=options, keep=keep) + const unsigned int _TID = threadIdx.x; + const unsigned int total_threads = gridDim.x*blockDim.x; + const unsigned int _CTA_START = blockDim.x*blockIdx.x; + %(decls)s -def get_elwise_range_module(arguments, operation, - name="kernel", keep=False, options=None, - preamble="", loop_prep="", after_loop=""): - from pycuda.compiler import SourceModule - return SourceModule(""" - #include - - %(preamble)s - - __global__ void %(name)s(%(arguments)s) - { - unsigned tid = threadIdx.x; - unsigned total_threads = gridDim.x*blockDim.x; - unsigned cta_start = blockDim.x*blockIdx.x; - long i; - - %(loop_prep)s; - - if (step < 0) - { - for (i = start + (cta_start + tid)*step; - i > stop; i += total_threads*step) - { - %(operation)s; - } - } - else - { - for (i = start + (cta_start + tid)*step; - i < stop; i += total_threads*step) - { - %(operation)s; + %(loop_prep)s; 
+ + %(loop_inds_calc)s; + + %(loop_body)s; + + %(after_loop)s; } - } + """, format_dict={ + "arg_decls": arg_decls, + "name": funcname, + "preamble": preamble, + "loop_prep": loop_prep, + "after_loop": after_loop, + "defines": defines, + "decls": decls, + "loop_inds_calc": loop_inds_calc, + "loop_body": loop_body, + }) + + return source - %(after_loop)s; - } - """ % { - "arguments": ", ".join(arg.declarator() for arg in arguments), - "operation": operation, - "name": name, - "preamble": preamble, - "loop_prep": loop_prep, - "after_loop": after_loop, - }, - options=options, keep=keep) +def get_elwise_module(arguments, operation, + name="kernel", keep=False, options=None, + preamble="", loop_prep="", after_loop="", + **kwargs): + return ElementwiseSourceModule(arguments, operation, + name=name, preamble=preamble, + loop_prep=loop_prep, after_loop=after_loop, + keep=keep, options=options, + **kwargs) + +def get_elwise_range_module(arguments, operation, + name="kernel", keep=False, options=None, + preamble="", loop_prep="", after_loop="", + **kwargs): + return ElementwiseSourceModule(arguments, operation, + name=name, preamble=preamble, + loop_prep=loop_prep, after_loop=after_loop, + keep=keep, options=options, + do_range=True, + **kwargs) def get_elwise_kernel_and_types(arguments, operation, name="kernel", keep=False, options=None, use_range=False, **kwargs): @@ -204,12 +640,8 @@ def __call__(self, *args, **kwargs): for arg, arg_descr in zip(args, arguments): if isinstance(arg_descr, VectorArg): - if not arg.flags.forc: - raise RuntimeError("elementwise kernel cannot " - "deal with non-contiguous arrays") - vectors.append(arg) - invocation_args.append(arg.gpudata) + invocation_args.append(arg) else: invocation_args.append(arg) @@ -256,12 +688,13 @@ def get_take_kernel(dtype, idx_dtype, vec_count=1): "texture <%s, 1, cudaReadModeElementType> tex_src%d;" % (ctx["tex_tp"], i) for i in range(vec_count)) body = ( - ("%(idx_tp)s src_idx = idx[i];\n" % ctx) + ("%(idx_tp)s src_idx = idx[idx_i];\n" % ctx) + "\n".join( - "dest%d[i] = fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i) + "dest%d[dest%d_i] = fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i, i) for i in range(vec_count))) - mod = get_elwise_module(args, body, "take", preamble=preamble) + mod = get_elwise_module(args, body, "take", preamble=preamble, + elwise_arg_inds=(0, 1)) func = mod.get_function("take") tex_src = [mod.get_texref("tex_src%d" % i) for i in range(vec_count)] func.prepare("P"+(vec_count*"P")+np.dtype(np.uintp).char, texrefs=tex_src) @@ -301,11 +734,12 @@ def get_copy_insn(i): return ("dest%d[dest_idx] = " "fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i)) - body = (("%(idx_tp)s src_idx = gmem_src_idx[i];\n" - "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx) + body = (("%(idx_tp)s src_idx = gmem_src_idx[gmem_src_idx_i];\n" + "%(idx_tp)s dest_idx = gmem_dest_idx[gmem_dest_idx_i];\n" % ctx) + "\n".join(get_copy_insn(i) for i in range(vec_count))) - mod = get_elwise_module(args, body, "take_put", preamble=preamble) + mod = get_elwise_module(args, body, "take_put", + preamble=preamble, elwise_arg_inds=(0,1)) func = mod.get_function("take_put") tex_src = [mod.get_texref("tex_src%d" % i) for i in range(vec_count)] @@ -335,11 +769,15 @@ def get_put_kernel(dtype, idx_dtype, vec_count=1): ] + [ScalarArg(np.intp, "n")] body = ( - "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx - + "\n".join("dest%d[dest_idx] = src%d[i];" % (i, i) + "%(idx_tp)s dest_idx = gmem_dest_idx[gmem_dest_idx_i];\n" % ctx + + "\n".join("dest%d[dest_idx] = src%d[src%d_i];" 
% (i, i, i) for i in range(vec_count))) - func = get_elwise_module(args, body, "put").get_function("put") + func = get_elwise_module(args, body, "put", + elwise_arg_inds=( + [0] + range(1+vec_count, 1+(2*vec_count)) + ) + ).get_function("put") func.prepare("P"+(2*vec_count*"P")+np.dtype(np.uintp).char) return func @@ -351,7 +789,7 @@ def get_copy_kernel(dtype_dest, dtype_src): "tp_dest": dtype_to_ctype(dtype_dest), "tp_src": dtype_to_ctype(dtype_src), }, - "dest[i] = src[i]", + "dest[dest_i] = src[src_i]", "copy") @@ -383,13 +821,13 @@ def get_linear_combination_kernel(summand_descriptors, args.append(ScalarArg(scalar_dtype, "a%d" % i)) args.append(VectorArg(vector_dtype, "x%d" % i)) - summands.append("a%d*x%d[i]" % (i, i)) + summands.append("a%d*x%d[x%d_i]" % (i, i, i)) args.append(VectorArg(dtype_z, "z")) args.append(ScalarArg(np.uintp, "n")) mod = get_elwise_module(args, - "z[i] = " + " + ".join(summands), + "z[z_i] = " + " + ".join(summands), "linear_combination", preamble="\n".join(preamble), loop_prep=";\n".join(loop_prep)) @@ -410,7 +848,7 @@ def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z): "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = a*x[i] + b*y[i]", + "z[z_i] = a*x[x_i] + b*y[y_i]", "axpbyz") @@ -421,7 +859,7 @@ def get_axpbz_kernel(dtype_x, dtype_z): "tp_x": dtype_to_ctype(dtype_x), "tp_z": dtype_to_ctype(dtype_z) }, - "z[i] = a * x[i] + b", + "z[z_i] = a * x[x_i] + b", "axpb") @@ -433,8 +871,8 @@ def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator): "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = x[i] %s y[i]" % operator, - "multiply") + "z[z_i] = x[x_i] %s y[y_i]" % operator, + "binary_op") @context_dependent_memoize @@ -444,7 +882,7 @@ def get_rdivide_elwise_kernel(dtype_x, dtype_z): "tp_x": dtype_to_ctype(dtype_x), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = y / x[i]", + "z[z_i] = y / x[x_i]", "divide_r") @@ -456,7 +894,7 @@ def get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z): "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = %s(x[i], y[i])" % func, + "z[z_i] = %s(x[x_i], y[y_i])" % func, func+"_kernel") @@ -468,7 +906,7 @@ def get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z): "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = %s(x[i], y)" % func, + "z[z_i] = %s(x[x_i], y)" % func, func+"_kernel") @@ -492,7 +930,7 @@ def get_fill_kernel(dtype): "%(tp)s a, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, - "z[i] = a", + "z[z_i] = a", "fill") @@ -502,7 +940,7 @@ def get_reverse_kernel(dtype): "%(tp)s *y, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, - "z[i] = y[n-1-i]", + "z[z_i] = y[n-1-y_i]", "reverse") @@ -513,7 +951,7 @@ def get_real_kernel(dtype, real_dtype): "tp": dtype_to_ctype(dtype), "real_tp": dtype_to_ctype(real_dtype), }, - "z[i] = real(y[i])", + "z[z_i] = real(y[y_i])", "real") @@ -524,7 +962,7 @@ def get_imag_kernel(dtype, real_dtype): "tp": dtype_to_ctype(dtype), "real_tp": dtype_to_ctype(real_dtype), }, - "z[i] = imag(y[i])", + "z[z_i] = imag(y[y_i])", "imag") @@ -534,7 +972,7 @@ def get_conj_kernel(dtype): "%(tp)s *y, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, - "z[i] = pycuda::conj(y[i])", + "z[z_i] = pycuda::conj(y[y_i])", "conj") @@ -544,7 +982,9 @@ def get_arange_kernel(dtype): "%(tp)s *z, %(tp)s start, %(tp)s step" % { "tp": dtype_to_ctype(dtype), }, - "z[i] = start + i*step", + "z[z_i] = start + ((%(tp)s)(z_i))*step" % { + "tp": dtype_to_ctype(dtype), + }, "arange") @@ -559,7 +999,7 @@ def 
get_pow_kernel(dtype): "%(tp)s value, %(tp)s *y, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, - "z[i] = %s(y[i], value)" % func, + "z[z_i] = %s(y[y_i], value)" % func, "pow_method") @@ -576,7 +1016,7 @@ def get_pow_array_kernel(dtype_x, dtype_y, dtype_z): "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = %s(x[i], y[i])" % func, + "z[z_i] = %s(x[x_i], y[y_i])" % func, "pow_method") @@ -584,7 +1024,7 @@ def get_pow_array_kernel(dtype_x, dtype_y, dtype_z): def get_fmod_kernel(): return get_elwise_kernel( "float *arg, float *mod, float *z", - "z[i] = fmod(arg[i], mod[i])", + "z[z_i] = fmod(arg[arg_i], mod[mod_i])", "fmod_kernel") @@ -592,7 +1032,7 @@ def get_fmod_kernel(): def get_modf_kernel(): return get_elwise_kernel( "float *x, float *intpart ,float *fracpart", - "fracpart[i] = modf(x[i], &intpart[i])", + "fracpart[fracpart_i] = modf(x[x_i], &intpart[intpart_i])", "modf_kernel") @@ -602,8 +1042,8 @@ def get_frexp_kernel(): "float *x, float *significand, float *exponent", """ int expt = 0; - significand[i] = frexp(x[i], &expt); - exponent[i] = expt; + significand[significand_i] = frexp(x[x_i], &expt); + exponent[exponent_i] = expt; """, "frexp_kernel") @@ -612,7 +1052,7 @@ def get_frexp_kernel(): def get_ldexp_kernel(): return get_elwise_kernel( "float *sig, float *expt, float *z", - "z[i] = ldexp(sig[i], int(expt[i]))", + "z[z_i] = ldexp(sig[sig_i], int(expt[expt_i]))", "ldexp_kernel") @@ -626,7 +1066,7 @@ def get_unary_func_kernel(func_name, in_dtype, out_dtype=None): "tp_in": dtype_to_ctype(in_dtype), "tp_out": dtype_to_ctype(out_dtype), }, - "z[i] = %s(y[i])" % func_name, + "z[z_i] = %s(y[y_i])" % func_name, "%s_kernel" % func_name) @@ -638,7 +1078,7 @@ def get_if_positive_kernel(crit_dtype, dtype): VectorArg(dtype, "else_"), VectorArg(dtype, "result"), ], - "result[i] = crit[i] > 0 ? then_[i] : else_[i]", + "result[result_i] = crit[crit_i] > 0 ? 
then_[then__i] : else_[else__i]", "if_positive") @@ -650,5 +1090,5 @@ def get_scalar_op_kernel(dtype_x, dtype_y, operator): "tp_y": dtype_to_ctype(dtype_y), "tp_a": dtype_to_ctype(dtype_x), }, - "y[i] = x[i] %s a" % operator, + "y[y_i] = x[x_i] %s a" % operator, "scalarop_kernel") diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 10c687d0..2a9e4ba9 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -118,23 +118,15 @@ def splay(n, dev=None): def _make_binary_op(operator): def func(self, other): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - if isinstance(other, GPUArray): assert self.shape == other.shape - if not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - result = self._new_like_me() func = elementwise.get_binary_op_kernel( self.dtype, other.dtype, result.dtype, operator) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other.gpudata, result.gpudata, + self, other, result, self.mem_size) return result @@ -143,7 +135,7 @@ def func(self, other): func = elementwise.get_scalar_op_kernel( self.dtype, result.dtype, operator) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other, result.gpudata, + self, other, result, self.mem_size) return result @@ -246,13 +238,14 @@ def set_async(self, ary, stream=None): return self.set(ary, async=True, stream=stream) def get(self, ary=None, pagelocked=False, async=False, stream=None): + flipper = None if ary is None: if pagelocked: ary = drv.pagelocked_empty(self.shape, self.dtype) else: ary = np.empty(self.shape, self.dtype) - strides = _compact_strides(self) + self, strides, flipper = _compact_positive_strides(self) ary = _as_strided(ary, strides=strides) else: if self.size != ary.size: @@ -269,13 +262,15 @@ def get(self, ary=None, pagelocked=False, async=False, stream=None): if self.size: _memcpy_discontig(ary, self, async=async, stream=stream) + if flipper is not None: + ary = ary[flipper] return ary def get_async(self, stream=None, ary=None): return self.get(ary=ary, async=True, stream=stream) - def copy(self): - new = GPUArray(self.shape, self.dtype, self.allocator) + def copy(self, order="K"): + new = self._new_like_me(order=order) _memcpy_discontig(new, self) return new @@ -297,47 +292,36 @@ def _axpbyz(self, selffac, other, otherfac, out, add_timer=None, stream=None): """Compute ``out = selffac * self + otherfac*other``, where `other` is a vector..""" assert self.shape == other.shape - if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") func = elementwise.get_axpbyz_kernel(self.dtype, other.dtype, out.dtype) if add_timer is not None: add_timer(3*self.size, func.prepared_timed_call(self._grid, - selffac, self.gpudata, otherfac, other.gpudata, - out.gpudata, self.mem_size)) + selffac, self, otherfac, other, + out, self.mem_size)) else: func.prepared_async_call(self._grid, self._block, stream, - selffac, self.gpudata, otherfac, other.gpudata, - out.gpudata, self.mem_size) + selffac, self, otherfac, other, + out, self.mem_size) return out def _axpbz(self, selffac, other, out, stream=None): """Compute ``out = selffac * self + other``, where `other` is a scalar.""" - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - func = elementwise.get_axpbz_kernel(self.dtype, out.dtype) 
func.prepared_async_call(self._grid, self._block, stream, - selffac, self.gpudata, - other, out.gpudata, self.mem_size) + selffac, self, + other, out, self.mem_size) return out def _elwise_multiply(self, other, out, stream=None): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, out.dtype, "*") func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, other.gpudata, - out.gpudata, self.mem_size) + self, other, + out, self.mem_size) return out @@ -347,43 +331,72 @@ def _rdiv_scalar(self, other, out, stream=None): y = n / self """ - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - func = elementwise.get_rdivide_elwise_kernel(self.dtype, out.dtype) func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, other, - out.gpudata, self.mem_size) + self, other, + out, self.mem_size) return out def _div(self, other, out, stream=None): """Divides an array by another array.""" - if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - assert self.shape == other.shape func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, out.dtype, "/") func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, other.gpudata, - out.gpudata, self.mem_size) + self, other, + out, self.mem_size) return out - def _new_like_me(self, dtype=None, order="C"): - strides = None + def _new_like_me(self, dtype=None, order="K"): + flipper, selflist = _flip_negative_strides((self,)) + self = selflist[0] + ndim = self.ndim + shape = self.shape + mystrides = self.strides + mydtype = self.dtype + myitemsize = mydtype.itemsize if dtype is None: - dtype = self.dtype - if dtype == self.dtype: - strides = self.strides - - return self.__class__(self.shape, dtype, - allocator=self.allocator, strides=strides, order=order) + dtype = mydtype + else: + dtype = np.dtype(dtype) + itemsize = dtype.itemsize + if order == "K": + if self.flags.c_contiguous: + order = "C" + elif self.flags.f_contiguous: + order = "F" + if order == "C": + newstrides = _c_contiguous_strides(itemsize, shape) + elif order == "F": + newstrides = _f_contiguous_strides(itemsize, shape) + else: + maxstride = mystrides[0] + maxstrideind = 0 + for i in range(1, ndim): + curstride = mystrides[i] + if curstride > maxstride: + maxstrideind = i + maxstride = curstride + mymaxoffset = (maxstride / myitemsize) * shape[maxstrideind] + if mymaxoffset <= self.size: + # it's probably safe to just allocate and pass strides + # XXX (do we need to calculate full offset for [-1,-1,-1...]?) 
+ newstrides = tuple((x // myitemsize) * itemsize for x in mystrides) + else: + # just punt and choose something close + if ndim > 1 and maxstrideind == 0: + newstrides = _c_contiguous_strides(itemsize, shape) + else: + newstrides = _f_contiguous_strides(itemsize, shape) + retval = self.__class__(shape, dtype, + allocator=self.allocator, strides=newstrides) + if flipper is not None: + retval = retval[flipper] + return retval # operators --------------------------------------------------------------- def mul_add(self, selffac, other, otherfac, add_timer=None, stream=None): @@ -514,7 +527,7 @@ def fill(self, value, stream=None): """fills the array with the specified value""" func = elementwise.get_fill_kernel(self.dtype) func.prepared_async_call(self._grid, self._block, stream, - value, self.gpudata, self.mem_size) + value, self, self.mem_size) return self @@ -600,7 +613,7 @@ def __abs__(self): out_dtype=out_dtype) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, self.mem_size) + self, result, self.mem_size) return result @@ -611,10 +624,6 @@ def _pow(self, other, new): """ if isinstance(other, GPUArray): - if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - assert self.shape == other.shape if new: @@ -626,22 +635,18 @@ def _pow(self, other, new): self.dtype, other.dtype, result.dtype) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other.gpudata, result.gpudata, + self, other, result, self.mem_size) return result else: - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - if new: result = self._new_like_me() else: result = self func = elementwise.get_pow_kernel(self.dtype) func.prepared_async_call(self._grid, self._block, None, - other, self.gpudata, result.gpudata, + other, self, result, self.mem_size) return result @@ -673,24 +678,16 @@ def reverse(self, stream=None): as one-dimensional. 
""" - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - result = self._new_like_me() func = elementwise.get_reverse_kernel(self.dtype) func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, result.gpudata, + self, result, self.mem_size) return result def astype(self, dtype, stream=None): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - if dtype == self.dtype: return self.copy() @@ -698,7 +695,7 @@ def astype(self, dtype, stream=None): func = elementwise.get_copy_kernel(dtype, self.dtype) func.prepared_async_call(self._grid, self._block, stream, - result.gpudata, self.gpudata, + result, self, self.mem_size) return result @@ -710,9 +707,6 @@ def reshape(self, *shape, **kwargs): order = kwargs.pop("order", "C") # TODO: add more error-checking, perhaps - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") if isinstance(shape[0], tuple) or isinstance(shape[0], list): shape = tuple(shape[0]) @@ -918,7 +912,11 @@ def __getitem__(self, index): strides=tuple(new_strides)) def __setitem__(self, index, value): - _memcpy_discontig(self[index], value) + if isinstance(value, GPUArray) or isinstance(value, np.ndarray): + return _memcpy_discontig(self[index], value) + + # Let's assume it's a scalar + self[index].fill(value) # }}} @@ -930,15 +928,11 @@ def real(self): if issubclass(dtype.type, np.complexfloating): from pytools import match_precision real_dtype = match_precision(np.dtype(np.float64), dtype) - if self.flags.f_contiguous: - order = "F" - else: - order = "C" - result = self._new_like_me(dtype=real_dtype, order=order) + result = self._new_like_me(dtype=real_dtype) func = elementwise.get_real_kernel(dtype, real_dtype) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, + self, result, self.mem_size) return result @@ -949,21 +943,13 @@ def real(self): def imag(self): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - from pytools import match_precision real_dtype = match_precision(np.dtype(np.float64), dtype) - if self.flags.f_contiguous: - order = "F" - else: - order = "C" - result = self._new_like_me(dtype=real_dtype, order=order) + result = self._new_like_me(dtype=real_dtype) func = elementwise.get_imag_kernel(dtype, real_dtype) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, + self, result, self.mem_size) return result @@ -973,19 +959,11 @@ def imag(self): def conj(self): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): - if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - - if self.flags.f_contiguous: - order = "F" - else: - order = "C" - result = self._new_like_me(order=order) + result = self._new_like_me() func = elementwise.get_conj_kernel(dtype) func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, + self, result, self.mem_size) return result @@ -1012,15 +990,21 @@ def conj(self): def to_gpu(ary, allocator=drv.mem_alloc): """converts a numpy array to a GPUArray""" - result = GPUArray(ary.shape, ary.dtype, allocator, strides=_compact_strides(ary)) + ary, newstrides, flipper = _compact_positive_strides(ary) + result = 
GPUArray(ary.shape, ary.dtype, allocator, strides=newstrides) result.set(ary) + if flipper is not None: + result = result[flipper] return result def to_gpu_async(ary, allocator=drv.mem_alloc, stream=None): """converts a numpy array to a GPUArray""" - result = GPUArray(ary.shape, ary.dtype, allocator, strides=_compact_strides(ary)) + ary, newstrides, flipper = _compact_positive_strides(ary) + result = GPUArray(ary.shape, ary.dtype, allocator, strides=newstrides) result.set_async(ary, stream) + if flipper: + result = result[flipper] return result @@ -1050,10 +1034,10 @@ def _array_like_helper(other_ary, dtype, order): order = "F" else: # array_like routines only return positive strides - strides = [np.abs(s) for s in other_ary.strides] + _, strides, _ = _compact_positive_strides(other_ary) if dtype is not None and dtype != other_ary.dtype: # scale strides by itemsize when dtype is not the same - itemsize = other_ary.nbytes // other_ary.size + itemsize = other_ary.dtype.itemsize itemsize_ratio = np.dtype(dtype).itemsize / itemsize strides = [int(s*itemsize_ratio) for s in strides] elif order not in ["C", "F"]: @@ -1173,15 +1157,54 @@ class Info(Record): func = elementwise.get_arange_kernel(dtype) func.prepared_async_call(result._grid, result._block, kwargs.get("stream"), - result.gpudata, start, step, size) + result, start, step, size) return result # }}} -def _compact_strides(a): - # Compute strides to have same order as self, but packed +def _flip_negative_strides(arrays): + # If arrays have negative strides, flip them. Return a list + # ``(flipper, arrays)`` where ``flipper`` is a tuple of slice + # objects that can used to unflip the returned (flipped) arrays + # (or ``None`` if there was no flipping). + # NOTE: Every input array A must have the same value for the following + # expression: np.sign(A.strides) + # NOTE: ``flipper`` is its own inverse, so ``A[flipper][flipper] == A`` + if isinstance(arrays, GPUArray): + raise TypeError("_flip_negative_strides expects a list of GPUArrays") + if not any(any(s < 0 for s in x.strides) for x in arrays): + return None, arrays + strides = np.vstack(x.strides for x in arrays) + shapes = np.vstack(x.shape for x in arrays) + signs = np.sign(strides) + max_signs = np.max(signs, axis=0) + min_signs = np.min(signs, axis=0) + # Stride signs are -1 or 1, or 0 if stride is 0. + # Zero strides don't matter, but fail if there are both -1 and 1. + if _builtin_min(min_signs) != -1: + # no flips needed + return None, arrays + flips = (min_signs == -1)[None, :] + if any(max_signs * min_signs == -1): + raise ValueError("found differing signs in strides (strides for all arrays: %s)" % ([x.strides for x in arrays],)) + offsets = np.sum(strides * shapes * flips, axis=1) + strides = strides * (flips * -1) + sliceboth = (slice(None), slice(None, None, -1)) + flipper = tuple(sliceboth[flip] for flip in flips[0]) + arrays = [ + arr[flipper] + for arr in arrays + ] + return flipper, arrays + +def _compact_positive_strides(a): + # Flip ``a``'s axes if there are any negative strides, then compute + # strides to have same order as a, but packed. Return flipped ``a`` + # and packed strides. 
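``_flip_negative_strides`` relies on two properties spelled out in its comments: the flipper has one ``slice(None, None, -1)`` per negatively-strided axis, and it is its own inverse. Both can be checked with plain NumPy; the sketch below is illustrative only and uses none of the patch's code:

import numpy as np

a = np.arange(24).reshape(4, 6)[::-1, ::2]    # stride signs: (-, +)
flips = [s < 0 for s in a.strides]
flipper = tuple(slice(None, None, -1) if f else slice(None)
                for f in flips)

flipped = a[flipper]                        # all strides now positive
assert all(s > 0 for s in flipped.strides)
assert np.array_equal(flipped[flipper], a)  # flipper is its own inverse

``_compact_positive_strides`` then packs the (now positive) strides of the flipped array and returns the flipper so callers such as ``to_gpu`` can restore the original orientation afterwards.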
+ flipper, alist = _flip_negative_strides((a,)) + a = alist[0] info = sorted( (a.strides[axis], a.shape[axis], axis) for axis in range(len(a.shape))) @@ -1191,13 +1214,33 @@ def _compact_strides(a): for _, dim, axis in info: strides[axis] = stride stride *= dim - return strides - + return a, strides, flipper + +def _memcpy_discontig_slow(dst, src): + # This is a generic memcpy using a no-op "assignment" kernel. + # NOTE: no need to use this if both src and dst are on host. + func = elementwise.get_unary_func_kernel("", src.dtype) + + # if one array is on host, need to make an intermediate device array + src_gpu = src + dst_gpu = dst + if isinstance(src, np.ndarray): + dtype, order, strides = _array_like_helper(src, None, "K") + src_gpu = GPUArray(src.shape, dtype, order=order, strides=strides) + src_gpu.set(src) + if isinstance(dst, np.ndarray): + dtype, order, strides = _array_like_helper(dst, None, "K") + dst_gpu = GPUArray(dst.shape, dtype, order=order, strides=strides) + func.prepared_async_call(src_gpu._grid, src_gpu._block, None, + src_gpu, dst_gpu, src_gpu.mem_size) + if dst is not dst_gpu: + dst_gpu.get(dst) + return def _memcpy_discontig(dst, src, async=False, stream=None): """Copy the contents of src into dst. - The two arrays should have the same dtype, shape, and order, but + The two arrays should have the same dtype, and shape, but not necessarily the same strides. There may be up to _two_ axes along which either `src` or `dst` is not contiguous. """ @@ -1216,7 +1259,26 @@ def _memcpy_discontig(dst, src, async=False, stream=None): dst[...] = src return - if src.flags.forc and dst.flags.forc: + # check for stride tricks (like stride == 0 with shape > 1) + shape = src.shape + doslow = (any(sh > 1 for sh, st in zip(shape, src.strides) if st == 0) + or + any(sh > 1 for sh, st in zip(shape, dst.strides) if st == 0)) + + try: + src, dst = _flip_negative_strides((src, dst))[1] + except ValueError: + # probably different signs in strides -- will use slow version below + doslow = True + pass + + if doslow or any(np.argsort(src.strides) != np.argsort(dst.strides)): + # order is not the same, use generic slow version + _memcpy_discontig_slow(dst, src) + return + + if ((src.flags.f_contiguous and dst.flags.f_contiguous) or + (src.flags.c_contiguous and dst.flags.c_contiguous)): shape = [src.size] src_strides = dst_strides = [src.dtype.itemsize] else: @@ -1238,11 +1300,8 @@ def _memcpy_discontig(dst, src, async=False, stream=None): axes[0:0] = [np.newaxis] # collapse contiguous dimensions - # and check that dst is in same order as src i = 1 while i < len(shape): - if dst_strides[i] < dst_strides[i-1]: - raise ValueError("src and dst must have same order") if (src_strides[i-1] * shape[i-1] == src_strides[i] and dst_strides[i-1] * shape[i-1] == dst_strides[i]): shape[i-1:i+1] = [shape[i-1] * shape[i]] @@ -1282,7 +1341,9 @@ def _memcpy_discontig(dst, src, async=False, stream=None): elif len(shape) == 3: copy = drv.Memcpy3D() else: - raise ValueError("more than 2 discontiguous axes not supported %s" % (tuple(sorted(axes)),)) + # can't use MemcpyXD, use generic slow version + _memcpy_discontig_slow(dst, src) + return if isinstance(src, GPUArray): copy.set_src_device(src.gpudata) @@ -1344,7 +1405,7 @@ def take(a, indices, out=None, stream=None): a.bind_to_texref_ext(tex_src[0], allow_double_hack=True, allow_complex_hack=True) func.prepared_async_call(out._grid, out._block, stream, - indices.gpudata, out.gpudata, indices.size) + indices, out, indices.size) return out @@ -1386,8 +1447,8 @@ def 
make_func_for_chunk_size(chunk_size): a.bind_to_texref_ext(tex_src[i], allow_double_hack=True) func.prepared_async_call(indices._grid, indices._block, stream, - indices.gpudata, - *([o.gpudata for o in out[chunk_slice]] + indices, + *([o for o in out[chunk_slice]] + [indices.size])) return out @@ -1451,8 +1512,8 @@ def make_func_for_chunk_size(chunk_size): a.bind_to_texref_ext(src_tr, allow_double_hack=True) func.prepared_async_call(src_indices._grid, src_indices._block, stream, - dest_indices.gpudata, src_indices.gpudata, - *([o.gpudata for o in out[chunk_slice]] + dest_indices, src_indices, + *([o for o in out[chunk_slice]] + src_offsets_list[chunk_slice] + [src_indices.size])) @@ -1496,9 +1557,9 @@ def make_func_for_chunk_size(chunk_size): func = make_func_for_chunk_size(vec_count-start_i) func.prepared_async_call(dest_indices._grid, dest_indices._block, stream, - dest_indices.gpudata, - *([o.gpudata for o in out[chunk_slice]] - + [i.gpudata for i in arrays[chunk_slice]] + dest_indices, + *([o for o in out[chunk_slice]] + + [i for i in arrays[chunk_slice]] + [dest_indices.size])) return out @@ -1550,7 +1611,7 @@ def if_positive(criterion, then_, else_, out=None, stream=None): out = empty_like(then_) func.prepared_async_call(criterion._grid, criterion._block, stream, - criterion.gpudata, then_.gpudata, else_.gpudata, out.gpudata, + criterion, then_, else_, out, criterion.size) return out @@ -1565,14 +1626,14 @@ def f(a, b, out=None, stream=None): a.dtype, b.dtype, out.dtype, use_scalar=False) func.prepared_async_call(a._grid, a._block, stream, - a.gpudata, b.gpudata, out.gpudata, a.size) + a, b, out, a.size) elif isinstance(a, GPUArray): if out is None: out = empty_like(a) func = elementwise.get_binary_minmax_kernel(which, a.dtype, a.dtype, out.dtype, use_scalar=True) func.prepared_async_call(a._grid, a._block, stream, - a.gpudata, b, out.gpudata, a.size) + a, b, out, a.size) else: # assuming b is a GPUArray if out is None: out = empty_like(b) @@ -1580,7 +1641,7 @@ def f(a, b, out=None, stream=None): b.dtype, b.dtype, out.dtype, use_scalar=True) # NOTE: we switch the order of a and b here! func.prepared_async_call(b._grid, b._block, stream, - b.gpudata, a, out.gpudata, b.size) + b, a, out, b.size) return out return f diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 8a2488de..fa08f5a1 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -17,6 +17,123 @@ +namespace precalc { + namespace py = boost::python; + py::tuple _precalc_array_info(const py::object & args, const py::object & elwisearginds) { + bool contigmatch = true; + bool arrayspecificinds = true; + std::vector shape; + py::object shape_obj; + char order = 'N'; + int numarrays = py::len(elwisearginds); + PyObject * arrayitemstrides = PyTuple_New(numarrays); + + for (int aind = 0; aind < numarrays; aind++) { + int elwiseargind = py::extract(elwisearginds[aind]); + // is a GPUArray/DeviceAllocation + //py::object arg = py::object(args[i]); + py::object arg(args[elwiseargind]); + if (!arrayspecificinds) { + continue; + } + py::object curshape_obj; + py::object curstrides_obj; + py::object itemsize_obj; + int itemsize; + int ndim = 0; + try { + curshape_obj = arg.attr("shape"); + curstrides_obj = arg.attr("strides"); + itemsize_obj = arg.attr("itemsize"); + itemsize = py::extract(itemsize_obj); + ndim = py::extract(arg.attr("ndim")); + } catch (...) 
{ + // At least one array argument is probably sent as a + // GPUArray.gpudata rather than the GPUArray itself, + // so disable array-specific indices -- caller is on + // their own. + arrayspecificinds = false; + continue; + } + if (ndim == 0) { + // probably a scalar + continue; + } + std::vector curshape(ndim); + std::vector curstrides(ndim); + for (int i = 0; i < ndim; i++) { + int tmp = py::extract(curshape_obj[i]); + curshape[i] = tmp; + if (tmp > 1) { + curstrides[i] = py::extract(curstrides_obj[i]); + } else { + curstrides[i] = 0; + } + } + PyObject * newstrides = PyTuple_New(ndim); + for (int i = 0; i < ndim; i++) { + // PyTuple_SetItem steals the reference of the new PyInt + PyTuple_SetItem(newstrides, i, PyInt_FromLong(curstrides[i])); + } + PyObject * sizestrides = PyTuple_New(2); + PyTuple_SetItem(sizestrides, 0, py::incref(itemsize_obj.ptr())); + // PyTuple_SetItem steals the reference of newstrides + PyTuple_SetItem(sizestrides, 1, newstrides); + PyTuple_SetItem(arrayitemstrides, aind, sizestrides); + if (contigmatch) { + char curorder = 'N'; + int tmpaccum = itemsize; + bool is_f = (curstrides[0] == tmpaccum); + tmpaccum *= curshape[0]; + for (int i = 1; i < ndim; i++) { + if (curshape[i] == 1) { + continue; + } + if (curstrides[i] != tmpaccum) { + is_f = false; + break; + } + tmpaccum *= curshape[i]; + } + tmpaccum = itemsize; + bool is_c = (curstrides[ndim - 1] == tmpaccum); + tmpaccum *= curshape[ndim - 1]; + for (int i = ndim - 2; i >= 0; i--) { + if (curshape[i] == 1) { + continue; + } + if (curstrides[i] != tmpaccum) { + is_c = false; + break; + } + tmpaccum *= curshape[i]; + } + if (is_f) { + curorder = 'F'; + } + if (is_c) { + curorder = 'C'; + } + if (curorder == 'N') { + contigmatch = false; + } + if (shape.size() == 0) { + shape = curshape; + shape_obj = curshape_obj; + order = curorder; + } else if (order != curorder) { + contigmatch = false; + } + } + if (shape != curshape) { + PyErr_SetString(PyExc_RuntimeError, "All input arrays to elementwise kernels must have the same shape, or you must specify the arrays to be traversed element-wise explicitly with elwise_arg_inds"); + py::throw_error_already_set(); + } + } + + return py::make_tuple(contigmatch, arrayspecificinds, shape_obj, py::object(py::handle<>(arrayitemstrides))); + } +} using namespace pycuda; using boost::shared_ptr; @@ -1657,6 +1774,11 @@ BOOST_PYTHON_MODULE(_driver) #ifdef HAVE_CURAND pycuda_expose_curand(); #endif + + { + using namespace precalc; + DEF_SIMPLE_FUNCTION_WITH_ARGS(_precalc_array_info, ("args", "elwisearginds")); + } } // vim: foldmethod=marker diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index e38e2fda..f00a4e66 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -1082,6 +1082,12 @@ def test_copy(self): for start, stop, step in [(0,3,1), (1,2,1), (0,3,3)]: assert np.allclose(a_gpu[start:stop:step,:,start:stop:step].get(), a_gpu.get()[start:stop:step,:,start:stop:step]) + # try 3 axes discontiguous + a_gpu = curand((10,10,10,10)) + a = a_gpu.get() + for start, stop, step in [(0,3,1), (1,2,1), (0,3,3)]: + assert np.allclose(a_gpu[start:stop:step,start:stop:step,start:stop:step].get(), a_gpu.get()[start:stop:step,start:stop:step,start:stop:step]) + @mark_cuda_test def test_get_set(self): import pycuda.gpuarray as gpuarray @@ -1145,6 +1151,121 @@ def test_zeros_like_etc(self): assert new_z.dtype == np.complex64 assert new_z.shape == arr.shape + @mark_cuda_test + def test_noncontig_get(self): + from pycuda.curandom import rand as curand + a_gpu = 
curand((200,200)) + a = a_gpu.get() + + ret_a = a[2::3, 1::4] + + ret_a_gpu = (a_gpu[2::3, 1::4]).get() + + assert np.allclose(ret_a, ret_a_gpu) + + @mark_cuda_test + def test_noncontig_get2(self): + from pycuda.curandom import rand as curand + a_gpu = curand((200,200)) + a = a_gpu.get() + + ret_a = a[2::3,::-1] + + ret_a_gpu = (a_gpu[2::3,::-1]).get() + + assert np.allclose(ret_a, ret_a_gpu) + + @mark_cuda_test + def test_noncontig_set(self): + import pycuda.gpuarray as gpuarray + + # test different orders, transpositions, offsets, and sizes of both + # src (host) and dst (device) + for host_order in ('C', 'F'): + for dev_order in ('C', 'F'): + for transpose in ((0,1,2), (0,2,1), (1,2,0), (1,0,2), (2,0,1), (2,1,0)): + for start, stop, step in [(0, 50, 2), (1, 50, 2), (49, None, -2), (48, None, -2)]: + print("host_order=%s dev_order=%s transpose=%s start=%s stop=%s step=%s" % (host_order, dev_order, transpose, start, stop, step)) + a = np.array(np.random.normal(0., 1., (25,25,25)), order=host_order) + a_gpu = gpuarray.zeros((50, 50, 50), np.float64, order=dev_order) + a_gpu_sub = a_gpu[start:stop:step,start:stop:step,start:stop:step] + a_gpu_sub.set(a) + assert np.allclose(a_gpu_sub.get(), a) + assert np.allclose(a_gpu.get()[start:stop:step,start:stop:step,start:stop:step], a) + + + @mark_cuda_test + def test_noncontig_unary(self): + from pycuda.curandom import rand as curand + a_gpu = curand((200,200))[1::4, ::-2] + a = a_gpu.get() + + ret_a = a ** 2 + + ret_a_gpu = (a_gpu ** 2).get() + + assert np.allclose(ret_a, ret_a_gpu) + + @mark_cuda_test + def test_noncontig_different_strides(self): + x = 200; y = 200 + + a_gpu = gpuarray.arange(0, x*y, 1, dtype=np.float32).reshape((x, y), order='C') + a = a_gpu.get() + assert a_gpu.strides[0] > a_gpu.strides[1], "a_gpu not C-order" + + b_gpu = gpuarray.zeros_like(a_gpu, order='F') + b_gpu += a_gpu + b = b_gpu.get() + assert b_gpu.strides[1] > b_gpu.strides[0], "b_gpu not F-order" + + assert np.allclose(a, b) + + @mark_cuda_test + def test_noncontig_stride_tricky(self): + from pycuda.curandom import rand as curand + a_gpu = curand((200,200)) + a = a_gpu.get() + + a_gpu = a_gpu[1::4, ::-2] + a = a[1::4, ::-2] + + a_gpu = gpuarray.GPUArray(a_gpu.shape, a_gpu.dtype, + base=a_gpu, + gpudata=a_gpu.gpudata, + strides=(0,0)) + a.strides = (0, 0) + + ret_a = a + + ret_a_gpu = a_gpu.get() + + assert np.allclose(ret_a, ret_a_gpu) + + @mark_cuda_test + def test_noncontig_stride_tricky2(self): + # sliding window using tricks + from numpy.lib.stride_tricks import as_strided + from pycuda.curandom import rand as curand + a_gpu = curand((200000)) + a = a_gpu.get() + + a_gpu = gpuarray.GPUArray((199998, 3), a_gpu.dtype, + base=a_gpu, + gpudata=a_gpu.gpudata, + strides=(a_gpu.itemsize, a_gpu.itemsize)) + a = as_strided(a, shape=(199998, 3), strides=(a.itemsize, a.itemsize)) + + ret_a = a + ret_a_gpu = a_gpu.get() + + assert np.allclose(ret_a, ret_a_gpu) + + ret_a = a + 2 + ret_a_gpu = (a_gpu + 2).get() + + assert np.allclose(ret_a, ret_a_gpu) + if __name__ == "__main__": # make sure that import failures get reported, instead of skipping the tests.
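For reference, the per-argument order detection that the new ``_precalc_array_info`` wrapper helper performs in C++ can be mirrored in a few lines of Python: axes of length 1 are skipped, and 'C' takes precedence when an array is both C- and F-contiguous. This is a Python sketch for clarity under those assumptions, not code from the patch:

import numpy as np

def detect_order(shape, strides, itemsize):
    # Return 'C', 'F', or 'N' (neither) for a given shape/strides pair,
    # ignoring axes of length 1, as the wrapper's loops do.
    def packed_along(axes):
        accum = itemsize
        for ax in axes:
            if shape[ax] == 1:
                continue
            if strides[ax] != accum:
                return False
            accum *= shape[ax]
        return True

    ndim = len(shape)
    if packed_along(reversed(range(ndim))):  # last axis tightest: C order
        return "C"
    if packed_along(range(ndim)):            # first axis tightest: F order
        return "F"
    return "N"

a = np.zeros((3, 1, 4))
assert detect_order(a.shape, a.strides, a.itemsize) == "C"
assert detect_order(a.T.shape, a.T.strides, a.itemsize) == "F"
b = a[:, :, ::2]                             # strided view: neither order
assert detect_order(b.shape, b.strides, a.itemsize) == "N"

In the wrapper itself, ``contigmatch`` records whether every array argument shares one contiguous order, and the per-array ``(itemsize, strides)`` tuples are returned alongside it for the cases where they do not.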