diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 2b02c4ed93a0d..9f7779c6fee8c 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -13,6 +13,7 @@ users upgrade to this version. Highlights include: + - Release the Global Interpreter Lock (GIL) on some cython operations, see :ref:`here <whatsnew_0170.gil>` Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. @@ -56,8 +57,32 @@ Deprecations Removal of prior version deprecations/changes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. _dask: https://dask.readthedocs.org/en/latest/ + +.. _whatsnew_0170.gil: + +Releasing the GIL +~~~~~~~~~~~~~~~~~ + +We are releasing the global-interpreter-lock (GIL) on some cython operations. +This will allow other threads to run simultaneously during computation, potentially allowing performance improvements +from multi-threading. Notably ``groupby`` and some indexing operations benefit from this. (:issue:`8882`) + +For example the groupby expression in the following code will have the GIL released during the factorization step, e.g. ``df.groupby('key')`` +as well as the ``.sum()`` operation. + +.. code-block:: python + + N = 1e6 + df = DataFrame({'key' : np.random.randint(0,100,size=N), + 'data' : np.random.randn(N) }) + df.groupby('key')['data'].sum() + +Releasing the GIL could benefit an application that uses threads for user interactions (e.g. ``QT``), or performing multi-threaded computations. A nice example of a library that can handle these types of computation-in-parallel is the dask_ library. + .. 
_whatsnew_0170.performance: + Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Added vbench benchmarks for alternative ExcelWriter engines and reading Excel files (:issue:`7171`) diff --git a/pandas/core/common.py b/pandas/core/common.py index b9866a414f058..62721587e0828 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -839,7 +839,6 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info) - indexer = _ensure_int64(indexer) func(arr, indexer, out, fill_value) diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index c4cd788216018..3b3ea9fa032f8 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -1,14 +1,19 @@ +# cython: profile=False + from cpython cimport PyObject, Py_INCREF, PyList_Check, PyTuple_Check from khash cimport * from numpy cimport * +from cpython cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free from util cimport _checknan cimport util import numpy as np +nan = np.nan -ONAN = np.nan +cdef extern from "numpy/npy_math.h": + double NAN "NPY_NAN" cimport cython cimport numpy as cnp @@ -28,33 +33,14 @@ PyDateTime_IMPORT cdef extern from "Python.h": int PySlice_Check(object) - -def list_to_object_array(list obj): - ''' - Convert list to object ndarray. 
Seriously can't believe I had to write this - function - ''' - cdef: - Py_ssize_t i, n - ndarray[object] arr - - n = len(obj) - arr = np.empty(n, dtype=object) - - for i from 0 <= i < n: - arr[i] = obj[i] - - return arr - - cdef size_t _INIT_VEC_CAP = 32 cdef class ObjectVector: cdef: + PyObject **data size_t n, m ndarray ao - PyObject **data def __cinit__(self): self.n = 0 @@ -65,11 +51,6 @@ cdef class ObjectVector: def __len__(self): return self.n - def to_array(self): - self.ao.resize(self.n) - self.m = self.n - return self.ao - cdef inline append(self, object o): if self.n == self.m: self.m = max(self.m * 2, _INIT_VEC_CAP) @@ -80,72 +61,120 @@ cdef class ObjectVector: self.data[self.n] = o self.n += 1 + def to_array(self): + self.ao.resize(self.n) + self.m = self.n + return self.ao + +ctypedef struct Int64VectorData: + int64_t *data + size_t n, m + +ctypedef struct Float64VectorData: + float64_t *data + size_t n, m + +ctypedef fused vector_data: + Int64VectorData + Float64VectorData + +ctypedef fused sixty_four_bit_scalar: + int64_t + float64_t + +cdef bint needs_resize(vector_data *data) nogil: + return data.n == data.m + +cdef void append_data(vector_data *data, sixty_four_bit_scalar x) nogil: + + # compile time specilization of the fused types + # as the cross-product is generated, but we cannot assign float->int + # the types that don't pass are pruned + if (vector_data is Int64VectorData and sixty_four_bit_scalar is int64_t) or ( + vector_data is Float64VectorData and sixty_four_bit_scalar is float64_t): + + data.data[data.n] = x + data.n += 1 cdef class Int64Vector: cdef: - size_t n, m + Int64VectorData *data ndarray ao - int64_t *data def __cinit__(self): - self.n = 0 - self.m = _INIT_VEC_CAP - self.ao = np.empty(_INIT_VEC_CAP, dtype=np.int64) - self.data = self.ao.data + self.data = PyMem_Malloc(sizeof(Int64VectorData)) + if not self.data: + raise MemoryError() + self.data.n = 0 + self.data.m = _INIT_VEC_CAP + self.ao = np.empty(self.data.m, 
dtype=np.int64) + self.data.data = self.ao.data + + cdef resize(self): + self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) + self.ao.resize(self.data.m) + self.data.data = self.ao.data + + def __dealloc__(self): + PyMem_Free(self.data) def __len__(self): - return self.n + return self.data.n def to_array(self): - self.ao.resize(self.n) - self.m = self.n + self.ao.resize(self.data.n) + self.data.m = self.data.n return self.ao - cdef inline append(self, int64_t x): - if self.n == self.m: - self.m = max(self.m * 2, _INIT_VEC_CAP) - self.ao.resize(self.m) - self.data = self.ao.data + cdef inline void append(self, int64_t x): - self.data[self.n] = x - self.n += 1 + if needs_resize(self.data): + self.resize() + + append_data(self.data, x) cdef class Float64Vector: cdef: - size_t n, m + Float64VectorData *data ndarray ao - float64_t *data def __cinit__(self): - self.n = 0 - self.m = _INIT_VEC_CAP - self.ao = np.empty(_INIT_VEC_CAP, dtype=np.float64) - self.data = self.ao.data + self.data = PyMem_Malloc(sizeof(Float64VectorData)) + if not self.data: + raise MemoryError() + self.data.n = 0 + self.data.m = _INIT_VEC_CAP + self.ao = np.empty(self.data.m, dtype=np.float64) + self.data.data = self.ao.data + + cdef resize(self): + self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) + self.ao.resize(self.data.m) + self.data.data = self.ao.data + + def __dealloc__(self): + PyMem_Free(self.data) def __len__(self): - return self.n + return self.data.n def to_array(self): - self.ao.resize(self.n) - self.m = self.n + self.ao.resize(self.data.n) + self.data.m = self.data.n return self.ao - cdef inline append(self, float64_t x): - if self.n == self.m: - self.m = max(self.m * 2, _INIT_VEC_CAP) - self.ao.resize(self.m) - self.data = self.ao.data + cdef inline void append(self, float64_t x): - self.data[self.n] = x - self.n += 1 + if needs_resize(self.data): + self.resize() + append_data(self.data, x) cdef class HashTable: pass - cdef class StringHashTable(HashTable): cdef kh_str_t *table @@ 
-157,9 +186,6 @@ cdef class StringHashTable(HashTable): def __dealloc__(self): kh_destroy_str(self.table) - cdef inline int check_type(self, object val): - return util.is_string_object(val) - cpdef get_item(self, object val): cdef khiter_t k k = kh_get_str(self.table, util.get_c_string(val)) @@ -256,111 +282,16 @@ cdef class StringHashTable(HashTable): return reverse, labels -cdef class Int32HashTable(HashTable): - cdef kh_int32_t *table - - def __init__(self, size_hint=1): - if size_hint is not None: - kh_resize_int32(self.table, size_hint) - - def __cinit__(self): - self.table = kh_init_int32() - - def __dealloc__(self): - kh_destroy_int32(self.table) - - cdef inline int check_type(self, object val): - return util.is_string_object(val) - - cpdef get_item(self, int32_t val): - cdef khiter_t k - k = kh_get_int32(self.table, val) - if k != self.table.n_buckets: - return self.table.vals[k] - else: - raise KeyError(val) - - def get_iter_test(self, int32_t key, Py_ssize_t iterations): - cdef Py_ssize_t i, val=0 - for i in range(iterations): - k = kh_get_int32(self.table, val) - if k != self.table.n_buckets: - val = self.table.vals[k] - - cpdef set_item(self, int32_t key, Py_ssize_t val): - cdef: - khiter_t k - int ret = 0 - - k = kh_put_int32(self.table, key, &ret) - self.table.keys[k] = key - if kh_exist_int32(self.table, k): - self.table.vals[k] = val - else: - raise KeyError(key) - - def map_locations(self, ndarray[int32_t] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - int32_t val - khiter_t k - - for i in range(n): - val = values[i] - k = kh_put_int32(self.table, val, &ret) - self.table.vals[k] = i - - def lookup(self, ndarray[int32_t] values): - cdef: - Py_ssize_t i, n = len(values) - int32_t val - khiter_t k - ndarray[int32_t] locs = np.empty(n, dtype=np.int64) - - for i in range(n): - val = values[i] - k = kh_get_int32(self.table, val) - if k != self.table.n_buckets: - locs[i] = self.table.vals[k] - else: - locs[i] = -1 - - return locs - - def 
factorize(self, ndarray[int32_t] values): - cdef: - Py_ssize_t i, n = len(values) - ndarray[int64_t] labels = np.empty(n, dtype=np.int64) - dict reverse = {} - Py_ssize_t idx, count = 0 - int ret = 0 - int32_t val - khiter_t k - - for i in range(n): - val = values[i] - k = kh_get_int32(self.table, val) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_int32(self.table, val, &ret) - self.table.vals[k] = count - reverse[count] = val - labels[i] = count - count += 1 - - return reverse, labels - -cdef class Int64HashTable: #(HashTable): - # cdef kh_int64_t *table +cdef class Int64HashTable(HashTable): def __cinit__(self, size_hint=1): self.table = kh_init_int64() if size_hint is not None: kh_resize_int64(self.table, size_hint) + def __len__(self): + return self.table.size + def __dealloc__(self): kh_destroy_int64(self.table) @@ -369,9 +300,6 @@ cdef class Int64HashTable: #(HashTable): k = kh_get_int64(self.table, key) return k != self.table.n_buckets - def __len__(self): - return self.table.size - cpdef get_item(self, int64_t val): cdef khiter_t k k = kh_get_int64(self.table, val) @@ -399,137 +327,166 @@ cdef class Int64HashTable: #(HashTable): else: raise KeyError(key) - def map(self, ndarray[int64_t] keys, ndarray[int64_t] values): + @cython.boundscheck(False) + def map(self, int64_t[:] keys, int64_t[:] values): cdef: Py_ssize_t i, n = len(values) int ret = 0 int64_t key khiter_t k - for i in range(n): - key = keys[i] - k = kh_put_int64(self.table, key, &ret) - self.table.vals[k] = values[i] + with nogil: + for i in range(n): + key = keys[i] + k = kh_put_int64(self.table, key, &ret) + self.table.vals[k] = values[i] - def map_locations(self, ndarray[int64_t] values): + @cython.boundscheck(False) + def map_locations(self, int64_t[:] values): cdef: Py_ssize_t i, n = len(values) int ret = 0 int64_t val khiter_t k - for i in range(n): - val = values[i] - k = kh_put_int64(self.table, val, &ret) - self.table.vals[k] = i + with 
nogil: + for i in range(n): + val = values[i] + k = kh_put_int64(self.table, val, &ret) + self.table.vals[k] = i - def lookup(self, ndarray[int64_t] values): + @cython.boundscheck(False) + def lookup(self, int64_t[:] values): cdef: Py_ssize_t i, n = len(values) int ret = 0 int64_t val khiter_t k - ndarray[int64_t] locs = np.empty(n, dtype=np.int64) + int64_t[:] locs = np.empty(n, dtype=np.int64) - for i in range(n): - val = values[i] - k = kh_get_int64(self.table, val) - if k != self.table.n_buckets: - locs[i] = self.table.vals[k] - else: - locs[i] = -1 + with nogil: + for i in range(n): + val = values[i] + k = kh_get_int64(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 - return locs + return np.asarray(locs) def factorize(self, ndarray[object] values): reverse = {} labels = self.get_labels(values, reverse, 0) return reverse, labels - def get_labels(self, ndarray[int64_t] values, Int64Vector uniques, + @cython.boundscheck(False) + def get_labels(self, int64_t[:] values, Int64Vector uniques, Py_ssize_t count_prior, Py_ssize_t na_sentinel): cdef: Py_ssize_t i, n = len(values) - ndarray[int64_t] labels + int64_t[:] labels Py_ssize_t idx, count = count_prior int ret = 0 int64_t val khiter_t k + Int64VectorData *ud labels = np.empty(n, dtype=np.int64) - - for i in range(n): - val = values[i] - k = kh_get_int64(self.table, val) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_int64(self.table, val, &ret) - self.table.vals[k] = count - uniques.append(val) - labels[i] = count - count += 1 - - return labels - - def get_labels_groupby(self, ndarray[int64_t] values): + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + k = kh_get_int64(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_int64(self.table, val, &ret) + self.table.vals[k] = count + + if needs_resize(ud): + with gil: + 
uniques.resize() + append_data(ud, val) + labels[i] = count + count += 1 + + return np.asarray(labels) + + @cython.boundscheck(False) + def get_labels_groupby(self, int64_t[:] values): cdef: Py_ssize_t i, n = len(values) - ndarray[int64_t] labels + int64_t[:] labels Py_ssize_t idx, count = 0 int ret = 0 int64_t val khiter_t k Int64Vector uniques = Int64Vector() + Int64VectorData *ud labels = np.empty(n, dtype=np.int64) - - for i in range(n): - val = values[i] - - # specific for groupby - if val < 0: - labels[i] = -1 - continue - - k = kh_get_int64(self.table, val) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_int64(self.table, val, &ret) - self.table.vals[k] = count - uniques.append(val) - labels[i] = count - count += 1 + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + # specific for groupby + if val < 0: + labels[i] = -1 + continue + + k = kh_get_int64(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_int64(self.table, val, &ret) + self.table.vals[k] = count + + if needs_resize(ud): + with gil: + uniques.resize() + append_data(ud, val) + labels[i] = count + count += 1 arr_uniques = uniques.to_array() - return labels, arr_uniques + return np.asarray(labels), arr_uniques - def unique(self, ndarray[int64_t] values): + @cython.boundscheck(False) + def unique(self, int64_t[:] values): cdef: Py_ssize_t i, n = len(values) int ret = 0 - ndarray result int64_t val khiter_t k Int64Vector uniques = Int64Vector() + Int64VectorData *ud - for i in range(n): - val = values[i] - k = kh_get_int64(self.table, val) - if k == self.table.n_buckets: - kh_put_int64(self.table, val, &ret) - uniques.append(val) + ud = uniques.data - result = uniques.to_array() + with nogil: + for i in range(n): + val = values[i] + k = kh_get_int64(self.table, val) + if k == self.table.n_buckets: + kh_put_int64(self.table, val, &ret) + + if needs_resize(ud): + 
with gil: + uniques.resize() + append_data(ud, val) - return result + return uniques.to_array() cdef class Float64HashTable(HashTable): + def __cinit__(self, size_hint=1): self.table = kh_init_float64() if size_hint is not None: @@ -566,99 +523,124 @@ cdef class Float64HashTable(HashTable): k = kh_get_float64(self.table, key) return k != self.table.n_buckets - def factorize(self, ndarray[float64_t] values): + def factorize(self, float64_t[:] values): uniques = Float64Vector() labels = self.get_labels(values, uniques, 0, -1) return uniques.to_array(), labels - def get_labels(self, ndarray[float64_t] values, + @cython.boundscheck(False) + def get_labels(self, float64_t[:] values, Float64Vector uniques, Py_ssize_t count_prior, int64_t na_sentinel): cdef: Py_ssize_t i, n = len(values) - ndarray[int64_t] labels + int64_t[:] labels Py_ssize_t idx, count = count_prior int ret = 0 float64_t val khiter_t k + Float64VectorData *ud labels = np.empty(n, dtype=np.int64) + ud = uniques.data - for i in range(n): - val = values[i] - - if val != val: - labels[i] = na_sentinel - continue - - k = kh_get_float64(self.table, val) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_float64(self.table, val, &ret) - self.table.vals[k] = count - uniques.append(val) - labels[i] = count - count += 1 + with nogil: + for i in range(n): + val = values[i] - return labels + if val != val: + labels[i] = na_sentinel + continue - def map_locations(self, ndarray[float64_t] values): + k = kh_get_float64(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_float64(self.table, val, &ret) + self.table.vals[k] = count + + if needs_resize(ud): + with gil: + uniques.resize() + append_data(ud, val) + labels[i] = count + count += 1 + + return np.asarray(labels) + + @cython.boundscheck(False) + def map_locations(self, float64_t[:] values): cdef: Py_ssize_t i, n = len(values) int ret = 0 khiter_t k - for 
i in range(n): - k = kh_put_float64(self.table, values[i], &ret) - self.table.vals[k] = i + with nogil: + for i in range(n): + k = kh_put_float64(self.table, values[i], &ret) + self.table.vals[k] = i - def lookup(self, ndarray[float64_t] values): + @cython.boundscheck(False) + def lookup(self, float64_t[:] values): cdef: Py_ssize_t i, n = len(values) int ret = 0 float64_t val khiter_t k - ndarray[int64_t] locs = np.empty(n, dtype=np.int64) + int64_t[:] locs = np.empty(n, dtype=np.int64) - for i in range(n): - val = values[i] - k = kh_get_float64(self.table, val) - if k != self.table.n_buckets: - locs[i] = self.table.vals[k] - else: - locs[i] = -1 + with nogil: + for i in range(n): + val = values[i] + k = kh_get_float64(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 - return locs + return np.asarray(locs) - def unique(self, ndarray[float64_t] values): + @cython.boundscheck(False) + def unique(self, float64_t[:] values): cdef: Py_ssize_t i, n = len(values) int ret = 0 float64_t val khiter_t k - Float64Vector uniques = Float64Vector() bint seen_na = 0 + Float64Vector uniques = Float64Vector() + Float64VectorData *ud - for i in range(n): - val = values[i] + ud = uniques.data - if val == val: - k = kh_get_float64(self.table, val) - if k == self.table.n_buckets: - kh_put_float64(self.table, val, &ret) - uniques.append(val) - elif not seen_na: - seen_na = 1 - uniques.append(ONAN) + with nogil: + for i in range(n): + val = values[i] + + if val == val: + k = kh_get_float64(self.table, val) + if k == self.table.n_buckets: + kh_put_float64(self.table, val, &ret) + + if needs_resize(ud): + with gil: + uniques.resize() + append_data(ud, val) + + elif not seen_na: + seen_na = 1 + + if needs_resize(ud): + with gil: + uniques.resize() + append_data(ud, NAN) return uniques.to_array() na_sentinel = object cdef class PyObjectHashTable(HashTable): - # cdef kh_pymap_t *table def __init__(self, size_hint=1): self.table = 
kh_init_pymap() @@ -740,7 +722,7 @@ cdef class PyObjectHashTable(HashTable): int ret = 0 object val khiter_t k - ndarray[int64_t] locs = np.empty(n, dtype=np.int64) + int64_t[:] locs = np.empty(n, dtype=np.int64) for i in range(n): val = values[i] @@ -754,30 +736,13 @@ cdef class PyObjectHashTable(HashTable): else: locs[i] = -1 - return locs - - def lookup2(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - object val - khiter_t k - long hval - ndarray[int64_t] locs = np.empty(n, dtype=np.int64) - - # for i in range(n): - # val = values[i] - # hval = PyObject_Hash(val) - # k = kh_get_pymap(self.table, val) - - return locs + return np.asarray(locs) def unique(self, ndarray[object] values): cdef: Py_ssize_t i, n = len(values) int ret = 0 object val - ndarray result khiter_t k ObjectVector uniques = ObjectVector() bint seen_na = 0 @@ -792,17 +757,15 @@ cdef class PyObjectHashTable(HashTable): uniques.append(val) elif not seen_na: seen_na = 1 - uniques.append(ONAN) - - result = uniques.to_array() + uniques.append(nan) - return result + return uniques.to_array() def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior, int64_t na_sentinel): cdef: Py_ssize_t i, n = len(values) - ndarray[int64_t] labels + int64_t[:] labels Py_ssize_t idx, count = count_prior int ret = 0 object val @@ -829,7 +792,7 @@ cdef class PyObjectHashTable(HashTable): labels[i] = count count += 1 - return labels + return np.asarray(labels) cdef class Factorizer: @@ -884,7 +847,7 @@ cdef class Int64Factorizer: def get_count(self): return self.count - def factorize(self, ndarray[int64_t] values, sort=False, + def factorize(self, int64_t[:] values, sort=False, na_sentinel=-1): labels = self.table.get_labels(values, self.uniques, self.count, na_sentinel) @@ -904,28 +867,34 @@ cdef class Int64Factorizer: return labels -cdef build_count_table_int64(ndarray[int64_t] values, kh_int64_t *table): + +@cython.boundscheck(False) +cdef 
build_count_table_int64(int64_t[:] values, kh_int64_t *table): cdef: khiter_t k Py_ssize_t i, n = len(values) + int64_t val int ret = 0 - kh_resize_int64(table, n) + with nogil: + kh_resize_int64(table, n) - for i in range(n): - val = values[i] - k = kh_get_int64(table, val) - if k != table.n_buckets: - table.vals[k] += 1 - else: - k = kh_put_int64(table, val, &ret) - table.vals[k] = 1 + for i in range(n): + val = values[i] + k = kh_get_int64(table, val) + if k != table.n_buckets: + table.vals[k] += 1 + else: + k = kh_put_int64(table, val, &ret) + table.vals[k] = 1 -cpdef value_count_int64(ndarray[int64_t] values): +@cython.boundscheck(False) +cpdef value_count_int64(int64_t[:] values): cdef: Py_ssize_t i kh_int64_t *table + int64_t[:] result_keys, result_counts int k table = kh_init_int64() @@ -934,14 +903,16 @@ cpdef value_count_int64(ndarray[int64_t] values): i = 0 result_keys = np.empty(table.n_occupied, dtype=np.int64) result_counts = np.zeros(table.n_occupied, dtype=np.int64) - for k in range(table.n_buckets): - if kh_exist_int64(table, k): - result_keys[i] = table.keys[k] - result_counts[i] = table.vals[k] - i += 1 + + with nogil: + for k in range(table.n_buckets): + if kh_exist_int64(table, k): + result_keys[i] = table.keys[k] + result_counts[i] = table.vals[k] + i += 1 kh_destroy_int64(table) - return result_keys, result_counts + return np.asarray(result_keys), np.asarray(result_counts) cdef build_count_table_object(ndarray[object] values, @@ -968,7 +939,7 @@ cdef build_count_table_object(ndarray[object] values, cpdef value_count_object(ndarray[object] values, - ndarray[uint8_t, cast=True] mask): + ndarray[uint8_t, cast=True] mask): cdef: Py_ssize_t i kh_pymap_t *table @@ -995,6 +966,7 @@ def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask): int count, max_count = 2 int j = -1 # so you can do += int k + ndarray[object] modes kh_pymap_t *table table = kh_init_pymap() @@ -1019,36 +991,39 @@ def mode_object(ndarray[object] values, 
ndarray[uint8_t, cast=True] mask): return modes[:j+1] -def mode_int64(ndarray[int64_t] values): +@cython.boundscheck(False) +def mode_int64(int64_t[:] values): cdef: int count, max_count = 2 int j = -1 # so you can do += int k kh_int64_t *table + ndarray[int64_t] modes table = kh_init_int64() build_count_table_int64(values, table) modes = np.empty(table.n_buckets, dtype=np.int64) - for k in range(table.n_buckets): - if kh_exist_int64(table, k): - count = table.vals[k] - if count == max_count: - j += 1 - elif count > max_count: - max_count = count - j = 0 - else: - continue - modes[j] = table.keys[k] + with nogil: + for k in range(table.n_buckets): + if kh_exist_int64(table, k): + count = table.vals[k] + + if count == max_count: + j += 1 + elif count > max_count: + max_count = count + j = 0 + else: + continue + modes[j] = table.keys[k] kh_destroy_int64(table) return modes[:j+1] - @cython.wraparound(False) @cython.boundscheck(False) def duplicated_int64(ndarray[int64_t, ndim=1] values, int take_last): @@ -1060,14 +1035,15 @@ def duplicated_int64(ndarray[int64_t, ndim=1] values, int take_last): kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT)) - if take_last: - for i from n > i >=0: - kh_put_int64(table, values[i], &ret) - out[i] = ret == 0 - else: - for i from 0 <= i < n: - kh_put_int64(table, values[i], &ret) - out[i] = ret == 0 + with nogil: + if take_last: + for i from n > i >=0: + kh_put_int64(table, values[i], &ret) + out[i] = ret == 0 + else: + for i from 0 <= i < n: + kh_put_int64(table, values[i], &ret) + out[i] = ret == 0 kh_destroy_int64(table) return out @@ -1087,13 +1063,18 @@ def unique_label_indices(ndarray[int64_t, ndim=1] labels): kh_int64_t * table = kh_init_int64() Int64Vector idx = Int64Vector() ndarray[int64_t, ndim=1] arr + Int64VectorData *ud = idx.data kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT)) - for i in range(n): - kh_put_int64(table, labels[i], &ret) - if ret != 0: - idx.append(i) + with nogil: + for i in range(n): + kh_put_int64(table, 
labels[i], &ret) + if ret != 0: + if needs_resize(ud): + with gil: + idx.resize() + append_data(ud, i) kh_destroy_int64(table) diff --git a/pandas/index.pyx b/pandas/index.pyx index 9be7e7404f3fe..1678e3b280ee5 100644 --- a/pandas/index.pyx +++ b/pandas/index.pyx @@ -1,3 +1,5 @@ +# cython: profile=False + from numpy cimport ndarray from numpy cimport (float64_t, int32_t, int64_t, uint8_t, @@ -89,6 +91,7 @@ cdef class IndexEngine: self.monotonic_check = 0 self.unique = 0 + self.unique_check = 0 self.monotonic_inc = 0 self.monotonic_dec = 0 @@ -230,16 +233,12 @@ cdef class IndexEngine: cdef inline _do_monotonic_check(self): try: values = self._get_index_values() - self.monotonic_inc, self.monotonic_dec, unique = \ + self.monotonic_inc, self.monotonic_dec = \ self._call_monotonic(values) - - if unique is not None: - self.unique = unique - self.unique_check = 1 - except TypeError: self.monotonic_inc = 0 self.monotonic_dec = 0 + self.monotonic_check = 1 cdef _get_index_values(self): diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py index 5d4b18b36050f..9016f232afa9a 100644 --- a/pandas/src/generate_code.py +++ b/pandas/src/generate_code.py @@ -23,6 +23,9 @@ from cpython cimport PyFloat_Check cimport cpython +cdef extern from "numpy/npy_math.h": + double NAN "NPY_NAN" + import numpy as np isnan = np.isnan @@ -70,29 +73,31 @@ return arr.asobject else: return np.array(arr, dtype=np.object_) - """ -take_1d_template = """@cython.wraparound(False) -def take_1d_%(name)s_%(dest)s(ndarray[%(c_type_in)s] values, - ndarray[int64_t] indexer, - ndarray[%(c_type_out)s] out, +take_1d_template = """ +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_%(name)s_%(dest)s(%(c_type_in)s[:] values, + int64_t[:] indexer, + %(c_type_out)s[:] out, fill_value=np.nan): cdef: Py_ssize_t i, n, idx %(c_type_out)s fv - n = len(indexer) + n = indexer.shape[0] fv = fill_value - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] 
= %(preval)svalues[idx]%(postval)s + %(nogil)s + %(tab)sfor i from 0 <= i < n: + %(tab)s idx = indexer[i] + %(tab)s if idx == -1: + %(tab)s out[i] = fv + %(tab)s else: + %(tab)s out[i] = %(preval)svalues[idx]%(postval)s """ inner_take_2d_axis0_template = """\ @@ -134,7 +139,6 @@ def take_1d_%(name)s_%(dest)s(ndarray[%(c_type_in)s] values, else: for j from 0 <= j < k: out[i, j] = %(preval)svalues[idx, j]%(postval)s - """ take_2d_axis0_template = """\ @@ -241,7 +245,6 @@ def take_2d_multi_%(name)s_%(dest)s(ndarray[%(c_type_in)s, ndim=2] values, out[i, j] = fv else: out[i, j] = %(preval)svalues[idx, idx1[j]]%(postval)s - """ @@ -332,7 +335,6 @@ def backfill_%(name)s(ndarray[%(c_type)s] old, ndarray[%(c_type)s] new, cur = prev return indexer - """ @@ -396,7 +398,6 @@ def pad_%(name)s(ndarray[%(c_type)s] old, ndarray[%(c_type)s] new, cur = next return indexer - """ pad_1d_template = """@cython.boundscheck(False) @@ -431,7 +432,6 @@ def pad_inplace_%(name)s(ndarray[%(c_type)s] values, else: fill_count = 0 val = values[i] - """ pad_2d_template = """@cython.boundscheck(False) @@ -592,12 +592,11 @@ def is_monotonic_%(name)s(ndarray[%(c_type)s] arr, bint timelike): ''' Returns ------- - is_monotonic_inc, is_monotonic_dec, is_unique + is_monotonic_inc, is_monotonic_dec ''' cdef: Py_ssize_t i, n %(c_type)s prev, cur - bint is_unique = 1 bint is_monotonic_inc = 1 bint is_monotonic_dec = 1 @@ -606,33 +605,40 @@ def is_monotonic_%(name)s(ndarray[%(c_type)s] arr, bint timelike): if n == 1: if arr[0] != arr[0] or (timelike and arr[0] == iNaT): # single value is NaN - return False, False, True + return False, False else: - return True, True, True + return True, True elif n < 2: - return True, True, True + return True, True if timelike and arr[0] == iNaT: - return False, False, None - - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - return False, False, None - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif 
cur == prev: - is_unique = 0 - else: - # cur or prev is NaN - return False, False, None - if not is_monotonic_inc and not is_monotonic_dec: - return False, False, None - prev = cur - return is_monotonic_inc, is_monotonic_dec, is_unique + return False, False + + %(nogil)s + %(tab)sprev = arr[0] + %(tab)sfor i in range(1, n): + %(tab)s cur = arr[i] + %(tab)s if timelike and cur == iNaT: + %(tab)s is_monotonic_inc = 0 + %(tab)s is_monotonic_dec = 0 + %(tab)s break + %(tab)s if cur < prev: + %(tab)s is_monotonic_inc = 0 + %(tab)s elif cur > prev: + %(tab)s is_monotonic_dec = 0 + %(tab)s elif cur == prev: + %(tab)s pass # is_unique = 0 + %(tab)s else: + %(tab)s # cur or prev is NaN + %(tab)s is_monotonic_inc = 0 + %(tab)s is_monotonic_dec = 0 + %(tab)s break + %(tab)s if not is_monotonic_inc and not is_monotonic_dec: + %(tab)s is_monotonic_inc = 0 + %(tab)s is_monotonic_dec = 0 + %(tab)s break + %(tab)s prev = cur + return is_monotonic_inc, is_monotonic_dec """ map_indices_template = """@cython.wraparound(False) @@ -656,7 +662,6 @@ def is_monotonic_%(name)s(ndarray[%(c_type)s] arr, bint timelike): result[index[i]] = i return result - """ groupby_template = """@cython.wraparound(False) @@ -686,11 +691,10 @@ def groupby_%(name)s(ndarray[%(c_type)s] index, ndarray labels): result[key] = [idx] return result - """ group_last_template = """@cython.wraparound(False) -@cython.wraparound(False) +@cython.boundscheck(False) def group_last_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, ndarray[int64_t] counts, ndarray[%(c_type)s, ndim=2] values, @@ -699,7 +703,7 @@ def group_last_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, N, K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) %(dest_type2)s val, count ndarray[%(dest_type2)s, ndim=2] resx ndarray[int64_t, ndim=2] nobs @@ -712,30 +716,31 @@ def group_last_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, N, K = ( values).shape - for i in range(N): - lab = labels[i] - if lab 
< 0: - continue + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + counts[lab] += 1 + for j in range(K): + val = values[i, j] - # not nan - if val == val: - nobs[lab, j] += 1 - resx[lab, j] = val + # not nan + if val == val: + nobs[lab, j] += 1 + resx[lab, j] = val - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = %(nan_val)s - else: - out[i, j] = resx[i, j] + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = %(nan_val)s + else: + out[i, j] = resx[i, j] """ group_last_bin_template = """@cython.wraparound(False) -@cython.wraparound(False) +@cython.boundscheck(False) def group_last_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, ndarray[int64_t] counts, ndarray[%(c_type)s, ndim=2] values, @@ -760,30 +765,31 @@ def group_last_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, N, K = ( values).shape - b = 0 - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[b, j] += 1 - resx[b, j] = val - - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = %(nan_val)s - else: - out[i, j] = resx[i, j] + with nogil: + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + resx[b, j] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = %(nan_val)s + else: + out[i, j] = resx[i, j] """ -group_nth_bin_template = """@cython.boundscheck(False) -@cython.wraparound(False) +group_nth_bin_template = """@cython.wraparound(False) +@cython.boundscheck(False) def group_nth_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, ndarray[int64_t] counts, ndarray[%(c_type)s, ndim=2] values, @@ -808,31 +814,32 @@ def 
group_nth_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, N, K = ( values).shape - b = 0 - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + with nogil: + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 - for j in range(K): - val = values[i, j] + counts[b] += 1 + for j in range(K): + val = values[i, j] - # not nan - if val == val: - nobs[b, j] += 1 - if nobs[b, j] == rank: - resx[b, j] = val + # not nan + if val == val: + nobs[b, j] += 1 + if nobs[b, j] == rank: + resx[b, j] = val - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = %(nan_val)s - else: - out[i, j] = resx[i, j] + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = %(nan_val)s + else: + out[i, j] = resx[i, j] """ -group_nth_template = """@cython.boundscheck(False) -@cython.wraparound(False) +group_nth_template = """@cython.wraparound(False) +@cython.boundscheck(False) def group_nth_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, ndarray[int64_t] counts, ndarray[%(c_type)s, ndim=2] values, @@ -841,7 +848,7 @@ def group_nth_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, N, K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) %(dest_type2)s val, count ndarray[%(dest_type2)s, ndim=2] resx ndarray[int64_t, ndim=2] nobs @@ -854,31 +861,32 @@ def group_nth_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, N, K = ( values).shape - for i in range(N): - lab = labels[i] - if lab < 0: - continue + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + counts[lab] += 1 + for j in range(K): + val = values[i, j] - # not nan - if val == val: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val + # not nan + if val == val: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val - for i in range(len(counts)): - for j in 
range(K): - if nobs[i, j] == 0: - out[i, j] = %(nan_val)s - else: - out[i, j] = resx[i, j] + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = %(nan_val)s + else: + out[i, j] = resx[i, j] """ -group_add_template = """@cython.boundscheck(False) -@cython.wraparound(False) +group_add_template = """@cython.wraparound(False) +@cython.boundscheck(False) def group_add_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, ndarray[int64_t] counts, ndarray[%(c_type)s, ndim=2] values, @@ -887,7 +895,7 @@ def group_add_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, N, K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) %(dest_type2)s val, count ndarray[%(dest_type2)s, ndim=2] sumx, nobs @@ -899,44 +907,50 @@ def group_add_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, N, K = ( values).shape - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + with nogil: - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + if K > 1: - counts[lab] += 1 - val = values[i, 0] + for i in range(N): + lab = labels[i] + if lab < 0: + continue - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val + counts[lab] += 1 + for j in range(K): + val = values[i, j] - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = sumx[i, j] + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + + else: + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = sumx[i, j] """ -group_add_bin_template = """@cython.boundscheck(False) 
-@cython.wraparound(False) +group_add_bin_template = """@cython.wraparound(False) +@cython.boundscheck(False) def group_add_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, ndarray[int64_t] counts, ndarray[%(dest_type2)s, ndim=2] values, @@ -960,43 +974,46 @@ def group_add_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, ngroups = len(bins) + 1 N, K = ( values).shape - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + with nogil: - counts[b] += 1 - for j in range(K): - val = values[i, j] + b = 0 + if K > 1: - # not nan - if val == val: - nobs[b, j] += 1 - sumx[b, j] += val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 - val = values[i, 0] + counts[b] += 1 + for j in range(K): + val = values[i, j] - # not nan - if val == val: - nobs[b, 0] += 1 - sumx[b, 0] += val + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = sumx[i, j] + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + sumx[b, 0] += val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = sumx[i, j] """ -group_prod_template = """@cython.boundscheck(False) -@cython.wraparound(False) +group_prod_template = """@cython.wraparound(False) +@cython.boundscheck(False) def group_prod_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, ndarray[int64_t] counts, ndarray[%(c_type)s, ndim=2] values, @@ -1005,7 +1022,7 @@ def group_prod_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, N, K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) %(dest_type2)s val, count ndarray[%(dest_type2)s, ndim=2] prodx, nobs @@ 
-1017,44 +1034,45 @@ def group_prod_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, N, K = ( values).shape - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + counts[lab] += 1 + for j in range(K): + val = values[i, j] - # not nan - if val == val: - nobs[lab, j] += 1 - prodx[lab, j] *= val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + # not nan + if val == val: + nobs[lab, j] += 1 + prodx[lab, j] *= val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - val = values[i, 0] + counts[lab] += 1 + val = values[i, 0] - # not nan - if val == val: - nobs[lab, 0] += 1 - prodx[lab, 0] *= val + # not nan + if val == val: + nobs[lab, 0] += 1 + prodx[lab, 0] *= val - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = prodx[i, j] + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = prodx[i, j] """ -group_prod_bin_template = """@cython.boundscheck(False) -@cython.wraparound(False) +group_prod_bin_template = """@cython.wraparound(False) +@cython.boundscheck(False) def group_prod_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, ndarray[int64_t] counts, ndarray[%(dest_type2)s, ndim=2] values, @@ -1078,39 +1096,41 @@ def group_prod_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, ngroups = len(bins) + 1 N, K = ( values).shape - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + with nogil: - counts[b] += 1 - for j in range(K): - val = values[i, j] + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - # not nan - if val == val: - nobs[b, j] += 1 - prodx[b, j] *= val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + 
counts[b] += 1 + for j in range(K): + val = values[i, j] - counts[b] += 1 - val = values[i, 0] + # not nan + if val == val: + nobs[b, j] += 1 + prodx[b, j] *= val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - # not nan - if val == val: - nobs[b, 0] += 1 - prodx[b, 0] *= val + counts[b] += 1 + val = values[i, 0] - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = prodx[i, j] + # not nan + if val == val: + nobs[b, 0] += 1 + prodx[b, 0] *= val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = prodx[i, j] """ group_var_template = """@cython.wraparound(False) @@ -1120,7 +1140,7 @@ def group_var_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, ndarray[%(dest_type2)s, ndim=2] values, ndarray[int64_t] labels): cdef: - Py_ssize_t i, j, N, K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) %(dest_type2)s val, ct ndarray[%(dest_type2)s, ndim=2] nobs, sumx, sumxx @@ -1133,47 +1153,49 @@ def group_var_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, N, K = ( values).shape - if K > 1: - for i in range(N): + with nogil: + if K > 1: + for i in range(N): - lab = labels[i] - if lab < 0: - continue + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 + counts[lab] += 1 - for j in range(K): - val = values[i, j] + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + sumxx[lab, j] += val * val + else: + for i in range(N): + + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] # not nan if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - sumxx[lab, j] += val * val - else: - for i in range(N): + nobs[lab, 0] += 1 + sumx[lab, 0] += val + sumxx[lab, 0] += val * val - lab = labels[i] - if lab < 0: - continue - counts[lab] += 1 - val = values[i, 0] - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - 
sumxx[lab, 0] += val * val - - - for i in range(len(counts)): - for j in range(K): - ct = nobs[i, j] - if ct < 2: - out[i, j] = nan - else: - out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / - (ct * ct - ct)) + for i in range(ncounts): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = NAN + else: + out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / + (ct * ct - ct)) """ group_var_bin_template = """@cython.wraparound(False) @@ -1201,44 +1223,45 @@ def group_var_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, N, K = ( values).shape - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + with nogil: + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 + counts[b] += 1 - for j in range(K): - val = values[i, j] + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + sumxx[b, j] += val * val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] # not nan if val == val: - nobs[b, j] += 1 - sumx[b, j] += val - sumxx[b, j] += val * val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + nobs[b, 0] += 1 + sumx[b, 0] += val + sumxx[b, 0] += val * val - counts[b] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[b, 0] += 1 - sumx[b, 0] += val - sumxx[b, 0] += val * val - - for i in range(ngroups): - for j in range(K): - ct = nobs[i, j] - if ct < 2: - out[i, j] = nan - else: - out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / - (ct * ct - ct)) + for i in range(ngroups): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = NAN + else: + out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / + (ct * ct - ct)) """ group_count_template = """@cython.boundscheck(False) @@ -1251,36 +1274,36 @@ def group_count_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, Only 
aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, lab + Py_ssize_t i, j, lab, ncounts = len(counts) Py_ssize_t N = values.shape[0], K = values.shape[1] %(c_type)s val ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), dtype=np.int64) if len(values) != len(labels): - raise AssertionError("len(index) != len(labels)") - - for i in range(N): - lab = labels[i] - if lab < 0: - continue + raise AssertionError("len(index) != len(labels)") - counts[lab] += 1 - for j in range(K): - val = values[i, j] - # not nan - nobs[lab, j] += val == val and val != iNaT + %(nogil)s + %(tab)sfor i in range(N): + %(tab)s lab = labels[i] + %(tab)s if lab < 0: + %(tab)s continue - for i in range(len(counts)): - for j in range(K): - out[i, j] = nobs[i, j] + %(tab)s counts[lab] += 1 + %(tab)s for j in range(K): + %(tab)s val = values[i, j] + %(tab)s # not nan + %(tab)s nobs[lab, j] += val == val and val != iNaT + %(tab)sfor i in range(ncounts): + %(tab)s for j in range(K): + %(tab)s out[i, j] = nobs[i, j] """ -group_count_bin_template = """@cython.boundscheck(False) -@cython.wraparound(False) +group_count_bin_template = """@cython.wraparound(False) +@cython.boundscheck(False) def group_count_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, ndarray[int64_t] counts, ndarray[%(c_type)s, ndim=2] values, @@ -1299,23 +1322,23 @@ def group_count_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, return ngroups = len(bins) + (bins[len(bins) - 1] != N) - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val = values[i, j] + %(nogil)s + %(tab)sfor i in range(N): + %(tab)s while b < ngroups - 1 and i >= bins[b]: + %(tab)s b += 1 - # not nan - nobs[b, j] += val == val and val != iNaT - - for i in range(ngroups): - for j in range(K): - out[i, j] = nobs[i, j] + %(tab)s counts[b] += 1 + %(tab)s for j in range(K): + %(tab)s val = values[i, j] + %(tab)s # not nan + %(tab)s nobs[b, j] += val == val and val != iNaT + %(tab)sfor i in 
range(ngroups): + %(tab)s for j in range(K): + %(tab)s out[i, j] = nobs[i, j] """ + # add passing bin edges, instead of labels @@ -1350,41 +1373,42 @@ def group_min_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, N, K = ( values).shape - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + with nogil: + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if val < minx[b, j]: + minx[b, j] = val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 - for j in range(K): - val = values[i, j] + counts[b] += 1 + val = values[i, 0] # not nan if val == val: - nobs[b, j] += 1 - if val < minx[b, j]: - minx[b, j] = val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + nobs[b, 0] += 1 + if val < minx[b, 0]: + minx[b, 0] = val - counts[b] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[b, 0] += 1 - if val < minx[b, 0]: - minx[b, 0] = val - - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = %(nan_val)s - else: - out[i, j] = minx[i, j] + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = %(nan_val)s + else: + out[i, j] = minx[i, j] """ group_max_template = """@cython.wraparound(False) @@ -1397,7 +1421,7 @@ def group_max_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, N, K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) %(dest_type2)s val, count ndarray[%(dest_type2)s, ndim=2] maxx, nobs @@ -1411,42 +1435,43 @@ def group_max_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, N, K = ( values).shape - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - for 
j in range(K): - val = values[i, j] + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] # not nan if val == val: - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + nobs[lab, 0] += 1 + if val > maxx[lab, 0]: + maxx[lab, 0] = val - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[lab, 0] += 1 - if val > maxx[lab, 0]: - maxx[lab, 0] = val - - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = %(nan_val)s - else: - out[i, j] = maxx[i, j] + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = %(nan_val)s + else: + out[i, j] = maxx[i, j] """ group_max_bin_template = """@cython.wraparound(False) @@ -1476,41 +1501,42 @@ def group_max_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, N, K = ( values).shape - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + with nogil: + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if val > maxx[b, j]: + maxx[b, j] = val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 - for j in range(K): - val = values[i, j] + counts[b] += 1 + val = values[i, 0] # not nan if val == val: - nobs[b, j] += 1 - if val > maxx[b, j]: - maxx[b, j] = val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + nobs[b, 0] += 1 + if val > maxx[b, 0]: + maxx[b, 0] = val - counts[b] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[b, 0] += 1 - if val > maxx[b, 0]: - maxx[b, 0] = val - - for i 
in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = %(nan_val)s - else: - out[i, j] = maxx[i, j] + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = %(nan_val)s + else: + out[i, j] = maxx[i, j] """ @@ -1524,7 +1550,7 @@ def group_min_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, N, K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) %(dest_type2)s val, count ndarray[%(dest_type2)s, ndim=2] minx, nobs @@ -1538,42 +1564,43 @@ def group_min_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, N, K = ( values).shape - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] # not nan if val == val: - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + nobs[lab, 0] += 1 + if val < minx[lab, 0]: + minx[lab, 0] = val - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[lab, 0] += 1 - if val < minx[lab, 0]: - minx[lab, 0] = val - - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = %(nan_val)s - else: - out[i, j] = minx[i, j] + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = %(nan_val)s + else: + out[i, j] = minx[i, j] """ @@ -1584,7 +1611,7 @@ def group_mean_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, ndarray[%(dest_type2)s, ndim=2] values, ndarray[int64_t] labels): cdef: - Py_ssize_t i, j, N, K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) 
%(dest_type2)s val, count ndarray[%(dest_type2)s, ndim=2] sumx, nobs @@ -1596,42 +1623,44 @@ def group_mean_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, N, K = ( values).shape - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + counts[lab] += 1 + for j in range(K): + val = values[i, j] + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] # not nan if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + nobs[lab, 0] += 1 + sumx[lab, 0] += val - counts[lab] += 1 - val = values[i, 0] - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - - for i in range(len(counts)): - for j in range(K): - count = nobs[i, j] - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = sumx[i, j] / count + for i in range(ncounts): + for j in range(K): + count = nobs[i, j] + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = sumx[i, j] / count """ group_mean_bin_template = """ +@cython.boundscheck(False) def group_mean_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, ndarray[int64_t] counts, ndarray[%(dest_type2)s, ndim=2] values, @@ -1652,40 +1681,41 @@ def group_mean_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, else: ngroups = len(bins) + 1 - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + with nogil: + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 - for j in 
range(K): - val = values[i, j] + counts[b] += 1 + val = values[i, 0] # not nan if val == val: - nobs[b, j] += 1 - sumx[b, j] += val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + nobs[b, 0] += 1 + sumx[b, 0] += val - counts[b] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[b, 0] += 1 - sumx[b, 0] += val - - for i in range(ngroups): - for j in range(K): - count = nobs[i, j] - if count == 0: - out[i, j] = nan - else: - out[i, j] = sumx[i, j] / count + for i in range(ngroups): + for j in range(K): + count = nobs[i, j] + if count == 0: + out[i, j] = NAN + else: + out[i, j] = sumx[i, j] / count """ group_ohlc_template = """@cython.wraparound(False) @@ -1700,7 +1730,7 @@ def group_ohlc_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, cdef: Py_ssize_t i, j, N, K, ngroups, b %(dest_type2)s val, count - %(dest_type2)s vopen, vhigh, vlow, vclose, NA + %(dest_type2)s vopen, vhigh, vlow, vclose bint got_first = 0 if len(bins) == 0: @@ -1715,55 +1745,55 @@ def group_ohlc_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, if out.shape[1] != 4: raise ValueError('Output array must have 4 columns') - NA = np.nan - b = 0 if K > 1: raise NotImplementedError("Argument 'values' must have only " "one dimension") else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - if not got_first: - out[b, 0] = NA - out[b, 1] = NA - out[b, 2] = NA - out[b, 3] = NA - else: - out[b, 0] = vopen - out[b, 1] = vhigh - out[b, 2] = vlow - out[b, 3] = vclose - b += 1 - got_first = 0 - counts[b] += 1 - val = values[i, 0] - - # not nan - if val == val: - if not got_first: - got_first = 1 - vopen = val - vlow = val - vhigh = val - else: - if val < vlow: + with nogil: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + if not got_first: + out[b, 0] = NAN + out[b, 1] = NAN + out[b, 2] = NAN + out[b, 3] = NAN + else: + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose + b += 1 + got_first = 0 + + counts[b] += 1 + 
val = values[i, 0] + + # not nan + if val == val: + if not got_first: + got_first = 1 + vopen = val vlow = val - if val > vhigh: vhigh = val - vclose = val - - if not got_first: - out[b, 0] = NA - out[b, 1] = NA - out[b, 2] = NA - out[b, 3] = NA - else: - out[b, 0] = vopen - out[b, 1] = vhigh - out[b, 2] = vlow - out[b, 3] = vclose + else: + if val < vlow: + vlow = val + if val > vhigh: + vhigh = val + vclose = val + + if not got_first: + out[b, 0] = NAN + out[b, 1] = NAN + out[b, 2] = NAN + out[b, 3] = NAN + else: + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose """ arrmap_template = """@cython.wraparound(False) @@ -1780,7 +1810,6 @@ def arrmap_%(name)s(ndarray[%(c_type)s] index, object func): result[i] = func(index[i]) return maybe_convert_objects(result) - """ #---------------------------------------------------------------------- @@ -1832,7 +1861,6 @@ def left_join_indexer_unique_%(name)s(ndarray[%(c_type)s] left, indexer[i] = -1 i += 1 return indexer - """ # @cython.wraparound(False) @@ -1939,7 +1967,6 @@ def left_join_indexer_%(name)s(ndarray[%(c_type)s] left, j += 1 return result, lindexer, rindexer - """ @@ -2035,7 +2062,6 @@ def inner_join_indexer_%(name)s(ndarray[%(c_type)s] left, j += 1 return result, lindexer, rindexer - """ @@ -2167,7 +2193,6 @@ def outer_join_indexer_%(name)s(ndarray[%(c_type)s] left, j += 1 return result, lindexer, rindexer - """ outer_join_template = """@cython.wraparound(False) @@ -2265,7 +2290,6 @@ def outer_join_indexer_%(name)s(ndarray[%(c_type)s] left, count += 1 return result, lindexer, rindexer - """ # ensure_dtype functions @@ -2279,7 +2303,6 @@ def outer_join_indexer_%(name)s(ndarray[%(c_type)s] left, return arr.astype(np.%(dtype)s) else: return np.array(arr, dtype=np.%(dtype)s) - """ ensure_functions = [ @@ -2323,19 +2346,19 @@ def put2d_%(name)s_%(dest_type)s(ndarray[%(c_type)s, ndim=2, cast=True] values, def generate_put_template(template, use_ints=True, use_floats=True, use_objects=False, 
use_datelikes=False): floats_list = [ - ('float64', 'float64_t', 'float64_t', 'np.float64'), - ('float32', 'float32_t', 'float32_t', 'np.float32'), + ('float64', 'float64_t', 'float64_t', 'np.float64', True), + ('float32', 'float32_t', 'float32_t', 'np.float32', True), ] ints_list = [ - ('int8', 'int8_t', 'float32_t', 'np.float32'), - ('int16', 'int16_t', 'float32_t', 'np.float32'), - ('int32', 'int32_t', 'float64_t', 'np.float64'), - ('int64', 'int64_t', 'float64_t', 'np.float64'), + ('int8', 'int8_t', 'float32_t', 'np.float32', True), + ('int16', 'int16_t', 'float32_t', 'np.float32', True), + ('int32', 'int32_t', 'float64_t', 'np.float64', True), + ('int64', 'int64_t', 'float64_t', 'np.float64', True), ] date_like_list = [ - ('int64', 'int64_t', 'float64_t', 'np.float64'), + ('int64', 'int64_t', 'float64_t', 'np.float64', True), ] - object_list = [('object', 'object', 'object', 'np.object_')] + object_list = [('object', 'object', 'object', 'np.object_', False)] function_list = [] if use_floats: function_list.extend(floats_list) @@ -2347,28 +2370,31 @@ def generate_put_template(template, use_ints=True, use_floats=True, function_list.extend(date_like_list) output = StringIO() - for name, c_type, dest_type, dest_dtype in function_list: + for name, c_type, dest_type, dest_dtype, nogil in function_list: func = template % {'name': name, 'c_type': c_type, 'dest_type': dest_type.replace('_t', ''), 'dest_type2': dest_type, - 'dest_dtype': dest_dtype} + 'dest_dtype': dest_dtype, + 'nogil' : 'with nogil:' if nogil else '', + 'tab' : ' ' if nogil else '' } output.write(func) + output.write("\n") return output.getvalue() def generate_put_min_max_template(template, use_ints=True, use_floats=True, use_objects=False, use_datelikes=False): floats_list = [ - ('float64', 'float64_t', 'nan', 'np.inf'), - ('float32', 'float32_t', 'nan', 'np.inf'), + ('float64', 'float64_t', 'NAN', 'np.inf', True), + ('float32', 'float32_t', 'NAN', 'np.inf', True), ] ints_list = [ - ('int64', 
'int64_t', 'iNaT', _int64_max), + ('int64', 'int64_t', 'iNaT', _int64_max, True), ] date_like_list = [ - ('int64', 'int64_t', 'iNaT', _int64_max), + ('int64', 'int64_t', 'iNaT', _int64_max, True), ] - object_list = [('object', 'object', 'nan', 'np.inf')] + object_list = [('object', 'object', 'np.nan', 'np.inf', False)] function_list = [] if use_floats: function_list.extend(floats_list) @@ -2380,27 +2406,30 @@ def generate_put_min_max_template(template, use_ints=True, use_floats=True, function_list.extend(date_like_list) output = StringIO() - for name, dest_type, nan_val, inf_val in function_list: + for name, dest_type, nan_val, inf_val, nogil in function_list: func = template % {'name': name, 'dest_type2': dest_type, 'nan_val': nan_val, - 'inf_val': inf_val} + 'inf_val': inf_val, + 'nogil' : "with nogil:" if nogil else '', + 'tab' : ' ' if nogil else '' } output.write(func) + output.write("\n") return output.getvalue() def generate_put_selection_template(template, use_ints=True, use_floats=True, use_objects=False, use_datelikes=False): floats_list = [ - ('float64', 'float64_t', 'float64_t', 'nan'), - ('float32', 'float32_t', 'float32_t', 'nan'), + ('float64', 'float64_t', 'float64_t', 'NAN', True), + ('float32', 'float32_t', 'float32_t', 'NAN', True), ] ints_list = [ - ('int64', 'int64_t', 'int64_t', 'iNaT'), + ('int64', 'int64_t', 'int64_t', 'iNaT', True), ] date_like_list = [ - ('int64', 'int64_t', 'int64_t', 'iNaT'), + ('int64', 'int64_t', 'int64_t', 'iNaT', True), ] - object_list = [('object', 'object', 'object', 'nan')] + object_list = [('object', 'object', 'object', 'np.nan', False)] function_list = [] if use_floats: function_list.extend(floats_list) @@ -2412,72 +2441,97 @@ def generate_put_selection_template(template, use_ints=True, use_floats=True, function_list.extend(date_like_list) output = StringIO() - for name, c_type, dest_type, nan_val in function_list: + for name, c_type, dest_type, nan_val, nogil in function_list: + + if nogil: + nogil = "with 
nogil:" + tab = ' ' + else: + nogil = '' + tab = '' + func = template % {'name': name, 'c_type': c_type, 'dest_type2': dest_type, - 'nan_val': nan_val} + 'nan_val': nan_val, + 'nogil' : nogil, + 'tab' : tab } output.write(func) + output.write("\n") return output.getvalue() def generate_take_template(template, exclude=None): - # name, dest, ctypein, ctypeout, preval, postval, cancopy + # name, dest, ctypein, ctypeout, preval, postval, cancopy, nogil function_list = [ - ('bool', 'bool', 'uint8_t', 'uint8_t', '', '', True), + ('bool', 'bool', 'uint8_t', 'uint8_t', '', '', True, True), ('bool', 'object', 'uint8_t', 'object', - 'True if ', ' > 0 else False', False), - ('int8', 'int8', 'int8_t', 'int8_t', '', '', True), - ('int8', 'int32', 'int8_t', 'int32_t', '', '', False), - ('int8', 'int64', 'int8_t', 'int64_t', '', '', False), - ('int8', 'float64', 'int8_t', 'float64_t', '', '', False), - ('int16', 'int16', 'int16_t', 'int16_t', '', '', True), - ('int16', 'int32', 'int16_t', 'int32_t', '', '', False), - ('int16', 'int64', 'int16_t', 'int64_t', '', '', False), - ('int16', 'float64', 'int16_t', 'float64_t', '', '', False), - ('int32', 'int32', 'int32_t', 'int32_t', '', '', True), - ('int32', 'int64', 'int32_t', 'int64_t', '', '', False), - ('int32', 'float64', 'int32_t', 'float64_t', '', '', False), - ('int64', 'int64', 'int64_t', 'int64_t', '', '', True), - ('int64', 'float64', 'int64_t', 'float64_t', '', '', False), - ('float32', 'float32', 'float32_t', 'float32_t', '', '', True), - ('float32', 'float64', 'float32_t', 'float64_t', '', '', False), - ('float64', 'float64', 'float64_t', 'float64_t', '', '', True), - ('object', 'object', 'object', 'object', '', '', False) + 'True if ', ' > 0 else False', False, False), + ('int8', 'int8', 'int8_t', 'int8_t', '', '', True, False), + ('int8', 'int32', 'int8_t', 'int32_t', '', '', False, True), + ('int8', 'int64', 'int8_t', 'int64_t', '', '', False, True), + ('int8', 'float64', 'int8_t', 'float64_t', '', '', False, True), + 
('int16', 'int16', 'int16_t', 'int16_t', '', '', True, True), + ('int16', 'int32', 'int16_t', 'int32_t', '', '', False, True), + ('int16', 'int64', 'int16_t', 'int64_t', '', '', False, True), + ('int16', 'float64', 'int16_t', 'float64_t', '', '', False, True), + ('int32', 'int32', 'int32_t', 'int32_t', '', '', True, True), + ('int32', 'int64', 'int32_t', 'int64_t', '', '', False, True), + ('int32', 'float64', 'int32_t', 'float64_t', '', '', False, True), + ('int64', 'int64', 'int64_t', 'int64_t', '', '', True, True), + ('int64', 'float64', 'int64_t', 'float64_t', '', '', False, True), + ('float32', 'float32', 'float32_t', 'float32_t', '', '', True, True), + ('float32', 'float64', 'float32_t', 'float64_t', '', '', False, True), + ('float64', 'float64', 'float64_t', 'float64_t', '', '', True, True), + ('object', 'object', 'object', 'object', '', '', False, False), ] output = StringIO() for (name, dest, c_type_in, c_type_out, - preval, postval, can_copy) in function_list: + preval, postval, can_copy, nogil) in function_list: + if exclude is not None and name in exclude: continue + if nogil: + nogil = "with nogil:" + tab = ' ' + else: + nogil = '' + tab = '' + func = template % {'name': name, 'dest': dest, 'c_type_in': c_type_in, 'c_type_out': c_type_out, 'preval': preval, 'postval': postval, - 'can_copy': 'True' if can_copy else 'False'} + 'can_copy': 'True' if can_copy else 'False', + 'nogil' : nogil, + 'tab' : tab } output.write(func) + output.write("\n") return output.getvalue() def generate_from_template(template, exclude=None): # name, ctype, capable of holding NA function_list = [ - ('float64', 'float64_t', 'np.float64', True), - ('float32', 'float32_t', 'np.float32', True), - ('object', 'object', 'object', True), - ('int32', 'int32_t', 'np.int32', False), - ('int64', 'int64_t', 'np.int64', False), - ('bool', 'uint8_t', 'np.bool', False) + ('float64', 'float64_t', 'np.float64', True, True), + ('float32', 'float32_t', 'np.float32', True, True), + ('object', 
'object', 'object', True, False), + ('int32', 'int32_t', 'np.int32', False, True), + ('int64', 'int64_t', 'np.int64', False, True), + ('bool', 'uint8_t', 'np.bool', False, True) ] output = StringIO() - for name, c_type, dtype, can_hold_na in function_list: + for name, c_type, dtype, can_hold_na, nogil in function_list: if exclude is not None and name in exclude: continue func = template % {'name': name, 'c_type': c_type, 'dtype': dtype, - 'raise_on_na': 'False' if can_hold_na else 'True'} + 'raise_on_na': 'False' if can_hold_na else 'True', + 'nogil' : 'with nogil:' if nogil else '', + 'tab' : ' ' if nogil else '' } output.write(func) + output.write("\n") return output.getvalue() put_2d = [diff_2d_template] diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx index 83dfacba45211..11334516ea555 100644 --- a/pandas/src/generated.pyx +++ b/pandas/src/generated.pyx @@ -14,6 +14,9 @@ from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, from cpython cimport PyFloat_Check cimport cpython +cdef extern from "numpy/npy_math.h": + double NAN "NPY_NAN" + import numpy as np isnan = np.isnan @@ -63,7 +66,6 @@ cpdef ensure_object(object arr): return np.array(arr, dtype=np.object_) - cpdef ensure_float64(object arr): if util.is_array(arr): if ( arr).descr.type_num == NPY_FLOAT64: @@ -73,7 +75,6 @@ cpdef ensure_float64(object arr): else: return np.array(arr, dtype=np.float64) - cpdef ensure_float32(object arr): if util.is_array(arr): if ( arr).descr.type_num == NPY_FLOAT32: @@ -83,7 +84,6 @@ cpdef ensure_float32(object arr): else: return np.array(arr, dtype=np.float32) - cpdef ensure_int8(object arr): if util.is_array(arr): if ( arr).descr.type_num == NPY_INT8: @@ -93,7 +93,6 @@ cpdef ensure_int8(object arr): else: return np.array(arr, dtype=np.int8) - cpdef ensure_int16(object arr): if util.is_array(arr): if ( arr).descr.type_num == NPY_INT16: @@ -103,7 +102,6 @@ cpdef ensure_int16(object arr): else: return np.array(arr, dtype=np.int16) - cpdef 
ensure_int32(object arr): if util.is_array(arr): if ( arr).descr.type_num == NPY_INT32: @@ -113,7 +111,6 @@ cpdef ensure_int32(object arr): else: return np.array(arr, dtype=np.int32) - cpdef ensure_int64(object arr): if util.is_array(arr): if ( arr).descr.type_num == NPY_INT64: @@ -123,7 +120,6 @@ cpdef ensure_int64(object arr): else: return np.array(arr, dtype=np.int64) - @cython.wraparound(False) @cython.boundscheck(False) cpdef map_indices_float64(ndarray[float64_t] index): @@ -1228,6 +1224,7 @@ def backfill_inplace_float64(ndarray[float64_t] values, else: fill_count = 0 val = values[i] + @cython.boundscheck(False) @cython.wraparound(False) def backfill_inplace_float32(ndarray[float32_t] values, @@ -1260,6 +1257,7 @@ def backfill_inplace_float32(ndarray[float32_t] values, else: fill_count = 0 val = values[i] + @cython.boundscheck(False) @cython.wraparound(False) def backfill_inplace_object(ndarray[object] values, @@ -1292,6 +1290,7 @@ def backfill_inplace_object(ndarray[object] values, else: fill_count = 0 val = values[i] + @cython.boundscheck(False) @cython.wraparound(False) def backfill_inplace_int32(ndarray[int32_t] values, @@ -1324,6 +1323,7 @@ def backfill_inplace_int32(ndarray[int32_t] values, else: fill_count = 0 val = values[i] + @cython.boundscheck(False) @cython.wraparound(False) def backfill_inplace_int64(ndarray[int64_t] values, @@ -1356,6 +1356,7 @@ def backfill_inplace_int64(ndarray[int64_t] values, else: fill_count = 0 val = values[i] + @cython.boundscheck(False) @cython.wraparound(False) def backfill_inplace_bool(ndarray[uint8_t] values, @@ -1389,6 +1390,7 @@ def backfill_inplace_bool(ndarray[uint8_t] values, fill_count = 0 val = values[i] + @cython.boundscheck(False) @cython.wraparound(False) def pad_2d_inplace_float64(ndarray[float64_t, ndim=2] values, @@ -1423,6 +1425,7 @@ def pad_2d_inplace_float64(ndarray[float64_t, ndim=2] values, else: fill_count = 0 val = values[j, i] + @cython.boundscheck(False) @cython.wraparound(False) def 
pad_2d_inplace_float32(ndarray[float32_t, ndim=2] values, @@ -1457,6 +1460,7 @@ def pad_2d_inplace_float32(ndarray[float32_t, ndim=2] values, else: fill_count = 0 val = values[j, i] + @cython.boundscheck(False) @cython.wraparound(False) def pad_2d_inplace_object(ndarray[object, ndim=2] values, @@ -1491,6 +1495,7 @@ def pad_2d_inplace_object(ndarray[object, ndim=2] values, else: fill_count = 0 val = values[j, i] + @cython.boundscheck(False) @cython.wraparound(False) def pad_2d_inplace_int32(ndarray[int32_t, ndim=2] values, @@ -1525,6 +1530,7 @@ def pad_2d_inplace_int32(ndarray[int32_t, ndim=2] values, else: fill_count = 0 val = values[j, i] + @cython.boundscheck(False) @cython.wraparound(False) def pad_2d_inplace_int64(ndarray[int64_t, ndim=2] values, @@ -1559,6 +1565,7 @@ def pad_2d_inplace_int64(ndarray[int64_t, ndim=2] values, else: fill_count = 0 val = values[j, i] + @cython.boundscheck(False) @cython.wraparound(False) def pad_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, @@ -1594,6 +1601,7 @@ def pad_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, fill_count = 0 val = values[j, i] + @cython.boundscheck(False) @cython.wraparound(False) def backfill_2d_inplace_float64(ndarray[float64_t, ndim=2] values, @@ -1628,6 +1636,7 @@ def backfill_2d_inplace_float64(ndarray[float64_t, ndim=2] values, else: fill_count = 0 val = values[j, i] + @cython.boundscheck(False) @cython.wraparound(False) def backfill_2d_inplace_float32(ndarray[float32_t, ndim=2] values, @@ -1662,6 +1671,7 @@ def backfill_2d_inplace_float32(ndarray[float32_t, ndim=2] values, else: fill_count = 0 val = values[j, i] + @cython.boundscheck(False) @cython.wraparound(False) def backfill_2d_inplace_object(ndarray[object, ndim=2] values, @@ -1696,6 +1706,7 @@ def backfill_2d_inplace_object(ndarray[object, ndim=2] values, else: fill_count = 0 val = values[j, i] + @cython.boundscheck(False) @cython.wraparound(False) def backfill_2d_inplace_int32(ndarray[int32_t, ndim=2] values, @@ -1730,6 +1741,7 @@ def 
backfill_2d_inplace_int32(ndarray[int32_t, ndim=2] values, else: fill_count = 0 val = values[j, i] + @cython.boundscheck(False) @cython.wraparound(False) def backfill_2d_inplace_int64(ndarray[int64_t, ndim=2] values, @@ -1764,6 +1776,7 @@ def backfill_2d_inplace_int64(ndarray[int64_t, ndim=2] values, else: fill_count = 0 val = values[j, i] + @cython.boundscheck(False) @cython.wraparound(False) def backfill_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, @@ -1799,18 +1812,18 @@ def backfill_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, fill_count = 0 val = values[j, i] + @cython.boundscheck(False) @cython.wraparound(False) def is_monotonic_float64(ndarray[float64_t] arr, bint timelike): ''' Returns ------- - is_monotonic_inc, is_monotonic_dec, is_unique + is_monotonic_inc, is_monotonic_dec ''' cdef: Py_ssize_t i, n float64_t prev, cur - bint is_unique = 1 bint is_monotonic_inc = 1 bint is_monotonic_dec = 1 @@ -1819,45 +1832,52 @@ def is_monotonic_float64(ndarray[float64_t] arr, bint timelike): if n == 1: if arr[0] != arr[0] or (timelike and arr[0] == iNaT): # single value is NaN - return False, False, True + return False, False else: - return True, True, True + return True, True elif n < 2: - return True, True, True + return True, True if timelike and arr[0] == iNaT: - return False, False, None + return False, False + + with nogil: + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if timelike and cur == iNaT: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if cur < prev: + is_monotonic_inc = 0 + elif cur > prev: + is_monotonic_dec = 0 + elif cur == prev: + pass # is_unique = 0 + else: + # cur or prev is NaN + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if not is_monotonic_inc and not is_monotonic_dec: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + prev = cur + return is_monotonic_inc, is_monotonic_dec - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - return False, False, None - if cur < 
prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - is_unique = 0 - else: - # cur or prev is NaN - return False, False, None - if not is_monotonic_inc and not is_monotonic_dec: - return False, False, None - prev = cur - return is_monotonic_inc, is_monotonic_dec, is_unique @cython.boundscheck(False) @cython.wraparound(False) def is_monotonic_float32(ndarray[float32_t] arr, bint timelike): ''' Returns ------- - is_monotonic_inc, is_monotonic_dec, is_unique + is_monotonic_inc, is_monotonic_dec ''' cdef: Py_ssize_t i, n float32_t prev, cur - bint is_unique = 1 bint is_monotonic_inc = 1 bint is_monotonic_dec = 1 @@ -1866,45 +1886,52 @@ def is_monotonic_float32(ndarray[float32_t] arr, bint timelike): if n == 1: if arr[0] != arr[0] or (timelike and arr[0] == iNaT): # single value is NaN - return False, False, True + return False, False else: - return True, True, True + return True, True elif n < 2: - return True, True, True + return True, True if timelike and arr[0] == iNaT: - return False, False, None + return False, False + + with nogil: + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if timelike and cur == iNaT: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if cur < prev: + is_monotonic_inc = 0 + elif cur > prev: + is_monotonic_dec = 0 + elif cur == prev: + pass # is_unique = 0 + else: + # cur or prev is NaN + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if not is_monotonic_inc and not is_monotonic_dec: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + prev = cur + return is_monotonic_inc, is_monotonic_dec - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - return False, False, None - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - is_unique = 0 - else: - # cur or prev is NaN - return False, False, None - if not is_monotonic_inc and not is_monotonic_dec: - return False, False, None - prev = cur - return 
is_monotonic_inc, is_monotonic_dec, is_unique @cython.boundscheck(False) @cython.wraparound(False) def is_monotonic_object(ndarray[object] arr, bint timelike): ''' Returns ------- - is_monotonic_inc, is_monotonic_dec, is_unique + is_monotonic_inc, is_monotonic_dec ''' cdef: Py_ssize_t i, n object prev, cur - bint is_unique = 1 bint is_monotonic_inc = 1 bint is_monotonic_dec = 1 @@ -1913,45 +1940,52 @@ def is_monotonic_object(ndarray[object] arr, bint timelike): if n == 1: if arr[0] != arr[0] or (timelike and arr[0] == iNaT): # single value is NaN - return False, False, True + return False, False else: - return True, True, True + return True, True elif n < 2: - return True, True, True + return True, True if timelike and arr[0] == iNaT: - return False, False, None + return False, False + prev = arr[0] for i in range(1, n): cur = arr[i] if timelike and cur == iNaT: - return False, False, None + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break if cur < prev: is_monotonic_inc = 0 elif cur > prev: is_monotonic_dec = 0 elif cur == prev: - is_unique = 0 + pass # is_unique = 0 else: # cur or prev is NaN - return False, False, None + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break if not is_monotonic_inc and not is_monotonic_dec: - return False, False, None + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break prev = cur - return is_monotonic_inc, is_monotonic_dec, is_unique + return is_monotonic_inc, is_monotonic_dec + @cython.boundscheck(False) @cython.wraparound(False) def is_monotonic_int32(ndarray[int32_t] arr, bint timelike): ''' Returns ------- - is_monotonic_inc, is_monotonic_dec, is_unique + is_monotonic_inc, is_monotonic_dec ''' cdef: Py_ssize_t i, n int32_t prev, cur - bint is_unique = 1 bint is_monotonic_inc = 1 bint is_monotonic_dec = 1 @@ -1960,45 +1994,52 @@ def is_monotonic_int32(ndarray[int32_t] arr, bint timelike): if n == 1: if arr[0] != arr[0] or (timelike and arr[0] == iNaT): # single value is NaN - return False, False, True + return False, False 
else: - return True, True, True + return True, True elif n < 2: - return True, True, True + return True, True if timelike and arr[0] == iNaT: - return False, False, None + return False, False + + with nogil: + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if timelike and cur == iNaT: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if cur < prev: + is_monotonic_inc = 0 + elif cur > prev: + is_monotonic_dec = 0 + elif cur == prev: + pass # is_unique = 0 + else: + # cur or prev is NaN + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if not is_monotonic_inc and not is_monotonic_dec: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + prev = cur + return is_monotonic_inc, is_monotonic_dec - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - return False, False, None - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - is_unique = 0 - else: - # cur or prev is NaN - return False, False, None - if not is_monotonic_inc and not is_monotonic_dec: - return False, False, None - prev = cur - return is_monotonic_inc, is_monotonic_dec, is_unique @cython.boundscheck(False) @cython.wraparound(False) def is_monotonic_int64(ndarray[int64_t] arr, bint timelike): ''' Returns ------- - is_monotonic_inc, is_monotonic_dec, is_unique + is_monotonic_inc, is_monotonic_dec ''' cdef: Py_ssize_t i, n int64_t prev, cur - bint is_unique = 1 bint is_monotonic_inc = 1 bint is_monotonic_dec = 1 @@ -2007,45 +2048,52 @@ def is_monotonic_int64(ndarray[int64_t] arr, bint timelike): if n == 1: if arr[0] != arr[0] or (timelike and arr[0] == iNaT): # single value is NaN - return False, False, True + return False, False else: - return True, True, True + return True, True elif n < 2: - return True, True, True + return True, True if timelike and arr[0] == iNaT: - return False, False, None + return False, False + + with nogil: + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if timelike and 
cur == iNaT: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if cur < prev: + is_monotonic_inc = 0 + elif cur > prev: + is_monotonic_dec = 0 + elif cur == prev: + pass # is_unique = 0 + else: + # cur or prev is NaN + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if not is_monotonic_inc and not is_monotonic_dec: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + prev = cur + return is_monotonic_inc, is_monotonic_dec - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - return False, False, None - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - is_unique = 0 - else: - # cur or prev is NaN - return False, False, None - if not is_monotonic_inc and not is_monotonic_dec: - return False, False, None - prev = cur - return is_monotonic_inc, is_monotonic_dec, is_unique @cython.boundscheck(False) @cython.wraparound(False) def is_monotonic_bool(ndarray[uint8_t] arr, bint timelike): ''' Returns ------- - is_monotonic_inc, is_monotonic_dec, is_unique + is_monotonic_inc, is_monotonic_dec ''' cdef: Py_ssize_t i, n uint8_t prev, cur - bint is_unique = 1 bint is_monotonic_inc = 1 bint is_monotonic_dec = 1 @@ -2054,33 +2102,41 @@ def is_monotonic_bool(ndarray[uint8_t] arr, bint timelike): if n == 1: if arr[0] != arr[0] or (timelike and arr[0] == iNaT): # single value is NaN - return False, False, True + return False, False else: - return True, True, True + return True, True elif n < 2: - return True, True, True + return True, True if timelike and arr[0] == iNaT: - return False, False, None + return False, False + + with nogil: + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if timelike and cur == iNaT: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if cur < prev: + is_monotonic_inc = 0 + elif cur > prev: + is_monotonic_dec = 0 + elif cur == prev: + pass # is_unique = 0 + else: + # cur or prev is NaN + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if 
not is_monotonic_inc and not is_monotonic_dec: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + prev = cur + return is_monotonic_inc, is_monotonic_dec - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - return False, False, None - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - is_unique = 0 - else: - # cur or prev is NaN - return False, False, None - if not is_monotonic_inc and not is_monotonic_dec: - return False, False, None - prev = cur - return is_monotonic_inc, is_monotonic_dec, is_unique @cython.wraparound(False) @cython.boundscheck(False) @@ -2342,37 +2398,45 @@ def arrmap_bool(ndarray[uint8_t] index, object func): return maybe_convert_objects(result) + @cython.wraparound(False) -def take_1d_bool_bool(ndarray[uint8_t] values, - ndarray[int64_t] indexer, - ndarray[uint8_t] out, +@cython.boundscheck(False) +def take_1d_bool_bool(uint8_t[:] values, + int64_t[:] indexer, + uint8_t[:] out, fill_value=np.nan): cdef: Py_ssize_t i, n, idx uint8_t fv - n = len(indexer) + n = indexer.shape[0] fv = fill_value - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + @cython.wraparound(False) -def take_1d_bool_object(ndarray[uint8_t] values, - ndarray[int64_t] indexer, - ndarray[object] out, +@cython.boundscheck(False) +def take_1d_bool_object(uint8_t[:] values, + int64_t[:] indexer, + object[:] out, fill_value=np.nan): cdef: Py_ssize_t i, n, idx object fv - n = len(indexer) + n = indexer.shape[0] fv = fill_value + + for i from 0 <= i < n: idx = indexer[i] if idx == -1: @@ -2380,18 +2444,22 @@ def take_1d_bool_object(ndarray[uint8_t] values, else: out[i] = True if values[idx] > 0 else False + @cython.wraparound(False) -def take_1d_int8_int8(ndarray[int8_t] values, - ndarray[int64_t] indexer, - 
ndarray[int8_t] out, +@cython.boundscheck(False) +def take_1d_int8_int8(int8_t[:] values, + int64_t[:] indexer, + int8_t[:] out, fill_value=np.nan): cdef: Py_ssize_t i, n, idx int8_t fv - n = len(indexer) + n = indexer.shape[0] fv = fill_value + + for i from 0 <= i < n: idx = indexer[i] if idx == -1: @@ -2399,303 +2467,367 @@ def take_1d_int8_int8(ndarray[int8_t] values, else: out[i] = values[idx] + @cython.wraparound(False) -def take_1d_int8_int32(ndarray[int8_t] values, - ndarray[int64_t] indexer, - ndarray[int32_t] out, +@cython.boundscheck(False) +def take_1d_int8_int32(int8_t[:] values, + int64_t[:] indexer, + int32_t[:] out, fill_value=np.nan): cdef: Py_ssize_t i, n, idx int32_t fv - n = len(indexer) + n = indexer.shape[0] fv = fill_value - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + @cython.wraparound(False) -def take_1d_int8_int64(ndarray[int8_t] values, - ndarray[int64_t] indexer, - ndarray[int64_t] out, +@cython.boundscheck(False) +def take_1d_int8_int64(int8_t[:] values, + int64_t[:] indexer, + int64_t[:] out, fill_value=np.nan): cdef: Py_ssize_t i, n, idx int64_t fv - n = len(indexer) + n = indexer.shape[0] fv = fill_value - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + @cython.wraparound(False) -def take_1d_int8_float64(ndarray[int8_t] values, - ndarray[int64_t] indexer, - ndarray[float64_t] out, +@cython.boundscheck(False) +def take_1d_int8_float64(int8_t[:] values, + int64_t[:] indexer, + float64_t[:] out, fill_value=np.nan): cdef: Py_ssize_t i, n, idx float64_t fv - n = len(indexer) + n = indexer.shape[0] fv = fill_value - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv 
- else: - out[i] = values[idx] + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + @cython.wraparound(False) -def take_1d_int16_int16(ndarray[int16_t] values, - ndarray[int64_t] indexer, - ndarray[int16_t] out, +@cython.boundscheck(False) +def take_1d_int16_int16(int16_t[:] values, + int64_t[:] indexer, + int16_t[:] out, fill_value=np.nan): cdef: Py_ssize_t i, n, idx int16_t fv - n = len(indexer) + n = indexer.shape[0] fv = fill_value - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + @cython.wraparound(False) -def take_1d_int16_int32(ndarray[int16_t] values, - ndarray[int64_t] indexer, - ndarray[int32_t] out, +@cython.boundscheck(False) +def take_1d_int16_int32(int16_t[:] values, + int64_t[:] indexer, + int32_t[:] out, fill_value=np.nan): cdef: Py_ssize_t i, n, idx int32_t fv - n = len(indexer) + n = indexer.shape[0] fv = fill_value - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + @cython.wraparound(False) -def take_1d_int16_int64(ndarray[int16_t] values, - ndarray[int64_t] indexer, - ndarray[int64_t] out, +@cython.boundscheck(False) +def take_1d_int16_int64(int16_t[:] values, + int64_t[:] indexer, + int64_t[:] out, fill_value=np.nan): cdef: Py_ssize_t i, n, idx int64_t fv - n = len(indexer) + n = indexer.shape[0] fv = fill_value - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + @cython.wraparound(False) -def take_1d_int16_float64(ndarray[int16_t] values, - 
ndarray[int64_t] indexer, - ndarray[float64_t] out, +@cython.boundscheck(False) +def take_1d_int16_float64(int16_t[:] values, + int64_t[:] indexer, + float64_t[:] out, fill_value=np.nan): cdef: Py_ssize_t i, n, idx float64_t fv - n = len(indexer) + n = indexer.shape[0] fv = fill_value - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + @cython.wraparound(False) -def take_1d_int32_int32(ndarray[int32_t] values, - ndarray[int64_t] indexer, - ndarray[int32_t] out, +@cython.boundscheck(False) +def take_1d_int32_int32(int32_t[:] values, + int64_t[:] indexer, + int32_t[:] out, fill_value=np.nan): cdef: Py_ssize_t i, n, idx int32_t fv - n = len(indexer) + n = indexer.shape[0] fv = fill_value - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + @cython.wraparound(False) -def take_1d_int32_int64(ndarray[int32_t] values, - ndarray[int64_t] indexer, - ndarray[int64_t] out, +@cython.boundscheck(False) +def take_1d_int32_int64(int32_t[:] values, + int64_t[:] indexer, + int64_t[:] out, fill_value=np.nan): cdef: Py_ssize_t i, n, idx int64_t fv - n = len(indexer) + n = indexer.shape[0] fv = fill_value - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + @cython.wraparound(False) -def take_1d_int32_float64(ndarray[int32_t] values, - ndarray[int64_t] indexer, - ndarray[float64_t] out, +@cython.boundscheck(False) +def take_1d_int32_float64(int32_t[:] values, + int64_t[:] indexer, + float64_t[:] out, fill_value=np.nan): cdef: Py_ssize_t i, n, idx float64_t fv - n = 
len(indexer) + n = indexer.shape[0] fv = fill_value - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] -@cython.wraparound(False) -def take_1d_int64_int64(ndarray[int64_t] values, - ndarray[int64_t] indexer, - ndarray[int64_t] out, - fill_value=np.nan): - cdef: + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_int64_int64(int64_t[:] values, + int64_t[:] indexer, + int64_t[:] out, + fill_value=np.nan): + cdef: Py_ssize_t i, n, idx int64_t fv - n = len(indexer) + n = indexer.shape[0] fv = fill_value - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + @cython.wraparound(False) -def take_1d_int64_float64(ndarray[int64_t] values, - ndarray[int64_t] indexer, - ndarray[float64_t] out, +@cython.boundscheck(False) +def take_1d_int64_float64(int64_t[:] values, + int64_t[:] indexer, + float64_t[:] out, fill_value=np.nan): cdef: Py_ssize_t i, n, idx float64_t fv - n = len(indexer) + n = indexer.shape[0] fv = fill_value - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + @cython.wraparound(False) -def take_1d_float32_float32(ndarray[float32_t] values, - ndarray[int64_t] indexer, - ndarray[float32_t] out, +@cython.boundscheck(False) +def take_1d_float32_float32(float32_t[:] values, + int64_t[:] indexer, + float32_t[:] out, fill_value=np.nan): cdef: Py_ssize_t i, n, idx float32_t fv - n = len(indexer) + n = indexer.shape[0] fv = fill_value - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] 
+ + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + @cython.wraparound(False) -def take_1d_float32_float64(ndarray[float32_t] values, - ndarray[int64_t] indexer, - ndarray[float64_t] out, +@cython.boundscheck(False) +def take_1d_float32_float64(float32_t[:] values, + int64_t[:] indexer, + float64_t[:] out, fill_value=np.nan): cdef: Py_ssize_t i, n, idx float64_t fv - n = len(indexer) + n = indexer.shape[0] fv = fill_value - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + @cython.wraparound(False) -def take_1d_float64_float64(ndarray[float64_t] values, - ndarray[int64_t] indexer, - ndarray[float64_t] out, +@cython.boundscheck(False) +def take_1d_float64_float64(float64_t[:] values, + int64_t[:] indexer, + float64_t[:] out, fill_value=np.nan): cdef: Py_ssize_t i, n, idx float64_t fv - n = len(indexer) + n = indexer.shape[0] fv = fill_value - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + @cython.wraparound(False) -def take_1d_object_object(ndarray[object] values, - ndarray[int64_t] indexer, - ndarray[object] out, +@cython.boundscheck(False) +def take_1d_object_object(object[:] values, + int64_t[:] indexer, + object[:] out, fill_value=np.nan): cdef: Py_ssize_t i, n, idx object fv - n = len(indexer) + n = indexer.shape[0] fv = fill_value + + for i from 0 <= i < n: idx = indexer[i] if idx == -1: @@ -2750,7 +2882,6 @@ cdef inline take_2d_axis0_bool_bool_memview(uint8_t[:, :] values, out[i, j] = values[idx, j] - @cython.wraparound(False) @cython.boundscheck(False) def take_2d_axis0_bool_bool(ndarray[uint8_t, ndim=2] values, @@ -2851,7 +2982,6 @@ cdef 
inline take_2d_axis0_bool_object_memview(uint8_t[:, :] values, out[i, j] = True if values[idx, j] > 0 else False - @cython.wraparound(False) @cython.boundscheck(False) def take_2d_axis0_bool_object(ndarray[uint8_t, ndim=2] values, @@ -2952,7 +3082,6 @@ cdef inline take_2d_axis0_int8_int8_memview(int8_t[:, :] values, out[i, j] = values[idx, j] - @cython.wraparound(False) @cython.boundscheck(False) def take_2d_axis0_int8_int8(ndarray[int8_t, ndim=2] values, @@ -3053,7 +3182,6 @@ cdef inline take_2d_axis0_int8_int32_memview(int8_t[:, :] values, out[i, j] = values[idx, j] - @cython.wraparound(False) @cython.boundscheck(False) def take_2d_axis0_int8_int32(ndarray[int8_t, ndim=2] values, @@ -3154,7 +3282,6 @@ cdef inline take_2d_axis0_int8_int64_memview(int8_t[:, :] values, out[i, j] = values[idx, j] - @cython.wraparound(False) @cython.boundscheck(False) def take_2d_axis0_int8_int64(ndarray[int8_t, ndim=2] values, @@ -3255,7 +3382,6 @@ cdef inline take_2d_axis0_int8_float64_memview(int8_t[:, :] values, out[i, j] = values[idx, j] - @cython.wraparound(False) @cython.boundscheck(False) def take_2d_axis0_int8_float64(ndarray[int8_t, ndim=2] values, @@ -3356,7 +3482,6 @@ cdef inline take_2d_axis0_int16_int16_memview(int16_t[:, :] values, out[i, j] = values[idx, j] - @cython.wraparound(False) @cython.boundscheck(False) def take_2d_axis0_int16_int16(ndarray[int16_t, ndim=2] values, @@ -3457,7 +3582,6 @@ cdef inline take_2d_axis0_int16_int32_memview(int16_t[:, :] values, out[i, j] = values[idx, j] - @cython.wraparound(False) @cython.boundscheck(False) def take_2d_axis0_int16_int32(ndarray[int16_t, ndim=2] values, @@ -3558,7 +3682,6 @@ cdef inline take_2d_axis0_int16_int64_memview(int16_t[:, :] values, out[i, j] = values[idx, j] - @cython.wraparound(False) @cython.boundscheck(False) def take_2d_axis0_int16_int64(ndarray[int16_t, ndim=2] values, @@ -3659,7 +3782,6 @@ cdef inline take_2d_axis0_int16_float64_memview(int16_t[:, :] values, out[i, j] = values[idx, j] - 
@cython.wraparound(False) @cython.boundscheck(False) def take_2d_axis0_int16_float64(ndarray[int16_t, ndim=2] values, @@ -3760,7 +3882,6 @@ cdef inline take_2d_axis0_int32_int32_memview(int32_t[:, :] values, out[i, j] = values[idx, j] - @cython.wraparound(False) @cython.boundscheck(False) def take_2d_axis0_int32_int32(ndarray[int32_t, ndim=2] values, @@ -3861,7 +3982,6 @@ cdef inline take_2d_axis0_int32_int64_memview(int32_t[:, :] values, out[i, j] = values[idx, j] - @cython.wraparound(False) @cython.boundscheck(False) def take_2d_axis0_int32_int64(ndarray[int32_t, ndim=2] values, @@ -3962,7 +4082,6 @@ cdef inline take_2d_axis0_int32_float64_memview(int32_t[:, :] values, out[i, j] = values[idx, j] - @cython.wraparound(False) @cython.boundscheck(False) def take_2d_axis0_int32_float64(ndarray[int32_t, ndim=2] values, @@ -4063,7 +4182,6 @@ cdef inline take_2d_axis0_int64_int64_memview(int64_t[:, :] values, out[i, j] = values[idx, j] - @cython.wraparound(False) @cython.boundscheck(False) def take_2d_axis0_int64_int64(ndarray[int64_t, ndim=2] values, @@ -4164,7 +4282,6 @@ cdef inline take_2d_axis0_int64_float64_memview(int64_t[:, :] values, out[i, j] = values[idx, j] - @cython.wraparound(False) @cython.boundscheck(False) def take_2d_axis0_int64_float64(ndarray[int64_t, ndim=2] values, @@ -4265,7 +4382,6 @@ cdef inline take_2d_axis0_float32_float32_memview(float32_t[:, :] values, out[i, j] = values[idx, j] - @cython.wraparound(False) @cython.boundscheck(False) def take_2d_axis0_float32_float32(ndarray[float32_t, ndim=2] values, @@ -4366,7 +4482,6 @@ cdef inline take_2d_axis0_float32_float64_memview(float32_t[:, :] values, out[i, j] = values[idx, j] - @cython.wraparound(False) @cython.boundscheck(False) def take_2d_axis0_float32_float64(ndarray[float32_t, ndim=2] values, @@ -4467,7 +4582,6 @@ cdef inline take_2d_axis0_float64_float64_memview(float64_t[:, :] values, out[i, j] = values[idx, j] - @cython.wraparound(False) @cython.boundscheck(False) def 
take_2d_axis0_float64_float64(ndarray[float64_t, ndim=2] values, @@ -4568,7 +4682,6 @@ cdef inline take_2d_axis0_object_object_memview(object[:, :] values, out[i, j] = values[idx, j] - @cython.wraparound(False) @cython.boundscheck(False) def take_2d_axis0_object_object(ndarray[object, ndim=2] values, @@ -4686,6 +4799,7 @@ def take_2d_axis1_bool_bool(ndarray[uint8_t, ndim=2] values, out[i, j] = fv else: out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) cdef inline take_2d_axis1_bool_object_memview(uint8_t[:, :] values, @@ -4748,6 +4862,7 @@ def take_2d_axis1_bool_object(ndarray[uint8_t, ndim=2] values, out[i, j] = fv else: out[i, j] = True if values[i, idx] > 0 else False + @cython.wraparound(False) @cython.boundscheck(False) cdef inline take_2d_axis1_int8_int8_memview(int8_t[:, :] values, @@ -4810,6 +4925,7 @@ def take_2d_axis1_int8_int8(ndarray[int8_t, ndim=2] values, out[i, j] = fv else: out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) cdef inline take_2d_axis1_int8_int32_memview(int8_t[:, :] values, @@ -4872,6 +4988,7 @@ def take_2d_axis1_int8_int32(ndarray[int8_t, ndim=2] values, out[i, j] = fv else: out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) cdef inline take_2d_axis1_int8_int64_memview(int8_t[:, :] values, @@ -4934,6 +5051,7 @@ def take_2d_axis1_int8_int64(ndarray[int8_t, ndim=2] values, out[i, j] = fv else: out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) cdef inline take_2d_axis1_int8_float64_memview(int8_t[:, :] values, @@ -4996,6 +5114,7 @@ def take_2d_axis1_int8_float64(ndarray[int8_t, ndim=2] values, out[i, j] = fv else: out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) cdef inline take_2d_axis1_int16_int16_memview(int16_t[:, :] values, @@ -5058,6 +5177,7 @@ def take_2d_axis1_int16_int16(ndarray[int16_t, ndim=2] values, out[i, j] = fv else: out[i, j] = values[i, idx] + 
@cython.wraparound(False) @cython.boundscheck(False) cdef inline take_2d_axis1_int16_int32_memview(int16_t[:, :] values, @@ -5120,6 +5240,7 @@ def take_2d_axis1_int16_int32(ndarray[int16_t, ndim=2] values, out[i, j] = fv else: out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) cdef inline take_2d_axis1_int16_int64_memview(int16_t[:, :] values, @@ -5182,6 +5303,7 @@ def take_2d_axis1_int16_int64(ndarray[int16_t, ndim=2] values, out[i, j] = fv else: out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) cdef inline take_2d_axis1_int16_float64_memview(int16_t[:, :] values, @@ -5244,6 +5366,7 @@ def take_2d_axis1_int16_float64(ndarray[int16_t, ndim=2] values, out[i, j] = fv else: out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) cdef inline take_2d_axis1_int32_int32_memview(int32_t[:, :] values, @@ -5306,6 +5429,7 @@ def take_2d_axis1_int32_int32(ndarray[int32_t, ndim=2] values, out[i, j] = fv else: out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) cdef inline take_2d_axis1_int32_int64_memview(int32_t[:, :] values, @@ -5368,6 +5492,7 @@ def take_2d_axis1_int32_int64(ndarray[int32_t, ndim=2] values, out[i, j] = fv else: out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) cdef inline take_2d_axis1_int32_float64_memview(int32_t[:, :] values, @@ -5430,6 +5555,7 @@ def take_2d_axis1_int32_float64(ndarray[int32_t, ndim=2] values, out[i, j] = fv else: out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) cdef inline take_2d_axis1_int64_int64_memview(int64_t[:, :] values, @@ -5492,6 +5618,7 @@ def take_2d_axis1_int64_int64(ndarray[int64_t, ndim=2] values, out[i, j] = fv else: out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) cdef inline take_2d_axis1_int64_float64_memview(int64_t[:, :] values, @@ -5554,6 +5681,7 @@ def take_2d_axis1_int64_float64(ndarray[int64_t, 
ndim=2] values, out[i, j] = fv else: out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) cdef inline take_2d_axis1_float32_float32_memview(float32_t[:, :] values, @@ -5616,6 +5744,7 @@ def take_2d_axis1_float32_float32(ndarray[float32_t, ndim=2] values, out[i, j] = fv else: out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) cdef inline take_2d_axis1_float32_float64_memview(float32_t[:, :] values, @@ -5678,6 +5807,7 @@ def take_2d_axis1_float32_float64(ndarray[float32_t, ndim=2] values, out[i, j] = fv else: out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) cdef inline take_2d_axis1_float64_float64_memview(float64_t[:, :] values, @@ -5740,6 +5870,7 @@ def take_2d_axis1_float64_float64(ndarray[float64_t, ndim=2] values, out[i, j] = fv else: out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) cdef inline take_2d_axis1_object_object_memview(object[:, :] values, @@ -5803,6 +5934,7 @@ def take_2d_axis1_object_object(ndarray[object, ndim=2] values, else: out[i, j] = values[i, idx] + @cython.wraparound(False) @cython.boundscheck(False) def take_2d_multi_bool_bool(ndarray[uint8_t, ndim=2] values, @@ -6379,6 +6511,7 @@ def diff_2d_float64(ndarray[float64_t, ndim=2] arr, for i in range(sx): for j in range(start, stop): out[i, j] = arr[i, j] - arr[i, j - periods] + @cython.boundscheck(False) @cython.wraparound(False) def diff_2d_float32(ndarray[float32_t, ndim=2] arr, @@ -6422,6 +6555,7 @@ def diff_2d_float32(ndarray[float32_t, ndim=2] arr, for i in range(sx): for j in range(start, stop): out[i, j] = arr[i, j] - arr[i, j - periods] + @cython.boundscheck(False) @cython.wraparound(False) def diff_2d_int8(ndarray[int8_t, ndim=2] arr, @@ -6465,6 +6599,7 @@ def diff_2d_int8(ndarray[int8_t, ndim=2] arr, for i in range(sx): for j in range(start, stop): out[i, j] = arr[i, j] - arr[i, j - periods] + @cython.boundscheck(False) @cython.wraparound(False) def 
diff_2d_int16(ndarray[int16_t, ndim=2] arr, @@ -6508,6 +6643,7 @@ def diff_2d_int16(ndarray[int16_t, ndim=2] arr, for i in range(sx): for j in range(start, stop): out[i, j] = arr[i, j] - arr[i, j - periods] + @cython.boundscheck(False) @cython.wraparound(False) def diff_2d_int32(ndarray[int32_t, ndim=2] arr, @@ -6551,6 +6687,7 @@ def diff_2d_int32(ndarray[int32_t, ndim=2] arr, for i in range(sx): for j in range(start, stop): out[i, j] = arr[i, j] - arr[i, j - periods] + @cython.boundscheck(False) @cython.wraparound(False) def diff_2d_int64(ndarray[int64_t, ndim=2] arr, @@ -6595,8 +6732,9 @@ def diff_2d_int64(ndarray[int64_t, ndim=2] arr, for j in range(start, stop): out[i, j] = arr[i, j] - arr[i, j - periods] -@cython.boundscheck(False) + @cython.wraparound(False) +@cython.boundscheck(False) def group_add_float64(ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float64_t, ndim=2] values, @@ -6605,7 +6743,7 @@ def group_add_float64(ndarray[float64_t, ndim=2] out, Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, N, K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) float64_t val, count ndarray[float64_t, ndim=2] sumx, nobs @@ -6617,42 +6755,49 @@ def group_add_float64(ndarray[float64_t, ndim=2] out, N, K = ( values).shape - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + with nogil: + + if K > 1: + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + + else: + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] # not nan if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + nobs[lab, 0] += 1 + sumx[lab, 0] += val - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val 
== val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = sumx[i, j] - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = sumx[i, j] -@cython.boundscheck(False) @cython.wraparound(False) +@cython.boundscheck(False) def group_add_float32(ndarray[float32_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float32_t, ndim=2] values, @@ -6661,7 +6806,7 @@ def group_add_float32(ndarray[float32_t, ndim=2] out, Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, N, K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) float32_t val, count ndarray[float32_t, ndim=2] sumx, nobs @@ -6673,43 +6818,50 @@ def group_add_float32(ndarray[float32_t, ndim=2] out, N, K = ( values).shape - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + with nogil: + + if K > 1: + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + + else: + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] # not nan if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] + nobs[lab, 0] += 1 + sumx[lab, 0] += val - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = sumx[i, j] - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = sumx[i, j] -@cython.boundscheck(False) @cython.wraparound(False) +@cython.boundscheck(False) def group_add_bin_float64(ndarray[float64_t, 
ndim=2] out, ndarray[int64_t] counts, ndarray[float64_t, ndim=2] values, @@ -6733,41 +6885,45 @@ def group_add_bin_float64(ndarray[float64_t, ndim=2] out, ngroups = len(bins) + 1 N, K = ( values).shape - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + with nogil: - counts[b] += 1 - for j in range(K): - val = values[i, j] + b = 0 + if K > 1: + + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] # not nan if val == val: - nobs[b, j] += 1 - sumx[b, j] += val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + nobs[b, 0] += 1 + sumx[b, 0] += val - counts[b] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[b, 0] += 1 - sumx[b, 0] += val + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = sumx[i, j] - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = sumx[i, j] -@cython.boundscheck(False) @cython.wraparound(False) +@cython.boundscheck(False) def group_add_bin_float32(ndarray[float32_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float32_t, ndim=2] values, @@ -6791,42 +6947,46 @@ def group_add_bin_float32(ndarray[float32_t, ndim=2] out, ngroups = len(bins) + 1 N, K = ( values).shape - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + with nogil: - counts[b] += 1 - for j in range(K): - val = values[i, j] + b = 0 + if K > 1: + + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + else: + for i in range(N): + while b < 
ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] # not nan if val == val: - nobs[b, j] += 1 - sumx[b, j] += val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + nobs[b, 0] += 1 + sumx[b, 0] += val - counts[b] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[b, 0] += 1 - sumx[b, 0] += val + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = sumx[i, j] - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = sumx[i, j] -@cython.boundscheck(False) @cython.wraparound(False) +@cython.boundscheck(False) def group_prod_float64(ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float64_t, ndim=2] values, @@ -6835,7 +6995,7 @@ def group_prod_float64(ndarray[float64_t, ndim=2] out, Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, N, K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) float64_t val, count ndarray[float64_t, ndim=2] prodx, nobs @@ -6847,42 +7007,44 @@ def group_prod_float64(ndarray[float64_t, ndim=2] out, N, K = ( values).shape - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + prodx[lab, j] *= val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] # not nan if val == val: - nobs[lab, j] += 1 - prodx[lab, j] *= val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] + nobs[lab, 0] += 1 + prodx[lab, 0] *= val - # not nan - if val == val: - nobs[lab, 0] += 1 - prodx[lab, 0] *= val + for i in range(ncounts): + for j in range(K): + if nobs[i, j] 
== 0: + out[i, j] = NAN + else: + out[i, j] = prodx[i, j] - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = prodx[i, j] -@cython.boundscheck(False) @cython.wraparound(False) +@cython.boundscheck(False) def group_prod_float32(ndarray[float32_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float32_t, ndim=2] values, @@ -6891,7 +7053,7 @@ def group_prod_float32(ndarray[float32_t, ndim=2] out, Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, N, K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) float32_t val, count ndarray[float32_t, ndim=2] prodx, nobs @@ -6903,43 +7065,45 @@ def group_prod_float32(ndarray[float32_t, ndim=2] out, N, K = ( values).shape - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + prodx[lab, j] *= val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] # not nan if val == val: - nobs[lab, j] += 1 - prodx[lab, j] *= val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] + nobs[lab, 0] += 1 + prodx[lab, 0] *= val - # not nan - if val == val: - nobs[lab, 0] += 1 - prodx[lab, 0] *= val + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = prodx[i, j] - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = prodx[i, j] -@cython.boundscheck(False) @cython.wraparound(False) +@cython.boundscheck(False) def group_prod_bin_float64(ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float64_t, ndim=2] values, @@ -6963,41 +7127,44 @@ def 
group_prod_bin_float64(ndarray[float64_t, ndim=2] out, ngroups = len(bins) + 1 N, K = ( values).shape - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + with nogil: - counts[b] += 1 - for j in range(K): - val = values[i, j] + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + prodx[b, j] *= val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] # not nan if val == val: - nobs[b, j] += 1 - prodx[b, j] *= val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - val = values[i, 0] + nobs[b, 0] += 1 + prodx[b, 0] *= val - # not nan - if val == val: - nobs[b, 0] += 1 - prodx[b, 0] *= val + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = prodx[i, j] - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = prodx[i, j] -@cython.boundscheck(False) @cython.wraparound(False) +@cython.boundscheck(False) def group_prod_bin_float32(ndarray[float32_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float32_t, ndim=2] values, @@ -7021,39 +7188,42 @@ def group_prod_bin_float32(ndarray[float32_t, ndim=2] out, ngroups = len(bins) + 1 N, K = ( values).shape - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + with nogil: - counts[b] += 1 - for j in range(K): - val = values[i, j] + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + prodx[b, j] *= val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] # not nan if 
val == val: - nobs[b, j] += 1 - prodx[b, j] *= val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - val = values[i, 0] + nobs[b, 0] += 1 + prodx[b, 0] *= val - # not nan - if val == val: - nobs[b, 0] += 1 - prodx[b, 0] *= val + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = prodx[i, j] - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = prodx[i, j] @cython.wraparound(False) @cython.boundscheck(False) @@ -7062,7 +7232,7 @@ def group_var_float64(ndarray[float64_t, ndim=2] out, ndarray[float64_t, ndim=2] values, ndarray[int64_t] labels): cdef: - Py_ssize_t i, j, N, K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) float64_t val, ct ndarray[float64_t, ndim=2] nobs, sumx, sumxx @@ -7075,47 +7245,50 @@ def group_var_float64(ndarray[float64_t, ndim=2] out, N, K = ( values).shape - if K > 1: - for i in range(N): + with nogil: + if K > 1: + for i in range(N): - lab = labels[i] - if lab < 0: - continue + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 + counts[lab] += 1 - for j in range(K): - val = values[i, j] + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + sumxx[lab, j] += val * val + else: + for i in range(N): + + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] # not nan if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - sumxx[lab, j] += val * val - else: - for i in range(N): - - lab = labels[i] - if lab < 0: - continue + nobs[lab, 0] += 1 + sumx[lab, 0] += val + sumxx[lab, 0] += val * val - counts[lab] += 1 - val = values[i, 0] - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - sumxx[lab, 0] += val * val + for i in range(ncounts): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = NAN + else: + out[i, j] = ((ct * sumxx[i, j] - sumx[i, 
j] * sumx[i, j]) / + (ct * ct - ct)) - for i in range(len(counts)): - for j in range(K): - ct = nobs[i, j] - if ct < 2: - out[i, j] = nan - else: - out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / - (ct * ct - ct)) @cython.wraparound(False) @cython.boundscheck(False) def group_var_float32(ndarray[float32_t, ndim=2] out, @@ -7123,7 +7296,7 @@ def group_var_float32(ndarray[float32_t, ndim=2] out, ndarray[float32_t, ndim=2] values, ndarray[int64_t] labels): cdef: - Py_ssize_t i, j, N, K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) float32_t val, ct ndarray[float32_t, ndim=2] nobs, sumx, sumxx @@ -7136,47 +7309,50 @@ def group_var_float32(ndarray[float32_t, ndim=2] out, N, K = ( values).shape - if K > 1: - for i in range(N): + with nogil: + if K > 1: + for i in range(N): - lab = labels[i] - if lab < 0: - continue + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 + counts[lab] += 1 - for j in range(K): - val = values[i, j] + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + sumxx[lab, j] += val * val + else: + for i in range(N): + + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] # not nan if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - sumxx[lab, j] += val * val - else: - for i in range(N): - - lab = labels[i] - if lab < 0: - continue + nobs[lab, 0] += 1 + sumx[lab, 0] += val + sumxx[lab, 0] += val * val - counts[lab] += 1 - val = values[i, 0] - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - sumxx[lab, 0] += val * val + for i in range(ncounts): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = NAN + else: + out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / + (ct * ct - ct)) - for i in range(len(counts)): - for j in range(K): - ct = nobs[i, j] - if ct < 2: - out[i, j] = nan - else: - out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / - (ct * ct - ct)) 
@cython.wraparound(False) @cython.boundscheck(False) @@ -7203,44 +7379,46 @@ def group_var_bin_float64(ndarray[float64_t, ndim=2] out, N, K = ( values).shape - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + with nogil: + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 + counts[b] += 1 - for j in range(K): - val = values[i, j] + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + sumxx[b, j] += val * val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] # not nan if val == val: - nobs[b, j] += 1 - sumx[b, j] += val - sumxx[b, j] += val * val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + nobs[b, 0] += 1 + sumx[b, 0] += val + sumxx[b, 0] += val * val - counts[b] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[b, 0] += 1 - sumx[b, 0] += val - sumxx[b, 0] += val * val + for i in range(ngroups): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = NAN + else: + out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / + (ct * ct - ct)) - for i in range(ngroups): - for j in range(K): - ct = nobs[i, j] - if ct < 2: - out[i, j] = nan - else: - out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / - (ct * ct - ct)) @cython.wraparound(False) @cython.boundscheck(False) def group_var_bin_float32(ndarray[float32_t, ndim=2] out, @@ -7266,44 +7444,46 @@ def group_var_bin_float32(ndarray[float32_t, ndim=2] out, N, K = ( values).shape - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + with nogil: + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 + counts[b] += 1 - for j in range(K): - val = values[i, j] + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + 
nobs[b, j] += 1 + sumx[b, j] += val + sumxx[b, j] += val * val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] # not nan if val == val: - nobs[b, j] += 1 - sumx[b, j] += val - sumxx[b, j] += val * val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + nobs[b, 0] += 1 + sumx[b, 0] += val + sumxx[b, 0] += val * val - counts[b] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[b, 0] += 1 - sumx[b, 0] += val - sumxx[b, 0] += val * val + for i in range(ngroups): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = NAN + else: + out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / + (ct * ct - ct)) - for i in range(ngroups): - for j in range(K): - ct = nobs[i, j] - if ct < 2: - out[i, j] = nan - else: - out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / - (ct * ct - ct)) @cython.wraparound(False) @cython.boundscheck(False) @@ -7312,7 +7492,7 @@ def group_mean_float64(ndarray[float64_t, ndim=2] out, ndarray[float64_t, ndim=2] values, ndarray[int64_t] labels): cdef: - Py_ssize_t i, j, N, K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) float64_t val, count ndarray[float64_t, ndim=2] sumx, nobs @@ -7324,39 +7504,41 @@ def group_mean_float64(ndarray[float64_t, ndim=2] out, N, K = ( values).shape - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + counts[lab] += 1 + for j in range(K): + val = values[i, j] + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] # not nan if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + nobs[lab, 0] += 1 + 
sumx[lab, 0] += val - counts[lab] += 1 - val = values[i, 0] - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val + for i in range(ncounts): + for j in range(K): + count = nobs[i, j] + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = sumx[i, j] / count - for i in range(len(counts)): - for j in range(K): - count = nobs[i, j] - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = sumx[i, j] / count @cython.wraparound(False) @cython.boundscheck(False) def group_mean_float32(ndarray[float32_t, ndim=2] out, @@ -7364,7 +7546,7 @@ def group_mean_float32(ndarray[float32_t, ndim=2] out, ndarray[float32_t, ndim=2] values, ndarray[int64_t] labels): cdef: - Py_ssize_t i, j, N, K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) float32_t val, count ndarray[float32_t, ndim=2] sumx, nobs @@ -7376,41 +7558,44 @@ def group_mean_float32(ndarray[float32_t, ndim=2] out, N, K = ( values).shape - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + counts[lab] += 1 + for j in range(K): + val = values[i, j] + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] # not nan if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + nobs[lab, 0] += 1 + sumx[lab, 0] += val - counts[lab] += 1 - val = values[i, 0] - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val + for i in range(ncounts): + for j in range(K): + count = nobs[i, j] + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = sumx[i, j] / count - for i in range(len(counts)): - for j in range(K): - count = nobs[i, j] - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = sumx[i, j] / count 
+@cython.boundscheck(False) def group_mean_bin_float64(ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float64_t, ndim=2] values, @@ -7431,41 +7616,44 @@ def group_mean_bin_float64(ndarray[float64_t, ndim=2] out, else: ngroups = len(bins) + 1 - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + with nogil: + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 - for j in range(K): - val = values[i, j] + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] # not nan if val == val: - nobs[b, j] += 1 - sumx[b, j] += val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + nobs[b, 0] += 1 + sumx[b, 0] += val - counts[b] += 1 - val = values[i, 0] + for i in range(ngroups): + for j in range(K): + count = nobs[i, j] + if count == 0: + out[i, j] = NAN + else: + out[i, j] = sumx[i, j] / count - # not nan - if val == val: - nobs[b, 0] += 1 - sumx[b, 0] += val - - for i in range(ngroups): - for j in range(K): - count = nobs[i, j] - if count == 0: - out[i, j] = nan - else: - out[i, j] = sumx[i, j] / count +@cython.boundscheck(False) def group_mean_bin_float32(ndarray[float32_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float32_t, ndim=2] values, @@ -7486,40 +7674,42 @@ def group_mean_bin_float32(ndarray[float32_t, ndim=2] out, else: ngroups = len(bins) + 1 - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + with nogil: + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 - for j in range(K): - val = values[i, j] + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + else: + 
for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] # not nan if val == val: - nobs[b, j] += 1 - sumx[b, j] += val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - val = values[i, 0] + nobs[b, 0] += 1 + sumx[b, 0] += val - # not nan - if val == val: - nobs[b, 0] += 1 - sumx[b, 0] += val + for i in range(ngroups): + for j in range(K): + count = nobs[i, j] + if count == 0: + out[i, j] = NAN + else: + out[i, j] = sumx[i, j] / count - for i in range(ngroups): - for j in range(K): - count = nobs[i, j] - if count == 0: - out[i, j] = nan - else: - out[i, j] = sumx[i, j] / count @cython.wraparound(False) @cython.boundscheck(False) @@ -7533,7 +7723,7 @@ def group_ohlc_float64(ndarray[float64_t, ndim=2] out, cdef: Py_ssize_t i, j, N, K, ngroups, b float64_t val, count - float64_t vopen, vhigh, vlow, vclose, NA + float64_t vopen, vhigh, vlow, vclose bint got_first = 0 if len(bins) == 0: @@ -7548,55 +7738,56 @@ def group_ohlc_float64(ndarray[float64_t, ndim=2] out, if out.shape[1] != 4: raise ValueError('Output array must have 4 columns') - NA = np.nan - b = 0 if K > 1: raise NotImplementedError("Argument 'values' must have only " "one dimension") else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - if not got_first: - out[b, 0] = NA - out[b, 1] = NA - out[b, 2] = NA - out[b, 3] = NA - else: - out[b, 0] = vopen - out[b, 1] = vhigh - out[b, 2] = vlow - out[b, 3] = vclose - b += 1 - got_first = 0 - counts[b] += 1 - val = values[i, 0] + with nogil: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + if not got_first: + out[b, 0] = NAN + out[b, 1] = NAN + out[b, 2] = NAN + out[b, 3] = NAN + else: + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose + b += 1 + got_first = 0 - # not nan - if val == val: - if not got_first: - got_first = 1 - vopen = val - vlow = val - vhigh = val - else: - if val < vlow: + counts[b] += 
1 + val = values[i, 0] + + # not nan + if val == val: + if not got_first: + got_first = 1 + vopen = val vlow = val - if val > vhigh: vhigh = val - vclose = val + else: + if val < vlow: + vlow = val + if val > vhigh: + vhigh = val + vclose = val + + if not got_first: + out[b, 0] = NAN + out[b, 1] = NAN + out[b, 2] = NAN + out[b, 3] = NAN + else: + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose - if not got_first: - out[b, 0] = NA - out[b, 1] = NA - out[b, 2] = NA - out[b, 3] = NA - else: - out[b, 0] = vopen - out[b, 1] = vhigh - out[b, 2] = vlow - out[b, 3] = vclose @cython.wraparound(False) @cython.boundscheck(False) def group_ohlc_float32(ndarray[float32_t, ndim=2] out, @@ -7609,7 +7800,7 @@ def group_ohlc_float32(ndarray[float32_t, ndim=2] out, cdef: Py_ssize_t i, j, N, K, ngroups, b float32_t val, count - float32_t vopen, vhigh, vlow, vclose, NA + float32_t vopen, vhigh, vlow, vclose bint got_first = 0 if len(bins) == 0: @@ -7624,58 +7815,59 @@ def group_ohlc_float32(ndarray[float32_t, ndim=2] out, if out.shape[1] != 4: raise ValueError('Output array must have 4 columns') - NA = np.nan - b = 0 if K > 1: raise NotImplementedError("Argument 'values' must have only " "one dimension") else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - if not got_first: - out[b, 0] = NA - out[b, 1] = NA - out[b, 2] = NA - out[b, 3] = NA - else: - out[b, 0] = vopen - out[b, 1] = vhigh - out[b, 2] = vlow - out[b, 3] = vclose - b += 1 - got_first = 0 - counts[b] += 1 - val = values[i, 0] + with nogil: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + if not got_first: + out[b, 0] = NAN + out[b, 1] = NAN + out[b, 2] = NAN + out[b, 3] = NAN + else: + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose + b += 1 + got_first = 0 - # not nan - if val == val: - if not got_first: - got_first = 1 - vopen = val - vlow = val - vhigh = val - else: - if val < vlow: + counts[b] += 1 + val = values[i, 0] + + # 
not nan + if val == val: + if not got_first: + got_first = 1 + vopen = val vlow = val - if val > vhigh: vhigh = val - vclose = val + else: + if val < vlow: + vlow = val + if val > vhigh: + vhigh = val + vclose = val + + if not got_first: + out[b, 0] = NAN + out[b, 1] = NAN + out[b, 2] = NAN + out[b, 3] = NAN + else: + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose - if not got_first: - out[b, 0] = NA - out[b, 1] = NA - out[b, 2] = NA - out[b, 3] = NA - else: - out[b, 0] = vopen - out[b, 1] = vhigh - out[b, 2] = vlow - out[b, 3] = vclose @cython.wraparound(False) -@cython.wraparound(False) +@cython.boundscheck(False) def group_last_float64(ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float64_t, ndim=2] values, @@ -7684,7 +7876,7 @@ def group_last_float64(ndarray[float64_t, ndim=2] out, Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, N, K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) float64_t val, count ndarray[float64_t, ndim=2] resx ndarray[int64_t, ndim=2] nobs @@ -7697,28 +7889,30 @@ def group_last_float64(ndarray[float64_t, ndim=2] out, N, K = ( values).shape - for i in range(N): - lab = labels[i] - if lab < 0: - continue + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + counts[lab] += 1 + for j in range(K): + val = values[i, j] - # not nan - if val == val: - nobs[lab, j] += 1 - resx[lab, j] = val + # not nan + if val == val: + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = resx[i, j] -@cython.wraparound(False) @cython.wraparound(False) +@cython.boundscheck(False) def group_last_float32(ndarray[float32_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float32_t, ndim=2] values, @@ 
-7727,7 +7921,7 @@ def group_last_float32(ndarray[float32_t, ndim=2] out, Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, N, K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) float32_t val, count ndarray[float32_t, ndim=2] resx ndarray[int64_t, ndim=2] nobs @@ -7740,28 +7934,30 @@ def group_last_float32(ndarray[float32_t, ndim=2] out, N, K = ( values).shape - for i in range(N): - lab = labels[i] - if lab < 0: - continue + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + counts[lab] += 1 + for j in range(K): + val = values[i, j] - # not nan - if val == val: - nobs[lab, j] += 1 - resx[lab, j] = val + # not nan + if val == val: + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = resx[i, j] -@cython.wraparound(False) @cython.wraparound(False) +@cython.boundscheck(False) def group_last_int64(ndarray[int64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[int64_t, ndim=2] values, @@ -7770,7 +7966,7 @@ def group_last_int64(ndarray[int64_t, ndim=2] out, Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, N, K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) int64_t val, count ndarray[int64_t, ndim=2] resx ndarray[int64_t, ndim=2] nobs @@ -7783,29 +7979,31 @@ def group_last_int64(ndarray[int64_t, ndim=2] out, N, K = ( values).shape - for i in range(N): - lab = labels[i] - if lab < 0: - continue + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + counts[lab] += 1 + for j in range(K): + val = values[i, j] - # not nan - if val == val: - nobs[lab, j] += 1 - resx[lab, j] = val + # not nan + if val == val: + nobs[lab, j] += 1 + resx[lab, j] = val + + for i 
in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = iNaT + else: + out[i, j] = resx[i, j] - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = iNaT - else: - out[i, j] = resx[i, j] @cython.wraparound(False) -@cython.wraparound(False) +@cython.boundscheck(False) def group_last_bin_float64(ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float64_t, ndim=2] values, @@ -7830,28 +8028,30 @@ def group_last_bin_float64(ndarray[float64_t, ndim=2] out, N, K = ( values).shape - b = 0 - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + with nogil: + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 - for j in range(K): - val = values[i, j] + counts[b] += 1 + for j in range(K): + val = values[i, j] - # not nan - if val == val: - nobs[b, j] += 1 - resx[b, j] = val + # not nan + if val == val: + nobs[b, j] += 1 + resx[b, j] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = resx[i, j] -@cython.wraparound(False) @cython.wraparound(False) +@cython.boundscheck(False) def group_last_bin_float32(ndarray[float32_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float32_t, ndim=2] values, @@ -7876,28 +8076,30 @@ def group_last_bin_float32(ndarray[float32_t, ndim=2] out, N, K = ( values).shape - b = 0 - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + with nogil: + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 - for j in range(K): - val = values[i, j] + counts[b] += 1 + for j in range(K): + val = values[i, j] - # not nan - if val == val: - nobs[b, j] += 1 - resx[b, j] = val + # not nan + if val == val: + nobs[b, j] += 1 + resx[b, j] = val + + for i in range(ngroups): + for j in range(K): + if 
nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = resx[i, j] -@cython.wraparound(False) @cython.wraparound(False) +@cython.boundscheck(False) def group_last_bin_int64(ndarray[int64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[int64_t, ndim=2] values, @@ -7922,29 +8124,31 @@ def group_last_bin_int64(ndarray[int64_t, ndim=2] out, N, K = ( values).shape - b = 0 - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + with nogil: + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 - for j in range(K): - val = values[i, j] + counts[b] += 1 + for j in range(K): + val = values[i, j] - # not nan - if val == val: - nobs[b, j] += 1 - resx[b, j] = val + # not nan + if val == val: + nobs[b, j] += 1 + resx[b, j] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = iNaT + else: + out[i, j] = resx[i, j] - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = iNaT - else: - out[i, j] = resx[i, j] -@cython.boundscheck(False) @cython.wraparound(False) +@cython.boundscheck(False) def group_nth_float64(ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float64_t, ndim=2] values, @@ -7953,7 +8157,7 @@ def group_nth_float64(ndarray[float64_t, ndim=2] out, Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, N, K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) float64_t val, count ndarray[float64_t, ndim=2] resx ndarray[int64_t, ndim=2] nobs @@ -7966,29 +8170,31 @@ def group_nth_float64(ndarray[float64_t, ndim=2] out, N, K = ( values).shape - for i in range(N): - lab = labels[i] - if lab < 0: - continue + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + counts[lab] += 1 + for j in range(K): + val = values[i, j] - # not 
nan - if val == val: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val + # not nan + if val == val: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = resx[i, j] -@cython.boundscheck(False) @cython.wraparound(False) +@cython.boundscheck(False) def group_nth_float32(ndarray[float32_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float32_t, ndim=2] values, @@ -7997,7 +8203,7 @@ def group_nth_float32(ndarray[float32_t, ndim=2] out, Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, N, K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) float32_t val, count ndarray[float32_t, ndim=2] resx ndarray[int64_t, ndim=2] nobs @@ -8010,29 +8216,31 @@ def group_nth_float32(ndarray[float32_t, ndim=2] out, N, K = ( values).shape - for i in range(N): - lab = labels[i] - if lab < 0: - continue + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + counts[lab] += 1 + for j in range(K): + val = values[i, j] - # not nan - if val == val: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val + # not nan + if val == val: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = resx[i, j] -@cython.boundscheck(False) @cython.wraparound(False) +@cython.boundscheck(False) def group_nth_int64(ndarray[int64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[int64_t, ndim=2] values, @@ -8041,7 +8249,7 @@ def group_nth_int64(ndarray[int64_t, ndim=2] out, Only aggregates on 
axis=0 ''' cdef: - Py_ssize_t i, j, N, K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) int64_t val, count ndarray[int64_t, ndim=2] resx ndarray[int64_t, ndim=2] nobs @@ -8054,30 +8262,32 @@ def group_nth_int64(ndarray[int64_t, ndim=2] out, N, K = ( values).shape - for i in range(N): - lab = labels[i] - if lab < 0: - continue + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + counts[lab] += 1 + for j in range(K): + val = values[i, j] - # not nan - if val == val: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val + # not nan + if val == val: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = iNaT + else: + out[i, j] = resx[i, j] - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = iNaT - else: - out[i, j] = resx[i, j] -@cython.boundscheck(False) @cython.wraparound(False) +@cython.boundscheck(False) def group_nth_bin_float64(ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float64_t, ndim=2] values, @@ -8102,29 +8312,31 @@ def group_nth_bin_float64(ndarray[float64_t, ndim=2] out, N, K = ( values).shape - b = 0 - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + with nogil: + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 - for j in range(K): - val = values[i, j] + counts[b] += 1 + for j in range(K): + val = values[i, j] - # not nan - if val == val: - nobs[b, j] += 1 - if nobs[b, j] == rank: - resx[b, j] = val + # not nan + if val == val: + nobs[b, j] += 1 + if nobs[b, j] == rank: + resx[b, j] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = 
resx[i, j] -@cython.boundscheck(False) @cython.wraparound(False) +@cython.boundscheck(False) def group_nth_bin_float32(ndarray[float32_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float32_t, ndim=2] values, @@ -8149,29 +8361,31 @@ def group_nth_bin_float32(ndarray[float32_t, ndim=2] out, N, K = ( values).shape - b = 0 - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + with nogil: + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 - for j in range(K): - val = values[i, j] + counts[b] += 1 + for j in range(K): + val = values[i, j] - # not nan - if val == val: - nobs[b, j] += 1 - if nobs[b, j] == rank: - resx[b, j] = val + # not nan + if val == val: + nobs[b, j] += 1 + if nobs[b, j] == rank: + resx[b, j] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = resx[i, j] -@cython.boundscheck(False) @cython.wraparound(False) +@cython.boundscheck(False) def group_nth_bin_int64(ndarray[int64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[int64_t, ndim=2] values, @@ -8196,27 +8410,29 @@ def group_nth_bin_int64(ndarray[int64_t, ndim=2] out, N, K = ( values).shape - b = 0 - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + with nogil: + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 - for j in range(K): - val = values[i, j] + counts[b] += 1 + for j in range(K): + val = values[i, j] - # not nan - if val == val: - nobs[b, j] += 1 - if nobs[b, j] == rank: - resx[b, j] = val + # not nan + if val == val: + nobs[b, j] += 1 + if nobs[b, j] == rank: + resx[b, j] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = iNaT + else: + out[i, j] = resx[i, j] - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - 
out[i, j] = iNaT - else: - out[i, j] = resx[i, j] @cython.wraparound(False) @cython.boundscheck(False) @@ -8228,7 +8444,7 @@ def group_min_float64(ndarray[float64_t, ndim=2] out, Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, N, K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) float64_t val, count ndarray[float64_t, ndim=2] minx, nobs @@ -8242,42 +8458,44 @@ def group_min_float64(ndarray[float64_t, ndim=2] out, N, K = ( values).shape - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] # not nan if val == val: - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] + nobs[lab, 0] += 1 + if val < minx[lab, 0]: + minx[lab, 0] = val - # not nan - if val == val: - nobs[lab, 0] += 1 - if val < minx[lab, 0]: - minx[lab, 0] = val + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = minx[i, j] - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = minx[i, j] @cython.wraparound(False) @cython.boundscheck(False) def group_min_float32(ndarray[float32_t, ndim=2] out, @@ -8288,7 +8506,7 @@ def group_min_float32(ndarray[float32_t, ndim=2] out, Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, N, K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) float32_t val, count ndarray[float32_t, ndim=2] minx, nobs @@ -8302,42 +8520,44 @@ def group_min_float32(ndarray[float32_t, ndim=2] 
out, N, K = ( values).shape - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] # not nan if val == val: - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] + nobs[lab, 0] += 1 + if val < minx[lab, 0]: + minx[lab, 0] = val - # not nan - if val == val: - nobs[lab, 0] += 1 - if val < minx[lab, 0]: - minx[lab, 0] = val + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = minx[i, j] - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = minx[i, j] @cython.wraparound(False) @cython.boundscheck(False) def group_min_int64(ndarray[int64_t, ndim=2] out, @@ -8348,7 +8568,7 @@ def group_min_int64(ndarray[int64_t, ndim=2] out, Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, N, K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) int64_t val, count ndarray[int64_t, ndim=2] minx, nobs @@ -8362,42 +8582,44 @@ def group_min_int64(ndarray[int64_t, ndim=2] out, N, K = ( values).shape - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + else: + for i in range(N): + 
lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] # not nan if val == val: - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] + nobs[lab, 0] += 1 + if val < minx[lab, 0]: + minx[lab, 0] = val - # not nan - if val == val: - nobs[lab, 0] += 1 - if val < minx[lab, 0]: - minx[lab, 0] = val + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = iNaT + else: + out[i, j] = minx[i, j] - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = iNaT - else: - out[i, j] = minx[i, j] @cython.wraparound(False) @cython.boundscheck(False) @@ -8427,41 +8649,43 @@ def group_min_bin_float64(ndarray[float64_t, ndim=2] out, N, K = ( values).shape - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + with nogil: + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 - for j in range(K): - val = values[i, j] + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if val < minx[b, j]: + minx[b, j] = val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] # not nan if val == val: - nobs[b, j] += 1 - if val < minx[b, j]: - minx[b, j] = val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - val = values[i, 0] + nobs[b, 0] += 1 + if val < minx[b, 0]: + minx[b, 0] = val - # not nan - if val == val: - nobs[b, 0] += 1 - if val < minx[b, 0]: - minx[b, 0] = val + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = minx[i, j] - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = minx[i, j] 
@cython.wraparound(False) @cython.boundscheck(False) def group_min_bin_float32(ndarray[float32_t, ndim=2] out, @@ -8490,41 +8714,43 @@ def group_min_bin_float32(ndarray[float32_t, ndim=2] out, N, K = ( values).shape - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + with nogil: + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 - for j in range(K): - val = values[i, j] + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if val < minx[b, j]: + minx[b, j] = val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] # not nan if val == val: - nobs[b, j] += 1 - if val < minx[b, j]: - minx[b, j] = val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - val = values[i, 0] + nobs[b, 0] += 1 + if val < minx[b, 0]: + minx[b, 0] = val - # not nan - if val == val: - nobs[b, 0] += 1 - if val < minx[b, 0]: - minx[b, 0] = val + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = minx[i, j] - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = minx[i, j] @cython.wraparound(False) @cython.boundscheck(False) def group_min_bin_int64(ndarray[int64_t, ndim=2] out, @@ -8553,41 +8779,43 @@ def group_min_bin_int64(ndarray[int64_t, ndim=2] out, N, K = ( values).shape - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + with nogil: + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 - for j in range(K): - val = values[i, j] + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if val < minx[b, j]: + minx[b, j] = val + else: + for i in range(N): + while b < 
ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] # not nan if val == val: - nobs[b, j] += 1 - if val < minx[b, j]: - minx[b, j] = val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - val = values[i, 0] + nobs[b, 0] += 1 + if val < minx[b, 0]: + minx[b, 0] = val - # not nan - if val == val: - nobs[b, 0] += 1 - if val < minx[b, 0]: - minx[b, 0] = val + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = iNaT + else: + out[i, j] = minx[i, j] - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = iNaT - else: - out[i, j] = minx[i, j] @cython.wraparound(False) @cython.boundscheck(False) @@ -8599,7 +8827,7 @@ def group_max_float64(ndarray[float64_t, ndim=2] out, Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, N, K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) float64_t val, count ndarray[float64_t, ndim=2] maxx, nobs @@ -8613,42 +8841,44 @@ def group_max_float64(ndarray[float64_t, ndim=2] out, N, K = ( values).shape - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] # not nan if val == val: - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] + nobs[lab, 0] += 1 + if val > maxx[lab, 0]: + maxx[lab, 0] = val - # not nan - if val == val: - nobs[lab, 0] += 1 - if val > maxx[lab, 0]: - maxx[lab, 0] = val + for i in range(ncounts): + for j in range(K): + 
if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = maxx[i, j] - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = maxx[i, j] @cython.wraparound(False) @cython.boundscheck(False) def group_max_float32(ndarray[float32_t, ndim=2] out, @@ -8659,7 +8889,7 @@ def group_max_float32(ndarray[float32_t, ndim=2] out, Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, N, K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) float32_t val, count ndarray[float32_t, ndim=2] maxx, nobs @@ -8673,42 +8903,44 @@ def group_max_float32(ndarray[float32_t, ndim=2] out, N, K = ( values).shape - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] # not nan if val == val: - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] + nobs[lab, 0] += 1 + if val > maxx[lab, 0]: + maxx[lab, 0] = val - # not nan - if val == val: - nobs[lab, 0] += 1 - if val > maxx[lab, 0]: - maxx[lab, 0] = val + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = maxx[i, j] - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = maxx[i, j] @cython.wraparound(False) @cython.boundscheck(False) def group_max_int64(ndarray[int64_t, ndim=2] out, @@ -8719,7 +8951,7 @@ def group_max_int64(ndarray[int64_t, ndim=2] out, Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, N, 
K, lab + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) int64_t val, count ndarray[int64_t, ndim=2] maxx, nobs @@ -8733,42 +8965,44 @@ def group_max_int64(ndarray[int64_t, ndim=2] out, N, K = ( values).shape - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + counts[lab] += 1 + for j in range(K): + val = values[i, j] - # not nan - if val == val: - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + # not nan + if val == val: + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - val = values[i, 0] + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + if val > maxx[lab, 0]: + maxx[lab, 0] = val - # not nan - if val == val: - nobs[lab, 0] += 1 - if val > maxx[lab, 0]: - maxx[lab, 0] = val + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = iNaT + else: + out[i, j] = maxx[i, j] - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = iNaT - else: - out[i, j] = maxx[i, j] @cython.wraparound(False) @cython.boundscheck(False) @@ -8797,41 +9031,43 @@ def group_max_bin_float64(ndarray[float64_t, ndim=2] out, N, K = ( values).shape - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + with nogil: + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 - for j in range(K): - val = values[i, j] + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if val > maxx[b, j]: + maxx[b, j] = val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + 
+ counts[b] += 1 + val = values[i, 0] # not nan if val == val: - nobs[b, j] += 1 - if val > maxx[b, j]: - maxx[b, j] = val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - val = values[i, 0] + nobs[b, 0] += 1 + if val > maxx[b, 0]: + maxx[b, 0] = val - # not nan - if val == val: - nobs[b, 0] += 1 - if val > maxx[b, 0]: - maxx[b, 0] = val + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = maxx[i, j] - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = maxx[i, j] @cython.wraparound(False) @cython.boundscheck(False) def group_max_bin_float32(ndarray[float32_t, ndim=2] out, @@ -8859,41 +9095,43 @@ def group_max_bin_float32(ndarray[float32_t, ndim=2] out, N, K = ( values).shape - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + with nogil: + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 - for j in range(K): - val = values[i, j] + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if val > maxx[b, j]: + maxx[b, j] = val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] # not nan if val == val: - nobs[b, j] += 1 - if val > maxx[b, j]: - maxx[b, j] = val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - val = values[i, 0] + nobs[b, 0] += 1 + if val > maxx[b, 0]: + maxx[b, 0] = val - # not nan - if val == val: - nobs[b, 0] += 1 - if val > maxx[b, 0]: - maxx[b, 0] = val + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = maxx[i, j] - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = maxx[i, j] @cython.wraparound(False) 
@cython.boundscheck(False) def group_max_bin_int64(ndarray[int64_t, ndim=2] out, @@ -8921,41 +9159,43 @@ def group_max_bin_int64(ndarray[int64_t, ndim=2] out, N, K = ( values).shape - b = 0 - if K > 1: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 + with nogil: + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - counts[b] += 1 - for j in range(K): - val = values[i, j] + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if val > maxx[b, j]: + maxx[b, j] = val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] # not nan if val == val: - nobs[b, j] += 1 - if val > maxx[b, j]: - maxx[b, j] = val - else: - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - val = values[i, 0] + nobs[b, 0] += 1 + if val > maxx[b, 0]: + maxx[b, 0] = val - # not nan - if val == val: - nobs[b, 0] += 1 - if val > maxx[b, 0]: - maxx[b, 0] = val + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = iNaT + else: + out[i, j] = maxx[i, j] - for i in range(ngroups): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = iNaT - else: - out[i, j] = maxx[i, j] @cython.boundscheck(False) @cython.wraparound(False) @@ -8967,31 +9207,32 @@ def group_count_float64(ndarray[float64_t, ndim=2] out, Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, lab + Py_ssize_t i, j, lab, ncounts = len(counts) Py_ssize_t N = values.shape[0], K = values.shape[1] float64_t val ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), dtype=np.int64) if len(values) != len(labels): - raise AssertionError("len(index) != len(labels)") + raise AssertionError("len(index) != len(labels)") - for i in range(N): - lab = labels[i] - if lab < 0: - continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + with nogil: + for i in range(N): + lab 
= labels[i] + if lab < 0: + continue - # not nan - nobs[lab, j] += val == val and val != iNaT + counts[lab] += 1 + for j in range(K): + val = values[i, j] - for i in range(len(counts)): - for j in range(K): - out[i, j] = nobs[i, j] + # not nan + nobs[lab, j] += val == val and val != iNaT + for i in range(ncounts): + for j in range(K): + out[i, j] = nobs[i, j] @cython.boundscheck(False) @cython.wraparound(False) @@ -9003,31 +9244,32 @@ def group_count_float32(ndarray[float32_t, ndim=2] out, Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, lab + Py_ssize_t i, j, lab, ncounts = len(counts) Py_ssize_t N = values.shape[0], K = values.shape[1] float32_t val ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), dtype=np.int64) if len(values) != len(labels): - raise AssertionError("len(index) != len(labels)") + raise AssertionError("len(index) != len(labels)") - for i in range(N): - lab = labels[i] - if lab < 0: - continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - # not nan - nobs[lab, j] += val == val and val != iNaT + counts[lab] += 1 + for j in range(K): + val = values[i, j] - for i in range(len(counts)): - for j in range(K): - out[i, j] = nobs[i, j] + # not nan + nobs[lab, j] += val == val and val != iNaT + for i in range(ncounts): + for j in range(K): + out[i, j] = nobs[i, j] @cython.boundscheck(False) @cython.wraparound(False) @@ -9039,31 +9281,32 @@ def group_count_int64(ndarray[int64_t, ndim=2] out, Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, lab + Py_ssize_t i, j, lab, ncounts = len(counts) Py_ssize_t N = values.shape[0], K = values.shape[1] int64_t val ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), dtype=np.int64) if len(values) != len(labels): - raise AssertionError("len(index) != len(labels)") + raise AssertionError("len(index) != len(labels)") - for i in range(N): - lab = labels[i] - if lab < 0: - continue 
- counts[lab] += 1 - for j in range(K): - val = values[i, j] + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - # not nan - nobs[lab, j] += val == val and val != iNaT + counts[lab] += 1 + for j in range(K): + val = values[i, j] - for i in range(len(counts)): - for j in range(K): - out[i, j] = nobs[i, j] + # not nan + nobs[lab, j] += val == val and val != iNaT + for i in range(ncounts): + for j in range(K): + out[i, j] = nobs[i, j] @cython.boundscheck(False) @cython.wraparound(False) @@ -9075,15 +9318,17 @@ def group_count_object(ndarray[object, ndim=2] out, Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, lab + Py_ssize_t i, j, lab, ncounts = len(counts) Py_ssize_t N = values.shape[0], K = values.shape[1] object val ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), dtype=np.int64) if len(values) != len(labels): - raise AssertionError("len(index) != len(labels)") + raise AssertionError("len(index) != len(labels)") + + for i in range(N): lab = labels[i] if lab < 0: @@ -9096,11 +9341,10 @@ def group_count_object(ndarray[object, ndim=2] out, # not nan nobs[lab, j] += val == val and val != iNaT - for i in range(len(counts)): + for i in range(ncounts): for j in range(K): out[i, j] = nobs[i, j] - @cython.boundscheck(False) @cython.wraparound(False) def group_count_int64(ndarray[int64_t, ndim=2] out, @@ -9111,35 +9355,36 @@ def group_count_int64(ndarray[int64_t, ndim=2] out, Only aggregates on axis=0 ''' cdef: - Py_ssize_t i, j, lab + Py_ssize_t i, j, lab, ncounts = len(counts) Py_ssize_t N = values.shape[0], K = values.shape[1] int64_t val ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), dtype=np.int64) if len(values) != len(labels): - raise AssertionError("len(index) != len(labels)") + raise AssertionError("len(index) != len(labels)") - for i in range(N): - lab = labels[i] - if lab < 0: - continue - counts[lab] += 1 - for j in range(K): - val = values[i, j] + with nogil: + for i in range(N): + 
lab = labels[i] + if lab < 0: + continue - # not nan - nobs[lab, j] += val == val and val != iNaT + counts[lab] += 1 + for j in range(K): + val = values[i, j] - for i in range(len(counts)): - for j in range(K): - out[i, j] = nobs[i, j] + # not nan + nobs[lab, j] += val == val and val != iNaT + for i in range(ncounts): + for j in range(K): + out[i, j] = nobs[i, j] -@cython.boundscheck(False) @cython.wraparound(False) +@cython.boundscheck(False) def group_count_bin_float64(ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float64_t, ndim=2] values, @@ -9158,24 +9403,24 @@ def group_count_bin_float64(ndarray[float64_t, ndim=2] out, return ngroups = len(bins) + (bins[len(bins) - 1] != N) - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val = values[i, j] + with nogil: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - # not nan - nobs[b, j] += val == val and val != iNaT + counts[b] += 1 + for j in range(K): + val = values[i, j] - for i in range(ngroups): - for j in range(K): - out[i, j] = nobs[i, j] + # not nan + nobs[b, j] += val == val and val != iNaT + for i in range(ngroups): + for j in range(K): + out[i, j] = nobs[i, j] -@cython.boundscheck(False) @cython.wraparound(False) +@cython.boundscheck(False) def group_count_bin_float32(ndarray[float32_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float32_t, ndim=2] values, @@ -9194,24 +9439,24 @@ def group_count_bin_float32(ndarray[float32_t, ndim=2] out, return ngroups = len(bins) + (bins[len(bins) - 1] != N) - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val = values[i, j] + with nogil: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - # not nan - nobs[b, j] += val == val and val != iNaT + counts[b] += 1 + for j in range(K): + val = values[i, j] - for i in range(ngroups): - for j in range(K): - out[i, j] = nobs[i, j] + # not 
nan + nobs[b, j] += val == val and val != iNaT + for i in range(ngroups): + for j in range(K): + out[i, j] = nobs[i, j] -@cython.boundscheck(False) @cython.wraparound(False) +@cython.boundscheck(False) def group_count_bin_int64(ndarray[int64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[int64_t, ndim=2] values, @@ -9230,24 +9475,24 @@ def group_count_bin_int64(ndarray[int64_t, ndim=2] out, return ngroups = len(bins) + (bins[len(bins) - 1] != N) - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val = values[i, j] + with nogil: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - # not nan - nobs[b, j] += val == val and val != iNaT + counts[b] += 1 + for j in range(K): + val = values[i, j] - for i in range(ngroups): - for j in range(K): - out[i, j] = nobs[i, j] + # not nan + nobs[b, j] += val == val and val != iNaT + for i in range(ngroups): + for j in range(K): + out[i, j] = nobs[i, j] -@cython.boundscheck(False) @cython.wraparound(False) +@cython.boundscheck(False) def group_count_bin_object(ndarray[object, ndim=2] out, ndarray[int64_t] counts, ndarray[object, ndim=2] values, @@ -9266,6 +9511,7 @@ def group_count_bin_object(ndarray[object, ndim=2] out, return ngroups = len(bins) + (bins[len(bins) - 1] != N) + for i in range(N): while b < ngroups - 1 and i >= bins[b]: b += 1 @@ -9281,9 +9527,8 @@ def group_count_bin_object(ndarray[object, ndim=2] out, for j in range(K): out[i, j] = nobs[i, j] - -@cython.boundscheck(False) @cython.wraparound(False) +@cython.boundscheck(False) def group_count_bin_int64(ndarray[int64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[int64_t, ndim=2] values, @@ -9302,21 +9547,21 @@ def group_count_bin_int64(ndarray[int64_t, ndim=2] out, return ngroups = len(bins) + (bins[len(bins) - 1] != N) - for i in range(N): - while b < ngroups - 1 and i >= bins[b]: - b += 1 - - counts[b] += 1 - for j in range(K): - val = values[i, j] + with nogil: + for i in 
range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 - # not nan - nobs[b, j] += val == val and val != iNaT + counts[b] += 1 + for j in range(K): + val = values[i, j] - for i in range(ngroups): - for j in range(K): - out[i, j] = nobs[i, j] + # not nan + nobs[b, j] += val == val and val != iNaT + for i in range(ngroups): + for j in range(K): + out[i, j] = nobs[i, j] @cython.wraparound(False) diff --git a/pandas/src/khash.pxd b/pandas/src/khash.pxd index a8fd51a62cfbe..b28f43eecfac7 100644 --- a/pandas/src/khash.pxd +++ b/pandas/src/khash.pxd @@ -45,15 +45,15 @@ cdef extern from "khash_python.h": kh_cstr_t *keys size_t *vals - inline kh_str_t* kh_init_str() - inline void kh_destroy_str(kh_str_t*) - inline void kh_clear_str(kh_str_t*) - inline khint_t kh_get_str(kh_str_t*, kh_cstr_t) - inline void kh_resize_str(kh_str_t*, khint_t) - inline khint_t kh_put_str(kh_str_t*, kh_cstr_t, int*) - inline void kh_del_str(kh_str_t*, khint_t) + inline kh_str_t* kh_init_str() nogil + inline void kh_destroy_str(kh_str_t*) nogil + inline void kh_clear_str(kh_str_t*) nogil + inline khint_t kh_get_str(kh_str_t*, kh_cstr_t) nogil + inline void kh_resize_str(kh_str_t*, khint_t) nogil + inline khint_t kh_put_str(kh_str_t*, kh_cstr_t, int*) nogil + inline void kh_del_str(kh_str_t*, khint_t) nogil - bint kh_exist_str(kh_str_t*, khiter_t) + bint kh_exist_str(kh_str_t*, khiter_t) nogil ctypedef struct kh_int64_t: @@ -62,15 +62,15 @@ cdef extern from "khash_python.h": int64_t *keys size_t *vals - inline kh_int64_t* kh_init_int64() - inline void kh_destroy_int64(kh_int64_t*) - inline void kh_clear_int64(kh_int64_t*) - inline khint_t kh_get_int64(kh_int64_t*, int64_t) - inline void kh_resize_int64(kh_int64_t*, khint_t) - inline khint_t kh_put_int64(kh_int64_t*, int64_t, int*) - inline void kh_del_int64(kh_int64_t*, khint_t) + inline kh_int64_t* kh_init_int64() nogil + inline void kh_destroy_int64(kh_int64_t*) nogil + inline void kh_clear_int64(kh_int64_t*) nogil + inline khint_t 
kh_get_int64(kh_int64_t*, int64_t) nogil + inline void kh_resize_int64(kh_int64_t*, khint_t) nogil + inline khint_t kh_put_int64(kh_int64_t*, int64_t, int*) nogil + inline void kh_del_int64(kh_int64_t*, khint_t) nogil - bint kh_exist_int64(kh_int64_t*, khiter_t) + bint kh_exist_int64(kh_int64_t*, khiter_t) nogil ctypedef struct kh_float64_t: khint_t n_buckets, size, n_occupied, upper_bound @@ -78,15 +78,15 @@ cdef extern from "khash_python.h": float64_t *keys size_t *vals - inline kh_float64_t* kh_init_float64() - inline void kh_destroy_float64(kh_float64_t*) - inline void kh_clear_float64(kh_float64_t*) - inline khint_t kh_get_float64(kh_float64_t*, float64_t) - inline void kh_resize_float64(kh_float64_t*, khint_t) - inline khint_t kh_put_float64(kh_float64_t*, float64_t, int*) - inline void kh_del_float64(kh_float64_t*, khint_t) + inline kh_float64_t* kh_init_float64() nogil + inline void kh_destroy_float64(kh_float64_t*) nogil + inline void kh_clear_float64(kh_float64_t*) nogil + inline khint_t kh_get_float64(kh_float64_t*, float64_t) nogil + inline void kh_resize_float64(kh_float64_t*, khint_t) nogil + inline khint_t kh_put_float64(kh_float64_t*, float64_t, int*) nogil + inline void kh_del_float64(kh_float64_t*, khint_t) nogil - bint kh_exist_float64(kh_float64_t*, khiter_t) + bint kh_exist_float64(kh_float64_t*, khiter_t) nogil ctypedef struct kh_int32_t: khint_t n_buckets, size, n_occupied, upper_bound @@ -94,15 +94,15 @@ cdef extern from "khash_python.h": int32_t *keys size_t *vals - inline kh_int32_t* kh_init_int32() - inline void kh_destroy_int32(kh_int32_t*) - inline void kh_clear_int32(kh_int32_t*) - inline khint_t kh_get_int32(kh_int32_t*, int32_t) - inline void kh_resize_int32(kh_int32_t*, khint_t) - inline khint_t kh_put_int32(kh_int32_t*, int32_t, int*) - inline void kh_del_int32(kh_int32_t*, khint_t) + inline kh_int32_t* kh_init_int32() nogil + inline void kh_destroy_int32(kh_int32_t*) nogil + inline void kh_clear_int32(kh_int32_t*) nogil + inline 
khint_t kh_get_int32(kh_int32_t*, int32_t) nogil + inline void kh_resize_int32(kh_int32_t*, khint_t) nogil + inline khint_t kh_put_int32(kh_int32_t*, int32_t, int*) nogil + inline void kh_del_int32(kh_int32_t*, khint_t) nogil - bint kh_exist_int32(kh_int32_t*, khiter_t) + bint kh_exist_int32(kh_int32_t*, khiter_t) nogil # sweep factorize @@ -112,13 +112,12 @@ cdef extern from "khash_python.h": kh_cstr_t *keys PyObject **vals - inline kh_strbox_t* kh_init_strbox() - inline void kh_destroy_strbox(kh_strbox_t*) - inline void kh_clear_strbox(kh_strbox_t*) - inline khint_t kh_get_strbox(kh_strbox_t*, kh_cstr_t) - inline void kh_resize_strbox(kh_strbox_t*, khint_t) - inline khint_t kh_put_strbox(kh_strbox_t*, kh_cstr_t, int*) - inline void kh_del_strbox(kh_strbox_t*, khint_t) - - bint kh_exist_strbox(kh_strbox_t*, khiter_t) + inline kh_strbox_t* kh_init_strbox() nogil + inline void kh_destroy_strbox(kh_strbox_t*) nogil + inline void kh_clear_strbox(kh_strbox_t*) nogil + inline khint_t kh_get_strbox(kh_strbox_t*, kh_cstr_t) nogil + inline void kh_resize_strbox(kh_strbox_t*, khint_t) nogil + inline khint_t kh_put_strbox(kh_strbox_t*, kh_cstr_t, int*) nogil + inline void kh_del_strbox(kh_strbox_t*, khint_t) nogil + bint kh_exist_strbox(kh_strbox_t*, khiter_t) nogil diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 83d6b97788e91..f4d27932f1f3f 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1817,3 +1817,36 @@ def use_numexpr(use, min_elements=expr._MIN_ELEMENTS): for name, obj in inspect.getmembers(sys.modules[__name__]): if inspect.isfunction(obj) and name.startswith('assert'): setattr(TestCase, name, staticmethod(obj)) + +def test_parallel(num_threads=2): + """Decorator to run the same function multiple times in parallel. + + Parameters + ---------- + num_threads : int, optional + The number of times the function is run in parallel. + + Notes + ----- + This decorator does not pass the return value of the decorated function. 
+ + Original from scikit-image: https://github.com/scikit-image/scikit-image/pull/1519 + + """ + + assert num_threads > 0 + import threading + + def wrapper(func): + @wraps(func) + def inner(*args, **kwargs): + threads = [] + for i in range(num_threads): + thread = threading.Thread(target=func, args=args, kwargs=kwargs) + threads.append(thread) + for thread in threads: + thread.start() + for thread in threads: + thread.join() + return inner + return wrapper diff --git a/vb_suite/gil.py b/vb_suite/gil.py new file mode 100644 index 0000000000000..30f41bb3c738d --- /dev/null +++ b/vb_suite/gil.py @@ -0,0 +1,98 @@ +from vbench.api import Benchmark +from datetime import datetime + +common_setup = """from pandas_vb_common import * +""" + +basic = common_setup + """ +from pandas.util.testing import test_parallel + +N = 1000000 +ngroups = 1000 +np.random.seed(1234) + +df = DataFrame({'key' : np.random.randint(0,ngroups,size=N), + 'data' : np.random.randn(N) }) +""" + +setup = basic + """ + +def f(): + df.groupby('key')['data'].sum() + +# run consecutivily +def g2(): + for i in range(2): + f() +def g4(): + for i in range(4): + f() +def g8(): + for i in range(8): + f() + +# run in parallel +@test_parallel(num_threads=2) +def pg2(): + f() + +@test_parallel(num_threads=4) +def pg4(): + f() + +@test_parallel(num_threads=8) +def pg8(): + f() + +""" + +nogil_groupby_sum_4 = Benchmark( + 'pg4()', setup, + start_date=datetime(2015, 1, 1)) + +nogil_groupby_sum_8 = Benchmark( + 'pg8()', setup, + start_date=datetime(2015, 1, 1)) + + +#### test all groupby funcs #### + +setup = basic + """ + +@test_parallel(num_threads=2) +def pg2(): + df.groupby('key')['data'].func() + +""" + +for f in ['sum','prod','var','count','min','max','mean','last']: + + name = "nogil_groupby_{f}_2".format(f=f) + bmark = Benchmark('pg2()', setup.replace('func',f), start_date=datetime(2015, 1, 1)) + bmark.name = name + globals()[name] = bmark + +del bmark + + +#### test take_1d #### +setup = basic + """ +from 
pandas.core import common as com + +N = 1e7 +df = DataFrame({'int64' : np.arange(N,dtype='int64'), + 'float64' : np.arange(N,dtype='float64')}) +indexer = np.arange(100,len(df)-100) + +@test_parallel(num_threads=2) +def take_1d_pg2_int64(): + com.take_1d(df.int64.values,indexer) + +@test_parallel(num_threads=2) +def take_1d_pg2_float64(): + com.take_1d(df.float64.values,indexer) + +""" + +nogil_take1d_float64 = Benchmark('take_1d_pg2_float64()', setup, start_date=datetime(2015, 1, 1)) +nogil_take1d_int64 = Benchmark('take_1d_pg2_int64()', setup, start_date=datetime(2015, 1, 1)) diff --git a/vb_suite/suite.py b/vb_suite/suite.py index a16d183ae62e2..ca7a4a9b70836 100644 --- a/vb_suite/suite.py +++ b/vb_suite/suite.py @@ -16,6 +16,7 @@ 'inference', 'hdfstore_bench', 'join_merge', + 'gil', 'miscellaneous', 'panel_ctor', 'packers',