Parallelize pixelize_sph_kernel_* methods. Fixes yt-project#2682
Xarthisius committed Jun 30, 2020
1 parent e250b8a commit d19846d
Showing 1 changed file with 47 additions and 9 deletions.
56 changes: 47 additions & 9 deletions yt/utilities/lib/pixelization_routines.pyx
@@ -33,7 +33,7 @@ from yt.utilities.lib.element_mappings cimport \
Tet2Sampler3D
from yt.geometry.particle_deposit cimport \
kernel_func, get_kernel_func
from cython.parallel cimport prange
from cython.parallel cimport prange, parallel
from cpython.exc cimport PyErr_CheckSignals
from yt.funcs import get_pbar
from yt.utilities.lib.cykdtree.kdtree cimport (
@@ -976,11 +976,12 @@ def pixelize_sph_kernel_projection(

cdef np.intp_t xsize, ysize
cdef np.float64_t x_min, x_max, y_min, y_max, prefactor_j
cdef np.int64_t xi, yi, x0, x1, y0, y1
cdef np.int64_t xi, yi, x0, x1, y0, y1, xxi, yyi
cdef np.float64_t q_ij2, posx_diff, posy_diff, ih_j2
cdef np.float64_t x, y, dx, dy, idx, idy, h_j2
cdef int index, i, j
cdef np.float64_t[:] _weight_field
    cdef np.float64_t * local_buff

if weight_field is not None:
_weight_field = weight_field
@@ -1003,9 +1004,27 @@
kernel_tables[kernel_name] = SPHKernelInterpolationTable(kernel_name)
cdef SPHKernelInterpolationTable itab = kernel_tables[kernel_name]

with nogil:
with nogil, parallel():
# loop through every particle
for j in range(0, posx.shape[0]):
        # NOTE: this loop can be quite time-consuming. However, it is easily
        # parallelizable in multiple ways, such as:
        # 1) use multiple cores to process individual particles (the outer loop)
        # 2) use multiple cores to process individual pixels for a given particle
        #    (the inner loops)
        # Depending on the ratio of the particles' "sphere of influence" (a.k.a. the
        # smoothing length) to the physical width of the pixels, different
        # parallelization strategies may yield different speed-ups. Strategy #1 works
        # better in the case of lots of small particles. Strategy #2 works well for a
        # modest number of particles that are large compared to the pixels. We
        # currently employ #1 as its workload is more even and consistent, even though
        # it comes at the price of additional per-thread memory for storing the
        # intermediate results.

local_buff = <np.float64_t *> malloc(sizeof(np.float64_t) * xsize * ysize)
for i in range(xsize * ysize):
local_buff[i] = 0.0

for j in prange(0, posx.shape[0], schedule="dynamic"):
if j % 100000 == 0:
with gil:
PyErr_CheckSignals()
@@ -1054,7 +1073,13 @@

# see equation 32 of the SPLASH paper
# now we just use the kernel projection
buff[xi, yi] += prefactor_j * itab.interpolate(q_ij2)
local_buff[xi + yi*xsize] += prefactor_j * itab.interpolate(q_ij2)

with gil:
for xxi in range(xsize):
for yyi in range(ysize):
buff[xxi, yyi] += local_buff[xxi + yyi*xsize]
free(local_buff)

@cython.boundscheck(False)
@cython.wraparound(False)
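
The change above boils down to a standard per-thread scratch-buffer reduction. The following stripped-down sketch is illustrative only (not part of this commit; "accumulate" and its arguments are hypothetical): each OpenMP thread allocates its own copy of the output, accumulates into it inside prange, and the thread-local copies are summed into the shared buffer under the GIL before being freed.

# Illustrative sketch of the per-thread accumulation pattern used above.
# Not part of the commit; "accumulate" and its arguments are hypothetical.
from cython.parallel cimport parallel, prange
from libc.stdlib cimport malloc, free
cimport numpy as np

def accumulate(np.float64_t[:, :] buff, np.float64_t[:] values):
    cdef np.intp_t xsize = buff.shape[0]
    cdef np.intp_t ysize = buff.shape[1]
    cdef np.int64_t i, j, xxi, yyi
    cdef np.float64_t * local_buff

    with nogil, parallel():
        # one scratch buffer per thread, so threads never touch the same
        # memory while accumulating
        local_buff = <np.float64_t *> malloc(sizeof(np.float64_t) * xsize * ysize)
        for i in range(xsize * ysize):
            local_buff[i] = 0.0

        for j in prange(values.shape[0], schedule="dynamic"):
            # stand-in for the real per-particle work; the commit deposits
            # kernel-weighted contributions over a range of pixels here
            local_buff[j % (xsize * ysize)] += values[j]

        # the GIL serializes the threads while each folds its scratch buffer
        # into the shared output, so the reduction is race-free
        with gil:
            for xxi in range(xsize):
                for yyi in range(ysize):
                    buff[xxi, yyi] += local_buff[xxi + yyi * xsize]
        free(local_buff)
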
@@ -1235,10 +1260,11 @@ def pixelize_sph_kernel_slice(
# similar method to pixelize_sph_kernel_projection
cdef np.intp_t xsize, ysize
cdef np.float64_t x_min, x_max, y_min, y_max, prefactor_j
cdef np.int64_t xi, yi, x0, x1, y0, y1
cdef np.int64_t xi, yi, x0, x1, y0, y1, xxi, yyi
cdef np.float64_t q_ij, posx_diff, posy_diff, ih_j
cdef np.float64_t x, y, dx, dy, idx, idy, h_j2, h_j
cdef int index, i, j
    cdef np.float64_t * local_buff

xsize, ysize = buff.shape[0], buff.shape[1]

@@ -1254,8 +1280,13 @@

kernel_func = get_kernel_func(kernel_name)

with nogil:
for j in range(0, posx.shape[0]):
with nogil, parallel():
        # NOTE: see the note in pixelize_sph_kernel_projection
local_buff = <np.float64_t *> malloc(sizeof(np.float64_t) * xsize * ysize)
for i in range(xsize * ysize):
local_buff[i] = 0.0

for j in prange(0, posx.shape[0], schedule="dynamic"):
if j % 100000 == 0:
with gil:
PyErr_CheckSignals()
@@ -1301,7 +1332,13 @@
continue

# see equations 6, 9, and 11 of the SPLASH paper
buff[xi, yi] += prefactor_j * kernel_func(q_ij)
local_buff[xi + yi*xsize] += prefactor_j * kernel_func(q_ij)

with gil:
for xxi in range(xsize):
for yyi in range(ysize):
buff[xxi, yyi] += local_buff[xxi + yyi*xsize]
free(local_buff)

@cython.initializedcheck(False)
@cython.boundscheck(False)
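
Because prange and parallel() compile to OpenMP, the new loops honor the usual OpenMP controls. As a usage note (illustrative only, not part of this commit): when projecting or slicing an SPH dataset through yt, which should route through these pixelizers, the thread count can be capped with OMP_NUM_THREADS; the dataset path below is a placeholder.

# Illustrative only: cap the OpenMP thread count before the parallel
# regions first run; the dataset path below is a placeholder.
import os
os.environ["OMP_NUM_THREADS"] = "4"  # safest to set before importing/using yt

import yt
ds = yt.load("snapshot_033/snap_033.0.hdf5")  # hypothetical SPH dataset
p = yt.ProjectionPlot(ds, "z", ("gas", "density"))
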
@@ -1339,6 +1376,7 @@ def pixelize_sph_kernel_arbitrary_grid(np.float64_t[:, :, :] buff,
kernel_func = get_kernel_func(kernel_name)

with nogil:
# TODO make this parallel without using too much memory
for j in range(0, posx.shape[0]):
if j % 50000 == 0:
with gil:
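
On the TODO above: one possible way to parallelize the arbitrary-grid deposit without a per-thread copy of the full 3D buffer is strategy #2 from the note in pixelize_sph_kernel_projection: keep the particle loop serial and split each particle's deposit region across threads, so every output cell is written by exactly one thread and no scratch memory is needed. A rough sketch, with hypothetical names and bounds (not part of this commit):

# Illustrative only: per-particle cell-range parallelism (strategy #2).
# "deposit_one_particle" and its arguments are hypothetical.
from cython.parallel cimport prange
cimport numpy as np

def deposit_one_particle(np.float64_t[:, :, :] buff,
                         np.int64_t x0, np.int64_t x1,
                         np.int64_t y0, np.int64_t y1,
                         np.int64_t z0, np.int64_t z1,
                         np.float64_t value):
    cdef np.int64_t xi, yi, zi
    with nogil:
        # threads partition the xi range; for this particle each cell is
        # written by exactly one thread, so buff can be updated in place
        for xi in prange(x0, x1, schedule="static"):
            for yi in range(y0, y1):
                for zi in range(z0, z1):
                    # the real code would evaluate the SPH kernel per cell
                    buff[xi, yi, zi] += value

This avoids any extra memory, but it parallelizes poorly when particles only cover a few cells each, which is the same trade-off the note in pixelize_sph_kernel_projection describes.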
