Skip to content

Commit

Permalink
Merge pull request #2683 from Xarthisius/2682_parallel_sph_pixelization
Browse files Browse the repository at this point in the history
Parallelize pixelize_sph_kernel_projection. Fixes #2682
  • Loading branch information
neutrinoceros authored Sep 24, 2020
2 parents bcd736b + c0f4cbc commit 4d9ab24
Showing 1 changed file with 103 additions and 49 deletions.
152 changes: 103 additions & 49 deletions yt/utilities/lib/pixelization_routines.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ from yt.utilities.lib.fp_utils cimport (
from yt.utilities.exceptions import YTElementTypeNotRecognized, YTPixelizeError

from cpython.exc cimport PyErr_CheckSignals
from cython.parallel cimport prange
from cython.parallel cimport parallel, prange
from libc.stdlib cimport free, malloc
from vec3_ops cimport cross, dot, subtract

Expand Down Expand Up @@ -176,6 +176,7 @@ def pixelize_cartesian(np.float64_t[:,:] buff,
with nogil:
for p in range(px.shape[0]):
xiter[1] = yiter[1] = 999
xiterv[1] = yiterv[1] = 0.0
oxsp = px[p]
oysp = py[p]
dxsp = pdx[p]
Expand Down Expand Up @@ -360,6 +361,7 @@ def pixelize_cartesian_nodal(np.float64_t[:,:] buff,
with nogil:
for p in range(px.shape[0]):
xiter[1] = yiter[1] = 999
xiterv[1] = yiterv[1] = 0.0
oxsp = px[p]
oysp = py[p]
ozsp = pz[p]
Expand Down Expand Up @@ -993,22 +995,21 @@ def pixelize_sph_kernel_projection(

cdef np.intp_t xsize, ysize
cdef np.float64_t x_min, x_max, y_min, y_max, prefactor_j
cdef np.int64_t xi, yi, x0, x1, y0, y1
cdef np.int64_t xi, yi, x0, x1, y0, y1, xxi, yyi
cdef np.float64_t q_ij2, posx_diff, posy_diff, ih_j2
cdef np.float64_t x, y, dx, dy, idx, idy, h_j2, px, py
cdef np.float64_t period_x, period_y
cdef np.float64_t period_x = 0, period_y = 0
cdef int index, i, j, ii, jj
cdef np.float64_t[:] _weight_field
cdef int xiter[2]
cdef int yiter[2]
cdef np.float64_t xiterv[2]
cdef np.float64_t yiterv[2]
cdef int * xiter
cdef int * yiter
cdef np.float64_t * xiterv
cdef np.float64_t * yiterv
cdef np.float64_t * local_buf

if weight_field is not None:
_weight_field = weight_field

xiter[0] = yiter[0] = 0
xiterv[0] = yiterv[0] = 0.0
if period is not None:
period_x = period[0]
period_y = period[1]
Expand All @@ -1031,9 +1032,33 @@ def pixelize_sph_kernel_projection(
kernel_tables[kernel_name] = SPHKernelInterpolationTable(kernel_name)
cdef SPHKernelInterpolationTable itab = kernel_tables[kernel_name]

with nogil:
with nogil, parallel():
# loop through every particle
for j in range(0, posx.shape[0]):
# NOTE: this loop can be quite time consuming. However it is easily
# parallelizable in multiple ways, such as:
# 1) use multiple cores to process individual particles (the outer loop)
# 2) use multiple cores to process individual pixels for a given particle
# (the inner loops)
# Depending on the ratio of particles' "sphere of influence" (a.k.a. the smoothing
# length) to the physical width of the pixels, different parallelization
# strategies may yield different speed-ups. Strategy #1 works better in the case
# of lots of itty bitty particles. Strategy #2 works well when we have a
# not-very-large-number of reasonably large-compared-to-pixels particles. We
# currently employ #1 as its workload is more even and consistent, even though it
# comes with a price of an additional, per thread memory for storing the
# intermediate results.

local_buff = <np.float64_t *> malloc(sizeof(np.float64_t) * xsize * ysize)
xiterv = <np.float64_t *> malloc(sizeof(np.float64_t) * 2)
yiterv = <np.float64_t *> malloc(sizeof(np.float64_t) * 2)
xiter = <int *> malloc(sizeof(int) * 2)
yiter = <int *> malloc(sizeof(int) * 2)
xiter[0] = yiter[0] = 0
xiterv[0] = yiterv[0] = 0.0
for i in range(xsize * ysize):
local_buff[i] = 0.0

for j in prange(0, posx.shape[0], schedule="dynamic"):
if j % 100000 == 0:
with gil:
PyErr_CheckSignals()
Expand All @@ -1054,6 +1079,14 @@ def pixelize_sph_kernel_projection(
yiter[1] = -1
yiterv[1] = -period_y

# we set the smoothing length squared with lower limit of the pixel
h_j2 = fmax(hsml[j]*hsml[j], dx*dy)
ih_j2 = 1.0/h_j2

prefactor_j = pmass[j] / pdens[j] / hsml[j]**2 * quantity_to_smooth[j]
if weight_field is not None:
prefactor_j *= _weight_field[j]

for ii in range(2):
if xiter[ii] == 999: continue
px = posx[j] + xiterv[ii]
Expand All @@ -1074,16 +1107,6 @@ def pixelize_sph_kernel_projection(
y0 = iclip(y0-1, 0, ysize)
y1 = iclip(y1+1, 0, ysize)

# we set the smoothing length squared with lower limit of the pixel
h_j2 = fmax(hsml[j]*hsml[j], dx*dy)
ih_j2 = 1.0/h_j2

prefactor_j = pmass[j] / pdens[j] / hsml[j]**2
if weight_field is None:
prefactor_j *= quantity_to_smooth[j]
else:
prefactor_j *= quantity_to_smooth[j] * _weight_field[j]

# found pixels we deposit on, loop through those pixels
for xi in range(x0, x1):
# we use the centre of the pixel to calculate contribution
Expand All @@ -1107,7 +1130,17 @@ def pixelize_sph_kernel_projection(

# see equation 32 of the SPLASH paper
# now we just use the kernel projection
buff[xi, yi] += prefactor_j * itab.interpolate(q_ij2)
local_buff[xi + yi*xsize] += prefactor_j * itab.interpolate(q_ij2)

with gil:
for xxi in range(xsize):
for yyi in range(ysize):
buff[xxi, yyi] += local_buff[xxi + yyi*xsize]
free(local_buff)
free(xiterv)
free(yiterv)
free(xiter)
free(yiter)

@cython.boundscheck(False)
@cython.wraparound(False)
Expand Down Expand Up @@ -1290,18 +1323,17 @@ def pixelize_sph_kernel_slice(
# similar method to pixelize_sph_kernel_projection
cdef np.intp_t xsize, ysize
cdef np.float64_t x_min, x_max, y_min, y_max, prefactor_j
cdef np.int64_t xi, yi, x0, x1, y0, y1
cdef np.int64_t xi, yi, x0, x1, y0, y1, xxi, yyi
cdef np.float64_t q_ij, posx_diff, posy_diff, ih_j
cdef np.float64_t x, y, dx, dy, idx, idy, h_j2, h_j, px, py
cdef int index, i, j, ii, jj
cdef np.float64_t period_x, period_y
cdef int xiter[2]
cdef int yiter[2]
cdef np.float64_t xiterv[2]
cdef np.float64_t yiterv[2]
cdef np.float64_t period_x = 0, period_y = 0
cdef int * xiter
cdef int * yiter
cdef np.float64_t * xiterv
cdef np.float64_t * yiterv
cdef np.float64_t * local_buf

xiter[0] = yiter[0] = 0
xiterv[0] = yiterv[0] = 0.0
if period is not None:
period_x = period[0]
period_y = period[1]
Expand All @@ -1320,8 +1352,19 @@ def pixelize_sph_kernel_slice(

kernel_func = get_kernel_func(kernel_name)

with nogil:
for j in range(0, posx.shape[0]):
with nogil, parallel():
# NOTE see note in pixelize_sph_kernel_projection
local_buff = <np.float64_t *> malloc(sizeof(np.float64_t) * xsize * ysize)
xiterv = <np.float64_t *> malloc(sizeof(np.float64_t) * 2)
yiterv = <np.float64_t *> malloc(sizeof(np.float64_t) * 2)
xiter = <int *> malloc(sizeof(int) * 2)
yiter = <int *> malloc(sizeof(int) * 2)
xiter[0] = yiter[0] = 0
xiterv[0] = yiterv[0] = 0.0
for i in range(xsize * ysize):
local_buff[i] = 0.0

for j in prange(0, posx.shape[0], schedule="dynamic"):
if j % 100000 == 0:
with gil:
PyErr_CheckSignals()
Expand All @@ -1342,6 +1385,13 @@ def pixelize_sph_kernel_slice(
yiter[1] = -1
yiterv[1] = -period_y

h_j2 = fmax(hsml[j]*hsml[j], dx*dy)
h_j = math.sqrt(h_j2)
ih_j = 1.0/h_j

prefactor_j = pmass[j] / pdens[j] / hsml[j]**3
prefactor_j *= quantity_to_smooth[j]

for ii in range(2):
if xiter[ii] == 999: continue
px = posx[j] + xiterv[ii]
Expand All @@ -1361,13 +1411,6 @@ def pixelize_sph_kernel_slice(
y0 = iclip(y0-1, 0, ysize)
y1 = iclip(y1+1, 0, ysize)

h_j2 = fmax(hsml[j]*hsml[j], dx*dy)
h_j = math.sqrt(h_j2)
ih_j = 1.0/h_j

prefactor_j = pmass[j] / pdens[j] / hsml[j]**3
prefactor_j *= quantity_to_smooth[j]

# Now we know which pixels to deposit onto for this particle,
# so loop over them and add this particle's contribution
for xi in range(x0, x1):
Expand All @@ -1392,7 +1435,17 @@ def pixelize_sph_kernel_slice(
continue

# see equations 6, 9, and 11 of the SPLASH paper
buff[xi, yi] += prefactor_j * kernel_func(q_ij)
local_buff[xi + yi*xsize] += prefactor_j * kernel_func(q_ij)

with gil:
for xxi in range(xsize):
for yyi in range(ysize):
buff[xxi, yyi] += local_buff[xxi + yyi*xsize]
free(local_buff)
free(xiterv)
free(yiterv)
free(xiter)
free(yiter)

@cython.initializedcheck(False)
@cython.boundscheck(False)
Expand All @@ -1412,7 +1465,7 @@ def pixelize_sph_kernel_arbitrary_grid(np.float64_t[:, :, :] buff,
cdef np.float64_t q_ij, posx_diff, posy_diff, posz_diff, px, py, pz
cdef np.float64_t x, y, z, dx, dy, dz, idx, idy, idz, h_j3, h_j2, h_j, ih_j
cdef int index, i, j, k, ii, jj, kk
cdef np.float64_t period_x, period_y, period_z
cdef np.float64_t period_x = 0, period_y = 0, period_z = 0

cdef int xiter[2]
cdef int yiter[2]
Expand Down Expand Up @@ -1447,6 +1500,7 @@ def pixelize_sph_kernel_arbitrary_grid(np.float64_t[:, :, :] buff,
kernel_func = get_kernel_func(kernel_name)

with nogil:
# TODO make this parallel without using too much memory
for j in range(0, posx.shape[0]):
if j % 50000 == 0:
with gil:
Expand All @@ -1455,6 +1509,7 @@ def pixelize_sph_kernel_arbitrary_grid(np.float64_t[:, :, :] buff,
PyErr_CheckSignals()

xiter[1] = yiter[1] = ziter[1] = 999
xiterv[1] = yiterv[1] = ziterv[1] = 0.0

if check_period == 1:
if posx[j] - hsml[j] < x_min:
Expand All @@ -1476,6 +1531,13 @@ def pixelize_sph_kernel_arbitrary_grid(np.float64_t[:, :, :] buff,
ziter[1] = -1
ziterv[1] = -period_z

h_j3 = fmax(hsml[j]*hsml[j]*hsml[j], dx*dy*dz)
h_j = math.cbrt(h_j3)
h_j2 = h_j*h_j
ih_j = 1/h_j

prefactor_j = pmass[j] / pdens[j] / hsml[j]**3 * quantity_to_smooth[j]

for ii in range(2):
if xiter[ii] == 999: continue
px = posx[j] + xiterv[ii]
Expand Down Expand Up @@ -1504,14 +1566,6 @@ def pixelize_sph_kernel_arbitrary_grid(np.float64_t[:, :, :] buff,
z0 = iclip(z0-1, 0, zsize)
z1 = iclip(z1+1, 0, zsize)

h_j3 = fmax(hsml[j]*hsml[j]*hsml[j], dx*dy*dz)
h_j = math.cbrt(h_j3)
h_j2 = h_j*h_j
ih_j = 1/h_j

prefactor_j = pmass[j] / pdens[j] / hsml[j]**3
prefactor_j *= quantity_to_smooth[j]

# Now we know which voxels to deposit onto for this particle,
# so loop over them and add this particle's contribution
for xi in range(x0, x1):
Expand Down

0 comments on commit 4d9ab24

Please sign in to comment.