Skip to content

Commit

Permalink
[FEAT]: Golang Bindings for pinned host memory (#519)
Browse files Browse the repository at this point in the history
## Describe the changes

This PR adds the capability to pin host memory in the Golang bindings,
allowing faster data transfers. Memory can be pinned once for use by
multiple devices by passing the flag
`cuda_runtime.CudaHostRegisterPortable` (when registering existing
memory) or `cuda_runtime.CudaHostAllocPortable` (when allocating new
pinned memory).
  • Loading branch information
jeremyfelder authored Jun 24, 2024
1 parent 7831f7b commit 2b07513
Show file tree
Hide file tree
Showing 25 changed files with 868 additions and 48 deletions.
29 changes: 29 additions & 0 deletions wrappers/golang/core/slice.go
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,35 @@ func (h HostSlice[T]) AsUnsafePointer() unsafe.Pointer {
return unsafe.Pointer(&h[0])
}

// Registers host memory as pinned, allowing the GPU to read data from the host quicker and save GPU memory space.
// Memory pinned using this function should be unpinned using [Unpin]
// Pin page-locks the slice's backing memory by registering it with the CUDA
// runtime, enabling faster host<->device transfers.
// Memory pinned with this method must be released via [Unpin].
func (h HostSlice[T]) Pin(flags cr.RegisterPinnedFlags) cr.CudaError {
	totalBytes := h.Len() * h.SizeOfElement()
	_, err := cr.RegisterPinned(h.AsUnsafePointer(), totalBytes, flags)
	return err
}

// Unregisters host memory as pinned
// Unpin releases the page-lock previously established by [Pin] on this
// slice's backing memory.
func (h HostSlice[T]) Unpin() cr.CudaError {
	err := cr.FreeRegisteredPinned(h.AsUnsafePointer())
	return err
}

// Allocates new host memory as pinned and copies the HostSlice data to the newly allocated area
// Memory pinned using this function should be unpinned using [FreePinned]
// AllocPinned allocates a fresh pinned (page-locked) host buffer of the same
// byte size as the slice and copies the slice's elements into it.
// The returned slice must be released via [FreePinned].
func (h HostSlice[T]) AllocPinned(flags cr.AllocPinnedFlags) (HostSlice[T], cr.CudaError) {
	totalBytes := h.Len() * h.SizeOfElement()
	ptr, err := cr.AllocPinned(totalBytes, flags)
	if err != cr.CudaSuccess {
		return nil, err
	}
	// View the pinned allocation as a slice of T and fill it from h.
	pinned := HostSlice[T](unsafe.Slice((*T)(ptr), h.Len()))
	copy(pinned, h)
	return pinned, cr.CudaSuccess
}

// Unpins host memory that was pinned using [AllocPinned]
// FreePinned releases pinned host memory that was obtained via [AllocPinned].
func (h HostSlice[T]) FreePinned() cr.CudaError {
	err := cr.FreeAllocPinned(h.AsUnsafePointer())
	return err
}

func (h HostSlice[T]) CopyToDevice(dst *DeviceSlice, shouldAllocate bool) *DeviceSlice {
size := h.Len() * h.SizeOfElement()
if shouldAllocate {
Expand Down
24 changes: 24 additions & 0 deletions wrappers/golang/core/slice_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"unsafe"

"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core/internal"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
"github.com/stretchr/testify/assert"
)

Expand Down Expand Up @@ -222,3 +223,26 @@ func TestSliceRanges(t *testing.T) {
hostSliceRange.CopyFromDevice(&deviceSliceRange)
assert.Equal(t, hostSlice[2:6], hostSliceRange)
}

// TestHostSlicePinning exercises the full pin/unpin and alloc/free pinned
// memory lifecycle of HostSlice, including the CUDA errors expected on a
// double pin, double unpin, and double free.
func TestHostSlicePinning(t *testing.T) {
	values := []int{1, 2, 3, 4, 5, 7, 8, 9}
	hostSlice := HostSliceFromElements(values)

	// First pin succeeds; pinning the same memory again must be rejected.
	assert.Equal(t, cuda_runtime.CudaSuccess, hostSlice.Pin(cuda_runtime.CudaHostRegisterDefault))
	assert.Equal(t, cuda_runtime.CudaErrorHostMemoryAlreadyRegistered, hostSlice.Pin(cuda_runtime.CudaHostRegisterDefault))

	// First unpin succeeds; a second unpin reports "not registered".
	assert.Equal(t, cuda_runtime.CudaSuccess, hostSlice.Unpin())
	assert.Equal(t, cuda_runtime.CudaErrorHostMemoryNotRegistered, hostSlice.Unpin())

	// Newly allocated pinned memory must hold a copy of the slice's data.
	pinned, err := hostSlice.AllocPinned(cuda_runtime.CudaHostAllocDefault)
	assert.Equal(t, cuda_runtime.CudaSuccess, err)
	assert.ElementsMatch(t, hostSlice, pinned)

	// Freeing once succeeds; freeing again is an invalid-value error.
	assert.Equal(t, cuda_runtime.CudaSuccess, pinned.FreePinned())
	assert.Equal(t, cuda_runtime.CudaErrorInvalidValue, pinned.FreePinned())
}
157 changes: 157 additions & 0 deletions wrappers/golang/cuda_runtime/const.go

Large diffs are not rendered by default.

88 changes: 48 additions & 40 deletions wrappers/golang/cuda_runtime/device_context.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,14 @@ func GetDeviceFromPointer(ptr unsafe.Pointer) int {
return int(cCudaPointerAttributes.device)
}

// GetDeviceAttribute queries the given CUDA device attribute for the
// specified device via cudaDeviceGetAttribute.
// NOTE(review): the CUDA error returned by cudaDeviceGetAttribute is not
// surfaced; on failure the result is 0.
func GetDeviceAttribute(attr DeviceAttribute, device int) int {
	// Use a real C.int temporary instead of reinterpreting a Go *int as
	// *C.int: Go's int is typically 64-bit while C.int is 32-bit, so the
	// original cast only filled part of the value and was
	// endianness-dependent.
	var cRes C.int
	cDevice := (C.int)(device)
	C.cudaDeviceGetAttribute(&cRes, attr, cDevice)
	return int(cRes)
}

// RunOnDevice forces the provided function to run all GPU related calls within it
// on the same host thread and therefore the same GPU device.
//
Expand All @@ -84,46 +92,46 @@ func GetDeviceFromPointer(ptr unsafe.Pointer) int {
//
// As an example:
//
// cr.RunOnDevice(i, func(args ...any) {
// defer wg.Done()
// cfg := GetDefaultMSMConfig()
// stream, _ := cr.CreateStream()
// for _, power := range []int{2, 3, 4, 5, 6, 7, 8, 10, 18} {
// size := 1 << power
//
// // This will always print "Inner goroutine device: 0"
// // go func () {
// // device, _ := cr.GetDevice()
// // fmt.Println("Inner goroutine device: ", device)
// // }()
// // To force the above goroutine to same device as the wrapping function:
// // RunOnDevice(i, func(arg ...any) {
// // device, _ := cr.GetDevice()
// // fmt.Println("Inner goroutine device: ", device)
// // })
//
// scalars := GenerateScalars(size)
// points := GenerateAffinePoints(size)
//
// var p Projective
// var out core.DeviceSlice
// _, e := out.MallocAsync(p.Size(), p.Size(), stream)
// assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
// cfg.Ctx.Stream = &stream
// cfg.IsAsync = true
//
// e = Msm(scalars, points, &cfg, out)
// assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
//
// outHost := make(core.HostSlice[Projective], 1)
//
// cr.SynchronizeStream(&stream)
// outHost.CopyFromDevice(&out)
// out.Free()
// // Check with gnark-crypto
// assert.True(t, testAgainstGnarkCryptoMsm(scalars, points, outHost[0]))
// }
// }, i)
// cr.RunOnDevice(i, func(args ...any) {
// defer wg.Done()
// cfg := GetDefaultMSMConfig()
// stream, _ := cr.CreateStream()
// for _, power := range []int{2, 3, 4, 5, 6, 7, 8, 10, 18} {
// size := 1 << power

// // This will always print "Inner goroutine device: 0"
// // go func () {
// // device, _ := cr.GetDevice()
// // fmt.Println("Inner goroutine device: ", device)
// // }()
// // To force the above goroutine to same device as the wrapping function:
// // RunOnDevice(i, func(arg ...any) {
// // device, _ := cr.GetDevice()
// // fmt.Println("Inner goroutine device: ", device)
// // })

// scalars := GenerateScalars(size)
// points := GenerateAffinePoints(size)

// var p Projective
// var out core.DeviceSlice
// _, e := out.MallocAsync(p.Size(), p.Size(), stream)
// assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
// cfg.Ctx.Stream = &stream
// cfg.IsAsync = true

// e = Msm(scalars, points, &cfg, out)
// assert.Equal(t, e, cr.CudaSuccess, "Msm failed")

// outHost := make(core.HostSlice[Projective], 1)

// cr.SynchronizeStream(&stream)
// outHost.CopyFromDevice(&out)
// out.Free()
// // Check with gnark-crypto
// assert.True(t, testAgainstGnarkCryptoMsm(scalars, points, outHost[0]))
// }
// }, i)
func RunOnDevice(deviceId int, funcToRun func(args ...any), args ...any) {
go func(id int) {
defer runtime.UnlockOSThread()
Expand Down
40 changes: 40 additions & 0 deletions wrappers/golang/cuda_runtime/memory.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ package cuda_runtime
import "C"

import (
// "runtime"
"unsafe"
)

Expand Down Expand Up @@ -58,6 +59,45 @@ func FreeAsync(devicePtr unsafe.Pointer, stream Stream) CudaError {
return err
}

// AllocPinned allocates size bytes of pinned (page-locked) host memory via
// cudaHostAlloc. The returned pointer must be released with [FreeAllocPinned].
func AllocPinned(size int, flags AllocPinnedFlags) (unsafe.Pointer, CudaError) {
	var hostPtr unsafe.Pointer
	status := (CudaError)(C.cudaHostAlloc(&hostPtr, (C.size_t)(size), flags))
	if status != CudaSuccess {
		return nil, status
	}
	return hostPtr, CudaSuccess
}

// GetHostFlags returns the flags associated with the given pinned host
// pointer, as reported by cudaHostGetFlags.
// NOTE(review): the CUDA error from cudaHostGetFlags is not surfaced; on
// failure the returned flags are 0.
func GetHostFlags(ptr unsafe.Pointer) (flag uint) {
	var cFlag C.uint
	C.cudaHostGetFlags(&cFlag, ptr)
	// Bug fix: the original never copied the queried value back into the
	// named return, so it always returned 0.
	flag = uint(cFlag)
	return
}

// FreeAllocPinned releases pinned host memory obtained from [AllocPinned].
func FreeAllocPinned(hostPtr unsafe.Pointer) CudaError {
	status := C.cudaFreeHost(hostPtr)
	return (CudaError)(status)
}

// RegisterPinned page-locks an existing host allocation of size bytes via
// cudaHostRegister. On success the same hostPtr is returned; on failure the
// pointer is nil and the CUDA error is reported. Memory registered with this
// function must be released with [FreeRegisteredPinned].
func RegisterPinned(hostPtr unsafe.Pointer, size int, flags RegisterPinnedFlags) (unsafe.Pointer, CudaError) {
	// Mask to the lowest two flag bits: greater RegisterPinnedFlags values
	// exist but are not currently supported.
	supportedFlags := flags & 3
	status := (CudaError)(C.cudaHostRegister(hostPtr, (C.size_t)(size), supportedFlags))
	if status != CudaSuccess {
		return nil, status
	}
	return hostPtr, CudaSuccess
}

// FreeRegisteredPinned unregisters host memory that was page-locked with
// [RegisterPinned].
func FreeRegisteredPinned(hostPtr unsafe.Pointer) CudaError {
	status := C.cudaHostUnregister(hostPtr)
	return (CudaError)(status)
}

func CopyFromDevice(hostDst, deviceSrc unsafe.Pointer, size uint) (unsafe.Pointer, CudaError) {
cCount := (C.size_t)(size)
ret := C.cudaMemcpy(hostDst, deviceSrc, cCount, uint32(CudaMemcpyDeviceToHost))
Expand Down
24 changes: 24 additions & 0 deletions wrappers/golang/cuda_runtime/memory_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,27 @@ func TestCopyFromToHost(t *testing.T) {
assert.Equal(t, CudaSuccess, err, "Couldn't copy to device due to %v", err)
assert.Equal(t, someInts, someInts2, "Elements of host slices do not match. Copying from/to host failed")
}

// TestRegisterUnregisterPinned checks the register/unregister lifecycle of
// pinned host memory, including the errors expected on double registration
// and double unregistration.
func TestRegisterUnregisterPinned(t *testing.T) {
	data := []int{1, 2, 3, 4, 5, 7, 8, 9}
	dataUnsafe := unsafe.Pointer(&data[0])
	// Bug fix: size the registration from len(data) instead of a hard-coded
	// element count of 9 — the slice only has 8 elements, so the original
	// registered one element past the end of the allocation.
	sizeBytes := int(unsafe.Sizeof(data[0])) * len(data)

	_, err := RegisterPinned(dataUnsafe, sizeBytes, CudaHostRegisterDefault)
	assert.Equal(t, CudaSuccess, err)
	_, err = RegisterPinned(dataUnsafe, sizeBytes, CudaHostRegisterDefault)
	assert.Equal(t, CudaErrorHostMemoryAlreadyRegistered, err)

	err = FreeRegisteredPinned(dataUnsafe)
	assert.Equal(t, CudaSuccess, err)
	err = FreeRegisteredPinned(dataUnsafe)
	assert.Equal(t, CudaErrorHostMemoryNotRegistered, err)
}

// TestAllocFreePinned checks that pinned host memory can be allocated and
// freed, and that a double free reports an invalid-value error.
func TestAllocFreePinned(t *testing.T) {
	pinnedPtr, err := AllocPinned(int(unsafe.Sizeof(1)*9), CudaHostAllocDefault)
	assert.Equal(t, CudaSuccess, err)

	// First free succeeds; a second free of the same pointer must fail.
	assert.Equal(t, CudaSuccess, FreeAllocPinned(pinnedPtr))
	assert.Equal(t, CudaErrorInvalidValue, FreeAllocPinned(pinnedPtr))
}
52 changes: 52 additions & 0 deletions wrappers/golang/curves/bls12377/tests/g2_msm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,58 @@ func TestMSMG2(t *testing.T) {

}
}

// TestMSMG2PinnedHostMemory runs the G2 MSM with points held in pinned host
// memory. Pinning is only exercised when the device reports host-register
// support and pageable accesses do not already use host page tables.
func TestMSMG2PinnedHostMemory(t *testing.T) {
	cfg := g2.G2GetDefaultMSMConfig()
	for _, power := range []int{10} {
		size := 1 << power

		scalars := icicleBls12_377.GenerateScalars(size)
		points := g2.G2GenerateAffinePoints(size)

		pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
		lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)

		pinnableAndLockable := pinnable == 1 && lockable == 0

		var pinnedPoints core.HostSlice[g2.G2Affine]
		if pinnableAndLockable {
			// Fix: assert the pinning calls themselves — the original
			// silently discarded these error returns.
			pinErr := points.Pin(cr.CudaHostRegisterDefault)
			assert.Equal(t, cr.CudaSuccess, pinErr, "Pinning host memory failed")

			var allocErr cr.CudaError
			pinnedPoints, allocErr = points.AllocPinned(cr.CudaHostAllocDefault)
			assert.Equal(t, cr.CudaSuccess, allocErr, "Allocating pinned host memory failed")
			assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
		}

		var p g2.G2Projective
		var out core.DeviceSlice
		_, e := out.Malloc(p.Size(), p.Size())
		assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
		outHost := make(core.HostSlice[g2.G2Projective], 1)

		// MSM over the (possibly registered-pinned) original points.
		e = g2.G2Msm(scalars, points, &cfg, out)
		assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")

		outHost.CopyFromDevice(&out)
		// Check with gnark-crypto
		assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, points, outHost[0]))

		if pinnableAndLockable {
			// MSM over the freshly allocated pinned copy of the points.
			e = g2.G2Msm(scalars, pinnedPoints, &cfg, out)
			assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")

			outHost.CopyFromDevice(&out)
			// Check with gnark-crypto
			assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, pinnedPoints, outHost[0]))
		}

		out.Free()

		if pinnableAndLockable {
			// Fix: also assert cleanup succeeds instead of ignoring it.
			assert.Equal(t, cr.CudaSuccess, points.Unpin(), "Unpinning host memory failed")
			assert.Equal(t, cr.CudaSuccess, pinnedPoints.FreePinned(), "Freeing pinned host memory failed")
		}
	}
}
func TestMSMG2GnarkCryptoTypes(t *testing.T) {
cfg := g2.G2GetDefaultMSMConfig()
for _, power := range []int{3} {
Expand Down
52 changes: 52 additions & 0 deletions wrappers/golang/curves/bls12377/tests/msm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,58 @@ func TestMSM(t *testing.T) {

}
}

// TestMSMPinnedHostMemory runs the MSM with points held in pinned host
// memory. Pinning is only exercised when the device reports host-register
// support and pageable accesses do not already use host page tables.
func TestMSMPinnedHostMemory(t *testing.T) {
	cfg := msm.GetDefaultMSMConfig()
	for _, power := range []int{10} {
		size := 1 << power

		scalars := icicleBls12_377.GenerateScalars(size)
		points := icicleBls12_377.GenerateAffinePoints(size)

		pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
		lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)

		pinnableAndLockable := pinnable == 1 && lockable == 0

		var pinnedPoints core.HostSlice[icicleBls12_377.Affine]
		if pinnableAndLockable {
			// Fix: assert the pinning calls themselves — the original
			// silently discarded these error returns.
			pinErr := points.Pin(cr.CudaHostRegisterDefault)
			assert.Equal(t, cr.CudaSuccess, pinErr, "Pinning host memory failed")

			var allocErr cr.CudaError
			pinnedPoints, allocErr = points.AllocPinned(cr.CudaHostAllocDefault)
			assert.Equal(t, cr.CudaSuccess, allocErr, "Allocating pinned host memory failed")
			assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
		}

		var p icicleBls12_377.Projective
		var out core.DeviceSlice
		_, e := out.Malloc(p.Size(), p.Size())
		assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
		outHost := make(core.HostSlice[icicleBls12_377.Projective], 1)

		// MSM over the (possibly registered-pinned) original points.
		e = msm.Msm(scalars, points, &cfg, out)
		assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")

		outHost.CopyFromDevice(&out)
		// Check with gnark-crypto
		assert.True(t, testAgainstGnarkCryptoMsm(scalars, points, outHost[0]))

		if pinnableAndLockable {
			// MSM over the freshly allocated pinned copy of the points.
			e = msm.Msm(scalars, pinnedPoints, &cfg, out)
			assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")

			outHost.CopyFromDevice(&out)
			// Check with gnark-crypto
			assert.True(t, testAgainstGnarkCryptoMsm(scalars, pinnedPoints, outHost[0]))
		}

		out.Free()

		if pinnableAndLockable {
			// Fix: also assert cleanup succeeds instead of ignoring it.
			assert.Equal(t, cr.CudaSuccess, points.Unpin(), "Unpinning host memory failed")
			assert.Equal(t, cr.CudaSuccess, pinnedPoints.FreePinned(), "Freeing pinned host memory failed")
		}
	}
}
func TestMSMGnarkCryptoTypes(t *testing.T) {
cfg := msm.GetDefaultMSMConfig()
for _, power := range []int{3} {
Expand Down
3 changes: 2 additions & 1 deletion wrappers/golang/curves/bls12377/tests/ntt_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,11 +151,12 @@ func TestNttDeviceAsync(t *testing.T) {

func TestNttBatch(t *testing.T) {
cfg := ntt.GetDefaultNttConfig()
largestTestSize := 12
largestBatchSize := 100
scalars := bls12_377.GenerateScalars(1 << largestTestSize * largestBatchSize)

for _, size := range []int{4, largestTestSize} {
for _, batchSize := range []int{1, 16, largestBatchSize} {
for _, batchSize := range []int{2, 16, largestBatchSize} {
testSize := 1 << size
totalSize := testSize * batchSize

Expand Down
Loading

0 comments on commit 2b07513

Please sign in to comment.