Skip to content

Commit

Permalink
[FEAT]: Golang Bindings for pinned host memory (#519)
Browse files Browse the repository at this point in the history
## Describe the changes

This PR adds the capability to pin host memory in the Golang bindings,
allowing faster data transfers. Memory can be pinned once for use by
multiple devices by passing the flag
`cuda_runtime.CudaHostRegisterPortable` (when registering existing
memory) or `cuda_runtime.CudaHostAllocPortable` (when allocating new
pinned memory).
  • Loading branch information
jeremyfelder authored Jun 24, 2024
1 parent 7831f7b commit 2b07513
Show file tree
Hide file tree
Showing 25 changed files with 868 additions and 48 deletions.
29 changes: 29 additions & 0 deletions wrappers/golang/core/slice.go
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,35 @@ func (h HostSlice[T]) AsUnsafePointer() unsafe.Pointer {
return unsafe.Pointer(&h[0])
}

// Registers host memory as pinned, allowing the GPU to read data from the host quicker and save GPU memory space.
// Memory pinned using this function should be unpinned using [Unpin]
// Pin page-locks the slice's backing memory by registering it with the CUDA
// runtime, enabling faster host<->device transfers.
// Memory pinned with this method must be released via [Unpin].
func (h HostSlice[T]) Pin(flags cr.RegisterPinnedFlags) cr.CudaError {
	totalBytes := h.Len() * h.SizeOfElement()
	_, err := cr.RegisterPinned(h.AsUnsafePointer(), totalBytes, flags)
	return err
}

// Unregisters host memory as pinned
// Unpin releases the page-lock previously established by [Pin] on this
// slice's backing memory.
func (h HostSlice[T]) Unpin() cr.CudaError {
	err := cr.FreeRegisteredPinned(h.AsUnsafePointer())
	return err
}

// Allocates new host memory as pinned and copies the HostSlice data to the newly allocated area
// Memory pinned using this function should be unpinned using [FreePinned]
// AllocPinned allocates a fresh pinned (page-locked) host buffer of the same
// byte size as the slice and copies the slice's elements into it.
// The returned slice must be released via [FreePinned].
func (h HostSlice[T]) AllocPinned(flags cr.AllocPinnedFlags) (HostSlice[T], cr.CudaError) {
	totalBytes := h.Len() * h.SizeOfElement()
	ptr, err := cr.AllocPinned(totalBytes, flags)
	if err != cr.CudaSuccess {
		return nil, err
	}
	// View the pinned allocation as a slice of T and fill it from h.
	pinned := HostSlice[T](unsafe.Slice((*T)(ptr), h.Len()))
	copy(pinned, h)
	return pinned, cr.CudaSuccess
}

// Unpins host memory that was pinned using [AllocPinned]
// FreePinned releases pinned host memory that was obtained via [AllocPinned].
func (h HostSlice[T]) FreePinned() cr.CudaError {
	err := cr.FreeAllocPinned(h.AsUnsafePointer())
	return err
}

func (h HostSlice[T]) CopyToDevice(dst *DeviceSlice, shouldAllocate bool) *DeviceSlice {
size := h.Len() * h.SizeOfElement()
if shouldAllocate {
Expand Down
24 changes: 24 additions & 0 deletions wrappers/golang/core/slice_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"unsafe"

"github.com/ingonyama-zk/icicle/v2/wrappers/golang/core/internal"
"github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
"github.com/stretchr/testify/assert"
)

Expand Down Expand Up @@ -222,3 +223,26 @@ func TestSliceRanges(t *testing.T) {
hostSliceRange.CopyFromDevice(&deviceSliceRange)
assert.Equal(t, hostSlice[2:6], hostSliceRange)
}

// TestHostSlicePinning exercises the full pin/unpin and alloc/free pinned
// memory lifecycle of HostSlice, including the CUDA errors expected on a
// double pin, double unpin, and double free.
func TestHostSlicePinning(t *testing.T) {
	values := []int{1, 2, 3, 4, 5, 7, 8, 9}
	hostSlice := HostSliceFromElements(values)

	// First pin succeeds; pinning the same memory again must be rejected.
	assert.Equal(t, cuda_runtime.CudaSuccess, hostSlice.Pin(cuda_runtime.CudaHostRegisterDefault))
	assert.Equal(t, cuda_runtime.CudaErrorHostMemoryAlreadyRegistered, hostSlice.Pin(cuda_runtime.CudaHostRegisterDefault))

	// First unpin succeeds; a second unpin reports "not registered".
	assert.Equal(t, cuda_runtime.CudaSuccess, hostSlice.Unpin())
	assert.Equal(t, cuda_runtime.CudaErrorHostMemoryNotRegistered, hostSlice.Unpin())

	// Newly allocated pinned memory must hold a copy of the slice's data.
	pinned, err := hostSlice.AllocPinned(cuda_runtime.CudaHostAllocDefault)
	assert.Equal(t, cuda_runtime.CudaSuccess, err)
	assert.ElementsMatch(t, hostSlice, pinned)

	// Freeing once succeeds; freeing again is an invalid-value error.
	assert.Equal(t, cuda_runtime.CudaSuccess, pinned.FreePinned())
	assert.Equal(t, cuda_runtime.CudaErrorInvalidValue, pinned.FreePinned())
}
157 changes: 157 additions & 0 deletions wrappers/golang/cuda_runtime/const.go

Large diffs are not rendered by default.

88 changes: 48 additions & 40 deletions wrappers/golang/cuda_runtime/device_context.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,14 @@ func GetDeviceFromPointer(ptr unsafe.Pointer) int {
return int(cCudaPointerAttributes.device)
}

// GetDeviceAttribute queries the given CUDA device attribute for the
// specified device via cudaDeviceGetAttribute.
// NOTE(review): the CUDA error returned by cudaDeviceGetAttribute is not
// surfaced; on failure the result is 0.
func GetDeviceAttribute(attr DeviceAttribute, device int) int {
	// Use a real C.int temporary instead of reinterpreting a Go *int as
	// *C.int: Go's int is typically 64-bit while C.int is 32-bit, so the
	// original cast only filled part of the value and was
	// endianness-dependent.
	var cRes C.int
	cDevice := (C.int)(device)
	C.cudaDeviceGetAttribute(&cRes, attr, cDevice)
	return int(cRes)
}

// RunOnDevice forces the provided function to run all GPU related calls within it
// on the same host thread and therefore the same GPU device.
//
Expand All @@ -84,46 +92,46 @@ func GetDeviceFromPointer(ptr unsafe.Pointer) int {
//
// As an example:
//
// cr.RunOnDevice(i, func(args ...any) {
// defer wg.Done()
// cfg := GetDefaultMSMConfig()
// stream, _ := cr.CreateStream()
// for _, power := range []int{2, 3, 4, 5, 6, 7, 8, 10, 18} {
// size := 1 << power
//
// // This will always print "Inner goroutine device: 0"
// // go func () {
// // device, _ := cr.GetDevice()
// // fmt.Println("Inner goroutine device: ", device)
// // }()
// // To force the above goroutine to same device as the wrapping function:
// // RunOnDevice(i, func(arg ...any) {
// // device, _ := cr.GetDevice()
// // fmt.Println("Inner goroutine device: ", device)
// // })
//
// scalars := GenerateScalars(size)
// points := GenerateAffinePoints(size)
//
// var p Projective
// var out core.DeviceSlice
// _, e := out.MallocAsync(p.Size(), p.Size(), stream)
// assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
// cfg.Ctx.Stream = &stream
// cfg.IsAsync = true
//
// e = Msm(scalars, points, &cfg, out)
// assert.Equal(t, e, cr.CudaSuccess, "Msm failed")
//
// outHost := make(core.HostSlice[Projective], 1)
//
// cr.SynchronizeStream(&stream)
// outHost.CopyFromDevice(&out)
// out.Free()
// // Check with gnark-crypto
// assert.True(t, testAgainstGnarkCryptoMsm(scalars, points, outHost[0]))
// }
// }, i)
// cr.RunOnDevice(i, func(args ...any) {
// defer wg.Done()
// cfg := GetDefaultMSMConfig()
// stream, _ := cr.CreateStream()
// for _, power := range []int{2, 3, 4, 5, 6, 7, 8, 10, 18} {
// size := 1 << power

// // This will always print "Inner goroutine device: 0"
// // go func () {
// // device, _ := cr.GetDevice()
// // fmt.Println("Inner goroutine device: ", device)
// // }()
// // To force the above goroutine to same device as the wrapping function:
// // RunOnDevice(i, func(arg ...any) {
// // device, _ := cr.GetDevice()
// // fmt.Println("Inner goroutine device: ", device)
// // })

// scalars := GenerateScalars(size)
// points := GenerateAffinePoints(size)

// var p Projective
// var out core.DeviceSlice
// _, e := out.MallocAsync(p.Size(), p.Size(), stream)
// assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
// cfg.Ctx.Stream = &stream
// cfg.IsAsync = true

// e = Msm(scalars, points, &cfg, out)
// assert.Equal(t, e, cr.CudaSuccess, "Msm failed")

// outHost := make(core.HostSlice[Projective], 1)

// cr.SynchronizeStream(&stream)
// outHost.CopyFromDevice(&out)
// out.Free()
// // Check with gnark-crypto
// assert.True(t, testAgainstGnarkCryptoMsm(scalars, points, outHost[0]))
// }
// }, i)
func RunOnDevice(deviceId int, funcToRun func(args ...any), args ...any) {
go func(id int) {
defer runtime.UnlockOSThread()
Expand Down
40 changes: 40 additions & 0 deletions wrappers/golang/cuda_runtime/memory.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ package cuda_runtime
import "C"

import (
// "runtime"
"unsafe"
)

Expand Down Expand Up @@ -58,6 +59,45 @@ func FreeAsync(devicePtr unsafe.Pointer, stream Stream) CudaError {
return err
}

// AllocPinned allocates size bytes of pinned (page-locked) host memory via
// cudaHostAlloc. The returned pointer must be released with [FreeAllocPinned].
func AllocPinned(size int, flags AllocPinnedFlags) (unsafe.Pointer, CudaError) {
	var hostPtr unsafe.Pointer
	status := (CudaError)(C.cudaHostAlloc(&hostPtr, (C.size_t)(size), flags))
	if status != CudaSuccess {
		return nil, status
	}
	return hostPtr, CudaSuccess
}

// GetHostFlags returns the flags associated with the given pinned host
// pointer, as reported by cudaHostGetFlags.
// NOTE(review): the CUDA error from cudaHostGetFlags is not surfaced; on
// failure the returned flags are 0.
func GetHostFlags(ptr unsafe.Pointer) (flag uint) {
	var cFlag C.uint
	C.cudaHostGetFlags(&cFlag, ptr)
	// Bug fix: the original never copied the queried value back into the
	// named return, so it always returned 0.
	flag = uint(cFlag)
	return
}

// FreeAllocPinned releases pinned host memory obtained from [AllocPinned].
func FreeAllocPinned(hostPtr unsafe.Pointer) CudaError {
	status := C.cudaFreeHost(hostPtr)
	return (CudaError)(status)
}

// RegisterPinned page-locks an existing host allocation of size bytes via
// cudaHostRegister. On success the same hostPtr is returned; on failure the
// pointer is nil and the CUDA error is reported. Memory registered with this
// function must be released with [FreeRegisteredPinned].
func RegisterPinned(hostPtr unsafe.Pointer, size int, flags RegisterPinnedFlags) (unsafe.Pointer, CudaError) {
	// Mask to the lowest two flag bits: greater RegisterPinnedFlags values
	// exist but are not currently supported.
	supportedFlags := flags & 3
	status := (CudaError)(C.cudaHostRegister(hostPtr, (C.size_t)(size), supportedFlags))
	if status != CudaSuccess {
		return nil, status
	}
	return hostPtr, CudaSuccess
}

// FreeRegisteredPinned unregisters host memory that was page-locked with
// [RegisterPinned].
func FreeRegisteredPinned(hostPtr unsafe.Pointer) CudaError {
	status := C.cudaHostUnregister(hostPtr)
	return (CudaError)(status)
}

func CopyFromDevice(hostDst, deviceSrc unsafe.Pointer, size uint) (unsafe.Pointer, CudaError) {
cCount := (C.size_t)(size)
ret := C.cudaMemcpy(hostDst, deviceSrc, cCount, uint32(CudaMemcpyDeviceToHost))
Expand Down
24 changes: 24 additions & 0 deletions wrappers/golang/cuda_runtime/memory_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,27 @@ func TestCopyFromToHost(t *testing.T) {
assert.Equal(t, CudaSuccess, err, "Couldn't copy to device due to %v", err)
assert.Equal(t, someInts, someInts2, "Elements of host slices do not match. Copying from/to host failed")
}

// TestRegisterUnregisterPinned checks the register/unregister lifecycle of
// pinned host memory, including the errors expected on double registration
// and double unregistration.
func TestRegisterUnregisterPinned(t *testing.T) {
	data := []int{1, 2, 3, 4, 5, 7, 8, 9}
	dataUnsafe := unsafe.Pointer(&data[0])
	// Bug fix: size the registration from len(data) instead of a hard-coded
	// element count of 9 — the slice only has 8 elements, so the original
	// registered one element past the end of the allocation.
	sizeBytes := int(unsafe.Sizeof(data[0])) * len(data)

	_, err := RegisterPinned(dataUnsafe, sizeBytes, CudaHostRegisterDefault)
	assert.Equal(t, CudaSuccess, err)
	_, err = RegisterPinned(dataUnsafe, sizeBytes, CudaHostRegisterDefault)
	assert.Equal(t, CudaErrorHostMemoryAlreadyRegistered, err)

	err = FreeRegisteredPinned(dataUnsafe)
	assert.Equal(t, CudaSuccess, err)
	err = FreeRegisteredPinned(dataUnsafe)
	assert.Equal(t, CudaErrorHostMemoryNotRegistered, err)
}

// TestAllocFreePinned checks that pinned host memory can be allocated and
// freed, and that a double free reports an invalid-value error.
func TestAllocFreePinned(t *testing.T) {
	pinnedPtr, err := AllocPinned(int(unsafe.Sizeof(1)*9), CudaHostAllocDefault)
	assert.Equal(t, CudaSuccess, err)

	// First free succeeds; a second free of the same pointer must fail.
	assert.Equal(t, CudaSuccess, FreeAllocPinned(pinnedPtr))
	assert.Equal(t, CudaErrorInvalidValue, FreeAllocPinned(pinnedPtr))
}
52 changes: 52 additions & 0 deletions wrappers/golang/curves/bls12377/tests/g2_msm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,58 @@ func TestMSMG2(t *testing.T) {

}
}

// TestMSMG2PinnedHostMemory runs the G2 MSM with points held in pinned host
// memory. Pinning is only exercised when the device reports host-register
// support and pageable accesses do not already use host page tables.
func TestMSMG2PinnedHostMemory(t *testing.T) {
	cfg := g2.G2GetDefaultMSMConfig()
	for _, power := range []int{10} {
		size := 1 << power

		scalars := icicleBls12_377.GenerateScalars(size)
		points := g2.G2GenerateAffinePoints(size)

		pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
		lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)

		pinnableAndLockable := pinnable == 1 && lockable == 0

		var pinnedPoints core.HostSlice[g2.G2Affine]
		if pinnableAndLockable {
			// Fix: assert the pinning calls themselves — the original
			// silently discarded these error returns.
			pinErr := points.Pin(cr.CudaHostRegisterDefault)
			assert.Equal(t, cr.CudaSuccess, pinErr, "Pinning host memory failed")

			var allocErr cr.CudaError
			pinnedPoints, allocErr = points.AllocPinned(cr.CudaHostAllocDefault)
			assert.Equal(t, cr.CudaSuccess, allocErr, "Allocating pinned host memory failed")
			assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
		}

		var p g2.G2Projective
		var out core.DeviceSlice
		_, e := out.Malloc(p.Size(), p.Size())
		assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
		outHost := make(core.HostSlice[g2.G2Projective], 1)

		// MSM over the (possibly registered-pinned) original points.
		e = g2.G2Msm(scalars, points, &cfg, out)
		assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")

		outHost.CopyFromDevice(&out)
		// Check with gnark-crypto
		assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, points, outHost[0]))

		if pinnableAndLockable {
			// MSM over the freshly allocated pinned copy of the points.
			e = g2.G2Msm(scalars, pinnedPoints, &cfg, out)
			assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")

			outHost.CopyFromDevice(&out)
			// Check with gnark-crypto
			assert.True(t, testAgainstGnarkCryptoMsmG2(scalars, pinnedPoints, outHost[0]))
		}

		out.Free()

		if pinnableAndLockable {
			// Fix: also assert cleanup succeeds instead of ignoring it.
			assert.Equal(t, cr.CudaSuccess, points.Unpin(), "Unpinning host memory failed")
			assert.Equal(t, cr.CudaSuccess, pinnedPoints.FreePinned(), "Freeing pinned host memory failed")
		}
	}
}
func TestMSMG2GnarkCryptoTypes(t *testing.T) {
cfg := g2.G2GetDefaultMSMConfig()
for _, power := range []int{3} {
Expand Down
52 changes: 52 additions & 0 deletions wrappers/golang/curves/bls12377/tests/msm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,58 @@ func TestMSM(t *testing.T) {

}
}

// TestMSMPinnedHostMemory runs the MSM with points held in pinned host
// memory. Pinning is only exercised when the device reports host-register
// support and pageable accesses do not already use host page tables.
func TestMSMPinnedHostMemory(t *testing.T) {
	cfg := msm.GetDefaultMSMConfig()
	for _, power := range []int{10} {
		size := 1 << power

		scalars := icicleBls12_377.GenerateScalars(size)
		points := icicleBls12_377.GenerateAffinePoints(size)

		pinnable := cr.GetDeviceAttribute(cr.CudaDevAttrHostRegisterSupported, 0)
		lockable := cr.GetDeviceAttribute(cr.CudaDevAttrPageableMemoryAccessUsesHostPageTables, 0)

		pinnableAndLockable := pinnable == 1 && lockable == 0

		var pinnedPoints core.HostSlice[icicleBls12_377.Affine]
		if pinnableAndLockable {
			// Fix: assert the pinning calls themselves — the original
			// silently discarded these error returns.
			pinErr := points.Pin(cr.CudaHostRegisterDefault)
			assert.Equal(t, cr.CudaSuccess, pinErr, "Pinning host memory failed")

			var allocErr cr.CudaError
			pinnedPoints, allocErr = points.AllocPinned(cr.CudaHostAllocDefault)
			assert.Equal(t, cr.CudaSuccess, allocErr, "Allocating pinned host memory failed")
			assert.Equal(t, points, pinnedPoints, "Allocating newly pinned memory resulted in bad points")
		}

		var p icicleBls12_377.Projective
		var out core.DeviceSlice
		_, e := out.Malloc(p.Size(), p.Size())
		assert.Equal(t, e, cr.CudaSuccess, "Allocating bytes on device for Projective results failed")
		outHost := make(core.HostSlice[icicleBls12_377.Projective], 1)

		// MSM over the (possibly registered-pinned) original points.
		e = msm.Msm(scalars, points, &cfg, out)
		assert.Equal(t, e, cr.CudaSuccess, "Msm allocated pinned host mem failed")

		outHost.CopyFromDevice(&out)
		// Check with gnark-crypto
		assert.True(t, testAgainstGnarkCryptoMsm(scalars, points, outHost[0]))

		if pinnableAndLockable {
			// MSM over the freshly allocated pinned copy of the points.
			e = msm.Msm(scalars, pinnedPoints, &cfg, out)
			assert.Equal(t, e, cr.CudaSuccess, "Msm registered pinned host mem failed")

			outHost.CopyFromDevice(&out)
			// Check with gnark-crypto
			assert.True(t, testAgainstGnarkCryptoMsm(scalars, pinnedPoints, outHost[0]))
		}

		out.Free()

		if pinnableAndLockable {
			// Fix: also assert cleanup succeeds instead of ignoring it.
			assert.Equal(t, cr.CudaSuccess, points.Unpin(), "Unpinning host memory failed")
			assert.Equal(t, cr.CudaSuccess, pinnedPoints.FreePinned(), "Freeing pinned host memory failed")
		}
	}
}
func TestMSMGnarkCryptoTypes(t *testing.T) {
cfg := msm.GetDefaultMSMConfig()
for _, power := range []int{3} {
Expand Down
3 changes: 2 additions & 1 deletion wrappers/golang/curves/bls12377/tests/ntt_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,11 +151,12 @@ func TestNttDeviceAsync(t *testing.T) {

func TestNttBatch(t *testing.T) {
cfg := ntt.GetDefaultNttConfig()
largestTestSize := 12
largestBatchSize := 100
scalars := bls12_377.GenerateScalars(1 << largestTestSize * largestBatchSize)

for _, size := range []int{4, largestTestSize} {
for _, batchSize := range []int{1, 16, largestBatchSize} {
for _, batchSize := range []int{2, 16, largestBatchSize} {
testSize := 1 << size
totalSize := testSize * batchSize

Expand Down
Loading

0 comments on commit 2b07513

Please sign in to comment.