From fdb620f30939a47a2cafd9e2da1b0a47823a0af9 Mon Sep 17 00:00:00 2001 From: "Randall C. O'Reilly" Date: Fri, 4 Oct 2024 12:32:14 -0700 Subject: [PATCH] gpu: major update to arrayn variable creation: actual array size is determined when value buffer is set, and also added ReadOnly flag, so it auto-creates read buffer for all !ReadOnly Storage variables. --- goal/gosl/examples/basic/compute.go | 1 + goal/gosl/examples/basic/gosl.go | 137 ++++++++++++++++++++++++++++ goal/gosl/examples/basic/main.go | 6 +- goal/gosl/examples/rand/main.go | 2 - goal/gosl/gotosl/gengpu.go | 19 ++-- goal/gosl/gotosl/gotosl.go | 5 + gpu/examples/compute/compute.go | 2 - gpu/value.go | 39 ++++++-- gpu/values.go | 12 --- gpu/var.go | 16 ++-- gpu/vargroup.go | 12 --- gpu/vars.go | 21 ----- 12 files changed, 197 insertions(+), 75 deletions(-) create mode 100644 goal/gosl/examples/basic/gosl.go diff --git a/goal/gosl/examples/basic/compute.go b/goal/gosl/examples/basic/compute.go index 5f44bdfce..512afd149 100644 --- a/goal/gosl/examples/basic/compute.go +++ b/goal/gosl/examples/basic/compute.go @@ -12,6 +12,7 @@ import "cogentcore.org/core/math32" //gosl:vars var ( // Params are the parameters for the computation. + //gosl:read-only Params []ParamStruct // Data is the data on which the computation operates. diff --git a/goal/gosl/examples/basic/gosl.go b/goal/gosl/examples/basic/gosl.go new file mode 100644 index 000000000..a3a027701 --- /dev/null +++ b/goal/gosl/examples/basic/gosl.go @@ -0,0 +1,137 @@ +// Code generated by "gosl"; DO NOT EDIT + +package main + +import ( + "embed" + "unsafe" + + "cogentcore.org/core/gpu" +) + +//go:embed shaders/*.wgsl +var shaders embed.FS + +// GPU is the compute gpu device +var ComputeGPU *gpu.GPU + +// UseGPU indicates whether to use GPU vs. CPU. +var UseGPU bool + +var GPUSystem *gpu.ComputeSystem + +// GPUVars is an enum for GPU variables, for specifying what to sync. 
+type GPUVars int32 //enums:enum + +const ( + ParamsVar GPUVars = 0 + DataVar GPUVars = 1 +) + +// GPUInit initializes the GPU compute system +// Configuring Systems, variables and kernels. +func GPUInit() { + gp := gpu.NewComputeGPU() + ComputeGPU = gp + { + sy := gpu.NewComputeSystem(gp, "Default") + GPUSystem = sy + gpu.NewComputePipelineShaderFS(shaders, "shaders/Compute.wgsl", sy) + vars := sy.Vars() + { + sgp := vars.AddGroup(gpu.Storage) + sgp.AddStruct("Params", int(unsafe.Sizeof(ParamStruct{})), len(Params), gpu.ComputeShader) + sgp.AddStruct("Data", int(unsafe.Sizeof(DataStruct{})), len(Data), gpu.ComputeShader) + sgp.SetNValues(1) + } + sy.Config() + } +} + +// GPURelease releases the GPU compute system. +func GPURelease() { + GPUSystem.Release() + ComputeGPU.Release() +} + +// RunCompute runs the Compute kernel with given number of items, +// on either the CPU or GPU depending on the UseGPU. +// Pass *Var variable names to sync those variables back from the GPU +// after running (irrelevant for CPU). +func RunCompute(n int, syncVars ...GPUVars) { + if UseGPU { + RunComputeGPU(n, syncVars...) + } else { + RunComputeCPU(n) + } +} + +// RunComputeGPU runs the Compute kernel on the GPU. +func RunComputeGPU(n int, syncVars ...GPUVars) { + sy := GPUSystem + pl := sy.ComputePipelines["Compute"] + ce, _ := sy.BeginComputePass() + pl.Dispatch1D(ce, n, 64) + ce.End() + ReadFromGPU(syncVars...) + sy.EndComputePass() + SyncFromGPU(syncVars...) +} + +// RunComputeCPU runs the Compute kernel on the CPU. +func RunComputeCPU(n int) { + // todo: need flops, need threaded api -- not tensor + for i := range n { + Compute(uint32(i)) + } +} + +// ToGPU copies given variables to the GPU for the system. 
+func ToGPU(vars ...GPUVars) { + sy := GPUSystem + syVars := sy.Vars() + for _, v := range vars { + switch v { + case ParamsVar: + v, _ := syVars.ValueByIndex(0, "Params", 0) + gpu.SetValueFrom(v, Params) + case DataVar: + v, _ := syVars.ValueByIndex(0, "Data", 0) + gpu.SetValueFrom(v, Data) + } + } +} + +// ReadFromGPU starts the process of copying vars from the GPU. +func ReadFromGPU(vars ...GPUVars) { + sy := GPUSystem + syVars := sy.Vars() + for _, v := range vars { + switch v { + case ParamsVar: + v, _ := syVars.ValueByIndex(0, "Params", 0) + v.GPUToRead(sy.CommandEncoder) + case DataVar: + v, _ := syVars.ValueByIndex(0, "Data", 0) + v.GPUToRead(sy.CommandEncoder) + } + } +} + +// SyncFromGPU synchronizes vars from the GPU to the actual variable. +func SyncFromGPU(vars ...GPUVars) { + sy := GPUSystem + syVars := sy.Vars() + for _, v := range vars { + switch v { + case ParamsVar: + v, _ := syVars.ValueByIndex(0, "Params", 0) + v.ReadSync() + gpu.ReadToBytes(v, Params) + case DataVar: + v, _ := syVars.ValueByIndex(0, "Data", 0) + v.ReadSync() + gpu.ReadToBytes(v, Data) + } + } +} diff --git a/goal/gosl/examples/basic/main.go b/goal/gosl/examples/basic/main.go index 3a6b4e3d0..632b52eda 100644 --- a/goal/gosl/examples/basic/main.go +++ b/goal/gosl/examples/basic/main.go @@ -12,6 +12,7 @@ import ( "runtime" "cogentcore.org/core/base/timer" + "cogentcore.org/core/gpu" ) //go:generate gosl . 
@@ -22,6 +23,9 @@ func init() { } func main() { + gpu.Debug = true + GPUInit() + n := 2000000 // note: not necc to spec up-front, but easier if so Params = make([]ParamStruct, 1) @@ -37,8 +41,6 @@ func main() { sd[i].Raw = Data[i].Raw } - GPUInit() - cpuTmr := timer.Time{} cpuTmr.Start() diff --git a/goal/gosl/examples/rand/main.go b/goal/gosl/examples/rand/main.go index f053e9676..99a93dcaf 100644 --- a/goal/gosl/examples/rand/main.go +++ b/goal/gosl/examples/rand/main.go @@ -70,8 +70,6 @@ func main() { gpu.SetValueFrom(cvl, []uint64{seed}) gpu.SetValueFrom(dvl, dataG) - sgp.CreateReadBuffers() - gpuTmr := timer.Time{} gpuTmr.Start() diff --git a/goal/gosl/gotosl/gengpu.go b/goal/gosl/gotosl/gengpu.go index aee3768a5..e75998d8e 100644 --- a/goal/gosl/gotosl/gengpu.go +++ b/goal/gosl/gotosl/gengpu.go @@ -42,7 +42,7 @@ import ( //go:embed %s/*.wgsl var shaders embed.FS -// GPU is the compute gpu device +// ComputeGPU is the compute gpu device var ComputeGPU *gpu.GPU // UseGPU indicates whether to use GPU vs. CPU. @@ -53,6 +53,7 @@ var UseGPU bool b.WriteString(fmt.Sprintf(header, st.Config.Output)) for _, sy := range st.Systems { + b.WriteString(fmt.Sprintf("// %s is a GPU compute System with kernels operating on the\nsame set of data variables.\n", st.genSysVar(sy))) b.WriteString(fmt.Sprintf("var %s *gpu.ComputeSystem\n", st.genSysVar(sy))) } @@ -77,8 +78,8 @@ const ( b.WriteString(")\n") initf := ` -// GPUInit initializes the GPU compute system -// Configuring Systems, variables and kernels. +// GPUInit initializes the GPU compute system, +// configuring system(s), variables and kernels. func GPUInit() { gp := gpu.NewComputeGPU() ComputeGPU = gp @@ -92,7 +93,8 @@ func GPUInit() { b.WriteString("}\n\n") release := ` -// GPURelease releases the GPU compute system. +// GPURelease releases the GPU compute system resources. +// Call this at program exit. 
func GPURelease() { ` @@ -138,9 +140,6 @@ func (st *State) GenGPUSystemInit(sy *System) string { b.WriteString(fmt.Sprintf("\t\t\tsgp.AddStruct(%q, int(unsafe.Sizeof(%s{})), len(%s), gpu.ComputeShader)\n", vr.Name, vr.Type[2:], vr.Name)) } b.WriteString("\t\t\tsgp.SetNValues(1)\n") - if !gp.Uniform { - b.WriteString("\t\t\tsgp.CreateReadBuffers()\n") - } b.WriteString("\t\t}\n") } b.WriteString("\t\tsy.Config()\n") @@ -157,9 +156,9 @@ func (st *State) GenGPUSystemOps(sy *System) string { // 1 = kernel, 2 = system, 3 = sysname run := ` -// Run%[1]s runs the %[1]s kernel with given number of items, -// on either the CPU or GPU depending on the UseGPU. -// Pass *Var variable names to sync those variables back from the GPU +// Run%[1]s runs the %[1]s kernel with given number of elements, +// on either the CPU or GPU depending on the UseGPU variable. +// Pass *Var variable enums to sync those variables back from the GPU // after running (irrelevant for CPU). func Run%[1]s(n int, syncVars ...GPUVars) { if UseGPU { diff --git a/goal/gosl/gotosl/gotosl.go b/goal/gosl/gotosl/gotosl.go index 1566adcf7..2cc552f5a 100644 --- a/goal/gosl/gotosl/gotosl.go +++ b/goal/gosl/gotosl/gotosl.go @@ -54,6 +54,11 @@ type Var struct { // Type of variable: either []Type or tensor.Float32, tensor.Int32 Type string + + // ReadOnly indicates that this variable is never read back from GPU, + // specified by the gosl:read-only property in the variable comments. + // It is important to optimize GPU memory usage to indicate this. + ReadOnly bool } // Group represents one variable group. 
diff --git a/gpu/examples/compute/compute.go b/gpu/examples/compute/compute.go index 6a6cdbf46..cfb835180 100644 --- a/gpu/examples/compute/compute.go +++ b/gpu/examples/compute/compute.go @@ -59,8 +59,6 @@ func main() { } gpu.SetValueFrom(dvl, sd) - sgp.CreateReadBuffers() - ce, _ := sy.BeginComputePass() pl.Dispatch1D(ce, n, threads) ce.End() diff --git a/gpu/value.go b/gpu/value.go index 922f8cc26..0f7fab316 100644 --- a/gpu/value.go +++ b/gpu/value.go @@ -30,10 +30,17 @@ type Value struct { // index of this value within the Var list of values Index int - // VarSize is the size of each Var element, which includes any fixed ArrayN + // VarSize is the size of each Var element, which includes any fixed Var.ArrayN // array size specified on the Var. + // The actual buffer size is VarSize * Value.ArrayN (or DynamicN for dynamic). VarSize int + // ArrayN is the actual number of array elements, for Uniform or Storage + // variables without a fixed array size (i.e., the Var ArrayN = 1). + // This is set when the buffer is actually created, based on the data, + // or can be set directly prior to buffer creation. + ArrayN int + // DynamicIndex is the current index into a DynamicOffset variable // to use for the SetBindGroup call. Note that this is an index, // not an offset, so it indexes the DynamicN Vars in the Value, @@ -112,6 +119,7 @@ func (vl *Value) init(vr *Var, dev *Device, idx int) { vl.Index = idx vl.Name = fmt.Sprintf("%s_%d", vr.Name, vl.Index) vl.VarSize = vr.MemSize() + vl.ArrayN = 1 vl.alignBytes = vr.alignBytes vl.AlignVarSize = MemSizeAlign(vl.VarSize, vl.alignBytes) vl.isDynamic = vl.role == Vertex || vl.role == Index || vr.DynamicOffset @@ -129,11 +137,12 @@ func (vl *Value) MemSize() int { if vl.isDynamic { return vl.AlignVarSize * vl.dynamicN } - return vl.VarSize + return vl.ArrayN * vl.VarSize } // CreateBuffer creates the GPU buffer for this value if it does not // yet exist or is not the right size. 
+// For !ReadOnly [Storage] buffers, calls [Value.CreateReadBuffer]. func (vl *Value) CreateBuffer() error { if vl.role == SampledTexture { return nil @@ -159,6 +168,9 @@ func (vl *Value) CreateBuffer() error { } vl.AllocSize = sz vl.buffer = buf + if vl.role == Storage && !vl.vvar.ReadOnly { + vl.CreateReadBuffer() + } return nil } @@ -214,6 +226,9 @@ func (vl *Value) SetDynamicN(n int) { // SetValueFrom copies given values into value buffer memory, // making the buffer if it has not yet been constructed. +// The actual ArrayN size of Storage or Uniform variables will +// be computed based on the size of the from bytes, relative to +// the variable size. // IMPORTANT: do not use this for dynamic offset Uniform or // Storage variables, as the alignment will not be correct; // See [SetDynamicFromBytes]. @@ -223,6 +238,7 @@ func SetValueFrom[E any](vl *Value, from []E) error { // SetFromBytes copies given bytes into value buffer memory, // making the buffer if it has not yet been constructed. +// For !ReadOnly [Storage] buffers, calls [Value.CreateReadBuffer]. // IMPORTANT: do not use this for dynamic offset Uniform or // Storage variables, as the alignment will not be correct; // See [SetDynamicFromBytes]. 
@@ -232,12 +248,19 @@ func (vl *Value) SetFromBytes(from []byte) error { return errors.Log(err) } nb := len(from) + an := nb / vl.VarSize + aover := nb % vl.VarSize + if aover != 0 { + err := fmt.Errorf("gpu.Value SetFromBytes %s, Size passed: %d is not an even multiple of the variable size: %d", vl.Name, nb, vl.VarSize) + return errors.Log(err) + } if vl.isDynamic { // Vertex, Index at this point - dn := nb / vl.VarSize - vl.SetDynamicN(dn) + vl.SetDynamicN(an) + } else { + vl.ArrayN = an } tb := vl.MemSize() - if nb != tb { + if nb != tb { // this should never happen, but just in case err := fmt.Errorf("gpu.Value SetFromBytes %s, Size passed: %d != Size expected %d", vl.Name, nb, tb) return errors.Log(err) } @@ -254,6 +277,9 @@ func (vl *Value) SetFromBytes(from []byte) error { } vl.buffer = buf vl.AllocSize = nb + if vl.role == Storage && !vl.vvar.ReadOnly { + vl.CreateReadBuffer() + } } else { err := vl.device.Queue.WriteBuffer(vl.buffer, 0, from) if errors.Log(err) != nil { @@ -406,9 +432,8 @@ func (vl *Value) SetFromTexture(tx *Texture) *Texture { } // CreateReadBuffer creates a read buffer for this value, -// if it does not yet exist or is not the right size. +// for [Storage] values only. Automatically called for !ReadOnly. // Read buffer is needed for reading values back from the GPU. -// Only for Storage role variables. func (vl *Value) CreateReadBuffer() error { if !(vl.role == Storage || vl.role == StorageTexture) { return nil diff --git a/gpu/values.go b/gpu/values.go index 695db559e..193c6da69 100644 --- a/gpu/values.go +++ b/gpu/values.go @@ -147,18 +147,6 @@ func (vs *Values) MemSize() int { return tsz } -// CreateReadBuffers creates read buffers for all values. -func (vs *Values) CreateReadBuffers() error { - var errs []error - for _, vl := range vs.Values { - err := vl.CreateReadBuffer() - if err != nil { - errs = append(errs, err) - } - } - return errors.Join(errs...) 
-} - // bindGroupEntry returns the BindGroupEntry for Current // value for this variable. func (vs *Values) bindGroupEntry(vr *Var) []wgpu.BindGroupEntry { diff --git a/gpu/var.go b/gpu/var.go index 0845aa28a..7e59acca9 100644 --- a/gpu/var.go +++ b/gpu/var.go @@ -35,13 +35,12 @@ type Var struct { // automatically be sent as 4 interleaved Float32Vector4 chuncks. Type Types - // number of elements, which is 1 for a single element, or a constant - // number for a fixed array of elements. For Vertex variables, the - // number is dynamic and does not need to be specified in advance, - // so you can leave it at 1. There can be alignment issues with arrays + // ArrayN is the number of elements in an array, only if there is a + // fixed array size. Otherwise, for single elements or dynamic arrays + // use a value of 1. There can be alignment issues with arrays // so make sure your elemental types are compatible. // Note that DynamicOffset variables can have Value buffers with multiple - // instances of the variable (with proper alignment stride), which is + // instances of the variable (with proper alignment stride), // which goes on top of any array value for the variable itself. ArrayN int @@ -87,6 +86,11 @@ type Var struct { // Only for Uniform and Storage variables. DynamicOffset bool + // ReadOnly applies only to [Storage] variables, and indicates that + // they are never read back from the GPU, so the additional staging + // buffers needed to do so are not created for these variables. + ReadOnly bool + // Values is the the array of Values allocated for this variable. // Each value has its own corresponding Buffer or Texture. // The currently-active Value is specified by the Current index, @@ -145,7 +149,6 @@ func (vr *Var) MemSize() int { if vr.ArrayN < 1 { vr.ArrayN = 1 } - // todo: may need to diagnose alignments here.. 
switch { case vr.Role >= SampledTexture: return 0 @@ -157,7 +160,6 @@ func (vr *Var) MemSize() int { // Release resets the MemPtr for values, resets any self-owned resources (Textures) func (vr *Var) Release() { vr.Values.Release() - // todo: free anything in var } // SetNValues sets specified number of Values for this var. diff --git a/gpu/vargroup.go b/gpu/vargroup.go index a676ad661..54c022a5f 100644 --- a/gpu/vargroup.go +++ b/gpu/vargroup.go @@ -166,18 +166,6 @@ func (vg *VarGroup) SetAllCurrentValue(i int) { } } -// CreateReadBuffers creates read buffers for all values. -func (vg *VarGroup) CreateReadBuffers() error { - var errs []error - for _, vr := range vg.Vars { - err := vr.Values.CreateReadBuffers() - if err != nil { - errs = append(errs, err) - } - } - return errors.Join(errs...) -} - // Config must be called after all variables have been added. // Configures binding / location for all vars based on sequential order. // also does validation and returns error message. diff --git a/gpu/vars.go b/gpu/vars.go index 9580a95ce..2e1720b94 100644 --- a/gpu/vars.go +++ b/gpu/vars.go @@ -169,27 +169,6 @@ func (vs *Vars) SetDynamicIndex(group int, name string, dynamicIndex int) *Var { return vr } -// CreateReadBuffers creates read buffers for all Storage variables. -// This is needed to be able to read values back from GPU (e.g., for Compute). -func (vs *Vars) CreateReadBuffers() error { - var errs []error - ns := vs.NGroups() - for gi := vs.StartGroup(); gi < ns; gi++ { - vg := vs.Groups[gi] - if vg == nil { - continue - } - if vg.Role != Storage { - continue - } - err := vg.CreateReadBuffers() - if err != nil { - errs = append(errs, err) - } - } - return errors.Join(errs...) -} - // Config must be called after all variables have been added. // Configures all Groups and also does validation, returning error // does DescLayout too, so all ready for Pipeline config.