From fdb620f30939a47a2cafd9e2da1b0a47823a0af9 Mon Sep 17 00:00:00 2001
From: "Randall C. O'Reilly" <oreilly@ucdavis.edu>
Date: Fri, 4 Oct 2024 12:32:14 -0700
Subject: [PATCH] gpu: major update to arrayn variable creation: actual array
 size is determined when value buffer is set, and also added ReadOnly flag, so
 it auto-creates read buffer for all !ReadOnly Storage variables.

---
 goal/gosl/examples/basic/compute.go |   1 +
 goal/gosl/examples/basic/gosl.go    | 137 ++++++++++++++++++++++++++++
 goal/gosl/examples/basic/main.go    |   6 +-
 goal/gosl/examples/rand/main.go     |   2 -
 goal/gosl/gotosl/gengpu.go          |  19 ++--
 goal/gosl/gotosl/gotosl.go          |   5 +
 gpu/examples/compute/compute.go     |   2 -
 gpu/value.go                        |  39 ++++++--
 gpu/values.go                       |  12 ---
 gpu/var.go                          |  16 ++--
 gpu/vargroup.go                     |  12 ---
 gpu/vars.go                         |  21 -----
 12 files changed, 197 insertions(+), 75 deletions(-)
 create mode 100644 goal/gosl/examples/basic/gosl.go

diff --git a/goal/gosl/examples/basic/compute.go b/goal/gosl/examples/basic/compute.go
index 5f44bdfce..512afd149 100644
--- a/goal/gosl/examples/basic/compute.go
+++ b/goal/gosl/examples/basic/compute.go
@@ -12,6 +12,7 @@ import "cogentcore.org/core/math32"
 //gosl:vars
 var (
 	// Params are the parameters for the computation.
+	//gosl:read-only
 	Params []ParamStruct
 
 	// Data is the data on which the computation operates.
diff --git a/goal/gosl/examples/basic/gosl.go b/goal/gosl/examples/basic/gosl.go
new file mode 100644
index 000000000..a3a027701
--- /dev/null
+++ b/goal/gosl/examples/basic/gosl.go
@@ -0,0 +1,137 @@
+// Code generated by "gosl"; DO NOT EDIT
+
+package main
+
+import (
+	"embed"
+	"unsafe"
+
+	"cogentcore.org/core/gpu"
+)
+
+//go:embed shaders/*.wgsl
+var shaders embed.FS
+
+// ComputeGPU is the compute gpu device
+var ComputeGPU *gpu.GPU
+
+// UseGPU indicates whether to use GPU vs. CPU.
+var UseGPU bool
+
+var GPUSystem *gpu.ComputeSystem
+
+// GPUVars is an enum for GPU variables, for specifying what to sync.
+type GPUVars int32 //enums:enum
+
+const (
+	ParamsVar GPUVars = 0
+	DataVar   GPUVars = 1
+)
+
+// GPUInit initializes the GPU compute system,
+// configuring system(s), variables and kernels.
+func GPUInit() {
+	gp := gpu.NewComputeGPU()
+	ComputeGPU = gp
+	{
+		sy := gpu.NewComputeSystem(gp, "Default")
+		GPUSystem = sy
+		gpu.NewComputePipelineShaderFS(shaders, "shaders/Compute.wgsl", sy)
+		vars := sy.Vars()
+		{
+			sgp := vars.AddGroup(gpu.Storage)
+			sgp.AddStruct("Params", int(unsafe.Sizeof(ParamStruct{})), len(Params), gpu.ComputeShader)
+			sgp.AddStruct("Data", int(unsafe.Sizeof(DataStruct{})), len(Data), gpu.ComputeShader)
+			sgp.SetNValues(1)
+		}
+		sy.Config()
+	}
+}
+
+// GPURelease releases the GPU compute system.
+func GPURelease() {
+	GPUSystem.Release()
+	ComputeGPU.Release()
+}
+
+// RunCompute runs the Compute kernel with given number of items,
+// on either the CPU or GPU depending on the UseGPU.
+// Pass *Var variable names to sync those variables back from the GPU
+// after running (irrelevant for CPU).
+func RunCompute(n int, syncVars ...GPUVars) {
+	if UseGPU {
+		RunComputeGPU(n, syncVars...)
+	} else {
+		RunComputeCPU(n)
+	}
+}
+
+// RunComputeGPU runs the Compute kernel on the GPU.
+func RunComputeGPU(n int, syncVars ...GPUVars) {
+	sy := GPUSystem
+	pl := sy.ComputePipelines["Compute"]
+	ce, _ := sy.BeginComputePass()
+	pl.Dispatch1D(ce, n, 64)
+	ce.End()
+	ReadFromGPU(syncVars...)
+	sy.EndComputePass()
+	SyncFromGPU(syncVars...)
+}
+
+// RunComputeCPU runs the Compute kernel on the CPU.
+func RunComputeCPU(n int) {
+	// todo: need flops, need threaded api -- not tensor
+	for i := range n {
+		Compute(uint32(i))
+	}
+}
+
+// ToGPU copies given variables to the GPU for the system.
+func ToGPU(vars ...GPUVars) {
+	sy := GPUSystem
+	syVars := sy.Vars()
+	for _, v := range vars {
+		switch v {
+		case ParamsVar:
+			v, _ := syVars.ValueByIndex(0, "Params", 0)
+			gpu.SetValueFrom(v, Params)
+		case DataVar:
+			v, _ := syVars.ValueByIndex(0, "Data", 0)
+			gpu.SetValueFrom(v, Data)
+		}
+	}
+}
+
+// ReadFromGPU starts the process of copying vars from the GPU.
+func ReadFromGPU(vars ...GPUVars) {
+	sy := GPUSystem
+	syVars := sy.Vars()
+	for _, v := range vars {
+		switch v {
+		case ParamsVar:
+			v, _ := syVars.ValueByIndex(0, "Params", 0)
+			v.GPUToRead(sy.CommandEncoder)
+		case DataVar:
+			v, _ := syVars.ValueByIndex(0, "Data", 0)
+			v.GPUToRead(sy.CommandEncoder)
+		}
+	}
+}
+
+// SyncFromGPU synchronizes vars from the GPU to the actual variable.
+func SyncFromGPU(vars ...GPUVars) {
+	sy := GPUSystem
+	syVars := sy.Vars()
+	for _, v := range vars {
+		switch v {
+		case ParamsVar:
+			v, _ := syVars.ValueByIndex(0, "Params", 0)
+			v.ReadSync()
+			gpu.ReadToBytes(v, Params)
+		case DataVar:
+			v, _ := syVars.ValueByIndex(0, "Data", 0)
+			v.ReadSync()
+			gpu.ReadToBytes(v, Data)
+		}
+	}
+}
diff --git a/goal/gosl/examples/basic/main.go b/goal/gosl/examples/basic/main.go
index 3a6b4e3d0..632b52eda 100644
--- a/goal/gosl/examples/basic/main.go
+++ b/goal/gosl/examples/basic/main.go
@@ -12,6 +12,7 @@ import (
 	"runtime"
 
 	"cogentcore.org/core/base/timer"
+	"cogentcore.org/core/gpu"
 )
 
 //go:generate gosl .
@@ -22,6 +23,9 @@ func init() {
 }
 
 func main() {
+	gpu.Debug = true
+	GPUInit()
+
 	n := 2000000 // note: not necc to spec up-front, but easier if so
 
 	Params = make([]ParamStruct, 1)
@@ -37,8 +41,6 @@ func main() {
 		sd[i].Raw = Data[i].Raw
 	}
 
-	GPUInit()
-
 	cpuTmr := timer.Time{}
 	cpuTmr.Start()
 
diff --git a/goal/gosl/examples/rand/main.go b/goal/gosl/examples/rand/main.go
index f053e9676..99a93dcaf 100644
--- a/goal/gosl/examples/rand/main.go
+++ b/goal/gosl/examples/rand/main.go
@@ -70,8 +70,6 @@ func main() {
 	gpu.SetValueFrom(cvl, []uint64{seed})
 	gpu.SetValueFrom(dvl, dataG)
 
-	sgp.CreateReadBuffers()
-
 	gpuTmr := timer.Time{}
 	gpuTmr.Start()
 
diff --git a/goal/gosl/gotosl/gengpu.go b/goal/gosl/gotosl/gengpu.go
index aee3768a5..e75998d8e 100644
--- a/goal/gosl/gotosl/gengpu.go
+++ b/goal/gosl/gotosl/gengpu.go
@@ -42,7 +42,7 @@ import (
 //go:embed %s/*.wgsl
 var shaders embed.FS
 
-// GPU is the compute gpu device
+// ComputeGPU is the compute gpu device
 var ComputeGPU *gpu.GPU
 
 // UseGPU indicates whether to use GPU vs. CPU.
@@ -53,6 +53,7 @@ var UseGPU bool
 	b.WriteString(fmt.Sprintf(header, st.Config.Output))
 
 	for _, sy := range st.Systems {
+		b.WriteString(fmt.Sprintf("// %s is a GPU compute System with kernels operating on the\n// same set of data variables.\n", st.genSysVar(sy)))
 		b.WriteString(fmt.Sprintf("var %s *gpu.ComputeSystem\n", st.genSysVar(sy)))
 	}
 
@@ -77,8 +78,8 @@ const (
 	b.WriteString(")\n")
 
 	initf := `
-// GPUInit initializes the GPU compute system
-// Configuring Systems, variables and kernels.
+// GPUInit initializes the GPU compute system,
+// configuring system(s), variables and kernels.
 func GPUInit() {
 	gp := gpu.NewComputeGPU()
 	ComputeGPU = gp
@@ -92,7 +93,8 @@ func GPUInit() {
 	b.WriteString("}\n\n")
 
 	release := `
-// GPURelease releases the GPU compute system.
+// GPURelease releases the GPU compute system resources.
+// Call this at program exit.
 func GPURelease() {
 `
 
@@ -138,9 +140,6 @@ func (st *State) GenGPUSystemInit(sy *System) string {
 			b.WriteString(fmt.Sprintf("\t\t\tsgp.AddStruct(%q, int(unsafe.Sizeof(%s{})), len(%s), gpu.ComputeShader)\n", vr.Name, vr.Type[2:], vr.Name))
 		}
 		b.WriteString("\t\t\tsgp.SetNValues(1)\n")
-		if !gp.Uniform {
-			b.WriteString("\t\t\tsgp.CreateReadBuffers()\n")
-		}
 		b.WriteString("\t\t}\n")
 	}
 	b.WriteString("\t\tsy.Config()\n")
@@ -157,9 +156,9 @@ func (st *State) GenGPUSystemOps(sy *System) string {
 
 	// 1 = kernel, 2 = system, 3 = sysname
 	run := `
-// Run%[1]s runs the %[1]s kernel with given number of items,
-// on either the CPU or GPU depending on the UseGPU.
-// Pass *Var variable names to sync those variables back from the GPU
+// Run%[1]s runs the %[1]s kernel with given number of elements,
+// on either the CPU or GPU depending on the UseGPU variable.
+// Pass *Var variable enums to sync those variables back from the GPU
 // after running (irrelevant for CPU).
 func Run%[1]s(n int, syncVars ...GPUVars) {
 	if UseGPU {
diff --git a/goal/gosl/gotosl/gotosl.go b/goal/gosl/gotosl/gotosl.go
index 1566adcf7..2cc552f5a 100644
--- a/goal/gosl/gotosl/gotosl.go
+++ b/goal/gosl/gotosl/gotosl.go
@@ -54,6 +54,11 @@ type Var struct {
 
 	// Type of variable: either []Type or tensor.Float32, tensor.Int32
 	Type string
+
+	// ReadOnly indicates that this variable is never read back from GPU,
+	// specified by the gosl:read-only property in the variable comments.
+	// Indicating this is important for optimizing GPU memory usage.
+	ReadOnly bool
 }
 
 // Group represents one variable group.
diff --git a/gpu/examples/compute/compute.go b/gpu/examples/compute/compute.go
index 6a6cdbf46..cfb835180 100644
--- a/gpu/examples/compute/compute.go
+++ b/gpu/examples/compute/compute.go
@@ -59,8 +59,6 @@ func main() {
 	}
 	gpu.SetValueFrom(dvl, sd)
 
-	sgp.CreateReadBuffers()
-
 	ce, _ := sy.BeginComputePass()
 	pl.Dispatch1D(ce, n, threads)
 	ce.End()
diff --git a/gpu/value.go b/gpu/value.go
index 922f8cc26..0f7fab316 100644
--- a/gpu/value.go
+++ b/gpu/value.go
@@ -30,10 +30,17 @@ type Value struct {
 	// index of this value within the Var list of values
 	Index int
 
-	// VarSize is the size of each Var element, which includes any fixed ArrayN
+	// VarSize is the size of each Var element, which includes any fixed Var.ArrayN
 	// array size specified on the Var.
+	// The actual buffer size is VarSize * Value.ArrayN (or DynamicN for dynamic).
 	VarSize int
 
+	// ArrayN is the actual number of array elements, for Uniform or Storage
+	// variables without a fixed array size (i.e., the Var ArrayN = 1).
+	// This is set when the buffer is actually created, based on the data,
+	// or can be set directly prior to buffer creation.
+	ArrayN int
+
 	// DynamicIndex is the current index into a DynamicOffset variable
 	// to use for the SetBindGroup call.  Note that this is an index,
 	// not an offset, so it indexes the DynamicN Vars in the Value,
@@ -112,6 +119,7 @@ func (vl *Value) init(vr *Var, dev *Device, idx int) {
 	vl.Index = idx
 	vl.Name = fmt.Sprintf("%s_%d", vr.Name, vl.Index)
 	vl.VarSize = vr.MemSize()
+	vl.ArrayN = 1
 	vl.alignBytes = vr.alignBytes
 	vl.AlignVarSize = MemSizeAlign(vl.VarSize, vl.alignBytes)
 	vl.isDynamic = vl.role == Vertex || vl.role == Index || vr.DynamicOffset
@@ -129,11 +137,12 @@ func (vl *Value) MemSize() int {
 	if vl.isDynamic {
 		return vl.AlignVarSize * vl.dynamicN
 	}
-	return vl.VarSize
+	return vl.ArrayN * vl.VarSize
 }
 
 // CreateBuffer creates the GPU buffer for this value if it does not
 // yet exist or is not the right size.
+// For !ReadOnly [Storage] buffers, calls [Value.CreateReadBuffer].
 func (vl *Value) CreateBuffer() error {
 	if vl.role == SampledTexture {
 		return nil
@@ -159,6 +168,9 @@ func (vl *Value) CreateBuffer() error {
 	}
 	vl.AllocSize = sz
 	vl.buffer = buf
+	if vl.role == Storage && !vl.vvar.ReadOnly {
+		return vl.CreateReadBuffer() // propagate read buffer failure instead of dropping it
+	}
 	return nil
 }
 
@@ -214,6 +226,9 @@ func (vl *Value) SetDynamicN(n int) {
 
 // SetValueFrom copies given values into value buffer memory,
 // making the buffer if it has not yet been constructed.
+// The actual ArrayN size of Storage or Uniform variables will
+// be computed based on the size of the from bytes, relative to
+// the variable size.
 // IMPORTANT: do not use this for dynamic offset Uniform or
 // Storage variables, as the alignment will not be correct;
 // See [SetDynamicFromBytes].
@@ -223,6 +238,7 @@ func SetValueFrom[E any](vl *Value, from []E) error {
 
 // SetFromBytes copies given bytes into value buffer memory,
 // making the buffer if it has not yet been constructed.
+// For !ReadOnly [Storage] buffers, calls [Value.CreateReadBuffer].
 // IMPORTANT: do not use this for dynamic offset Uniform or
 // Storage variables, as the alignment will not be correct;
 // See [SetDynamicFromBytes].
@@ -232,12 +248,19 @@ func (vl *Value) SetFromBytes(from []byte) error {
 		return errors.Log(err)
 	}
 	nb := len(from)
+	an := nb / vl.VarSize
+	aover := nb % vl.VarSize
+	if aover != 0 {
+		err := fmt.Errorf("gpu.Value SetFromBytes %s, Size passed: %d is not an even multiple of the variable size: %d", vl.Name, nb, vl.VarSize)
+		return errors.Log(err)
+	}
 	if vl.isDynamic { // Vertex, Index at this point
-		dn := nb / vl.VarSize
-		vl.SetDynamicN(dn)
+		vl.SetDynamicN(an)
+	} else {
+		vl.ArrayN = an
 	}
 	tb := vl.MemSize()
-	if nb != tb {
+	if nb != tb { // this should never happen, but just in case
 		err := fmt.Errorf("gpu.Value SetFromBytes %s, Size passed: %d != Size expected %d", vl.Name, nb, tb)
 		return errors.Log(err)
 	}
@@ -254,6 +277,9 @@ func (vl *Value) SetFromBytes(from []byte) error {
 		}
 		vl.buffer = buf
 		vl.AllocSize = nb
+		if vl.role == Storage && !vl.vvar.ReadOnly {
+			errors.Log(vl.CreateReadBuffer()) // log (not drop) any read buffer failure
+		}
 	} else {
 		err := vl.device.Queue.WriteBuffer(vl.buffer, 0, from)
 		if errors.Log(err) != nil {
@@ -406,9 +432,8 @@ func (vl *Value) SetFromTexture(tx *Texture) *Texture {
 }
 
 // CreateReadBuffer creates a read buffer for this value,
-// if it does not yet exist or is not the right size.
+// for [Storage] values only. Automatically called for !ReadOnly.
 // Read buffer is needed for reading values back from the GPU.
-// Only for Storage role variables.
 func (vl *Value) CreateReadBuffer() error {
 	if !(vl.role == Storage || vl.role == StorageTexture) {
 		return nil
diff --git a/gpu/values.go b/gpu/values.go
index 695db559e..193c6da69 100644
--- a/gpu/values.go
+++ b/gpu/values.go
@@ -147,18 +147,6 @@ func (vs *Values) MemSize() int {
 	return tsz
 }
 
-// CreateReadBuffers creates read buffers for all values.
-func (vs *Values) CreateReadBuffers() error {
-	var errs []error
-	for _, vl := range vs.Values {
-		err := vl.CreateReadBuffer()
-		if err != nil {
-			errs = append(errs, err)
-		}
-	}
-	return errors.Join(errs...)
-}
-
 // bindGroupEntry returns the BindGroupEntry for Current
 // value for this variable.
 func (vs *Values) bindGroupEntry(vr *Var) []wgpu.BindGroupEntry {
diff --git a/gpu/var.go b/gpu/var.go
index 0845aa28a..7e59acca9 100644
--- a/gpu/var.go
+++ b/gpu/var.go
@@ -35,13 +35,12 @@ type Var struct {
 	// automatically be sent as 4 interleaved Float32Vector4 chuncks.
 	Type Types
 
-	// number of elements, which is 1 for a single element, or a constant
-	// number for a fixed array of elements.  For Vertex variables, the
-	// number is dynamic and does not need to be specified in advance,
-	// so you can leave it at 1. There can be alignment issues with arrays
+	// ArrayN is the number of elements in an array, only if there is a
+	// fixed array size. Otherwise, for single elements or dynamic arrays
+	// use a value of 1. There can be alignment issues with arrays
 	// so make sure your elemental types are compatible.
 	// Note that DynamicOffset variables can have Value buffers with multiple
-	// instances of the variable (with proper alignment stride), which is
+	// instances of the variable (with proper alignment stride),
 	// which goes on top of any array value for the variable itself.
 	ArrayN int
 
@@ -87,6 +86,11 @@ type Var struct {
 	// Only for Uniform and Storage variables.
 	DynamicOffset bool
 
+	// ReadOnly applies only to [Storage] variables, and indicates that
+	// they are never read back from the GPU, so the additional staging
+	// buffers needed to do so are not created for these variables.
+	ReadOnly bool
+
 	// Values is the the array of Values allocated for this variable.
 	// Each value has its own corresponding Buffer or Texture.
 	// The currently-active Value is specified by the Current index,
@@ -145,7 +149,6 @@ func (vr *Var) MemSize() int {
 	if vr.ArrayN < 1 {
 		vr.ArrayN = 1
 	}
-	// todo: may need to diagnose alignments here..
 	switch {
 	case vr.Role >= SampledTexture:
 		return 0
@@ -157,7 +160,6 @@ func (vr *Var) MemSize() int {
 // Release resets the MemPtr for values, resets any self-owned resources (Textures)
 func (vr *Var) Release() {
 	vr.Values.Release()
-	// todo: free anything in var
 }
 
 // SetNValues sets specified number of Values for this var.
diff --git a/gpu/vargroup.go b/gpu/vargroup.go
index a676ad661..54c022a5f 100644
--- a/gpu/vargroup.go
+++ b/gpu/vargroup.go
@@ -166,18 +166,6 @@ func (vg *VarGroup) SetAllCurrentValue(i int) {
 	}
 }
 
-// CreateReadBuffers creates read buffers for all values.
-func (vg *VarGroup) CreateReadBuffers() error {
-	var errs []error
-	for _, vr := range vg.Vars {
-		err := vr.Values.CreateReadBuffers()
-		if err != nil {
-			errs = append(errs, err)
-		}
-	}
-	return errors.Join(errs...)
-}
-
 // Config must be called after all variables have been added.
 // Configures binding / location for all vars based on sequential order.
 // also does validation and returns error message.
diff --git a/gpu/vars.go b/gpu/vars.go
index 9580a95ce..2e1720b94 100644
--- a/gpu/vars.go
+++ b/gpu/vars.go
@@ -169,27 +169,6 @@ func (vs *Vars) SetDynamicIndex(group int, name string, dynamicIndex int) *Var {
 	return vr
 }
 
-// CreateReadBuffers creates read buffers for all Storage variables.
-// This is needed to be able to read values back from GPU (e.g., for Compute).
-func (vs *Vars) CreateReadBuffers() error {
-	var errs []error
-	ns := vs.NGroups()
-	for gi := vs.StartGroup(); gi < ns; gi++ {
-		vg := vs.Groups[gi]
-		if vg == nil {
-			continue
-		}
-		if vg.Role != Storage {
-			continue
-		}
-		err := vg.CreateReadBuffers()
-		if err != nil {
-			errs = append(errs, err)
-		}
-	}
-	return errors.Join(errs...)
-}
-
 // Config must be called after all variables have been added.
 // Configures all Groups and also does validation, returning error
 // does DescLayout too, so all ready for Pipeline config.