tools/flow: high memory usage #1795

jlongtine · 2022-07-18T19:07:20Z

What version of CUE are you using (`cue version`)?

v0.4.3

Does this issue reproduce with the latest release?

Yes.

What did you do?

go build -o main
./main
go tool pprof heap
./main -disjunctions=false
go tool pprof heap

-- go.mod --
module dagger.io/dagger/cue-memory-use

go 1.18

require cuelang.org/go v0.4.3

require (
	github.com/cockroachdb/apd/v2 v2.0.1 // indirect
	github.com/google/uuid v1.2.0 // indirect
	github.com/mpvl/unique v0.0.0-20150818121801-cbe035fff7de // indirect
	github.com/pkg/errors v0.8.1 // indirect
	golang.org/x/net v0.0.0-20200226121028-0de0cce0169b // indirect
	golang.org/x/text v0.3.7 // indirect
	gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b // indirect
)
-- go.sum --
cuelang.org/go v0.4.3 h1:W3oBBjDTm7+IZfCKZAmC8uDG0eYfJL4Pp/xbbCMKaVo=
cuelang.org/go v0.4.3/go.mod h1:7805vR9H+VoBNdWFdI7jyDR3QLUPp4+naHfbcgp55HI=
github.com/cockroachdb/apd v1.1.0 h1:3LFP3629v+1aKXU5Q37mxmRxX/pIu1nijXydLShEq5I=
github.com/cockroachdb/apd/v2 v2.0.1 h1:y1Rh3tEU89D+7Tgbw+lp52T6p/GJLpDmNvr10UWqLTE=
github.com/cockroachdb/apd/v2 v2.0.1/go.mod h1:DDxRlzC2lo3/vSlmSoS7JkqbbrARPuFOGr0B9pvN3Gw=
github.com/google/uuid v1.2.0 h1:qJYtXnJRWmpe7m/3XlyhrsLrEURqHRM2kxzoxXqyUDs=
github.com/google/uuid v1.2.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/mpvl/unique v0.0.0-20150818121801-cbe035fff7de h1:D5x39vF5KCwKQaw+OC9ZPiLVHXz3UFw2+psEX+gYcto=
github.com/mpvl/unique v0.0.0-20150818121801-cbe035fff7de/go.mod h1:kJun4WP5gFuHZgRjZUWWuH1DTxCtxbHDOIJsudS8jzY=
github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I=
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b h1:0mm1VjtFUOIlE1SbDlwjYaDxZVDP2S5ou6y0gSgXHu8=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b h1:h8qDotaEPuJATrMmW04NCwg7v22aHH28wwpauUhK9Oo=
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
-- main.go --
package main

import (
	"context"
	"fmt"
	"log"
	"os"
	"runtime/pprof"
	"time"

	"cuelang.org/go/cue"
	"cuelang.org/go/cue/ast"
	"cuelang.org/go/cue/cuecontext"
	"cuelang.org/go/cue/format"
	"cuelang.org/go/cue/token"
	"cuelang.org/go/tools/flow"
)

func genTask(name, prev int) *ast.Field {
	var prevTask ast.Expr
	if prev >= 0 {
		prevTask = ast.NewStruct(
			ast.NewIdent("input"),
			ast.NewSel(ast.NewIdent(fmt.Sprintf("task%d", prev)), "output"),
		)
	} else {
		prevTask = ast.NewStruct(ast.NewIdent("input"), ast.NewString("world"))
	}
	return &ast.Field{
		Label: ast.NewIdent(fmt.Sprintf("task%d", name)),
		Value: ast.NewBinExpr(token.AND,
			ast.NewIdent("#MyDef"),
			prevTask,
		),
	}
}

func genFile(disjunctions bool) ast.File {
	var schema ast.Expr
	if disjunctions {
		schema = ast.NewBinExpr(token.OR,
			ast.NewIdent("string"),
			ast.NewIdent("bool"),
			ast.NewIdent("int"),
			ast.NewStruct(
				ast.NewIdent("a"), ast.NewIdent("string"),
				ast.NewIdent("b"), ast.NewIdent("int"),
			),
		)
	} else {
		schema = ast.NewIdent("string")
	}
	return ast.File{
		Filename: "example.cue",
		Decls: []ast.Decl{
			&ast.Field{
				Label: ast.NewIdent("#MyDef"),
				Value: ast.NewStruct(
					&ast.Field{
						Label: ast.NewIdent("input"),
						Value: schema,
					},
					&ast.Field{
						Label: ast.NewIdent("output"),
						Value: schema,
					},
				),
			},
		},
	}
}

func main() {
	r := cuecontext.New()
	file := genFile(true)
	fmt.Println("genTasks: Starting at ", time.Now())
	for i := 0; i < 1000; i++ {
		file.Decls = append(file.Decls, genTask(i, i-1))
	}
	fmt.Println("genTasks: Ended at ", time.Now())

	b, _ := format.Node(&file, format.Simplify())
	fmt.Println(string(b))
	val := r.BuildFile(&file)
	if err := val.Err(); err != nil {
		log.Fatal(err)
	}
	fmt.Println("Starting at ", time.Now())
	controller := flow.New(nil, val, ioTaskFunc)
	if err := controller.Run(context.Background()); err != nil {
		log.Fatal(err)
	}

	fmt.Println("Taking heap dump")
	outputFile, _ := os.Create("heap")
	err := pprof.WriteHeapProfile(outputFile)
	if err != nil {
		panic(err)
	}
	fmt.Println("Ended at ", time.Now())
}

func ioTaskFunc(v cue.Value) (flow.Runner, error) {
	inputPath := cue.ParsePath("input")

	input := v.LookupPath(inputPath)
	if !input.Exists() {
		return nil, nil
	}

	return flow.RunnerFunc(func(t *flow.Task) error {
		inputVal, err := t.Value().LookupPath(inputPath).String()
		if err != nil {
			return fmt.Errorf("input not of type string")
		}

		outputVal := inputVal

		return t.Fill(map[string]string{
			"output": outputVal,
		})
	}), nil
}

What did you expect to see?

Light memory usage. This tools/flow run is literally copying input -> output.

What did you see instead?

4-5GB of memory usage over the course of this run.

The executable will dump pprof files (named heap*).

There is a fairly different memory usage pattern for -disjunctions (2.5x more).

Dependencies

internal/core/adt: dynamic fields not properly resolve with default values #1871

The text was updated successfully, but these errors were encountered:

jlongtine · 2022-07-18T23:14:02Z

Note: we are experiencing a version of this within a Dagger user's config.

verdverm · 2022-07-18T23:44:21Z

This reminds me of "Abuse of CUE" https://github.com/hofstadter-io/cuetorials.com/tree/main/code/puzzles/advent-of-code (I used lists, this is a struct, but the same idea applies) There is a non-flow reproducer more than likely.

I believe what happens is that each successive entry refers to the prev, so you end up with references (in the ADT) which need to be walked / evaluated. So the last element essentially has a link to every element, and every element effectively links to every element before it. You can imagine this hits the runtime complexity pretty hard.

I'm 99% sure the ideas @mpvl has to fix performance issues should cover this case, in particular the structural or reference sharing (forgetting the name a.t.m.) [edit: I believe the idea is called "structural sharing"]

As a dagger user, I've run into this situation and another (where I cannot control parallelism and end up with too many tasks running at once). One thought I had, which may address both cases, is a "task list" that runs sequentially, without parallelism.

marcosnils · 2022-07-19T15:58:36Z

As a dagger user, I've run into this situation and another (where I cannot control parallelism and end up with too many tasks running at once). One thought I had, which may address both cases, is a "task list" that runs sequentially, without parallelism.

Dagger delegates parallelism to buildkit. It's correct that there's no currently friendly way to configure it through Dagger. Having said that, it can be configured in buidkit through the following configuration. https://github.com/moby/buildkit/blob/8700be396100c7e8bdde1aaec5b590d5338948b0/docs/buildkitd.toml.md?plain=1#L59

verdverm · 2022-07-19T17:24:49Z

There is task parallelism in CUE/flow that cannot be controlled when more than one tasks is ready to run. This is the issue I am referring to, which lives above buildkit. Dagger has some customization around CUE/flow and task discovery, which is where it could be controlled, by introducing a task list and managing the dependency ordering outside of the config

marcosnils · 2022-07-19T17:57:13Z

There is task parallelism in CUE/flow that cannot be controlled when more than one tasks is ready to run. This is the issue I am referring to, which lives above buildkit

Correct. Since all Dagger tasks ultimately translate to buildkit ops, in Dagger, this parallelism is being "limited" by buildkit's

verdverm · 2022-07-19T18:03:29Z

I think there is a misunderstanding, the issue I am describing is in no way related to buildkit. The issue lives in the Go code that implements cue/flow. To work around this, we do things like the original description, i.e. inject artificial dependencies between tasks so they are not released from dagger & cue/flow to buildkit. This however has unintended consequences, the hidden and high growth of references between sequential elements, which result in long runtimes in the CUE evaluator. (structural sharing should alleviate much of this reference / performance issue, but it is still a t.b.d. It however does not address controlling parallelism and needing this workaround in the first place)

The idea I am suggesting is to introduce a "task list" which iterates over the tasks, marking them ready to run to the cue/flow engine. Again, all of this is separate from anything buildkit related. I have the same issue in hof/flow which also builds on cue/flow and does not use buildkit. I have also experienced this issue in Dagger. For example, wanting to build a Go binary for each OS. If I want to build them one-by-one, I have to introduce artificial dependencies. Setting buildkit parallelism is not a general or acceptable solution when I do want other tasks to run in parallel.

Pinning versions is super important. `alpine/helm:3.9.1` in this case means limiting potential `alpine/helm:latest` breakages. Including a link that shows the source & other available tags makes future bumps easier. Another option would be to "dependabot" it. This matches an emerging best practice for actions: dagger#2781 It also avoids using cueflow which maks this CUE high memory issue 2x worse: cue-lang/cue#1795 Signed-off-by: Gerhard Lazu <gerhard@lazu.co.uk>

marcosnils · 2022-07-21T14:32:24Z

I think there is a misunderstanding, the issue I am describing is in no way related to buildkit. The issue lives in the Go code that implements cue/flow.

I don't think there's a misunderstanding. I was talking about dagger specifics and I understand that you're referring to a different scenario when using cue/flow. In any case, let's move on from that topic since it doesn't help this discussion.

One thing that I'd like to reinforce is that this high memory usage exponentially gets worse when using disjunctions in the cue/flow workflow. So it's not only the amount of tasks and their dependencies which is the only issue, but as observed in the attached heap dumps, reducing the task count while increasing the amount of disjunctions in the CUE tree (which dagger uses/abuses), also shows unusable memory limits.

verdverm · 2022-07-21T15:59:43Z

This issue is not specific to cue/flow, it's in the evaluator which cue/flow also uses, though cue/flow may exacerbate it due to re-evaluation every time a task completes.

There are several issues with similar reproducers: https://github.com/cue-lang/cue/issues?q=is%3Aopen+is%3Aissue+label%3Aroadmap%2Fperformance This is probably a duplicate of one of those

verdverm · 2022-07-21T16:18:38Z

A simpler reproducer without cue/flow

exec cue eval repro.cue

-- repro.cue --
import "list"

#Task: {
  // input: _
  input: string | bool | int | { a: string, b: int }
  output: _
}

tasks: [string]: #Task
tasks: {
        for i,j in list.Range(1,1000,1) {
                "task-\(i)": {
                        if i > 0 {
                                input: tasks["task-\(i-1)"].output
                        }
                        input: _
                        output: input
                }
        }
}

marcosnils · 2022-07-21T16:28:46Z

A simpler reproducer without cue/flow

awesome, thx for sharing!

myitcv · 2022-07-25T16:33:39Z

Thanks for the report, @jlongtine. Thanks also for the analysis @verdverm and @marcosnils.

Taking a look now.

* feat: helm.#Upgrade * chore: allow to override the image in the helm install action Signed-off-by: Nico Braun <rainbowstack@gmail.com> * chore: Add make target to create kind cluster This is the simplest thing for now. The next step is to add it to ci.cue. Signed-off-by: Gerhard Lazu <gerhard@lazu.co.uk> * chore: Add make target that installs the correct CUE version This is a follow-up to #2801 Signed-off-by: Gerhard Lazu <gerhard@lazu.co.uk> * Simplify the helm.#Image definition Pinning versions is super important. `alpine/helm:3.9.1` in this case means limiting potential `alpine/helm:latest` breakages. This matches an emerging best practice for actions: #2781 It also avoids using cueflow which makes this CUE high memory issue 2x worse: cue-lang/cue#1795 Next step: restore Kubernetes tests. These have been temporarily disabled in #936. In other words, CI does not test these packages. It took me a while to track this down and understand what the simplest thing to do would be. Signed-off-by: Gerhard Lazu <gerhard@lazu.co.uk>

myitcv · 2022-07-26T12:52:08Z

Drive-by response for this comment whilst I'm investigating.

There is task parallelism in CUE/flow that cannot be controlled when more than one tasks is ready to run

cue cmd does not have any means to limit parallelism today, something that has come up multiple times: I've added a bullet to #1325 so we don't forget that.

With tools/flow, there is nothing to actually stop the implementer controlling parallelism, and delegating the control to the user. I forked that discussion into #1818.

I'll update here with any further insights on the performance issue.

samalba · 2022-08-09T19:01:15Z

I'll update here with any further insights on the performance issue.

@myitcv @mpvl Any update on this one? Another dagger user is impacted by a very high memory usage.

myitcv · 2022-08-10T14:58:29Z

Any update on this one? Another dagger user is impacted by a very high memory usage.

@samalba - @mpvl and I are just working through some remaining work on cycle and related changes, at which point we will be doing a deep dive on this and all open issues that affect Dagger, picking up on my investigation from a couple of weeks ago. I pinged @jlongtine for confirmation from your side on what that list looks like from your perspective.

myitcv · 2022-08-12T07:39:53Z

@jlongtine is the example in this description actual Dagger code, or is this a reproducer? i.e. are you constructing CUE via cue/ast in this way?

marcosnils · 2022-08-12T13:30:42Z

Hi Paul, it's a reproducer. The main reason to use the last here is the convenience to generate the cueflow definitions and their references. sent from mobile Em sex, 12 de ago de 2022 04:40, Paul Jolly ***@***.***> escreveu:

…

@jlongtine <https://github.com/jlongtine> is the example in this description actual Dagger code, or is this a reproducer? i.e. are you constructing CUE via cue/ast in this way? — Reply to this email directly, view it on GitHub <#1795 (comment)>, or unsubscribe <https://github.com/notifications/unsubscribe-auth/AAMBLWTHXZ4BEADD6WSVXZDVYX5VHANCNFSM535JHQYQ> . You are receiving this because you were mentioned.Message ID: ***@***.***>

myitcv · 2022-08-12T14:25:44Z

Thanks @marcosnils. I just want to make sure as we start to dig deep on this that we don't miss a critical piece of the puzzle. Creating reduced versions of the actual problem is often great, but I want to be sure that we don't "miss" the actual scenario along the way. Because that's entirely possible when creating repros.

So to summarise my understanding of the context of the original problem: the problem as you/your users are experiencing it essentially boils down to tools/flow "evaluating" a configuration where there is a long, linear dependency chain of tasks. This causes long evaluation time and memory usage. As @verdverm points out, this issue is very likely rooted in more low-level aspects of the evaluator, but as a top level summary of the problem you are facing today, I believe this is accurate.

That said, as a first step I'm pulling together some analysis which breaks down your example (or at least an example that captures the essence of your example), showing, from the ground up, the explosion in the number of conjunctions/disjunctions that are triggered within the evaluation engine with each step.

marcosnils · 2022-08-12T19:20:45Z

boils down to tools/flow "evaluating" a configuration where there is a long, linear dependency chain of tasks.

I'd also add to this that the use of conjunctions in those deep nested tasks makes the issue exponentially worse.

As @verdverm points out, this issue is very likely rooted in more low-level aspects of the evaluator, but as a top level summary of the problem you are facing today, I believe this is accurate

yes, that's correct.

That said, as a first step I'm pulling together some analysis which breaks down your example (or at least an example that captures the essence of your example), showing, from the ground up, the explosion in the number of conjunctions/disjunctions that are triggered within the evaluation engine with each step.

awesome! really appreciate that. LMK if there's anything we can do to help out 🙏

mpvl · 2022-08-23T08:58:52Z

There are several issues going on here:

a very expensive interface comparison in SpawnRef (results in about 25% of running time).
allocation of closeInfos, about 40% of running time.
an algorithmic "property" that causes such chained lookups to be O(n^2) ((n-2)*(n-1), to be precise).

Fixing the last would, of course, give by far the biggest win. It is also the hardest to fix. In principle, CUE does the right thing: in general, the expression of the referenced field may evaluate differently in the new context, as it may be unified with new structure. However, in the vast majority of cases it would be fine to just unify the resulting value of the referenced field, rather than recomputing the expression.

An experiment with a test implementation of such an optimization shows that it would speedup this case here by about 1000x (the depth of the references), thus confirming the hypotheses. The idea is that one can only not reuse the result if the referenced subtree contains a FieldReference (among some others) directly to the root of the node. Having references to both ancestor or internal references does not change the result.

In practice it is a little harder: the CloseInfo of the conjuncts needs to be rewritten to handle cycles (easy) but also to reflect the closedNess info itself. The current algorithm is not prepared for that and thus will need to be changed. Investigating this also uncovered some other issues with the closedness algorithm, so a revisit is in order either way. Seems doable, but will require some work.

mpvl · 2022-08-23T09:02:52Z

This test is used for evaluation: https://gist.github.com/mpvl/ddd37202677606ab7edf80066e7f74fe

myitcv · 2022-08-23T15:02:11Z

Adding to @mpvl's comment with some detail. Thanks for your patience here.

Firstly, here is a rewritten version of @jlongtine's repro (the original didn't run for me). Joel, perhaps you can confirm this is faithful to your original?

go mod tidy

# 1 task
go run . -tasks 1
go run . -tasks 1 -disjunctions

# 10 tasks
go run . -tasks 10
go run . -tasks 10 -disjunctions

# 100 tasks
go run . -tasks 100
go run . -tasks 100 -disjunctions

# 1000 tasks
go run . -tasks 1000
go run . -tasks 1000 -disjunctions

-- go.mod --
module dagger.io/dagger/cue-memory-use

go 1.18

require cuelang.org/go v0.4.4-0.20220815180243-e187f9fe5d45

-- main.go --
package main

import (
	"context"
	"flag"
	"fmt"
	"log"
	"os"
	"runtime/pprof"
	"text/tabwriter"
	"time"

	"cuelang.org/go/cue"
	"cuelang.org/go/cue/ast"
	"cuelang.org/go/cue/cuecontext"
	"cuelang.org/go/cue/format"
	"cuelang.org/go/cue/token"
	"cuelang.org/go/tools/flow"
)

var (
	fTaskCount    = flag.Int("tasks", 1000, "number of tasks to chain")
	fDisjunctions = flag.Bool("disjunctions", false, "use disjunctions")
	fDump         = flag.Bool("dump", false, "dump generated CUE to stdout")
	fHeap         = flag.Bool("heap", false, "create heap profile")
)

func main() {
	flag.Parse()
	tw := tabwriter.NewWriter(os.Stdout, 0, 1, 1, ' ', tabwriter.AlignRight)

	r := cuecontext.New()

	now := time.Now()
	step := func(prefix string) {
		t := time.Now()
		d := t.Sub(now)
		now = t
		fmt.Fprintf(tw, prefix+":\t\t%.4fms\t\n", float64(d)/float64(time.Millisecond))
	}

	file := genFile(*fDisjunctions)
	step("genFile")

	for i := 0; i < *fTaskCount; i++ {
		file.Decls = append(file.Decls, genTask(i, i-1))
	}
	step("genTasks")

	b, _ := format.Node(&file, format.Simplify())
	if *fDump {
		fmt.Println(string(b))
	}
	step("formatNode")

	val := r.BuildFile(&file)
	if err := val.Err(); err != nil {
		log.Fatal(err)
	}
	step("buildFile")

	controller := flow.New(nil, val, ioTaskFunc)
	if err := controller.Run(context.Background()); err != nil {
		log.Fatal(err)
	}
	step("flow")

	if *fHeap {
		outputFile, _ := os.Create("heap")
		err := pprof.WriteHeapProfile(outputFile)
		if err != nil {
			panic(err)
		}
		step("heapProfile")
	}
	tw.Flush()
}

func genFile(disjunctions bool) ast.File {
	var schema ast.Expr
	if disjunctions {
		schema = ast.NewBinExpr(token.OR,
			ast.NewIdent("string"),
			ast.NewIdent("bool"),
			ast.NewIdent("int"),
			ast.NewStruct(
				ast.NewIdent("a"), ast.NewIdent("string"),
				ast.NewIdent("b"), ast.NewIdent("int"),
			),
		)
	} else {
		schema = ast.NewIdent("string")
	}
	return ast.File{
		Filename: "example.cue",
		Decls: []ast.Decl{
			&ast.Field{
				Label: ast.NewIdent("#MyDef"),
				Value: ast.NewStruct(
					&ast.Field{
						Label: ast.NewIdent("input"),
						Value: schema,
					},
					&ast.Field{
						Label: ast.NewIdent("output"),
						Value: schema,
					},
				),
			},
		},
	}
}

func genTask(name, prev int) *ast.Field {
	var prevTask ast.Expr
	if prev >= 0 {
		prevTask = ast.NewStruct(
			ast.NewIdent("input"),
			ast.NewSel(ast.NewIdent(fmt.Sprintf("task%d", prev)), "output"),
		)
	} else {
		prevTask = ast.NewStruct(ast.NewIdent("input"), ast.NewString("world"))
	}
	return &ast.Field{
		Label: ast.NewIdent(fmt.Sprintf("task%d", name)),
		Value: ast.NewBinExpr(token.AND,
			ast.NewIdent("#MyDef"),
			prevTask,
		),
	}
}

func ioTaskFunc(v cue.Value) (flow.Runner, error) {
	inputPath := cue.ParsePath("input")

	input := v.LookupPath(inputPath)
	if !input.Exists() {
		return nil, nil
	}

	return flow.RunnerFunc(func(t *flow.Task) error {
		inputVal, err := t.Value().LookupPath(inputPath).String()
		if err != nil {
			return fmt.Errorf("input not of type string")
		}

		outputVal := inputVal

		return t.Fill(map[string]string{
			"output": outputVal,
		})
	}), nil
}

In the original example the -disjunctions flag was not declared, and the program was hard coded to always generate disjunctions. I've fixed that along with adding a command line flags for the number of tasks to "chain", generating the heap profile, dumping the generated input CUE.

Note that I am using the current tip version of CUE: v0.4.4-0.20220815180243-e187f9fe5d45.

For reference with the following performance results, I'm running VMWare Fusion on top of an M1 Max. The VM has 10 CPUs available, with 48GB of RAM.

Firstly looking at the output of the above to give rough evaluation time results:

> go mod tidy
# 1 task (0.491s)
> go run . -tasks 1
[stdout]
    genFile:  0.0012ms
   genTasks:  0.0167ms
 formatNode:  0.0209ms
  buildFile:  0.0297ms
       flow:  0.1306ms

> go run . -tasks 1 -disjunctions
[stdout]
    genFile:  0.0020ms
   genTasks:  0.0335ms
 formatNode:  0.0390ms
  buildFile:  0.0958ms
       flow:  0.1372ms

# 10 tasks (0.475s)
> go run . -tasks 10
[stdout]
    genFile:  0.0014ms
   genTasks:  0.0273ms
 formatNode:  0.0474ms
  buildFile:  0.0806ms
       flow:  0.9170ms

> go run . -tasks 10 -disjunctions
[stdout]
    genFile:  0.0017ms
   genTasks:  0.0257ms
 formatNode:  0.0564ms
  buildFile:  0.4274ms
       flow:  3.0863ms

# 100 tasks (0.928s)
> go run . -tasks 100
[stdout]
    genFile:   0.0009ms
   genTasks:   0.0567ms
 formatNode:   0.2871ms
  buildFile:   0.4000ms
       flow:  87.1700ms

> go run . -tasks 100 -disjunctions
[stdout]
    genFile:    0.0020ms
   genTasks:    0.0577ms
 formatNode:    0.2604ms
  buildFile:    3.4494ms
       flow:  352.5022ms

# 1000 tasks (92.793s)
> go run . -tasks 1000
[stdout]
    genFile:      0.0008ms
   genTasks:      0.4276ms
 formatNode:      3.5348ms
  buildFile:      5.5205ms
       flow:  31146.8159ms

> go run . -tasks 1000 -disjunctions
[stdout]
    genFile:      0.0021ms
   genTasks:      0.6324ms
 formatNode:      3.4441ms
  buildFile:     44.6750ms
       flow:  61071.4085ms

I can also roughly reproduce the heap sizes differences you have seen:

1000 tasks, no disjunctions:   654.40MB
1000 tasks, with disjunctions: 2443.68MB

The multiple is ~3.7x, but of the same order. @jlongtine, please let me know if these numbers are off from what you were seeing?

Whilst the memory usage is very high, the evaluation time is even more strange. We look at that first, because it's possible/likely that redundant computation, or a CPU "leak", is the cause of the memory "leak".

Looking at the timings above, what's interesting to note is that whilst the number of tasks between each run increases linearly, the increase in evaluation time is most definitely super linear.

But before diving into the detail to analyse timings, we first look at whether we can strip down the example.

@jlongtine's example uses tools/flow, representative of the Dagger situation as we understand it. As @verdverm mentions above, the filling of a value on task completion necessarily results in a re-run of the API-equivalent of cue eval.

As a first step, we looked to analyse a single evaluation run with varying sizes of "chain". We simplified the "chain" to references between fields, as follows:

f1:    string
f2:    f1
f3:    f2
f4:    f3
f5:    f4
f6:    f5
f7:    f6
f8:    f7
f9:    f8
f10:   f9
....

This creates the same effect of a chain between fields in the original example. Looking at "chain" lengths of 1, 10, 100 and 1000, we see broadly the same CPU and memory increases at each step (more details below).

We took a CPU profile of the 1000 length "chain" to see where time was being spent:

(pprof) top10 -cum
Showing nodes accounting for 2330ms, 88.93% of 2620ms total
Dropped 3 nodes (cum <= 13.10ms)
Showing top 10 nodes out of 20
      flat  flat%   sum%        cum   cum%
         0     0%     0%     2620ms   100%  cuelang.org/go/internal/core/adt.(*nodeContext).addExprConjunct
     290ms 11.07% 11.07%     2620ms   100%  cuelang.org/go/internal/core/adt.(*nodeContext).addVertexConjuncts
         0     0% 11.07%     2620ms   100%  cuelang.org/go/internal/core/adt.(*nodeContext).evalExpr
     220ms  8.40% 19.47%      930ms 35.50%  cuelang.org/go/internal/core/adt.CloseInfo.SpawnRef
     620ms 23.66% 43.13%      620ms 23.66%  runtime.duffcopy
     270ms 10.31% 53.44%      540ms 20.61%  cuelang.org/go/internal/core/adt.(*nodeContext).markCycle
     210ms  8.02% 61.45%      540ms 20.61%  runtime.convI2I
     310ms 11.83% 73.28%      420ms 16.03%  runtime.ifaceeq
     260ms  9.92% 83.21%      340ms 12.98%  runtime.getitab
     150ms  5.73% 88.93%      150ms  5.73%  type..eq.cuelang.org/go/internal/core/adt.arcKey

Adding some crude instrumentation to provide key measures gave the following, where N is the length of the "chain":

N     Unifications  Disjunctions  addExprConjunct  evalTime (ms)
1     2             2             2                0.010874
10    11            11            56               0.061542
100   101           101           5051             3.918735
1000  1001          1001          500501           2612.380445
2000  2001          2001          2001001          21411.943618

This shows the number of unifications, disjunctions and calls to addExprConjunct for various N.

The number of unifications and disjunctions is entirely as expected. The number of calls to addExprConjunct seems extremely high. Looking at the very crude figure of average time per call to addExprConjunct (analysis which makes lots of broad assumptions), evalTime / addExprConjunct, gives "reasonable" figures; as N increases, we see that time increasing, which is probably to be expected if there is memory pressure (this function, or functions it calls, allocate).

Whilst the current implementation is correct, this analysis seems to point to an algorithmic anomaly when it comes to evaluating "chains" of references. This manifests as a CPU "leak" which very likely has a significant impact on memory usage.

We believe we have identified a number of places where the current memory pressure can be alleviated and are working on solutions for each. These are shorter term fixes.

Per @mpvl, we also believe we have identified a different algorithm/approach which reduces the complexity of such chains by a factor of N. This will obviously greatly improve the evaluation time (CPU pressure) and we suspect it will greatly improve the memory pressure, possibly by the same order of magnitude. But the fix is more involved.

Once we have made progress on either/both fronts, we will be in a position to re-evaluate using the updated version of the reproducer in my comment.

Specifically we will:

Re-run the above analysis.
Verify that memory pressure is indeed greatly reduced.
Verify that the tools/flow use case is also improved with the fixes.
Understand then how disjunctions impact performance. As previously shared with the CUE Community, structure sharing is a further performance improvement that will improve situations where there are many references to the same identifier.

As before, we will post updates to this issue.

This loop was showing up in benchmarks, so do a fairly trivial strength reduction: - comparing two identical interface types is faster than comparing different interface types - we can exit the loop early when we find a match A benchmark designed to highlight the issue shows a reasonable performance improvement: name old time/op new time/op delta /chain.txtar-8 2.87s ± 3% 2.01s ± 1% -30.00% (p=0.008 n=5+5) Issue #1795. Signed-off-by: Roger Peppe <rogpeppe@gmail.com> Change-Id: Ib2c723d30bf33576f4050c96c8211e74727d8fdb Reviewed-on: https://review.gerrithub.io/c/cue-lang/cue/+/542604 Reviewed-by: Daniel Martí <mvdan@mvdan.cc> Unity-Result: CUEcueckoo <cueckoo@cuelang.org> Reviewed-by: Marcel van Lohuizen <mpvl@gmail.com> TryBot-Result: CUEcueckoo <cueckoo@cuelang.org>

jlongtine · 2022-08-23T22:50:50Z

Firstly, here is a rewritten version of @jlongtine's repro (the original didn't run for me). Joel, perhaps you can confirm this is faithful to your original?

Quick pass looks spot on. The basic cue structure and the tools/flow implementation didn't change.

The multiple is ~3.7x, but of the same order. @jlongtine, please let me know if these numbers are off from what you were seeing?

That looks about right, yes.

I'll read back through these comments and reply with additional thoughts tomorrow.

Thanks for looking into it!

See comments in code. This coincidentally also tightens some structural cycles, which tends to have a positive effect on performance. Issue #1795 Signed-off-by: Marcel van Lohuizen <mpvl@gmail.com> Change-Id: Ida714373e549595c656b3033c7807d6c74d2fdbe Reviewed-on: https://review.gerrithub.io/c/cue-lang/cue/+/542678 Reviewed-by: Roger Peppe <rogpeppe@gmail.com> Unity-Result: CUEcueckoo <cueckoo@cuelang.org> TryBot-Result: CUEcueckoo <cueckoo@cuelang.org>

This is a useful metric for tracking some non-linear behavior. Issue #1795 Signed-off-by: Marcel van Lohuizen <mpvl@gmail.com> Change-Id: Ic444887de591d68281bdb3b1c333bfc391c49615 Reviewed-on: https://review.gerrithub.io/c/cue-lang/cue/+/542748 Reviewed-by: Paul Jolly <paul@myitcv.io> Unity-Result: CUEcueckoo <cueckoo@cuelang.org> TryBot-Result: CUEcueckoo <cueckoo@cuelang.org>

The functionality is hoisted from adt, but the package should for all practical purposes be seen as a new implementation. Note that some of the methods are now by value. This will make it easier to make things thread-safe in the future. We may also opt to hide the struct fields later on. But for now, as this has been working for a long time, we expect this to be sufficient for a while to come. The stats have been aggregated as experimental functionality to the tools/flow package. Issue #1795 Signed-off-by: Marcel van Lohuizen <mpvl@gmail.com> Change-Id: Ifd3458d0fa03f3b2d91c50385e34e24b5a93bc70 Reviewed-on: https://review.gerrithub.io/c/cue-lang/cue/+/542778 Reviewed-by: Paul Jolly <paul@myitcv.io> Unity-Result: CUEcueckoo <cueckoo@cuelang.org> TryBot-Result: CUEcueckoo <cueckoo@cuelang.org>

These tests expose cases where structure sharing is currently not working, and where we would like it to work. Fixing these can be a significant performance boost, especially for chained references. Issue #2854 Issue #2850 Issue #1795 Signed-off-by: Marcel van Lohuizen <mpvl@gmail.com> Change-Id: I723b2fbd4c0e44431f1cce47023e535b1d81043a Reviewed-on: https://review.gerrithub.io/c/cue-lang/cue/+/1202208 Reviewed-by: Matthew Sackman <matthew@cue.works> Unity-Result: CUE porcuepine <cue.porcuepine@gmail.com> TryBot-Result: CUEcueckoo <cueckoo@cuelang.org>

These tests expose cases where structure sharing is currently not working, and where we would like it to work. Fixing these can be a significant performance boost, especially for chained references. Issue cue-lang#2854 Issue cue-lang#2850 Issue cue-lang#1795 Signed-off-by: Marcel van Lohuizen <mpvl@gmail.com> Change-Id: I723b2fbd4c0e44431f1cce47023e535b1d81043a Reviewed-on: https://review.gerrithub.io/c/cue-lang/cue/+/1202208 Reviewed-by: Matthew Sackman <matthew@cue.works> Unity-Result: CUE porcuepine <cue.porcuepine@gmail.com> TryBot-Result: CUEcueckoo <cueckoo@cuelang.org>

jlongtine added NeedsInvestigation Triage Requires triage/attention labels Jul 18, 2022

myitcv removed the Triage Requires triage/attention label Jul 25, 2022

myitcv added the x/user/dagger.io Experimental CUE-user-based labels label Aug 11, 2022

gerhard mentioned this issue Aug 18, 2022

feat: Add ability to use custom terraform version dagger/dagger#2942

Merged

mpvl mentioned this issue Aug 23, 2022

internal/core/adt: dynamic fields not properly resolve with default values #1871

Closed

mpvl assigned rogpeppe and mpvl Aug 24, 2022

This was referenced Sep 22, 2022

Need to mkdir output on fresh clone cpard/scuel#1

Closed

Can I write the Fibonacci sequence in cue? #1950

Closed

grouville mentioned this issue Oct 11, 2022

🐞 insane memory usage dagger/dagger#3307

Closed

myitcv added the zGarden label Jun 13, 2023

mvdan removed the zGarden label Feb 8, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

tools/flow: high memory usage #1795

tools/flow: high memory usage #1795

jlongtine commented Jul 18, 2022 •

edited by myitcv

Loading

jlongtine commented Jul 18, 2022

verdverm commented Jul 18, 2022 •

edited

Loading

marcosnils commented Jul 19, 2022

verdverm commented Jul 19, 2022 •

edited

Loading

marcosnils commented Jul 19, 2022

verdverm commented Jul 19, 2022 •

edited

Loading

marcosnils commented Jul 21, 2022 •

edited

Loading

verdverm commented Jul 21, 2022 •

edited

Loading

verdverm commented Jul 21, 2022

marcosnils commented Jul 21, 2022

myitcv commented Jul 25, 2022

myitcv commented Jul 26, 2022

samalba commented Aug 9, 2022

myitcv commented Aug 10, 2022

myitcv commented Aug 12, 2022

marcosnils commented Aug 12, 2022 via email

myitcv commented Aug 12, 2022

marcosnils commented Aug 12, 2022 •

edited

Loading

mpvl commented Aug 23, 2022

mpvl commented Aug 23, 2022

myitcv commented Aug 23, 2022

jlongtine commented Aug 23, 2022

tools/flow: high memory usage #1795

tools/flow: high memory usage #1795

Comments

jlongtine commented Jul 18, 2022 • edited by myitcv Loading

What version of CUE are you using (cue version)?

Does this issue reproduce with the latest release?

What did you do?

What did you expect to see?

What did you see instead?

Dependencies

jlongtine commented Jul 18, 2022

verdverm commented Jul 18, 2022 • edited Loading

marcosnils commented Jul 19, 2022

verdverm commented Jul 19, 2022 • edited Loading

marcosnils commented Jul 19, 2022

verdverm commented Jul 19, 2022 • edited Loading

marcosnils commented Jul 21, 2022 • edited Loading

verdverm commented Jul 21, 2022 • edited Loading

verdverm commented Jul 21, 2022

marcosnils commented Jul 21, 2022

myitcv commented Jul 25, 2022

myitcv commented Jul 26, 2022

samalba commented Aug 9, 2022

myitcv commented Aug 10, 2022

myitcv commented Aug 12, 2022

marcosnils commented Aug 12, 2022 via email

myitcv commented Aug 12, 2022

marcosnils commented Aug 12, 2022 • edited Loading

mpvl commented Aug 23, 2022

mpvl commented Aug 23, 2022

myitcv commented Aug 23, 2022

jlongtine commented Aug 23, 2022

jlongtine commented Jul 18, 2022 •

edited by myitcv

Loading

What version of CUE are you using (`cue version`)?

verdverm commented Jul 18, 2022 •

edited

Loading

verdverm commented Jul 19, 2022 •

edited

Loading

verdverm commented Jul 19, 2022 •

edited

Loading

marcosnils commented Jul 21, 2022 •

edited

Loading

verdverm commented Jul 21, 2022 •

edited

Loading

marcosnils commented Aug 12, 2022 •

edited

Loading