Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tools/flow: high memory usage #1795

Open
1 task
jlongtine opened this issue Jul 18, 2022 · 22 comments
Open
1 task

tools/flow: high memory usage #1795

jlongtine opened this issue Jul 18, 2022 · 22 comments
Assignees
Labels
NeedsInvestigation x/user/dagger.io Experimental CUE-user-based labels

Comments

@jlongtine
Copy link
Contributor

jlongtine commented Jul 18, 2022

What version of CUE are you using (cue version)?

v0.4.3

Does this issue reproduce with the latest release?

Yes.

What did you do?

go build -o main
./main
go tool pprof heap
./main -disjunctions=false
go tool pprof heap

-- go.mod --
module dagger.io/dagger/cue-memory-use

go 1.18

require cuelang.org/go v0.4.3

require (
	github.com/cockroachdb/apd/v2 v2.0.1 // indirect
	github.com/google/uuid v1.2.0 // indirect
	github.com/mpvl/unique v0.0.0-20150818121801-cbe035fff7de // indirect
	github.com/pkg/errors v0.8.1 // indirect
	golang.org/x/net v0.0.0-20200226121028-0de0cce0169b // indirect
	golang.org/x/text v0.3.7 // indirect
	gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b // indirect
)
-- go.sum --
cuelang.org/go v0.4.3 h1:W3oBBjDTm7+IZfCKZAmC8uDG0eYfJL4Pp/xbbCMKaVo=
cuelang.org/go v0.4.3/go.mod h1:7805vR9H+VoBNdWFdI7jyDR3QLUPp4+naHfbcgp55HI=
github.com/cockroachdb/apd v1.1.0 h1:3LFP3629v+1aKXU5Q37mxmRxX/pIu1nijXydLShEq5I=
github.com/cockroachdb/apd/v2 v2.0.1 h1:y1Rh3tEU89D+7Tgbw+lp52T6p/GJLpDmNvr10UWqLTE=
github.com/cockroachdb/apd/v2 v2.0.1/go.mod h1:DDxRlzC2lo3/vSlmSoS7JkqbbrARPuFOGr0B9pvN3Gw=
github.com/google/uuid v1.2.0 h1:qJYtXnJRWmpe7m/3XlyhrsLrEURqHRM2kxzoxXqyUDs=
github.com/google/uuid v1.2.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/mpvl/unique v0.0.0-20150818121801-cbe035fff7de h1:D5x39vF5KCwKQaw+OC9ZPiLVHXz3UFw2+psEX+gYcto=
github.com/mpvl/unique v0.0.0-20150818121801-cbe035fff7de/go.mod h1:kJun4WP5gFuHZgRjZUWWuH1DTxCtxbHDOIJsudS8jzY=
github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I=
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b h1:0mm1VjtFUOIlE1SbDlwjYaDxZVDP2S5ou6y0gSgXHu8=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b h1:h8qDotaEPuJATrMmW04NCwg7v22aHH28wwpauUhK9Oo=
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
-- main.go --
package main

import (
	"context"
	"flag"
	"fmt"
	"log"
	"os"
	"runtime/pprof"
	"time"

	"cuelang.org/go/cue"
	"cuelang.org/go/cue/ast"
	"cuelang.org/go/cue/cuecontext"
	"cuelang.org/go/cue/format"
	"cuelang.org/go/cue/token"
	"cuelang.org/go/tools/flow"
)

// genTask builds the AST field for "task<name>": a unification of #MyDef
// with a struct supplying the task's input. When prev is negative the input
// is the literal "world"; otherwise it references task<prev>.output, which
// chains each task to its predecessor.
func genTask(name, prev int) *ast.Field {
	var inputStruct ast.Expr
	if prev < 0 {
		// First task in the chain: no predecessor to reference.
		inputStruct = ast.NewStruct(ast.NewIdent("input"), ast.NewString("world"))
	} else {
		ref := ast.NewSel(ast.NewIdent(fmt.Sprintf("task%d", prev)), "output")
		inputStruct = ast.NewStruct(ast.NewIdent("input"), ref)
	}
	return &ast.Field{
		Label: ast.NewIdent(fmt.Sprintf("task%d", name)),
		Value: ast.NewBinExpr(token.AND, ast.NewIdent("#MyDef"), inputStruct),
	}
}

// genFile returns the base CUE file declaring #MyDef, a definition whose
// input and output fields share one schema expression: a disjunction of
// string | bool | int | {a: string, b: int} when disjunctions is true,
// otherwise plain string.
func genFile(disjunctions bool) ast.File {
	schema := ast.Expr(ast.NewIdent("string"))
	if disjunctions {
		schema = ast.NewBinExpr(token.OR,
			ast.NewIdent("string"),
			ast.NewIdent("bool"),
			ast.NewIdent("int"),
			ast.NewStruct(
				ast.NewIdent("a"), ast.NewIdent("string"),
				ast.NewIdent("b"), ast.NewIdent("int"),
			),
		)
	}
	myDef := &ast.Field{
		Label: ast.NewIdent("#MyDef"),
		Value: ast.NewStruct(
			&ast.Field{Label: ast.NewIdent("input"), Value: schema},
			&ast.Field{Label: ast.NewIdent("output"), Value: schema},
		),
	}
	return ast.File{
		Filename: "example.cue",
		Decls:    []ast.Decl{myDef},
	}
}

// main generates a CUE file containing a chain of 1000 tasks, each
// referencing the previous task's output, runs it through tools/flow,
// and writes a heap profile to the file "heap".
func main() {
	// The repro instructions invoke the binary with -disjunctions=false,
	// but the original program never declared the flag and always generated
	// disjunctions. Declare it (defaulting to true, the old behavior) so
	// both modes are actually reachable.
	disjunctions := flag.Bool("disjunctions", true, "use disjunctions in the #MyDef schema")
	flag.Parse()

	r := cuecontext.New()
	file := genFile(*disjunctions)
	fmt.Println("genTasks: Starting at ", time.Now())
	for i := 0; i < 1000; i++ {
		// Each task depends on the one before it (task i-1).
		file.Decls = append(file.Decls, genTask(i, i-1))
	}
	fmt.Println("genTasks: Ended at ", time.Now())

	// Dump the generated CUE; surface formatting errors instead of
	// silently printing an empty string.
	b, err := format.Node(&file, format.Simplify())
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(string(b))

	val := r.BuildFile(&file)
	if err := val.Err(); err != nil {
		log.Fatal(err)
	}
	fmt.Println("Starting at ", time.Now())
	controller := flow.New(nil, val, ioTaskFunc)
	if err := controller.Run(context.Background()); err != nil {
		log.Fatal(err)
	}

	fmt.Println("Taking heap dump")
	outputFile, err := os.Create("heap")
	if err != nil {
		log.Fatal(err) // was silently ignored
	}
	if err := pprof.WriteHeapProfile(outputFile); err != nil {
		outputFile.Close()
		log.Fatal(err) // was panic; log.Fatal matches the rest of the program
	}
	// Close explicitly (not via defer) so a Close error is still reported
	// and the file is not leaked; log.Fatal would skip deferred calls.
	if err := outputFile.Close(); err != nil {
		log.Fatal(err)
	}
	fmt.Println("Ended at ", time.Now())
}

// ioTaskFunc is the tools/flow task factory: any value that has an "input"
// field becomes a task whose runner copies input to output.
func ioTaskFunc(v cue.Value) (flow.Runner, error) {
	inputPath := cue.ParsePath("input")

	// Values without an input field are not tasks.
	input := v.LookupPath(inputPath)
	if !input.Exists() {
		return nil, nil
	}

	return flow.RunnerFunc(func(t *flow.Task) error {
		inputVal, err := t.Value().LookupPath(inputPath).String()
		if err != nil {
			// Wrap the underlying error instead of discarding it, so the
			// actual cause of a type mismatch is diagnosable.
			return fmt.Errorf("input not of type string: %w", err)
		}

		// The task is a pure copy: output mirrors input.
		return t.Fill(map[string]string{
			"output": inputVal,
		})
	}), nil
}

What did you expect to see?

Light memory usage. This tools/flow run is literally copying input -> output.

What did you see instead?

4-5GB of memory usage over the course of this run.

The executable will dump pprof files (named heap*).

There is a fairly different memory usage pattern for -disjunctions (2.5x more).

Dependencies

@jlongtine jlongtine added NeedsInvestigation Triage Requires triage/attention labels Jul 18, 2022
@jlongtine
Copy link
Contributor Author

Note: we are experiencing a version of this within a Dagger user's config.

@verdverm
Copy link

verdverm commented Jul 18, 2022

This reminds me of "Abuse of CUE" https://github.com/hofstadter-io/cuetorials.com/tree/main/code/puzzles/advent-of-code (I used lists, this is a struct, but the same idea applies) There is a non-flow reproducer more than likely.

I believe what happens is that each successive entry refers to the prev, so you end up with references (in the ADT) which need to be walked / evaluated. So the last element essentially has a link to every element, and every element effectively links to every element before it. You can imagine this hits the runtime complexity pretty hard.

I'm 99% sure the ideas @mpvl has to fix performance issues should cover this case, in particular the structural or reference sharing (forgetting the name a.t.m.) [edit: I believe the idea is called "structural sharing"]


As a dagger user, I've run into this situation and another (where I cannot control parallelism and end up with too many tasks running at once). One thought I had, which may address both cases, is a "task list" that runs sequentially, without parallelism.

@marcosnils
Copy link

As a dagger user, I've run into this situation and another (where I cannot control parallelism and end up with too many tasks running at once). One thought I had, which may address both cases, is a "task list" that runs sequentially, without parallelism.

Dagger delegates parallelism to buildkit. It's correct that there's currently no friendly way to configure it through Dagger. Having said that, it can be configured in buildkit through the following configuration. https://github.com/moby/buildkit/blob/8700be396100c7e8bdde1aaec5b590d5338948b0/docs/buildkitd.toml.md?plain=1#L59

@verdverm
Copy link

verdverm commented Jul 19, 2022

There is task parallelism in CUE/flow that cannot be controlled when more than one tasks is ready to run. This is the issue I am referring to, which lives above buildkit. Dagger has some customization around CUE/flow and task discovery, which is where it could be controlled, by introducing a task list and managing the dependency ordering outside of the config

@marcosnils
Copy link

There is task parallelism in CUE/flow that cannot be controlled when more than one tasks is ready to run. This is the issue I am referring to, which lives above buildkit

Correct. Since all Dagger tasks ultimately translate to buildkit ops, in Dagger, this parallelism is being "limited" by buildkit's

@verdverm
Copy link

verdverm commented Jul 19, 2022

I think there is a misunderstanding, the issue I am describing is in no way related to buildkit. The issue lives in the Go code that implements cue/flow. To work around this, we do things like the original description, i.e. inject artificial dependencies between tasks so they are not released from dagger & cue/flow to buildkit. This however has unintended consequences, the hidden and high growth of references between sequential elements, which result in long runtimes in the CUE evaluator. (structural sharing should alleviate much of this reference / performance issue, but it is still a t.b.d. It however does not address controlling parallelism and needing this workaround in the first place)

The idea I am suggesting is to introduce a "task list" which iterates over the tasks, marking them ready to run to the cue/flow engine. Again, all of this is separate from anything buildkit related. I have the same issue in hof/flow which also builds on cue/flow and does not use buildkit. I have also experienced this issue in Dagger. For example, wanting to build a Go binary for each OS. If I want to build them one-by-one, I have to introduce artificial dependencies. Setting buildkit parallelism is not a general or acceptable solution when I do want other tasks to run in parallel.

gerhard added a commit to bluebrown/dagger that referenced this issue Jul 20, 2022
Pinning versions is super important. `alpine/helm:3.9.1` in this case
means limiting potential `alpine/helm:latest` breakages.

Including a link that shows the source & other available tags makes
future bumps easier. Another option would be to "dependabot" it.

This matches an emerging best practice for actions:
dagger#2781

It also avoids using cueflow which makes this CUE high memory issue 2x
worse: cue-lang/cue#1795

Signed-off-by: Gerhard Lazu <gerhard@lazu.co.uk>
@marcosnils
Copy link

marcosnils commented Jul 21, 2022

I think there is a misunderstanding, the issue I am describing is in no way related to buildkit. The issue lives in the Go code that implements cue/flow.

I don't think there's a misunderstanding. I was talking about dagger specifics and I understand that you're referring to a different scenario when using cue/flow. In any case, let's move on from that topic since it doesn't help this discussion.

One thing that I'd like to reinforce is that this high memory usage gets exponentially worse when using disjunctions in the cue/flow workflow. So the amount of tasks and their dependencies is not the only issue: as observed in the attached heap dumps, reducing the task count while increasing the amount of disjunctions in the CUE tree (which dagger uses/abuses) also shows unusable memory limits.

@verdverm
Copy link

verdverm commented Jul 21, 2022

This issue is not specific to cue/flow, it's in the evaluator which cue/flow also uses, though cue/flow may exacerbate it due to re-evaluation every time a task completes.

There are several issues with similar reproducers: https://github.com/cue-lang/cue/issues?q=is%3Aopen+is%3Aissue+label%3Aroadmap%2Fperformance This is probably a duplicate of one of those

@verdverm
Copy link

A simpler reproducer without cue/flow

exec cue eval repro.cue

-- repro.cue --
import "list"

// #Task mirrors the schema from the original report: an input that is a
// disjunction over several types, and an unconstrained output.
#Task: {
  // input: _
  input: string | bool | int | { a: string, b: int }
  output: _
}

// Build a chain of tasks where each task's input references the previous
// task's output, reproducing the chained-reference blowup without flow.
tasks: [string]: #Task
tasks: {
        for i,j in list.Range(1,1000,1) {
                "task-\(i)": {
                        // Every task after the first takes its input from
                        // its predecessor, creating the reference chain.
                        if i > 0 {
                                input: tasks["task-\(i-1)"].output
                        }
                        input: _
                        // Each task simply copies input to output.
                        output: input
                }
        }
}

@marcosnils
Copy link

A simpler reproducer without cue/flow

awesome, thx for sharing!

@myitcv
Copy link
Member

myitcv commented Jul 25, 2022

Thanks for the report, @jlongtine. Thanks also for the analysis @verdverm and @marcosnils.

Taking a look now.

@myitcv myitcv removed the Triage Requires triage/attention label Jul 25, 2022
gerhard pushed a commit to dagger/dagger that referenced this issue Jul 25, 2022
* feat: helm.#Upgrade
* chore: allow to override the image in the helm install action
Signed-off-by: Nico Braun <rainbowstack@gmail.com>


* chore: Add make target to create kind cluster
This is the simplest thing for now. The next step is to add it to ci.cue.
Signed-off-by: Gerhard Lazu <gerhard@lazu.co.uk>


* chore: Add make target that installs the correct CUE version
This is a follow-up to #2801
Signed-off-by: Gerhard Lazu <gerhard@lazu.co.uk>


* Simplify the helm.#Image definition
Pinning versions is super important. `alpine/helm:3.9.1` in this case means limiting potential `alpine/helm:latest` breakages. This matches an emerging best practice for actions: #2781

It also avoids using cueflow which makes this CUE high memory issue 2x worse: cue-lang/cue#1795

Next step: restore Kubernetes tests. These have been temporarily disabled in #936. In other words, CI does not test these packages. It took me a while to track this down and understand what the simplest thing to do would be.

Signed-off-by: Gerhard Lazu <gerhard@lazu.co.uk>
@myitcv
Copy link
Member

myitcv commented Jul 26, 2022

Drive-by response for this comment whilst I'm investigating.

There is task parallelism in CUE/flow that cannot be controlled when more than one tasks is ready to run

cue cmd does not have any means to limit parallelism today, something that has come up multiple times: I've added a bullet to #1325 so we don't forget that.

With tools/flow, there is nothing to actually stop the implementer controlling parallelism, and delegating the control to the user. I forked that discussion into #1818.

I'll update here with any further insights on the performance issue.

@samalba
Copy link

samalba commented Aug 9, 2022

I'll update here with any further insights on the performance issue.

@myitcv @mpvl Any update on this one? Another dagger user is impacted by a very high memory usage.

@myitcv
Copy link
Member

myitcv commented Aug 10, 2022

Any update on this one? Another dagger user is impacted by a very high memory usage.

@samalba - @mpvl and I are just working through some remaining work on cycle and related changes, at which point we will be doing a deep dive on this and all open issues that affect Dagger, picking up on my investigation from a couple of weeks ago. I pinged @jlongtine for confirmation from your side on what that list looks like from your perspective.

@myitcv myitcv added the x/user/dagger.io Experimental CUE-user-based labels label Aug 11, 2022
@myitcv
Copy link
Member

myitcv commented Aug 12, 2022

@jlongtine is the example in this description actual Dagger code, or is this a reproducer? i.e. are you constructing CUE via cue/ast in this way?

@marcosnils
Copy link

marcosnils commented Aug 12, 2022 via email

@myitcv
Copy link
Member

myitcv commented Aug 12, 2022

Thanks @marcosnils. I just want to make sure as we start to dig deep on this that we don't miss a critical piece of the puzzle. Creating reduced versions of the actual problem is often great, but I want to be sure that we don't "miss" the actual scenario along the way. Because that's entirely possible when creating repros.

So to summarise my understanding of the context of the original problem: the problem as you/your users are experiencing it essentially boils down to tools/flow "evaluating" a configuration where there is a long, linear dependency chain of tasks. This causes long evaluation time and memory usage. As @verdverm points out, this issue is very likely rooted in more low-level aspects of the evaluator, but as a top level summary of the problem you are facing today, I believe this is accurate.

That said, as a first step I'm pulling together some analysis which breaks down your example (or at least an example that captures the essence of your example), showing, from the ground up, the explosion in the number of conjunctions/disjunctions that are triggered within the evaluation engine with each step.

@marcosnils
Copy link

marcosnils commented Aug 12, 2022

boils down to tools/flow "evaluating" a configuration where there is a long, linear dependency chain of tasks.

I'd also add to this that the use of conjunctions in those deep nested tasks makes the issue exponentially worse.

As @verdverm points out, this issue is very likely rooted in more low-level aspects of the evaluator, but as a top level summary of the problem you are facing today, I believe this is accurate

yes, that's correct.

That said, as a first step I'm pulling together some analysis which breaks down your example (or at least an example that captures the essence of your example), showing, from the ground up, the explosion in the number of conjunctions/disjunctions that are triggered within the evaluation engine with each step.

awesome! really appreciate that. LMK if there's anything we can do to help out 🙏

@mpvl
Copy link
Member

mpvl commented Aug 23, 2022

There are several issues going on here:

  • a very expensive interface comparison in SpawnRef (results in about 25% of running time).
  • allocation of closeInfos, about 40% of running time.
  • an algorithmic "property" that causes such chained lookups to be O(n^2) ((n-2)*(n-1), to be precise).

Fixing the last would, of course, give by far the biggest win. It is also the hardest to fix. In principle, CUE does the right thing: in general, the expression of the referenced field may evaluate differently in the new context, as it may be unified with new structure. However, in the vast majority of cases it would be fine to just unify the resulting value of the referenced field, rather than recomputing the expression.

An experiment with a test implementation of such an optimization shows that it would speed up this case here by about 1000x (the depth of the references), thus confirming the hypothesis. The idea is that one can only not reuse the result if the referenced subtree contains a FieldReference (among some others) directly to the root of the node. Having references to both ancestor or internal references does not change the result.

In practice it is a little harder: the CloseInfo of the conjuncts needs to be rewritten to handle cycles (easy) but also to reflect the closedNess info itself. The current algorithm is not prepared for that and thus will need to be changed. Investigating this also uncovered some other issues with the closedness algorithm, so a revisit is in order either way. Seems doable, but will require some work.

@mpvl
Copy link
Member

mpvl commented Aug 23, 2022

This test is used for evaluation: https://gist.github.com/mpvl/ddd37202677606ab7edf80066e7f74fe

@myitcv
Copy link
Member

myitcv commented Aug 23, 2022

Adding to @mpvl's comment with some detail. Thanks for your patience here.

Firstly, here is a rewritten version of @jlongtine's repro (the original didn't run for me). Joel, perhaps you can confirm this is faithful to your original?

go mod tidy

# 1 task
go run . -tasks 1
go run . -tasks 1 -disjunctions

# 10 tasks
go run . -tasks 10
go run . -tasks 10 -disjunctions

# 100 tasks
go run . -tasks 100
go run . -tasks 100 -disjunctions

# 1000 tasks
go run . -tasks 1000
go run . -tasks 1000 -disjunctions

-- go.mod --
module dagger.io/dagger/cue-memory-use

go 1.18

require cuelang.org/go v0.4.4-0.20220815180243-e187f9fe5d45

-- main.go --
package main

import (
	"context"
	"flag"
	"fmt"
	"log"
	"os"
	"runtime/pprof"
	"text/tabwriter"
	"time"

	"cuelang.org/go/cue"
	"cuelang.org/go/cue/ast"
	"cuelang.org/go/cue/cuecontext"
	"cuelang.org/go/cue/format"
	"cuelang.org/go/cue/token"
	"cuelang.org/go/tools/flow"
)

var (
	fTaskCount    = flag.Int("tasks", 1000, "number of tasks to chain")
	fDisjunctions = flag.Bool("disjunctions", false, "use disjunctions")
	fDump         = flag.Bool("dump", false, "dump generated CUE to stdout")
	fHeap         = flag.Bool("heap", false, "create heap profile")
)

// main builds a chain of -tasks tasks (optionally with disjunctions in the
// schema), evaluates it via tools/flow, and prints per-phase timings in an
// aligned table. With -heap it also writes a heap profile to "heap".
func main() {
	flag.Parse()
	tw := tabwriter.NewWriter(os.Stdout, 0, 1, 1, ' ', tabwriter.AlignRight)

	r := cuecontext.New()

	// step records the elapsed time since the previous step under the
	// given label, resetting the clock for the next phase.
	now := time.Now()
	step := func(prefix string) {
		t := time.Now()
		d := t.Sub(now)
		now = t
		fmt.Fprintf(tw, prefix+":\t\t%.4fms\t\n", float64(d)/float64(time.Millisecond))
	}

	file := genFile(*fDisjunctions)
	step("genFile")

	for i := 0; i < *fTaskCount; i++ {
		// Each task depends on the one before it (task i-1).
		file.Decls = append(file.Decls, genTask(i, i-1))
	}
	step("genTasks")

	// Formatting is always timed (the "formatNode" step), even when the
	// output is not dumped; surface errors instead of ignoring them.
	b, err := format.Node(&file, format.Simplify())
	if err != nil {
		log.Fatal(err)
	}
	if *fDump {
		fmt.Println(string(b))
	}
	step("formatNode")

	val := r.BuildFile(&file)
	if err := val.Err(); err != nil {
		log.Fatal(err)
	}
	step("buildFile")

	controller := flow.New(nil, val, ioTaskFunc)
	if err := controller.Run(context.Background()); err != nil {
		log.Fatal(err)
	}
	step("flow")

	if *fHeap {
		outputFile, err := os.Create("heap")
		if err != nil {
			log.Fatal(err) // was silently ignored
		}
		if err := pprof.WriteHeapProfile(outputFile); err != nil {
			outputFile.Close()
			log.Fatal(err) // was panic; log.Fatal matches the rest of the program
		}
		// Close explicitly so the file descriptor is not leaked and a
		// Close error is still reported.
		if err := outputFile.Close(); err != nil {
			log.Fatal(err)
		}
		step("heapProfile")
	}
	tw.Flush()
}

// genFile constructs the base CUE file declaring #MyDef, a definition with
// input and output fields. Both fields carry the same schema expression:
// a disjunction over string/bool/int/struct when disjunctions is set,
// otherwise just string.
func genFile(disjunctions bool) ast.File {
	var schema ast.Expr = ast.NewIdent("string")
	if disjunctions {
		schema = ast.NewBinExpr(token.OR,
			ast.NewIdent("string"),
			ast.NewIdent("bool"),
			ast.NewIdent("int"),
			ast.NewStruct(
				ast.NewIdent("a"), ast.NewIdent("string"),
				ast.NewIdent("b"), ast.NewIdent("int"),
			),
		)
	}
	def := &ast.Field{
		Label: ast.NewIdent("#MyDef"),
		Value: ast.NewStruct(
			&ast.Field{Label: ast.NewIdent("input"), Value: schema},
			&ast.Field{Label: ast.NewIdent("output"), Value: schema},
		),
	}
	return ast.File{
		Filename: "example.cue",
		Decls:    []ast.Decl{def},
	}
}

// genTask returns the AST field declaring "task<name>" as a unification of
// #MyDef with a struct that supplies input. A negative prev yields the
// literal input "world"; otherwise input refers to task<prev>.output,
// linking each task to its predecessor.
func genTask(name, prev int) *ast.Field {
	taskIdent := func(n int) *ast.Ident { return ast.NewIdent(fmt.Sprintf("task%d", n)) }

	var withInput ast.Expr
	if prev < 0 {
		// Head of the chain: no predecessor to reference.
		withInput = ast.NewStruct(ast.NewIdent("input"), ast.NewString("world"))
	} else {
		withInput = ast.NewStruct(
			ast.NewIdent("input"),
			ast.NewSel(taskIdent(prev), "output"),
		)
	}
	return &ast.Field{
		Label: taskIdent(name),
		Value: ast.NewBinExpr(token.AND, ast.NewIdent("#MyDef"), withInput),
	}
}

// ioTaskFunc is the tools/flow task factory: a value is a task exactly when
// it has an "input" field, and the task's runner copies input to output.
func ioTaskFunc(v cue.Value) (flow.Runner, error) {
	inputPath := cue.ParsePath("input")

	// Values without an input field are not tasks.
	input := v.LookupPath(inputPath)
	if !input.Exists() {
		return nil, nil
	}

	return flow.RunnerFunc(func(t *flow.Task) error {
		inputVal, err := t.Value().LookupPath(inputPath).String()
		if err != nil {
			// Wrap the underlying error rather than discarding it, so the
			// actual cause of a type mismatch is diagnosable.
			return fmt.Errorf("input not of type string: %w", err)
		}

		// The task is a pure copy: output mirrors input.
		return t.Fill(map[string]string{
			"output": inputVal,
		})
	}), nil
}

In the original example the -disjunctions flag was not declared, and the program was hard coded to always generate disjunctions. I've fixed that along with adding a command line flags for the number of tasks to "chain", generating the heap profile, dumping the generated input CUE.

Note that I am using the current tip version of CUE: v0.4.4-0.20220815180243-e187f9fe5d45.

For reference with the following performance results, I'm running VMWare Fusion on top of an M1 Max. The VM has 10 CPUs available, with 48GB of RAM.

Firstly looking at the output of the above to give rough evaluation time results:

> go mod tidy
# 1 task (0.491s)
> go run . -tasks 1
[stdout]
    genFile:  0.0012ms
   genTasks:  0.0167ms
 formatNode:  0.0209ms
  buildFile:  0.0297ms
       flow:  0.1306ms

> go run . -tasks 1 -disjunctions
[stdout]
    genFile:  0.0020ms
   genTasks:  0.0335ms
 formatNode:  0.0390ms
  buildFile:  0.0958ms
       flow:  0.1372ms

# 10 tasks (0.475s)
> go run . -tasks 10
[stdout]
    genFile:  0.0014ms
   genTasks:  0.0273ms
 formatNode:  0.0474ms
  buildFile:  0.0806ms
       flow:  0.9170ms

> go run . -tasks 10 -disjunctions
[stdout]
    genFile:  0.0017ms
   genTasks:  0.0257ms
 formatNode:  0.0564ms
  buildFile:  0.4274ms
       flow:  3.0863ms

# 100 tasks (0.928s)
> go run . -tasks 100
[stdout]
    genFile:   0.0009ms
   genTasks:   0.0567ms
 formatNode:   0.2871ms
  buildFile:   0.4000ms
       flow:  87.1700ms

> go run . -tasks 100 -disjunctions
[stdout]
    genFile:    0.0020ms
   genTasks:    0.0577ms
 formatNode:    0.2604ms
  buildFile:    3.4494ms
       flow:  352.5022ms

# 1000 tasks (92.793s)
> go run . -tasks 1000
[stdout]
    genFile:      0.0008ms
   genTasks:      0.4276ms
 formatNode:      3.5348ms
  buildFile:      5.5205ms
       flow:  31146.8159ms

> go run . -tasks 1000 -disjunctions
[stdout]
    genFile:      0.0021ms
   genTasks:      0.6324ms
 formatNode:      3.4441ms
  buildFile:     44.6750ms
       flow:  61071.4085ms

I can also roughly reproduce the heap sizes differences you have seen:

1000 tasks, no disjunctions:   654.40MB
1000 tasks, with disjunctions: 2443.68MB

The multiple is ~3.7x, but of the same order. @jlongtine, please let me know if these numbers are off from what you were seeing?

Whilst the memory usage is very high, the evaluation time is even more strange. We look at that first, because it's possible/likely that redundant computation, or a CPU "leak", is the cause of the memory "leak".

Looking at the timings above, what's interesting to note is that whilst the number of tasks between each run increases linearly, the increase in evaluation time is most definitely super linear.

But before diving into the detail to analyse timings, we first look at whether we can strip down the example.

@jlongtine's example uses tools/flow, representative of the Dagger situation as we understand it. As @verdverm mentions above, the filling of a value on task completion necessarily results in a re-run of the API-equivalent of cue eval.

As a first step, we looked to analyse a single evaluation run with varying sizes of "chain". We simplified the "chain" to references between fields, as follows:

f1:    string
f2:    f1
f3:    f2
f4:    f3
f5:    f4
f6:    f5
f7:    f6
f8:    f7
f9:    f8
f10:   f9
....

This creates the same effect of a chain between fields in the original example. Looking at "chain" lengths of 1, 10, 100 and 1000, we see broadly the same CPU and memory increases at each step (more details below).

We took a CPU profile of the 1000 length "chain" to see where time was being spent:

(pprof) top10 -cum
Showing nodes accounting for 2330ms, 88.93% of 2620ms total
Dropped 3 nodes (cum <= 13.10ms)
Showing top 10 nodes out of 20
      flat  flat%   sum%        cum   cum%
         0     0%     0%     2620ms   100%  cuelang.org/go/internal/core/adt.(*nodeContext).addExprConjunct
     290ms 11.07% 11.07%     2620ms   100%  cuelang.org/go/internal/core/adt.(*nodeContext).addVertexConjuncts
         0     0% 11.07%     2620ms   100%  cuelang.org/go/internal/core/adt.(*nodeContext).evalExpr
     220ms  8.40% 19.47%      930ms 35.50%  cuelang.org/go/internal/core/adt.CloseInfo.SpawnRef
     620ms 23.66% 43.13%      620ms 23.66%  runtime.duffcopy
     270ms 10.31% 53.44%      540ms 20.61%  cuelang.org/go/internal/core/adt.(*nodeContext).markCycle
     210ms  8.02% 61.45%      540ms 20.61%  runtime.convI2I
     310ms 11.83% 73.28%      420ms 16.03%  runtime.ifaceeq
     260ms  9.92% 83.21%      340ms 12.98%  runtime.getitab
     150ms  5.73% 88.93%      150ms  5.73%  type..eq.cuelang.org/go/internal/core/adt.arcKey

Adding some crude instrumentation to provide key measures gave the following, where N is the length of the "chain":

N     Unifications  Disjunctions  addExprConjunct  evalTime (ms)
1     2             2             2                0.010874
10    11            11            56               0.061542
100   101           101           5051             3.918735
1000  1001          1001          500501           2612.380445
2000  2001          2001          2001001          21411.943618

This shows the number of unifications, disjunctions and calls to addExprConjunct for various N.

The number of unifications and disjunctions is entirely as expected. The number of calls to addExprConjunct seems extremely high. Looking at the very crude figure of average time per call to addExprConjunct (analysis which makes lots of broad assumptions), evalTime / addExprConjunct, gives "reasonable" figures; as N increases, we see that time increasing, which is probably to be expected if there is memory pressure (this function, or functions it calls, allocate).

Whilst the current implementation is correct, this analysis seems to point to an algorithmic anomaly when it comes to evaluating "chains" of references. This manifests as a CPU "leak" which very likely has a significant impact on memory usage.

We believe we have identified a number of places where the current memory pressure can be alleviated and are working on solutions for each. These are shorter term fixes.

Per @mpvl, we also believe we have identified a different algorithm/approach which reduces the complexity of such chains by a factor of N. This will obviously greatly improve the evaluation time (CPU pressure) and we suspect it will greatly improve the memory pressure, possibly by the same order of magnitude. But the fix is more involved.

Once we have made progress on either/both fronts, we will be in a position to re-evaluate using the updated version of the reproducer in my comment.

Specifically we will:

  • Re-run the above analysis.
  • Verify that memory pressure is indeed greatly reduced.
  • Verify that the tools/flow use case is also improved with the fixes.
  • Understand then how disjunctions impact performance. As previously shared with the CUE Community, structure sharing is a further performance improvement that will improve situations where there are many references to the same identifier.

As before, we will post updates to this issue.

cueckoo pushed a commit that referenced this issue Aug 23, 2022
This loop was showing up in benchmarks, so do a fairly trivial strength
reduction:
- comparing two identical interface types is faster than comparing
different interface types
- we can exit the loop early when we find a match

A benchmark designed to highlight the issue shows a reasonable performance
improvement:

	name            old time/op  new time/op  delta
	/chain.txtar-8   2.87s ± 3%   2.01s ± 1%  -30.00%  (p=0.008 n=5+5)

Issue #1795.

Signed-off-by: Roger Peppe <rogpeppe@gmail.com>
Change-Id: Ib2c723d30bf33576f4050c96c8211e74727d8fdb
Reviewed-on: https://review.gerrithub.io/c/cue-lang/cue/+/542604
Reviewed-by: Daniel Martí <mvdan@mvdan.cc>
Unity-Result: CUEcueckoo <cueckoo@cuelang.org>
Reviewed-by: Marcel van Lohuizen <mpvl@gmail.com>
TryBot-Result: CUEcueckoo <cueckoo@cuelang.org>
@jlongtine
Copy link
Contributor Author

Firstly, here is a rewritten version of @jlongtine's repro (the original didn't run for me). Joel, perhaps you can confirm this is faithful to your original?

Quick pass looks spot on. The basic cue structure and the tools/flow implementation didn't change.

The multiple is ~3.7x, but of the same order. @jlongtine, please let me know if these numbers are off from what you were seeing?

That looks about right, yes.

I'll read back through these comments and reply with additional thoughts tomorrow.

Thanks for looking into it!

cueckoo pushed a commit that referenced this issue Aug 25, 2022
See comments in code. This coincidentally also tightens
some structural cycles, which tends to have a positive effect
on performance.

Issue #1795

Signed-off-by: Marcel van Lohuizen <mpvl@gmail.com>
Change-Id: Ida714373e549595c656b3033c7807d6c74d2fdbe
Reviewed-on: https://review.gerrithub.io/c/cue-lang/cue/+/542678
Reviewed-by: Roger Peppe <rogpeppe@gmail.com>
Unity-Result: CUEcueckoo <cueckoo@cuelang.org>
TryBot-Result: CUEcueckoo <cueckoo@cuelang.org>
cueckoo pushed a commit that referenced this issue Aug 26, 2022
This is a useful metric for tracking some non-linear
behavior.

Issue #1795

Signed-off-by: Marcel van Lohuizen <mpvl@gmail.com>
Change-Id: Ic444887de591d68281bdb3b1c333bfc391c49615
Reviewed-on: https://review.gerrithub.io/c/cue-lang/cue/+/542748
Reviewed-by: Paul Jolly <paul@myitcv.io>
Unity-Result: CUEcueckoo <cueckoo@cuelang.org>
TryBot-Result: CUEcueckoo <cueckoo@cuelang.org>
cueckoo pushed a commit that referenced this issue Aug 31, 2022
The functionality is hoisted from adt, but the package should for all
practical purposes be seen as a new implementation.

Note that some of the methods are now by value. This will make it
easier to make things thread-safe in the future. We may also
opt to hide the struct fields later on. But for now, as this has been
working for a long time, we expect this to be sufficient for a while
to come.

The stats have been aggregated as experimental functionality to
the tools/flow package.

Issue #1795

Signed-off-by: Marcel van Lohuizen <mpvl@gmail.com>
Change-Id: Ifd3458d0fa03f3b2d91c50385e34e24b5a93bc70
Reviewed-on: https://review.gerrithub.io/c/cue-lang/cue/+/542778
Reviewed-by: Paul Jolly <paul@myitcv.io>
Unity-Result: CUEcueckoo <cueckoo@cuelang.org>
TryBot-Result: CUEcueckoo <cueckoo@cuelang.org>
@myitcv myitcv added the zGarden label Jun 13, 2023
@mvdan mvdan removed the zGarden label Feb 8, 2024
cueckoo pushed a commit that referenced this issue Oct 8, 2024
These tests expose cases where structure sharing
is currently not working, and where we
would like it to work.

Fixing these can be a significant performance
boost, especially for chained references.

Issue #2854
Issue #2850
Issue #1795

Signed-off-by: Marcel van Lohuizen <mpvl@gmail.com>
Change-Id: I723b2fbd4c0e44431f1cce47023e535b1d81043a
Reviewed-on: https://review.gerrithub.io/c/cue-lang/cue/+/1202208
Reviewed-by: Matthew Sackman <matthew@cue.works>
Unity-Result: CUE porcuepine <cue.porcuepine@gmail.com>
TryBot-Result: CUEcueckoo <cueckoo@cuelang.org>
vanhtuan0409 pushed a commit to anduintransaction/cue that referenced this issue Oct 15, 2024
These tests expose cases where structure sharing
is currently not working, and where we
would like it to work.

Fixing these can be a significant performance
boost, especially for chained references.

Issue cue-lang#2854
Issue cue-lang#2850
Issue cue-lang#1795

Signed-off-by: Marcel van Lohuizen <mpvl@gmail.com>
Change-Id: I723b2fbd4c0e44431f1cce47023e535b1d81043a
Reviewed-on: https://review.gerrithub.io/c/cue-lang/cue/+/1202208
Reviewed-by: Matthew Sackman <matthew@cue.works>
Unity-Result: CUE porcuepine <cue.porcuepine@gmail.com>
TryBot-Result: CUEcueckoo <cueckoo@cuelang.org>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
NeedsInvestigation x/user/dagger.io Experimental CUE-user-based labels
Projects
None yet
Development

No branches or pull requests

8 participants