From c6306c497ed75db7e06a9c41fbe29584d8f6ba19 Mon Sep 17 00:00:00 2001 From: "mergify[bot]" <37929162+mergify[bot]@users.noreply.github.com> Date: Wed, 4 Dec 2024 06:35:31 +1030 Subject: [PATCH] x-pack/filebeat/input/cel: add evaluation state dump debugging feature (#41335) (#41859) This allows a program to conditionally dump a complete set of evaluation states from a program in the case that it fails, allowing a trace of execution leading to the failure. (cherry picked from commit 0631d8b1ae10e0d25a0ebbc70a8118f858b08631) Co-authored-by: Dan Kortschak --- CHANGELOG.next.asciidoc | 1 + .../filebeat/docs/inputs/input-cel.asciidoc | 20 +++ x-pack/filebeat/input/cel/config.go | 19 ++- x-pack/filebeat/input/cel/input.go | 84 +++++++++++- x-pack/filebeat/input/cel/input_test.go | 123 ++++++++++++++++-- 5 files changed, 226 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index 461e1c26db4..df0b4b300fb 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -330,6 +330,7 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] - Add ability to remove request trace logs from entityanalytics input. {pull}40004[40004] - Refactor & cleanup with updates to default values and documentation. {pull}41834[41834] - Update CEL mito extensions to v1.16.0. {pull}41727[41727] +- Add evaluation state dump debugging option to CEL input. {pull}41335[41335] *Auditbeat* diff --git a/x-pack/filebeat/docs/inputs/input-cel.asciidoc b/x-pack/filebeat/docs/inputs/input-cel.asciidoc index a96e8df5f3d..8e062025b24 100644 --- a/x-pack/filebeat/docs/inputs/input-cel.asciidoc +++ b/x-pack/filebeat/docs/inputs/input-cel.asciidoc @@ -795,6 +795,26 @@ This specifies fields in the `state` to be redacted prior to debug logging. Fiel This specifies whether fields should be replaced with a `*` or deleted entirely from messages sent to debug logs. If delete is `true`, fields will be deleted rather than replaced. +[float] +==== `failure_dump.enabled` + +It is possible to log CEL program evaluation failures to a local file-system for debugging configurations. +This option is enabled by setting `failure_dump.enabled` to true and setting the `failure_dump.filename` value. +To delete existing failure dumps, set `failure_dump.enabled` to false without unsetting the filename option. + +Enabling this option compromises security and should only be used for debugging. + +[float] +==== `failure_dump.filename` + +This specifies a directory path to write failure dumps to. If it is not empty and a CEL program evaluation fails, +the complete set of states for the CEL program's evaluation will be written as a JSON file, along with the error +that was reported. This option should only be used when debugging a failure as it imposes a significant performance +impact on the input and may potentially use large quantities of memory to hold the full set of states. If a failure +dump is configured, it is recommended that data input sizes be reduced to avoid excessive memory consumption, and +making dumps that are intractable to analysis. To delete existing failure dumps, set `failure_dump.enabled` to +false without unsetting the filename option. + [float] === Metrics diff --git a/x-pack/filebeat/input/cel/config.go b/x-pack/filebeat/input/cel/config.go index b04b7845719..aee095b199b 100644 --- a/x-pack/filebeat/input/cel/config.go +++ b/x-pack/filebeat/input/cel/config.go @@ -58,6 +58,9 @@ type config struct { // Resource is the configuration for establishing an // HTTP request or for locating a local resource. Resource *ResourceConfig `config:"resource" validate:"required"` + + // FailureDump configures failure dump behaviour. + FailureDump *dumpConfig `config:"failure_dump"` } type redact struct { @@ -69,6 +72,19 @@ type redact struct { Delete bool `config:"delete"` } +// dumpConfig configures the CEL program to retain +// the full evaluation state using the cel.OptTrackState +// option. The state is written to a file in the path if +// the evaluation fails. +type dumpConfig struct { + Enabled *bool `config:"enabled"` + Filename string `config:"filename"` +} + +func (t *dumpConfig) enabled() bool { + return t != nil && (t.Enabled == nil || *t.Enabled) +} + func (c config) Validate() error { if c.Redact == nil { logp.L().Named("input.cel").Warn("missing recommended 'redact' configuration: " + @@ -89,7 +105,8 @@ func (c config) Validate() error { if len(c.Regexps) != 0 { patterns = map[string]*regexp.Regexp{".": nil} } - _, _, err = newProgram(context.Background(), c.Program, root, nil, &http.Client{}, nil, nil, patterns, c.XSDs, logp.L().Named("input.cel"), nil) + wantDump := c.FailureDump.enabled() && c.FailureDump.Filename != "" + _, _, err = newProgram(context.Background(), c.Program, root, nil, &http.Client{}, nil, nil, patterns, c.XSDs, logp.L().Named("input.cel"), nil, wantDump) if err != nil { return fmt.Errorf("failed to check program: %w", err) } diff --git a/x-pack/filebeat/input/cel/input.go b/x-pack/filebeat/input/cel/input.go index ff4f1dccf51..97ab8c9bee0 100644 --- a/x-pack/filebeat/input/cel/input.go +++ b/x-pack/filebeat/input/cel/input.go @@ -10,6 +10,7 @@ package cel import ( "compress/gzip" "context" + "encoding/json" "errors" "fmt" "io" @@ -166,7 +167,8 @@ func (i input) run(env v2.Context, src *source, cursor map[string]interface{}, p Password: cfg.Auth.Basic.Password, } } - prg, ast, err := newProgram(ctx, cfg.Program, root, getEnv(cfg.AllowedEnvironment), client, limiter, auth, patterns, cfg.XSDs, log, trace) + wantDump := cfg.FailureDump.enabled() && cfg.FailureDump.Filename != "" + prg, ast, err := newProgram(ctx, cfg.Program, root, getEnv(cfg.AllowedEnvironment), client, limiter, auth, patterns, cfg.XSDs, log, trace, wantDump) if err != nil { return err } @@ -251,12 +253,25 @@ func (i input) run(env v2.Context, src *source, cursor map[string]interface{}, p log.Debugw("request state", logp.Namespace("cel"), "state", redactor{state: state, cfg: cfg.Redact}) metrics.executions.Add(1) start := i.now().In(time.UTC) - state, err = evalWith(ctx, prg, ast, state, start) + state, err = evalWith(ctx, prg, ast, state, start, wantDump) log.Debugw("response state", logp.Namespace("cel"), "state", redactor{state: state, cfg: cfg.Redact}) if err != nil { + var dump dumpError switch { case errors.Is(err, context.Canceled), errors.Is(err, context.DeadlineExceeded): return err + case errors.As(err, &dump): + path := strings.ReplaceAll(cfg.FailureDump.Filename, "*", sanitizeFileName(env.IDWithoutName)) + dir := filepath.Dir(path) + base := filepath.Base(path) + ext := filepath.Ext(base) + prefix := strings.TrimSuffix(base, ext) + path = filepath.Join(dir, prefix+"-"+i.now().In(time.UTC).Format("2006-01-02T15-04-05.000")+ext) + log.Debugw("writing failure dump file", "path", path) + err := dump.writeToFile(path) + if err != nil { + log.Errorw("failed to write failure dump", "path", path, "error", err) + } } log.Errorw("failed evaluation", "error", err) env.UpdateStatus(status.Degraded, "failed evaluation: "+err.Error()) @@ -785,6 +800,26 @@ func newClient(ctx context.Context, cfg config, log *logp.Logger, reg *monitorin } } } + if !cfg.FailureDump.enabled() && cfg.FailureDump != nil && cfg.FailureDump.Filename != "" { + // We have a fail-dump name, but we are not enabled, + // so remove all dumps we own. + err = os.Remove(cfg.FailureDump.Filename) + if err != nil && !errors.Is(err, fs.ErrNotExist) { + log.Errorw("failed to remove request trace log", "path", cfg.FailureDump.Filename, "error", err) + } + ext := filepath.Ext(cfg.FailureDump.Filename) + base := strings.TrimSuffix(cfg.FailureDump.Filename, ext) + paths, err := filepath.Glob(base + "-" + lumberjackTimestamp + ext) + if err != nil { + log.Errorw("failed to collect request trace log path names", "error", err) + } + for _, p := range paths { + err = os.Remove(p) + if err != nil && !errors.Is(err, fs.ErrNotExist) { + log.Errorw("failed to remove request trace log", "path", p, "error", err) + } + } + } if reg != nil { c.Transport = httpmon.NewMetricsRoundTripper(c.Transport, reg) @@ -1004,7 +1039,7 @@ func getEnv(allowed []string) map[string]string { return env } -func newProgram(ctx context.Context, src, root string, vars map[string]string, client *http.Client, limiter *rate.Limiter, auth *lib.BasicAuth, patterns map[string]*regexp.Regexp, xsd map[string]string, log *logp.Logger, trace *httplog.LoggingRoundTripper) (cel.Program, *cel.Ast, error) { +func newProgram(ctx context.Context, src, root string, vars map[string]string, client *http.Client, limiter *rate.Limiter, auth *lib.BasicAuth, patterns map[string]*regexp.Regexp, xsd map[string]string, log *logp.Logger, trace *httplog.LoggingRoundTripper, details bool) (cel.Program, *cel.Ast, error) { xml, err := lib.XML(nil, xsd) if err != nil { return nil, nil, fmt.Errorf("failed to build xml type hints: %w", err) @@ -1043,7 +1078,11 @@ func newProgram(ctx context.Context, src, root string, vars map[string]string, c return nil, nil, fmt.Errorf("failed compilation: %w", iss.Err()) } - prg, err := env.Program(ast) + var progOpts []cel.ProgramOption + if details { + progOpts = []cel.ProgramOption{cel.EvalOptions(cel.OptTrackState)} + } + prg, err := env.Program(ast, progOpts...) if err != nil { return nil, nil, fmt.Errorf("failed program instantiation: %w", err) } @@ -1065,8 +1104,8 @@ func debug(log *logp.Logger, trace *httplog.LoggingRoundTripper) func(string, an } } -func evalWith(ctx context.Context, prg cel.Program, ast *cel.Ast, state map[string]interface{}, now time.Time) (map[string]interface{}, error) { - out, _, err := prg.ContextEval(ctx, map[string]interface{}{ +func evalWith(ctx context.Context, prg cel.Program, ast *cel.Ast, state map[string]interface{}, now time.Time, details bool) (map[string]interface{}, error) { + out, det, err := prg.ContextEval(ctx, map[string]interface{}{ // Replace global program "now" with current time. This is necessary // as the lib.Time now global is static at program instantiation time // which will persist over multiple evaluations. The lib.Time behaviour @@ -1081,6 +1120,9 @@ func evalWith(ctx context.Context, prg cel.Program, ast *cel.Ast, state map[stri }) if err != nil { err = lib.DecoratedError{AST: ast, Err: err} + if details { + err = dumpError{error: err, dump: lib.NewDump(ast, det)} + } } if e := ctx.Err(); e != nil { err = e @@ -1109,6 +1151,36 @@ func evalWith(ctx context.Context, prg cel.Program, ast *cel.Ast, state map[stri } } +// dumpError is an evaluation state dump associated with an error. +type dumpError struct { + error + dump *lib.Dump +} + +func (e dumpError) writeToFile(path string) (err error) { + err = os.MkdirAll(filepath.Dir(path), 0o700) + if err != nil { + return err + } + f, err := os.Create(path) + if err != nil { + return err + } + defer func() { + err = errors.Join(err, f.Sync(), f.Close()) + }() + enc := json.NewEncoder(f) + enc.SetEscapeHTML(false) + type dump struct { + Error string `json:"error"` + State []lib.NodeValue `json:"state"` + } + return enc.Encode(dump{ + Error: e.Error(), + State: e.dump.NodeValues(), + }) +} + // clearWantMore sets the state to not request additional work in a periodic evaluation. // It leaves state intact if there is no "want_more" element, and sets the element to false // if there is. This is necessary instead of just doing delete(state, "want_more") as diff --git a/x-pack/filebeat/input/cel/input_test.go b/x-pack/filebeat/input/cel/input_test.go index 0d91710ca09..143402d9834 100644 --- a/x-pack/filebeat/input/cel/input_test.go +++ b/x-pack/filebeat/input/cel/input_test.go @@ -45,6 +45,7 @@ var inputTests = []struct { want []map[string]interface{} wantCursor []map[string]interface{} wantErr error + prepare func() error wantFile string wantNoFile string }{ @@ -1685,6 +1686,88 @@ var inputTests = []struct { }, }}, }, + { + name: "dump_no_error", + config: map[string]interface{}{ + "interval": 1, + "program": `{"events":[{"message":{"value": try(debug("divide by zero", 0/0))}}]}`, + "state": nil, + "resource": map[string]interface{}{ + "url": "", + }, + "failure_dump": map[string]interface{}{ + "enabled": true, + "filename": "failure_dumps/dump.json", + }, + }, + time: func() time.Time { return time.Date(2010, 2, 8, 0, 0, 0, 0, time.UTC) }, + wantNoFile: filepath.Join("failure_dumps", "dump-2010-02-08T00-00-00.000.json"), + want: []map[string]interface{}{{ + "message": map[string]interface{}{ + "value": "division by zero", + }, + }}, + }, + { + name: "dump_error", + config: map[string]interface{}{ + "interval": 1, + "program": `{"events":[{"message":{"value": debug("divide by zero", 0/0)}}]}`, + "state": nil, + "resource": map[string]interface{}{ + "url": "", + }, + "failure_dump": map[string]interface{}{ + "enabled": true, + "filename": "failure_dumps/dump.json", + }, + }, + time: func() time.Time { return time.Date(2010, 2, 9, 0, 0, 0, 0, time.UTC) }, + wantFile: filepath.Join("failure_dumps", "dump-2010-02-09T00-00-00.000.json"), // One day after the no dump case. + want: []map[string]interface{}{ + { + "error": map[string]interface{}{ + "message": `failed eval: ERROR: :1:58: division by zero + | {"events":[{"message":{"value": debug("divide by zero", 0/0)}}]} + | .........................................................^`, + }, + }, + }, + }, + { + name: "dump_error_delete", + config: map[string]interface{}{ + "interval": 1, + "program": `{"events":[{"message":{"value": debug("divide by zero", 0/0)}}]}`, + "state": nil, + "resource": map[string]interface{}{ + "url": "", + }, + "failure_dump": map[string]interface{}{ + "enabled": false, // We have a name but are disabled, so delete. + "filename": "failure_dumps/dump.json", + }, + }, + time: func() time.Time { return time.Date(2010, 2, 9, 0, 0, 0, 0, time.UTC) }, + prepare: func() error { + // Make a file that the configuration should delete. + err := os.MkdirAll("failure_dumps", 0o700) + if err != nil { + return err + } + return os.WriteFile(filepath.Join("failure_dumps", "dump-2010-02-09T00-00-00.000.json"), nil, 0o600) + }, + wantNoFile: filepath.Join("failure_dumps", "dump-2010-02-09T00-00-00.000.json"), // One day after the no dump case. + want: []map[string]interface{}{ + { + "error": map[string]interface{}{ + "message": `failed eval: ERROR: :1:58: division by zero + | {"events":[{"message":{"value": debug("divide by zero", 0/0)}}]} + | .........................................................^`, + }, + }, + }, + }, // not yet done from httpjson (some are redundant since they are compositional products). // @@ -1708,6 +1791,11 @@ func TestInput(t *testing.T) { os.Setenv("CELTESTENVVAR", "TESTVALUE") os.Setenv("DISALLOWEDCELTESTENVVAR", "DISALLOWEDTESTVALUE") + err := os.RemoveAll("failure_dumps") + if err != nil { + t.Fatalf("failed to remove failure_dumps directory: %v", err) + } + logp.TestingSetup() for _, test := range inputTests { t.Run(test.name, func(t *testing.T) { @@ -1718,6 +1806,13 @@ func TestInput(t *testing.T) { t.Skip("skipping remote endpoint test") } + if test.prepare != nil { + err := test.prepare() + if err != nil { + t.Fatalf("unexpected from prepare(): %v", err) + } + } + if test.server != nil { test.server(t, test.handler, test.config) } @@ -1770,6 +1865,20 @@ func TestInput(t *testing.T) { if fmt.Sprint(err) != fmt.Sprint(test.wantErr) { t.Errorf("unexpected error from running input: got:%v want:%v", err, test.wantErr) } + if test.wantFile != "" { + if _, err := os.Stat(filepath.Join(tempDir, test.wantFile)); err != nil { + t.Errorf("expected log file not found: %v", err) + } + } + if test.wantNoFile != "" { + paths, err := filepath.Glob(filepath.Join(tempDir, test.wantNoFile)) + if err != nil { + t.Fatalf("unexpected error calling filepath.Glob(%q): %v", test.wantNoFile, err) + } + if len(paths) != 0 { + t.Errorf("unexpected files found: %v", paths) + } + } if test.wantErr != nil { return } @@ -1802,20 +1911,6 @@ func TestInput(t *testing.T) { t.Errorf("unexpected cursor for event %d: got:- want:+\n%s", i, cmp.Diff(got, test.wantCursor[i])) } } - if test.wantFile != "" { - if _, err := os.Stat(filepath.Join(tempDir, test.wantFile)); err != nil { - t.Errorf("expected log file not found: %v", err) - } - } - if test.wantNoFile != "" { - paths, err := filepath.Glob(filepath.Join(tempDir, test.wantNoFile)) - if err != nil { - t.Fatalf("unexpected error calling filepath.Glob(%q): %v", test.wantNoFile, err) - } - if len(paths) != 0 { - t.Errorf("unexpected files found: %v", paths) - } - } }) } }