From 89fb29df3a97668a8f67821eddb6f92746efaddd Mon Sep 17 00:00:00 2001 From: Lukasz Zajaczkowski Date: Fri, 12 Jul 2024 14:33:53 +0200 Subject: [PATCH] feat: Agent server-side apply (SSA) deduplication cache (#227) * add cache * draft resource watch * use gvr cache * add event handler * add field selector * add initial status reconciler draft * add agent labels * initial server cache update * finish initial server cache implementation * update status watcher * register controller * change flag * fix build * fixes * add sha logic * update resource watcher * fix resource key parsing * add apply check * update sha object * add main apply check * finish main apply check * update cache * refactor * move const outside func * extract health status * set health status * refactor health status * refactor resource cache * refactor after merge * check deleted resource * add health cache * fix lint * add resource key alias * use resource key alias * move getLuaHealthConvert * handle contexts better * fix recursive call * use resource key alias * post merge fix * format * add cache filter * add custom status watcher * close watcher * add filter * skip timeout * fix watcher cleanup on close error * revert skip timeout * fixes * refactor * use custom status watcher for cache * revert last commit * use a custom status watcher implementation for resource cache and applier * disable health cache * fix linter * remove health cache * don't throw the error when inventory config map deleted * clear cache * add logs * optimize status watcher to use unique watches when running Watch multiple times * bump go version * fix linter * bump go version * bump go CI * make tools before unit tests * bump go CI * add debug * -vet=off * disable controllers * bump go in dockerfile * fix dockerfile * use test suite * bump ginkgo version * disable client in test suite * disable client in test suite * disable controller tests * explicitly initialize cache in main and add more prometheus metrics * fix unit tests * fix linter * change filter order * improve RequiresApply * improve filter * fix watcher * remove CRD filter * do not remove whole SHA entry in cache * revert apply check * fix apply check * fix sha object * remove resource cache expiration check poller * fix cache cleanup * close dynamic client watch * add error log for health status * fix linter * refactor args handling and allow reconciling only specific services * support golang flag * fix exposing agent metrics * refactor our custom watcher implementation * create status cache * move lua * fix agent build * fix unit tests * fix lint * fix status watcher init in the applier * fix status cache * refactor statuses * use ComponentAttributes * add retry watcher wrapper and use it in status watcher * use version cache * fix linter * fix goroutine leak and add profiler arg * fix lint * check if there is only one CustomHealth object * cleanup, add docs and refactor code * cleanup * cleanup metrics when service is deleted * allow disabling resource cache and add metrics to measure detailed reconcile execution times * allow providing extra args to operator * disable resource cache in chart * allow disabling resource cache via operator secrets * cache manifests --------- Co-authored-by: Marcin Maciaszczyk Co-authored-by: Sebastian Florek --- .github/workflows/ci.yaml | 8 +- .github/workflows/publish.yaml | 2 +- Dockerfile | 6 +- Makefile | 17 +- .../templates/deployment.yaml | 1 + charts/deployment-operator/values.yaml | 1 + charts/deployment-operator/values.yaml.liquid | 7
+- cmd/agent/agent.go | 36 +- cmd/agent/args/args.go | 220 +++++ cmd/agent/args/pprof.go | 20 + cmd/agent/main.go | 50 +- cmd/agent/metrics.go | 26 - cmd/agent/options.go | 61 -- go.mod | 28 +- go.sum | 40 +- internal/controller/argorollout_controller.go | 13 +- .../controller/customhealth_controller.go | 9 +- .../customhealth_controller_test.go | 5 +- .../pipelinegate_controller_test.go | 9 +- .../controller/stackrunjob_controller_test.go | 5 +- internal/controller/status_controller.go | 90 ++ internal/controller/suite_test.go | 3 +- internal/helpers/kubernetes.go | 16 + internal/kstatus/watcher/common.go | 95 +++ internal/kstatus/watcher/event_funnel.go | 119 +++ .../kstatus/watcher/object_status_reporter.go | 771 ++++++++++++++++++ internal/kstatus/watcher/task_manager.go | 68 ++ internal/kstatus/watcher/unschedulable.go | 66 ++ internal/kstatus/watcher/watcher.go | 99 +++ internal/kstatus/watcher/watcher_reference.go | 82 ++ internal/kstatus/watcher/watcher_types.go | 31 + .../watcher/retry_lister_watcher.go | 145 ++++ .../watcher/retry_lister_watcher_options.go | 41 + internal/kubernetes/watcher/retry_watcher.go | 293 +++++++ internal/metrics/metrics_context.go | 22 + internal/metrics/metrics_options.go | 23 + internal/metrics/metrics_prometheus.go | 76 +- internal/metrics/metrics_types.go | 58 +- internal/utils/hash.go | 5 + pkg/applier/builder.go | 18 +- pkg/applier/filters/cache_filter.go | 49 ++ pkg/applier/runner.go | 7 +- pkg/cache/cache.go | 68 ++ pkg/cache/common.go | 40 + pkg/cache/resource_cache.go | 294 +++++++ pkg/cache/resource_cache_entry.go | 84 ++ pkg/cache/resource_key.go | 70 ++ pkg/common/common.go | 25 + pkg/{controller/service => common}/health.go | 94 ++- pkg/common/lua.go | 60 ++ pkg/common/status.go | 109 +++ pkg/controller/controller_manager.go | 11 +- pkg/controller/service/reconciler.go | 77 +- pkg/controller/service/reconciler_status.go | 167 +--- pkg/controller/service/status_collector.go | 65 +- pkg/manifests/cache.go | 25 +- pkg/manifests/tarball.go | 58 +- pkg/manifests/template/helm.go | 12 +- 58 files changed, 3550 insertions(+), 450 deletions(-) create mode 100644 cmd/agent/args/args.go create mode 100644 cmd/agent/args/pprof.go delete mode 100644 cmd/agent/metrics.go delete mode 100644 cmd/agent/options.go create mode 100644 internal/controller/status_controller.go create mode 100644 internal/helpers/kubernetes.go create mode 100644 internal/kstatus/watcher/common.go create mode 100644 internal/kstatus/watcher/event_funnel.go create mode 100644 internal/kstatus/watcher/object_status_reporter.go create mode 100644 internal/kstatus/watcher/task_manager.go create mode 100644 internal/kstatus/watcher/unschedulable.go create mode 100644 internal/kstatus/watcher/watcher.go create mode 100644 internal/kstatus/watcher/watcher_reference.go create mode 100644 internal/kstatus/watcher/watcher_types.go create mode 100644 internal/kubernetes/watcher/retry_lister_watcher.go create mode 100644 internal/kubernetes/watcher/retry_lister_watcher_options.go create mode 100644 internal/kubernetes/watcher/retry_watcher.go create mode 100644 internal/metrics/metrics_context.go create mode 100644 internal/metrics/metrics_options.go create mode 100644 pkg/applier/filters/cache_filter.go create mode 100644 pkg/cache/cache.go create mode 100644 pkg/cache/common.go create mode 100644 pkg/cache/resource_cache.go create mode 100644 pkg/cache/resource_cache_entry.go create mode 100644 pkg/cache/resource_key.go create mode 100644 pkg/common/common.go rename pkg/{controller/service 
=> common}/health.go (91%) create mode 100644 pkg/common/lua.go create mode 100644 pkg/common/status.go diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index ab7a6538..2bf7aa59 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -38,7 +38,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: actions/setup-go@v4 + - uses: actions/setup-go@v5 with: go-version-file: go.mod check-latest: true @@ -48,17 +48,17 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: actions/setup-go@v4 + - uses: actions/setup-go@v5 with: go-version-file: go.mod check-latest: true - - run: PATH=$PATH:$GOPATH/bin make test + - run: PATH=$PATH:$GOPATH/bin make -d test lint: name: Lint runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: actions/setup-go@v4 + - uses: actions/setup-go@v5 with: go-version-file: go.mod check-latest: true diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index bdbd08ce..5d6916be 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: actions/setup-go@v4 + - uses: actions/setup-go@v5 with: go-version-file: go.mod check-latest: true diff --git a/Dockerfile b/Dockerfile index c935664a..bb70491d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM golang:1.21.2-alpine3.17 as builder +FROM golang:1.22.4-alpine3.20 as builder ARG TARGETARCH @@ -17,9 +17,9 @@ COPY /api api/ COPY /internal internal/ # Build -RUN CGO_ENABLED=0 GOOS=linux GOARCH=${TARGETARCH} GO111MODULE=on go build -a -o deployment-agent cmd/agent/** +RUN CGO_ENABLED=0 GOOS=linux GOARCH=${TARGETARCH} GO111MODULE=on go build -a -o deployment-agent cmd/agent/*.go -FROM alpine:3.18 +FROM alpine:3.20 WORKDIR /workspace COPY --from=builder /workspace/deployment-agent . 
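The commit message above summarizes the core idea of this PR: record a SHA of the manifest that was last server-side applied and skip re-applying resources whose manifests have not changed (see pkg/applier/filters/cache_filter.go and pkg/cache in the diffstat). The following is a minimal, hypothetical sketch of that deduplication check; the names used here (shaCache, cacheEntry, manifestSHA, requiresApply) are illustrative and are not the actual API introduced by this patch.

```go
package main

import (
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"fmt"

	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
)

// cacheEntry stands in for a resource cache entry; only the SHA of the
// last applied manifest matters for this sketch.
type cacheEntry struct {
	applySHA string
}

// shaCache is a stand-in for the resource cache, keyed by a resource key
// such as "namespace/name/GroupKind".
var shaCache = map[string]cacheEntry{}

// manifestSHA hashes the manifest that is about to be server-side applied.
func manifestSHA(obj *unstructured.Unstructured) (string, error) {
	raw, err := json.Marshal(obj.Object)
	if err != nil {
		return "", err
	}
	sum := sha256.Sum256(raw)
	return hex.EncodeToString(sum[:]), nil
}

// requiresApply reports whether the object changed since its last recorded
// apply, mirroring the cache-filter idea: unchanged manifests are skipped.
func requiresApply(key string, obj *unstructured.Unstructured) bool {
	sha, err := manifestSHA(obj)
	if err != nil {
		return true // fail open: apply when the hash cannot be computed
	}
	entry, ok := shaCache[key]
	if ok && entry.applySHA == sha {
		return false // deduplicated: nothing changed since the last apply
	}
	shaCache[key] = cacheEntry{applySHA: sha}
	return true
}

func main() {
	obj := &unstructured.Unstructured{Object: map[string]interface{}{
		"apiVersion": "v1",
		"kind":       "ConfigMap",
		"metadata":   map[string]interface{}{"name": "demo", "namespace": "default"},
	}}

	fmt.Println(requiresApply("default/demo/ConfigMap", obj)) // true: first apply
	fmt.Println(requiresApply("default/demo/ConfigMap", obj)) // false: unchanged, skip SSA
}
```

Failing open, that is, applying whenever the hash cannot be computed, keeps the cache an optimization rather than a correctness dependency, which matches the patch's option to disable the cache entirely via -disable-resource-cache.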
diff --git a/Makefile b/Makefile index 56df1fc2..509b7929 100644 --- a/Makefile +++ b/Makefile @@ -59,14 +59,25 @@ genmock: mockery ## generates mocks before running tests ##@ Run .PHONY: agent-run -agent-run: ## run agent - go run cmd/agent/** +agent-run: agent ## run agent + OPERATOR_NAMESPACE=plrl-deploy-operator \ + go run cmd/agent/*.go \ + --console-url=${PLURAL_CONSOLE_URL}/ext/gql \ + --enable-helm-dependency-update=false \ + --disable-helm-dry-run-server=false \ + --cluster-id=${PLURAL_CLUSTER_ID} \ + --local \ + --refresh-interval=30s \ + --resource-cache-ttl=60s \ + --max-concurrent-reconciles=20 \ + --v=1 \ + --deploy-token=${PLURAL_DEPLOY_TOKEN} ##@ Build .PHONY: agent agent: ## build agent - go build -o bin/deployment-agent cmd/agent/** + go build -o bin/deployment-agent cmd/agent/*.go .PHONY: harness harness: ## build stack run harness diff --git a/charts/deployment-operator/templates/deployment.yaml b/charts/deployment-operator/templates/deployment.yaml index 9f3ca7a6..ae608646 100644 --- a/charts/deployment-operator/templates/deployment.yaml +++ b/charts/deployment-operator/templates/deployment.yaml @@ -46,6 +46,7 @@ spec: - -processing-timeout={{ .Values.args.processingTimeout }} - -enable-helm-dependency-update={{ .Values.args.enableHelmDependencyUpdate }} - -disable-helm-dry-run-server={{ .Values.args.disableHelmTemplateDryRunServer }} + - -disable-resource-cache={{ .Values.args.disableResourceCache }} env: - name: IMAGE_TAG value: {{ $tag | quote }} diff --git a/charts/deployment-operator/values.yaml b/charts/deployment-operator/values.yaml index 1770bfa2..8b665fdc 100644 --- a/charts/deployment-operator/values.yaml +++ b/charts/deployment-operator/values.yaml @@ -11,6 +11,7 @@ args: processingTimeout: 5m enableHelmDependencyUpdate: false disableHelmTemplateDryRunServer: false + disableResourceCache: false image: repository: ghcr.io/pluralsh/deployment-operator diff --git a/charts/deployment-operator/values.yaml.liquid b/charts/deployment-operator/values.yaml.liquid index 853c679e..b9f1fe3a 100644 --- a/charts/deployment-operator/values.yaml.liquid +++ b/charts/deployment-operator/values.yaml.liquid @@ -37,4 +37,9 @@ args: {% if configuration.disableHelmTemplateDryRunServer %} args: disableHelmTemplateDryRunServer: {{ configuration.disableHelmTemplateDryRunServer }} -{% endif %} \ No newline at end of file +{% endif %} + +{% if configuration.disableResourceCache %} +args: + disableResourceCache: {{ configuration.disableResourceCache }} +{% endif %} diff --git a/cmd/agent/agent.go b/cmd/agent/agent.go index b54272fd..75e7456c 100644 --- a/cmd/agent/agent.go +++ b/cmd/agent/agent.go @@ -4,6 +4,7 @@ import ( "os" "time" + "github.com/pluralsh/deployment-operator/cmd/agent/args" "github.com/pluralsh/deployment-operator/internal/utils" "github.com/pluralsh/deployment-operator/pkg/controller/stacks" @@ -21,26 +22,23 @@ import ( const pollInterval = time.Second * 30 -func runAgent(opt *options, config *rest.Config, ctx context.Context, k8sClient ctrclient.Client) (*controller.ControllerManager, *service.ServiceReconciler, *pipelinegates.GateReconciler) { - r, err := time.ParseDuration(opt.refreshInterval) - if err != nil { - setupLog.Error("unable to get refresh interval", "error", err) - os.Exit(1) - } - - t, err := time.ParseDuration(opt.processingTimeout) - if err != nil { - setupLog.Errorw("unable to get processing timeout", "error", err) - os.Exit(1) - } - - mgr, err := controller.NewControllerManager(ctx, opt.maxConcurrentReconciles, t, r, lo.ToPtr(true), 
opt.consoleUrl, opt.deployToken, opt.clusterId) +func runAgent(config *rest.Config, ctx context.Context, k8sClient ctrclient.Client) (*controller.ControllerManager, *service.ServiceReconciler, *pipelinegates.GateReconciler) { + mgr, err := controller.NewControllerManager( + ctx, + args.MaxConcurrentReconciles(), + args.ProcessingTimeout(), + args.RefreshInterval(), + lo.ToPtr(true), + args.ConsoleUrl(), + args.DeployToken(), + args.ClusterId(), + ) if err != nil { setupLog.Errorw("unable to create manager", "error", err) os.Exit(1) } - sr, err := service.NewServiceReconciler(ctx, mgr.GetClient(), config, r, opt.restoreNamespace) + sr, err := service.NewServiceReconciler(ctx, mgr.GetClient(), config, args.RefreshInterval(), args.ManifestCacheTTL(), args.RestoreNamespace(), args.ConsoleUrl()) if err != nil { setupLog.Errorw("unable to create service reconciler", "error", err) os.Exit(1) @@ -50,7 +48,7 @@ func runAgent(opt *options, config *rest.Config, ctx context.Context, k8sClient Do: sr, Queue: sr.SvcQueue, }) - gr, err := pipelinegates.NewGateReconciler(mgr.GetClient(), k8sClient, config, r, pollInterval, opt.clusterId) + gr, err := pipelinegates.NewGateReconciler(mgr.GetClient(), k8sClient, config, args.RefreshInterval(), pollInterval, args.ClusterId()) if err != nil { setupLog.Errorw("unable to create gate reconciler", "error", err) os.Exit(1) @@ -61,14 +59,14 @@ func runAgent(opt *options, config *rest.Config, ctx context.Context, k8sClient Queue: gr.GateQueue, }) - rr := restore.NewRestoreReconciler(mgr.GetClient(), k8sClient, r, opt.restoreNamespace) + rr := restore.NewRestoreReconciler(mgr.GetClient(), k8sClient, args.RefreshInterval(), args.RestoreNamespace()) mgr.AddController(&controller.Controller{ Name: "Restore Controller", Do: rr, Queue: rr.RestoreQueue, }) - ns := namespaces.NewNamespaceReconciler(mgr.GetClient(), k8sClient, r) + ns := namespaces.NewNamespaceReconciler(mgr.GetClient(), k8sClient, args.RefreshInterval()) mgr.AddController(&controller.Controller{ Name: "Managed Namespace Controller", Do: ns, @@ -81,7 +79,7 @@ func runAgent(opt *options, config *rest.Config, ctx context.Context, k8sClient os.Exit(1) } - s := stacks.NewStackReconciler(mgr.GetClient(), k8sClient, r, pollInterval, namespace, opt.consoleUrl, opt.deployToken) + s := stacks.NewStackReconciler(mgr.GetClient(), k8sClient, args.RefreshInterval(), pollInterval, namespace, args.ConsoleUrl(), args.DeployToken()) mgr.AddController(&controller.Controller{ Name: "Stack Controller", Do: s, diff --git a/cmd/agent/args/args.go b/cmd/agent/args/args.go new file mode 100644 index 00000000..bf54b017 --- /dev/null +++ b/cmd/agent/args/args.go @@ -0,0 +1,220 @@ +package args + +import ( + "flag" + "fmt" + "strconv" + "strings" + "time" + + "github.com/pluralsh/polly/containers" + "github.com/spf13/pflag" + "k8s.io/klog/v2" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + + "github.com/pluralsh/deployment-operator/internal/helpers" + "github.com/pluralsh/deployment-operator/pkg/log" +) + +const ( + EnvDeployToken = "DEPLOY_TOKEN" + + defaultProbeAddress = ":9001" + defaultMetricsAddress = ":8000" + + defaultProcessingTimeout = "1m" + defaultProcessingTimeoutDuration = time.Minute + + defaultRefreshInterval = "2m" + defaultRefreshIntervalDuration = 2 * time.Minute + + defaultResourceCacheTTL = "1h" + defaultResourceCacheTTLDuration = time.Hour + + defaultManifestCacheTTL = "1h" + defaultManifestCacheTTLDuration = time.Hour + + defaultRestoreNamespace = "velero" + + 
defaultProfilerPath = "/debug/pprof/" + defaultProfilerAddress = ":7777" +) + +var ( + argDisableHelmTemplateDryRunServer = flag.Bool("disable-helm-dry-run-server", false, "Disable helm template in dry-run=server mode.") + argEnableHelmDependencyUpdate = flag.Bool("enable-helm-dependency-update", false, "Enable update Helm chart's dependencies.") + argEnableLeaderElection = flag.Bool("leader-elect", false, "Enable leader election for controller manager. Enabling this will ensure there is only one active controller manager.") + argLocal = flag.Bool("local", false, "Whether you're running the operator locally.") + argProfiler = flag.Bool("profiler", false, "Enable pprof handler. By default it will be exposed on localhost:7777 under '/debug/pprof'") + argDisableResourceCache = flag.Bool("disable-resource-cache", false, "Control whether resource cache should be enabled or not.") + + argMaxConcurrentReconciles = flag.Int("max-concurrent-reconciles", 20, "Maximum number of concurrent reconciles which can be run.") + argResyncSeconds = flag.Int("resync-seconds", 300, "Resync duration in seconds.") + + argClusterId = flag.String("cluster-id", "", "The ID of the cluster being connected to.") + argConsoleUrl = flag.String("console-url", "", "The URL of the console api to fetch services from.") + argDeployToken = flag.String("deploy-token", helpers.GetEnv(EnvDeployToken, ""), "The deploy token to auth to Console API with.") + argProbeAddr = flag.String("health-probe-bind-address", defaultProbeAddress, "The address the probe endpoint binds to.") + argMetricsAddr = flag.String("metrics-bind-address", defaultMetricsAddress, "The address the metric endpoint binds to.") + argProcessingTimeout = flag.String("processing-timeout", defaultProcessingTimeout, "Maximum amount of time to spend trying to process queue item.") + argRefreshInterval = flag.String("refresh-interval", defaultRefreshInterval, "Refresh interval duration.") + argResourceCacheTTL = flag.String("resource-cache-ttl", defaultResourceCacheTTL, "The time to live of each resource cache entry.") + argManifestCacheTTL = flag.String("manifest-cache-ttl", defaultManifestCacheTTL, "The time to live of service manifests in cache entry.") + argRestoreNamespace = flag.String("restore-namespace", defaultRestoreNamespace, "The namespace where Velero restores are located.") + argServices = flag.String("services", "", "A comma separated list of service ids to reconcile. 
Leave empty to reconcile all.") + + serviceSet containers.Set[string] +) + +func Init() { + defaultFlagSet := flag.CommandLine + + // Init klog + klog.InitFlags(defaultFlagSet) + + // Use default log level defined by the application + _ = defaultFlagSet.Set("v", fmt.Sprintf("%d", log.LogLevelDefault)) + + opts := zap.Options{Development: true} + opts.BindFlags(defaultFlagSet) + + flag.Parse() + + ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + + // Initialize unique service set + if len(*argServices) > 0 { + serviceSet = containers.ToSet(strings.Split(*argServices, ",")) + } + + if *argProfiler { + initProfiler() + } + + klog.V(log.LogLevelMinimal).InfoS("configured log level", "v", LogLevel()) +} + +func DisableHelmTemplateDryRunServer() bool { + return *argDisableHelmTemplateDryRunServer +} + +func EnableHelmDependencyUpdate() bool { + return *argEnableHelmDependencyUpdate +} + +func EnableLeaderElection() bool { + return *argEnableLeaderElection +} + +func Local() bool { + return *argLocal +} + +func MaxConcurrentReconciles() int { + return *argMaxConcurrentReconciles +} + +func ResyncSeconds() int { + return *argResyncSeconds +} + +func ClusterId() string { + ensureOrDie("cluster-id", argClusterId) + + return *argClusterId +} + +func ConsoleUrl() string { + ensureOrDie("console-url", argConsoleUrl) + + return *argConsoleUrl +} + +func DeployToken() string { + ensureOrDie("deploy-token", argDeployToken) + + return *argDeployToken +} + +func ProbeAddr() string { + return *argProbeAddr +} + +func MetricsAddr() string { + return *argMetricsAddr +} + +func ProcessingTimeout() time.Duration { + duration, err := time.ParseDuration(*argProcessingTimeout) + if err != nil { + klog.ErrorS(err, "Could not parse processing-timeout", "value", *argProcessingTimeout, "default", defaultProcessingTimeoutDuration) + return defaultProcessingTimeoutDuration + } + + return duration +} + +func RefreshInterval() time.Duration { + duration, err := time.ParseDuration(*argRefreshInterval) + if err != nil { + klog.ErrorS(err, "Could not parse refresh-interval", "value", *argRefreshInterval, "default", defaultRefreshIntervalDuration) + return defaultRefreshIntervalDuration + } + + return duration +} + +func ResourceCacheTTL() time.Duration { + duration, err := time.ParseDuration(*argResourceCacheTTL) + if err != nil { + klog.ErrorS(err, "Could not parse resource-cache-ttl", "value", *argResourceCacheTTL, "default", defaultResourceCacheTTLDuration) + return defaultResourceCacheTTLDuration + } + + return duration +} + +func ManifestCacheTTL() time.Duration { + duration, err := time.ParseDuration(*argManifestCacheTTL) + if err != nil { + klog.ErrorS(err, "Could not parse manifest-cache-ttl", "value", *argManifestCacheTTL, "default", defaultManifestCacheTTLDuration) + return defaultManifestCacheTTLDuration + } + + return duration +} + +func RestoreNamespace() string { + return *argRestoreNamespace +} + +func SkipService(id string) bool { + return serviceSet.Len() > 0 && !serviceSet.Has(id) +} + +func LogLevel() klog.Level { + v := pflag.Lookup("v") + if v == nil { + return log.LogLevelDefault + } + + level, err := strconv.ParseInt(v.Value.String(), 10, 32) + if err != nil { + klog.ErrorS(err, "Could not parse log level", "level", v.Value.String(), "default", log.LogLevelDefault) + return log.LogLevelDefault + } + + return klog.Level(level) +} + +func ResourceCacheEnabled() bool { + return !(*argDisableResourceCache) +} + +func ensureOrDie(argName string, arg *string) { + if arg == nil || len(*arg) == 0 { +
pflag.PrintDefaults() + panic(fmt.Sprintf("%s arg is required", argName)) + } +} diff --git a/cmd/agent/args/pprof.go b/cmd/agent/args/pprof.go new file mode 100644 index 00000000..57b060bc --- /dev/null +++ b/cmd/agent/args/pprof.go @@ -0,0 +1,20 @@ +package args + +import ( + "net/http" + "net/http/pprof" + + "github.com/pluralsh/deployment-operator/pkg/log" +) + +func initProfiler() { + log.Logger.Info("initializing profiler") + + mux := http.NewServeMux() + mux.HandleFunc(defaultProfilerPath, pprof.Index) + go func() { + if err := http.ListenAndServe(defaultProfilerAddress, mux); err != nil { + log.Logger.Fatal(err) + } + }() +} diff --git a/cmd/agent/main.go b/cmd/agent/main.go index 0cc37549..ea1d2072 100644 --- a/cmd/agent/main.go +++ b/cmd/agent/main.go @@ -10,6 +10,17 @@ import ( roclientset "github.com/argoproj/argo-rollouts/pkg/client/clientset/versioned" templatesv1 "github.com/open-policy-agent/frameworks/constraint/pkg/apis/templates/v1" constraintstatusv1beta1 "github.com/open-policy-agent/gatekeeper/v3/apis/status/v1beta1" + "github.com/prometheus/client_golang/prometheus/promhttp" + "sigs.k8s.io/controller-runtime/pkg/metrics/server" + + deploymentsv1alpha1 "github.com/pluralsh/deployment-operator/api/v1alpha1" + "github.com/pluralsh/deployment-operator/cmd/agent/args" + "github.com/pluralsh/deployment-operator/internal/controller" + "github.com/pluralsh/deployment-operator/pkg/cache" + _ "github.com/pluralsh/deployment-operator/pkg/cache" // Init cache. + "github.com/pluralsh/deployment-operator/pkg/client" + "github.com/pluralsh/deployment-operator/pkg/log" + velerov1 "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" "k8s.io/apimachinery/pkg/runtime" @@ -20,11 +31,6 @@ import ( clientgoscheme "k8s.io/client-go/kubernetes/scheme" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/healthz" - - deploymentsv1alpha1 "github.com/pluralsh/deployment-operator/api/v1alpha1" - "github.com/pluralsh/deployment-operator/internal/controller" - "github.com/pluralsh/deployment-operator/pkg/client" - "github.com/pluralsh/deployment-operator/pkg/log" ) var ( @@ -49,15 +55,28 @@ const ( ) func main() { - opt := newOptions() + args.Init() config := ctrl.GetConfigOrDie() ctx := ctrl.SetupSignalHandler() + if args.ResourceCacheEnabled() { + cache.Init(ctx, config, args.ResourceCacheTTL()) + } + mgr, err := ctrl.NewManager(config, ctrl.Options{ Scheme: scheme, - LeaderElection: opt.enableLeaderElection, + LeaderElection: args.EnableLeaderElection(), LeaderElectionID: "dep12loy45.plural.sh", - HealthProbeBindAddress: opt.probeAddr, + HealthProbeBindAddress: args.ProbeAddr(), + Metrics: server.Options{ + BindAddress: args.MetricsAddr(), + ExtraHandlers: map[string]http.Handler{ + // Default prometheus metrics path. + // We can't use /metrics as it is already taken by the + // controller manager. 
+ "/metrics/agent": promhttp.Handler(), + }, + }, }) if err != nil { setupLog.Error(err, "unable to create manager") @@ -78,8 +97,9 @@ func main() { setupLog.Error(err, "unable to create kubernetes client") os.Exit(1) } + setupLog.Info("starting agent") - ctrlMgr, serviceReconciler, gateReconciler := runAgent(opt, config, ctx, mgr.GetClient()) + ctrlMgr, serviceReconciler, gateReconciler := runAgent(config, ctx, mgr.GetClient()) backupController := &controller.BackupReconciler{ Client: mgr.GetClient(), @@ -101,7 +121,7 @@ func main() { Client: mgr.GetClient(), Scheme: mgr.GetScheme(), ConsoleClient: ctrlMgr.GetClient(), - ConsoleURL: opt.consoleUrl, + ConsoleURL: args.ConsoleUrl(), HttpClient: &http.Client{Timeout: httpClientTimout}, ArgoClientSet: rolloutsClient, DynamicClient: dynamicClient, @@ -156,12 +176,20 @@ func main() { setupLog.Error(err, "unable to create controller", "controller", "StackRun") } + statusController, err := controller.NewStatusReconciler(mgr.GetClient()) + if err != nil { + setupLog.Error(err, "unable to create controller", "controller", "StatusController") + } + if err := statusController.SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to start controller", "controller", "StatusController") + } + //+kubebuilder:scaffold:builder if err = (&controller.PipelineGateReconciler{ Client: mgr.GetClient(), GateCache: gateReconciler.GateCache, - ConsoleClient: client.New(opt.consoleUrl, opt.deployToken), + ConsoleClient: client.New(args.ConsoleUrl(), args.DeployToken()), Log: ctrl.Log.WithName("controllers").WithName("PipelineGate"), Scheme: mgr.GetScheme(), }).SetupWithManager(mgr); err != nil { diff --git a/cmd/agent/metrics.go b/cmd/agent/metrics.go deleted file mode 100644 index 323cd994..00000000 --- a/cmd/agent/metrics.go +++ /dev/null @@ -1,26 +0,0 @@ -package main - -import ( - "fmt" - "net/http" - - "github.com/prometheus/client_golang/prometheus/promhttp" - "k8s.io/klog/v2" -) - -const ( - prometheusMetricsPath = "/metrics" - prometheusMetricsPort = 8000 -) - -func init() { - go initPrometheusMetrics() -} - -func initPrometheusMetrics() { - http.Handle(prometheusMetricsPath, promhttp.Handler()) - - if err := http.ListenAndServe(fmt.Sprintf(":%d", prometheusMetricsPort), nil); err != nil { - klog.Fatal(err) - } -} diff --git a/cmd/agent/options.go b/cmd/agent/options.go deleted file mode 100644 index 382b2d3b..00000000 --- a/cmd/agent/options.go +++ /dev/null @@ -1,61 +0,0 @@ -package main - -import ( - "flag" - "os" - - "k8s.io/klog/v2" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/log/zap" - - "github.com/pluralsh/deployment-operator/pkg/manifests/template" - - "github.com/pluralsh/deployment-operator/pkg/controller/service" -) - -type options struct { - enableLeaderElection bool - metricsAddr string - probeAddr string - refreshInterval string - processingTimeout string - resyncSeconds int - maxConcurrentReconciles int - consoleUrl string - deployToken string - clusterId string - restoreNamespace string -} - -func newOptions() *options { - klog.InitFlags(nil) - - o := &options{} - - opts := zap.Options{Development: true} - opts.BindFlags(flag.CommandLine) - - flag.StringVar(&o.metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.") - flag.StringVar(&o.probeAddr, "health-probe-bind-address", ":9001", "The address the probe endpoint binds to.") - flag.BoolVar(&o.enableLeaderElection, "leader-elect", false, "Enable leader election for controller manager. 
Enabling this will ensure there is only one active controller manager.") - flag.IntVar(&o.maxConcurrentReconciles, "max-concurrent-reconciles", 20, "Maximum number of concurrent reconciles which can be run.") - flag.IntVar(&o.resyncSeconds, "resync-seconds", 300, "Resync duration in seconds.") - flag.StringVar(&o.refreshInterval, "refresh-interval", "2m", "Refresh interval duration.") - flag.StringVar(&o.processingTimeout, "processing-timeout", "1m", "Maximum amount of time to spend trying to process queue item.") - flag.StringVar(&o.consoleUrl, "console-url", "", "The URL of the console api to fetch services from.") - flag.StringVar(&o.deployToken, "deploy-token", "", "The deploy token to auth to Console API with.") - flag.StringVar(&o.clusterId, "cluster-id", "", "The ID of the cluster being connected to.") - flag.StringVar(&o.restoreNamespace, "restore-namespace", "velero", "The namespace where Velero restores are located.") - flag.BoolVar(&service.Local, "local", false, "Whether you're running the operator locally.") - flag.BoolVar(&template.EnableHelmDependencyUpdate, "enable-helm-dependency-update", false, "Enable update Helm chart's dependencies.") - flag.BoolVar(&template.DisableHelmTemplateDryRunServer, "disable-helm-dry-run-server", false, "Disable helm template in dry-run=server mode.") - flag.Parse() - - ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) - - if o.deployToken == "" { - o.deployToken = os.Getenv("DEPLOY_TOKEN") - } - - return o -} diff --git a/go.mod b/go.mod index baca9673..983ecd49 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/pluralsh/deployment-operator -go 1.21 +go 1.22.4 require ( github.com/Masterminds/semver/v3 v3.2.1 @@ -12,16 +12,17 @@ require ( github.com/fluxcd/flagger v1.35.0 github.com/gin-gonic/gin v1.7.7 github.com/go-logr/logr v1.4.1 + github.com/gobuffalo/flect v1.0.2 github.com/gofrs/flock v0.8.1 github.com/hashicorp/terraform-json v0.22.1 github.com/mitchellh/mapstructure v1.5.0 - github.com/onsi/ginkgo/v2 v2.14.0 - github.com/onsi/gomega v1.30.0 + github.com/onsi/ginkgo/v2 v2.19.0 + github.com/onsi/gomega v1.33.1 github.com/open-policy-agent/frameworks/constraint v0.0.0-20240110234408-18fa1fc7dc06 github.com/open-policy-agent/gatekeeper/v3 v3.15.1 github.com/orcaman/concurrent-map/v2 v2.0.1 github.com/pkg/errors v0.9.1 - github.com/pluralsh/console-client-go v0.7.0 + github.com/pluralsh/console-client-go v0.11.2 github.com/pluralsh/controller-reconcile-helper v0.0.4 github.com/pluralsh/gophoenix v0.1.3-0.20231201014135-dff1b4309e34 github.com/pluralsh/polly v0.1.10 @@ -34,7 +35,8 @@ require ( github.com/vmware-tanzu/velero v1.13.0 github.com/yuin/gopher-lua v1.1.1 go.uber.org/zap v1.27.0 - golang.org/x/net v0.23.0 + golang.org/x/exp v0.0.0-20231006140011-7918f672742d + golang.org/x/net v0.25.0 gopkg.in/yaml.v3 v3.0.1 helm.sh/helm/v3 v3.14.3 k8s.io/api v0.29.2 @@ -44,8 +46,9 @@ require ( k8s.io/client-go v0.29.2 k8s.io/klog/v2 v2.110.1 k8s.io/kubectl v0.29.2 + k8s.io/utils v0.0.0-20230726121419-3b25d923346b layeh.com/gopher-luar v1.0.11 - sigs.k8s.io/cli-utils v0.35.1-0.20240103002740-7928dbf3d402 + sigs.k8s.io/cli-utils v0.36.1-0.20240525003310-87074c9799d2 sigs.k8s.io/controller-runtime v0.17.2 sigs.k8s.io/controller-runtime/tools/setup-envtest v0.0.0-20240313184151-cb5107b36b64 sigs.k8s.io/kustomize/kustomize/v5 v5.0.4-0.20230601165947-6ce0bf390ce3 @@ -108,8 +111,7 @@ require ( github.com/go-playground/locales v0.13.0 // indirect github.com/go-playground/universal-translator v0.17.0 // indirect 
github.com/go-playground/validator/v10 v10.4.1 // indirect - github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 // indirect - github.com/gobuffalo/flect v1.0.2 // indirect + github.com/go-task/slim-sprig/v3 v3.0.0 // indirect github.com/gobwas/glob v0.2.3 // indirect github.com/goccy/go-yaml v1.11.3 // indirect github.com/gogo/protobuf v1.3.2 // indirect @@ -120,7 +122,7 @@ require ( github.com/google/gnostic-models v0.6.8 // indirect github.com/google/go-cmp v0.6.0 // indirect github.com/google/gofuzz v1.2.0 // indirect - github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 // indirect + github.com/google/pprof v0.0.0-20240424215950-a892ee059fd6 // indirect github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect github.com/google/uuid v1.6.0 // indirect github.com/gorilla/mux v1.8.1 // indirect @@ -208,15 +210,14 @@ require ( go.starlark.net v0.0.0-20230525235612-a134d8f9ddca // indirect go.uber.org/multierr v1.11.0 // indirect golang.org/x/crypto v0.23.0 // indirect - golang.org/x/exp v0.0.0-20231006140011-7918f672742d // indirect - golang.org/x/mod v0.16.0 // indirect + golang.org/x/mod v0.17.0 // indirect golang.org/x/oauth2 v0.17.0 // indirect - golang.org/x/sync v0.6.0 // indirect + golang.org/x/sync v0.7.0 // indirect golang.org/x/sys v0.20.0 // indirect golang.org/x/term v0.20.0 // indirect golang.org/x/text v0.15.0 // indirect golang.org/x/time v0.5.0 // indirect - golang.org/x/tools v0.19.0 // indirect + golang.org/x/tools v0.21.0 // indirect golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect google.golang.org/appengine v1.6.8 // indirect @@ -231,7 +232,6 @@ require ( k8s.io/apiserver v0.29.0 // indirect k8s.io/component-base v0.29.2 // indirect k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 // indirect - k8s.io/utils v0.0.0-20230726121419-3b25d923346b // indirect oras.land/oras-go v1.2.4 // indirect sigs.k8s.io/controller-tools v0.14.0 // indirect sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect diff --git a/go.sum b/go.sum index c67b1568..f584b369 100644 --- a/go.sum +++ b/go.sum @@ -236,8 +236,8 @@ github.com/go-playground/validator/v10 v10.4.1/go.mod h1:nlOn6nFhuKACm19sB/8EGNn github.com/go-sql-driver/mysql v1.6.0 h1:BCTh4TKNUYmOmMUcQ3IipzF5prigylS7XXjEkfCHuOE= github.com/go-sql-driver/mysql v1.6.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= -github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI= -github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls= +github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= +github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= github.com/gobuffalo/flect v1.0.2 h1:eqjPGSo2WmjgY2XlpGwo2NXgL3RucAKo4k4qQMNA5sA= github.com/gobuffalo/flect v1.0.2/go.mod h1:A5msMlrHtLqh9umBSnvabjsMrCcCpAyzglnDvkbYKHs= github.com/gobuffalo/logger v1.0.6 h1:nnZNpxYo0zx+Aj9RfMPBm+x9zAU2OayFh/xrAWi34HU= @@ -328,8 +328,8 @@ github.com/google/pprof v0.0.0-20200708004538-1a94d8640e99/go.mod h1:ZgVRPoUq/hf github.com/google/pprof v0.0.0-20201023163331-3e6fc7fc9c4c/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/pprof v0.0.0-20201203190320-1bf35d6f28c2/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/pprof 
v0.0.0-20201218002935-b9804c9f04c2/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= -github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJYCmNdQXq6neHJOYx3V6jnqNEec= -github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= +github.com/google/pprof v0.0.0-20240424215950-a892ee059fd6 h1:k7nVchz72niMH6YLQNvHSdIE7iqsQxK1P41mySCvssg= +github.com/google/pprof v0.0.0-20240424215950-a892ee059fd6/go.mod h1:kf6iHlnVGwgKolg33glAes7Yg/8iWP8ukqeldJSO7jw= github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 h1:El6M4kTTCOh6aBiKaUGG7oYTSPP8MxqL4YI3kZKwcP4= github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510/go.mod h1:pupxD2MaaD3pAXIBCelhxNneeOaAeabZDe5s4K6zSpQ= @@ -502,10 +502,10 @@ github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE= github.com/onsi/ginkgo v1.16.5/go.mod h1:+E8gABHa3K6zRBolWtd+ROzc/U5bkGt0FwiG042wbpU= -github.com/onsi/ginkgo/v2 v2.14.0 h1:vSmGj2Z5YPb9JwCWT6z6ihcUvDhuXLc3sJiqd3jMKAY= -github.com/onsi/ginkgo/v2 v2.14.0/go.mod h1:JkUdW7JkN0V6rFvsHcJ478egV3XH9NxpD27Hal/PhZw= -github.com/onsi/gomega v1.30.0 h1:hvMK7xYz4D3HapigLTeGdId/NcfQx1VHMJc60ew99+8= -github.com/onsi/gomega v1.30.0/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ= +github.com/onsi/ginkgo/v2 v2.19.0 h1:9Cnnf7UHo57Hy3k6/m5k3dRfGTMXGvxhHFvkDTCTpvA= +github.com/onsi/ginkgo/v2 v2.19.0/go.mod h1:rlwLi9PilAFJ8jCg9UE1QP6VBpd6/xj3SRC0d6TU0To= +github.com/onsi/gomega v1.33.1 h1:dsYjIxxSR755MDmKVsaFQTE22ChNBcuuTWgkUDSubOk= +github.com/onsi/gomega v1.33.1/go.mod h1:U4R44UsT+9eLIaYRB2a5qajjtQYn0hauxvRm16AVYg0= github.com/open-policy-agent/frameworks/constraint v0.0.0-20240110234408-18fa1fc7dc06 h1:scXMWxph905CdmX5HkFJXipCtG+wT1ynxw31G9qSrMk= github.com/open-policy-agent/frameworks/constraint v0.0.0-20240110234408-18fa1fc7dc06/go.mod h1:Gl2I/z5dxvTOwa/ANYGGOkUqE4M0CbQpln0Ia/7KVro= github.com/open-policy-agent/gatekeeper/v3 v3.15.1 h1:OZwnjjos2Y5IjxoO4Y0sHIW2AApOAROzST7lwnrRzSU= @@ -532,8 +532,8 @@ github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/sftp v1.13.1/go.mod h1:3HaPG6Dq1ILlpPZRO0HVMrsydcdLt6HRDccSgb87qRg= -github.com/pluralsh/console-client-go v0.7.0 h1:7BcvftmKhssYd8F06NGsWXKxs7O3K8gQDYrQebvbmHE= -github.com/pluralsh/console-client-go v0.7.0/go.mod h1:eyCiLA44YbXiYyJh8303jk5JdPkt9McgCo5kBjk4lKo= +github.com/pluralsh/console-client-go v0.11.2 h1:mxWaLRKlYr3MsVA4ViYxj/FV/5JjCE1NvBjBlDvAbj8= +github.com/pluralsh/console-client-go v0.11.2/go.mod h1:eyCiLA44YbXiYyJh8303jk5JdPkt9McgCo5kBjk4lKo= github.com/pluralsh/controller-reconcile-helper v0.0.4 h1:1o+7qYSyoeqKFjx+WgQTxDz4Q2VMpzprJIIKShxqG0E= github.com/pluralsh/controller-reconcile-helper v0.0.4/go.mod h1:AfY0gtteD6veBjmB6jiRx/aR4yevEf6K0M13/pGan/s= github.com/pluralsh/gophoenix v0.1.3-0.20231201014135-dff1b4309e34 h1:ab2PN+6if/Aq3/sJM0AVdy1SYuMAnq4g20VaKhTm/Bw= @@ -755,8 +755,8 @@ golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.1/go.mod 
h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= -golang.org/x/mod v0.16.0 h1:QX4fJ0Rr5cPQCF7O9lh9Se4pmwfwskqZfq5moyldzic= -golang.org/x/mod v0.16.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.17.0 h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA= +golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -792,8 +792,8 @@ golang.org/x/net v0.0.0-20201224014010-6772e930b67b/go.mod h1:m0MpNAwzfU5UDzcl9v golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.2.0/go.mod h1:KqCZLdyyvdV855qA2rE3GC2aiw5xGR5TEjj8smXukLY= -golang.org/x/net v0.23.0 h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs= -golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= +golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -816,8 +816,8 @@ golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.6.0 h1:5BMeUDZ7vkXGfEr1x9B4bRcTH4lpkTkpdh0T/J+qjbQ= -golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= +golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -942,8 +942,8 @@ golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4f golang.org/x/tools v0.0.0-20210108195828-e2f9c7f1fc8e/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.1.0/go.mod h1:xkSsbof2nBLbhDlRMhhhyNLN/zl3eTqcnHD5viDpcZ0= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.19.0 h1:tfGCXNR1OsFG+sVdLAitlpjAvD/I6dHDKnYrpEZUHkw= -golang.org/x/tools v0.19.0/go.mod h1:qoJWxmGSIBmAeriMx19ogtrEPrGtDbPK634QFIcLAhc= +golang.org/x/tools v0.21.0 h1:qc0xYgIbsSDt9EyWz05J5wfa7LOVW0YTLOXrqdLAWIw= +golang.org/x/tools v0.21.0/go.mod 
h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -1119,8 +1119,8 @@ rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.28.0 h1:TgtAeesdhpm2SGwkQasmbeqDo8th5wOBA5h/AjTKA4I= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.28.0/go.mod h1:VHVDI/KrK4fjnV61bE2g3sA7tiETLn8sooImelsCx3Y= -sigs.k8s.io/cli-utils v0.35.1-0.20240103002740-7928dbf3d402 h1:fPIyWLn5o5yv8qu1FZlmbJZjnapHXZ6fudZtdOsVAmw= -sigs.k8s.io/cli-utils v0.35.1-0.20240103002740-7928dbf3d402/go.mod h1:uCFC3BPXB3xHFQyKkWUlTrncVDCKzbdDfqZqRTCrk24= +sigs.k8s.io/cli-utils v0.36.1-0.20240525003310-87074c9799d2 h1:XDyZuBsUagBzTd/z3V4S8ddbejiyA7tgmuzSfwKTkPI= +sigs.k8s.io/cli-utils v0.36.1-0.20240525003310-87074c9799d2/go.mod h1:uCFC3BPXB3xHFQyKkWUlTrncVDCKzbdDfqZqRTCrk24= sigs.k8s.io/controller-runtime v0.17.2 h1:FwHwD1CTUemg0pW2otk7/U5/i5m2ymzvOXdbeGOUvw0= sigs.k8s.io/controller-runtime v0.17.2/go.mod h1:+MngTvIQQQhfXtwfdGw/UOQ/aIaqsYywfCINOtwMO/s= sigs.k8s.io/controller-runtime/tools/setup-envtest v0.0.0-20240313184151-cb5107b36b64 h1:CxmsLoKUF1PCEoAXrbXzNud5EtSxQAdo9zbYMkY4Jko= diff --git a/internal/controller/argorollout_controller.go b/internal/controller/argorollout_controller.go index f286b051..7f2964b1 100644 --- a/internal/controller/argorollout_controller.go +++ b/internal/controller/argorollout_controller.go @@ -9,25 +9,22 @@ import ( clientset "github.com/argoproj/argo-rollouts/pkg/client/clientset/versioned/typed/rollouts/v1alpha1" "github.com/argoproj/argo-rollouts/pkg/kubectl-argo-rollouts/cmd/abort" + "sigs.k8s.io/cli-utils/pkg/inventory" "github.com/argoproj/argo-rollouts/pkg/apis/rollouts" rolloutv1alpha1 "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" roclientset "github.com/argoproj/argo-rollouts/pkg/client/clientset/versioned" console "github.com/pluralsh/console-client-go" - "github.com/pluralsh/deployment-operator/internal/utils" - "github.com/pluralsh/deployment-operator/pkg/client" - "github.com/pluralsh/deployment-operator/pkg/controller/service" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/dynamic" "k8s.io/client-go/kubernetes" ctrl "sigs.k8s.io/controller-runtime" k8sClient "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" -) -const ( - inventoryAnnotationName = "config.k8s.io/owning-inventory" - closed = "closed" + "github.com/pluralsh/deployment-operator/internal/utils" + "github.com/pluralsh/deployment-operator/pkg/client" + "github.com/pluralsh/deployment-operator/pkg/controller/service" ) var requeueRollout = ctrl.Result{RequeueAfter: time.Second * 5} @@ -60,7 +57,7 @@ func (r *ArgoRolloutReconciler) Reconcile(ctx context.Context, req ctrl.Request) return ctrl.Result{}, nil } - serviceID, ok := rollout.Annotations[inventoryAnnotationName] + serviceID, ok := rollout.Annotations[inventory.OwningInventoryKey] if !ok { return ctrl.Result{}, nil } diff --git a/internal/controller/customhealth_controller.go b/internal/controller/customhealth_controller.go index bc46a5b5..4f9f73f4 100644 --- a/internal/controller/customhealth_controller.go +++ 
b/internal/controller/customhealth_controller.go @@ -18,9 +18,11 @@ package controller import ( "context" + "fmt" "github.com/pluralsh/deployment-operator/api/v1alpha1" "github.com/pluralsh/deployment-operator/internal/utils" + "github.com/pluralsh/deployment-operator/pkg/common" "github.com/pluralsh/deployment-operator/pkg/controller/service" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -48,11 +50,16 @@ type CustomHealthReconciler struct { // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.16.3/pkg/reconcile func (r *CustomHealthReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ reconcile.Result, reterr error) { logger := log.FromContext(ctx) + if req.Name != "default" { + logger.Error(fmt.Errorf("expected 'default' name, got %s", req.Name), "") + return reconcile.Result{}, nil + } script := &v1alpha1.CustomHealth{} if err := r.Get(ctx, req.NamespacedName, script); err != nil { logger.Error(err, "Unable to fetch LuaScript") return ctrl.Result{}, client.IgnoreNotFound(err) } + utils.MarkCondition(script.SetCondition, v1alpha1.ReadyConditionType, v1.ConditionFalse, v1alpha1.ReadyConditionReason, "") // Ensure that status updates will always be persisted when exiting this function. scope, err := NewClusterScope(ctx, r.Client, script) @@ -67,7 +74,7 @@ func (r *CustomHealthReconciler) Reconcile(ctx context.Context, req ctrl.Request } }() - r.ServiceReconciler.SetLuaScript(script.Spec.Script) + common.GetLuaScript().SetValue(script.Spec.Script) utils.MarkCondition(script.SetCondition, v1alpha1.ReadyConditionType, v1.ConditionTrue, v1alpha1.ReadyConditionReason, "") return ctrl.Result{}, nil diff --git a/internal/controller/customhealth_controller_test.go b/internal/controller/customhealth_controller_test.go index ded2bf76..57ca7e8d 100644 --- a/internal/controller/customhealth_controller_test.go +++ b/internal/controller/customhealth_controller_test.go @@ -6,6 +6,7 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" "github.com/pluralsh/deployment-operator/api/v1alpha1" + "github.com/pluralsh/deployment-operator/pkg/common" "github.com/pluralsh/deployment-operator/pkg/controller/service" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -16,7 +17,7 @@ import ( var _ = Describe("Customhealt Controller", Ordered, func() { Context("When reconciling a resource", func() { const ( - resourceName = "test" + resourceName = "default" namespace = "default" script = "test script" ) @@ -76,7 +77,7 @@ var _ = Describe("Customhealt Controller", Ordered, func() { }) Expect(err).NotTo(HaveOccurred()) - Expect(sr.LuaScript).Should(Equal(script)) + Expect(common.GetLuaScript().GetValue()).Should(Equal(script)) }) }) diff --git a/internal/controller/pipelinegate_controller_test.go b/internal/controller/pipelinegate_controller_test.go index 2163250a..2eca3c29 100644 --- a/internal/controller/pipelinegate_controller_test.go +++ b/internal/controller/pipelinegate_controller_test.go @@ -8,10 +8,6 @@ import ( . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" console "github.com/pluralsh/console-client-go" - "github.com/pluralsh/deployment-operator/api/v1alpha1" - "github.com/pluralsh/deployment-operator/pkg/client" - "github.com/pluralsh/deployment-operator/pkg/test/common" - "github.com/pluralsh/deployment-operator/pkg/test/mocks" "github.com/samber/lo" "github.com/stretchr/testify/mock" batchv1 "k8s.io/api/batch/v1" @@ -20,6 +16,11 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "github.com/pluralsh/deployment-operator/api/v1alpha1" + "github.com/pluralsh/deployment-operator/pkg/client" + "github.com/pluralsh/deployment-operator/pkg/test/common" + "github.com/pluralsh/deployment-operator/pkg/test/mocks" ) var _ = Describe("PipelineGate Controller", Ordered, func() { diff --git a/internal/controller/stackrunjob_controller_test.go b/internal/controller/stackrunjob_controller_test.go index 606d920e..fe3de5bc 100644 --- a/internal/controller/stackrunjob_controller_test.go +++ b/internal/controller/stackrunjob_controller_test.go @@ -8,8 +8,6 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" console "github.com/pluralsh/console-client-go" - "github.com/pluralsh/deployment-operator/pkg/controller/stacks" - "github.com/pluralsh/deployment-operator/pkg/test/mocks" "github.com/samber/lo" "github.com/stretchr/testify/mock" batchv1 "k8s.io/api/batch/v1" @@ -18,6 +16,9 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "github.com/pluralsh/deployment-operator/pkg/controller/stacks" + "github.com/pluralsh/deployment-operator/pkg/test/mocks" ) var _ = Describe("Stack Run Job Controller", Ordered, func() { diff --git a/internal/controller/status_controller.go b/internal/controller/status_controller.go new file mode 100644 index 00000000..82a22204 --- /dev/null +++ b/internal/controller/status_controller.go @@ -0,0 +1,90 @@ +package controller + +import ( + "context" + + corev1 "k8s.io/api/core/v1" + cliutilscommon "sigs.k8s.io/cli-utils/pkg/common" + "sigs.k8s.io/cli-utils/pkg/inventory" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + k8sClient "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "github.com/pluralsh/deployment-operator/cmd/agent/args" + "github.com/pluralsh/deployment-operator/pkg/cache" + "github.com/pluralsh/deployment-operator/pkg/common" +) + +type StatusReconciler struct { + k8sClient.Client + inventoryCache cache.InventoryResourceKeys +} + +func (r *StatusReconciler) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { + logger := log.FromContext(ctx) + + configMap := &corev1.ConfigMap{} + if err := r.Get(ctx, req.NamespacedName, configMap); err != nil { + logger.Info("unable to fetch configmap") + return ctrl.Result{}, k8sClient.IgnoreNotFound(err) + } + + if !configMap.DeletionTimestamp.IsZero() { + return r.handleDelete(configMap) + } + + inv, err := common.ToUnstructured(configMap) + if err != nil { + return ctrl.Result{}, err + } + + set, err := inventory.WrapInventoryObj(inv).Load() + if err != nil { + return ctrl.Result{}, err + } + + invID := r.inventoryID(configMap) + + // If services arg is provided, we can skip + // services that are not on the list. 
+ if args.SkipService(invID) { + return ctrl.Result{}, nil + } + + r.inventoryCache[invID] = cache.ResourceKeyFromObjMetadata(set) + cache.GetResourceCache().Register(r.inventoryCache.Values().TypeIdentifierSet()) + + return ctrl.Result{}, nil +} + +// SetupWithManager sets up the controller with the Manager. +func (r *StatusReconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&corev1.ConfigMap{}). + WithEventFilter(predicate.NewPredicateFuncs(func(o client.Object) bool { + _, exists := o.GetLabels()[cliutilscommon.InventoryLabel] + return exists + })). + Complete(r) +} + +func (r *StatusReconciler) inventoryID(c *corev1.ConfigMap) string { + return c.Labels[cliutilscommon.InventoryLabel] +} + +func (r *StatusReconciler) handleDelete(c *corev1.ConfigMap) (ctrl.Result, error) { + inventoryID := r.inventoryID(c) + delete(r.inventoryCache, inventoryID) + + return ctrl.Result{}, nil +} + +func NewStatusReconciler(c client.Client) (*StatusReconciler, error) { + return &StatusReconciler{ + Client: c, + inventoryCache: make(cache.InventoryResourceKeys), + }, nil +} diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go index e00be180..749422b6 100644 --- a/internal/controller/suite_test.go +++ b/internal/controller/suite_test.go @@ -24,12 +24,13 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - deploymentsv1alpha1 "github.com/pluralsh/deployment-operator/api/v1alpha1" "k8s.io/client-go/kubernetes/scheme" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/envtest" logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/log/zap" + + deploymentsv1alpha1 "github.com/pluralsh/deployment-operator/api/v1alpha1" ) // These tests use Ginkgo (BDD-style Go testing framework). 
Refer to diff --git a/internal/helpers/kubernetes.go b/internal/helpers/kubernetes.go new file mode 100644 index 00000000..de1938d6 --- /dev/null +++ b/internal/helpers/kubernetes.go @@ -0,0 +1,16 @@ +package helpers + +import ( + "strings" + + "github.com/gobuffalo/flect" + "k8s.io/apimachinery/pkg/runtime/schema" +) + +func GVRFromGVK(gvk schema.GroupVersionKind) schema.GroupVersionResource { + return schema.GroupVersionResource{ + Group: gvk.Group, + Version: gvk.Version, + Resource: flect.Pluralize(strings.ToLower(gvk.Kind)), + } +} diff --git a/internal/kstatus/watcher/common.go b/internal/kstatus/watcher/common.go new file mode 100644 index 00000000..ba83ab69 --- /dev/null +++ b/internal/kstatus/watcher/common.go @@ -0,0 +1,95 @@ +package watcher + +import ( + "strings" + + "github.com/gobuffalo/flect" + "k8s.io/apimachinery/pkg/runtime/schema" + "sigs.k8s.io/cli-utils/pkg/kstatus/polling/event" + kwatcher "sigs.k8s.io/cli-utils/pkg/kstatus/watcher" + "sigs.k8s.io/cli-utils/pkg/object" +) + +func handleFatalError(err error) <-chan event.Event { + eventCh := make(chan event.Event) + go func() { + defer close(eventCh) + eventCh <- event.Event{ + Type: event.ErrorEvent, + Error: err, + } + }() + return eventCh +} + +func autoSelectRESTScopeStrategy(ids object.ObjMetadataSet) kwatcher.RESTScopeStrategy { + if len(uniqueNamespaces(ids)) > 1 { + return kwatcher.RESTScopeRoot + } + return kwatcher.RESTScopeNamespace +} + +func rootScopeGKNs(ids object.ObjMetadataSet) []GroupKindNamespace { + gks := uniqueGKs(ids) + targets := make([]GroupKindNamespace, len(gks)) + for i, gk := range gks { + targets[i] = GroupKindNamespace{ + Group: gk.Group, + Kind: gk.Kind, + Namespace: "", + } + } + return targets +} + +func namespaceScopeGKNs(ids object.ObjMetadataSet) []GroupKindNamespace { + return uniqueGKNs(ids) +} + +// uniqueGKNs returns a set of unique GroupKindNamespaces from a set of object identifiers. +func uniqueGKNs(ids object.ObjMetadataSet) []GroupKindNamespace { + gknMap := make(map[GroupKindNamespace]struct{}) + for _, id := range ids { + gkn := GroupKindNamespace{Group: id.GroupKind.Group, Kind: id.GroupKind.Kind, Namespace: id.Namespace} + gknMap[gkn] = struct{}{} + } + gknList := make([]GroupKindNamespace, 0, len(gknMap)) + for gk := range gknMap { + gknList = append(gknList, gk) + } + return gknList +} + +// uniqueGKs returns a set of unique GroupKinds from a set of object identifiers. 
+func uniqueGKs(ids object.ObjMetadataSet) []schema.GroupKind { + gkMap := make(map[schema.GroupKind]struct{}) + for _, id := range ids { + gkn := schema.GroupKind{Group: id.GroupKind.Group, Kind: id.GroupKind.Kind} + gkMap[gkn] = struct{}{} + } + gkList := make([]schema.GroupKind, 0, len(gkMap)) + for gk := range gkMap { + gkList = append(gkList, gk) + } + return gkList +} + +func uniqueNamespaces(ids object.ObjMetadataSet) []string { + nsMap := make(map[string]struct{}) + for _, id := range ids { + nsMap[id.Namespace] = struct{}{} + } + nsList := make([]string, 0, len(nsMap)) + for ns := range nsMap { + nsList = append(nsList, ns) + } + return nsList +} + +func GvrFromGvk(gvk schema.GroupVersionKind) schema.GroupVersionResource { + return schema.GroupVersionResource{ + Group: gvk.Group, + Version: gvk.Version, + Resource: flect.Pluralize(strings.ToLower(gvk.Kind)), + } +} diff --git a/internal/kstatus/watcher/event_funnel.go b/internal/kstatus/watcher/event_funnel.go new file mode 100644 index 00000000..52e175ab --- /dev/null +++ b/internal/kstatus/watcher/event_funnel.go @@ -0,0 +1,119 @@ +package watcher + +import ( + "context" + "fmt" + + "k8s.io/klog/v2" + "sigs.k8s.io/cli-utils/pkg/kstatus/polling/event" +) + +// eventFunnel wraps a list of event channels and multiplexes them down to a +// single event channel. New input channels can be added at runtime, and the +// output channel will remain open until all input channels are closed. +type eventFunnel struct { + // ctx closure triggers shutdown + ctx context.Context + // outCh is the funnel that consumes all events from input channels + outCh chan event.Event + // doneCh is closed after outCh is closed. + // This allows blocking until done without consuming events. + doneCh chan struct{} + // counterCh is used to track the number of open input channels. + counterCh chan int +} + +func newEventFunnel(ctx context.Context) *eventFunnel { + funnel := &eventFunnel{ + ctx: ctx, + outCh: make(chan event.Event), + doneCh: make(chan struct{}), + counterCh: make(chan int), + } + // Wait until the context is done and all input channels are closed. + // Then close out and done channels to signal completion. + go func() { + defer func() { + // Don't close counterCh, otherwise AddInputChannel may panic. + klog.V(5).Info("Closing funnel") + close(funnel.outCh) + close(funnel.doneCh) + }() + ctxDoneCh := ctx.Done() + + // Count input channels that have been added and not closed. + inputs := 0 + for { + select { + case delta := <-funnel.counterCh: + inputs += delta + klog.V(5).Infof("Funnel input channels (%+d): %d", delta, inputs) + case <-ctxDoneCh: + // Stop waiting for context closure. + // Nil channel avoids busy waiting. + ctxDoneCh = nil + } + if ctxDoneCh == nil && inputs <= 0 { + // Context is closed and all input channels are closed. + break + } + } + }() + return funnel +} + +// Add a new input channel to the multiplexer. +func (m *eventFunnel) AddInputChannel(inCh <-chan event.Event) error { + select { + case <-m.ctx.Done(): // skip, if context is closed + return &EventFunnelClosedError{ContextError: m.ctx.Err()} + case m.counterCh <- 1: // increment counter + } + + // Create a multiplexer for each new event channel. + go m.drain(inCh, m.outCh) + return nil +} + +// OutputChannel channel receives all events sent to input channels. +// This channel is closed after all input channels are closed. +func (m *eventFunnel) OutputChannel() <-chan event.Event { + return m.outCh +} + +// Done channel is closed after the Output channel is closed. 
+// This allows blocking until done without consuming events. +// If no input channels have been added yet, the done channel will be nil. +func (m *eventFunnel) Done() <-chan struct{} { + return m.doneCh +} + +// drain a single input channel to a single output channel. +func (m *eventFunnel) drain(inCh <-chan event.Event, outCh chan<- event.Event) { + defer func() { + m.counterCh <- -1 // decrement counter + }() + for event := range inCh { + outCh <- event + } +} + +type EventFunnelClosedError struct { + ContextError error +} + +func (e *EventFunnelClosedError) Error() string { + return fmt.Sprintf("event funnel closed: %v", e.ContextError) +} + +func (e *EventFunnelClosedError) Is(err error) bool { + fcErr, ok := err.(*EventFunnelClosedError) + if !ok { + return false + } + return e.ContextError == fcErr.ContextError +} + +func (e *EventFunnelClosedError) Unwrap() error { + return e.ContextError +} diff --git a/internal/kstatus/watcher/object_status_reporter.go b/internal/kstatus/watcher/object_status_reporter.go new file mode 100644 index 00000000..ca16e776 --- /dev/null +++ b/internal/kstatus/watcher/object_status_reporter.go @@ -0,0 +1,771 @@ +// Copyright 2022 The Kubernetes Authors. +// SPDX-License-Identifier: Apache-2.0 + +package watcher + +import ( + "context" + "errors" + "fmt" + "sync" + "time" + + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/apimachinery/pkg/watch" + "k8s.io/client-go/dynamic" + "k8s.io/client-go/tools/cache" + "k8s.io/klog/v2" + "k8s.io/utils/clock" + "sigs.k8s.io/cli-utils/pkg/kstatus/polling/engine" + "sigs.k8s.io/cli-utils/pkg/kstatus/polling/event" + "sigs.k8s.io/cli-utils/pkg/kstatus/status" + kwatcher "sigs.k8s.io/cli-utils/pkg/kstatus/watcher" + "sigs.k8s.io/cli-utils/pkg/object" + + "github.com/pluralsh/deployment-operator/internal/kubernetes/watcher" + "github.com/pluralsh/deployment-operator/internal/metrics" + "github.com/pluralsh/deployment-operator/pkg/common" +) + +// GroupKindNamespace identifies an informer target. +// When used as an informer target, the namespace is optional. +// When the namespace is empty for namespaced resources, all namespaces are watched. +type GroupKindNamespace struct { + Group string + Kind string + Namespace string +} + +// String returns a serialized form suitable for logging. +func (gkn GroupKindNamespace) String() string { + return fmt.Sprintf("%s/%s/namespaces/%s", + gkn.Group, gkn.Kind, gkn.Namespace) +} + +func (gkn GroupKindNamespace) GroupKind() schema.GroupKind { + return schema.GroupKind{Group: gkn.Group, Kind: gkn.Kind} +} + +// ObjectStatusReporter reports on updates to objects (instances) using a +// network of informers to watch one or more resources (types). +// +// Unlike SharedIndexInformer, ObjectStatusReporter... +// - Reports object status. +// - Can watch multiple resource types simultaneously. +// - Specific objects can be ignored for efficiency by specifying an ObjectFilter. +// - Resolves GroupKinds into Resources at runtime, to pick up newly added +// resources. +// - Starts and Stops individual watches automaically to reduce errors when a +// CRD or Namespace is deleted. +// - Resources can be watched in root-scope mode or namespace-scope mode, +// allowing the caller to optimize for efficiency or least-privilege. 
+// - Gives unschedulable Pods (and objects that generate them) a 15s grace +// period before reporting them as Failed. +// - Resets the RESTMapper cache automatically when CRDs are modified. +// +// ObjectStatusReporter is NOT repeatable. It will panic if started more than +// once. If you need a repeatable factory, use DefaultStatusWatcher. +// +// Ref: https://github.com/kubernetes-sigs/cli-utils/blob/v0.37.1/pkg/kstatus/watcher/object_status_reporter.go +type ObjectStatusReporter struct { + // Mapper is used to map from GroupKind to GroupVersionKind. + Mapper meta.RESTMapper + + // StatusReader specifies a custom implementation of the + // engine.StatusReader interface that will be used to compute reconcile + // status for resource objects. + StatusReader engine.StatusReader + + // ClusterReader is used to look up generated objects on-demand. + // Generated objects (ex: Deployment > ReplicaSet > Pod) are sometimes + // required for computing parent object status, to compensate for + // controllers that aren't following status conventions. + ClusterReader engine.ClusterReader + + // GroupKinds is the list of GroupKinds to watch. + Targets []GroupKindNamespace + + // ObjectFilter is used to decide which objects to ignore. + ObjectFilter kwatcher.ObjectFilter + + // RESTScope specifies whether to ListAndWatch resources at the namespace + // or cluster (root) level. Using root scope is more efficient, but + // namespace scope may require fewer permissions. + RESTScope meta.RESTScope + + // DynamicClient is used to watch of resource objects. + DynamicClient dynamic.Interface + + // LabelSelector is used to apply server-side filtering on watched resources. + LabelSelector labels.Selector + + // lock guards modification of the subsequent stateful fields + lock sync.Mutex + + // gk2gkn maps GKs to GKNs to make it easy/cheap to look up. + gk2gkn map[schema.GroupKind]map[GroupKindNamespace]struct{} + + // ns2gkn maps Namespaces to GKNs to make it easy/cheap to look up. + ns2gkn map[string]map[GroupKindNamespace]struct{} + + // watcherRefs tracks which informers have been started and stopped + watcherRefs map[GroupKindNamespace]*watcherReference + + // context will be cancelled when the reporter should stop. + context context.Context + + // cancel function that stops the context. + // This should only be called after the terminal error event has been sent. + cancel context.CancelFunc + + // funnel multiplexes multiple input channels into one output channel, + // allowing input channels to be added and removed at runtime. + funnel *eventFunnel + + // taskManager makes it possible to cancel scheduled tasks. + taskManager *taskManager + + started bool + stopped bool +} + +func (in *ObjectStatusReporter) Start(ctx context.Context) <-chan event.Event { + in.lock.Lock() + defer in.lock.Unlock() + + if in.started { + panic("ObjectStatusInformer cannot be restarted") + } + + in.taskManager = &taskManager{} + + // Map GroupKinds to sets of GroupKindNamespaces for fast lookups. + // This is the only time we modify the map. + // So it should be safe to read from multiple threads after this. + in.gk2gkn = make(map[schema.GroupKind]map[GroupKindNamespace]struct{}) + for _, gkn := range in.Targets { + gk := gkn.GroupKind() + m, found := in.gk2gkn[gk] + if !found { + m = make(map[GroupKindNamespace]struct{}) + in.gk2gkn[gk] = m + } + m[gkn] = struct{}{} + } + + // Map namespaces to sets of GroupKindNamespaces for fast lookups. + // This is the only time we modify the map. 
+ // So it should be safe to read from multiple threads after this. + in.ns2gkn = make(map[string]map[GroupKindNamespace]struct{}) + for _, gkn := range in.Targets { + ns := gkn.Namespace + m, found := in.ns2gkn[ns] + if !found { + m = make(map[GroupKindNamespace]struct{}) + in.ns2gkn[ns] = m + } + m[gkn] = struct{}{} + } + + // Initialize the informer map with references to track their start/stop. + // This is the only time we modify the map. + // So it should be safe to read from multiple threads after this. + if in.watcherRefs == nil { + in.watcherRefs = make(map[GroupKindNamespace]*watcherReference) + } + + for _, gkn := range in.Targets { + if _, exists := in.watcherRefs[gkn]; !exists { + in.watcherRefs[gkn] = &watcherReference{} + } + } + + ctx, cancel := context.WithCancel(ctx) + in.context = ctx + in.cancel = cancel + + // Use an event funnel to multiplex events through multiple input channels + // into out output channel. We can't use the normal fan-in pattern, because + // we need to be able to add and remove new input channels at runtime, as + // new informers are created and destroyed. + in.funnel = newEventFunnel(ctx) + + // Send start requests. + for _, gkn := range in.Targets { + in.startInformer(gkn) + } + + in.started = true + + // Block until the event funnel is closed. + // The event funnel will close after all the informer channels are closed. + // The informer channels will close after the informers have stopped. + // The informers will stop after their context is cancelled. + go func() { + <-in.funnel.Done() + + in.lock.Lock() + defer in.lock.Unlock() + in.stopped = true + }() + + // Wait until all informers are synced or stopped, then send a SyncEvent. + syncEventCh := make(chan event.Event) + err := in.funnel.AddInputChannel(syncEventCh) + if err != nil { + // Reporter already stopped. + return handleFatalError(fmt.Errorf("reporter failed to start: %w", err)) + } + go func() { + defer close(syncEventCh) + // TODO: should we use something less aggressive, like wait.BackoffUntil? + if cache.WaitForCacheSync(ctx.Done(), in.HasSynced) { + syncEventCh <- event.Event{ + Type: event.SyncEvent, + } + } + }() + + return in.funnel.OutputChannel() +} + +// Stop triggers the cancellation of the reporter context, and closure of the +// event channel without sending an error event. +func (in *ObjectStatusReporter) Stop() { + klog.V(4).Info("Stopping reporter") + in.cancel() +} + +// HasSynced returns true if all the started informers have been synced. +// +// Use the following to block waiting for synchronization: +// synced := cache.WaitForCacheSync(stopCh, informer.HasSynced) +func (in *ObjectStatusReporter) HasSynced() bool { + in.lock.Lock() + defer in.lock.Unlock() + + if in.stopped || !in.started { + return false + } + + pending := make([]GroupKindNamespace, 0, len(in.watcherRefs)) + for gke, informer := range in.watcherRefs { + if informer.HasStarted() && !informer.HasSynced() { + pending = append(pending, gke) + } + } + if len(pending) > 0 { + klog.V(5).Infof("Informers pending synchronization: %v", pending) + return false + } + return true +} + +// startInformer adds the specified GroupKindNamespace to the start channel to +// be started asynchronously. 
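From the caller's side, `Start` exposes a single channel: one `SyncEvent` once `WaitForCacheSync` reports every started watch as synced, `ResourceUpdateEvent`s as object statuses change, and an `ErrorEvent` before a fatal shutdown. A hedged consumer sketch, assuming same-package placement; the function name is an assumption.

```go
package watcher

import (
	"context"
	"fmt"

	"sigs.k8s.io/cli-utils/pkg/kstatus/polling/event"
)

// exampleConsumeReporter is illustrative only: it shows the event contract a
// caller of (*ObjectStatusReporter).Start can rely on.
func exampleConsumeReporter(ctx context.Context, reporter *ObjectStatusReporter) error {
	for e := range reporter.Start(ctx) {
		switch e.Type {
		case event.SyncEvent:
			// Sent once, after every started watch has synced.
			fmt.Println("all watches synced")
		case event.ResourceUpdateEvent:
			fmt.Printf("%s -> %s\n", e.Resource.Identifier, e.Resource.Status)
		case event.ErrorEvent:
			// Terminal: the reporter stops itself after reporting the error.
			return e.Error
		}
	}
	return nil
}
```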
+func (in *ObjectStatusReporter) startInformer(gkn GroupKindNamespace) { + ctx, ok := in.watcherRefs[gkn].Start(in.context) + if !ok { + klog.V(5).Infof("Watch start skipped (already started): %v", gkn) + // already started + return + } + go in.startInformerWithRetry(ctx, gkn) +} + +// stopInformer stops the informer watching the specified GroupKindNamespace. +func (in *ObjectStatusReporter) stopInformer(gkn GroupKindNamespace) { + in.watcherRefs[gkn].Stop() +} + +func (in *ObjectStatusReporter) startInformerWithRetry(ctx context.Context, gkn GroupKindNamespace) { + realClock := &clock.RealClock{} + // TODO nolint can be removed once https://github.com/kubernetes/kubernetes/issues/118638 is resolved + backoffManager := wait.NewExponentialBackoffManager(800*time.Millisecond, 30*time.Second, 2*time.Minute, 2.0, 1.0, realClock) //nolint:staticcheck + retryCtx, retryCancel := context.WithCancel(ctx) + + wait.BackoffUntil(func() { + err := in.startInformerNow( + ctx, + gkn, + ) + if err != nil { + if meta.IsNoMatchError(err) { + // CRD (or api extension) not installed + // TODO: retry if CRDs are not being watched + klog.V(3).Infof("Watch start error (blocking until CRD is added): %v: %v", gkn, err) + // Cancel the parent context, which will stop the retries too. + in.stopInformer(gkn) + return + } + + // Create a temporary input channel to send the error event. + eventCh := make(chan event.Event) + defer close(eventCh) + err := in.funnel.AddInputChannel(eventCh) + if err != nil { + // Reporter already stopped. + // This is fine. 🔥 + klog.V(5).Infof("Informer failed to start: %v", err) + return + } + // Send error event and stop the reporter! + in.handleFatalError(eventCh, err) + return + } + // Success! - Stop retrying + retryCancel() + }, backoffManager, true, retryCtx.Done()) +} + +func (in *ObjectStatusReporter) newWatcher(ctx context.Context, gkn GroupKindNamespace) (watch.Interface, error) { + gk := schema.GroupKind{Group: gkn.Group, Kind: gkn.Kind} + mapping, err := in.Mapper.RESTMapping(gk) + if err != nil { + return nil, err + } + + gvr := GvrFromGvk(mapping.GroupVersionKind) + + var labelSelectorString string + if in.LabelSelector != nil { + labelSelectorString = in.LabelSelector.String() + } + + return watcher.NewRetryListerWatcher( + watcher.WithListWatchFunc( + func(options metav1.ListOptions) (runtime.Object, error) { + options.LabelSelector = labelSelectorString + return in.DynamicClient.Resource(gvr).List(ctx, options) + }, func(options metav1.ListOptions) (watch.Interface, error) { + options.LabelSelector = labelSelectorString + return in.DynamicClient.Resource(gvr).Watch(ctx, options) + }), + watcher.WithID(gkn.String()), + ) +} + +// startInformerNow starts an informer to watch for changes to a +// GroupKindNamespace. Changes are filtered and passed by event channel into the +// funnel. Each update event includes the computed status of the object. +// An error is returned if the informer could not be created. +func (in *ObjectStatusReporter) startInformerNow( + ctx context.Context, + gkn GroupKindNamespace, +) error { + w, err := in.newWatcher(ctx, gkn) + if err != nil { + return err + } + + in.watcherRefs[gkn].SetInformer(w) + eventCh := make(chan event.Event) + + // Add this event channel to the output multiplexer + err = in.funnel.AddInputChannel(eventCh) + if err != nil { + // Reporter already stopped. + return fmt.Errorf("informer failed to build event handler: %w\n", err) + } + + // Start the informer in the background. 
+ // Informer will be stopped when the context is cancelled. + go func() { + klog.V(3).Infof("Watch starting: %v", gkn) + metrics.Record().ResourceCacheWatchStart(gkn.String()) + in.Run(ctx.Done(), w.ResultChan(), in.eventHandler(ctx, eventCh)) + metrics.Record().ResourceCacheWatchEnd(gkn.String()) + klog.V(3).Infof("Watch stopped: %v", gkn) + // Signal to the caller there will be no more events for this GroupKind. + in.watcherRefs[gkn].Stop() + close(eventCh) + }() + + return nil +} + +func (in *ObjectStatusReporter) Run(stopCh <-chan struct{}, echan <-chan watch.Event, rh cache.ResourceEventHandler) { + for { + select { + case <-stopCh: + return + case e, ok := <-echan: + if !ok { + klog.Error("event channel closed") + return + } + + switch e.Type { + case watch.Added: + un, _ := common.ToUnstructured(e.Object) + rh.OnAdd(un, true) + case watch.Modified: + un, _ := common.ToUnstructured(e.Object) + rh.OnUpdate(nil, un) + case watch.Deleted: + un, _ := common.ToUnstructured(e.Object) + rh.OnDelete(un) + case watch.Error: + default: + klog.V(5).InfoS("unexpected watch event", "event", e) + } + } + } +} + +func (in *ObjectStatusReporter) forEachTargetWithGroupKind(gk schema.GroupKind, fn func(GroupKindNamespace)) { + for gkn := range in.gk2gkn[gk] { + fn(gkn) + } +} + +func (in *ObjectStatusReporter) forEachTargetWithNamespace(ns string, fn func(GroupKindNamespace)) { + for gkn := range in.ns2gkn[ns] { + fn(gkn) + } +} + +// readStatusFromObject is a convenience function to read object status with a +// StatusReader using a ClusterReader to retrieve generated objects. +func (in *ObjectStatusReporter) readStatusFromObject( + ctx context.Context, + obj *unstructured.Unstructured, +) (*event.ResourceStatus, error) { + return in.StatusReader.ReadStatusForObject(ctx, in.ClusterReader, obj) +} + +// readStatusFromCluster is a convenience function to read object status with a +// StatusReader using a ClusterReader to retrieve the object and its generated +// objects. +func (in *ObjectStatusReporter) readStatusFromCluster( + ctx context.Context, + id object.ObjMetadata, +) (*event.ResourceStatus, error) { + return in.StatusReader.ReadStatus(ctx, in.ClusterReader, id) +} + +// deletedStatus builds a ResourceStatus for a deleted object. +// +// StatusReader.ReadStatusForObject doesn't handle nil objects as input. So +// this builds the status manually. +// TODO: find a way to delegate this back to the status package. +func deletedStatus(id object.ObjMetadata) *event.ResourceStatus { + // Status is always NotFound after deltion. + // Passed obj represents the last known state, not the current state. + result := &event.ResourceStatus{ + Identifier: id, + Status: status.NotFoundStatus, + Message: "Resource not found", + } + + return &event.ResourceStatus{ + Identifier: id, + Resource: nil, // deleted object has no + Status: result.Status, + Message: result.Message, + // If deleted with foreground deletion, a finalizer will have blocked + // deletion until all the generated resources are deleted. + // TODO: Handle lookup of generated resources when not using foreground deletion. + GeneratedResources: nil, + } +} + +// eventHandler builds an event handler to compute object status. +// Returns an event channel on which these stats updates will be reported. 
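The `Run` loop above is a thin adapter from raw `watch.Event`s to `cache.ResourceEventHandler` callbacks. A small sketch that drives it with one hand-built event to make the translation concrete; the ConfigMap payload and helper name are hypothetical, and same-package placement is assumed.

```go
package watcher

import (
	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
	"k8s.io/apimachinery/pkg/watch"
	"k8s.io/client-go/tools/cache"
	"k8s.io/klog/v2"
)

// exampleRunTranslation is illustrative only: it feeds one hand-built watch
// event through Run to show how raw watch.Events become handler callbacks.
func exampleRunTranslation(in *ObjectStatusReporter) {
	events := make(chan watch.Event, 1)
	events <- watch.Event{
		Type: watch.Added,
		Object: &unstructured.Unstructured{Object: map[string]interface{}{
			"apiVersion": "v1",
			"kind":       "ConfigMap",
			"metadata":   map[string]interface{}{"name": "demo", "namespace": "default"},
		}},
	}
	close(events) // Run returns once the event channel is drained and closed.

	in.Run(make(chan struct{}), events, cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			klog.Infof("added: %s", obj.(*unstructured.Unstructured).GetName())
		},
	})
}
```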
+func (in *ObjectStatusReporter) eventHandler( + ctx context.Context, + eventCh chan<- event.Event, +) cache.ResourceEventHandler { + var handler cache.ResourceEventHandlerFuncs + + handler.AddFunc = func(iobj interface{}) { + // Bail early if the context is cancelled, to avoid unnecessary work. + if ctx.Err() != nil { + return + } + + obj, ok := iobj.(*unstructured.Unstructured) + if !ok { + panic(fmt.Sprintf("AddFunc received unexpected object type %T", iobj)) + } + id := object.UnstructuredToObjMetadata(obj) + if in.ObjectFilter != nil && in.ObjectFilter.Filter(obj) { + klog.V(7).Infof("Watch Event Skipped: AddFunc: %s", id) + return + } + klog.V(5).Infof("AddFunc: Computing status for object: %s", id) + + // cancel any scheduled status update for this object + in.taskManager.Cancel(id) + + rs, err := in.readStatusFromObject(ctx, obj) + if err != nil { + // Send error event and stop the reporter! + in.handleFatalError(eventCh, fmt.Errorf("failed to compute object status: %s: %w", id, err)) + return + } + + if object.IsNamespace(obj) { + klog.V(5).Infof("AddFunc: Namespace added: %v", id) + in.onNamespaceAdd(obj) + } else if object.IsCRD(obj) { + klog.V(5).Infof("AddFunc: CRD added: %v", id) + in.onCRDAdd(obj) + } + + if isObjectUnschedulable(rs) { + klog.V(5).Infof("AddFunc: object unschedulable: %v", id) + // schedule delayed status update + in.taskManager.Schedule(ctx, id, status.ScheduleWindow, + in.newStatusCheckTaskFunc(ctx, eventCh, id)) + } + + klog.V(7).Infof("AddFunc: sending update event: %v", rs) + eventCh <- event.Event{ + Type: event.ResourceUpdateEvent, + Resource: rs, + } + } + + handler.UpdateFunc = func(_, iobj interface{}) { + // Bail early if the context is cancelled, to avoid unnecessary work. + if ctx.Err() != nil { + return + } + + obj, ok := iobj.(*unstructured.Unstructured) + if !ok { + panic(fmt.Sprintf("UpdateFunc received unexpected object type %T", iobj)) + } + id := object.UnstructuredToObjMetadata(obj) + if in.ObjectFilter != nil && in.ObjectFilter.Filter(obj) { + klog.V(7).Infof("UpdateFunc: Watch Event Skipped: %s", id) + return + } + klog.V(5).Infof("UpdateFunc: Computing status for object: %s", id) + + // cancel any scheduled status update for this object + in.taskManager.Cancel(id) + + rs, err := in.readStatusFromObject(ctx, obj) + if err != nil { + // Send error event and stop the reporter! + in.handleFatalError(eventCh, fmt.Errorf("failed to compute object status: %s: %w", id, err)) + return + } + + if object.IsNamespace(obj) { + klog.V(5).Infof("UpdateFunc: Namespace updated: %v", id) + in.onNamespaceUpdate(obj) + } else if object.IsCRD(obj) { + klog.V(5).Infof("UpdateFunc: CRD updated: %v", id) + in.onCRDUpdate(obj) + } + + if isObjectUnschedulable(rs) { + klog.V(5).Infof("UpdateFunc: object unschedulable: %v", id) + // schedule delayed status update + in.taskManager.Schedule(ctx, id, status.ScheduleWindow, + in.newStatusCheckTaskFunc(ctx, eventCh, id)) + } + + klog.V(7).Infof("UpdateFunc: sending update event: %v", rs) + eventCh <- event.Event{ + Type: event.ResourceUpdateEvent, + Resource: rs, + } + } + + handler.DeleteFunc = func(iobj interface{}) { + // Bail early if the context is cancelled, to avoid unnecessary work. + if ctx.Err() != nil { + return + } + + if tombstone, ok := iobj.(cache.DeletedFinalStateUnknown); ok { + // Last state unknown. Possibly stale. + // TODO: Should we propegate this uncertainty to the caller? 
+ iobj = tombstone.Obj + } + obj, ok := iobj.(*unstructured.Unstructured) + if !ok { + panic(fmt.Sprintf("DeleteFunc received unexpected object type %T", iobj)) + } + id := object.UnstructuredToObjMetadata(obj) + if in.ObjectFilter != nil && in.ObjectFilter.Filter(obj) { + klog.V(7).Infof("DeleteFunc: Watch Event Skipped: %s", id) + return + } + klog.V(5).Infof("DeleteFunc: Computing status for object: %s", id) + + // cancel any scheduled status update for this object + in.taskManager.Cancel(id) + + if object.IsNamespace(obj) { + klog.V(5).Infof("DeleteFunc: Namespace deleted: %v", id) + in.onNamespaceDelete(obj) + } else if object.IsCRD(obj) { + klog.V(5).Infof("DeleteFunc: CRD deleted: %v", id) + in.onCRDDelete(obj) + } + + rs := deletedStatus(id) + klog.V(7).Infof("DeleteFunc: sending update event: %v", rs) + eventCh <- event.Event{ + Type: event.ResourceUpdateEvent, + Resource: rs, + } + } + + return handler +} + +// onCRDAdd handles creating a new informer to watch the new resource type. +func (in *ObjectStatusReporter) onCRDAdd(obj *unstructured.Unstructured) { + gk, found := object.GetCRDGroupKind(obj) + if !found { + id := object.UnstructuredToObjMetadata(obj) + klog.Warningf("Invalid CRD added: missing group and/or kind: %v", id) + // Don't return an error, because this should not inturrupt the task queue. + // TODO: Allow non-fatal errors to be reported using a specific error type. + return + } + klog.V(3).Infof("CRD added for %s", gk) + + klog.V(3).Info("Resetting RESTMapper") + // Reset mapper to invalidate cache. + meta.MaybeResetRESTMapper(in.Mapper) + + in.forEachTargetWithGroupKind(gk, func(gkn GroupKindNamespace) { + in.startInformer(gkn) + }) +} + +// onCRDUpdate handles creating a new informer to watch the updated resource type. +func (in *ObjectStatusReporter) onCRDUpdate(newObj *unstructured.Unstructured) { + gk, found := object.GetCRDGroupKind(newObj) + if !found { + id := object.UnstructuredToObjMetadata(newObj) + klog.Warningf("Invalid CRD updated: missing group and/or kind: %v", id) + // Don't return an error, because this should not inturrupt the task queue. + // TODO: Allow non-fatal errors to be reported using a specific error type. + return + } + klog.V(3).Infof("CRD updated for %s", gk) + + klog.V(3).Info("Resetting RESTMapper") + // Reset mapper to invalidate cache. + meta.MaybeResetRESTMapper(in.Mapper) + + in.forEachTargetWithGroupKind(gk, func(gkn GroupKindNamespace) { + in.startInformer(gkn) + }) +} + +// onCRDDelete handles stopping the informer watching the deleted resource type. +func (in *ObjectStatusReporter) onCRDDelete(oldObj *unstructured.Unstructured) { + gk, found := object.GetCRDGroupKind(oldObj) + if !found { + id := object.UnstructuredToObjMetadata(oldObj) + klog.Warningf("Invalid CRD deleted: missing group and/or kind: %v", id) + // Don't return an error, because this should not inturrupt the task queue. + // TODO: Allow non-fatal errors to be reported using a specific error type. + return + } + klog.V(3).Infof("CRD deleted for %s", gk) + + in.forEachTargetWithGroupKind(gk, func(gkn GroupKindNamespace) { + in.stopInformer(gkn) + }) + + klog.V(3).Info("Resetting RESTMapper") + // Reset mapper to invalidate cache. + meta.MaybeResetRESTMapper(in.Mapper) +} + +// onNamespaceAdd handles creating new informers to watch this namespace. 
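Each handler above consults `in.ObjectFilter` first and skips the event when `Filter` returns true. A minimal custom filter illustrating that contract (the type itself is hypothetical):

```go
package watcher

import (
	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
	kwatcher "sigs.k8s.io/cli-utils/pkg/kstatus/watcher"
)

// namespaceObjectFilter is an illustrative kwatcher.ObjectFilter: returning
// true from Filter means "skip this object", mirroring how the handlers above
// consult in.ObjectFilter before computing status.
type namespaceObjectFilter struct {
	namespace string
}

var _ kwatcher.ObjectFilter = &namespaceObjectFilter{}

func (f *namespaceObjectFilter) Filter(obj *unstructured.Unstructured) bool {
	return obj.GetNamespace() != f.namespace
}
```

Such a filter could be supplied through the `ObjectFilter` and `UseCustomObjectFilter` fields defined later in `watcher_types.go`.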
+func (in *ObjectStatusReporter) onNamespaceAdd(obj *unstructured.Unstructured) { + if in.RESTScope == meta.RESTScopeRoot { + // When watching resources across all namespaces, + // we don't need to start or stop any + // namespace-specific informers. + return + } + namespace := obj.GetName() + in.forEachTargetWithNamespace(namespace, func(gkn GroupKindNamespace) { + in.startInformer(gkn) + }) +} + +// onNamespaceUpdate handles creating new informers to watch this namespace. +func (in *ObjectStatusReporter) onNamespaceUpdate(obj *unstructured.Unstructured) { + if in.RESTScope == meta.RESTScopeRoot { + // When watching resources across all namespaces, + // we don't need to start or stop any + // namespace-specific informers. + return + } + namespace := obj.GetName() + in.forEachTargetWithNamespace(namespace, func(gkn GroupKindNamespace) { + in.startInformer(gkn) + }) +} + +// onNamespaceDelete handles stopping informers watching this namespace. +func (in *ObjectStatusReporter) onNamespaceDelete(obj *unstructured.Unstructured) { + if in.RESTScope == meta.RESTScopeRoot { + // When watching resources across all namespaces, + // we don't need to start or stop any + // namespace-specific informers. + return + } + namespace := obj.GetName() + in.forEachTargetWithNamespace(namespace, func(gkn GroupKindNamespace) { + in.stopInformer(gkn) + }) +} + +// newStatusCheckTaskFunc returns a taskFund that reads the status of an object +// from the cluster and sends it over the event channel. +// +// This method should only be used for generated resource objects, as it's much +// slower at scale than watching the resource for updates. +func (in *ObjectStatusReporter) newStatusCheckTaskFunc( + ctx context.Context, + eventCh chan<- event.Event, + id object.ObjMetadata, +) taskFunc { + return func() { + klog.V(5).Infof("Re-reading object status: %v", id) + // check again + rs, err := in.readStatusFromCluster(ctx, id) + if err != nil { + // Send error event and stop the reporter! + // TODO: retry N times before terminating + in.handleFatalError(eventCh, err) + return + } + eventCh <- event.Event{ + Type: event.ResourceUpdateEvent, + Resource: rs, + } + } +} + +func (in *ObjectStatusReporter) handleFatalError(eventCh chan<- event.Event, err error) { + klog.V(5).Infof("Reporter error: %v", err) + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + return + } + eventCh <- event.Event{ + Type: event.ErrorEvent, + Error: err, + } + in.Stop() +} diff --git a/internal/kstatus/watcher/task_manager.go b/internal/kstatus/watcher/task_manager.go new file mode 100644 index 00000000..0c667cd8 --- /dev/null +++ b/internal/kstatus/watcher/task_manager.go @@ -0,0 +1,68 @@ +package watcher + +import ( + "context" + "sync" + "time" + + "k8s.io/klog/v2" + "sigs.k8s.io/cli-utils/pkg/object" +) + +type taskFunc func() + +// taskManager manages a set of tasks with object identifiers. +// This makes starting and stopping the tasks thread-safe. +type taskManager struct { + lock sync.Mutex + cancelFuncs map[object.ObjMetadata]context.CancelFunc +} + +func (tm *taskManager) Schedule(parentCtx context.Context, id object.ObjMetadata, delay time.Duration, task taskFunc) { + tm.lock.Lock() + defer tm.lock.Unlock() + + if tm.cancelFuncs == nil { + tm.cancelFuncs = make(map[object.ObjMetadata]context.CancelFunc) + } + + cancel, found := tm.cancelFuncs[id] + if found { + // Cancel the existing scheduled task and replace it. 
+ cancel() + } + + taskCtx, cancel := context.WithTimeout(context.Background(), delay) + tm.cancelFuncs[id] = cancel + + go func() { + klog.V(5).Infof("Task scheduled (%v) for object (%s)", delay, id) + select { + case <-parentCtx.Done(): + // stop waiting + cancel() + case <-taskCtx.Done(): + if taskCtx.Err() == context.DeadlineExceeded { + klog.V(5).Infof("Task executing (after %v) for object (%v)", delay, id) + task() + } + // else stop waiting + } + }() +} + +func (tm *taskManager) Cancel(id object.ObjMetadata) { + tm.lock.Lock() + defer tm.lock.Unlock() + + cancelFunc, found := tm.cancelFuncs[id] + if !found { + // already cancelled or not added + return + } + delete(tm.cancelFuncs, id) + cancelFunc() + if len(tm.cancelFuncs) == 0 { + tm.cancelFuncs = nil + } +} diff --git a/internal/kstatus/watcher/unschedulable.go b/internal/kstatus/watcher/unschedulable.go new file mode 100644 index 00000000..134d2e79 --- /dev/null +++ b/internal/kstatus/watcher/unschedulable.go @@ -0,0 +1,66 @@ +package watcher + +import ( + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "sigs.k8s.io/cli-utils/pkg/kstatus/polling/event" + "sigs.k8s.io/cli-utils/pkg/kstatus/status" + "sigs.k8s.io/cli-utils/pkg/object" +) + +// isObjectUnschedulable returns true if the object or any of its generated resources +// is an unschedulable pod. +// +// This status is computed recursively, so it can handle objects that generate +// objects that generate pods, as long as the input ResourceStatus has those +// GeneratedResources computed. +func isObjectUnschedulable(rs *event.ResourceStatus) bool { + if rs.Error != nil { + return false + } + if rs.Status != status.InProgressStatus { + return false + } + if isPodUnschedulable(rs.Resource) { + return true + } + // recurse through generated resources + for _, subRS := range rs.GeneratedResources { + if isObjectUnschedulable(subRS) { + return true + } + } + return false +} + +// isPodUnschedulable returns true if the object is a pod and is unschedulable +// according to a False PodScheduled condition. +func isPodUnschedulable(obj *unstructured.Unstructured) bool { + if obj == nil { + return false + } + gk := obj.GroupVersionKind().GroupKind() + if gk != (schema.GroupKind{Kind: "Pod"}) { + return false + } + icnds, found, err := object.NestedField(obj.Object, "status", "conditions") + if err != nil || !found { + return false + } + cnds, ok := icnds.([]interface{}) + if !ok { + return false + } + for _, icnd := range cnds { + cnd, ok := icnd.(map[string]interface{}) + if !ok { + return false + } + if cnd["type"] == "PodScheduled" && + cnd["status"] == "False" && + cnd["reason"] == "Unschedulable" { + return true + } + } + return false +} diff --git a/internal/kstatus/watcher/watcher.go b/internal/kstatus/watcher/watcher.go new file mode 100644 index 00000000..8164a24e --- /dev/null +++ b/internal/kstatus/watcher/watcher.go @@ -0,0 +1,99 @@ +package watcher + +import ( + "context" + "fmt" + + "k8s.io/klog/v2" + kwatcher "sigs.k8s.io/cli-utils/pkg/kstatus/watcher" + + "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/client-go/dynamic" + "sigs.k8s.io/cli-utils/pkg/kstatus/polling/event" + "sigs.k8s.io/cli-utils/pkg/object" +) + +type DynamicStatusWatcher struct { + *kwatcher.DefaultStatusWatcher + + // Options can be provided when creating a new StatusWatcher to customize the + // behavior. 
+ Options Options + + // informerRefs tracks which informers have been started and stopped by the ObjectStatusReporter + informerRefs map[GroupKindNamespace]*watcherReference +} + +func (in *DynamicStatusWatcher) Watch(ctx context.Context, ids object.ObjMetadataSet, opts kwatcher.Options) <-chan event.Event { + var strategy kwatcher.RESTScopeStrategy + + if opts.RESTScopeStrategy != kwatcher.RESTScopeAutomatic { + strategy = opts.RESTScopeStrategy + } + + if in.Options.RESTScopeStrategy != nil { + strategy = *in.Options.RESTScopeStrategy + } + + if strategy == kwatcher.RESTScopeAutomatic { + strategy = autoSelectRESTScopeStrategy(ids) + } + + var scope meta.RESTScope + var targets []GroupKindNamespace + switch strategy { + case kwatcher.RESTScopeRoot: + scope = meta.RESTScopeRoot + targets = rootScopeGKNs(ids) + klog.V(3).Infof("DynamicStatusWatcher starting in root-scoped mode (targets: %d)", len(targets)) + case kwatcher.RESTScopeNamespace: + scope = meta.RESTScopeNamespace + targets = namespaceScopeGKNs(ids) + klog.V(3).Infof("DynamicStatusWatcher starting in namespace-scoped mode (targets: %d)", len(targets)) + default: + return handleFatalError(fmt.Errorf("invalid RESTScopeStrategy: %v", strategy)) + } + + var objectFilter kwatcher.ObjectFilter = &kwatcher.AllowListObjectFilter{AllowList: ids} + if in.Options.UseCustomObjectFilter { + objectFilter = in.Options.ObjectFilter + } + + var labelSelector labels.Selector + if in.Options.Filters != nil { + labelSelector = in.Options.Filters.Labels + } + + informer := &ObjectStatusReporter{ + Mapper: in.Mapper, + StatusReader: in.StatusReader, + ClusterReader: in.ClusterReader, + Targets: targets, + RESTScope: scope, + ObjectFilter: objectFilter, + // Custom options + LabelSelector: labelSelector, + DynamicClient: in.DynamicClient, + watcherRefs: in.informerRefs, + } + + return informer.Start(ctx) +} + +func NewDynamicStatusWatcher(dynamicClient dynamic.Interface, mapper meta.RESTMapper, options Options) kwatcher.StatusWatcher { + var informerRefs map[GroupKindNamespace]*watcherReference + if options.UseInformerRefCache { + informerRefs = make(map[GroupKindNamespace]*watcherReference) + } + + defaultStatusWatcher := kwatcher.NewDefaultStatusWatcher(dynamicClient, mapper) + defaultStatusWatcher.Filters = options.Filters + + return &DynamicStatusWatcher{ + DefaultStatusWatcher: defaultStatusWatcher, + // Custom options + Options: options, + informerRefs: informerRefs, + } +} diff --git a/internal/kstatus/watcher/watcher_reference.go b/internal/kstatus/watcher/watcher_reference.go new file mode 100644 index 00000000..9a7d5943 --- /dev/null +++ b/internal/kstatus/watcher/watcher_reference.go @@ -0,0 +1,82 @@ +package watcher + +import ( + "context" + "sync" + + "k8s.io/apimachinery/pkg/watch" +) + +// watcherReference tracks [watch.Interface] lifecycle. +type watcherReference struct { + // lock guards the subsequent stateful fields + lock sync.Mutex + + context context.Context + cancel context.CancelFunc + started bool + + watcher watch.Interface +} + +// Start returns a wrapped context that can be cancelled. +// Returns nil & false if already started. 
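To tie `watcher.go` together: a hedged sketch of constructing a `DynamicStatusWatcher` the same way the applier builder later in this patch does (root scope plus the managed-by-agent label selector) and starting a watch for a set of object IDs. The wrapper function is illustrative only.

```go
package watcher

import (
	"context"

	"github.com/samber/lo"
	"k8s.io/apimachinery/pkg/api/meta"
	"k8s.io/client-go/dynamic"
	kwatcher "sigs.k8s.io/cli-utils/pkg/kstatus/watcher"
	"sigs.k8s.io/cli-utils/pkg/object"

	"github.com/pluralsh/deployment-operator/pkg/common"
)

// exampleDynamicWatch is illustrative only: it mirrors how the applier builder
// in this patch wires up the watcher (root scope + managed-by-agent labels).
func exampleDynamicWatch(ctx context.Context, client dynamic.Interface, mapper meta.RESTMapper, ids object.ObjMetadataSet) {
	sw := NewDynamicStatusWatcher(client, mapper, Options{
		RESTScopeStrategy: lo.ToPtr(kwatcher.RESTScopeRoot),
		Filters: &kwatcher.Filters{
			Labels: common.ManagedByAgentLabelSelector(),
		},
		// UseInformerRefCache: true would reuse watches across Watch calls.
	})

	for e := range sw.Watch(ctx, ids, kwatcher.Options{}) {
		_ = e // handle SyncEvent / ResourceUpdateEvent / ErrorEvent as needed
	}
}
```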
+func (ir *watcherReference) Start(ctx context.Context) (context.Context, bool) { + ir.lock.Lock() + defer ir.lock.Unlock() + + if ir.started { + return nil, false + } + + ctx, cancel := context.WithCancel(ctx) + ir.context = ctx + ir.cancel = cancel + ir.started = true + + return ctx, true +} + +func (ir *watcherReference) SetInformer(watcher watch.Interface) { + ir.lock.Lock() + defer ir.lock.Unlock() + + ir.watcher = watcher +} + +func (ir *watcherReference) HasSynced() bool { + ir.lock.Lock() + defer ir.lock.Unlock() + + if !ir.started { + return false + } + + if ir.watcher == nil { + return false + } + + return true +} + +func (ir *watcherReference) HasStarted() bool { + ir.lock.Lock() + defer ir.lock.Unlock() + + return ir.started +} + +// Stop cancels the context, if it's been started. +func (ir *watcherReference) Stop() { + ir.lock.Lock() + defer ir.lock.Unlock() + + if !ir.started { + return + } + + ir.watcher.Stop() + ir.cancel() + ir.started = false + ir.context = nil +} diff --git a/internal/kstatus/watcher/watcher_types.go b/internal/kstatus/watcher/watcher_types.go new file mode 100644 index 00000000..94fc1bf8 --- /dev/null +++ b/internal/kstatus/watcher/watcher_types.go @@ -0,0 +1,31 @@ +// Copyright 2022 The Kubernetes Authors. +// SPDX-License-Identifier: Apache-2.0 + +package watcher + +import ( + kwatcher "sigs.k8s.io/cli-utils/pkg/kstatus/watcher" +) + +// Options can be provided when creating a new StatusWatcher to customize the +// behavior. +type Options struct { + // RESTScopeStrategy specifies which strategy to use when listing and + // watching resources. By default, the strategy is selected automatically. + RESTScopeStrategy *kwatcher.RESTScopeStrategy + + // ObjectFilter is used to filter resources after getting them from the API. + ObjectFilter kwatcher.ObjectFilter + + // UseCustomObjectFilter controls whether custom ObjectFilter provided in options + // should be used instead of the default one. + UseCustomObjectFilter bool + + // Filters allows filtering the objects being watched. + Filters *kwatcher.Filters + + // UseInformerRefCache allows caching informer ref per status watcher instance. + // This allows to ensure that multiple [StatusWatcher.Watch] calls will only spawn + // unique watches. + UseInformerRefCache bool +} diff --git a/internal/kubernetes/watcher/retry_lister_watcher.go b/internal/kubernetes/watcher/retry_lister_watcher.go new file mode 100644 index 00000000..9611a61a --- /dev/null +++ b/internal/kubernetes/watcher/retry_lister_watcher.go @@ -0,0 +1,145 @@ +package watcher + +import ( + "fmt" + + "github.com/pluralsh/polly/algorithms" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + apiwatch "k8s.io/apimachinery/pkg/watch" + "k8s.io/klog/v2" + + "k8s.io/client-go/tools/cache" +) + +// RetryListerWatcher is a wrapper around [watch.RetryWatcher] +// that ... 
+type RetryListerWatcher struct { + *RetryWatcher + + id string + initialResourceVersion string + listerWatcher cache.ListerWatcher + + listOptions metav1.ListOptions + resultChan chan apiwatch.Event +} + +func (in *RetryListerWatcher) ResultChan() <-chan apiwatch.Event { + return in.resultChan +} + +func (in *RetryListerWatcher) funnel(from <-chan apiwatch.Event) { + for { + select { + case <-in.Done(): + return + case e, ok := <-from: + if !ok { + return + } + + in.resultChan <- e + } + } +} + +func (in *RetryListerWatcher) funnelItems(items ...apiwatch.Event) { + for _, item := range items { + select { + case <-in.Done(): + klog.V(4).InfoS("funnelItems stopped due to resultChan being closed") + return + case in.resultChan <- item: + klog.V(4).InfoS("successfully sent item to resultChan") + } + } +} + +func (in *RetryListerWatcher) toEvents(objects ...runtime.Object) []apiwatch.Event { + return algorithms.Map(objects, func(object runtime.Object) apiwatch.Event { + return apiwatch.Event{ + Type: apiwatch.Added, + Object: object, + } + }) +} + +func (in *RetryListerWatcher) isEmptyResourceVersion() bool { + return len(in.initialResourceVersion) == 0 || in.initialResourceVersion == "0" +} + +func (in *RetryListerWatcher) init() (*RetryListerWatcher, error) { + if err := in.ensureRequiredArgs(); err != nil { + return nil, err + } + + // TODO: check if watch supports feeding initial items instead of using list + if in.isEmptyResourceVersion() { + klog.V(3).InfoS("starting list and watch as initialResourceVersion is empty") + err := in.listAndWatch() + return in, err + } + + klog.V(3).InfoS("starting watch", "initialResourceVersion", in.initialResourceVersion) + go in.watch(in.initialResourceVersion) + return in, nil +} + +func (in *RetryListerWatcher) listAndWatch() error { + list, err := in.listerWatcher.List(in.listOptions) + if err != nil { + return fmt.Errorf("error listing resources: %w", err) + } + + listMetaInterface, err := meta.ListAccessor(list) + if err != nil { + return fmt.Errorf("unable to understand list result %#v: %w", list, err) + } + + resourceVersion := listMetaInterface.GetResourceVersion() + items, err := meta.ExtractList(list) + if err != nil { + return fmt.Errorf("unable to understand list result %#v (%w)", list, err) + } + + go in.watch(resourceVersion, in.toEvents(items...)...) + + return nil +} + +// Starts the [watch.RetryWatcher] and funnels all events to our wrapper. +func (in *RetryListerWatcher) watch(resourceVersion string, initialItems ...apiwatch.Event) { + defer close(in.resultChan) + + w, err := NewRetryWatcher(resourceVersion, in.listerWatcher) + if err != nil { + klog.ErrorS(err, "unable to create retry watcher", "resourceVersion", resourceVersion) + return + } + + in.RetryWatcher = w + in.funnelItems(initialItems...) 
+ in.funnel(w.ResultChan()) +} + +func (in *RetryListerWatcher) ensureRequiredArgs() error { + if in.listerWatcher == nil { + return fmt.Errorf("listerWatcher must not be nil") + } + + return nil +} + +func NewRetryListerWatcher(options ...RetryListerWatcherOption) (*RetryListerWatcher, error) { + rw := &RetryListerWatcher{ + resultChan: make(chan apiwatch.Event), + } + + for _, option := range options { + option(rw) + } + + return rw.init() +} diff --git a/internal/kubernetes/watcher/retry_lister_watcher_options.go b/internal/kubernetes/watcher/retry_lister_watcher_options.go new file mode 100644 index 00000000..b702e960 --- /dev/null +++ b/internal/kubernetes/watcher/retry_lister_watcher_options.go @@ -0,0 +1,41 @@ +package watcher + +import ( + "k8s.io/client-go/tools/cache" +) + +type RetryListerWatcherOption func(*RetryListerWatcher) + +func WithListerWatcher(listerWatcher cache.ListerWatcher) RetryListerWatcherOption { + return func(rlw *RetryListerWatcher) { + rlw.listerWatcher = listerWatcher + } +} + +func WithListOptions() RetryListerWatcherOption { + return func(rlw *RetryListerWatcher) { + + } +} + +func WithResourceVersion(resourceVersion string) RetryListerWatcherOption { + return func(rlw *RetryListerWatcher) { + rlw.initialResourceVersion = resourceVersion + } +} + +func WithID(id string) RetryListerWatcherOption { + return func(rlw *RetryListerWatcher) { + rlw.id = id + } +} + +func WithListWatchFunc(listFunc cache.ListFunc, watchFunc cache.WatchFunc) RetryListerWatcherOption { + return func(rlw *RetryListerWatcher) { + rlw.listerWatcher = &cache.ListWatch{ + ListFunc: listFunc, + WatchFunc: watchFunc, + DisableChunking: false, + } + } +} diff --git a/internal/kubernetes/watcher/retry_watcher.go b/internal/kubernetes/watcher/retry_watcher.go new file mode 100644 index 00000000..4cf00143 --- /dev/null +++ b/internal/kubernetes/watcher/retry_watcher.go @@ -0,0 +1,293 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package watcher + +import ( + "context" + "errors" + "fmt" + "io" + "net/http" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/dump" + "k8s.io/apimachinery/pkg/util/net" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/apimachinery/pkg/watch" + "k8s.io/client-go/tools/cache" + "k8s.io/klog/v2" +) + +// resourceVersionGetter is an interface used to get resource version from events. +// We can't reuse an interface from meta otherwise it would be a cyclic dependency and we need just this one method +type resourceVersionGetter interface { + GetResourceVersion() string +} + +// RetryWatcher will make sure that in case the underlying watcher is closed (e.g. due to API timeout or etcd timeout) +// it will get restarted from the last point without the consumer even knowing about it. +// RetryWatcher does that by inspecting events and keeping track of resourceVersion. 
+// Especially useful when using watch.UntilWithoutRetry where premature termination is causing issues and flakes. +// Please note that this is not resilient to etcd cache not having the resource version anymore - you would need to +// use Informers for that. +type RetryWatcher struct { + lastResourceVersion string + watcherClient cache.Watcher + resultChan chan watch.Event + stopChan chan struct{} + doneChan chan struct{} + minRestartDelay time.Duration +} + +// NewRetryWatcher creates a new RetryWatcher. +// It will make sure that watches gets restarted in case of recoverable errors. +// The initialResourceVersion will be given to watch method when first called. +func NewRetryWatcher(initialResourceVersion string, watcherClient cache.Watcher) (*RetryWatcher, error) { + return newRetryWatcher(initialResourceVersion, watcherClient, 1*time.Second) +} + +func newRetryWatcher(initialResourceVersion string, watcherClient cache.Watcher, minRestartDelay time.Duration) (*RetryWatcher, error) { + switch initialResourceVersion { + case "", "0": + // TODO: revisit this if we ever get WATCH v2 where it means start "now" + // without doing the synthetic list of objects at the beginning (see #74022) + return nil, fmt.Errorf("initial RV %q is not supported due to issues with underlying WATCH", initialResourceVersion) + default: + break + } + + rw := &RetryWatcher{ + lastResourceVersion: initialResourceVersion, + watcherClient: watcherClient, + stopChan: make(chan struct{}), + doneChan: make(chan struct{}), + resultChan: make(chan watch.Event), + minRestartDelay: minRestartDelay, + } + + go rw.receive() + return rw, nil +} + +func (rw *RetryWatcher) send(event watch.Event) bool { + // Writing to an unbuffered channel is blocking operation + // and we need to check if stop wasn't requested while doing so. + select { + case rw.resultChan <- event: + return true + case <-rw.stopChan: + return false + } +} + +// doReceive returns true when it is done, false otherwise. +// If it is not done the second return value holds the time to wait before calling it again. +func (rw *RetryWatcher) doReceive() (bool, time.Duration) { + watcher, err := rw.watcherClient.Watch(metav1.ListOptions{ + ResourceVersion: rw.lastResourceVersion, + AllowWatchBookmarks: true, + }) + // We are very unlikely to hit EOF here since we are just establishing the call, + // but it may happen that the apiserver is just shutting down (e.g. being restarted) + // This is consistent with how it is handled for informers + switch { + case err == nil: + break + case errors.Is(err, io.EOF): + // watch closed normally + return false, 0 + case errors.Is(err, io.ErrUnexpectedEOF): + klog.V(1).InfoS("Watch closed with unexpected EOF", "err", err) + return false, 0 + default: + msg := "Watch failed" + if net.IsProbableEOF(err) || net.IsTimeout(err) || errors.Is(err, context.Canceled) { + klog.V(5).InfoS(msg, "err", err) + // Retry + return false, 0 + } + + klog.ErrorS(err, msg) + // Retry + return false, 0 + } + + if watcher == nil { + klog.ErrorS(nil, "Watch returned nil watcher") + // Retry + return false, 0 + } + + ch := watcher.ResultChan() + defer watcher.Stop() + + for { + select { + case <-rw.stopChan: + klog.V(4).InfoS("Stopping RetryWatcher.") + return true, 0 + case event, ok := <-ch: + if !ok { + klog.V(4).InfoS("Failed to get event! 
Re-creating the watcher.", "resourceVersion", rw.lastResourceVersion) + return false, 0 + } + + // We need to inspect the event and get ResourceVersion out of it + switch event.Type { + case watch.Added, watch.Modified, watch.Deleted, watch.Bookmark: + metaObject, ok := event.Object.(resourceVersionGetter) + if !ok { + _ = rw.send(watch.Event{ + Type: watch.Error, + Object: &apierrors.NewInternalError(errors.New("retryWatcher: doesn't support resourceVersion")).ErrStatus, + }) + // We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + return true, 0 + } + + resourceVersion := metaObject.GetResourceVersion() + if resourceVersion == "" { + _ = rw.send(watch.Event{ + Type: watch.Error, + Object: &apierrors.NewInternalError(fmt.Errorf("retryWatcher: object %#v doesn't support resourceVersion", event.Object)).ErrStatus, + }) + // We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + return true, 0 + } + + // All is fine; send the non-bookmark events and update resource version. + if event.Type != watch.Bookmark { + ok = rw.send(event) + if !ok { + return true, 0 + } + } + rw.lastResourceVersion = resourceVersion + + continue + + case watch.Error: + // This round trip allows us to handle unstructured status + errObject := apierrors.FromObject(event.Object) + var statusErr *apierrors.StatusError + ok := errors.As(errObject, &statusErr) + if !ok { + klog.Error(fmt.Sprintf("Received an error which is not *metav1.Status but %s", dump.Pretty(event.Object))) + // Retry unknown errors + return false, 0 + } + + status := statusErr.ErrStatus + + statusDelay := time.Duration(0) + if status.Details != nil { + statusDelay = time.Duration(status.Details.RetryAfterSeconds) * time.Second + } + + switch status.Code { + case http.StatusGone: + // Never retry RV too old errors + _ = rw.send(event) + return true, 0 + + case http.StatusGatewayTimeout, http.StatusInternalServerError: + // Retry + return false, statusDelay + + default: + // We retry by default. RetryWatcher is meant to proceed unless it is certain + // that it can't. If we are not certain, we proceed with retry and leave it + // up to the user to timeout if needed. + + // Log here so we have a record of hitting the unexpected error + // and we can whitelist some error codes if we missed any that are expected. + klog.V(5).Info(fmt.Sprintf("Retrying after unexpected error: %s", dump.Pretty(event.Object))) + + // Retry + return false, statusDelay + } + + default: + klog.Errorf("Failed to recognize Event type %q", event.Type) + _ = rw.send(watch.Event{ + Type: watch.Error, + Object: &apierrors.NewInternalError(fmt.Errorf("retryWatcher failed to recognize Event type %q", event.Type)).ErrStatus, + }) + // We are unable to restart the watch and have to stop the loop or this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! + return true, 0 + } + } + } +} + +// receive reads the result from a watcher, restarting it if necessary. 
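Typical use of this `RetryWatcher`: hand it a `cache.Watcher` (for example a `cache.ListWatch` backed by a dynamic client) plus a known, non-zero resourceVersion, then drain `ResultChan` while it transparently re-establishes dropped watches. A hedged usage sketch; the GVR and helper name are assumptions.

```go
package watcher

import (
	"context"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/watch"
	"k8s.io/client-go/dynamic"
	"k8s.io/client-go/tools/cache"
	"k8s.io/klog/v2"
)

// exampleRetryWatch is illustrative only: it drains ConfigMap events from a
// RetryWatcher that transparently restarts dropped watches from the last
// observed resourceVersion.
func exampleRetryWatch(ctx context.Context, client dynamic.Interface, resourceVersion string) error {
	gvr := schema.GroupVersionResource{Version: "v1", Resource: "configmaps"}
	lw := &cache.ListWatch{
		WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
			return client.Resource(gvr).Watch(ctx, options)
		},
	}

	rw, err := NewRetryWatcher(resourceVersion, lw) // must not be "" or "0"
	if err != nil {
		return err
	}

	// Stop the watcher (closing its result channel) when the context ends.
	go func() {
		<-ctx.Done()
		rw.Stop()
	}()

	for e := range rw.ResultChan() {
		klog.InfoS("watch event", "type", e.Type)
	}
	<-rw.Done()
	return nil
}
```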
+func (rw *RetryWatcher) receive() { + defer close(rw.doneChan) + defer close(rw.resultChan) + + klog.V(4).Info("Starting RetryWatcher.") + defer klog.V(4).Info("Stopping RetryWatcher.") + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go func() { + select { + case <-rw.stopChan: + cancel() + return + case <-ctx.Done(): + return + } + }() + + // We use non sliding until so we don't introduce delays on happy path when WATCH call + // timeouts or gets closed and we need to reestablish it while also avoiding hot loops. + wait.NonSlidingUntilWithContext(ctx, func(ctx context.Context) { + done, retryAfter := rw.doReceive() + if done { + cancel() + return + } + + timer := time.NewTimer(retryAfter) + select { + case <-ctx.Done(): + timer.Stop() + return + case <-timer.C: + } + + klog.V(4).Infof("Restarting RetryWatcher at RV=%q", rw.lastResourceVersion) + }, rw.minRestartDelay) +} + +// ResultChan implements Interface. +func (rw *RetryWatcher) ResultChan() <-chan watch.Event { + return rw.resultChan +} + +// Stop implements Interface. +func (rw *RetryWatcher) Stop() { + close(rw.stopChan) +} + +// Done allows the caller to be notified when Retry watcher stops. +func (rw *RetryWatcher) Done() <-chan struct{} { + return rw.doneChan +} diff --git a/internal/metrics/metrics_context.go b/internal/metrics/metrics_context.go new file mode 100644 index 00000000..e937aef8 --- /dev/null +++ b/internal/metrics/metrics_context.go @@ -0,0 +1,22 @@ +package metrics + +import ( + "context" + "fmt" + + "github.com/samber/lo" +) + +func FromContext[T any](ctx context.Context, key ContextKey) (T, error) { + v := ctx.Value(key) + if v == nil { + return lo.Empty[T](), fmt.Errorf("could not get value for key: %v", key) + } + + val, ok := v.(T) + if !ok { + return lo.Empty[T](), fmt.Errorf("could not cast value for key: %v", key) + } + + return val, nil +} diff --git a/internal/metrics/metrics_options.go b/internal/metrics/metrics_options.go new file mode 100644 index 00000000..2c1225a3 --- /dev/null +++ b/internal/metrics/metrics_options.go @@ -0,0 +1,23 @@ +package metrics + +import ( + "time" +) + +func WithServiceReconciliationError(err error) ServiceReconciliationOption { + return func(o *serviceReconciliationOptions) { + o.err = err + } +} + +func WithServiceReconciliationStartedAt(startedAt time.Time) ServiceReconciliationOption { + return func(o *serviceReconciliationOptions) { + o.startedAt = &startedAt + } +} + +func WithServiceReconciliationStage(stage ServiceReconciliationStage) ServiceReconciliationOption { + return func(o *serviceReconciliationOptions) { + o.stage = &stage + } +} diff --git a/internal/metrics/metrics_prometheus.go b/internal/metrics/metrics_prometheus.go index dfe3c4d1..1b48069a 100644 --- a/internal/metrics/metrics_prometheus.go +++ b/internal/metrics/metrics_prometheus.go @@ -1,8 +1,11 @@ package metrics import ( + "time" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" + "github.com/samber/lo" ) var ( @@ -12,9 +15,32 @@ var ( type prometheusRecorder struct { discoveryAPICacheRefreshCounter prometheus.Counter discoveryAPICacheRefreshErrorCounter prometheus.Counter - serviceReconciliationCounter *prometheus.CounterVec - serviceReconciliationErrorCounter *prometheus.CounterVec - stackRunJobsCreatedCounter prometheus.Counter + + serviceReconciliationCounter *prometheus.CounterVec + serviceReconciliationDuration *prometheus.HistogramVec + serviceReconciliationErrorCounter *prometheus.CounterVec + + 
stackRunJobsCreatedCounter prometheus.Counter + + resourceCacheWatchCounter *prometheus.GaugeVec + resourceCacheHitCounter *prometheus.CounterVec + resourceCacheMissCounter *prometheus.CounterVec +} + +func (in *prometheusRecorder) ResourceCacheWatchStart(resourceType string) { + in.resourceCacheWatchCounter.WithLabelValues(resourceType).Inc() +} + +func (in *prometheusRecorder) ResourceCacheWatchEnd(resourceType string) { + in.resourceCacheWatchCounter.WithLabelValues(resourceType).Dec() +} + +func (in *prometheusRecorder) ResourceCacheHit(serviceID string) { + in.resourceCacheHitCounter.WithLabelValues(serviceID).Inc() +} + +func (in *prometheusRecorder) ResourceCacheMiss(serviceID string) { + in.resourceCacheMissCounter.WithLabelValues(serviceID).Inc() } func (in *prometheusRecorder) DiscoveryAPICacheRefresh(err error) { @@ -26,15 +52,32 @@ func (in *prometheusRecorder) DiscoveryAPICacheRefresh(err error) { in.discoveryAPICacheRefreshCounter.Inc() } -func (in *prometheusRecorder) ServiceReconciliation(serviceID, serviceName string, err error) { - if err != nil { +func (in *prometheusRecorder) ServiceReconciliation(serviceID, serviceName string, options ...ServiceReconciliationOption) { + o := &serviceReconciliationOptions{} + for _, opt := range options { + opt(o) + } + + if o.err != nil { in.serviceReconciliationErrorCounter.WithLabelValues(serviceID, serviceName).Inc() return } + if o.startedAt != nil { + in.serviceReconciliationDuration.WithLabelValues(serviceID, serviceName, lo.FromPtr(o.stage).String()).Observe(time.Since(*o.startedAt).Seconds()) + } + in.serviceReconciliationCounter.WithLabelValues(serviceID, serviceName).Inc() } +func (in *prometheusRecorder) ServiceDeletion(serviceID string) { + labels := prometheus.Labels{MetricLabelServiceID: serviceID} + in.serviceReconciliationErrorCounter.DeletePartialMatch(labels) + in.serviceReconciliationCounter.DeletePartialMatch(labels) + in.resourceCacheMissCounter.DeletePartialMatch(labels) + in.resourceCacheHitCounter.DeletePartialMatch(labels) +} + func (in *prometheusRecorder) StackRunJobCreation() { in.stackRunJobsCreatedCounter.Inc() } @@ -44,7 +87,6 @@ func (in *prometheusRecorder) init() Recorder { Name: DiscoveryAPICacheRefreshMetricName, Help: DiscoveryAPICacheRefreshMetricDescription, }) - in.discoveryAPICacheRefreshErrorCounter = promauto.NewCounter(prometheus.CounterOpts{ Name: DiscoveryAPICacheRefreshErrorMetricName, Help: DiscoveryAPICacheRefreshErrorMetricDescription, @@ -53,18 +95,34 @@ func (in *prometheusRecorder) init() Recorder { in.serviceReconciliationCounter = promauto.NewCounterVec(prometheus.CounterOpts{ Name: ServiceReconciliationMetricName, Help: ServiceReconciliationMetricDescription, - }, []string{ServiceReconciliationMetricLabelServiceID, ServiceReconciliationMetricLabelServiceName}) - + }, []string{MetricLabelServiceID, MetricLabelServiceName}) in.serviceReconciliationErrorCounter = promauto.NewCounterVec(prometheus.CounterOpts{ Name: ServiceReconciliationErrorMetricName, Help: ServiceReconciliationErrorMetricDescription, - }, []string{ServiceReconciliationMetricLabelServiceID, ServiceReconciliationMetricLabelServiceName}) + }, []string{MetricLabelServiceID, MetricLabelServiceName}) + in.serviceReconciliationDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Name: ServiceReconciliationDurationMetricName, + Help: ServiceReconciliationDurationMetricDescription, + }, []string{MetricLabelServiceID, MetricLabelServiceName, MetricLabelServiceReconciliationStage}) in.stackRunJobsCreatedCounter = 
promauto.NewCounter(prometheus.CounterOpts{ Name: StackRunJobsCreatedMetricName, Help: StackRunJobsCreatedMetricDescription, }) + in.resourceCacheWatchCounter = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: ResourceCacheOpenWatchesName, + Help: ResourceCacheOpenWatchesDescription, + }, []string{MetricLabelServiceType}) + in.resourceCacheHitCounter = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: ResourceCacheHitMetricName, + Help: ResourceCacheHitMetricDescription, + }, []string{MetricLabelServiceID}) + in.resourceCacheMissCounter = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: ResourceCacheMissMetricName, + Help: ResourceCacheMissMetricDescription, + }, []string{MetricLabelServiceID}) + return in } diff --git a/internal/metrics/metrics_types.go b/internal/metrics/metrics_types.go index 1cf352f6..9b81da66 100644 --- a/internal/metrics/metrics_types.go +++ b/internal/metrics/metrics_types.go @@ -1,5 +1,9 @@ package metrics +import ( + "time" +) + const ( DiscoveryAPICacheRefreshMetricName = "agent_discoveryapi_cache_refresh_total" DiscoveryAPICacheRefreshMetricDescription = "The total number of Discovery API cache refresh attempts" @@ -13,15 +17,63 @@ const ( ServiceReconciliationErrorMetricName = "agent_service_reconciliation_errors_total" ServiceReconciliationErrorMetricDescription = "The total number of service reconciliation errors" - ServiceReconciliationMetricLabelServiceID = "service_id" - ServiceReconciliationMetricLabelServiceName = "service_name" + ServiceReconciliationDurationMetricName = "agent_service_reconcile_duration_seconds" + ServiceReconciliationDurationMetricDescription = "The time it takes to reconcile a service" StackRunJobsCreatedMetricName = "agent_stack_runs_created_total" StackRunJobsCreatedMetricDescription = "The total number of created stack runs" + + ResourceCacheOpenWatchesName = "agent_resource_cache_open_watches_total" + ResourceCacheOpenWatchesDescription = "The total number of open watches in the resource cache" + + ResourceCacheHitMetricName = "agent_resource_cache_hit_total" + ResourceCacheHitMetricDescription = "The total number of resource cache hits" + + ResourceCacheMissMetricName = "agent_resource_cache_miss_total" + ResourceCacheMissMetricDescription = "The total number of resource cache misses" + + MetricLabelServiceID = "service_id" + MetricLabelServiceName = "service_name" + MetricLabelServiceType = "service_type" + MetricLabelServiceReconciliationStage = "service_reconciliation_stage" +) + +type ServiceReconciliationStage string + +func (in ServiceReconciliationStage) String() string { + return string(in) +} + +const ( + ServiceReconciliationStart ServiceReconciliationStage = "start" + ServiceReconciliationPrepareManifestsFinish ServiceReconciliationStage = "prepare_manifests_finish" + ServiceReconciliationApplyStart ServiceReconciliationStage = "apply_start" + ServiceReconciliationApplyFinish ServiceReconciliationStage = "apply_finish" + ServiceReconciliationUpdateStatusFinish ServiceReconciliationStage = "update_status_finish" + ServiceReconciliationFinish ServiceReconciliationStage = "finish" +) + +type ServiceReconciliationOption func(*serviceReconciliationOptions) + +type serviceReconciliationOptions struct { + err error + startedAt *time.Time + stage *ServiceReconciliationStage +} + +type ContextKey string + +const ( + ContextKeyTimeStart ContextKey = "time_start" ) type Recorder interface { DiscoveryAPICacheRefresh(err error) - ServiceReconciliation(serviceID, serviceName string, err error) + 
ServiceReconciliation(serviceID, serviceName string, options ...ServiceReconciliationOption) + ServiceDeletion(serviceID string) StackRunJobCreation() + ResourceCacheWatchStart(resourceType string) + ResourceCacheWatchEnd(resourceType string) + ResourceCacheHit(serviceID string) + ResourceCacheMiss(serviceID string) } diff --git a/internal/utils/hash.go b/internal/utils/hash.go index 481b5de2..eddaa8b7 100644 --- a/internal/utils/hash.go +++ b/internal/utils/hash.go @@ -14,3 +14,8 @@ func HashObject(any interface{}) (string, error) { sha := sha256.Sum256(out) return base32.StdEncoding.EncodeToString(sha[:]), nil } + +func HashString(s string) string { + sha := sha256.Sum256([]byte(s)) + return base32.StdEncoding.EncodeToString(sha[:]) +} diff --git a/pkg/applier/builder.go b/pkg/applier/builder.go index e572214d..bd7b280f 100644 --- a/pkg/applier/builder.go +++ b/pkg/applier/builder.go @@ -7,6 +7,7 @@ import ( "errors" "fmt" + "github.com/samber/lo" "k8s.io/apimachinery/pkg/api/meta" "k8s.io/cli-runtime/pkg/resource" "k8s.io/client-go/discovery" @@ -16,7 +17,10 @@ import ( "sigs.k8s.io/cli-utils/pkg/apply/info" "sigs.k8s.io/cli-utils/pkg/apply/prune" "sigs.k8s.io/cli-utils/pkg/inventory" - "sigs.k8s.io/cli-utils/pkg/kstatus/watcher" + kwatcher "sigs.k8s.io/cli-utils/pkg/kstatus/watcher" + + "github.com/pluralsh/deployment-operator/internal/kstatus/watcher" + "github.com/pluralsh/deployment-operator/pkg/common" ) type commonBuilder struct { @@ -28,7 +32,7 @@ type commonBuilder struct { mapper meta.RESTMapper restConfig *rest.Config unstructuredClientForMapping func(*meta.RESTMapping) (resource.RESTClient, error) - statusWatcher watcher.StatusWatcher + statusWatcher kwatcher.StatusWatcher } func (cb *commonBuilder) finalize() (*commonBuilder, error) { @@ -80,7 +84,13 @@ func (cb *commonBuilder) finalize() (*commonBuilder, error) { cx.unstructuredClientForMapping = cx.factory.UnstructuredClientForMapping } if cx.statusWatcher == nil { - cx.statusWatcher = watcher.NewDefaultStatusWatcher(cx.client, cx.mapper) + cx.statusWatcher = watcher.NewDynamicStatusWatcher(cx.client, cx.mapper, watcher.Options{ + RESTScopeStrategy: lo.ToPtr(kwatcher.RESTScopeRoot), + Filters: &kwatcher.Filters{ + Labels: common.ManagedByAgentLabelSelector(), + Fields: nil, + }, + }) } return &cx, nil } @@ -151,7 +161,7 @@ func (b *ApplierBuilder) WithUnstructuredClientForMapping(unstructuredClientForM return b } -func (b *ApplierBuilder) WithStatusWatcher(statusWatcher watcher.StatusWatcher) *ApplierBuilder { +func (b *ApplierBuilder) WithStatusWatcher(statusWatcher kwatcher.StatusWatcher) *ApplierBuilder { b.statusWatcher = statusWatcher return b } diff --git a/pkg/applier/filters/cache_filter.go b/pkg/applier/filters/cache_filter.go new file mode 100644 index 00000000..946c3234 --- /dev/null +++ b/pkg/applier/filters/cache_filter.go @@ -0,0 +1,49 @@ +package filters + +import ( + "fmt" + + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "sigs.k8s.io/cli-utils/pkg/inventory" + + "github.com/pluralsh/deployment-operator/internal/metrics" + "github.com/pluralsh/deployment-operator/pkg/cache" +) + +type CacheFilter struct { +} + +// Name returns a filter identifier for logging. 
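+
// Illustrative sketch, not part of the patch: how a caller might use the extended
// Recorder interface and the functional options introduced above to time a reconcile.
// The helper name and wiring are hypothetical; metrics.Record(), the With* options and
// the stage constants are the ones defined in this change. Assumes "time" and the
// internal/metrics package are imported.
func exampleRecordReconcile(serviceID, serviceName string, reconcile func() error) {
	start := time.Now()
	metrics.Record().ServiceReconciliation(
		serviceID, serviceName,
		metrics.WithServiceReconciliationStartedAt(start),
		metrics.WithServiceReconciliationStage(metrics.ServiceReconciliationStart),
	)

	err := reconcile()

	// If the error option carries a non-nil error, only the error counter is
	// incremented; duration and total counters are updated on the success path.
	metrics.Record().ServiceReconciliation(
		serviceID, serviceName,
		metrics.WithServiceReconciliationError(err),
		metrics.WithServiceReconciliationStartedAt(start),
		metrics.WithServiceReconciliationStage(metrics.ServiceReconciliationFinish),
	)
}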
+func (c CacheFilter) Name() string { + return "CacheFilter" +} + +func (c CacheFilter) Filter(obj *unstructured.Unstructured) error { + serviceID := c.serviceID(obj) + newManifestSHA, err := cache.HashResource(*obj) + if err != nil { + // TODO log error + return nil + } + + key := cache.ResourceKeyFromUnstructured(obj) + sha, exists := cache.GetResourceCache().GetCacheEntry(key.ObjectIdentifier()) + if exists && !sha.RequiresApply(newManifestSHA) { + metrics.Record().ResourceCacheHit(serviceID) + return fmt.Errorf("skipping cached object %s", key.ObjectIdentifier()) + } + + metrics.Record().ResourceCacheMiss(serviceID) + sha.SetManifestSHA(newManifestSHA) + cache.GetResourceCache().SetCacheEntry(key.ObjectIdentifier(), sha) + + return nil +} + +func (c CacheFilter) serviceID(obj *unstructured.Unstructured) string { + if annotations := obj.GetAnnotations(); annotations != nil { + return annotations[inventory.OwningInventoryKey] + } + + return "" +} diff --git a/pkg/applier/runner.go b/pkg/applier/runner.go index fbd50f82..5a259c02 100644 --- a/pkg/applier/runner.go +++ b/pkg/applier/runner.go @@ -66,18 +66,13 @@ func (a *Applier) Run(ctx context.Context, invInfo inventory.Info, objects objec // Fetch the queue (channel) of tasks that should be executed. // Build list of apply validation filters. applyFilters := []filter.ValidationFilter{ + filters.CacheFilter{}, filter.InventoryPolicyApplyFilter{ Client: a.client, Mapper: a.mapper, Inv: invInfo, InvPolicy: options.InventoryPolicy, }, - filters.CrdFilter{ - Client: a.client, - Mapper: a.mapper, - Inv: invInfo, - InvPolicy: options.InventoryPolicy, - }, filters.DependencyFilter{ TaskContext: taskContext, ActuationStrategy: actuation.ActuationStrategyApply, diff --git a/pkg/cache/cache.go b/pkg/cache/cache.go new file mode 100644 index 00000000..ab1f3e56 --- /dev/null +++ b/pkg/cache/cache.go @@ -0,0 +1,68 @@ +package cache + +import ( + "context" + "time" + + cmap "github.com/orcaman/concurrent-map/v2" + "github.com/samber/lo" +) + +type Expirable interface { + Expire() +} + +type cacheLine[T Expirable] struct { + resource T + created time.Time +} + +func (l *cacheLine[_]) alive(ttl time.Duration) bool { + return l.created.After(time.Now().Add(-ttl)) +} + +type Cache[T Expirable] struct { + cache cmap.ConcurrentMap[string, cacheLine[T]] + ttl time.Duration + ctx context.Context +} + +func NewCache[T Expirable](ctx context.Context, ttl time.Duration) *Cache[T] { + return &Cache[T]{ + cache: cmap.New[cacheLine[T]](), + ttl: ttl, + ctx: ctx, + } +} + +func (c *Cache[T]) Get(key string) (T, bool) { + data, exists := c.cache.Get(key) + if !exists { + return lo.Empty[T](), false + } + + if !data.alive(c.ttl) { + c.Expire(key) + } + + return data.resource, true +} + +func (c *Cache[T]) Set(key string, value T) { + c.cache.Set(key, cacheLine[T]{resource: value, created: time.Now()}) +} + +func (c *Cache[T]) Wipe() { + c.cache.Clear() +} + +func (c *Cache[T]) Expire(key string) { + expirable, exists := c.cache.Get(key) + if !exists { + return + } + + expirable.resource.Expire() + expirable.created = time.Now() + c.cache.Set(key, expirable) +} diff --git a/pkg/cache/common.go b/pkg/cache/common.go new file mode 100644 index 00000000..978e3ef6 --- /dev/null +++ b/pkg/cache/common.go @@ -0,0 +1,40 @@ +package cache + +import ( + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + + "github.com/pluralsh/deployment-operator/internal/utils" +) + +// shaObject is a helper structure that represents a resource used to calculate SHA. 
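+
// Illustrative sketch, not part of the patch: using the generic TTL cache above with a
// minimal Expirable implementation. The example types and names are hypothetical;
// NewCache, Set, Get and Expire are the APIs defined in this change. Assumes "context",
// "time" and the pkg/cache package are imported.
type exampleEntry struct {
	value string
}

// Expire implements Expirable; the cache invokes it once the TTL has passed instead of
// dropping the key (for ResourceCacheEntry this clears the manifest/apply SHAs only).
func (e *exampleEntry) Expire() { e.value = "" }

func exampleCacheUsage(ctx context.Context) {
	c := cache.NewCache[*exampleEntry](ctx, time.Minute)
	c.Set("svc-1", &exampleEntry{value: "abc"})

	// Get still returns the entry after the TTL; expiry only triggers the Expire() hook.
	if entry, ok := c.Get("svc-1"); ok {
		_ = entry.value
	}
}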
+type shaObject struct { + Name string `json:"name"` + Namespace string `json:"namespace"` + Labels map[string]string `json:"labels"` + Annotations map[string]string `json:"annotations"` + DeletionTimestamp string `json:"deletionTimestamp"` + Other map[string]any `json:"other"` +} + +// HashResource calculates SHA for an unstructured object. +// It uses object metadata (name, namespace, labels, annotations, deletion timestamp) +// and all other top-level fields except status. +func HashResource(resource unstructured.Unstructured) (string, error) { + resourceCopy := resource.DeepCopy() + object := shaObject{ + Name: resourceCopy.GetName(), + Namespace: resourceCopy.GetNamespace(), + Labels: resourceCopy.GetLabels(), + Annotations: resourceCopy.GetAnnotations(), + } + + if resourceCopy.GetDeletionTimestamp() != nil { + object.DeletionTimestamp = resourceCopy.GetDeletionTimestamp().String() + } + + unstructured.RemoveNestedField(resourceCopy.Object, "metadata") + unstructured.RemoveNestedField(resourceCopy.Object, "status") + object.Other = resourceCopy.Object + + return utils.HashObject(object) +} diff --git a/pkg/cache/resource_cache.go b/pkg/cache/resource_cache.go new file mode 100644 index 00000000..0d3985b4 --- /dev/null +++ b/pkg/cache/resource_cache.go @@ -0,0 +1,294 @@ +package cache + +import ( + "context" + "fmt" + "os" + "time" + + console "github.com/pluralsh/console-client-go" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/klog/v2" + "sigs.k8s.io/cli-utils/pkg/kstatus/polling/clusterreader" + "sigs.k8s.io/cli-utils/pkg/kstatus/polling/statusreaders" + + "github.com/pluralsh/deployment-operator/pkg/manifests" + + "github.com/pluralsh/polly/containers" + "github.com/samber/lo" + "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/client-go/dynamic" + "k8s.io/client-go/rest" + applyevent "sigs.k8s.io/cli-utils/pkg/apply/event" + "sigs.k8s.io/cli-utils/pkg/kstatus/polling/event" + "sigs.k8s.io/cli-utils/pkg/kstatus/status" + kwatcher "sigs.k8s.io/cli-utils/pkg/kstatus/watcher" + "sigs.k8s.io/cli-utils/pkg/object" + + "github.com/pluralsh/deployment-operator/internal/kstatus/watcher" + "github.com/pluralsh/deployment-operator/internal/utils" + "github.com/pluralsh/deployment-operator/pkg/common" + "github.com/pluralsh/deployment-operator/pkg/log" +) + +// ResourceCache is responsible for creating a global resource cache of the +// inventory items registered via [ResourceCache.Register] method. In particular, it +// does: +// - starts unique watches per resource type, watching resource in all namespaces. +// In order to optimize the number of resources being watched, it uses server-side +// filtering by label and only watches for resources with specific label. Only +// registered resource types will be watched. +// - creates a cache based on watched resources that maps [ResourceKey] to [ResourceCacheEntry]. +// It stores information about latest SHAs calculated during a different reconcile stages as well +// as simplified resource status. [ServerSHA] is always calculated based on watch events. All other +// types of SHA ([ManifestSHA], [ApplySHA]) are updated during service reconciliation using [SaveResourceSHA]. +// +// TODO: Allow stopping opened watches if any unique resource type gets removed from inventory. +type ResourceCache struct { + // ctx can be used to stop all tasks running in background. + ctx context.Context + + // dynamicClient is required to list/watch resources. 
+ dynamicClient dynamic.Interface + + // mapper helps with extraction of i.e. version based on the group and kind only. + mapper meta.RESTMapper + + // cache is the main resource cache + cache *Cache[*ResourceCacheEntry] + + // resourceKeySet stores all registered [ResourceKey] that should be watched. + // It still contains detailed resource information such as Group/Kind/Name/Namespace, + // allowing us to uniquely identify resources when creating watches. + resourceKeySet containers.Set[ResourceKey] + + // watcher is a cli-utils [kwatcher.StatusWatcher] interface. + // We are using a custom implementation that allows us to better + // control the lifecycle of opened watches and is using RetryListWatcher + // instead of informers to minimize the memory footprint. + watcher kwatcher.StatusWatcher +} + +var ( + resourceCache *ResourceCache + initialized = false +) + +// Init must be executed early in [main] in order to ensure that the +// [ResourceCache] will be initialized properly during the application +// startup. +func Init(ctx context.Context, config *rest.Config, ttl time.Duration) { + dynamicClient, err := dynamic.NewForConfig(config) + if err != nil { + log.Logger.Error(err, "unable to create dynamic client") + os.Exit(1) + } + + f := utils.NewFactory(config) + mapper, err := f.ToRESTMapper() + if err != nil { + log.Logger.Error(err, "unable to create rest mapper") + os.Exit(1) + } + + w := watcher.NewDynamicStatusWatcher(dynamicClient, mapper, watcher.Options{ + UseCustomObjectFilter: true, + ObjectFilter: nil, + UseInformerRefCache: true, + RESTScopeStrategy: lo.ToPtr(kwatcher.RESTScopeRoot), + Filters: &kwatcher.Filters{ + Labels: common.ManagedByAgentLabelSelector(), + Fields: nil, + }, + }) + + resourceCache = &ResourceCache{ + ctx: ctx, + dynamicClient: dynamicClient, + mapper: mapper, + cache: NewCache[*ResourceCacheEntry](ctx, ttl), + resourceKeySet: containers.NewSet[ResourceKey](), + watcher: w, + } + + initialized = true +} + +// GetResourceCache returns an instance of [ResourceCache]. It can +// be accessed outside this package only via this getter. +func GetResourceCache() *ResourceCache { + return resourceCache +} + +// GetCacheEntry returns a [ResourceCacheEntry] and an information if it exists. +func (in *ResourceCache) GetCacheEntry(key string) (ResourceCacheEntry, bool) { + if !initialized { + klog.V(4).Info("resource cache not initialized") + return ResourceCacheEntry{}, false + } + + if sha, exists := in.cache.Get(key); exists && sha != nil { + return *sha, true + } + + return ResourceCacheEntry{}, false +} + +// SetCacheEntry updates cache key with the provided value of [ResourceCacheEntry]. +func (in *ResourceCache) SetCacheEntry(key string, value ResourceCacheEntry) { + if !initialized { + klog.V(4).Info("resource cache not initialized") + return + } + + in.cache.Set(key, &value) +} + +// Register updates watched resources. It uses a set to ensure that only unique resources +// are stored. It only supports registering new resources that are not currently being watched. +// If empty set is provided, it won't do anything. 
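+
// Illustrative sketch, not part of the patch: wiring the resource cache at startup and
// registering inventory keys so their resource types get watched. The function name,
// the 30-minute TTL and the objs argument are hypothetical; Init, Register and
// ResourceKeyFromUnstructured are defined in this change. Assumes client-go rest,
// apimachinery unstructured and the polly containers/algorithms packages are imported.
func exampleRegisterInventory(ctx context.Context, config *rest.Config, objs []*unstructured.Unstructured) {
	// Init must run early in main so GetResourceCache() returns a usable instance.
	cache.Init(ctx, config, 30*time.Minute)

	keys := containers.ToSet(algorithms.Map(objs, cache.ResourceKeyFromUnstructured))

	// Register is additive: only resource types that are not watched yet open new watches.
	cache.GetResourceCache().Register(keys)
}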
+func (in *ResourceCache) Register(inventoryResourceKeys containers.Set[ResourceKey]) { + if !initialized { + klog.V(4).Info("resource cache not initialized") + return + } + + toAdd := inventoryResourceKeys.Difference(in.resourceKeySet) + + if len(toAdd) > 0 { + in.resourceKeySet = containers.ToSet(append(in.resourceKeySet.List(), inventoryResourceKeys.List()...)) + in.watch() + } +} + +// SaveResourceSHA allows updating specific SHA type based on the provided resource. It will +// calculate the SHA and then update cache. +func SaveResourceSHA(resource *unstructured.Unstructured, shaType SHAType) { + if !initialized { + klog.V(4).Info("resource cache not initialized") + return + } + + key := object.UnstructuredToObjMetadata(resource).String() + sha, _ := resourceCache.GetCacheEntry(key) + if err := sha.SetSHA(*resource, shaType); err == nil { + resourceCache.SetCacheEntry(key, sha) + } +} + +// GetCacheStatus returns cached status based on the provided key. If no status is found in cache, +// it will make an API call, fetch the latest resource and extract the status. +func (in *ResourceCache) GetCacheStatus(key object.ObjMetadata) (*console.ComponentAttributes, error) { + if !initialized { + return nil, fmt.Errorf("resource cache not initialized") + } + + entry, exists := in.cache.Get(key.String()) + if exists && entry.status != nil { + return entry.status, nil + } + + mapping, err := in.mapper.RESTMapping(key.GroupKind) + if err != nil { + return nil, err + } + + gvr := watcher.GvrFromGvk(mapping.GroupVersionKind) + obj, err := in.dynamicClient.Resource(gvr).Namespace(key.Namespace).Get(context.Background(), key.Name, metav1.GetOptions{}) + if err != nil { + return nil, err + } + + s, err := in.toStatusEvent(obj) + if err != nil { + return nil, err + } + in.saveResourceStatus(obj) + return common.StatusEventToComponentAttributes(*s, make(map[manifests.GroupName]string)), nil +} + +func (in *ResourceCache) saveResourceStatus(resource *unstructured.Unstructured) { + e, err := in.toStatusEvent(resource) + if err != nil { + log.Logger.Error(err, "unable to convert resource to status event") + return + } + + key := object.UnstructuredToObjMetadata(resource).String() + cacheEntry, _ := resourceCache.GetCacheEntry(key) + cacheEntry.SetStatus(*e) + resourceCache.SetCacheEntry(key, cacheEntry) + +} + +func (in *ResourceCache) watch() { + objMetadataSet := ResourceKeys(in.resourceKeySet.List()).ObjectMetadataSet() + ch := in.watcher.Watch(in.ctx, objMetadataSet, kwatcher.Options{}) + + go func() { + for { + select { + case <-in.ctx.Done(): + if in.ctx.Err() != nil { + log.Logger.Errorf("status watcher context error %v", in.ctx.Err()) + } + return + case e, ok := <-ch: + if !ok { + log.Logger.Error("status watcher event channel closed") + in.watch() + return + } + in.reconcile(e) + } + } + }() +} + +func (in *ResourceCache) reconcile(e event.Event) { + if e.Type != event.ResourceUpdateEvent { + return + } + + if !in.shouldCacheResource(e.Resource) { + in.deleteCacheEntry(e.Resource) + return + } + + SaveResourceSHA(e.Resource.Resource, ServerSHA) + in.saveResourceStatus(e.Resource.Resource) +} + +func (in *ResourceCache) shouldCacheResource(r *event.ResourceStatus) bool { + if r == nil { + return false + } + + return r.Resource != nil && (r.Status == status.CurrentStatus || r.Status == status.InProgressStatus) +} + +func (in *ResourceCache) deleteCacheEntry(r *event.ResourceStatus) { + if r == nil { + return + } + + in.cache.Expire(r.Identifier.String()) +} + +func (in *ResourceCache) 
toStatusEvent(resource *unstructured.Unstructured) (*applyevent.StatusEvent, error) { + sr := statusreaders.NewDefaultStatusReader(in.mapper) + cr := &clusterreader.DynamicClusterReader{ + DynamicClient: in.dynamicClient, + Mapper: in.mapper, + } + s, err := sr.ReadStatusForObject(context.Background(), cr, resource) + if err != nil { + return nil, err + } + return &applyevent.StatusEvent{ + Identifier: ResourceKeyFromUnstructured(resource).ObjMetadata(), + PollResourceInfo: s, + Resource: resource, + }, nil +} diff --git a/pkg/cache/resource_cache_entry.go b/pkg/cache/resource_cache_entry.go new file mode 100644 index 00000000..77f97455 --- /dev/null +++ b/pkg/cache/resource_cache_entry.go @@ -0,0 +1,84 @@ +package cache + +import ( + console "github.com/pluralsh/console-client-go" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "sigs.k8s.io/cli-utils/pkg/apply/event" + + "github.com/pluralsh/deployment-operator/pkg/common" + "github.com/pluralsh/deployment-operator/pkg/manifests" +) + +type SHAType string + +const ( + ManifestSHA SHAType = "MANIFEST" + ApplySHA SHAType = "APPLY" + ServerSHA SHAType = "SERVER" +) + +// ResourceCacheEntry contains latest SHAs for a single resource from multiple stages +// as well as the last seen status of the resource. +type ResourceCacheEntry struct { + // manifestSHA is SHA of the resource manifest from the repository. + manifestSHA *string + + // applySHA is SHA of the resource post-server-side apply. + // Taking only metadata w/ name, namespace, annotations and labels and non-status non-metadata fields. + applySHA *string + + // serverSHA is SHA from a watch of the resource, using the same pruning function as applySHA. + // It is persisted only if there's a current-inventory annotation. + serverSHA *string + + // status is a simplified Console structure containing last seen status of cache resource. + status *console.ComponentAttributes +} + +// Expire implements [Expirable] interface. +func (in *ResourceCacheEntry) Expire() { + in.manifestSHA = nil + in.applySHA = nil +} + +// SetSHA updates shaType with SHA calculated based on the provided resource. +func (in *ResourceCacheEntry) SetSHA(resource unstructured.Unstructured, shaType SHAType) error { + sha, err := HashResource(resource) + if err != nil { + return err + } + + switch shaType { + case ManifestSHA: + in.manifestSHA = &sha + case ApplySHA: + in.applySHA = &sha + case ServerSHA: + in.serverSHA = &sha + } + + return nil +} + +// SetManifestSHA updates manifest SHA. +func (in *ResourceCacheEntry) SetManifestSHA(manifestSHA string) { + in.manifestSHA = &manifestSHA +} + +// RequiresApply checks if there is any drift +// between applySHA calculated during applying resource and serverSHA from a watch of a resource +// or between last two manifestSHA read from the repository. +// If any drift is detected, then server-side apply should be done. +func (in *ResourceCacheEntry) RequiresApply(manifestSHA string) bool { + return in.serverSHA == nil || + in.applySHA == nil || + in.manifestSHA == nil || + (*in.serverSHA != *in.applySHA) || + (manifestSHA != *in.manifestSHA) +} + +// SetStatus saves the last seen resource [event.StatusEvent] and converts it to a simpler +// [console.ComponentAttributes] structure. 
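+
// Illustrative sketch, not part of the patch: how the three SHAs drive the skip
// decision. A fresh entry always requires an apply; once manifest, apply and server
// SHAs agree, an identical manifest is filtered out until the watch or the repository
// reports drift. Names here are hypothetical; the methods are the ones defined above.
func exampleRequiresApply(obj unstructured.Unstructured) {
	var entry cache.ResourceCacheEntry

	manifestSHA, err := cache.HashResource(obj)
	if err != nil {
		return
	}

	_ = entry.RequiresApply(manifestSHA) // true: no SHAs recorded yet

	entry.SetManifestSHA(manifestSHA)
	_ = entry.SetSHA(obj, cache.ApplySHA)  // normally recorded from the ApplySuccessful event
	_ = entry.SetSHA(obj, cache.ServerSHA) // normally recorded from the watch stream

	_ = entry.RequiresApply(manifestSHA) // false: all three SHAs match
}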
+func (in *ResourceCacheEntry) SetStatus(se event.StatusEvent) { + in.status = common.StatusEventToComponentAttributes(se, make(map[manifests.GroupName]string)) +} diff --git a/pkg/cache/resource_key.go b/pkg/cache/resource_key.go new file mode 100644 index 00000000..e5d4f9c7 --- /dev/null +++ b/pkg/cache/resource_key.go @@ -0,0 +1,70 @@ +package cache + +import ( + "slices" + + "github.com/pluralsh/polly/algorithms" + "github.com/pluralsh/polly/containers" + "github.com/samber/lo" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "sigs.k8s.io/cli-utils/pkg/object" +) + +const resourceKeyPlaceholder = "*" + +type ResourceKey object.ObjMetadata + +func (in ResourceKey) ObjMetadata() object.ObjMetadata { + return object.ObjMetadata(in) +} + +// TypeIdentifier returns type-only representation of ResourceKey. +// Name and namespace are replaced with placeholders as they cannot be empty. +func (in ResourceKey) TypeIdentifier() ResourceKey { + in.Name = resourceKeyPlaceholder + in.Namespace = resourceKeyPlaceholder + + return in +} + +// ObjectIdentifier returns a string representation of [object.ObjMetadata]. +func (in ResourceKey) ObjectIdentifier() string { + return in.ObjMetadata().String() +} + +type ResourceKeys []ResourceKey + +func (in ResourceKeys) TypeIdentifierSet() containers.Set[ResourceKey] { + return containers.ToSet(algorithms.Map(in, func(obj ResourceKey) ResourceKey { + return obj.TypeIdentifier() + })) +} + +func (in ResourceKeys) ObjectMetadataSet() object.ObjMetadataSet { + return algorithms.Map(in, func(r ResourceKey) object.ObjMetadata { + return r.ObjMetadata() + }) +} + +// InventoryResourceKeys maps cli-utils inventory ID to ResourceKeys. +type InventoryResourceKeys map[string]ResourceKeys + +func (in InventoryResourceKeys) Values() ResourceKeys { + return slices.Concat(lo.Values(in)...) 
+} + +func ResourceKeyFromObjMetadata(set object.ObjMetadataSet) ResourceKeys { + return algorithms.Map(set, func(obj object.ObjMetadata) ResourceKey { return ResourceKey(obj) }) +} + +func ResourceKeyFromUnstructured(obj *unstructured.Unstructured) ResourceKey { + if obj == nil { + return ResourceKey(object.NilObjMetadata) + } + return ResourceKey(object.UnstructuredToObjMetadata(obj)) +} + +func ResourceKeyFromString(key string) (ResourceKey, error) { + objMetadata, err := object.ParseObjMetadata(key) + return ResourceKey(objMetadata), err +} diff --git a/pkg/common/common.go b/pkg/common/common.go new file mode 100644 index 00000000..89456c96 --- /dev/null +++ b/pkg/common/common.go @@ -0,0 +1,25 @@ +package common + +import ( + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime" +) + +const ( + ManagedByLabel = "plural.sh/managed-by" + AgentLabelValue = "agent" +) + +func ManagedByAgentLabelSelector() labels.Selector { + return labels.SelectorFromSet(map[string]string{ManagedByLabel: AgentLabelValue}) +} + +func ToUnstructured(obj runtime.Object) (*unstructured.Unstructured, error) { + objMap, err := runtime.DefaultUnstructuredConverter.ToUnstructured(obj) + if err != nil { + return nil, err + } + + return &unstructured.Unstructured{Object: objMap}, nil +} diff --git a/pkg/controller/service/health.go b/pkg/common/health.go similarity index 91% rename from pkg/controller/service/health.go rename to pkg/common/health.go index 247072ea..fb67a874 100644 --- a/pkg/controller/service/health.go +++ b/pkg/common/health.go @@ -1,13 +1,13 @@ -package service +package common import ( "encoding/json" "fmt" "strings" + "github.com/argoproj/argo-rollouts/pkg/apis/rollouts" rolloutv1alpha1 "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" flaggerv1beta1 "github.com/fluxcd/flagger/pkg/apis/flagger/v1beta1" - "github.com/pluralsh/deployment-operator/pkg/lua" appsv1 "k8s.io/api/apps/v1" autoscalingv1 "k8s.io/api/autoscaling/v1" autoscalingv2 "k8s.io/api/autoscaling/v2" @@ -19,9 +19,31 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/kubectl/pkg/util/podutils" ) +const ( + // Indicates that health assessment failed and actual health status is unknown + HealthStatusUnknown HealthStatusCode = "Unknown" + // Progressing health status means that resource is not healthy but still have a chance to reach healthy state + HealthStatusProgressing HealthStatusCode = "Progressing" + // Resource is 100% healthy + HealthStatusHealthy HealthStatusCode = "Healthy" + // Assigned to resources that are suspended or paused. The typical example is a + // [suspended](https://kubernetes.io/docs/tasks/job/automated-tasks-with-cron-jobs/#suspend) CronJob. + HealthStatusSuspended HealthStatusCode = "Suspended" + HealthStatusPaused HealthStatusCode = "Paused" + // Degrade status is used if resource status indicates failure or resource could not reach healthy state + // within some timeout. + HealthStatusDegraded HealthStatusCode = "Degraded" + // Indicates that resource is missing in the cluster. 
+ HealthStatusMissing HealthStatusCode = "Missing" +) + +// Represents resource health status +type HealthStatusCode string + const ( SecretKind = "Secret" ServiceKind = "Service" @@ -772,7 +794,58 @@ type Status struct { const readyCondition = "Ready" -func getOtherHealth(obj *unstructured.Unstructured) (*HealthStatus, error) { +func GetHealthCheckFuncByGroupVersionKind(gvk schema.GroupVersionKind) func(obj *unstructured.Unstructured) (*HealthStatus, error) { + switch gvk.Group { + case "apps": + switch gvk.Kind { + case DeploymentKind: + return getDeploymentHealth + case StatefulSetKind: + return getStatefulSetHealth + case ReplicaSetKind: + return getReplicaSetHealth + case DaemonSetKind: + return getDaemonSetHealth + } + case "extensions": + if gvk.Kind == IngressKind { + return getIngressHealth + } + case "networking.k8s.io": + if gvk.Kind == IngressKind { + return getIngressHealth + } + case "": + switch gvk.Kind { + case ServiceKind: + return getServiceHealth + case PersistentVolumeClaimKind: + return getPVCHealth + case PodKind: + return getPodHealth + } + case "batch": + if gvk.Kind == JobKind { + return getJobHealth + } + case "flagger.app": + if gvk.Kind == CanaryKind { + return getCanaryHealth + } + case rollouts.Group: + if gvk.Kind == rollouts.RolloutKind { + return getArgoRolloutHealth + } + case "autoscaling": + if gvk.Kind == HorizontalPodAutoscalerKind { + return getHPAHealth + } + } + + return nil +} + +func GetOtherHealthStatus(obj *unstructured.Unstructured) (*HealthStatus, error) { sts := Status{} status, ok := obj.Object["status"] if ok { @@ -795,18 +868,3 @@ func getOtherHealth(obj *unstructured.Unstructured) (*HealthStatus, error) { return nil, nil } - -func (s *ServiceReconciler) getLuaHealthConvert(obj *unstructured.Unstructured) (*HealthStatus, error) { - out, err := lua.ExecuteLua(obj.Object, s.LuaScript) - if err != nil { - return nil, err - } - healthStatus := &HealthStatus{} - if err := runtime.DefaultUnstructuredConverter.FromUnstructured(out, healthStatus); err != nil { - return nil, err - } - if healthStatus.Status == "" && healthStatus.Message == "" { - return nil, nil - } - return healthStatus, nil -} diff --git a/pkg/common/lua.go b/pkg/common/lua.go new file mode 100644 index 00000000..cba3809b --- /dev/null +++ b/pkg/common/lua.go @@ -0,0 +1,60 @@ +package common + +import ( + "sync" + + "github.com/pluralsh/deployment-operator/pkg/lua" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" +) + +func init() { + luaScript = &LuaScript{} +} + +var luaScript *LuaScript + +// LuaScript is a thread-safe structure for string manipulation +type LuaScript struct { + mu sync.RWMutex + value string +} + +func GetLuaScript() *LuaScript { + return luaScript +} + +// SetValue sets the value of the string in a thread-safe manner +func (s *LuaScript) SetValue(val string) { + s.mu.Lock() + defer s.mu.Unlock() + s.value = val +} + +// GetValue retrieves the value of the string in a thread-safe manner +func (s *LuaScript) GetValue() string { + s.mu.RLock() + defer s.mu.RUnlock() + return s.value +} + +func (s *LuaScript) IsLuaScriptValue() bool { + s.mu.RLock() + defer s.mu.RUnlock() + return s.value != "" +} + +func GetLuaHealthConvert(obj *unstructured.Unstructured, luaScript string) (*HealthStatus, error) { + out, err := lua.ExecuteLua(obj.Object, luaScript) + if err != nil { + return nil, err + } + healthStatus := &HealthStatus{} + if err := runtime.DefaultUnstructuredConverter.FromUnstructured(out, healthStatus); err != nil { + 
return nil, err + } + if healthStatus.Status == "" && healthStatus.Message == "" { + return nil, nil + } + return healthStatus, nil +} diff --git a/pkg/common/status.go b/pkg/common/status.go new file mode 100644 index 00000000..6aff5ddf --- /dev/null +++ b/pkg/common/status.go @@ -0,0 +1,109 @@ +package common + +import ( + console "github.com/pluralsh/console-client-go" + dlog "github.com/pluralsh/deployment-operator/pkg/log" + "github.com/pluralsh/deployment-operator/pkg/manifests" + "github.com/samber/lo" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "sigs.k8s.io/cli-utils/pkg/apply/event" + "sigs.k8s.io/cli-utils/pkg/kstatus/status" +) + +func StatusEventToComponentAttributes(e event.StatusEvent, vcache map[manifests.GroupName]string) *console.ComponentAttributes { + if e.Resource == nil { + return nil + } + gvk := e.Resource.GroupVersionKind() + gname := manifests.GroupName{ + Group: gvk.Group, + Kind: gvk.Kind, + Name: e.Resource.GetName(), + } + + version := gvk.Version + if v, ok := vcache[gname]; ok { + version = v + } + + synced := e.PollResourceInfo.Status == status.CurrentStatus + + if e.PollResourceInfo.Status == status.UnknownStatus { + if ToStatus(e.Resource) != nil { + synced = *ToStatus(e.Resource) == console.ComponentStateRunning + } + } + return &console.ComponentAttributes{ + Group: gvk.Group, + Kind: gvk.Kind, + Namespace: e.Resource.GetNamespace(), + Name: e.Resource.GetName(), + Version: version, + Synced: synced, + State: ToStatus(e.Resource), + } +} + +func ToStatus(obj *unstructured.Unstructured) *console.ComponentState { + h, err := GetResourceHealth(obj) + if err != nil { + dlog.Logger.Error(err, "Failed to get resource health status", "name", obj.GetName(), "namespace", obj.GetNamespace()) + } + if h == nil { + return nil + } + + if h.Status == HealthStatusDegraded { + return lo.ToPtr(console.ComponentStateFailed) + } + + if h.Status == HealthStatusHealthy { + return lo.ToPtr(console.ComponentStateRunning) + } + + if h.Status == HealthStatusPaused { + return lo.ToPtr(console.ComponentStatePaused) + } + + return lo.ToPtr(console.ComponentStatePending) +} + +// GetResourceHealth returns the health of a k8s resource +func GetResourceHealth(obj *unstructured.Unstructured) (health *HealthStatus, err error) { + if obj.GetDeletionTimestamp() != nil { + return &HealthStatus{ + Status: HealthStatusProgressing, + Message: "Pending deletion", + }, nil + } + + if healthCheck := GetHealthCheckFunc(obj.GroupVersionKind()); healthCheck != nil { + if health, err = healthCheck(obj); err != nil { + health = &HealthStatus{ + Status: HealthStatusUnknown, + Message: err.Error(), + } + } + } + return health, err + +} + +// GetHealthCheckFunc returns built-in health check function or nil if health check is not supported +func GetHealthCheckFunc(gvk schema.GroupVersionKind) func(obj *unstructured.Unstructured) (*HealthStatus, error) { + + if healthFunc := GetHealthCheckFuncByGroupVersionKind(gvk); healthFunc != nil { + return healthFunc + } + + if GetLuaScript().IsLuaScriptValue() { + return getLuaHealthConvert + } + + return GetOtherHealthStatus +} + +func getLuaHealthConvert(obj *unstructured.Unstructured) (*HealthStatus, error) { + return GetLuaHealthConvert(obj, GetLuaScript().GetValue()) +} diff --git a/pkg/controller/controller_manager.go b/pkg/controller/controller_manager.go index 9a819d6b..17687364 100644 --- a/pkg/controller/controller_manager.go +++ b/pkg/controller/controller_manager.go @@ -5,10 +5,11 @@ import ( 
"errors" "time" - "github.com/pluralsh/deployment-operator/pkg/client" - "github.com/pluralsh/deployment-operator/pkg/websocket" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/klog/v2" + + "github.com/pluralsh/deployment-operator/pkg/client" + "github.com/pluralsh/deployment-operator/pkg/websocket" ) type ControllerManager struct { @@ -88,8 +89,7 @@ func (cm *ControllerManager) Start() error { if controllerPollInterval := controller.Do.GetPollInterval(); controllerPollInterval > 0 { pollInterval = controllerPollInterval } - //nolint:all - _ = wait.PollImmediateInfinite(pollInterval, func() (done bool, err error) { + _ = wait.PollUntilContextCancel(context.Background(), pollInterval, true, func(_ context.Context) (done bool, err error) { return controller.Do.Poll(cm.ctx) }) }() @@ -100,8 +100,7 @@ func (cm *ControllerManager) Start() error { } go func() { - //nolint:all - _ = wait.PollImmediateInfinite(cm.Refresh, func() (done bool, err error) { + _ = wait.PollUntilContextCancel(context.Background(), cm.Refresh, true, func(_ context.Context) (done bool, err error) { return false, cm.Socket.Join() }) }() diff --git a/pkg/controller/service/reconciler.go b/pkg/controller/service/reconciler.go index d02363d1..7e245807 100644 --- a/pkg/controller/service/reconciler.go +++ b/pkg/controller/service/reconciler.go @@ -23,6 +23,9 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "github.com/pluralsh/deployment-operator/cmd/agent/args" + agentcommon "github.com/pluralsh/deployment-operator/pkg/common" + clienterrors "github.com/pluralsh/deployment-operator/internal/errors" "github.com/pluralsh/deployment-operator/internal/helpers" "github.com/pluralsh/deployment-operator/internal/metrics" @@ -38,14 +41,6 @@ import ( "github.com/pluralsh/deployment-operator/pkg/websocket" ) -func init() { - Local = false -} - -var ( - Local = false -) - const ( OperatorService = "deploy-operator" RestoreConfigMapName = "restore-config-map" @@ -64,14 +59,14 @@ type ServiceReconciler struct { SvcCache *client.Cache[console.GetServiceDeploymentForAgent_ServiceDeployment] ManifestCache *manifests.ManifestCache UtilFactory util.Factory - LuaScript string RestoreNamespace string discoveryClient *discovery.DiscoveryClient pinger *ping.Pinger + ctx context.Context } -func NewServiceReconciler(ctx context.Context, consoleClient client.Client, config *rest.Config, refresh time.Duration, restoreNamespace string) (*ServiceReconciler, error) { +func NewServiceReconciler(ctx context.Context, consoleClient client.Client, config *rest.Config, refresh, manifestTTL time.Duration, restoreNamespace, consoleURL string) (*ServiceReconciler, error) { logger := log.FromContext(ctx) utils.DisableClientLimits(config) @@ -88,7 +83,7 @@ func NewServiceReconciler(ctx context.Context, consoleClient client.Client, conf svcQueue := workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()) - manifestCache := manifests.NewCache(refresh, deployToken) + manifestCache := manifests.NewCache(manifestTTL, deployToken, consoleURL) f := utils.NewFactory(config) @@ -131,14 +126,13 @@ func NewServiceReconciler(ctx context.Context, consoleClient client.Client, conf discoveryClient: discoveryClient, pinger: ping.New(consoleClient, discoveryClient, f), RestoreNamespace: restoreNamespace, + ctx: ctx, }, nil } func CapabilitiesAPIVersions(discoveryClient *discovery.DiscoveryClient) error { lists, err := discoveryClient.ServerPreferredResources() - if err != nil { - return err - } + for _, list := range 
lists { if len(list.APIResources) == 0 { continue @@ -154,7 +148,7 @@ func CapabilitiesAPIVersions(discoveryClient *discovery.DiscoveryClient) error { template.APIVersions.Set(fmt.Sprintf("%s/%s", gv.String(), resource.Kind), true) } } - return nil + return err } func (s *ServiceReconciler) GetPollInterval() time.Duration { @@ -196,6 +190,12 @@ func newDestroyer(invFactory inventory.ClientFactory, f util.Factory) (*apply.De func postProcess(mans []*unstructured.Unstructured) []*unstructured.Unstructured { return lo.Map(mans, func(man *unstructured.Unstructured, ind int) *unstructured.Unstructured { + labels := man.GetLabels() + if labels == nil { + labels = map[string]string{} + } + labels[agentcommon.ManagedByLabel] = agentcommon.AgentLabelValue + man.SetLabels(labels) if man.GetKind() != "CustomResourceDefinition" { return man } @@ -261,6 +261,12 @@ func (s *ServiceReconciler) Poll(ctx context.Context) (done bool, err error) { return false, nil } for _, svc := range services { + // If services arg is provided, we can skip + // services that are not on the list. + if args.SkipService(svc.Node.ID) { + continue + } + logger.Info("sending update for", "service", svc.Node.ID) s.SvcQueue.Add(svc.Node.ID) } @@ -275,6 +281,9 @@ func (s *ServiceReconciler) Poll(ctx context.Context) (done bool, err error) { } func (s *ServiceReconciler) Reconcile(ctx context.Context, id string) (result reconcile.Result, err error) { + start := time.Now() + ctx = context.WithValue(ctx, metrics.ContextKeyTimeStart, start) + logger := log.FromContext(ctx) logger.Info("attempting to sync service", "id", id) @@ -288,6 +297,13 @@ func (s *ServiceReconciler) Reconcile(ctx context.Context, id string) (result re return } + metrics.Record().ServiceReconciliation( + id, + svc.Name, + metrics.WithServiceReconciliationStartedAt(start), + metrics.WithServiceReconciliationStage(metrics.ServiceReconciliationStart), + ) + defer func() { if err != nil { logger.Error(err, "process item") @@ -296,11 +312,17 @@ func (s *ServiceReconciler) Reconcile(ctx context.Context, id string) (result re } } - metrics.Record().ServiceReconciliation(id, svc.Name, err) + metrics.Record().ServiceReconciliation( + id, + svc.Name, + metrics.WithServiceReconciliationError(err), + metrics.WithServiceReconciliationStartedAt(start), + metrics.WithServiceReconciliationStage(metrics.ServiceReconciliationFinish), + ) }() - logger.V(2).Info("local", "flag", Local) - if Local && svc.Name == OperatorService { + logger.V(2).Info("local", "flag", args.Local()) + if args.Local() && svc.Name == OperatorService { return } @@ -317,6 +339,7 @@ func (s *ServiceReconciler) Reconcile(ctx context.Context, id string) (result re ValidationPolicy: 1, }) + metrics.Record().ServiceDeletion(id) err = s.UpdatePruneStatus(ctx, svc, ch, map[manis.GroupName]string{}) return } @@ -328,7 +351,6 @@ func (s *ServiceReconciler) Reconcile(ctx context.Context, id string) (result re return } manifests = postProcess(manifests) - logger.Info("Syncing manifests", "count", len(manifests)) invObj, manifests, err := s.SplitObjects(id, manifests) if err != nil { @@ -336,9 +358,14 @@ func (s *ServiceReconciler) Reconcile(ctx context.Context, id string) (result re } inv := inventory.WrapInventoryInfoObj(invObj) - vcache := manis.VersionCache(manifests) + metrics.Record().ServiceReconciliation( + id, + svc.Name, + metrics.WithServiceReconciliationStartedAt(start), + metrics.WithServiceReconciliationStage(metrics.ServiceReconciliationPrepareManifestsFinish), + ) - logger.Info("Apply service", "name", 
svc.Name, "namespace", svc.Namespace) + vcache := manis.VersionCache(manifests) if err = s.CheckNamespace(svc.Namespace, svc.SyncConfig); err != nil { logger.Error(err, "failed to check namespace") @@ -434,14 +461,6 @@ func (s *ServiceReconciler) defaultInventoryObjTemplate(id string) *unstructured } } -func (s *ServiceReconciler) GetLuaScript() string { - return s.LuaScript -} - -func (s *ServiceReconciler) SetLuaScript(script string) { - s.LuaScript = script -} - func (s *ServiceReconciler) isClusterRestore(ctx context.Context) (bool, error) { cmr, err := s.Clientset.CoreV1().ConfigMaps(s.RestoreNamespace).Get(ctx, RestoreConfigMapName, metav1.GetOptions{}) if err != nil { diff --git a/pkg/controller/service/reconciler_status.go b/pkg/controller/service/reconciler_status.go index c04bc6bb..9311559d 100644 --- a/pkg/controller/service/reconciler_status.go +++ b/pkg/controller/service/reconciler_status.go @@ -4,137 +4,20 @@ import ( "context" "fmt" "strings" + "time" - "github.com/argoproj/argo-rollouts/pkg/apis/rollouts" console "github.com/pluralsh/console-client-go" - "github.com/pluralsh/deployment-operator/pkg/manifests" - "github.com/samber/lo" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/klog/v2" "sigs.k8s.io/cli-utils/pkg/apply/event" "sigs.k8s.io/cli-utils/pkg/print/stats" "sigs.k8s.io/controller-runtime/pkg/log" -) -const ( - // Indicates that health assessment failed and actual health status is unknown - HealthStatusUnknown HealthStatusCode = "Unknown" - // Progressing health status means that resource is not healthy but still have a chance to reach healthy state - HealthStatusProgressing HealthStatusCode = "Progressing" - // Resource is 100% healthy - HealthStatusHealthy HealthStatusCode = "Healthy" - // Assigned to resources that are suspended or paused. The typical example is a - // [suspended](https://kubernetes.io/docs/tasks/job/automated-tasks-with-cron-jobs/#suspend) CronJob. - HealthStatusSuspended HealthStatusCode = "Suspended" - HealthStatusPaused HealthStatusCode = "Paused" - // Degrade status is used if resource status indicates failure or resource could not reach healthy state - // within some timeout. - HealthStatusDegraded HealthStatusCode = "Degraded" - // Indicates that resource is missing in the cluster. 
- HealthStatusMissing HealthStatusCode = "Missing" + "github.com/pluralsh/deployment-operator/internal/metrics" + "github.com/pluralsh/deployment-operator/pkg/cache" + "github.com/pluralsh/deployment-operator/pkg/manifests" ) -// Represents resource health status -type HealthStatusCode string - -// GetResourceHealth returns the health of a k8s resource -func (s *ServiceReconciler) getResourceHealth(obj *unstructured.Unstructured) (health *HealthStatus, err error) { - if obj.GetDeletionTimestamp() != nil { - return &HealthStatus{ - Status: HealthStatusProgressing, - Message: "Pending deletion", - }, nil - } - - if healthCheck := s.GetHealthCheckFunc(obj.GroupVersionKind()); healthCheck != nil { - if health, err = healthCheck(obj); err != nil { - health = &HealthStatus{ - Status: HealthStatusUnknown, - Message: err.Error(), - } - } - } - return health, err - -} - -// GetHealthCheckFunc returns built-in health check function or nil if health check is not supported -func (s *ServiceReconciler) GetHealthCheckFunc(gvk schema.GroupVersionKind) func(obj *unstructured.Unstructured) (*HealthStatus, error) { - switch gvk.Group { - case "apps": - switch gvk.Kind { - case DeploymentKind: - return getDeploymentHealth - case StatefulSetKind: - return getStatefulSetHealth - case ReplicaSetKind: - return getReplicaSetHealth - case DaemonSetKind: - return getDaemonSetHealth - } - case "extensions": - if gvk.Kind == IngressKind { - return getIngressHealth - } - case "networking.k8s.io": - if gvk.Kind == IngressKind { - return getIngressHealth - } - case "": - switch gvk.Kind { - case ServiceKind: - return getServiceHealth - case PersistentVolumeClaimKind: - return getPVCHealth - case PodKind: - return getPodHealth - } - case "batch": - if gvk.Kind == JobKind { - return getJobHealth - } - case "flagger.app": - if gvk.Kind == CanaryKind { - return getCanaryHealth - } - case rollouts.Group: - if gvk.Kind == rollouts.RolloutKind { - return getArgoRolloutHealth - } - case "autoscaling": - if gvk.Kind == HorizontalPodAutoscalerKind { - return getHPAHealth - } - } - - if s.GetLuaScript() != "" { - return s.getLuaHealthConvert - } - - return getOtherHealth -} - -func (s *ServiceReconciler) toStatus(obj *unstructured.Unstructured) *console.ComponentState { - h, _ := s.getResourceHealth(obj) - if h == nil { - return nil - } - - if h.Status == HealthStatusDegraded { - return lo.ToPtr(console.ComponentStateFailed) - } - - if h.Status == HealthStatusHealthy { - return lo.ToPtr(console.ComponentStateRunning) - } - - if h.Status == HealthStatusPaused { - return lo.ToPtr(console.ComponentStatePaused) - } - - return lo.ToPtr(console.ComponentStatePending) -} - func (s *ServiceReconciler) UpdatePruneStatus(ctx context.Context, svc *console.GetServiceDeploymentForAgent_ServiceDeployment, ch <-chan event.Event, vcache map[manifests.GroupName]string) error { logger := log.FromContext(ctx) @@ -174,13 +57,28 @@ func (s *ServiceReconciler) UpdatePruneStatus(ctx context.Context, svc *console. 
return nil } -func (s *ServiceReconciler) UpdateApplyStatus(ctx context.Context, svc *console.GetServiceDeploymentForAgent_ServiceDeployment, ch <-chan event.Event, printStatus bool, vcache map[manifests.GroupName]string) error { +func (s *ServiceReconciler) UpdateApplyStatus( + ctx context.Context, + svc *console.GetServiceDeploymentForAgent_ServiceDeployment, + ch <-chan event.Event, + printStatus bool, + vcache map[manifests.GroupName]string, +) error { logger := log.FromContext(ctx) + start, err := metrics.FromContext[time.Time](ctx, metrics.ContextKeyTimeStart) + if err != nil { + klog.Fatalf("programmatic error! context does not have value for the key %s", metrics.ContextKeyTimeStart) + } + + metrics.Record().ServiceReconciliation( + svc.ID, + svc.Name, + metrics.WithServiceReconciliationStartedAt(start), + metrics.WithServiceReconciliationStage(metrics.ServiceReconciliationApplyStart), + ) var statsCollector stats.Stats - var err error statusCollector := newServiceComponentsStatusCollector(s, svc) - for e := range ch { statsCollector.Handle(e) switch e.Type { @@ -194,6 +92,10 @@ func (s *ServiceReconciler) UpdateApplyStatus(ctx context.Context, svc *console. statusCollector.updateApplyStatus(e.ApplyEvent.Identifier, e.ApplyEvent) gk := e.ApplyEvent.Identifier.GroupKind name := e.ApplyEvent.Identifier.Name + if e.ApplyEvent.Status == event.ApplySuccessful { + cache.SaveResourceSHA(e.ApplyEvent.Resource, cache.ApplySHA) + } + if e.ApplyEvent.Error != nil { msg := fmt.Sprintf("%s apply %s: %s\n", resourceIDToString(gk, name), strings.ToLower(e.ApplyEvent.Status.String()), e.ApplyEvent.Error.Error()) @@ -224,15 +126,28 @@ func (s *ServiceReconciler) UpdateApplyStatus(ctx context.Context, svc *console. } } + metrics.Record().ServiceReconciliation( + svc.ID, + svc.Name, + metrics.WithServiceReconciliationStartedAt(start), + metrics.WithServiceReconciliationStage(metrics.ServiceReconciliationApplyFinish), + ) + if err := FormatSummary(ctx, svc.Namespace, svc.Name, statsCollector); err != nil { return err } - components := statusCollector.componentsAttributes(vcache) if err := s.UpdateStatus(svc.ID, components, errorAttributes("sync", err)); err != nil { logger.Error(err, "Failed to update service status, ignoring for now") } + metrics.Record().ServiceReconciliation( + svc.ID, + svc.Name, + metrics.WithServiceReconciliationStartedAt(start), + metrics.WithServiceReconciliationStage(metrics.ServiceReconciliationUpdateStatusFinish), + ) + return nil } diff --git a/pkg/controller/service/status_collector.go b/pkg/controller/service/status_collector.go index 1d217e4b..400b1f2d 100644 --- a/pkg/controller/service/status_collector.go +++ b/pkg/controller/service/status_collector.go @@ -4,12 +4,16 @@ import ( "context" console "github.com/pluralsh/console-client-go" + "github.com/pluralsh/polly/containers" "github.com/samber/lo" + "golang.org/x/exp/maps" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "sigs.k8s.io/cli-utils/pkg/apply/event" - "sigs.k8s.io/cli-utils/pkg/kstatus/status" "sigs.k8s.io/cli-utils/pkg/object" + "github.com/pluralsh/deployment-operator/pkg/cache" + "github.com/pluralsh/deployment-operator/pkg/common" + "github.com/pluralsh/deployment-operator/pkg/log" "github.com/pluralsh/deployment-operator/pkg/manifests" ) @@ -84,7 +88,7 @@ func (sc *serviceComponentsStatusCollector) fromApplyResult(e event.ApplyEvent, Name: e.Resource.GetName(), Version: version, Synced: live == desired, - State: sc.reconciler.toStatus(e.Resource), + State: common.ToStatus(e.Resource), Content: 
&console.ComponentContentAttributes{ Desired: &desired, Live: &live, @@ -92,40 +96,6 @@ func (sc *serviceComponentsStatusCollector) fromApplyResult(e event.ApplyEvent, } } -func (sc *serviceComponentsStatusCollector) fromSyncResult(e event.StatusEvent, vcache map[manifests.GroupName]string) *console.ComponentAttributes { - if e.Resource == nil { - return nil - } - gvk := e.Resource.GroupVersionKind() - gname := manifests.GroupName{ - Group: gvk.Group, - Kind: gvk.Kind, - Name: e.Resource.GetName(), - } - - version := gvk.Version - if v, ok := vcache[gname]; ok { - version = v - } - - synced := e.PollResourceInfo.Status == status.CurrentStatus - - if e.PollResourceInfo.Status == status.UnknownStatus { - if sc.reconciler.toStatus(e.Resource) != nil { - synced = *sc.reconciler.toStatus(e.Resource) == console.ComponentStateRunning - } - } - return &console.ComponentAttributes{ - Group: gvk.Group, - Kind: gvk.Kind, - Namespace: e.Resource.GetNamespace(), - Name: e.Resource.GetName(), - Version: version, - Synced: synced, - State: sc.reconciler.toStatus(e.Resource), - } -} - func (sc *serviceComponentsStatusCollector) componentsAttributes(vcache map[manifests.GroupName]string) []*console.ComponentAttributes { components := make([]*console.ComponentAttributes, 0, len(sc.latestStatus)) @@ -139,10 +109,31 @@ func (sc *serviceComponentsStatusCollector) componentsAttributes(vcache map[mani } for _, v := range sc.latestStatus { - if attrs := sc.fromSyncResult(v, vcache); attrs != nil { + if attrs := common.StatusEventToComponentAttributes(v, vcache); attrs != nil { components = append(components, attrs) } } + applyKeys := maps.Keys(sc.applyStatus) + statusKeys := maps.Keys(sc.latestStatus) + diff := containers.ToSet(applyKeys).Difference(containers.ToSet(statusKeys)) + for key := range diff { + e, err := cache.GetResourceCache().GetCacheStatus(key) + if err != nil { + log.Logger.Error(err, "failed to get cache status") + continue + } + gname := manifests.GroupName{ + Group: e.Group, + Kind: e.Kind, + Name: e.Name, + } + + if v, ok := vcache[gname]; ok { + e.Version = v + } + components = append(components, e) + } + return components } diff --git a/pkg/manifests/cache.go b/pkg/manifests/cache.go index 3b3f5ae3..25dd3947 100644 --- a/pkg/manifests/cache.go +++ b/pkg/manifests/cache.go @@ -20,30 +20,33 @@ var ( type cacheLine struct { dir string + sha string created time.Time } type ManifestCache struct { - cache cmap.ConcurrentMap[string, *cacheLine] - token string - expiry time.Duration + cache cmap.ConcurrentMap[string, *cacheLine] + token string + consoleURL string + expiry time.Duration } -func NewCache(expiry time.Duration, token string) *ManifestCache { +func NewCache(expiry time.Duration, token, consoleURL string) *ManifestCache { return &ManifestCache{ - cache: cmap.New[*cacheLine](), - token: token, - expiry: expiry, + cache: cmap.New[*cacheLine](), + token: token, + expiry: expiry, + consoleURL: consoleURL, } } func (c *ManifestCache) Fetch(utilFactory util.Factory, svc *console.GetServiceDeploymentForAgent_ServiceDeployment) ([]*unstructured.Unstructured, error) { + sha, err := fetchSha(c.consoleURL, c.token, svc.ID) if line, ok := c.cache.Get(svc.ID); ok { - if line.live(c.expiry) { + if err == nil && line.live(c.expiry) && line.sha == sha { return template.Render(line.dir, svc, utilFactory) - } else { - line.wipe() } + line.wipe() } if svc.Tarball == nil { @@ -57,7 +60,7 @@ func (c *ManifestCache) Fetch(utilFactory util.Factory, svc *console.GetServiceD } log.V(1).Info("using cache dir", 
"dir", dir) - c.cache.Set(svc.ID, &cacheLine{dir: dir, created: time.Now()}) + c.cache.Set(svc.ID, &cacheLine{dir: dir, sha: sha, created: time.Now()}) return template.Render(dir, svc, utilFactory) } diff --git a/pkg/manifests/tarball.go b/pkg/manifests/tarball.go index 6405a147..89085026 100644 --- a/pkg/manifests/tarball.go +++ b/pkg/manifests/tarball.go @@ -2,8 +2,11 @@ package manifests import ( "fmt" + "io" "net/http" + "net/url" "os" + "strings" "time" "github.com/pluralsh/deployment-operator/pkg/errors" @@ -14,41 +17,70 @@ var ( client = &http.Client{Timeout: 15 * time.Second} ) -func fetch(url, token string) (string, error) { - dir, err := os.MkdirTemp("", "manifests") +func get(url, token string) (string, error) { + req, err := http.NewRequest(http.MethodGet, url, nil) if err != nil { - return dir, err - } - - req, err := http.NewRequest("GET", url, nil) - if err != nil { - return dir, err + return "", err } req.Header.Add("Authorization", "Token "+token) resp, err := client.Do(req) if err != nil { - return dir, err + return "", err } defer resp.Body.Close() if resp.StatusCode != 200 { if resp.StatusCode == 403 { - return dir, errors.ErrUnauthenticated + return "", errors.ErrUnauthenticated } if resp.StatusCode == 402 { - return dir, errors.ErrTransientManifest + return "", errors.ErrTransientManifest } - return dir, fmt.Errorf("could not fetch manifest, error code %d", resp.StatusCode) + return "", fmt.Errorf("could not fetch manifest, error code %d", resp.StatusCode) + } + body, err := io.ReadAll(resp.Body) + if err != nil { + return "", err + } + return string(body), nil +} + +func fetchSha(consoleURL, token, serviceID string) (string, error) { + url, err := sanitizeURL(consoleURL) + if err != nil { + return "", err + } + url = fmt.Sprintf("%s/ext/v1/digests?id=%s", url, serviceID) + return get(url, token) +} + +func fetch(url, token string) (string, error) { + dir, err := os.MkdirTemp("", "manifests") + if err != nil { + return "", err + } + + resp, err := get(url, token) + if err != nil { + return "", err } log.V(1).Info("finished request to", "url", url) - if err := fs.Untar(dir, resp.Body); err != nil { + if err := fs.Untar(dir, strings.NewReader(resp)); err != nil { return dir, err } return dir, nil } + +func sanitizeURL(consoleURL string) (string, error) { + u, err := url.Parse(consoleURL) + if err != nil { + return "", err + } + return fmt.Sprintf("%s://%s", u.Scheme, u.Host), nil +} diff --git a/pkg/manifests/template/helm.go b/pkg/manifests/template/helm.go index ee12f466..055ef03f 100644 --- a/pkg/manifests/template/helm.go +++ b/pkg/manifests/template/helm.go @@ -32,6 +32,8 @@ import ( "k8s.io/client-go/util/homedir" "k8s.io/kubectl/pkg/cmd/util" "sigs.k8s.io/yaml" + + "github.com/pluralsh/deployment-operator/cmd/agent/args" ) const ( @@ -45,8 +47,6 @@ const ( ) func init() { - EnableHelmDependencyUpdate = false - DisableHelmTemplateDryRunServer = false // setup helm cache directory. 
dir, err := os.MkdirTemp("", "repositories") if err != nil { @@ -59,8 +59,6 @@ func init() { } var settings = cli.New() -var EnableHelmDependencyUpdate bool -var DisableHelmTemplateDryRunServer bool var APIVersions cmap.ConcurrentMap[string, bool] func debug(format string, v ...interface{}) { @@ -90,8 +88,8 @@ func (h *helm) Render(svc *console.GetServiceDeploymentForAgent_ServiceDeploymen return nil, err } - log.Println("render helm templates:", "enable dependency update=", EnableHelmDependencyUpdate, "dependencies=", len(c.Dependencies)) - if len(c.Dependencies) > 0 && EnableHelmDependencyUpdate { + log.Println("render helm templates:", "enable dependency update=", args.EnableHelmDependencyUpdate(), "dependencies=", len(c.Dependencies)) + if len(c.Dependencies) > 0 && args.EnableHelmDependencyUpdate() { if err := h.dependencyUpdate(config, c.Dependencies); err != nil { return nil, err } @@ -216,7 +214,7 @@ func (h *helm) templateHelm(conf *action.Configuration, release, namespace strin client := action.NewInstall(conf) client.DryRun = true - if !DisableHelmTemplateDryRunServer { + if !args.DisableHelmTemplateDryRunServer() { client.DryRunOption = "server" } client.ReleaseName = release
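
// Illustrative sketch, not part of the patch: the apply side of the deduplication
// loop. UpdateApplyStatus (above) records an ApplySHA for every successfully applied
// object, while the resource cache watch records the ServerSHA; on the next reconcile
// the CacheFilter compares those against the manifest SHA and skips unchanged objects.
// The function name and channel wiring are hypothetical; the event types come from
// sigs.k8s.io/cli-utils/pkg/apply/event.
func exampleRecordApplySHAs(ch <-chan event.Event) {
	for e := range ch {
		if e.Type != event.ApplyType || e.ApplyEvent.Error != nil {
			continue
		}
		if e.ApplyEvent.Status == event.ApplySuccessful {
			cache.SaveResourceSHA(e.ApplyEvent.Resource, cache.ApplySHA)
		}
	}
}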