Skip to content

Commit

Permalink
feat: Agent server-side apply (SSA) deduplication cache (#227)
Browse files Browse the repository at this point in the history
* add cache

* draft resource watch

* use gvr cache

* add event handler

* add field selector

* add initial status reconciler draft

* add agent labels

* initial server cache update

* finish initial server cache implementation

* update status watcher

* register controller

* change flag

* fix build

* fixes

* add sha logic

* update resource watcher

* fix resource key parsing

* add apply check

* update sha object

* add main apply check

* finish main apply check

* update cache

* refactor

* move const outside func

* extract health status

* set health status

* refactor health status

* refactor resource cache

* refactor after merge

* check deleted resource

* add health cache

* fix lint

* add resource key alias

* use resource key alias

* move getLuaHealthConvert

* handle contexts better

* fix recursive call

* use resource key alias

* post merge fix

* format

* add cache filter

* add custom status watcher

* close watcher

* add filter

* skip timeout

* fix watcher cleanup on close error

* revert skip timeout

* fixes

* refactor

* use custom status watcher for cache

* revert last commit

* use a custom status watcher implementation for resource cache and applier

* disable health cache

* fix linter

* remove health cache

* don't throw an error when the inventory config map is deleted

* clear cache

* add logs

* optimize status watcher to use unique watches when running Watch multiple times

* bump go version

* fix linter

* bump go version

* bump go CI

* make tools before unit tests

* bump go CI

* add debug

* -vet=off

* disable controllers

* bump go in dockerfile

* fix dockerfile

* use test suite

* bump ginkgo version

* disable client in test suite

* disable client in test suite

* disable controller tests

* explicitly initialize cache in main and add more prometheus metrics

* fix unit tests

* fix linter

* change filter order

* improve RequiresApply

* improve filter

* fix watcher

* remove CRD filter

* do not remove whole SHA entry in cache

* revert apply check

* fix apply check

* fix sha object

* remove resource cache expiration check poller

* fix cache cleanup

* close dynamic client watch

* add error log for health status

* fix linter

* refactor args handling and allow reconciling only specific services

* support golang flag

* fix exposing agent metrics

* refactor our custom watcher implementation

* create status cache

* move lua

* fix agent build

* fix unit tests

* fix lint

* fix status watcher init in the applier

* fix status cache

* refactor statuses

* use ComponentAttributes

* add retry watcher wrapper and use it in status watcher

* use version cache

* fix linter

* fix goroutine leak and add profiler arg

* fix lint

* check if there is only one CustomHealth object

* cleanup, add docs and refactor code

* cleanup

* cleanup metrics when service is deleted

* allow disabling resource cache and add metrics to measure detailed reconcile execution times

* allow providing extra args to operator

* disable resource cache in chart

* allow disabling resource cache via operator secrets

* cache manifests

---------

Co-authored-by: Marcin Maciaszczyk <marcin9yk@icloud.com>
Co-authored-by: Sebastian Florek <sebastian@plural.sh>
  • Loading branch information
3 people authored Jul 12, 2024
1 parent 3ff708f commit 89fb29d
Show file tree
Hide file tree
Showing 58 changed files with 3,550 additions and 450 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
check-latest: true
Expand All @@ -48,17 +48,17 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
check-latest: true
- run: PATH=$PATH:$GOPATH/bin make test
- run: PATH=$PATH:$GOPATH/bin make -d test
lint:
name: Lint
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
check-latest: true
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/publish.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
- uses: actions/setup-go@v5
with:
go-version-file: go.mod
check-latest: true
Expand Down
6 changes: 3 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM golang:1.21.2-alpine3.17 as builder
FROM golang:1.22.4-alpine3.20 as builder

ARG TARGETARCH

Expand All @@ -17,9 +17,9 @@ COPY /api api/
COPY /internal internal/

# Build
RUN CGO_ENABLED=0 GOOS=linux GOARCH=${TARGETARCH} GO111MODULE=on go build -a -o deployment-agent cmd/agent/**
RUN CGO_ENABLED=0 GOOS=linux GOARCH=${TARGETARCH} GO111MODULE=on go build -a -o deployment-agent cmd/agent/*.go

FROM alpine:3.18
FROM alpine:3.20
WORKDIR /workspace

COPY --from=builder /workspace/deployment-agent .
Expand Down
17 changes: 14 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -59,14 +59,25 @@ genmock: mockery ## generates mocks before running tests
##@ Run

.PHONY: agent-run
agent-run: ## run agent
go run cmd/agent/**
agent-run: agent ## run agent
OPERATOR_NAMESPACE=plrl-deploy-operator \
go run cmd/agent/*.go \
--console-url=${PLURAL_CONSOLE_URL}/ext/gql \
--enable-helm-dependency-update=false \
--disable-helm-dry-run-server=false \
--cluster-id=${PLURAL_CLUSTER_ID} \
--local \
--refresh-interval=30s \
--resource-cache-ttl=60s \
--max-concurrent-reconciles=20 \
--v=1 \
--deploy-token=${PLURAL_DEPLOY_TOKEN}

##@ Build

.PHONY: agent
agent: ## build agent
go build -o bin/deployment-agent cmd/agent/**
go build -o bin/deployment-agent cmd/agent/*.go

.PHONY: harness
harness: ## build stack run harness
Expand Down
1 change: 1 addition & 0 deletions charts/deployment-operator/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ spec:
- -processing-timeout={{ .Values.args.processingTimeout }}
- -enable-helm-dependency-update={{ .Values.args.enableHelmDependencyUpdate }}
- -disable-helm-dry-run-server={{ .Values.args.disableHelmTemplateDryRunServer }}
- -disable-resource-cache={{ .Values.args.disableResourceCache }}
env:
- name: IMAGE_TAG
value: {{ $tag | quote }}
Expand Down
1 change: 1 addition & 0 deletions charts/deployment-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ args:
processingTimeout: 5m
enableHelmDependencyUpdate: false
disableHelmTemplateDryRunServer: false
disableResourceCache: false

image:
repository: ghcr.io/pluralsh/deployment-operator
Expand Down
7 changes: 6 additions & 1 deletion charts/deployment-operator/values.yaml.liquid
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,9 @@ args:
{% if configuration.disableHelmTemplateDryRunServer %}
args:
disableHelmTemplateDryRunServer: {{ configuration.disableHelmTemplateDryRunServer }}
{% endif %}
{% endif %}

{% if configuration.disableResourceCache %}
args:
disableResourceCache: {{ configuration.disableResourceCache }}
{% endif %}
36 changes: 17 additions & 19 deletions cmd/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"os"
"time"

"github.com/pluralsh/deployment-operator/cmd/agent/args"
"github.com/pluralsh/deployment-operator/internal/utils"
"github.com/pluralsh/deployment-operator/pkg/controller/stacks"

Expand All @@ -21,26 +22,23 @@ import (

const pollInterval = time.Second * 30

func runAgent(opt *options, config *rest.Config, ctx context.Context, k8sClient ctrclient.Client) (*controller.ControllerManager, *service.ServiceReconciler, *pipelinegates.GateReconciler) {
r, err := time.ParseDuration(opt.refreshInterval)
if err != nil {
setupLog.Error("unable to get refresh interval", "error", err)
os.Exit(1)
}

t, err := time.ParseDuration(opt.processingTimeout)
if err != nil {
setupLog.Errorw("unable to get processing timeout", "error", err)
os.Exit(1)
}

mgr, err := controller.NewControllerManager(ctx, opt.maxConcurrentReconciles, t, r, lo.ToPtr(true), opt.consoleUrl, opt.deployToken, opt.clusterId)
func runAgent(config *rest.Config, ctx context.Context, k8sClient ctrclient.Client) (*controller.ControllerManager, *service.ServiceReconciler, *pipelinegates.GateReconciler) {
mgr, err := controller.NewControllerManager(
ctx,
args.MaxConcurrentReconciles(),
args.ProcessingTimeout(),
args.RefreshInterval(),
lo.ToPtr(true),
args.ConsoleUrl(),
args.DeployToken(),
args.ClusterId(),
)
if err != nil {
setupLog.Errorw("unable to create manager", "error", err)
os.Exit(1)
}

sr, err := service.NewServiceReconciler(ctx, mgr.GetClient(), config, r, opt.restoreNamespace)
sr, err := service.NewServiceReconciler(ctx, mgr.GetClient(), config, args.RefreshInterval(), args.ManifestCacheTTL(), args.RestoreNamespace(), args.ConsoleUrl())
if err != nil {
setupLog.Errorw("unable to create service reconciler", "error", err)
os.Exit(1)
Expand All @@ -50,7 +48,7 @@ func runAgent(opt *options, config *rest.Config, ctx context.Context, k8sClient
Do: sr,
Queue: sr.SvcQueue,
})
gr, err := pipelinegates.NewGateReconciler(mgr.GetClient(), k8sClient, config, r, pollInterval, opt.clusterId)
gr, err := pipelinegates.NewGateReconciler(mgr.GetClient(), k8sClient, config, args.RefreshInterval(), pollInterval, args.ClusterId())
if err != nil {
setupLog.Errorw("unable to create gate reconciler", "error", err)
os.Exit(1)
Expand All @@ -61,14 +59,14 @@ func runAgent(opt *options, config *rest.Config, ctx context.Context, k8sClient
Queue: gr.GateQueue,
})

rr := restore.NewRestoreReconciler(mgr.GetClient(), k8sClient, r, opt.restoreNamespace)
rr := restore.NewRestoreReconciler(mgr.GetClient(), k8sClient, args.RefreshInterval(), args.RestoreNamespace())
mgr.AddController(&controller.Controller{
Name: "Restore Controller",
Do: rr,
Queue: rr.RestoreQueue,
})

ns := namespaces.NewNamespaceReconciler(mgr.GetClient(), k8sClient, r)
ns := namespaces.NewNamespaceReconciler(mgr.GetClient(), k8sClient, args.RefreshInterval())
mgr.AddController(&controller.Controller{
Name: "Managed Namespace Controller",
Do: ns,
Expand All @@ -81,7 +79,7 @@ func runAgent(opt *options, config *rest.Config, ctx context.Context, k8sClient
os.Exit(1)
}

s := stacks.NewStackReconciler(mgr.GetClient(), k8sClient, r, pollInterval, namespace, opt.consoleUrl, opt.deployToken)
s := stacks.NewStackReconciler(mgr.GetClient(), k8sClient, args.RefreshInterval(), pollInterval, namespace, args.ConsoleUrl(), args.DeployToken())
mgr.AddController(&controller.Controller{
Name: "Stack Controller",
Do: s,
Expand Down
Loading

0 comments on commit 89fb29d

Please sign in to comment.