Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Target allocator #261

Merged
merged 20 commits into from
Nov 26, 2024
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 62 additions & 3 deletions .github/workflows/build-and-upload-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ env:
# Use terraform assume role for uploading to ecr
AWS_ASSUME_ROLE: ${{ secrets.TERRAFORM_AWS_ASSUME_ROLE }}
ECR_OPERATOR_STAGING_REPO: ${{ vars.ECR_OPERATOR_STAGING_REPO }}
ECR_OPERATOR_RELEASE_IMAGE: ${{ secrets.ECR_OPERATOR_RELEASE_IMAGE }}
ECR_TARGET_ALLOCATOR_STAGING_REPO: ${{ vars.ECR_TARGET_ALLOCATOR_STAGING_REPO}}
ECR_OPERATOR_RELEASE_IMAGE: ${{ vars.ECR_TARGET_ALLOCATOR_TEST_OPERATOR_REPO}}
ECR_TARGET_ALLOCATOR_RELEASE_REPO: ${{ vars.ECR_TARGET_ALLOCATOR_RELEASE_REPO}}

on:
workflow_dispatch:
Expand Down Expand Up @@ -81,9 +83,60 @@ jobs:
tags: ${{ env.ECR_OPERATOR_STAGING_REPO }}:${{ inputs.tag }}
platforms: linux/amd64, linux/arm64

MakeTABinary:
name: 'MakeTargetAllocatorImage'
runs-on: ubuntu-latest
permissions:
id-token: write
contents: read
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0

- name: Set up Go 1.x
uses: actions/setup-go@v4
with:
go-version: '>1.22'
cache: true

- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v2
with:
role-to-assume: ${{ env.AWS_ASSUME_ROLE }}
aws-region: us-west-2

- name: Login to ECR
if: steps.cached_binaries.outputs.cache-hit == false
id: login-ecr
uses: aws-actions/amazon-ecr-login@v1

- name: Set up Docker Buildx
if: steps.cached_binaries.outputs.cache-hit == false
uses: docker/setup-buildx-action@v1

- name: Set up QEMU
if: steps.cached_binaries.outputs.cache-hit == false
uses: docker/setup-qemu-action@v1

- name: Build Binaries
run: |
go mod download
export GOARCH=arm64 && make targetallocator
export GOARCH=amd64 && make targetallocator
- name: Build Cloudwatch Agent Target Allocator Image and push to ECR
uses: docker/build-push-action@v4
if: steps.cached_binaries.outputs.cache-hit == false
with:
file: ./cmd/amazon-cloudwatch-agent-target-allocator/Dockerfile
context: ./cmd/amazon-cloudwatch-agent-target-allocator
push: true
tags: ${{ env.ECR_TARGET_ALLOCATOR_STAGING_REPO }}:${{ inputs.tag }}
platforms: linux/amd64, linux/arm64

e2e-test:
name: "Application Signals E2E Test"
needs: MakeBinary
needs: [MakeBinary,MakeTABinary]
okankoAMZ marked this conversation as resolved.
Show resolved Hide resolved
uses: ./.github/workflows/application-signals-e2e-test.yml
secrets: inherit
permissions:
Expand Down Expand Up @@ -119,4 +172,10 @@ jobs:
run: |
docker buildx imagetools create \
-t ${{ env.ECR_OPERATOR_RELEASE_IMAGE }} \
${{ env.ECR_OPERATOR_STAGING_REPO }}:${{ inputs.tag }}
${{ env.ECR_OPERATOR_STAGING_REPO }}:${{ inputs.tag }}

- name: Push image to TA release ECR
run: |
docker buildx imagetools create \
-t ${{ env.ECR_TARGET_ALLOCATOR_RELEASE_REPO}} \
${{ env.ECR_TARGET_ALLOCATOR_STAGING_REPO }}:${{ inputs.tag }}
7 changes: 4 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Build the manager binary
FROM golang:1.20 as builder
FROM golang:1.21 as builder

# set goproxy=direct
ENV GOPROXY direct
Expand Down Expand Up @@ -30,9 +30,10 @@ ARG AUTO_INSTRUMENTATION_DOTNET_VERSION
ARG AUTO_INSTRUMENTATION_NODEJS_VERSION
ARG DCMG_EXPORTER_VERSION
ARG NEURON_MONITOR_VERSION
ARG TARGET_ALLOCATOR_VERSION

# Build
RUN CGO_ENABLED=0 GOOS=linux GO111MODULE=on go build -ldflags="-X ${VERSION_PKG}.version=${VERSION} -X ${VERSION_PKG}.buildDate=${VERSION_DATE} -X ${VERSION_PKG}.agent=${AGENT_VERSION} -X ${VERSION_PKG}.autoInstrumentationJava=${AUTO_INSTRUMENTATION_JAVA_VERSION} -X ${VERSION_PKG}.autoInstrumentationPython=${AUTO_INSTRUMENTATION_PYTHON_VERSION} -X ${VERSION_PKG}.autoInstrumentationDotNet=${AUTO_INSTRUMENTATION_DOTNET_VERSION} -X ${VERSION_PKG}.autoInstrumentationNodeJS=${AUTO_INSTRUMENTATION_NODEJS_VERSION} -X ${VERSION_PKG}.dcgmExporter=${DCMG_EXPORTER_VERSION} -X ${VERSION_PKG}.neuronMonitor=${NEURON_MONITOR_VERSION}" -a -o manager main.go
RUN CGO_ENABLED=0 GOOS=linux GO111MODULE=on go build -ldflags="-X ${VERSION_PKG}.version=${VERSION} -X ${VERSION_PKG}.buildDate=${VERSION_DATE} -X ${VERSION_PKG}.agent=${AGENT_VERSION} -X ${VERSION_PKG}.autoInstrumentationJava=${AUTO_INSTRUMENTATION_JAVA_VERSION} -X ${VERSION_PKG}.autoInstrumentationPython=${AUTO_INSTRUMENTATION_PYTHON_VERSION} -X ${VERSION_PKG}.autoInstrumentationDotNet=${AUTO_INSTRUMENTATION_DOTNET_VERSION} -X ${VERSION_PKG}.autoInstrumentationNodeJS=${AUTO_INSTRUMENTATION_NODEJS_VERSION} -X ${VERSION_PKG}.dcgmExporter=${DCMG_EXPORTER_VERSION} -X ${VERSION_PKG}.neuronMonitor=${NEURON_MONITOR_VERSION} -X ${VERSION_PKG}.targetAllocator=${TARGET_ALLOCATOR_VERSION}" -a -o manager main.go

# Use distroless as minimal base image to package the manager binary
# Refer to https://github.com/GoogleContainerTools/distroless for more details
Expand All @@ -41,4 +42,4 @@ WORKDIR /
COPY --from=builder /workspace/manager .
USER 65532:65532

ENTRYPOINT ["/manager"]
ENTRYPOINT ["/manager"]
23 changes: 22 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,17 @@ AUTO_INSTRUMENTATION_DOTNET_VERSION ?= "$(shell grep -v '\#' versions.txt | grep
AUTO_INSTRUMENTATION_NODEJS_VERSION ?= "$(shell grep -v '\#' versions.txt | grep aws-otel-nodejs-instrumentation | awk -F= '{print $$2}')"
DCGM_EXPORTER_VERSION ?= "$(shell grep -v '\#' versions.txt | grep dcgm-exporter | awk -F= '{print $$2}')"
NEURON_MONITOR_VERSION ?= "$(shell grep -v '\#' versions.txt | grep neuron-monitor | awk -F= '{print $$2}')"
TARGET_ALLOCATOR_VERSION ?= "$(shell grep -v '\#' versions.txt | grep target-allocator | awk -F= '{print $$2}')"

# Image URL to use all building/pushing image targets
IMG_PREFIX ?= aws
IMG_REPO ?= cloudwatch-agent-operator
IMG ?= ${IMG_PREFIX}/${IMG_REPO}:${VERSION}
ARCH ?= $(shell go env GOARCH)

TARGET_ALLOCATOR_IMG_REPO ?= target-allocator
TARGET_ALLOCATOR_IMG ?= ${IMG_PREFIX}/${TARGET_ALLOCATOR_IMG_REPO}:${TARGET_ALLOCATOR_VERSION}

# Options for 'bundle-build'
ifneq ($(origin CHANNELS), undefined)
BUNDLE_CHANNELS := --channels=$(CHANNELS)
Expand Down Expand Up @@ -96,6 +100,10 @@ test: generate fmt vet envtest
.PHONY: manager
manager: generate fmt vet
go build -o bin/manager main.go
# Build target allocator binary
.PHONY: targetallocator
targetallocator:
cd cmd/amazon-cloudwatch-agent-target-allocator && CGO_ENABLED=0 GOOS=$(GOOS) GOARCH=$(ARCH) go build -installsuffix cgo -o bin/targetallocator_${ARCH} -ldflags "${LDFLAGS}" .

# Run against the configured Kubernetes cluster in ~/.kube/config
.PHONY: run
Expand Down Expand Up @@ -155,13 +163,26 @@ generate: controller-gen api-docs
# buildx is used to ensure same results for arm based systems (m1/2 chips)
.PHONY: container
container:
docker buildx build --load --platform linux/${ARCH} -t ${IMG} --build-arg VERSION_PKG=${VERSION_PKG} --build-arg VERSION=${VERSION} --build-arg VERSION_DATE=${VERSION_DATE} --build-arg AGENT_VERSION=${AGENT_VERSION} --build-arg AUTO_INSTRUMENTATION_JAVA_VERSION=${AUTO_INSTRUMENTATION_JAVA_VERSION} --build-arg AUTO_INSTRUMENTATION_PYTHON_VERSION=${AUTO_INSTRUMENTATION_PYTHON_VERSION} --build-arg AUTO_INSTRUMENTATION_DOTNET_VERSION=${AUTO_INSTRUMENTATION_DOTNET_VERSION} --build-arg AUTO_INSTRUMENTATION_NODEJS_VERSION=${AUTO_INSTRUMENTATION_NODEJS_VERSION} --build-arg DCGM_EXPORTER_VERSION=${DCGM_EXPORTER_VERSION} --build-arg NEURON_MONITOR_VERSION=${NEURON_MONITOR_VERSION} .
docker buildx build --load --platform linux/${ARCH} -t ${IMG} --build-arg VERSION_PKG=${VERSION_PKG} --build-arg VERSION=${VERSION} --build-arg VERSION_DATE=${VERSION_DATE} --build-arg AGENT_VERSION=${AGENT_VERSION} --build-arg AUTO_INSTRUMENTATION_JAVA_VERSION=${AUTO_INSTRUMENTATION_JAVA_VERSION} --build-arg AUTO_INSTRUMENTATION_PYTHON_VERSION=${AUTO_INSTRUMENTATION_PYTHON_VERSION} --build-arg AUTO_INSTRUMENTATION_DOTNET_VERSION=${AUTO_INSTRUMENTATION_DOTNET_VERSION} --build-arg AUTO_INSTRUMENTATION_NODEJS_VERSION=${AUTO_INSTRUMENTATION_NODEJS_VERSION} --build-arg DCGM_EXPORTER_VERSION=${DCGM_EXPORTER_VERSION} --build-arg NEURON_MONITOR_VERSION=${NEURON_MONITOR_VERSION} --build-arg TARGET_ALLOCATOR_VERSION=${TARGET_ALLOCATOR_VERSION} .

# Push the container image, used only for local dev purposes
.PHONY: container-push
container-push:
docker push ${IMG}

.PHONY: container-target-allocator-push
container-target-allocator-push:
docker push ${TARGET_ALLOCATOR_IMG}

.PHONY: container-target-allocator
container-target-allocator: GOOS = linux
container-target-allocator: targetallocator
docker buildx build --load --platform linux/${ARCH} -t ${TARGET_ALLOCATOR_IMG} cmd/amazon-cloudwatch-agent-target-allocator

.PHONY: ta-build-and-push
ta-build-and-push: container-target-allocator
ta-build-and-push: container-target-allocator-push

.PHONY: kustomize
kustomize: ## Download kustomize locally if necessary.
$(call go-get-tool,$(KUSTOMIZE),sigs.k8s.io/kustomize/kustomize/v5,$(KUSTOMIZE_VERSION))
Expand Down
15 changes: 15 additions & 0 deletions apis/v1alpha1/allocation_strategy.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

package v1alpha1

type (
// AmazonCloudWatchAgentTargetAllocatorAllocationStrategy represent which strategy to distribute target to each collector
// +kubebuilder:validation:Enum=consistent-hashing
AmazonCloudWatchAgentTargetAllocatorAllocationStrategy string
)

const (
// AmazonCloudWatchAgentTargetAllocatorAllocationStrategyConsistentHashing targets will be consistently added to collectors, which allows a high-availability setup.
AmazonCloudWatchAgentTargetAllocatorAllocationStrategyConsistentHashing AmazonCloudWatchAgentTargetAllocatorAllocationStrategy = "consistent-hashing"
)
87 changes: 87 additions & 0 deletions apis/v1alpha1/amazoncloudwatchagent_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,9 @@ type AmazonCloudWatchAgentSpec struct {
// Collector and Target Allocator pods.
// +optional
PodAnnotations map[string]string `json:"podAnnotations,omitempty"`
// TargetAllocator indicates a value which determines whether to spawn a target allocation resource or not.
// +optional
TargetAllocator AmazonCloudWatchAgentTargetAllocator `json:"targetAllocator,omitempty"`
// Mode represents how the collector should be deployed (deployment, daemonset, statefulset or sidecar)
// +optional
Mode Mode `json:"mode,omitempty"`
Expand All @@ -164,6 +167,9 @@ type AmazonCloudWatchAgentSpec struct {
// ImagePullPolicy indicates the pull policy to be used for retrieving the container image (Always, Never, IfNotPresent)
// +optional
ImagePullPolicy v1.PullPolicy `json:"imagePullPolicy,omitempty"`
// Prometheus is the raw YAML to be used as the collector's prometheus configuration.
// +optional
Prometheus PrometheusConfig `json:"prometheus,omitempty"`
// Config is the raw JSON to be used as the collector's configuration. Refer to the OpenTelemetry Collector documentation for details.
// +required
Config string `json:"config,omitempty"`
Expand Down Expand Up @@ -276,6 +282,87 @@ type AmazonCloudWatchAgentSpec struct {
UpdateStrategy appsv1.DaemonSetUpdateStrategy `json:"updateStrategy,omitempty"`
}

// AmazonCloudWatchAgentTargetAllocator defines the configurations for the Prometheus target allocator.
type AmazonCloudWatchAgentTargetAllocator struct {
// Replicas is the number of pod instances for the underlying TargetAllocator. This should only be set to a value
// other than 1 if a strategy that allows for high availability is chosen. Currently, the only allocation strategy
// that can be run in a high availability mode is consistent-hashing.
// +optional
Replicas *int32 `json:"replicas,omitempty"`
// NodeSelector to schedule OpenTelemetry TargetAllocator pods.
// +optional
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
// Resources to set on the OpenTelemetryTargetAllocator containers.
// +optional
Resources v1.ResourceRequirements `json:"resources,omitempty"`
// AllocationStrategy determines which strategy the target allocator should use for allocation.
// The current option is consistent-hashing.
// +optional
AllocationStrategy AmazonCloudWatchAgentTargetAllocatorAllocationStrategy `json:"allocationStrategy,omitempty"`
// FilterStrategy determines how to filter targets before allocating them among the collectors.
// The only current option is relabel-config (drops targets based on prom relabel_config).
// Filtering is disabled by default.
// +optional
FilterStrategy string `json:"filterStrategy,omitempty"`
// ServiceAccount indicates the name of an existing service account to use with this instance. When set,
// the operator will not automatically create a ServiceAccount for the TargetAllocator.
// +optional
ServiceAccount string `json:"serviceAccount,omitempty"`
// Image indicates the container image to use for the OpenTelemetry TargetAllocator.
// +optional
Image string `json:"image,omitempty"`
// Enabled indicates whether to use a target allocation mechanism for Prometheus targets or not.
// +optional
Enabled bool `json:"enabled,omitempty"`
// If specified, indicates the pod's scheduling constraints
// +optional
Affinity *v1.Affinity `json:"affinity,omitempty"`
// PrometheusCR defines the configuration for the retrieval of PrometheusOperator CRDs ( servicemonitor.monitoring.coreos.com/v1 and podmonitor.monitoring.coreos.com/v1 ) retrieval.
// All CR instances which the ServiceAccount has access to will be retrieved. This includes other namespaces.
// +optional
PrometheusCR AmazonCloudWatchAgentTargetAllocatorPrometheusCR `json:"prometheusCR,omitempty"`
// SecurityContext configures the container security context for
// the target-allocator.
// +optional
SecurityContext *v1.PodSecurityContext `json:"securityContext,omitempty"`
// TopologySpreadConstraints embedded kubernetes pod configuration option,
// controls how pods are spread across your cluster among failure-domains
// such as regions, zones, nodes, and other user-defined topology domains
// https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/
// +optional
TopologySpreadConstraints []v1.TopologySpreadConstraint `json:"topologySpreadConstraints,omitempty"`
// Toleration embedded kubernetes pod configuration option,
// controls how pods can be scheduled with matching taints
// +optional
Tolerations []v1.Toleration `json:"tolerations,omitempty"`
// ENV vars to set on the OpenTelemetry TargetAllocator's Pods. These can then in certain cases be
// consumed in the config file for the TargetAllocator.
// +optional
Env []v1.EnvVar `json:"env,omitempty"`
}

type AmazonCloudWatchAgentTargetAllocatorPrometheusCR struct {
// Enabled indicates whether to use a PrometheusOperator custom resources as targets or not.
// +optional
Enabled bool `json:"enabled,omitempty"`
// Interval between consecutive scrapes. Equivalent to the same setting on the Prometheus CRD.
//
// Default: "30s"
// +kubebuilder:default:="30s"
// +kubebuilder:validation:Format:=duration
ScrapeInterval *metav1.Duration `json:"scrapeInterval,omitempty"`
// PodMonitors to be selected for target discovery.
// This is a map of {key,value} pairs. Each {key,value} in the map is going to exactly match a label in a
// PodMonitor's meta labels. The requirements are ANDed.
// +optional
PodMonitorSelector map[string]string `json:"podMonitorSelector,omitempty"`
// ServiceMonitors to be selected for target discovery.
// This is a map of {key,value} pairs. Each {key,value} in the map is going to exactly match a label in a
// ServiceMonitor's meta labels. The requirements are ANDed.
// +optional
ServiceMonitorSelector map[string]string `json:"serviceMonitorSelector,omitempty"`
}

// ScaleSubresourceStatus defines the observed state of the AmazonCloudWatchAgent's
// scale subresource.
type ScaleSubresourceStatus struct {
Expand Down
32 changes: 32 additions & 0 deletions apis/v1alpha1/collector_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ import (
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"

"github.com/aws/amazon-cloudwatch-agent-operator/internal/config"
"github.com/aws/amazon-cloudwatch-agent-operator/internal/manifests/collector/adapters"
ta "github.com/aws/amazon-cloudwatch-agent-operator/internal/manifests/targetallocator/adapters"
"github.com/aws/amazon-cloudwatch-agent-operator/pkg/featuregate"
)

var (
Expand Down Expand Up @@ -87,6 +90,9 @@ func (c CollectorWebhook) defaulter(r *AmazonCloudWatchAgent) error {
if r.Spec.Replicas == nil {
r.Spec.Replicas = &one
}
if r.Spec.TargetAllocator.Enabled && r.Spec.TargetAllocator.Replicas == nil {
r.Spec.TargetAllocator.Replicas = &one
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wondering if this needs to be documented or made configurable later ? This means TA is not scalable at the moment

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I think this can be an enhancement.

}

if r.Spec.MaxReplicas != nil || (r.Spec.Autoscaler != nil && r.Spec.Autoscaler.MaxReplicas != nil) {
if r.Spec.Autoscaler == nil {
Expand Down Expand Up @@ -163,6 +169,32 @@ func (c CollectorWebhook) validate(r *AmazonCloudWatchAgent) (admission.Warnings
return warnings, fmt.Errorf("the OpenTelemetry Collector mode is set to %s, which does not support the attribute 'AdditionalContainers'", r.Spec.Mode)
}

// validate target allocation
if r.Spec.TargetAllocator.Enabled && r.Spec.Mode != ModeStatefulSet {
warnings = append(warnings, fmt.Sprintf("The Amazon CloudWatch Agent mode is set to %s, we do not recommend enabling Target Allocator when not running as a StatefulSet", r.Spec.Mode))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: could be updated with The Amazon CloudWatch Agent mode is set to %s, which does not support enabling Target Allocator

The mode could be Deployment and the error message would say StatefulSet which can be confusing

}

// validate Prometheus config for target allocation
if r.Spec.TargetAllocator.Enabled {
promConfigYaml, err := r.Spec.Prometheus.Yaml()
if err != nil {
return warnings, fmt.Errorf("%s could not convert json to yaml", err)
}

promCfg, err := adapters.ConfigFromString(promConfigYaml)
if err != nil {
return warnings, fmt.Errorf("the OpenTelemetry Spec Prometheus configuration is incorrect, %w", err)
}
err = ta.ValidatePromConfig(promCfg, r.Spec.TargetAllocator.Enabled, featuregate.EnableTargetAllocatorRewrite.IsEnabled())
okankoAMZ marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
return warnings, fmt.Errorf("the OpenTelemetry Spec Prometheus configuration is incorrect, %w", err)
}
err = ta.ValidateTargetAllocatorConfig(r.Spec.TargetAllocator.PrometheusCR.Enabled, promCfg)
if err != nil {
return warnings, fmt.Errorf("the OpenTelemetry Spec Prometheus configuration is incorrect, %w", err)
}
}

// validator port config
for _, p := range r.Spec.Ports {
nameErrs := validation.IsValidPortName(p.Name)
Expand Down
Loading
Loading