aws · okankoAMZ · Nov 26, 2024 · Sep 11, 2024 · Sep 19, 2024 · Sep 25, 2024
diff --git a/.github/workflows/build-and-upload-release.yml b/.github/workflows/build-and-upload-release.yml
@@ -6,7 +6,9 @@ env:
   # Use terraform assume role for uploading to ecr
   AWS_ASSUME_ROLE: ${{ secrets.TERRAFORM_AWS_ASSUME_ROLE }}
   ECR_OPERATOR_STAGING_REPO: ${{ vars.ECR_OPERATOR_STAGING_REPO }}
-  ECR_OPERATOR_RELEASE_IMAGE: ${{ secrets.ECR_OPERATOR_RELEASE_IMAGE }}
+  ECR_TARGET_ALLOCATOR_STAGING_REPO: ${{ vars.ECR_TARGET_ALLOCATOR_STAGING_REPO}}
+  ECR_OPERATOR_RELEASE_IMAGE: ${{ vars.ECR_TARGET_ALLOCATOR_TEST_OPERATOR_REPO}}
+  ECR_TARGET_ALLOCATOR_RELEASE_REPO: ${{ vars.ECR_TARGET_ALLOCATOR_RELEASE_REPO}}
 
 on:
   workflow_dispatch:
@@ -81,9 +83,60 @@ jobs:
           tags: ${{ env.ECR_OPERATOR_STAGING_REPO }}:${{ inputs.tag }}
           platforms: linux/amd64, linux/arm64
 
+  MakeTABinary:
+    name: 'MakeTargetAllocatorImage'
+    runs-on: ubuntu-latest
+    permissions:
+      # id-token: write is needed for the OIDC-based role assumption in "Configure AWS Credentials".
+      id-token: write
+      contents: read
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: Set up Go 1.x
+        uses: actions/setup-go@v4
+        with:
+          go-version: '>1.22'
+          cache: true
+
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v2
+        with:
+          role-to-assume: ${{ env.AWS_ASSUME_ROLE }}
+          aws-region: us-west-2
+
+      - name: Login to ECR
+        id: login-ecr
+        uses: aws-actions/amazon-ecr-login@v1
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v1
+
+      # Cross-compile one target allocator binary per image platform; the Makefile
+      # target takes its architecture from `go env GOARCH`, which honours GOARCH.
+      - name: Build Binaries
+        run: |
+          go mod download
+          export GOARCH=arm64 && make targetallocator
+          export GOARCH=amd64 && make targetallocator
+
+      - name: Build Cloudwatch Agent Target Allocator Image and push to ECR
+        uses: docker/build-push-action@v4
+        with:
+          file: ./cmd/amazon-cloudwatch-agent-target-allocator/Dockerfile
+          context: ./cmd/amazon-cloudwatch-agent-target-allocator
+          push: true
+          tags: ${{ env.ECR_TARGET_ALLOCATOR_STAGING_REPO }}:${{ inputs.tag }}
+          platforms: linux/amd64, linux/arm64
+
   e2e-test:
     name: "Application Signals E2E Test"
-    needs: MakeBinary
+    needs: [MakeBinary,MakeTABinary]
     uses: ./.github/workflows/application-signals-e2e-test.yml
     secrets: inherit
     permissions:
@@ -119,4 +172,10 @@ jobs:
         run: |
           docker buildx imagetools create \
           -t ${{ env.ECR_OPERATOR_RELEASE_IMAGE }} \
-          ${{ env.ECR_OPERATOR_STAGING_REPO }}:${{ inputs.tag }}
+          ${{ env.ECR_OPERATOR_STAGING_REPO }}:${{ inputs.tag }}
+
+      - name: Push image to TA release ECR
+        run: |
+          docker buildx imagetools create \
+          -t ${{ env.ECR_TARGET_ALLOCATOR_RELEASE_REPO }} \
+          ${{ env.ECR_TARGET_ALLOCATOR_STAGING_REPO }}:${{ inputs.tag }}
diff --git a/Dockerfile b/Dockerfile
@@ -1,5 +1,5 @@
 # Build the manager binary
-FROM golang:1.20 as builder
+FROM golang:1.21 as builder
 
 # set goproxy=direct
 ENV GOPROXY direct
@@ -30,9 +30,10 @@ ARG AUTO_INSTRUMENTATION_DOTNET_VERSION
 ARG AUTO_INSTRUMENTATION_NODEJS_VERSION
 ARG DCMG_EXPORTER_VERSION
 ARG NEURON_MONITOR_VERSION
+ARG TARGET_ALLOCATOR_VERSION
 
 # Build
-RUN CGO_ENABLED=0 GOOS=linux GO111MODULE=on go build -ldflags="-X ${VERSION_PKG}.version=${VERSION} -X ${VERSION_PKG}.buildDate=${VERSION_DATE} -X ${VERSION_PKG}.agent=${AGENT_VERSION} -X ${VERSION_PKG}.autoInstrumentationJava=${AUTO_INSTRUMENTATION_JAVA_VERSION} -X ${VERSION_PKG}.autoInstrumentationPython=${AUTO_INSTRUMENTATION_PYTHON_VERSION} -X ${VERSION_PKG}.autoInstrumentationDotNet=${AUTO_INSTRUMENTATION_DOTNET_VERSION} -X ${VERSION_PKG}.autoInstrumentationNodeJS=${AUTO_INSTRUMENTATION_NODEJS_VERSION} -X ${VERSION_PKG}.dcgmExporter=${DCMG_EXPORTER_VERSION} -X ${VERSION_PKG}.neuronMonitor=${NEURON_MONITOR_VERSION}" -a -o manager main.go
+RUN CGO_ENABLED=0 GOOS=linux GO111MODULE=on go build -ldflags="-X ${VERSION_PKG}.version=${VERSION} -X ${VERSION_PKG}.buildDate=${VERSION_DATE} -X ${VERSION_PKG}.agent=${AGENT_VERSION} -X ${VERSION_PKG}.autoInstrumentationJava=${AUTO_INSTRUMENTATION_JAVA_VERSION} -X ${VERSION_PKG}.autoInstrumentationPython=${AUTO_INSTRUMENTATION_PYTHON_VERSION} -X ${VERSION_PKG}.autoInstrumentationDotNet=${AUTO_INSTRUMENTATION_DOTNET_VERSION} -X ${VERSION_PKG}.autoInstrumentationNodeJS=${AUTO_INSTRUMENTATION_NODEJS_VERSION} -X ${VERSION_PKG}.dcgmExporter=${DCMG_EXPORTER_VERSION} -X ${VERSION_PKG}.neuronMonitor=${NEURON_MONITOR_VERSION} -X ${VERSION_PKG}.targetAllocator=${TARGET_ALLOCATOR_VERSION}" -a -o manager main.go
 
 # Use distroless as minimal base image to package the manager binary
 # Refer to https://github.com/GoogleContainerTools/distroless for more details
@@ -41,4 +42,4 @@ WORKDIR /
 COPY --from=builder /workspace/manager .
 USER 65532:65532
 
-ENTRYPOINT ["/manager"]
+ENTRYPOINT ["/manager"]
diff --git a/Makefile b/Makefile
@@ -9,13 +9,17 @@ AUTO_INSTRUMENTATION_DOTNET_VERSION ?= "$(shell grep -v '\#' versions.txt | grep
 AUTO_INSTRUMENTATION_NODEJS_VERSION ?= "$(shell grep -v '\#' versions.txt | grep aws-otel-nodejs-instrumentation | awk -F= '{print $$2}')"
 DCGM_EXPORTER_VERSION ?= "$(shell grep -v '\#' versions.txt | grep dcgm-exporter | awk -F= '{print $$2}')"
 NEURON_MONITOR_VERSION ?= "$(shell grep -v '\#' versions.txt | grep neuron-monitor | awk -F= '{print $$2}')"
+TARGET_ALLOCATOR_VERSION ?= "$(shell grep -v '\#' versions.txt | grep target-allocator |  awk -F= '{print $$2}')"
 
 # Image URL to use all building/pushing image targets
 IMG_PREFIX ?= aws
 IMG_REPO ?= cloudwatch-agent-operator
 IMG ?= ${IMG_PREFIX}/${IMG_REPO}:${VERSION}
 ARCH ?= $(shell go env GOARCH)
 
+TARGET_ALLOCATOR_IMG_REPO ?= target-allocator
+TARGET_ALLOCATOR_IMG ?= ${IMG_PREFIX}/${TARGET_ALLOCATOR_IMG_REPO}:${TARGET_ALLOCATOR_VERSION}
+
 # Options for 'bundle-build'
 ifneq ($(origin CHANNELS), undefined)
 BUNDLE_CHANNELS := --channels=$(CHANNELS)
@@ -96,6 +100,10 @@ test: generate fmt vet envtest
 .PHONY: manager
 manager: generate fmt vet
 	go build -o bin/manager main.go
+# Build the target allocator binary for ARCH; output lands in bin/targetallocator_<ARCH> under the target allocator cmd directory
+.PHONY: targetallocator
+targetallocator:
+	cd cmd/amazon-cloudwatch-agent-target-allocator && CGO_ENABLED=0 GOOS=$(GOOS) GOARCH=$(ARCH) go build -installsuffix cgo -o bin/targetallocator_$(ARCH) -ldflags "$(LDFLAGS)" .
 
 # Run against the configured Kubernetes cluster in ~/.kube/config
 .PHONY: run
@@ -155,13 +163,26 @@ generate: controller-gen api-docs
 # buildx is used to ensure same results for arm based systems (m1/2 chips)
 .PHONY: container
 container:
-	docker buildx build --load --platform linux/${ARCH} -t ${IMG} --build-arg VERSION_PKG=${VERSION_PKG} --build-arg VERSION=${VERSION} --build-arg VERSION_DATE=${VERSION_DATE} --build-arg AGENT_VERSION=${AGENT_VERSION} --build-arg AUTO_INSTRUMENTATION_JAVA_VERSION=${AUTO_INSTRUMENTATION_JAVA_VERSION} --build-arg AUTO_INSTRUMENTATION_PYTHON_VERSION=${AUTO_INSTRUMENTATION_PYTHON_VERSION} --build-arg AUTO_INSTRUMENTATION_DOTNET_VERSION=${AUTO_INSTRUMENTATION_DOTNET_VERSION} --build-arg AUTO_INSTRUMENTATION_NODEJS_VERSION=${AUTO_INSTRUMENTATION_NODEJS_VERSION} --build-arg DCGM_EXPORTER_VERSION=${DCGM_EXPORTER_VERSION} --build-arg NEURON_MONITOR_VERSION=${NEURON_MONITOR_VERSION} .
+	docker buildx build --load --platform linux/${ARCH} -t ${IMG} --build-arg VERSION_PKG=${VERSION_PKG} --build-arg VERSION=${VERSION} --build-arg VERSION_DATE=${VERSION_DATE} --build-arg AGENT_VERSION=${AGENT_VERSION} --build-arg AUTO_INSTRUMENTATION_JAVA_VERSION=${AUTO_INSTRUMENTATION_JAVA_VERSION} --build-arg AUTO_INSTRUMENTATION_PYTHON_VERSION=${AUTO_INSTRUMENTATION_PYTHON_VERSION} --build-arg AUTO_INSTRUMENTATION_DOTNET_VERSION=${AUTO_INSTRUMENTATION_DOTNET_VERSION} --build-arg AUTO_INSTRUMENTATION_NODEJS_VERSION=${AUTO_INSTRUMENTATION_NODEJS_VERSION} --build-arg DCGM_EXPORTER_VERSION=${DCGM_EXPORTER_VERSION} --build-arg NEURON_MONITOR_VERSION=${NEURON_MONITOR_VERSION} --build-arg TARGET_ALLOCATOR_VERSION=${TARGET_ALLOCATOR_VERSION} .
 
 # Push the container image, used only for local dev purposes
 .PHONY: container-push
 container-push:
 	docker push ${IMG}
 
+.PHONY: container-target-allocator-push
+container-target-allocator-push: # pushes the image named by TARGET_ALLOCATOR_IMG
+	docker push $(TARGET_ALLOCATOR_IMG)
+
+.PHONY: container-target-allocator
+container-target-allocator: GOOS = linux
+container-target-allocator: targetallocator # the target-specific GOOS above also applies while building this prerequisite
+	docker buildx build --load --platform linux/$(ARCH) -t $(TARGET_ALLOCATOR_IMG) cmd/amazon-cloudwatch-agent-target-allocator
+
+.PHONY: ta-build-and-push
+ta-build-and-push: container-target-allocator # build the image first; the push runs as a recipe so it cannot start early under make -j
+	$(MAKE) container-target-allocator-push
+
 .PHONY: kustomize
 kustomize: ## Download kustomize locally if necessary.
 	$(call go-get-tool,$(KUSTOMIZE),sigs.k8s.io/kustomize/kustomize/v5,$(KUSTOMIZE_VERSION))

diff --git a/apis/v1alpha1/allocation_strategy.go b/apis/v1alpha1/allocation_strategy.go
@@ -0,0 +1,15 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package v1alpha1
+
+type (
+	// AmazonCloudWatchAgentTargetAllocatorAllocationStrategy represents the strategy used to distribute targets to each collector.
+	// +kubebuilder:validation:Enum=consistent-hashing
+	AmazonCloudWatchAgentTargetAllocatorAllocationStrategy string
+)
+
+const (
+	// AmazonCloudWatchAgentTargetAllocatorAllocationStrategyConsistentHashing is the "consistent-hashing" strategy: targets will be consistently added to collectors, which allows a high-availability setup.
+	AmazonCloudWatchAgentTargetAllocatorAllocationStrategyConsistentHashing AmazonCloudWatchAgentTargetAllocatorAllocationStrategy = "consistent-hashing"
+)
diff --git a/apis/v1alpha1/amazoncloudwatchagent_types.go b/apis/v1alpha1/amazoncloudwatchagent_types.go
@@ -143,6 +143,9 @@ type AmazonCloudWatchAgentSpec struct {
 	// Collector and Target Allocator pods.
 	// +optional
 	PodAnnotations map[string]string `json:"podAnnotations,omitempty"`
+	// TargetAllocator indicates a value which determines whether to spawn a target allocation resource or not.
+	// +optional
+	TargetAllocator AmazonCloudWatchAgentTargetAllocator `json:"targetAllocator,omitempty"`
 	// Mode represents how the collector should be deployed (deployment, daemonset, statefulset or sidecar)
 	// +optional
 	Mode Mode `json:"mode,omitempty"`
@@ -164,6 +167,9 @@ type AmazonCloudWatchAgentSpec struct {
 	// ImagePullPolicy indicates the pull policy to be used for retrieving the container image (Always, Never, IfNotPresent)
 	// +optional
 	ImagePullPolicy v1.PullPolicy `json:"imagePullPolicy,omitempty"`
+	// Prometheus is the raw YAML to be used as the collector's prometheus configuration.
+	// +optional
+	Prometheus PrometheusConfig `json:"prometheus,omitempty"`
 	// Config is the raw JSON to be used as the collector's configuration. Refer to the OpenTelemetry Collector documentation for details.
 	// +required
 	Config string `json:"config,omitempty"`
@@ -276,6 +282,87 @@ type AmazonCloudWatchAgentSpec struct {
 	UpdateStrategy appsv1.DaemonSetUpdateStrategy `json:"updateStrategy,omitempty"`
 }
 
+// AmazonCloudWatchAgentTargetAllocator defines the configurations for the Prometheus target allocator.
+type AmazonCloudWatchAgentTargetAllocator struct {
+	// Replicas is the number of pod instances for the underlying TargetAllocator. This should only be set to a value
+	// other than 1 if a strategy that allows for high availability is chosen. Currently, the only allocation strategy
+	// that can be run in a high availability mode is consistent-hashing.
+	// +optional
+	Replicas *int32 `json:"replicas,omitempty"`
+	// NodeSelector is the node selector used to schedule the TargetAllocator pods.
+	// +optional
+	NodeSelector map[string]string `json:"nodeSelector,omitempty"`
+	// Resources are the resource requirements to set on the TargetAllocator containers.
+	// +optional
+	Resources v1.ResourceRequirements `json:"resources,omitempty"`
+	// AllocationStrategy determines which strategy the target allocator should use for allocation.
+	// The current option is consistent-hashing.
+	// +optional
+	AllocationStrategy AmazonCloudWatchAgentTargetAllocatorAllocationStrategy `json:"allocationStrategy,omitempty"`
+	// FilterStrategy determines how to filter targets before allocating them among the collectors.
+	// The only current option is relabel-config (drops targets based on prom relabel_config).
+	// Filtering is disabled by default.
+	// +optional
+	FilterStrategy string `json:"filterStrategy,omitempty"`
+	// ServiceAccount indicates the name of an existing service account to use with this instance. When set,
+	// the operator will not automatically create a ServiceAccount for the TargetAllocator.
+	// +optional
+	ServiceAccount string `json:"serviceAccount,omitempty"`
+	// Image indicates the container image to use for the TargetAllocator.
+	// +optional
+	Image string `json:"image,omitempty"`
+	// Enabled indicates whether to use a target allocation mechanism for Prometheus targets or not.
+	// +optional
+	Enabled bool `json:"enabled,omitempty"`
+	// Affinity, if specified, indicates the pod's scheduling constraints.
+	// +optional
+	Affinity *v1.Affinity `json:"affinity,omitempty"`
+	// PrometheusCR defines the configuration for the retrieval of PrometheusOperator CRDs (servicemonitor.monitoring.coreos.com/v1 and podmonitor.monitoring.coreos.com/v1).
+	// All CR instances which the ServiceAccount has access to will be retrieved. This includes other namespaces.
+	// +optional
+	PrometheusCR AmazonCloudWatchAgentTargetAllocatorPrometheusCR `json:"prometheusCR,omitempty"`
+	// SecurityContext is the pod-level security context (v1.PodSecurityContext) for
+	// the target-allocator.
+	// +optional
+	SecurityContext *v1.PodSecurityContext `json:"securityContext,omitempty"`
+	// TopologySpreadConstraints embedded kubernetes pod configuration option,
+	// controls how pods are spread across your cluster among failure-domains
+	// such as regions, zones, nodes, and other user-defined topology domains
+	// https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/
+	// +optional
+	TopologySpreadConstraints []v1.TopologySpreadConstraint `json:"topologySpreadConstraints,omitempty"`
+	// Tolerations embedded kubernetes pod configuration option,
+	// controls how pods can be scheduled with matching taints
+	// +optional
+	Tolerations []v1.Toleration `json:"tolerations,omitempty"`
+	// Env lists the environment variables to set on the TargetAllocator's Pods. These can then in certain cases be
+	// consumed in the config file for the TargetAllocator.
+	// +optional
+	Env []v1.EnvVar `json:"env,omitempty"`
+}
+
+// AmazonCloudWatchAgentTargetAllocatorPrometheusCR defines the configuration for the retrieval of
+// PrometheusOperator custom resources (ServiceMonitors and PodMonitors).
+type AmazonCloudWatchAgentTargetAllocatorPrometheusCR struct {
+	// Enabled indicates whether to use PrometheusOperator custom resources as targets or not.
+	// +optional
+	Enabled bool `json:"enabled,omitempty"`
+	// ScrapeInterval is the interval between consecutive scrapes. Equivalent to the same setting on the Prometheus CRD.
+	//
+	// Default: "30s"
+	// +kubebuilder:default:="30s"
+	// +kubebuilder:validation:Format:=duration
+	ScrapeInterval *metav1.Duration `json:"scrapeInterval,omitempty"`
+	// PodMonitorSelector selects the PodMonitors for target discovery. This is a map of {key,value} pairs; each
+	// {key,value} in the map is going to exactly match a label in a PodMonitor's meta labels. The requirements are ANDed.
+	// +optional
+	PodMonitorSelector map[string]string `json:"podMonitorSelector,omitempty"`
+	// ServiceMonitorSelector selects the ServiceMonitors for target discovery. This is a map of {key,value} pairs; each
+	// {key,value} in the map is going to exactly match a label in a ServiceMonitor's meta labels. The requirements are ANDed.
+	// +optional
+	ServiceMonitorSelector map[string]string `json:"serviceMonitorSelector,omitempty"`
+}
+
 // ScaleSubresourceStatus defines the observed state of the AmazonCloudWatchAgent's
 // scale subresource.
 type ScaleSubresourceStatus struct {

diff --git a/apis/v1alpha1/collector_webhook.go b/apis/v1alpha1/collector_webhook.go
@@ -16,6 +16,9 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
 
 	"github.com/aws/amazon-cloudwatch-agent-operator/internal/config"
+	"github.com/aws/amazon-cloudwatch-agent-operator/internal/manifests/collector/adapters"
+	ta "github.com/aws/amazon-cloudwatch-agent-operator/internal/manifests/targetallocator/adapters"
+	"github.com/aws/amazon-cloudwatch-agent-operator/pkg/featuregate"
 )
 
 var (
@@ -87,6 +90,9 @@ func (c CollectorWebhook) defaulter(r *AmazonCloudWatchAgent) error {
 	if r.Spec.Replicas == nil {
 		r.Spec.Replicas = &one
 	}
+	if r.Spec.TargetAllocator.Enabled && r.Spec.TargetAllocator.Replicas == nil {
+		r.Spec.TargetAllocator.Replicas = &one
+	}
 
 	if r.Spec.MaxReplicas != nil || (r.Spec.Autoscaler != nil && r.Spec.Autoscaler.MaxReplicas != nil) {
 		if r.Spec.Autoscaler == nil {
@@ -163,6 +169,32 @@ func (c CollectorWebhook) validate(r *AmazonCloudWatchAgent) (admission.Warnings
 		return warnings, fmt.Errorf("the OpenTelemetry Collector mode is set to %s, which does not support the attribute 'AdditionalContainers'", r.Spec.Mode)
 	}
 
+	// validate target allocation
+	if r.Spec.TargetAllocator.Enabled && r.Spec.Mode != ModeStatefulSet {
+		warnings = append(warnings, fmt.Sprintf("The Amazon CloudWatch Agent mode is set to %s, we do not recommend enabling Target Allocator when not running as a StatefulSet", r.Spec.Mode))
+	}
+
+	// validate Prometheus config for target allocation (only checked when the target allocator is enabled)
+	if r.Spec.TargetAllocator.Enabled {
+		promConfigYaml, err := r.Spec.Prometheus.Yaml()
+		if err != nil {
+			// %w instead of %s: the rendered message is unchanged, but callers can now unwrap the cause.
+			return warnings, fmt.Errorf("%w could not convert json to yaml", err)
+		}
+
+		promCfg, err := adapters.ConfigFromString(promConfigYaml)
+		if err != nil {
+			return warnings, fmt.Errorf("the OpenTelemetry Spec Prometheus configuration is incorrect, %w", err)
+		}
+		// Both validators below run against the parsed Prometheus configuration.
+		if err = ta.ValidatePromConfig(promCfg, r.Spec.TargetAllocator.Enabled, featuregate.EnableTargetAllocatorRewrite.IsEnabled()); err != nil {
+			return warnings, fmt.Errorf("the OpenTelemetry Spec Prometheus configuration is incorrect, %w", err)
+		}
+		if err = ta.ValidateTargetAllocatorConfig(r.Spec.TargetAllocator.PrometheusCR.Enabled, promCfg); err != nil {
+			return warnings, fmt.Errorf("the OpenTelemetry Spec Prometheus configuration is incorrect, %w", err)
+		}
+	}
+
 	// validator port config
 	for _, p := range r.Spec.Ports {
 		nameErrs := validation.IsValidPortName(p.Name)