E2E Test: Fix and update release testing #51

Merged 4 commits on Dec 19, 2023
122 changes: 84 additions & 38 deletions .github/workflows/appsignals-e2e-test.yml
@@ -1,16 +1,18 @@
# This is a reusable workflow for running the E2E test for App Signals.
# It is meant to be called from another workflow.
# This E2E test is responsible for validating setting up a sample application on an EKS cluster and enabling
# App Signals using the staging image of the CloudWatch Agent Operator. It validates the generated telemetry
# including logs, metrics, and traces, then cleans up the cluster. The testing resources can be found in the
# ADOT java instrumentation repo: https://github.com/aws-observability/aws-otel-java-instrumentation/tree/main/testing
# Read more about reusable workflows: https://docs.github.com/en/actions/using-workflows/reusing-workflows#overview
name: App Signals Enablement E2E Testing
on:
workflow_call:
inputs:
# Ensure two tests do not run on the same cluster at the same time through GitHub Action concurrency
test-cluster-name:
required: true
type: string
caller-workflow-name:
required: true
type: string

permissions:
id-token: write
@@ -19,31 +21,41 @@ permissions:
env:
AWS_DEFAULT_REGION: us-east-1
TEST_ACCOUNT: ${{ secrets.APP_SIGNALS_E2E_TEST_ACCOUNT }}
ENABLEMENT_SCRIPT_S3_BUCKET: ${{ secrets.APP_SIGNALS_E2E_ONBOARDING_ZIP_S3_URI }}
SAMPLE_APP_NAMESPACE: sample-app-namespace
SAMPLE_APP_FRONTEND_SERVICE_IMAGE: ${{ secrets.APP_SIGNALS_E2E_SAMPLE_APP_FRONTEND_SERVICE_IMAGE }}
SAMPLE_APP_REMOTE_SERVICE_IMAGE: ${{ secrets.APP_SIGNALS_E2E_SAMPLE_APP_REMOTE_SERVICE_IMAGE }}
METRIC_NAMESPACE: AppSignals
LOG_GROUP: /aws/appsignals/eks
ECR_OPERATOR_STAGING_IMAGE: ${{ secrets.ECR_OPERATOR_STAGING_IMAGE }}
ECR_OPERATOR_RELEASE_IMAGE: ${{ secrets.ECR_OPERATOR_RELEASE_IMAGE }}

jobs:
appsignals-e2e-test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
# This step avoids code duplication for terraform templates and the validator
# To simplify, we get the entire repo
- name: Get testing resources from ADOT
uses: actions/checkout@v4
with:
repository: aws-observability/aws-otel-java-instrumentation
ref: main

- name: Download enablement script
uses: actions/checkout@v4
with:
# Checkout e2e-test branch and get only the required resources for testing
ref: e2e-test
repository: aws-observability/application-signals-demo
ref: main
path: enablement-script
sparse-checkout: |
test
scripts/eks/appsignals/enable-app-signals.sh
scripts/eks/appsignals/clean-app-signals.sh
sparse-checkout-cone-mode: false
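The sparse-checkout step above fetches only the enablement scripts and test directory instead of the whole demo repository. A minimal local sketch of the same idea follows; it builds a stand-in repository with the same layout so it runs offline, and uses the older pattern-file mechanism (`core.sparseCheckout` plus `.git/info/sparse-checkout`) rather than the `sparse-checkout` input that actions/checkout wires up for you:

```shell
#!/bin/sh
# Sketch: non-cone sparse checkout limited to two scripts plus a test
# directory. The repo contents here are stand-ins for the real
# application-signals-demo layout.
set -eu
workdir=$(mktemp -d)
cd "$workdir"

# Build a local stand-in repository so the example needs no network.
git init -q upstream
(
  cd upstream
  mkdir -p scripts/eks/appsignals test docs
  echo 'echo enable' > scripts/eks/appsignals/enable-app-signals.sh
  echo 'echo clean'  > scripts/eks/appsignals/clean-app-signals.sh
  echo 'placeholder' > test/placeholder
  echo 'unwanted'    > docs/README.md
  git add .
  git -c user.email=ci@example.com -c user.name=ci commit -qm init
)

# Clone without checking out, list the wanted paths as sparse
# patterns, then populate the working tree from HEAD.
git clone -q --no-checkout upstream enablement-script
cd enablement-script
git config core.sparseCheckout true
printf '%s\n' \
  'test/' \
  'scripts/eks/appsignals/enable-app-signals.sh' \
  'scripts/eks/appsignals/clean-app-signals.sh' \
  > .git/info/sparse-checkout
git read-tree -mu HEAD

ls scripts/eks/appsignals        # only the two wanted scripts appear
test ! -e docs/README.md && echo "docs excluded"
```

The non-cone mode matters because individual file paths are listed; cone mode only accepts whole directories.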

- name: Generate testing id
run: echo TESTING_ID="${{ github.run_id }}-${{ github.run_number }}" >> $GITHUB_ENV
run: echo TESTING_ID="${{ env.AWS_DEFAULT_REGION }}-${{ github.run_id }}-${{ github.run_number }}" >> $GITHUB_ENV
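The updated run line above works because GitHub Actions exposes a file path in `$GITHUB_ENV`; every `KEY=value` line appended to that file becomes an environment variable for later steps. A small offline sketch (region and run identifiers are made-up stand-ins):

```shell
#!/bin/sh
# Sketch of the TESTING_ID step: append KEY=value to the file named by
# $GITHUB_ENV, then read it back the way a later step would see it.
set -eu
GITHUB_ENV=$(mktemp)   # the runner provides this path for real

AWS_DEFAULT_REGION=us-east-1
GITHUB_RUN_ID=1234567890
GITHUB_RUN_NUMBER=42

# Equivalent of the workflow's: echo TESTING_ID="..." >> $GITHUB_ENV
echo "TESTING_ID=${AWS_DEFAULT_REGION}-${GITHUB_RUN_ID}-${GITHUB_RUN_NUMBER}" >> "$GITHUB_ENV"

# The runner parses this file itself between steps; sourcing it works
# here because the values contain no special characters.
. "$GITHUB_ENV"
echo "$TESTING_ID"   # us-east-1-1234567890-42
```

Prefixing the region onto the id is what keeps resources unique when the same run number executes in more than one region.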

- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v2
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: ${{ secrets.APP_SIGNALS_E2E_TEST_ROLE_ARN }}
aws-region: ${{ env.AWS_DEFAULT_REGION }}
@@ -78,17 +90,18 @@ jobs:
--approve

- name: Set up terraform
uses: hashicorp/setup-terraform@v2
uses: hashicorp/setup-terraform@v3
with:
terraform_wrapper: false

- name: Deploy sample app via terraform
working-directory: test/terraform/eks
working-directory: testing/terraform/eks
run: |
terraform init
terraform validate
terraform apply -auto-approve \
-var="test_id=${{ env.TESTING_ID }}" \
-var="aws_region=${{ env.AWS_DEFAULT_REGION }}" \
-var="kube_directory_path=${{ github.workspace }}/.kube" \
-var="eks_cluster_name=${{ inputs.test-cluster-name }}" \
-var="eks_cluster_context_name=$(kubectl config current-context)" \
@@ -98,20 +111,25 @@
-var="sample_remote_app_image=${{ env.SAMPLE_APP_REMOTE_SERVICE_IMAGE }}"

# Enable App Signals on the test cluster
- name: Pull and unzip enablement script from S3
run: aws s3 cp ${{ env.ENABLEMENT_SCRIPT_S3_BUCKET }} . && unzip -j onboarding.zip

- name: Set the CW Agent Operator image to the staging image in the manifest
if: inputs.caller-workflow-name == 'build-and-upload-staging'
run: "sed -i 's#${{ env.ECR_OPERATOR_RELEASE_IMAGE }}#${{ env.ECR_OPERATOR_STAGING_IMAGE }}#g' app-signals.yaml"

- name: Enable App Signals
working-directory: enablement-script/scripts/eks/appsignals
run: |
./enable-app-signals.sh \
${{ inputs.test-cluster-name }} \
${{ env.AWS_DEFAULT_REGION }} \
${{ env.SAMPLE_APP_NAMESPACE }}
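The enablement step passes the cluster name, region, and namespace as positional arguments. The real `enable-app-signals.sh` lives in the application-signals-demo repo and is not shown in this diff; the hypothetical skeleton below only illustrates the calling contract the workflow relies on:

```shell
#!/bin/sh
# Hypothetical argument handling for an enable-app-signals.sh-style
# script. Names and the usage string are assumptions for illustration;
# only the positional order (cluster, region, namespace) comes from
# the workflow step above.
set -eu

enable_app_signals() {
  CLUSTER_NAME=${1:?usage: enable-app-signals.sh CLUSTER REGION NAMESPACE}
  REGION=${2:?missing region}
  NAMESPACE=${3:?missing namespace}
  echo "enabling App Signals: cluster=$CLUSTER_NAME region=$REGION namespace=$NAMESPACE"
}

# Invoked the same way the workflow does, with example values.
enable_app_signals e2e-cw-agent-operator-test us-east-1 sample-app-namespace
```

The `${N:?message}` expansions make a missing argument fail fast with a usage hint instead of silently running against empty values.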


- name: Save CloudWatch Agent Operator image to environment before patching
run: |
echo "OLD_CW_AGENT_OPERATOR_IMAGE"=$(kubectl get pods -n amazon-cloudwatch -l app.kubernetes.io/name=amazon-cloudwatch-observability -o json | \
jq '.items[0].status.containerStatuses[0].image') >> $GITHUB_ENV

- name: Patch the CloudWatch Agent Operator image and restart CloudWatch pods
run: |
kubectl patch deploy -n amazon-cloudwatch amazon-cloudwatch-observability-controller-manager --type='json' -p '[{"op": "replace", "path": "/spec/template/spec/containers/0/image", "value": "${{ env.ECR_OPERATOR_STAGING_IMAGE }}"}]'
kubectl delete pods --all -n amazon-cloudwatch
kubectl wait --for=condition=Ready pod --all -n amazon-cloudwatch

# Application pods need to be restarted for the
# app signals instrumentation to take effect
- name: Restart the app pods
@@ -126,22 +144,38 @@
echo "REMOTE_SERVICE_DEPLOYMENT_NAME=$(kubectl get deployments -n ${{ env.SAMPLE_APP_NAMESPACE }} --selector=app=remote-app -o jsonpath='{.items[0].metadata.name}')" >> $GITHUB_ENV
echo "REMOTE_SERVICE_POD_IP=$(kubectl get pods -n ${{ env.SAMPLE_APP_NAMESPACE }} --selector=app=remote-app -o jsonpath='{.items[0].status.podIP}')" >> $GITHUB_ENV

- name: Verify pod ADOT image
- name: Log pod ADOT image ID
run: |
kubectl get pods -n ${{ env.SAMPLE_APP_NAMESPACE }} --output json | \
jq '.items[0].status.initContainerStatuses[0].imageID'

- name: Verify pod CWAgent image
- name: Log pod CWAgent image ID
run: |
kubectl get pods -n amazon-cloudwatch --output json | \
kubectl get pods -n amazon-cloudwatch -l app.kubernetes.io/name=cloudwatch-agent -o json | \
jq '.items[0].status.containerStatuses[0].imageID'

- name: Log pod CWAgent Operator image ID and save image to the environment
run: |
kubectl get pods -n amazon-cloudwatch -l app.kubernetes.io/name=amazon-cloudwatch-observability -o json | \
jq '.items[0].status.containerStatuses[0].imageID'

echo "NEW_CW_AGENT_OPERATOR_IMAGE"=$(kubectl get pods -n amazon-cloudwatch -l app.kubernetes.io/name=amazon-cloudwatch-observability -o json | \
jq '.items[0].status.containerStatuses[0].image') >> $GITHUB_ENV

- name: Check if CW Agent Operator image has changed
run: |
if [ ${{ env.OLD_CW_AGENT_OPERATOR_IMAGE }} = ${{ env.NEW_CW_AGENT_OPERATOR_IMAGE }} ]; then
echo "Operator image did not change"
exit 1
fi
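One reviewable detail of the check above: the `[ old = new ]` comparison is unquoted, so an empty value (for example, if the earlier `kubectl get ... | jq` produced nothing) turns the test into a shell syntax error rather than a clean failure. A defensive sketch of the same check, not the PR's code:

```shell
#!/bin/sh
# Sketch: quoted comparison for the operator-image check. Image
# strings are example values, not the real ECR URIs.
set -u

check_image_changed() {
  old=$1
  new=$2
  # Quoting both sides keeps the test well-formed even when a value
  # is empty or contains spaces.
  if [ "$old" = "$new" ]; then
    echo "Operator image did not change"
    return 1
  fi
  echo "Operator image changed: $old -> $new"
}

check_image_changed "release:1.0" "staging:1.1"
check_image_changed "" "" || echo "empty values handled without a syntax error"
```

Returning non-zero when the image is unchanged mirrors the workflow's `exit 1`, which is what fails the job when the staging patch did not take effect.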

- name: Get the sample app endpoint
run: |
echo "APP_ENDPOINT=$(terraform output sample_app_endpoint)" >> $GITHUB_ENV
working-directory: test/terraform/eks
working-directory: testing/terraform/eks

- name: Wait for app endpoint to come online
id: endpoint-check
run: |
attempt_counter=0
max_attempts=30
@@ -156,55 +190,63 @@
sleep 10
done
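The endpoint-wait step is truncated by the diff view, but its visible tail (`attempt_counter`, `max_attempts=30`, `sleep 10`, `done`) is a standard poll-until-ready loop. A self-contained sketch of that pattern, with the curl probe replaced by a stub so it runs offline and the 10-second sleep shortened:

```shell
#!/bin/sh
# Sketch of the polling pattern in the step above. The real step curls
# http://$APP_ENDPOINT; "probe" here is a stub that succeeds on the
# third attempt so the loop logic is exercised without a network.
set -eu

tries=0
probe() {
  tries=$((tries + 1))
  [ "$tries" -ge 3 ]   # pretend the app comes online on attempt 3
}

attempt_counter=0
max_attempts=30
until probe; do
  if [ "$attempt_counter" -ge "$max_attempts" ]; then
    echo "Endpoint never came online" >&2
    exit 1
  fi
  attempt_counter=$((attempt_counter + 1))
  sleep 1   # the workflow sleeps 10s between attempts
done
echo "online after $tries probes"   # online after 3 probes
```

With 30 attempts at 10 seconds each, the real step gives the load balancer roughly five minutes to start answering before the job fails.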

# This step increases the speed of the validation by creating the telemetry data in advance
- name: Call all test APIs
continue-on-error: true
run: |
curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/outgoing-http-call/
curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/aws-sdk-call/
curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/remote-service?ip=${{ env.REMOTE_SERVICE_POD_IP }}/
curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/client-call/

# Validation for app signals telemetry data
- name: Call endpoint and validate generated EMF logs
id: log-validation
working-directory: test/validator
run: ./gradlew run --args='-c log-validation.yml
if: steps.endpoint-check.outcome == 'success' && !cancelled()
run: ./gradlew testing:validator:run --args='-c eks/log-validation.yml
--testing-id ${{ env.TESTING_ID }}
--endpoint http://${{ env.APP_ENDPOINT }}
--region ${{ env.AWS_DEFAULT_REGION }}
--account-id ${{ env.TEST_ACCOUNT }}
--metric-namespace ${{ env.METRIC_NAMESPACE }}
--log-group ${{ env.LOG_GROUP }}
--app-namespace ${{ env.SAMPLE_APP_NAMESPACE }}
--cluster ${{ inputs.test-cluster-name }}
--platform-info ${{ inputs.test-cluster-name }}
--service-name sample-application-${{ env.TESTING_ID }}
--remote-service-deployment-name ${{ env.REMOTE_SERVICE_DEPLOYMENT_NAME }}
--request-body ip=${{ env.REMOTE_SERVICE_POD_IP }}
--rollup'

- name: Call endpoints and validate generated metrics
id: metric-validation
if: success() || steps.log-validation.outcome == 'failure'
working-directory: test/validator
run: ./gradlew run --args='-c metric-validation.yml
if: (success() || steps.log-validation.outcome == 'failure') && !cancelled()
run: ./gradlew testing:validator:run --args='-c eks/metric-validation.yml
--testing-id ${{ env.TESTING_ID }}
--endpoint http://${{ env.APP_ENDPOINT }}
--region ${{ env.AWS_DEFAULT_REGION }}
--account-id ${{ env.TEST_ACCOUNT }}
--metric-namespace ${{ env.METRIC_NAMESPACE }}
--log-group ${{ env.LOG_GROUP }}
--app-namespace ${{ env.SAMPLE_APP_NAMESPACE }}
--cluster ${{ inputs.test-cluster-name }}
--platform-info ${{ inputs.test-cluster-name }}
--service-name sample-application-${{ env.TESTING_ID }}
--remote-service-name sample-remote-application-${{ env.TESTING_ID }}
--remote-service-deployment-name ${{ env.REMOTE_SERVICE_DEPLOYMENT_NAME }}
--request-body ip=${{ env.REMOTE_SERVICE_POD_IP }}
--rollup'

- name: Call endpoints and validate generated traces
if: success() || steps.log-validation.outcome == 'failure' || steps.metric-validation.outcome == 'failure'
working-directory: test/validator
run: ./gradlew run --args='-c trace-validation.yml
id: trace-validation
if: (success() || steps.log-validation.outcome == 'failure' || steps.metric-validation.outcome == 'failure') && !cancelled()
run: ./gradlew testing:validator:run --args='-c eks/trace-validation.yml
--testing-id ${{ env.TESTING_ID }}
--endpoint http://${{ env.APP_ENDPOINT }}
--region ${{ env.AWS_DEFAULT_REGION }}
--account-id ${{ env.TEST_ACCOUNT }}
--metric-namespace ${{ env.METRIC_NAMESPACE }}
--log-group ${{ env.LOG_GROUP }}
--app-namespace ${{ env.SAMPLE_APP_NAMESPACE }}
--cluster ${{ inputs.test-cluster-name }}
--platform-info ${{ inputs.test-cluster-name }}
--service-name sample-application-${{ env.TESTING_ID }}
--remote-service-deployment-name ${{ env.REMOTE_SERVICE_DEPLOYMENT_NAME }}
--request-body ip=${{ env.REMOTE_SERVICE_POD_IP }}
@@ -214,12 +256,15 @@ jobs:

- name: Remove log group deletion command
if: always()
working-directory: enablement-script/scripts/eks/appsignals
run: |
delete_log_group="aws logs delete-log-group --log-group-name '${{ env.LOG_GROUP }}' --region \$REGION"
sed -i "s#$delete_log_group##g" clean-app-signals.sh
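The step above blanks out the `delete-log-group` call in `clean-app-signals.sh` so the log group survives cleanup long enough for validation, using `#` as the sed delimiter because the log-group name contains `/`. A standalone sketch with a stand-in cleanup script (the real script's contents are not in this diff; GNU sed's `-i` is assumed):

```shell
#!/bin/sh
# Sketch of the log-group-preservation edit: strip one command out of
# a cleanup script before it runs. The script body is a stand-in for
# the real clean-app-signals.sh.
set -eu
LOG_GROUP=/aws/appsignals/eks

cat > clean-app-signals.sh <<'EOF'
kubectl delete namespace sample-app-namespace
aws logs delete-log-group --log-group-name '/aws/appsignals/eks' --region $REGION
echo cleanup done
EOF

# Build the exact text to remove; \$ keeps $REGION literal so the
# pattern matches the unexpanded text inside the script.
delete_log_group="aws logs delete-log-group --log-group-name '$LOG_GROUP' --region \$REGION"
sed -i "s#$delete_log_group##g" clean-app-signals.sh

if grep -q 'delete-log-group' clean-app-signals.sh; then
  echo "removal failed" >&2
  exit 1
fi
echo "log group preserved"
```

Editing the script rather than passing a flag works here because the cleanup script is fetched fresh each run, so the mutation never leaks between jobs.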

- name: Clean Up App Signals
if: always()
continue-on-error: true
working-directory: enablement-script/scripts/eks/appsignals
run: |
./clean-app-signals.sh \
${{ inputs.test-cluster-name }} \
@@ -236,10 +281,11 @@
- name: Terraform destroy
if: always()
continue-on-error: true
working-directory: testing/terraform/eks
run: |
cd test/terraform/eks
terraform destroy -auto-approve \
-var="test_id=${{ env.TESTING_ID }}" \
-var="aws_region=${{ env.AWS_DEFAULT_REGION }}" \
-var="kube_directory_path=${{ github.workspace }}/.kube" \
-var="eks_cluster_name=${{ inputs.test-cluster-name }}" \
-var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" \
@@ -254,4 +300,4 @@ jobs:
--name service-account-${{ env.TESTING_ID }} \
--namespace ${{ env.SAMPLE_APP_NAMESPACE }} \
--cluster ${{ inputs.test-cluster-name }} \
--region ${{ env.AWS_DEFAULT_REGION }} \
--region ${{ env.AWS_DEFAULT_REGION }}
6 changes: 3 additions & 3 deletions .github/workflows/build-and-upload-staging.yml
@@ -65,9 +65,9 @@ jobs:
needs: MakeBinary
uses: ./.github/workflows/appsignals-e2e-test.yml
secrets: inherit
# Two E2E tests should not run at the same time in the same EKS cluster
concurrency:
group: 'appsignals-cw-agent-operator-test'
group: 'e2e-cw-agent-operator-test'
cancel-in-progress: false
with:
test-cluster-name: 'e2e-cw-agent-operator-test'
caller-workflow-name: 'build-and-upload-staging'
test-cluster-name: 'e2e-cw-agent-operator-test'