From 138423df6e82712882546e1adeb4e7918c36175a Mon Sep 17 00:00:00 2001 From: Mahad Janjua Date: Fri, 15 Dec 2023 10:36:03 -0800 Subject: [PATCH 1/4] E2E Test: Fix and update release testing --- .github/workflows/appsignals-e2e-test.yml | 91 ++++++++++++------- .../workflows/build-and-upload-staging.yml | 5 +- 2 files changed, 58 insertions(+), 38 deletions(-) diff --git a/.github/workflows/appsignals-e2e-test.yml b/.github/workflows/appsignals-e2e-test.yml index 3d0b30ad..b3eef1f1 100644 --- a/.github/workflows/appsignals-e2e-test.yml +++ b/.github/workflows/appsignals-e2e-test.yml @@ -8,9 +8,6 @@ on: test-cluster-name: required: true type: string - caller-workflow-name: - required: true - type: string permissions: id-token: write @@ -19,31 +16,41 @@ permissions: env: AWS_DEFAULT_REGION: us-east-1 TEST_ACCOUNT: ${{ secrets.APP_SIGNALS_E2E_TEST_ACCOUNT }} - ENABLEMENT_SCRIPT_S3_BUCKET: ${{ secrets.APP_SIGNALS_E2E_ONBOARDING_ZIP_S3_URI }} SAMPLE_APP_NAMESPACE: sample-app-namespace SAMPLE_APP_FRONTEND_SERVICE_IMAGE: ${{ secrets.APP_SIGNALS_E2E_SAMPLE_APP_FRONTEND_SERVICE_IMAGE }} SAMPLE_APP_REMOTE_SERVICE_IMAGE: ${{ secrets.APP_SIGNALS_E2E_SAMPLE_APP_REMOTE_SERVICE_IMAGE }} METRIC_NAMESPACE: AppSignals LOG_GROUP: /aws/appsignals/eks ECR_OPERATOR_STAGING_IMAGE: ${{ secrets.ECR_OPERATOR_STAGING_IMAGE }} - ECR_OPERATOR_RELEASE_IMAGE: ${{ secrets.ECR_OPERATOR_RELEASE_IMAGE }} jobs: appsignals-e2e-test: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + # This step avoids code duplication for terraform templates and the validator + # To simplify, we get the entire repo + - name: Get testing resources from ADOT + uses: actions/checkout@v4 + with: + repository: aws-observability/aws-otel-java-instrumentation + ref: main + + - name: Download enablement script + uses: actions/checkout@v4 with: - # Checkout e2e-test branch and get only the required resources for testing - ref: e2e-test + repository: aws-observability/application-signals-demo + ref: main + path: 
enablement-script sparse-checkout: | - test + scripts/eks/appsignals/enable-app-signals.sh + scripts/eks/appsignals/clean-app-signals.sh + sparse-checkout-cone-mode: false - name: Generate testing id - run: echo TESTING_ID="${{ github.run_id }}-${{ github.run_number }}" >> $GITHUB_ENV + run: echo TESTING_ID="${{ env.AWS_DEFAULT_REGION }}-${{ github.run_id }}-${{ github.run_number }}" >> $GITHUB_ENV - name: Configure AWS Credentials - uses: aws-actions/configure-aws-credentials@v2 + uses: aws-actions/configure-aws-credentials@v4 with: role-to-assume: ${{ secrets.APP_SIGNALS_E2E_TEST_ROLE_ARN }} aws-region: ${{ env.AWS_DEFAULT_REGION }} @@ -78,17 +85,18 @@ jobs: --approve - name: Set up terraform - uses: hashicorp/setup-terraform@v2 + uses: hashicorp/setup-terraform@v3 with: terraform_wrapper: false - name: Deploy sample app via terraform - working-directory: test/terraform/eks + working-directory: testing/terraform/eks run: | terraform init terraform validate terraform apply -auto-approve \ -var="test_id=${{ env.TESTING_ID }}" \ + -var="aws_region=${{ env.AWS_DEFAULT_REGION }}" \ -var="kube_directory_path=${{ github.workspace }}/.kube" \ -var="eks_cluster_name=${{ inputs.test-cluster-name }}" \ -var="eks_cluster_context_name=$(kubectl config current-context)" \ @@ -98,20 +106,20 @@ jobs: -var="sample_remote_app_image=${{ env.SAMPLE_APP_REMOTE_SERVICE_IMAGE }}" # Enable App Signals on the test cluster - - name: Pull and unzip enablement script from S3 - run: aws s3 cp ${{ env.ENABLEMENT_SCRIPT_S3_BUCKET }} . 
&& unzip -j onboarding.zip - - - name: Set the CW Agent Operator image to the staging image in the manifest - if: inputs.caller-workflow-name == 'build-and-upload-staging' - run: "sed -i 's#${{ env.ECR_OPERATOR_RELEASE_IMAGE }}#${{ env.ECR_OPERATOR_STAGING_IMAGE }}#g' app-signals.yaml" - name: Enable App Signals + working-directory: enablement-script/scripts/eks/appsignals run: | ./enable-app-signals.sh \ ${{ inputs.test-cluster-name }} \ ${{ env.AWS_DEFAULT_REGION }} \ ${{ env.SAMPLE_APP_NAMESPACE }} - + + - name: Patch the CloudWatch Agent Operator image and restart CloudWatch pods + run: | + kubectl patch deploy -n amazon-cloudwatch amazon-cloudwatch-observability-controller-manager --type='json' -p '[{"op": "replace", "path": "/spec/template/spec/containers/0/image", "value": "${{ env.ECR_OPERATOR_STAGING_IMAGE }}"}]' + kubectl delete pods --all -n amazon-cloudwatch + kubectl wait --for=condition=Ready pod --all -n amazon-cloudwatch + # Application pods need to be restarted for the # app signals instrumentation to take effect - name: Restart the app pods @@ -139,9 +147,10 @@ jobs: - name: Get the sample app endpoint run: | echo "APP_ENDPOINT=$(terraform output sample_app_endpoint)" >> $GITHUB_ENV - working-directory: test/terraform/eks + working-directory: testing/terraform/eks - name: Wait for app endpoint to come online + id: endpoint-check run: | attempt_counter=0 max_attempts=30 @@ -156,11 +165,20 @@ jobs: sleep 10 done + # This step increases the speed of the validation by creating the telemetry data in advance + - name: Call all test APIs + continue-on-error: true + run: | + curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/outgoing-http-call/ + curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/aws-sdk-call/ + curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/remote-service?ip=${{ env.REMOTE_SERVICE_POD_IP }}/ + curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/client-call/ + # Validation for app signals telemetry data - name: Call 
endpoint and validate generated EMF logs id: log-validation - working-directory: test/validator - run: ./gradlew run --args='-c log-validation.yml + if: steps.endpoint-check.outcome == 'success' && !cancelled() + run: ./gradlew testing:validator:run --args='-c eks/log-validation.yml --testing-id ${{ env.TESTING_ID }} --endpoint http://${{ env.APP_ENDPOINT }} --region ${{ env.AWS_DEFAULT_REGION }} @@ -168,7 +186,7 @@ jobs: --metric-namespace ${{ env.METRIC_NAMESPACE }} --log-group ${{ env.LOG_GROUP }} --app-namespace ${{ env.SAMPLE_APP_NAMESPACE }} - --cluster ${{ inputs.test-cluster-name }} + --platform-info ${{ inputs.test-cluster-name }} --service-name sample-application-${{ env.TESTING_ID }} --remote-service-deployment-name ${{ env.REMOTE_SERVICE_DEPLOYMENT_NAME }} --request-body ip=${{ env.REMOTE_SERVICE_POD_IP }} @@ -176,9 +194,8 @@ jobs: - name: Call endpoints and validate generated metrics id: metric-validation - if: success() || steps.log-validation.outcome == 'failure' - working-directory: test/validator - run: ./gradlew run --args='-c metric-validation.yml + if: (success() || steps.log-validation.outcome == 'failure') && !cancelled() + run: ./gradlew testing:validator:run --args='-c eks/metric-validation.yml --testing-id ${{ env.TESTING_ID }} --endpoint http://${{ env.APP_ENDPOINT }} --region ${{ env.AWS_DEFAULT_REGION }} @@ -186,7 +203,7 @@ jobs: --metric-namespace ${{ env.METRIC_NAMESPACE }} --log-group ${{ env.LOG_GROUP }} --app-namespace ${{ env.SAMPLE_APP_NAMESPACE }} - --cluster ${{ inputs.test-cluster-name }} + --platform-info ${{ inputs.test-cluster-name }} --service-name sample-application-${{ env.TESTING_ID }} --remote-service-name sample-remote-application-${{ env.TESTING_ID }} --remote-service-deployment-name ${{ env.REMOTE_SERVICE_DEPLOYMENT_NAME }} @@ -194,9 +211,9 @@ jobs: --rollup' - name: Call endpoints and validate generated traces - if: success() || steps.log-validation.outcome == 'failure' || steps.metric-validation.outcome == 
'failure' - working-directory: test/validator - run: ./gradlew run --args='-c trace-validation.yml + id: trace-validation + if: (success() || steps.log-validation.outcome == 'failure' || steps.metric-validation.outcome == 'failure') && !cancelled() + run: ./gradlew testing:validator:run --args='-c eks/trace-validation.yml --testing-id ${{ env.TESTING_ID }} --endpoint http://${{ env.APP_ENDPOINT }} --region ${{ env.AWS_DEFAULT_REGION }} @@ -204,7 +221,7 @@ jobs: --metric-namespace ${{ env.METRIC_NAMESPACE }} --log-group ${{ env.LOG_GROUP }} --app-namespace ${{ env.SAMPLE_APP_NAMESPACE }} - --cluster ${{ inputs.test-cluster-name }} + --platform-info ${{ inputs.test-cluster-name }} --service-name sample-application-${{ env.TESTING_ID }} --remote-service-deployment-name ${{ env.REMOTE_SERVICE_DEPLOYMENT_NAME }} --request-body ip=${{ env.REMOTE_SERVICE_POD_IP }} @@ -214,12 +231,15 @@ jobs: - name: Remove log group deletion command if: always() + working-directory: enablement-script/scripts/eks/appsignals run: | delete_log_group="aws logs delete-log-group --log-group-name '${{ env.LOG_GROUP }}' --region \$REGION" sed -i "s#$delete_log_group##g" clean-app-signals.sh - name: Clean Up App Signals if: always() + continue-on-error: true + working-directory: enablement-script/scripts/eks/appsignals run: | ./clean-app-signals.sh \ ${{ inputs.test-cluster-name }} \ @@ -236,10 +256,11 @@ jobs: - name: Terraform destroy if: always() continue-on-error: true + working-directory: testing/terraform/eks run: | - cd test/terraform/eks terraform destroy -auto-approve \ -var="test_id=${{ env.TESTING_ID }}" \ + -var="aws_region=${{ env.AWS_DEFAULT_REGION }}" \ -var="kube_directory_path=${{ github.workspace }}/.kube" \ -var="eks_cluster_name=${{ inputs.test-cluster-name }}" \ -var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" \ @@ -254,4 +275,4 @@ jobs: --name service-account-${{ env.TESTING_ID }} \ --namespace ${{ env.SAMPLE_APP_NAMESPACE }} \ --cluster ${{ inputs.test-cluster-name }} \ 
- --region ${{ env.AWS_DEFAULT_REGION }} \ + --region ${{ env.AWS_DEFAULT_REGION }} \ No newline at end of file diff --git a/.github/workflows/build-and-upload-staging.yml b/.github/workflows/build-and-upload-staging.yml index 4977a096..b306ea49 100644 --- a/.github/workflows/build-and-upload-staging.yml +++ b/.github/workflows/build-and-upload-staging.yml @@ -66,8 +66,7 @@ jobs: uses: ./.github/workflows/appsignals-e2e-test.yml secrets: inherit concurrency: - group: 'appsignals-cw-agent-operator-test' + group: 'e2e-cw-agent-operator-test' cancel-in-progress: false with: - test-cluster-name: 'e2e-cw-agent-operator-test' - caller-workflow-name: 'build-and-upload-staging' \ No newline at end of file + test-cluster-name: 'e2e-cw-agent-operator-test' \ No newline at end of file From e76c2e0c36523097f6739e3a92468c75f0c255d7 Mon Sep 17 00:00:00 2001 From: Mahad Janjua Date: Mon, 18 Dec 2023 10:50:30 -0800 Subject: [PATCH 2/4] E2E Testing: Verify CW Agent Operator image + add comments --- .github/workflows/appsignals-e2e-test.yml | 8 +++++++- .github/workflows/build-and-upload-staging.yml | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/appsignals-e2e-test.yml b/.github/workflows/appsignals-e2e-test.yml index b3eef1f1..7202a69c 100644 --- a/.github/workflows/appsignals-e2e-test.yml +++ b/.github/workflows/appsignals-e2e-test.yml @@ -5,6 +5,7 @@ name: App Signals Enablement E2E Testing on: workflow_call: inputs: + # Ensure two tests do not run on the same cluster at the same time through GitHub Action concurrency test-cluster-name: required: true type: string @@ -141,7 +142,12 @@ jobs: - name: Verify pod CWAgent image run: | - kubectl get pods -n amazon-cloudwatch --output json | \ + kubectl get pods -n amazon-cloudwatch -l app.kubernetes.io/name=cloudwatch-agent -o json | \ + jq '.items[0].status.containerStatuses[0].imageID' + + - name: Verify pod CWAgent Operator image + run: | + kubectl get pods -n amazon-cloudwatch -l 
app.kubernetes.io/name=amazon-cloudwatch-observability -o json | \ jq '.items[0].status.containerStatuses[0].imageID' - name: Get the sample app endpoint diff --git a/.github/workflows/build-and-upload-staging.yml b/.github/workflows/build-and-upload-staging.yml index b306ea49..8f909cc5 100644 --- a/.github/workflows/build-and-upload-staging.yml +++ b/.github/workflows/build-and-upload-staging.yml @@ -65,6 +65,7 @@ jobs: needs: MakeBinary uses: ./.github/workflows/appsignals-e2e-test.yml secrets: inherit + # Two E2E tests should not run at the same time in the same EKS cluster concurrency: group: 'e2e-cw-agent-operator-test' cancel-in-progress: false From f3c934b4b4599f08b62c04fae10019948aa35365 Mon Sep 17 00:00:00 2001 From: Mahad Janjua Date: Mon, 18 Dec 2023 11:52:24 -0800 Subject: [PATCH 3/4] E2E Test: Add failure if Operator image hasn't changed --- .github/workflows/appsignals-e2e-test.yml | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/.github/workflows/appsignals-e2e-test.yml b/.github/workflows/appsignals-e2e-test.yml index 7202a69c..2fef9c74 100644 --- a/.github/workflows/appsignals-e2e-test.yml +++ b/.github/workflows/appsignals-e2e-test.yml @@ -115,6 +115,11 @@ jobs: ${{ env.AWS_DEFAULT_REGION }} \ ${{ env.SAMPLE_APP_NAMESPACE }} + - name: Save CloudWatch Agent Operator image to environment before patching + run: | + echo "OLD_CW_AGENT_OPERATOR_IMAGE"=$(kubectl get pods -n amazon-cloudwatch -l app.kubernetes.io/name=amazon-cloudwatch-observability -o json | \ + jq '.items[0].status.containerStatuses[0].image') >> $GITHUB_ENV + - name: Patch the CloudWatch Agent Operator image and restart CloudWatch pods run: | kubectl patch deploy -n amazon-cloudwatch amazon-cloudwatch-observability-controller-manager --type='json' -p '[{"op": "replace", "path": "/spec/template/spec/containers/0/image", "value": "${{ env.ECR_OPERATOR_STAGING_IMAGE }}"}]' @@ -135,20 +140,30 @@ jobs: echo "REMOTE_SERVICE_DEPLOYMENT_NAME=$(kubectl 
get deployments -n ${{ env.SAMPLE_APP_NAMESPACE }} --selector=app=remote-app -o jsonpath='{.items[0].metadata.name}')" >> $GITHUB_ENV echo "REMOTE_SERVICE_POD_IP=$(kubectl get pods -n ${{ env.SAMPLE_APP_NAMESPACE }} --selector=app=remote-app -o jsonpath='{.items[0].status.podIP}')" >> $GITHUB_ENV - - name: Verify pod ADOT image + - name: Log pod ADOT image ID run: | kubectl get pods -n ${{ env.SAMPLE_APP_NAMESPACE }} --output json | \ jq '.items[0].status.initContainerStatuses[0].imageID' - - name: Verify pod CWAgent image + - name: Log pod CWAgent image ID run: | kubectl get pods -n amazon-cloudwatch -l app.kubernetes.io/name=cloudwatch-agent -o json | \ jq '.items[0].status.containerStatuses[0].imageID' - - name: Verify pod CWAgent Operator image + - name: Log pod CWAgent Operator image ID and save image to the environment run: | kubectl get pods -n amazon-cloudwatch -l app.kubernetes.io/name=amazon-cloudwatch-observability -o json | \ jq '.items[0].status.containerStatuses[0].imageID' + + echo "NEW_CW_AGENT_OPERATOR_IMAGE"=$(kubectl get pods -n amazon-cloudwatch -l app.kubernetes.io/name=amazon-cloudwatch-observability -o json | \ + jq '.items[0].status.containerStatuses[0].image') >> $GITHUB_ENV + + - name: Check if CW Agent Operator image has changed + run: | + if [ ${{ env.OLD_CW_AGENT_OPERATOR_IMAGE }} = ${{ env.NEW_CW_AGENT_OPERATOR_IMAGE }} ]; then + echo "Operator image did not change" + exit 1 + fi - name: Get the sample app endpoint run: | From 4986b116fe2dbd734952c318ca844b17b27fb7b9 Mon Sep 17 00:00:00 2001 From: Mahad Janjua Date: Tue, 19 Dec 2023 11:59:08 -0800 Subject: [PATCH 4/4] E2E Testing: Add description of what the test does --- .github/workflows/appsignals-e2e-test.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/appsignals-e2e-test.yml b/.github/workflows/appsignals-e2e-test.yml index 2fef9c74..934bf265 100644 --- a/.github/workflows/appsignals-e2e-test.yml +++ b/.github/workflows/appsignals-e2e-test.yml @@ -1,5 
+1,9 @@ # This is a reusable workflow for running the E2E test for App Signals. # It is meant to be called from another workflow. +# This E2E test sets up a sample application on an EKS cluster and enables +# App Signals using the staging image of the CloudWatch Agent Operator. It then validates the generated telemetry, +# including logs, metrics, and traces, and cleans up the cluster. The testing resources can be found in the +# ADOT Java instrumentation repo: https://github.com/aws-observability/aws-otel-java-instrumentation/tree/main/testing # Read more about reusable workflows: https://docs.github.com/en/actions/using-workflows/reusing-workflows#overview name: App Signals Enablement E2E Testing on: