@@ -551,6 +551,29 @@ jobs:
     # Redirect all output to a log file while still showing it
     exec > >(tee -a deploy-operator.log) 2>&1
 
+    # Error handler function - collects diagnostics on any command failure
+    handle_error() {
+      local exit_code=$?
+      echo "=== Command failed with exit code $exit_code ==="
+
+      # Capture additional diagnostics if namespace is set
+      if [ -n "$NAMESPACE" ]; then
+        echo "=== Pod Status ==="
+        kubectl get pods -n ${NAMESPACE} -o wide 2>&1 || true
+        echo "=== Events ==="
+        kubectl get events -n ${NAMESPACE} --sort-by='.lastTimestamp' 2>&1 | tail -20 || true
+        echo "=== Deployment Status ==="
+        kubectl get deployments -n ${NAMESPACE} 2>&1 || true
+        echo "=== Helm Status ==="
+        helm status dynamo-platform -n ${NAMESPACE} 2>&1 || true
+      fi
+
+      exit $exit_code
+    }
+
+    # Set trap to call handle_error on any command failure
+    trap 'handle_error' ERR
+
     # Set namespace
     # Invalid patterns: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
     BRANCH_SANITIZED="${BRANCH//\//-}"
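For context, the handler added above leans on two bash behaviours: the ERR trap runs whenever a command exits non-zero, and the `-e` option (which GitHub Actions passes to bash for `run` steps by default) aborts the step afterwards. A minimal standalone sketch of the pattern, outside this workflow:

#!/usr/bin/env bash
set -e                # GitHub Actions runs `run` steps with `bash -e` by default

handle_error() {
  local exit_code=$?  # status of the command that just failed
  echo "=== Command failed with exit code $exit_code ==="
  # diagnostics (kubectl get pods, events, helm status, ...) would go here
  exit "$exit_code"   # surface the original failure code as the step result
}
trap 'handle_error' ERR

echo "this command succeeds"
false                 # non-zero exit: fires handle_error, script exits with 1
echo "never reached"

Because the handler re-exits with the captured code, the step still fails with the real exit status rather than a generic one.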
@@ -596,71 +619,28 @@ jobs:
 
     # Check if Helm is available
     echo "Checking Helm availability..."
-    if ! command -v helm &> /dev/null; then
-      ERROR_MSG="Helm is not installed or not available in PATH. Helm installation may have failed during runner setup."
-      echo "$ERROR_MSG"
-      echo "ERROR_MESSAGE=$ERROR_MSG" >> $GITHUB_ENV
-      exit 1
-    fi
-
+    command -v helm
     echo "Helm version: $(helm version --short)"
 
     # Install helm dependencies
     echo "Installing Helm dependencies..."
-    if ! helm repo add bitnami https://charts.bitnami.com/bitnami 2>&1; then
-      ERROR_MSG="Failed to add Helm bitnami repository"
-      echo "$ERROR_MSG"
-      echo "ERROR_MESSAGE=$ERROR_MSG" >> $GITHUB_ENV
-      exit 1
-    fi
+    helm repo add bitnami https://charts.bitnami.com/bitnami
 
     cd deploy/cloud/helm/platform/
-
-    if ! helm dep build . 2>&1; then
-      ERROR_MSG="Failed to build Helm dependencies"
-      echo "$ERROR_MSG"
-      echo "ERROR_MESSAGE=$ERROR_MSG" >> $GITHUB_ENV
-      exit 1
-    fi
+    helm dep build .
 
     # Install platform with namespace restriction for single profile testing
     echo "Installing dynamo-platform Helm chart..."
-    if ! helm upgrade --install dynamo-platform . --namespace ${NAMESPACE} \
+    helm upgrade --install dynamo-platform . --namespace ${NAMESPACE} \
       --set dynamo-operator.namespaceRestriction.enabled=true \
       --set dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE} \
       --set dynamo-operator.controllerManager.manager.image.repository=${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo \
       --set dynamo-operator.controllerManager.manager.image.tag=${{ github.sha }}-operator-amd64 \
-      --set dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret 2>&1; then
-      ERROR_MSG="Failed to install dynamo-platform Helm chart. This may be due to: pre-install hook timeout, image pull failures, or resource constraints."
-      echo "$ERROR_MSG"
-
-      # Capture additional diagnostics
-      echo "=== Pod Status ==="
-      kubectl get pods -n ${NAMESPACE} -o wide || true
-      echo "=== Events ==="
-      kubectl get events -n ${NAMESPACE} --sort-by='.lastTimestamp' | tail -20 || true
-      echo "=== Helm Status ==="
-      helm status dynamo-platform -n ${NAMESPACE} || true
-
-      echo "ERROR_MESSAGE=$ERROR_MSG" >> $GITHUB_ENV
-      exit 1
-    fi
+      --set dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret
 
     # Wait for all deployments to be ready
     echo "Waiting for deployments to be ready..."
-    if ! timeout 300s kubectl rollout status deployment -n $NAMESPACE --watch 2>&1; then
-      ERROR_MSG="Deployment rollout timed out after 300 seconds"
-      echo "$ERROR_MSG"
-
-      # Capture diagnostics
-      echo "=== Pod Status ==="
-      kubectl get pods -n ${NAMESPACE} -o wide || true
-      echo "=== Deployment Status ==="
-      kubectl get deployments -n ${NAMESPACE} || true
-
-      echo "ERROR_MESSAGE=$ERROR_MSG" >> $GITHUB_ENV
-      exit 1
-    fi
+    timeout 300s kubectl rollout status deployment -n $NAMESPACE --watch
   continue-on-error : true
 
 - name : Setup Python for Log Analysis
@@ -697,26 +677,16 @@ jobs:
       echo "LOGAI_EXTRACTED_ERROR=Log file not found" >> $GITHUB_ENV
     fi
 
-    # Also preserve any manually captured error
-    if [ -n "$ERROR_MESSAGE" ]; then
-      echo "MANUAL_ERROR_MESSAGE=$ERROR_MESSAGE" >> $GITHUB_ENV
-    fi
-
 - name : Check for Job Failure and Create Annotation
   if : always() && steps.deploy-operator-step.outcome == 'failure'
   env :
     GITHUB_TOKEN : ${{ secrets.GITHUB_TOKEN }}
   run : |
     set -x
 
-    # Combine LogAI extracted errors with manual captures
+    # Use LogAI extracted errors (actual error lines from logs)
     ERROR_MESSAGE="${LOGAI_EXTRACTED_ERROR:-Unknown error occurred during operator deployment}"
 
-    # Add manually captured error if available
-    if [ -n "$MANUAL_ERROR_MESSAGE" ]; then
-      ERROR_MESSAGE="${ERROR_MESSAGE}\n\nManually Captured Error:\n${MANUAL_ERROR_MESSAGE}"
-    fi
-
     # Get additional context from Kubernetes if namespace is set
     if [ -n "$NAMESPACE" ]; then
       export KUBECONFIG=$(pwd)/.kubeconfig
@@ -761,7 +731,7 @@ jobs:
     "output": {
       "title": "Operator Deployment Failed (LogAI Analysis)",
       "summary": ("Failed to deploy dynamo-platform operator to namespace " + $namespace),
-      "text": ("**Job**: deploy-operator\n**Namespace**: " + $namespace + "\n**Analysis Method**: LogAI + Manual Capture \n\n**Error Details**:\n```\n" + $error_msg + "\n```\n\n[View Job Run](https://github.com/" + $repo + "/actions/runs/" + $run_id + ")"),
+      "text": ("**Job**: deploy-operator\n**Namespace**: " + $namespace + "\n**Analysis Method**: LogAI\n\n**Error Details**:\n```\n" + $error_msg + "\n```\n\n[View Job Run](https://github.com/" + $repo + "/actions/runs/" + $run_id + ")"),
       "annotations": [{
         "path": ".github/workflows/container-validation-backends.yml",
         "start_line": 357,
@@ -833,6 +803,25 @@ jobs:
     # Redirect all output to a log file while still showing it
     exec > >(tee -a test-output.log) 2>&1
 
+    # Error handler function - collects diagnostics on any command failure
+    handle_error() {
+      local exit_code=$?
+      echo "=== Command failed with exit code $exit_code ==="
+
+      # Capture additional diagnostics if variables are set
+      if [ -n "$GRAPH_NAME" ] && [ -n "$NAMESPACE" ]; then
+        echo "=== Pod Status ==="
+        kubectl get pods -l "nvidia.com/dynamo-graph-deployment-name=$GRAPH_NAME" -n $NAMESPACE -o wide 2>&1 || true
+        echo "=== Pod Logs (last 50 lines) ==="
+        kubectl logs -l "nvidia.com/dynamo-graph-deployment-name=$GRAPH_NAME" -n $NAMESPACE --tail=50 2>&1 || true
+      fi
+
+      exit $exit_code
+    }
+
+    # Set trap to call handle_error on any command failure
+    trap 'handle_error' ERR
+
     cd examples/backends/$FRAMEWORK
     export FRAMEWORK_RUNTIME_IMAGE="${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-${FRAMEWORK}-amd64"
     export KUBE_NS=$NAMESPACE
@@ -883,12 +872,9 @@ jobs:
       sleep 5
       ATTEMPT=$((ATTEMPT + 1))
     done
-    if [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then
-      ERROR_MSG="Model $MODEL_NAME not found in /v1/models after $MAX_ATTEMPTS attempts. Last response: $MODELS_RESPONSE"
-      echo "$ERROR_MSG"
-      echo "ERROR_MESSAGE=$ERROR_MSG" >> $GITHUB_ENV
-      exit 1
-    fi
+    # Check if we exceeded max attempts
+    [ $ATTEMPT -le $MAX_ATTEMPTS ]
+
     RESPONSE=$(curl -s -N --no-buffer --retry 10 --retry-delay 5 --retry-connrefused -X POST "${LLM_URL}/v1/chat/completions" \
       -H 'accept: text/event-stream' \
       -H 'Content-Type: application/json' \
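The bare `[ $ATTEMPT -le $MAX_ATTEMPTS ]` line works as an assertion: under `set -e` with the ERR trap installed, a test that evaluates false is just another failing command, so the trap fires and the step exits non-zero. A small sketch of the retry-then-assert shape (the probe URL and limits here are illustrative, not the workflow's values):

set -e
trap 'echo "=== Command failed with exit code $? ==="' ERR

MAX_ATTEMPTS=5
ATTEMPT=1
while [ $ATTEMPT -le $MAX_ATTEMPTS ]; do
  # hypothetical readiness probe; a failure on the left of && does not trip set -e
  curl -sf http://localhost:8000/v1/models && break
  sleep 5
  ATTEMPT=$((ATTEMPT + 1))
done

# Bare assertion: false only if the loop ran out of attempts
[ $ATTEMPT -le $MAX_ATTEMPTS ]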
@@ -903,34 +889,16 @@ jobs:
         "stream":false,
         "max_tokens": 30,
         "temperature": 0.0
-      }' 2>&1 )
+      }')
     echo "Response: $RESPONSE"
-    TEST_RESULT=0
-    ERROR_MSG=""
-    if ! echo "$RESPONSE" | jq -e . >/dev/null 2>&1; then
-      ERROR_MSG="Test failed: Response is not valid JSON. Got: $RESPONSE"
-      echo "$ERROR_MSG"
-      TEST_RESULT=1
-    elif ! echo "$RESPONSE" | jq -e '.choices[0].message.role == "assistant"' >/dev/null 2>&1; then
-      ERROR_MSG="Test failed: Message role is not 'assistant'. Got: $(echo "$RESPONSE" | jq '.choices[0].message.role')"
-      echo "$ERROR_MSG"
-      TEST_RESULT=1
-    elif ! echo "$RESPONSE" | jq -e '.model == "'"${MODEL_NAME}"'"' >/dev/null 2>&1; then
-      ERROR_MSG="Test failed: Model name is incorrect. Expected: ${MODEL_NAME}, Got: $(echo "$RESPONSE" | jq '.model')"
-      echo "$ERROR_MSG"
-      TEST_RESULT=1
-    elif ! echo "$RESPONSE" | jq -e '.choices[0].message.content | length > 100' >/dev/null 2>&1; then
-      ERROR_MSG="Test failed: Response content length is not greater than 100 characters. Got length: $(echo "$RESPONSE" | jq '.choices[0].message.content | length')"
-      echo "$ERROR_MSG"
-      TEST_RESULT=1
-    else
-      echo "Test passed: Response matches expected format and content"
-    fi
 
-    if [ $TEST_RESULT -ne 0 ]; then
-      echo "ERROR_MESSAGE=$ERROR_MSG" >> $GITHUB_ENV
-    fi
-    exit $TEST_RESULT
+    # Validate response (each command will trigger trap on failure)
+    echo "$RESPONSE" | jq -e . >/dev/null
+    echo "$RESPONSE" | jq -e '.choices[0].message.role == "assistant"' >/dev/null
+    echo "$RESPONSE" | jq -e '.model == "'"${MODEL_NAME}"'"' >/dev/null
+    echo "$RESPONSE" | jq -e '.choices[0].message.content | length > 100' >/dev/null
+
+    echo "Test passed: Response matches expected format and content"
   continue-on-error : true
 
 - name : Setup Python for Log Analysis
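The rewritten checks rely on `jq -e` (`--exit-status`), which returns 0 only when the last value it outputs is neither `false` nor `null`; with the ERR trap in place, each assertion line fails the step on its own, and the trap's pod diagnostics replace the hand-written error strings. A quick local demonstration against a made-up response:

RESPONSE='{"model":"demo","choices":[{"message":{"role":"assistant","content":"hi"}}]}'

echo "$RESPONSE" | jq -e '.choices[0].message.role == "assistant"'    # prints true, exits 0
echo "$RESPONSE" | jq -e '.model == "something-else"'                 # prints false, exits 1
echo "$RESPONSE" | jq -e '.choices[0].message.content | length > 100' # prints false, exits 1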
@@ -967,26 +935,16 @@ jobs:
       echo "LOGAI_EXTRACTED_ERROR=Test log file not found" >> $GITHUB_ENV
     fi
 
-    # Also preserve any manually captured error
-    if [ -n "$ERROR_MESSAGE" ]; then
-      echo "MANUAL_ERROR_MESSAGE=$ERROR_MESSAGE" >> $GITHUB_ENV
-    fi
-
 - name : Check for Job Failure and Create Annotation
   if : always() && steps.run-tests.outcome == 'failure'
   env :
     GITHUB_TOKEN : ${{ secrets.GITHUB_TOKEN }}
   run : |
     set -x
 
-    # Combine LogAI extracted errors with manual captures
+    # Use LogAI extracted errors (actual error lines from logs)
     ERROR_MESSAGE="${LOGAI_EXTRACTED_ERROR:-Unknown error occurred during deployment test}"
 
-    # Add manually captured error if available
-    if [ -n "$MANUAL_ERROR_MESSAGE" ]; then
-      ERROR_MESSAGE="${ERROR_MESSAGE}\n\nManually Captured Error:\n${MANUAL_ERROR_MESSAGE}"
-    fi
-
     # Get additional context from pod logs if available
     if [ -n "$GRAPH_NAME" ] && [ -n "$NAMESPACE" ]; then
       export KUBECONFIG=$(pwd)/.kubeconfig
@@ -1026,7 +984,7 @@ jobs:
     "output": {
       "title": ("Deployment Test Failed: " + $framework + " (" + $profile + ") [LogAI Analysis]"),
       "summary": ("Deployment test failed for " + $framework + " with profile " + $profile),
-      "text": ("**Job**: " + $job + "\n**Framework**: " + $framework + "\n**Profile**: " + $profile + "\n**Namespace**: " + $namespace + "\n**Analysis Method**: LogAI + Manual Capture \n\n**Error Details**:\n```\n" + $error_msg + "\n```\n\n[View Job Run](https://github.com/" + $repo + "/actions/runs/" + $run_id + ")"),
+      "text": ("**Job**: " + $job + "\n**Framework**: " + $framework + "\n**Profile**: " + $profile + "\n**Namespace**: " + $namespace + "\n**Analysis Method**: LogAI\n\n**Error Details**:\n```\n" + $error_msg + "\n```\n\n[View Job Run](https://github.com/" + $repo + "/actions/runs/" + $run_id + ")"),
       "annotations": [{
         "path": ".github/workflows/container-validation-backends.yml",
         "start_line": 593,