Skip to content

Commit 2baf326

Browse files
committed
use trap and remove excess error injections
1 parent c147af1 commit 2baf326

File tree

2 files changed

+64
-137
lines changed

2 files changed

+64
-137
lines changed

.github/scripts/extract_log_errors.py

Lines changed: 1 addition & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
class LogErrorExtractor:
3030
"""Extract errors from log files using LogAI or fallback methods."""
3131

32-
# Common error patterns for fallback
32+
# Common error patterns for fallback - just extract the error lines
3333
ERROR_PATTERNS = [
3434
# Kubernetes specific errors
3535
r"error:\s+(no matching resources found)(?:\s+.*)?",
@@ -50,22 +50,6 @@ class LogErrorExtractor:
5050
r"timeout\s+(.+?)(?:\n|$)",
5151
]
5252

53-
# Error explanations for common patterns
54-
ERROR_EXPLANATIONS = {
55-
"no matching resources found": "Kubernetes resources (pods) have not been created yet or deployment failed to create them. Check deployment status and events.",
56-
"timed out waiting for the condition": "Kubernetes pods did not become ready within the timeout period. Check pod status, events, and logs for issues.",
57-
"waiting for the condition": "Kubernetes pod did not become ready within the timeout period. Check pod status, events, and logs for issues.",
58-
"timeout": "Operation exceeded the time limit. Check resource availability and system logs.",
59-
}
60-
61-
# Patterns to identify context around errors
62-
CONTEXT_PATTERNS = [
63-
r"(exit code \d+)",
64-
r"(status code \d+)",
65-
r"(HTTP \d{3})",
66-
r"(line \d+)",
67-
]
68-
6953
def __init__(self, log_file: Path):
7054
self.log_file = log_file
7155
self.log_content = ""
@@ -174,22 +158,13 @@ def extract_with_fallback(self) -> List[Dict[str, Any]]:
174158
context_end = min(len(lines), line_num + 3)
175159
context = "\n".join(lines[context_start:context_end])
176160

177-
# Check if we have an explanation for this error
178-
explanation = None
179-
for key, exp in self.ERROR_EXPLANATIONS.items():
180-
if key.lower() in error_msg.lower():
181-
explanation = exp
182-
break
183-
184161
if error_msg and len(error_msg) > 10: # Filter out very short matches
185162
error_dict = {
186163
"line_number": line_num,
187164
"message": error_msg[:500], # Limit message length
188165
"context": context[:1000], # Limit context length
189166
"source": "fallback",
190167
}
191-
if explanation:
192-
error_dict["explanation"] = explanation
193168
errors.append(error_dict)
194169

195170
# Deduplicate and sort by line number
@@ -230,8 +205,6 @@ def get_summary(self) -> str:
230205
summary_parts.append(
231206
f"{i}. [Line {error['line_number']}] {error['message']}"
232207
)
233-
if "explanation" in error:
234-
summary_parts.append(f" 💡 {error['explanation']}")
235208
if "context" in error:
236209
summary_parts.append(f" Context: {error['context'][:200]}...")
237210

@@ -248,10 +221,6 @@ def get_primary_error(self) -> str:
248221
primary = errors[0]
249222
message = primary["message"]
250223

251-
# Add explanation if available
252-
if "explanation" in primary:
253-
message += f"\n\n💡 Explanation:\n{primary['explanation']}"
254-
255224
# Add context if available
256225
if "context" in primary:
257226
message += f"\n\nContext:\n{primary['context']}"

.github/workflows/container-validation-backends.yml

Lines changed: 63 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -551,6 +551,29 @@ jobs:
551551
# Redirect all output to a log file while still showing it
552552
exec > >(tee -a deploy-operator.log) 2>&1
553553
554+
# Error handler function - collects diagnostics on any command failure
555+
handle_error() {
556+
local exit_code=$?
557+
echo "=== Command failed with exit code $exit_code ==="
558+
559+
# Capture additional diagnostics if namespace is set
560+
if [ -n "$NAMESPACE" ]; then
561+
echo "=== Pod Status ==="
562+
kubectl get pods -n ${NAMESPACE} -o wide 2>&1 || true
563+
echo "=== Events ==="
564+
kubectl get events -n ${NAMESPACE} --sort-by='.lastTimestamp' 2>&1 | tail -20 || true
565+
echo "=== Deployment Status ==="
566+
kubectl get deployments -n ${NAMESPACE} 2>&1 || true
567+
echo "=== Helm Status ==="
568+
helm status dynamo-platform -n ${NAMESPACE} 2>&1 || true
569+
fi
570+
571+
exit $exit_code
572+
}
573+
574+
# Set trap to call handle_error on any command failure
575+
trap 'handle_error' ERR
576+
554577
# Set namespace
555578
# Invalid patterns: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
556579
BRANCH_SANITIZED="${BRANCH//\//-}"
@@ -596,71 +619,28 @@ jobs:
596619
597620
# Check if Helm is available
598621
echo "Checking Helm availability..."
599-
if ! command -v helm &> /dev/null; then
600-
ERROR_MSG="Helm is not installed or not available in PATH. Helm installation may have failed during runner setup."
601-
echo "$ERROR_MSG"
602-
echo "ERROR_MESSAGE=$ERROR_MSG" >> $GITHUB_ENV
603-
exit 1
604-
fi
605-
622+
command -v helm
606623
echo "Helm version: $(helm version --short)"
607624
608625
# Install helm dependencies
609626
echo "Installing Helm dependencies..."
610-
if ! helm repo add bitnami https://charts.bitnami.com/bitnami 2>&1; then
611-
ERROR_MSG="Failed to add Helm bitnami repository"
612-
echo "$ERROR_MSG"
613-
echo "ERROR_MESSAGE=$ERROR_MSG" >> $GITHUB_ENV
614-
exit 1
615-
fi
627+
helm repo add bitnami https://charts.bitnami.com/bitnami
616628
617629
cd deploy/cloud/helm/platform/
618-
619-
if ! helm dep build . 2>&1; then
620-
ERROR_MSG="Failed to build Helm dependencies"
621-
echo "$ERROR_MSG"
622-
echo "ERROR_MESSAGE=$ERROR_MSG" >> $GITHUB_ENV
623-
exit 1
624-
fi
630+
helm dep build .
625631
626632
# Install platform with namespace restriction for single profile testing
627633
echo "Installing dynamo-platform Helm chart..."
628-
if ! helm upgrade --install dynamo-platform . --namespace ${NAMESPACE} \
634+
helm upgrade --install dynamo-platform . --namespace ${NAMESPACE} \
629635
--set dynamo-operator.namespaceRestriction.enabled=true \
630636
--set dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE} \
631637
--set dynamo-operator.controllerManager.manager.image.repository=${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo \
632638
--set dynamo-operator.controllerManager.manager.image.tag=${{ github.sha }}-operator-amd64 \
633-
--set dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret 2>&1; then
634-
ERROR_MSG="Failed to install dynamo-platform Helm chart. This may be due to: pre-install hook timeout, image pull failures, or resource constraints."
635-
echo "$ERROR_MSG"
636-
637-
# Capture additional diagnostics
638-
echo "=== Pod Status ==="
639-
kubectl get pods -n ${NAMESPACE} -o wide || true
640-
echo "=== Events ==="
641-
kubectl get events -n ${NAMESPACE} --sort-by='.lastTimestamp' | tail -20 || true
642-
echo "=== Helm Status ==="
643-
helm status dynamo-platform -n ${NAMESPACE} || true
644-
645-
echo "ERROR_MESSAGE=$ERROR_MSG" >> $GITHUB_ENV
646-
exit 1
647-
fi
639+
--set dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret
648640
649641
# Wait for all deployments to be ready
650642
echo "Waiting for deployments to be ready..."
651-
if ! timeout 300s kubectl rollout status deployment -n $NAMESPACE --watch 2>&1; then
652-
ERROR_MSG="Deployment rollout timed out after 300 seconds"
653-
echo "$ERROR_MSG"
654-
655-
# Capture diagnostics
656-
echo "=== Pod Status ==="
657-
kubectl get pods -n ${NAMESPACE} -o wide || true
658-
echo "=== Deployment Status ==="
659-
kubectl get deployments -n ${NAMESPACE} || true
660-
661-
echo "ERROR_MESSAGE=$ERROR_MSG" >> $GITHUB_ENV
662-
exit 1
663-
fi
643+
timeout 300s kubectl rollout status deployment -n $NAMESPACE --watch
664644
continue-on-error: true
665645

666646
- name: Setup Python for Log Analysis
@@ -697,26 +677,16 @@ jobs:
697677
echo "LOGAI_EXTRACTED_ERROR=Log file not found" >> $GITHUB_ENV
698678
fi
699679
700-
# Also preserve any manually captured error
701-
if [ -n "$ERROR_MESSAGE" ]; then
702-
echo "MANUAL_ERROR_MESSAGE=$ERROR_MESSAGE" >> $GITHUB_ENV
703-
fi
704-
705680
- name: Check for Job Failure and Create Annotation
706681
if: always() && steps.deploy-operator-step.outcome == 'failure'
707682
env:
708683
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
709684
run: |
710685
set -x
711686
712-
# Combine LogAI extracted errors with manual captures
687+
# Use LogAI extracted errors (actual error lines from logs)
713688
ERROR_MESSAGE="${LOGAI_EXTRACTED_ERROR:-Unknown error occurred during operator deployment}"
714689
715-
# Add manually captured error if available
716-
if [ -n "$MANUAL_ERROR_MESSAGE" ]; then
717-
ERROR_MESSAGE="${ERROR_MESSAGE}\n\nManually Captured Error:\n${MANUAL_ERROR_MESSAGE}"
718-
fi
719-
720690
# Get additional context from Kubernetes if namespace is set
721691
if [ -n "$NAMESPACE" ]; then
722692
export KUBECONFIG=$(pwd)/.kubeconfig
@@ -761,7 +731,7 @@ jobs:
761731
"output": {
762732
"title": "Operator Deployment Failed (LogAI Analysis)",
763733
"summary": ("Failed to deploy dynamo-platform operator to namespace " + $namespace),
764-
"text": ("**Job**: deploy-operator\n**Namespace**: " + $namespace + "\n**Analysis Method**: LogAI + Manual Capture\n\n**Error Details**:\n```\n" + $error_msg + "\n```\n\n[View Job Run](https://github.com/" + $repo + "/actions/runs/" + $run_id + ")"),
734+
"text": ("**Job**: deploy-operator\n**Namespace**: " + $namespace + "\n**Analysis Method**: LogAI\n\n**Error Details**:\n```\n" + $error_msg + "\n```\n\n[View Job Run](https://github.com/" + $repo + "/actions/runs/" + $run_id + ")"),
765735
"annotations": [{
766736
"path": ".github/workflows/container-validation-backends.yml",
767737
"start_line": 357,
@@ -833,6 +803,25 @@ jobs:
833803
# Redirect all output to a log file while still showing it
834804
exec > >(tee -a test-output.log) 2>&1
835805
806+
# Error handler function - collects diagnostics on any command failure
807+
handle_error() {
808+
local exit_code=$?
809+
echo "=== Command failed with exit code $exit_code ==="
810+
811+
# Capture additional diagnostics if variables are set
812+
if [ -n "$GRAPH_NAME" ] && [ -n "$NAMESPACE" ]; then
813+
echo "=== Pod Status ==="
814+
kubectl get pods -l "nvidia.com/dynamo-graph-deployment-name=$GRAPH_NAME" -n $NAMESPACE -o wide 2>&1 || true
815+
echo "=== Pod Logs (last 50 lines) ==="
816+
kubectl logs -l "nvidia.com/dynamo-graph-deployment-name=$GRAPH_NAME" -n $NAMESPACE --tail=50 2>&1 || true
817+
fi
818+
819+
exit $exit_code
820+
}
821+
822+
# Set trap to call handle_error on any command failure
823+
trap 'handle_error' ERR
824+
836825
cd examples/backends/$FRAMEWORK
837826
export FRAMEWORK_RUNTIME_IMAGE="${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-${FRAMEWORK}-amd64"
838827
export KUBE_NS=$NAMESPACE
@@ -883,12 +872,9 @@ jobs:
883872
sleep 5
884873
ATTEMPT=$((ATTEMPT + 1))
885874
done
886-
if [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then
887-
ERROR_MSG="Model $MODEL_NAME not found in /v1/models after $MAX_ATTEMPTS attempts. Last response: $MODELS_RESPONSE"
888-
echo "$ERROR_MSG"
889-
echo "ERROR_MESSAGE=$ERROR_MSG" >> $GITHUB_ENV
890-
exit 1
891-
fi
875+
# Fail (via ERR trap) if we exceeded max attempts
876+
[ $ATTEMPT -le $MAX_ATTEMPTS ]
877+
892878
RESPONSE=$(curl -s -N --no-buffer --retry 10 --retry-delay 5 --retry-connrefused -X POST "${LLM_URL}/v1/chat/completions" \
893879
-H 'accept: text/event-stream' \
894880
-H 'Content-Type: application/json' \
@@ -903,34 +889,16 @@ jobs:
903889
"stream":false,
904890
"max_tokens": 30,
905891
"temperature": 0.0
906-
}' 2>&1)
892+
}')
907893
echo "Response: $RESPONSE"
908-
TEST_RESULT=0
909-
ERROR_MSG=""
910-
if ! echo "$RESPONSE" | jq -e . >/dev/null 2>&1; then
911-
ERROR_MSG="Test failed: Response is not valid JSON. Got: $RESPONSE"
912-
echo "$ERROR_MSG"
913-
TEST_RESULT=1
914-
elif ! echo "$RESPONSE" | jq -e '.choices[0].message.role == "assistant"' >/dev/null 2>&1; then
915-
ERROR_MSG="Test failed: Message role is not 'assistant'. Got: $(echo "$RESPONSE" | jq '.choices[0].message.role')"
916-
echo "$ERROR_MSG"
917-
TEST_RESULT=1
918-
elif ! echo "$RESPONSE" | jq -e '.model == "'"${MODEL_NAME}"'"' >/dev/null 2>&1; then
919-
ERROR_MSG="Test failed: Model name is incorrect. Expected: ${MODEL_NAME}, Got: $(echo "$RESPONSE" | jq '.model')"
920-
echo "$ERROR_MSG"
921-
TEST_RESULT=1
922-
elif ! echo "$RESPONSE" | jq -e '.choices[0].message.content | length > 100' >/dev/null 2>&1; then
923-
ERROR_MSG="Test failed: Response content length is not greater than 100 characters. Got length: $(echo "$RESPONSE" | jq '.choices[0].message.content | length')"
924-
echo "$ERROR_MSG"
925-
TEST_RESULT=1
926-
else
927-
echo "Test passed: Response matches expected format and content"
928-
fi
929894
930-
if [ $TEST_RESULT -ne 0 ]; then
931-
echo "ERROR_MESSAGE=$ERROR_MSG" >> $GITHUB_ENV
932-
fi
933-
exit $TEST_RESULT
895+
# Validate response (each command will trigger trap on failure)
896+
echo "$RESPONSE" | jq -e . >/dev/null
897+
echo "$RESPONSE" | jq -e '.choices[0].message.role == "assistant"' >/dev/null
898+
echo "$RESPONSE" | jq -e '.model == "'"${MODEL_NAME}"'"' >/dev/null
899+
echo "$RESPONSE" | jq -e '.choices[0].message.content | length > 100' >/dev/null
900+
901+
echo "Test passed: Response matches expected format and content"
934902
continue-on-error: true
935903

936904
- name: Setup Python for Log Analysis
@@ -967,26 +935,16 @@ jobs:
967935
echo "LOGAI_EXTRACTED_ERROR=Test log file not found" >> $GITHUB_ENV
968936
fi
969937
970-
# Also preserve any manually captured error
971-
if [ -n "$ERROR_MESSAGE" ]; then
972-
echo "MANUAL_ERROR_MESSAGE=$ERROR_MESSAGE" >> $GITHUB_ENV
973-
fi
974-
975938
- name: Check for Job Failure and Create Annotation
976939
if: always() && steps.run-tests.outcome == 'failure'
977940
env:
978941
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
979942
run: |
980943
set -x
981944
982-
# Combine LogAI extracted errors with manual captures
945+
# Use LogAI extracted errors (actual error lines from logs)
983946
ERROR_MESSAGE="${LOGAI_EXTRACTED_ERROR:-Unknown error occurred during deployment test}"
984947
985-
# Add manually captured error if available
986-
if [ -n "$MANUAL_ERROR_MESSAGE" ]; then
987-
ERROR_MESSAGE="${ERROR_MESSAGE}\n\nManually Captured Error:\n${MANUAL_ERROR_MESSAGE}"
988-
fi
989-
990948
# Get additional context from pod logs if available
991949
if [ -n "$GRAPH_NAME" ] && [ -n "$NAMESPACE" ]; then
992950
export KUBECONFIG=$(pwd)/.kubeconfig
@@ -1026,7 +984,7 @@ jobs:
1026984
"output": {
1027985
"title": ("Deployment Test Failed: " + $framework + " (" + $profile + ") [LogAI Analysis]"),
1028986
"summary": ("Deployment test failed for " + $framework + " with profile " + $profile),
1029-
"text": ("**Job**: " + $job + "\n**Framework**: " + $framework + "\n**Profile**: " + $profile + "\n**Namespace**: " + $namespace + "\n**Analysis Method**: LogAI + Manual Capture\n\n**Error Details**:\n```\n" + $error_msg + "\n```\n\n[View Job Run](https://github.com/" + $repo + "/actions/runs/" + $run_id + ")"),
987+
"text": ("**Job**: " + $job + "\n**Framework**: " + $framework + "\n**Profile**: " + $profile + "\n**Namespace**: " + $namespace + "\n**Analysis Method**: LogAI\n\n**Error Details**:\n```\n" + $error_msg + "\n```\n\n[View Job Run](https://github.com/" + $repo + "/actions/runs/" + $run_id + ")"),
1030988
"annotations": [{
1031989
"path": ".github/workflows/container-validation-backends.yml",
1032990
"start_line": 593,

0 commit comments

Comments
 (0)