From ea292692467d66152b85bf1ecaf0694d72a485b9 Mon Sep 17 00:00:00 2001 From: gibsondan Date: Fri, 3 Feb 2023 09:37:11 -0600 Subject: [PATCH] Add 401 to the list of API codes that our k8s client retries on Summary: 401 would typically not be a retryable error, but a user reported hitting it when they scaled up their cluster, and https://github.com/aws/containers-roadmap/issues/1810 seems to suggest retrying as a workaround. The downside of retrying on a 401 seems fairly low as well. Open to push-back on this though. Test Plan: Existing BK test coverage of the k8s client --- python_modules/libraries/dagster-k8s/dagster_k8s/client.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python_modules/libraries/dagster-k8s/dagster_k8s/client.py b/python_modules/libraries/dagster-k8s/dagster_k8s/client.py index 68c696b898887..7833486f3f220 100644 --- a/python_modules/libraries/dagster-k8s/dagster_k8s/client.py +++ b/python_modules/libraries/dagster-k8s/dagster_k8s/client.py @@ -78,6 +78,8 @@ class DagsterK8sPipelineStatusException(Exception): 503, # Service unavailable 504, # Gateway timeout 500, # Internal server error + # typically not transient, but some k8s clusters raise it transiently: https://github.com/aws/containers-roadmap/issues/1810 + 401, # Authorization Failure ]