diff --git a/providers/cncf/kubernetes/src/airflow/providers/cncf/kubernetes/executors/kubernetes_executor.py b/providers/cncf/kubernetes/src/airflow/providers/cncf/kubernetes/executors/kubernetes_executor.py index 0384ba2879d13..efb018b5cbf80 100644 --- a/providers/cncf/kubernetes/src/airflow/providers/cncf/kubernetes/executors/kubernetes_executor.py +++ b/providers/cncf/kubernetes/src/airflow/providers/cncf/kubernetes/executors/kubernetes_executor.py @@ -389,6 +389,7 @@ def sync(self) -> None: if ( (str(e.status) == "403" and "exceeded quota" in message) or (str(e.status) == "409" and "object has been modified" in message) + or str(e.status) == "500" ) and (self.task_publish_max_retries == -1 or retries < self.task_publish_max_retries): self.log.warning( "[Try %s of %s] Kube ApiException for Task: (%s). Reason: %r. Message: %s", diff --git a/providers/cncf/kubernetes/tests/unit/cncf/kubernetes/executors/test_kubernetes_executor.py b/providers/cncf/kubernetes/tests/unit/cncf/kubernetes/executors/test_kubernetes_executor.py index c37d3f83bbe43..24b8f4aff1be1 100644 --- a/providers/cncf/kubernetes/tests/unit/cncf/kubernetes/executors/test_kubernetes_executor.py +++ b/providers/cncf/kubernetes/tests/unit/cncf/kubernetes/executors/test_kubernetes_executor.py @@ -407,6 +407,26 @@ def setup_method(self) -> None: State.FAILED, id="429 Too Many Requests (empty body)", ), + pytest.param( + HTTPResponse( + body='{"message": "Internal error occurred: failed calling webhook \\"mutation.azure-workload-identity.io\\": failed to call webhook: Post \\"https://azure-wi-webhook-webhook-service.kube-system.svc:443/mutate-v1-pod?timeout=10s\\""}', + status=500, + ), + 1, + True, + State.SUCCESS, + id="500 Internal Server Error (webhook failure)", + ), + pytest.param( + HTTPResponse( + body='{"message": "Internal error occurred: failed calling webhook"}', + status=500, + ), + 1, + True, + State.FAILED, + id="500 Internal Server Error (webhook failure) (retry failed)", + ), ], ) @mock.patch("airflow.providers.cncf.kubernetes.executors.kubernetes_executor_utils.KubernetesJobWatcher") @@ -439,6 +459,9 @@ def test_run_next_exception_requeue( - your requested namespace doesn't exists - 422 Unprocessable Entity will returns in scenarios like - your request parameters are valid but unsupported e.g. limits lower than requests. + - 500 Internal Server Error will returns in scenarios like + - failed calling webhook - typically transient API server or webhook service issues + - should be retried if task_publish_max_retries > 0 """ template_file = data_file("pods/generator_base_with_secrets.yaml").as_posix()