From 6dc6024d8a26e8ffe895e9a691b062053c4da5db Mon Sep 17 00:00:00 2001 From: Abraham Chavez Date: Thu, 4 Apr 2024 19:42:11 -0700 Subject: [PATCH] fixup! Fix: Health lambas still fail occasionally (#6097) --- lambdas/indexer/app.py | 1 + lambdas/service/app.py | 1 + src/azul/chalice.py | 34 +++++++++++++++++++++++ terraform/api_gateway.tf.json.template.py | 28 +++++++++++-------- 4 files changed, 53 insertions(+), 11 deletions(-) diff --git a/lambdas/indexer/app.py b/lambdas/indexer/app.py index d1a78e73a6..3b1da78913 100644 --- a/lambdas/indexer/app.py +++ b/lambdas/indexer/app.py @@ -204,6 +204,7 @@ def health_by_key(keys: Optional[str] = None): # FIXME: Remove redundant prefix from name # https://github.com/DataBiosphere/azul/issues/5337 +@app.retry(1) @app.schedule( 'rate(1 minute)', name='indexercachehealth' diff --git a/lambdas/service/app.py b/lambdas/service/app.py index 5ef7036222..33b06335f4 100644 --- a/lambdas/service/app.py +++ b/lambdas/service/app.py @@ -542,6 +542,7 @@ def custom_health(keys: Optional[str] = None): # FIXME: Remove redundant prefix from name # https://github.com/DataBiosphere/azul/issues/5337 +@app.retry(1) @app.schedule( 'rate(1 minute)', name='servicecachehealth' diff --git a/src/azul/chalice.py b/src/azul/chalice.py index 66ef1249df..a3af443782 100644 --- a/src/azul/chalice.py +++ b/src/azul/chalice.py @@ -108,6 +108,13 @@ class MetricThreshold: value: int +@attr.s(auto_attribs=True, frozen=True, kw_only=True) +class Retry: + lambda_name: str + handler_name: Optional[str] = attr.ib(default=None) + value: int + + C = TypeVar('C', bound='AppController') @@ -526,6 +533,33 @@ def metric_thresholds(self) -> list[MetricThreshold]: value=threshold)) return thresholds + def retry(self, retries: int): + """ + Use this decorator to specify a custom number of retries that should be + different from the default (which is two) for any of the Azul async + lambdas. See: + https://docs.aws.amazon.com/lambda/latest/dg/invocation-retries.html + """ + + def wrapper(f): + assert isinstance(f, chalice.app.EventSourceHandler), f + f.retries = retries + return f + + return wrapper + + def retries(self) -> list[Retry]: + lambda_name, _ = config.unqualified_resource_name(self.app_name) + retries = [] + for handler_name, handler in self.handler_map.items(): + if isinstance(handler, chalice.app.EventSourceHandler): + value = getattr(handler, 'retries', None) + if value is not None: + retries.append(Retry(lambda_name=lambda_name, + handler_name=handler_name, + value=value)) + return retries + @attr.s(auto_attribs=True, frozen=True, kw_only=True) class AppController: diff --git a/terraform/api_gateway.tf.json.template.py b/terraform/api_gateway.tf.json.template.py index e7a033786f..b5f805c753 100644 --- a/terraform/api_gateway.tf.json.template.py +++ b/terraform/api_gateway.tf.json.template.py @@ -5,11 +5,18 @@ import json from azul import ( + cached_property, config, ) +from azul.chalice import ( + AzulChaliceApp, +) from azul.deployment import ( aws, ) +from azul.modules import ( + load_app_module, +) from azul.objects import ( InternMeta, ) @@ -41,6 +48,10 @@ def for_name(cls, name): ], policy=json.dumps(getattr(policy_module, 'policy'))) + @cached_property + def chalice(self) -> AzulChaliceApp: + return load_app_module(self.name).app + apps = [ Application.for_name('indexer'), @@ -370,18 +381,13 @@ def for_domain(cls, domain): } }, 'aws_lambda_function_event_invoke_config': { - function_name: { - 'function_name': '${aws_lambda_function.%s.function_name}' % function_name, - 'maximum_retry_attempts': retry_attempts + f'{retry.lambda_name}_{retry.handler_name}': { + 'function_name': '${aws_lambda_function.%s.function_name}' + % f'{retry.lambda_name}_{retry.handler_name}', + 'maximum_retry_attempts': retry.value } - for function_name, retry_attempts in - [ - (f'indexer_{lm}', 0) - for lm in ['forward_alb_logs', 'forward_s3_logs'] - if config.enable_log_forwarding - ] + [ - (f'{lm}_{lm}cachehealth', 1) for lm in ['indexer', 'service'] - ] + for app in apps + for retry in app.chalice.retries() } }, *(