Skip to content

Commit

Permalink
Fix: Health lambdas still fail occasionally (#6097)
Browse files Browse the repository at this point in the history
  • Loading branch information
achave11-ucsc committed Apr 15, 2024
1 parent ab1777d commit 5d113db
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 11 deletions.
1 change: 1 addition & 0 deletions lambdas/indexer/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ def health_by_key(keys: Optional[str] = None):

# FIXME: Remove redundant prefix from name
# https://github.com/DataBiosphere/azul/issues/5337
@app.retry(1)
@app.schedule(
'rate(1 minute)',
name='indexercachehealth'
Expand Down
1 change: 1 addition & 0 deletions lambdas/service/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -542,6 +542,7 @@ def custom_health(keys: Optional[str] = None):

# FIXME: Remove redundant prefix from name
# https://github.com/DataBiosphere/azul/issues/5337
@app.retry(1)
@app.schedule(
'rate(1 minute)',
name='servicecachehealth'
Expand Down
34 changes: 34 additions & 0 deletions src/azul/chalice.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,13 @@ class MetricThreshold:
value: int


@attr.s(auto_attribs=True, frozen=True, kw_only=True)
class Retry:
    """
    An immutable, keyword-only record of the retry count requested for one
    handler of a Lambda function.
    """
    # Name of the Lambda function the handler belongs to
    lambda_name: str
    # Name of the handler; optional, defaults to None
    handler_name: Optional[str] = None
    # The requested number of retries. It may follow an attribute with a
    # default only because all attributes of this class are keyword-only.
    value: int


C = TypeVar('C', bound='AppController')


Expand Down Expand Up @@ -526,6 +533,33 @@ def metric_thresholds(self) -> list[MetricThreshold]:
value=threshold))
return thresholds

def retry(self, retries: int):
    """
    Use this decorator to specify a custom number of retries that should be
    different from the default (which is two) for any of the Azul async
    lambdas. See:
    https://docs.aws.amazon.com/lambda/latest/dg/invocation-retries.html

    :param retries: the maximum number of times a failed asynchronous
                    invocation of the decorated handler should be retried.
                    AWS Lambda only accepts values from 0 to 2, inclusive.

    :return: a decorator that records `retries` on the handler it is
             applied to and returns that same handler

    :raises ValueError: if `retries` is outside the range 0 to 2
    """
    # AWS Lambda rejects a maximum retry count outside of 0..2. Catch a bad
    # value here, when the decorator is applied, instead of letting it
    # travel on to wherever the recorded value is consumed.
    if not 0 <= retries <= 2:
        raise ValueError('The number of retries must be between 0 and 2, '
                         'inclusive', retries)

    def wrapper(f):
        # This decorator must be applied on top of one that already turned
        # the function into a Chalice event source handler
        assert isinstance(f, chalice.app.EventSourceHandler), f
        f.retries = retries
        return f

    return wrapper

def retries(self) -> list[Retry]:
    """
    Collect the retry counts recorded on this app's handlers.

    Only handlers that are Chalice event source handlers and that carry a
    `retries` attribute other than None are included.

    :return: one `Retry` per qualifying handler, in the iteration order of
             `self.handler_map`
    """
    lambda_name, _ = config.unqualified_resource_name(self.app_name)
    return [
        Retry(lambda_name=lambda_name, handler_name=name, value=count)
        for name, handler in self.handler_map.items()
        if isinstance(handler, chalice.app.EventSourceHandler)
        # Handlers without the attribute were never decorated with a
        # custom retry count and are left out
        and (count := getattr(handler, 'retries', None)) is not None
    ]


@attr.s(auto_attribs=True, frozen=True, kw_only=True)
class AppController:
Expand Down
28 changes: 17 additions & 11 deletions terraform/api_gateway.tf.json.template.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,18 @@
import json

from azul import (
cached_property,
config,
)
from azul.chalice import (
AzulChaliceApp,
)
from azul.deployment import (
aws,
)
from azul.modules import (
load_app_module,
)
from azul.objects import (
InternMeta,
)
Expand Down Expand Up @@ -41,6 +48,10 @@ def for_name(cls, name):
],
policy=json.dumps(getattr(policy_module, 'policy')))

@cached_property
def chalice(self) -> AzulChaliceApp:
    """
    The `app` attribute of the app module loaded for this application's
    name.
    """
    app_module = load_app_module(self.name)
    return app_module.app


apps = [
Application.for_name('indexer'),
Expand Down Expand Up @@ -370,18 +381,13 @@ def for_domain(cls, domain):
}
},
'aws_lambda_function_event_invoke_config': {
function_name: {
'function_name': '${aws_lambda_function.%s.function_name}' % function_name,
'maximum_retry_attempts': 0
f'{retry.lambda_name}_{retry.handler_name}': {
'function_name': '${aws_lambda_function.%s.function_name}'
% f'{retry.lambda_name}_{retry.handler_name}',
'maximum_retry_attempts': retry.value
}
for function_name in
[
f'indexer_{lm}'
for lm in ['forward_alb_logs', 'forward_s3_logs']
if config.enable_log_forwarding
] + [
f'{lm}_{lm}cachehealth' for lm in ['indexer', 'service']
]
for app in apps
for retry in app.chalice.retries()
}
},
*(
Expand Down

0 comments on commit 5d113db

Please sign in to comment.