Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Branding: improve IO resilience #2869

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 22 additions & 12 deletions bot/exts/backend/branding/_cog.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,14 +342,14 @@ async def synchronise(self) -> tuple[bool, bool]:
"""
log.debug("Synchronise: fetching current event.")

current_event, available_events = await self.repository.get_current_event()
try:
current_event, available_events = await self.repository.get_current_event()
except Exception:
log.exception("Synchronisation aborted: failed to fetch events.")
return False, False

await self.populate_cache_events(available_events)

if current_event is None:
log.error("Failed to fetch event. Cannot synchronise!")
return False, False

return await self.enter_event(current_event)

async def populate_cache_events(self, events: list[Event]) -> None:
Expand Down Expand Up @@ -433,10 +433,6 @@ async def daemon_main(self) -> None:

await self.populate_cache_events(available_events)

if new_event is None:
log.warning("Daemon main: failed to get current event from branding repository, will do nothing.")
return

if new_event.path != await self.cache_information.get("event_path"):
log.debug("Daemon main: new event detected!")
await self.enter_event(new_event)
Expand Down Expand Up @@ -596,10 +592,24 @@ async def branding_calendar_refresh_cmd(self, ctx: commands.Context) -> None:
log.info("Performing command-requested event cache refresh.")

async with ctx.typing():
available_events = await self.repository.get_events()
await self.populate_cache_events(available_events)
try:
available_events = await self.repository.get_events()
except Exception:
log.exception("Refresh aborted: failed to fetch events.")
resp = make_embed(
"Refresh aborted",
"Failed to fetch events. See log for details.",
success=False,
)
else:
await self.populate_cache_events(available_events)
resp = make_embed(
"Refresh successful",
"The event calendar has been refreshed.",
success=True,
)

await ctx.invoke(self.branding_calendar_group)
await ctx.send(embed=resp)

# endregion
# region: Command interface (branding daemon)
Expand Down
67 changes: 42 additions & 25 deletions bot/exts/backend/branding/_repository.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import typing as t
from datetime import UTC, date, datetime

import backoff
import frontmatter
from aiohttp import ClientResponse

from bot.bot import Bot
from bot.constants import Keys
Expand Down Expand Up @@ -71,6 +73,22 @@ def __str__(self) -> str:
return f"<Event at '{self.path}'>"


class GitHubServerError(Exception):
"""
GitHub request has failed due to a server error.

Such requests shall be retried.
"""


retry_on_server_error = backoff.on_exception(
backoff.expo,
GitHubServerError,
max_tries=5,
jitter=None,
)


class BrandingRepository:
"""
Branding repository abstraction.
Expand All @@ -93,6 +111,18 @@ class BrandingRepository:
def __init__(self, bot: Bot) -> None:
self.bot = bot

def _raise_for_status(self, response: ClientResponse) -> None:
"""
Raise GitHubServerError if the response status is >= 500.

The server error indicates that the request shall be retried.
"""
log.trace(f"Response status: {response.status}")
if response.status >= 500:
raise GitHubServerError
Comment on lines +121 to +122
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if the request fails 5 times with a 5xx error, currently the actual server error wont be included in the traceback, just the traceback from this line. If we do something like this would it help put more context in the traceback?

try:
    response.raise_for_status()
except ResponseErrorOrWhateverItsCalled as err:
    if response.status >= 500:
        raise GitHubServerError from err
    raise

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a fantastic idea! Thanks, I'll work it in.

response.raise_for_status()

@retry_on_server_error
async def fetch_directory(self, path: str, types: t.Container[str] = ("file", "dir")) -> dict[str, RemoteObject]:
"""
Fetch directory found at `path` in the branding repository.
Expand All @@ -105,14 +135,12 @@ async def fetch_directory(self, path: str, types: t.Container[str] = ("file", "d
log.debug(f"Fetching directory from branding repository: '{full_url}'.")

async with self.bot.http_session.get(full_url, params=PARAMS, headers=HEADERS) as response:
if response.status != 200:
raise RuntimeError(f"Failed to fetch directory due to status: {response.status}")

log.debug("Fetch successful, reading JSON response.")
self._raise_for_status(response)
json_directory = await response.json()

return {file["name"]: RemoteObject(file) for file in json_directory if file["type"] in types}

@retry_on_server_error
async def fetch_file(self, download_url: str) -> bytes:
"""
Fetch file as bytes from `download_url`.
Expand All @@ -122,10 +150,7 @@ async def fetch_file(self, download_url: str) -> bytes:
log.debug(f"Fetching file from branding repository: '{download_url}'.")

async with self.bot.http_session.get(download_url, params=PARAMS, headers=HEADERS) as response:
if response.status != 200:
raise RuntimeError(f"Failed to fetch file due to status: {response.status}")

log.debug("Fetch successful, reading payload.")
self._raise_for_status(response)
return await response.read()

def parse_meta_file(self, raw_file: bytes) -> MetaFile:
Expand Down Expand Up @@ -186,37 +211,30 @@ async def get_events(self) -> list[Event]:
"""
Discover available events in the branding repository.

Misconfigured events are skipped. May return an empty list in the catastrophic case.
Propagate errors if an event fails to fetch or deserialize. Such errors cannot be handled
and the operation should be retried in the future.
"""
log.debug("Discovering events in branding repository.")

try:
event_directories = await self.fetch_directory("events", types=("dir",)) # Skip files.
except Exception:
log.exception("Failed to fetch 'events' directory.")
return []
event_directories = await self.fetch_directory("events", types=("dir",)) # Skip files.

instances: list[Event] = []

for event_directory in event_directories.values():
log.trace(f"Attempting to construct event from directory: '{event_directory.path}'.")
try:
instance = await self.construct_event(event_directory)
except Exception as exc:
log.warning(f"Could not construct event '{event_directory.path}'.", exc_info=exc)
else:
instances.append(instance)
log.trace(f"Reading event directory: '{event_directory.path}'.")
instance = await self.construct_event(event_directory)
instances.append(instance)

return instances

async def get_current_event(self) -> tuple[Event | None, list[Event]]:
async def get_current_event(self) -> tuple[Event, list[Event]]:
"""
Get the currently active event, or the fallback event.

The second return value is a list of all available events. The caller may discard it, if not needed.
Returning all events alongside the current one prevents having to query the API twice in some cases.

The current event may be None in the case that no event is active, and no fallback event is found.
If no event is active and the fallback event cannot be found, raise an error.
"""
utc_now = datetime.now(tz=UTC)
log.debug(f"Finding active event for: {utc_now}.")
Expand All @@ -239,5 +257,4 @@ async def get_current_event(self) -> tuple[Event | None, list[Event]]:
if event.meta.is_fallback:
return event, available_events

log.warning("No event is currently active and no fallback event was found!")
return None, available_events
raise RuntimeError("No event is currently active and no fallback event was found!")
13 changes: 12 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ pydis_core = { version = "10.5.1", extras = ["async-rediscache"] }

aiohttp = "3.9.1"
arrow = "1.3.0"
backoff = "2.2.1"
beautifulsoup4 = "4.12.2"
colorama = { version = "0.4.6", markers = "sys_platform == 'win32'" }
coloredlogs = "15.0.1"
Expand Down