6 scheduling observer extraction #38
```diff
@@ -87,6 +87,11 @@ data:
     launcher = scrapyd_k8s.launcher.K8s
 
     namespace = default
 
+    max_proc = 2
+    reconnection_attempts = 5
+    backoff_time = 5
+    backoff_coefficient = 2
+
     # This is an example spider that should work out of the box.
     # Adapt the spider config to your use-case.
```

Review thread on these lines:

- "Can we have sensible defaults, and not have to think about this yet?"
- "I think a section in the README would make sense here. Though might it be time to create a new file explaining the configuration options in more detail? That would give more freedom to explain them fully."
- "There are defaults in the code, but they need to be 'tested' to see whether they are enough for our prod cluster; I suspect we may need to increase the number of attempts. I tried to make it resilient in the sense that every time the connection is successfully re-established, the number of attempts resets to the configured default, so we really catch the cases where there are several connection breaks in a row."
- "Yes, I do agree that a section about the config file is needed. How do you see this new file, just a CONFIG.md or something else as part of the repo?"
- "Yes,"
- "Super, thanks! Pending:"
- "Thank you for the feedback, done!"
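On the point about in-code defaults raised in the thread above: the new watcher (added later in this PR) falls back to hard-coded values when these keys are absent, so the config entries are optional. A minimal sketch of that lookup, mirroring the pattern in `ResourceWatcher.__init__` further down in this diff (`config` is the scrapyd-k8s `Config` object):

```python
# Fallback defaults apply when the config omits these keys
# (same pattern as ResourceWatcher.__init__ later in this PR).
reconnection_attempts = int(config.scrapyd().get('reconnection_attempts', 5))
backoff_time = int(config.scrapyd().get('backoff_time', 5))
backoff_coefficient = int(config.scrapyd().get('backoff_coefficient', 2))
```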
```diff
@@ -19,6 +19,19 @@ namespace = default
 # Optional pull secret, in case you have private spiders.
 #pull_secret = ghcr-registry
 
+# Maximum number of jobs running in parallel
+max_proc = 10
+
+# Number of attempts to reconnect with k8s API to watch events, default is 5
+reconnection_attempts = 5
+
+# Minimum time in seconds to wait before reconnecting to k8s API to watch events, default is 5
+backoff_time = 5
+
+# Coefficient multiplied into backoff_time after each failed attempt (exponential
+# backoff, to avoid overwhelming the k8s API); default is 2
+backoff_coefficient = 2
+
 # For each project, define a project section.
 # This contains a repository that points to the remote container repository.
 # An optional env_secret is the name of a secret with additional environment
```

Review comment on the new options: "(see previous comment)"
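With these sample values (backoff_time = 5, backoff_coefficient = 2, reconnection_attempts = 5), the delay before each retry grows geometrically, because the watcher multiplies the wait by the coefficient after every failed attempt. A small illustration (plain Python, not part of the PR):

```python
backoff_time, backoff_coefficient = 5, 2
reconnection_attempts = 5

waits = []
for _ in range(reconnection_attempts):
    waits.append(backoff_time)
    backoff_time *= backoff_coefficient  # same update as in watch_pods()

print(waits)  # [5, 10, 20, 40, 80] -> at most ~155s of total waiting
```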
```diff
@@ -1,25 +1 @@
-import logging
-from scrapyd_k8s.joblogs.log_handler_k8s import KubernetesJobLogHandler
-
-logger = logging.getLogger(__name__)
-
-def joblogs_init(config):
-    """
-    Initializes job logs handling by starting the Kubernetes job log handler.
-
-    Parameters
-    ----------
-    config : Config
-        Configuration object containing settings for job logs and storage.
-
-    Returns
-    -------
-    None
-    """
-    joblogs_config = config.joblogs()
-    if joblogs_config and joblogs_config.get('storage_provider') is not None:
-        log_handler = KubernetesJobLogHandler(config)
-        log_handler.start()
-        logger.info("Job logs handler started.")
-    else:
-        logger.warning("No storage provider configured; job logs will not be uploaded.")
+from scrapyd_k8s.joblogs.log_handler_k8s import KubernetesJobLogHandler
```
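After this change the module only re-exports KubernetesJobLogHandler, so the start-up logic that joblogs_init performed has to live at the call site instead (presumably wired through the new ResourceWatcher below). A hedged sketch of an equivalent call site, using only names visible in the removed code; whether the handler still exposes start() after this refactor is not shown in this diff:

```python
import logging

from scrapyd_k8s.joblogs import KubernetesJobLogHandler

def init_joblogs(config):  # hypothetical helper, not part of the PR
    # Rough equivalent of the removed joblogs_init(config); `config` is
    # the scrapyd-k8s Config object, as in the old code.
    joblogs_config = config.joblogs()
    if joblogs_config and joblogs_config.get('storage_provider') is not None:
        handler = KubernetesJobLogHandler(config)
        handler.start()
    else:
        logging.warning("No storage provider configured; job logs will not be uploaded.")
```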
New file, 151 lines added (@@ -0,0 +1,151 @@):

```python
import threading
import logging
import time
from kubernetes import client, watch
from typing import Callable, List
import urllib3

logger = logging.getLogger(__name__)


class ResourceWatcher:
    """
    Watches Kubernetes pod events and notifies subscribers about relevant events.

    Attributes
    ----------
    namespace : str
        Kubernetes namespace to watch pods in.
    subscribers : List[Callable]
        List of subscriber callback functions to notify on events.
    """

    def __init__(self, namespace, config):
        """
        Initializes the ResourceWatcher.

        Parameters
        ----------
        namespace : str
            Kubernetes namespace to watch pods in.
        """
        self.namespace = namespace
        self.reconnection_attempts = int(config.scrapyd().get('reconnection_attempts', 5))
        self.backoff_time = int(config.scrapyd().get('backoff_time', 5))
        self.backoff_coefficient = int(config.scrapyd().get('backoff_coefficient', 2))
        self.subscribers: List[Callable] = []
        self._stop_event = threading.Event()
        self.watcher_thread = threading.Thread(target=self.watch_pods, daemon=True)
        self.watcher_thread.start()
        logger.info(f"ResourceWatcher thread started for namespace '{self.namespace}'.")

    def subscribe(self, callback: Callable):
        """
        Adds a subscriber callback to be notified on events.

        Parameters
        ----------
        callback : Callable
            A function to call when an event is received.
        """
        if callback not in self.subscribers:
            self.subscribers.append(callback)
            logger.debug(f"Subscriber {callback.__name__} added.")

    def unsubscribe(self, callback: Callable):
        """
        Removes a subscriber callback.

        Parameters
        ----------
        callback : Callable
            The subscriber function to remove.
        """
        if callback in self.subscribers:
            self.subscribers.remove(callback)
            logger.debug(f"Subscriber {callback.__name__} removed.")

    def notify_subscribers(self, event: dict):
        """
        Notifies all subscribers about an event.

        Parameters
        ----------
        event : dict
            The Kubernetes event data.
        """
        for subscriber in self.subscribers:
            try:
                subscriber(event)
            except Exception as e:
                logger.exception(f"Error notifying subscriber {subscriber.__name__}: {e}")

    def watch_pods(self):
        """
        Watches Kubernetes pod events and notifies subscribers.
        Runs in a separate thread.
        """
        v1 = client.CoreV1Api()
        w = watch.Watch()
        resource_version = None

        logger.info(f"Started watching pods in namespace '{self.namespace}'.")
        backoff_time = self.backoff_time
        reconnection_attempts = self.reconnection_attempts
        while not self._stop_event.is_set() and reconnection_attempts > 0:
            try:
                kwargs = {
                    'namespace': self.namespace,
                    'timeout_seconds': 0,
                }
                if resource_version:
                    kwargs['resource_version'] = resource_version
                first_event = True
                for event in w.stream(v1.list_namespaced_pod, **kwargs):
                    if first_event:
                        # Reset reconnection attempts and backoff time upon successful reconnection
                        reconnection_attempts = self.reconnection_attempts
                        backoff_time = self.backoff_time
                        first_event = False  # Ensure this only happens once per connection
                    pod_name = event['object'].metadata.name
                    resource_version = event['object'].metadata.resource_version
                    event_type = event['type']
                    logger.debug(f"Received event: {event_type} for pod: {pod_name}")
                    self.notify_subscribers(event)
            except (urllib3.exceptions.ProtocolError,
                    urllib3.exceptions.ReadTimeoutError,
                    urllib3.exceptions.ConnectionError) as e:
                reconnection_attempts -= 1
                logger.exception(f"Encountered network error: {e}")
                logger.info(f"Retrying to watch pods after {backoff_time} seconds...")
                time.sleep(backoff_time)
                backoff_time *= self.backoff_coefficient
            except client.ApiException as e:
                # Resource version is too old and cannot be accessed anymore
                if e.status == 410:
                    logger.error("Received 410 Gone error, resetting resource_version and restarting watch.")
                    resource_version = None
                    continue
                else:
                    reconnection_attempts -= 1
                    logger.exception(f"Encountered ApiException: {e}")
                    logger.info(f"Retrying to watch pods after {backoff_time} seconds...")
                    time.sleep(backoff_time)
                    backoff_time *= self.backoff_coefficient
            except StopIteration:
                logger.info("Watch stream ended, restarting watch.")
                continue
            except Exception as e:
                reconnection_attempts -= 1
                logger.exception(f"Watcher encountered exception: {e}")
                logger.info(f"Retrying to watch pods after {backoff_time} seconds...")
                time.sleep(backoff_time)
                backoff_time *= self.backoff_coefficient

    def stop(self):
        """
        Stops the watcher thread gracefully.
        """
        self._stop_event.set()
        self.watcher_thread.join()
        logger.info(f"ResourceWatcher thread stopped for namespace '{self.namespace}'.")
```
Review comment: "Why is max_proc here 2, and in scrapyd_k8s.sample-k8s.conf 10?"