Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[17.0] [IMP] queue_job: HA job runner using session level advisory lock #673

Open
wants to merge 4 commits into
base: 17.0
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 35 additions & 6 deletions queue_job/jobrunner/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,12 +159,17 @@

SELECT_TIMEOUT = 60
ERROR_RECOVERY_DELAY = 5
PG_ADVISORY_LOCK_ID = 2293787760715711918

_logger = logging.getLogger(__name__)

select = selectors.DefaultSelector


class MasterElectionLost(Exception):
pass


# Unfortunately, it is not possible to extend the Odoo
# server command line arguments, so we resort to environment variables
# to configure the runner (channels mostly).
Expand Down Expand Up @@ -264,10 +269,15 @@
self.db_name = db_name
connection_info = _connection_info_for(db_name)
self.conn = psycopg2.connect(**connection_info)
self.conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
self.has_queue_job = self._has_queue_job()
if self.has_queue_job:
self._initialize()
try:
self.conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
self.has_queue_job = self._has_queue_job()

Check warning on line 274 in queue_job/jobrunner/runner.py

View check run for this annotation

Codecov / codecov/patch

queue_job/jobrunner/runner.py#L272-L274

Added lines #L272 - L274 were not covered by tests
if self.has_queue_job:
self._acquire_master_lock()
self._initialize()
except BaseException:
self.close()
raise

Check warning on line 280 in queue_job/jobrunner/runner.py

View check run for this annotation

Codecov / codecov/patch

queue_job/jobrunner/runner.py#L276-L280

Added lines #L276 - L280 were not covered by tests

def close(self):
# pylint: disable=except-pass
Expand All @@ -280,6 +290,14 @@
pass
self.conn = None

def _acquire_master_lock(self):
"""Acquire the master runner lock or raise MasterElectionLost"""
with closing(self.conn.cursor()) as cr:
cr.execute("SELECT pg_try_advisory_lock(%s)", (PG_ADVISORY_LOCK_ID,))

Check warning on line 296 in queue_job/jobrunner/runner.py

View check run for this annotation

Codecov / codecov/patch

queue_job/jobrunner/runner.py#L295-L296

Added lines #L295 - L296 were not covered by tests
if not cr.fetchone()[0]:
msg = f"could not acquire master runner lock on {self.db_name}"
raise MasterElectionLost(msg)

Check warning on line 299 in queue_job/jobrunner/runner.py

View check run for this annotation

Codecov / codecov/patch

queue_job/jobrunner/runner.py#L298-L299

Added lines #L298 - L299 were not covered by tests

def _has_queue_job(self):
with closing(self.conn.cursor()) as cr:
cr.execute(
Expand Down Expand Up @@ -415,14 +433,17 @@
self.db_by_name = {}

def initialize_databases(self):
for db_name in self.get_db_names():
for db_name in sorted(self.get_db_names()):
# sorting is important to avoid deadlocks in acquiring the master lock
db = Database(db_name)
if db.has_queue_job:
self.db_by_name[db_name] = db
with db.select_jobs("state in %s", (NOT_DONE,)) as cr:
for job_data in cr:
self.channel_manager.notify(db_name, *job_data)
_logger.info("queue job runner ready for db %s", db_name)
else:
db.close()

Check warning on line 446 in queue_job/jobrunner/runner.py

View check run for this annotation

Codecov / codecov/patch

queue_job/jobrunner/runner.py#L446

Added line #L446 was not covered by tests

def run_jobs(self):
now = _odoo_now()
Expand Down Expand Up @@ -509,7 +530,7 @@
while not self._stop:
# outer loop does exception recovery
try:
_logger.info("initializing database connections")
_logger.debug("initializing database connections")

Check warning on line 533 in queue_job/jobrunner/runner.py

View check run for this annotation

Codecov / codecov/patch

queue_job/jobrunner/runner.py#L533

Added line #L533 was not covered by tests
# TODO: how to detect new databases or databases
# on which queue_job is installed after server start?
self.initialize_databases()
Expand All @@ -524,6 +545,14 @@
except InterruptedError:
# Interrupted system call, i.e. KeyboardInterrupt during select
self.stop()
except MasterElectionLost as e:
_logger.debug(

Check warning on line 549 in queue_job/jobrunner/runner.py

View check run for this annotation

Codecov / codecov/patch

queue_job/jobrunner/runner.py#L548-L549

Added lines #L548 - L549 were not covered by tests
"master election lost: %s, sleeping %ds and retrying",
e,
ERROR_RECOVERY_DELAY,
)
self.close_databases()
time.sleep(ERROR_RECOVERY_DELAY)

Check warning on line 555 in queue_job/jobrunner/runner.py

View check run for this annotation

Codecov / codecov/patch

queue_job/jobrunner/runner.py#L554-L555

Added lines #L554 - L555 were not covered by tests
except Exception:
_logger.exception(
"exception: sleeping %ds and retrying", ERROR_RECOVERY_DELAY
Expand Down
Loading