Skip to content

Commit 632a302

Browse files
authored
Detect try tree closures and retry after waiting an exponential time (#81)
* Detect closed try tree and exponentially wait before retrying * Use requests instead of aiohttp * Update existing tests * Add a test case
1 parent 7262ef6 commit 632a302

File tree

5 files changed

+181
-2
lines changed

5 files changed

+181
-2
lines changed

.isort.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[settings]
22
known_first_party = conftest,libmozevent
3-
known_third_party = aioamqp,aiohttp,aioredis,hglib,libmozdata,pytest,responses,rs_parsepatch,setuptools,structlog,taskcluster
3+
known_third_party = aioamqp,aiohttp,aioredis,hglib,libmozdata,pytest,requests,responses,rs_parsepatch,setuptools,structlog,taskcluster
44
force_single_line = True
55
line_length = 159

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ repos:
5252
rev: v0.950
5353
hooks:
5454
- id: mypy
55+
additional_dependencies: ['types-requests']
5556
- repo: meta
5657
hooks:
5758
- id: check-useless-excludes

libmozevent/mercurial.py

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,18 @@
33
# License, v. 2.0. If a copy of the MPL was not distributed with this
44
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
55

6+
import asyncio
67
import atexit
78
import enum
89
import io
910
import json
1011
import os
1112
import tempfile
1213
import time
14+
from datetime import datetime
1315

1416
import hglib
17+
import requests
1518
import rs_parsepatch
1619
import structlog
1720
from libmozdata.phabricator import PhabricatorPatch
@@ -24,8 +27,14 @@
2427

2528
TREEHERDER_URL = "https://treeherder.mozilla.org/#/jobs?repo={}&revision={}"
2629
DEFAULT_AUTHOR = "libmozevent <release-mgmt-analysis@mozilla.com>"
30+
# On build failure, check try status until available every 5 minutes and up to 24h
31+
TRY_STATUS_URL = "https://treestatus.mozilla-releng.net/trees/try"
32+
TRY_STATUS_DELAY = 5 * 60
33+
TRY_STATUS_MAX_WAIT = 24 * 60 * 60
2734
# Number of allowed retries on an unexpected push fail
2835
MAX_PUSH_RETRIES = 4
36+
# Wait successive exponential delays: 6sec, 36sec, 3.6min, 21.6min
37+
PUSH_RETRY_EXPONENTIAL_DELAY = 6
2938

3039

3140
class TryMode(enum.Enum):
@@ -344,6 +353,34 @@ def get_files_touched_in_diff(rawdiff):
344353
for patched_file in get_files_touched_in_diff(rev.patch)
345354
)
346355

356+
async def wait_try_available(self):
357+
"""
358+
Wait until try status is "open"
359+
On each failure, wait TRY_STATUS_DELAY before retrying up to TRY_STATUS_MAX_WAIT
360+
"""
361+
362+
def get_status():
363+
try:
364+
resp = requests.get(TRY_STATUS_URL)
365+
resp.raise_for_status()
366+
data = resp.json()
367+
except Exception as e:
368+
logger.warning(f"An error occurred retrieving try status: {e}")
369+
else:
370+
return data.get("result", {}).get("status")
371+
372+
start = datetime.utcnow()
373+
while status := get_status() != "open":
374+
if (datetime.utcnow() - start).seconds >= TRY_STATUS_MAX_WAIT:
375+
logger.error(
376+
f"Try tree status still closed after {TRY_STATUS_MAX_WAIT} seconds, skipping"
377+
)
378+
break
379+
logger.warning(
380+
f"Try tree is not actually open (status: {status}), waiting {TRY_STATUS_DELAY} seconds before retrying"
381+
)
382+
await asyncio.sleep(TRY_STATUS_DELAY)
383+
347384
async def handle_build(self, repository, build):
348385
"""
349386
Try to load and apply a diff on local clone
@@ -405,8 +442,16 @@ async def handle_build(self, repository, build):
405442
error_log = error_log.decode("utf-8")
406443

407444
if "push failed on remote" in error_log.lower():
408-
# In case of an unexpected push fail, put the build task back in the queue
409445
build.retries += 1
446+
# Ensure try is opened
447+
await self.wait_try_available()
448+
# Wait an exponential time before retrying the build
449+
delay = PUSH_RETRY_EXPONENTIAL_DELAY**build.retries
450+
logger.info(
451+
f"An error occurred pushing the build to try, retrying after {delay}s"
452+
)
453+
await asyncio.sleep(delay)
454+
# Put the build task back in the queue
410455
await self.bus.send(self.queue_name, build)
411456
return
412457

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ aioredis>=2.0.0
44
libmozdata
55
python-hglib
66
pytoml
7+
requests
78
rs_parsepatch
89
structlog
910
taskcluster>=24.0.0

tests/test_mercurial.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
import hglib
88
import pytest
9+
import responses
910

1011
from conftest import MockBuild
1112
from libmozevent.bus import MessageBus
@@ -647,6 +648,7 @@ async def test_crash_utf8_author(PhabricatorMock, mock_mc):
647648
assert details["revision"] == mock_mc.repo.tip().node.decode("utf-8")
648649

649650

651+
@responses.activate
650652
@pytest.mark.asyncio
651653
async def test_unexpected_push_failure(PhabricatorMock, mock_mc):
652654
"""
@@ -669,6 +671,14 @@ async def test_unexpected_push_failure(PhabricatorMock, mock_mc):
669671
from libmozevent import mercurial
670672

671673
mercurial.MAX_PUSH_RETRIES = 1
674+
mercurial.TRY_STATUS_URL = "http://test.status/try"
675+
mercurial.PUSH_RETRY_EXPONENTIAL_DELAY = 0
676+
mercurial.TRY_STATUS_DELAY = 0
677+
mercurial.TRY_STATUS_MAX_WAIT = 0
678+
679+
responses.get(
680+
"http://test.status/try", status=200, json={"result": {"status": "open"}}
681+
)
672682

673683
repository_mock = MagicMock(spec=Repository)
674684
repository_mock.push_to_try.side_effect = [
@@ -712,8 +722,12 @@ async def test_unexpected_push_failure(PhabricatorMock, mock_mc):
712722
tip.node.decode("utf-8")
713723
)
714724
assert details["revision"] == tip.node.decode("utf-8")
725+
assert [(call.request.method, call.request.url) for call in responses.calls] == [
726+
("GET", "http://test.status/try")
727+
]
715728

716729

730+
@responses.activate
717731
@pytest.mark.asyncio
718732
async def test_push_failure_max_retries(PhabricatorMock, mock_mc):
719733
"""
@@ -736,6 +750,23 @@ async def test_push_failure_max_retries(PhabricatorMock, mock_mc):
736750
from libmozevent import mercurial
737751

738752
mercurial.MAX_PUSH_RETRIES = 2
753+
mercurial.TRY_STATUS_URL = "http://test.status/try"
754+
mercurial.PUSH_RETRY_EXPONENTIAL_DELAY = 2
755+
mercurial.TRY_STATUS_DELAY = 0
756+
mercurial.TRY_STATUS_MAX_WAIT = 0
757+
758+
sleep_history = []
759+
760+
class AsyncioMock(object):
761+
async def sleep(self, value):
762+
nonlocal sleep_history
763+
sleep_history.append(value)
764+
765+
mercurial.asyncio = AsyncioMock()
766+
767+
responses.get(
768+
"http://test.status/try", status=200, json={"result": {"status": "open"}}
769+
)
739770

740771
repository_mock = MagicMock(spec=Repository)
741772
repository_mock.push_to_try.side_effect = hglib.error.CommandError(
@@ -767,3 +798,104 @@ async def test_push_failure_max_retries(PhabricatorMock, mock_mc):
767798
details["message"]
768799
== "Max number of retries has been reached pushing the build to try repository"
769800
)
801+
802+
assert [(call.request.method, call.request.url) for call in responses.calls] == [
803+
("GET", "http://test.status/try"),
804+
("GET", "http://test.status/try"),
805+
("GET", "http://test.status/try"),
806+
]
807+
assert sleep_history == [2, 4, 8]
808+
809+
810+
@responses.activate
811+
@pytest.mark.asyncio
812+
async def test_push_closed_try(PhabricatorMock, mock_mc):
813+
"""
814+
Detect when try tree is in a closed state and wait before it is opened to retry
815+
"""
816+
817+
diff = {
818+
"revisionPHID": "PHID-DREV-badutf8",
819+
"baseRevision": "missing",
820+
"phid": "PHID-DIFF-badutf8",
821+
"id": 555,
822+
}
823+
build = MockBuild(4444, "PHID-REPO-mc", 5555, "PHID-build-badutf8", diff)
824+
with PhabricatorMock as phab:
825+
phab.load_patches_stack(build)
826+
827+
bus = MessageBus()
828+
bus.add_queue("phabricator")
829+
830+
from libmozevent import mercurial
831+
832+
mercurial.MAX_PUSH_RETRIES = 2
833+
mercurial.TRY_STATUS_URL = "http://test.status/try"
834+
mercurial.PUSH_RETRY_EXPONENTIAL_DELAY = 2
835+
mercurial.TRY_STATUS_DELAY = 42
836+
mercurial.TRY_STATUS_MAX_WAIT = 1
837+
838+
sleep_history = []
839+
840+
class AsyncioMock(object):
841+
async def sleep(self, value):
842+
nonlocal sleep_history
843+
sleep_history.append(value)
844+
845+
mercurial.asyncio = AsyncioMock()
846+
847+
responses.get(
848+
"http://test.status/try", status=200, json={"result": {"status": "closed"}}
849+
)
850+
responses.get("http://test.status/try", status=500)
851+
responses.get(
852+
"http://test.status/try", status=200, json={"result": {"status": "open"}}
853+
)
854+
855+
repository_mock = MagicMock(spec=Repository)
856+
repository_mock.push_to_try.side_effect = [
857+
hglib.error.CommandError(
858+
args=("push", "try_url"),
859+
ret=1,
860+
err="abort: push failed on remote",
861+
out="",
862+
),
863+
mock_mc.repo.tip(),
864+
]
865+
repository_mock.try_name = "try"
866+
repository_mock.retries = 0
867+
868+
worker = MercurialWorker(
869+
"mercurial", "phabricator", repositories={"PHID-REPO-mc": repository_mock}
870+
)
871+
worker.register(bus)
872+
873+
assert bus.queues["mercurial"].qsize() == 0
874+
875+
resp = await worker.handle_build(repository_mock, build)
876+
assert resp is None
877+
assert repository_mock.push_to_try.call_count == 1
878+
879+
assert bus.queues["mercurial"].qsize() == 1
880+
assert bus.queues["phabricator"].qsize() == 0
881+
882+
build = await bus.receive("mercurial")
883+
resp = await worker.handle_build(repository_mock, build)
884+
assert resp is not None
885+
mode, out_build, details = resp
886+
887+
assert mode == "success"
888+
assert out_build == build
889+
tip = mock_mc.repo.tip()
890+
assert details[
891+
"treeherder_url"
892+
] == "https://treeherder.mozilla.org/#/jobs?repo=try&revision={}".format(
893+
tip.node.decode("utf-8")
894+
)
895+
assert details["revision"] == tip.node.decode("utf-8")
896+
assert [(call.request.method, call.request.url) for call in responses.calls] == [
897+
("GET", "http://test.status/try"),
898+
("GET", "http://test.status/try"),
899+
("GET", "http://test.status/try"),
900+
]
901+
assert sleep_history == [42, 42, 2]

0 commit comments

Comments
 (0)