From f4031569eb684fa455eb31daf862764d20cee440 Mon Sep 17 00:00:00 2001 From: Hook25 Date: Mon, 18 Nov 2024 17:26:39 +0100 Subject: [PATCH 1/8] Initial bugfix --- checkbox-ng/plainbox/impl/session/assistant.py | 9 ++++++++- checkbox-ng/plainbox/impl/session/resume.py | 2 ++ checkbox-ng/plainbox/impl/session/suspend.py | 6 +++++- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/checkbox-ng/plainbox/impl/session/assistant.py b/checkbox-ng/plainbox/impl/session/assistant.py index fd5a449fc6..d2cfdc45f4 100644 --- a/checkbox-ng/plainbox/impl/session/assistant.py +++ b/checkbox-ng/plainbox/impl/session/assistant.py @@ -828,12 +828,19 @@ def bootstrap(self): ] = "to run bootstrapping job" rb = self.run_job(job.id, "silent", False) self.use_job_result(job.id, rb.get_result()) + # we may have a list of rejected jobs if this session is a resumed + # session + already_rejected = [ + JobIdQualifier(job_id, None, inclusive=False) + for job_id in self._context.state.metadata.rejected_jobs + ] # Perform initial selection -- we want to run everything that is # described by the test plan that was selected earlier. desired_job_list = select_units( self._context.state.job_list, [plan.get_qualifier() for plan in self._manager.test_plans] - + self._exclude_qualifiers, + + self._exclude_qualifiers + + already_rejected, ) self._context.state.update_desired_job_list(desired_job_list) # Set subsequent usage expectations i.e. all of the runtime parts are diff --git a/checkbox-ng/plainbox/impl/session/resume.py b/checkbox-ng/plainbox/impl/session/resume.py index f11e8fb471..e5ddcadf61 100644 --- a/checkbox-ng/plainbox/impl/session/resume.py +++ b/checkbox-ng/plainbox/impl/session/resume.py @@ -779,6 +779,8 @@ def _restore_SessionState_jobs_and_results(self, session, session_repr): self._process_job(session, jobs_repr, results_repr, job_id) except KeyError: leftover_jobs.append(job_id) + + leftover_jobs += session_repr["metadata"]["rejected_jobs"] # Process leftovers. For each iteration the leftover_jobs list should # shrink or we're not making any progress. If that happens we've got # undefined jobs (in general the session is corrupted) diff --git a/checkbox-ng/plainbox/impl/session/suspend.py b/checkbox-ng/plainbox/impl/session/suspend.py index c3e66365f4..05ae7c01a6 100644 --- a/checkbox-ng/plainbox/impl/session/suspend.py +++ b/checkbox-ng/plainbox/impl/session/suspend.py @@ -516,6 +516,7 @@ def _repr_SessionState(self, obj, session_dir): state object. """ id_run_list = frozenset([job.id for job in obj.run_list]) + return { "jobs": { state.job.id: state.job.checksum @@ -631,7 +632,10 @@ def _repr_SessionState(self, obj, session_dir): The representation of meta-data associated with the session state object. """ - id_run_list = frozenset([job.id for job in obj.run_list]) + id_run_list = frozenset( + [job.id for job in obj.run_list] + obj.metadata.rejected_jobs + ) + return { "jobs": { state.job.id: state.job.checksum From 3bad82075dccd80409de6bb771e871d2bec4a603 Mon Sep 17 00:00:00 2001 From: Hook25 Date: Mon, 18 Nov 2024 18:02:21 +0100 Subject: [PATCH 2/8] New agent_respawn scenario --- .../scenarios/restart/agent_respawn.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/metabox/metabox/scenarios/restart/agent_respawn.py b/metabox/metabox/scenarios/restart/agent_respawn.py index 3ab7ea0162..cbffefc266 100644 --- a/metabox/metabox/scenarios/restart/agent_respawn.py +++ b/metabox/metabox/scenarios/restart/agent_respawn.py @@ -24,6 +24,7 @@ Expect, Start, Signal, + ExpectNot, ) from metabox.core.scenario import Scenario from metabox.core.utils import tag @@ -161,3 +162,31 @@ class ResumeAfterFinishPreserveOutputRemote(Scenario): Expect("job passed"), Expect("job failed"), ] + + +@tag("resume", "manual", "regression") +class ResumePreservesRejectedJobsStateMap(Scenario): + launcher = "# no launcher" + steps = [ + Start(), + Expect("Select test plan"), + SelectTestPlan("2021.com.canonical.certification::pass-and-crash"), + Send(keys.KEY_ENTER), + Expect("Press (T) to start"), + Send(keys.KEY_ENTER), + Expect("basic-shell-crashing"), + Send(keys.KEY_DOWN + keys.KEY_SPACE), + Expect("[ ]"), + Send("T"), + Expect("Do you want to submit 'upload to certification' report?"), + Signal(keys.SIGINT), + Start(), + Expect("Reports will be saved to"), + # Part of the regression, fixing the job state map would make the + # re-bootstrapping of the session include the excluded job + ExpectNot("basic-shell-crashing", timeout=0.1), + # Here the session will try to re-generate the submission.json but it + # will fail if the info about the session is not complete in the job + # state map (as it was prior to this regression) + Expect("Do you want to submit 'upload to certification' report?"), + ] From 4227fb74bd737ebb4fd90c56b4dd79ea6f5c5589 Mon Sep 17 00:00:00 2001 From: Hook25 Date: Mon, 18 Nov 2024 18:03:18 +0100 Subject: [PATCH 3/8] Fix ExpectNot on timeout It only worked when the program would stop, now it always does --- metabox/metabox/core/lxd_execute.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/metabox/metabox/core/lxd_execute.py b/metabox/metabox/core/lxd_execute.py index 0c333b137c..294e6c8b2b 100644 --- a/metabox/metabox/core/lxd_execute.py +++ b/metabox/metabox/core/lxd_execute.py @@ -22,6 +22,8 @@ import threading import time +from contextlib import suppress + from loguru import logger import metabox.core.keys as keys from metabox.core.utils import ExecuteResult @@ -113,7 +115,9 @@ def expect(self, pattern, timeout=0): return found def expect_not(self, data, timeout=0): - return not self.expect(data, timeout) + with suppress(TimeoutError): + return not self.expect(data, timeout) + return True def select_test_plan(self, data, timeout=0): if not self._lookup_by_id: From 678723b462c9763878270a52f25250f1c8a7523c Mon Sep 17 00:00:00 2001 From: Hook25 Date: Tue, 19 Nov 2024 10:17:31 +0100 Subject: [PATCH 4/8] Robust to missing fields in metadata --- checkbox-ng/plainbox/impl/session/resume.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/checkbox-ng/plainbox/impl/session/resume.py b/checkbox-ng/plainbox/impl/session/resume.py index e5ddcadf61..bd202e0cee 100644 --- a/checkbox-ng/plainbox/impl/session/resume.py +++ b/checkbox-ng/plainbox/impl/session/resume.py @@ -780,7 +780,9 @@ def _restore_SessionState_jobs_and_results(self, session, session_repr): except KeyError: leftover_jobs.append(job_id) - leftover_jobs += session_repr["metadata"]["rejected_jobs"] + leftover_jobs += session_repr.get("metadata", {}).get( + "rejected_jobs", [] + ) # Process leftovers. For each iteration the leftover_jobs list should # shrink or we're not making any progress. If that happens we've got # undefined jobs (in general the session is corrupted) From ff2672e424660a6396b868875c8e4e5db2bedcf2 Mon Sep 17 00:00:00 2001 From: Hook25 Date: Tue, 19 Nov 2024 10:18:12 +0100 Subject: [PATCH 5/8] Test that rejected_jobs are restored in the job_state_map --- .../plainbox/impl/session/test_resume.py | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/checkbox-ng/plainbox/impl/session/test_resume.py b/checkbox-ng/plainbox/impl/session/test_resume.py index ca28923708..3878653060 100644 --- a/checkbox-ng/plainbox/impl/session/test_resume.py +++ b/checkbox-ng/plainbox/impl/session/test_resume.py @@ -2031,6 +2031,41 @@ def test_simple_session(self): # as specific tests for restoring results are written elsewhere self.assertEqual(session.job_state_map[job.id].result.outcome, "pass") + def test_simple_session_with_rejected(self): + """ + verify that _restore_SessionState_jobs_and_results() works when + faced with a representation of a simple session (no generated jobs + or anything "exotic"). + """ + job = make_job(id="job") + job1 = make_job(id="job1") + session_repr = { + "jobs": {job.id: job.checksum, job1.id: job1.checksum}, + "results": { + job.id: [ + { + "outcome": "pass", + "comments": None, + "execution_duration": None, + "return_code": None, + "io_log": [], + } + ] + }, + "metadata": {"rejected_jobs": [job1.id]}, + } + helper = self.parameters.resume_cls([], None, None) + session = SessionState([job, job1]) + helper._restore_SessionState_jobs_and_results(session, session_repr) + # Session still has one job in it + self.assertEqual(session.job_list, [job, job1]) + # Resources don't have anything (no resource jobs) + self.assertEqual(session.resource_map, {}) + # The result was restored correctly. This is just a smoke test + # as specific tests for restoring results are written elsewhere + self.assertEqual(session.job_state_map[job.id].result.outcome, "pass") + self.assertIn(job1.id, session.job_state_map) + def test_unknown_jobs_get_reported(self): """ verify that _restore_SessionState_jobs_and_results() reports From 6152d9f7836d576ff5d5e1d997c9b6346c7c143b Mon Sep 17 00:00:00 2001 From: Hook25 Date: Tue, 19 Nov 2024 16:10:13 +0100 Subject: [PATCH 6/8] Update tests as per review --- checkbox-ng/plainbox/impl/session/test_resume.py | 8 +++++--- metabox/metabox/scenarios/restart/agent_respawn.py | 12 ++++++++++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/checkbox-ng/plainbox/impl/session/test_resume.py b/checkbox-ng/plainbox/impl/session/test_resume.py index 3878653060..f5289d7f76 100644 --- a/checkbox-ng/plainbox/impl/session/test_resume.py +++ b/checkbox-ng/plainbox/impl/session/test_resume.py @@ -2033,11 +2033,13 @@ def test_simple_session(self): def test_simple_session_with_rejected(self): """ - verify that _restore_SessionState_jobs_and_results() works when - faced with a representation of a simple session (no generated jobs - or anything "exotic"). + verify that _restore_SessionState_jobs_and_results() also resumes in + the job_state_map jobs that were manually excluded from the user + (rejected jobs) """ job = make_job(id="job") + # this job was de-selected by the user manually in the test selection + # screen job1 = make_job(id="job1") session_repr = { "jobs": {job.id: job.checksum, job1.id: job1.checksum}, diff --git a/metabox/metabox/scenarios/restart/agent_respawn.py b/metabox/metabox/scenarios/restart/agent_respawn.py index cbffefc266..fcd455221c 100644 --- a/metabox/metabox/scenarios/restart/agent_respawn.py +++ b/metabox/metabox/scenarios/restart/agent_respawn.py @@ -166,18 +166,26 @@ class ResumeAfterFinishPreserveOutputRemote(Scenario): @tag("resume", "manual", "regression") class ResumePreservesRejectedJobsStateMap(Scenario): + """ + Check that the job_state_map survives both manual closure and restarts + """ + launcher = "# no launcher" steps = [ Start(), Expect("Select test plan"), - SelectTestPlan("2021.com.canonical.certification::pass-and-crash"), + SelectTestPlan( + "2021.com.canonical.certification::checkbox-crash-then-reboot" + ), Send(keys.KEY_ENTER), Expect("Press (T) to start"), Send(keys.KEY_ENTER), - Expect("basic-shell-crashing"), + Expect("Crash Checkbox"), Send(keys.KEY_DOWN + keys.KEY_SPACE), Expect("[ ]"), Send("T"), + Expect("Waiting for the system to shut down or reboot"), + Start(), Expect("Do you want to submit 'upload to certification' report?"), Signal(keys.SIGINT), Start(), From 0af5839d75ca27854840bfdb50da6dc578d51dd5 Mon Sep 17 00:00:00 2001 From: Hook25 Date: Wed, 20 Nov 2024 09:47:31 +0100 Subject: [PATCH 7/8] Split test into remote + local --- .../scenarios/restart/agent_respawn.py | 43 ++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/metabox/metabox/scenarios/restart/agent_respawn.py b/metabox/metabox/scenarios/restart/agent_respawn.py index fcd455221c..df2a9eb7bd 100644 --- a/metabox/metabox/scenarios/restart/agent_respawn.py +++ b/metabox/metabox/scenarios/restart/agent_respawn.py @@ -165,11 +165,12 @@ class ResumeAfterFinishPreserveOutputRemote(Scenario): @tag("resume", "manual", "regression") -class ResumePreservesRejectedJobsStateMap(Scenario): +class LocalResumePreservesRejectedJobsStateMap(Scenario): """ Check that the job_state_map survives both manual closure and restarts """ + modes = ["local"] launcher = "# no launcher" steps = [ Start(), @@ -198,3 +199,43 @@ class ResumePreservesRejectedJobsStateMap(Scenario): # state map (as it was prior to this regression) Expect("Do you want to submit 'upload to certification' report?"), ] + + +@tag("resume", "manual", "regression") +class RemoteResumePreservesRejectedJobsStateMap(Scenario): + """ + Check that the job_state_map survives both manual closure and restarts + + This differs from Local because in remote the controller waits for the + agent to come back, we loose the output of the rebooting job and we don't + need to re-start the controller on reboot + """ + + modes = ["remote"] + launcher = "# no launcher" + steps = [ + Start(), + Expect("Select test plan"), + SelectTestPlan( + "2021.com.canonical.certification::checkbox-crash-then-reboot" + ), + Send(keys.KEY_ENTER), + Expect("Press (T) to start"), + Send(keys.KEY_ENTER), + Expect("Crash Checkbox"), + Send(keys.KEY_DOWN + keys.KEY_SPACE), + Expect("[ ]"), + Send("T"), + Expect("Connection lost!"), + Expect("Do you want to submit 'upload to certification' report?"), + Signal(keys.SIGINT), + Start(), + # Part of the regression, fixing the job state map would make the + # re-bootstrapping of the session include the excluded job + ExpectNot("Crash Checkbox", timeout=0.1), + Expect("tar_file"), + # Here the session will try to re-generate the submission.json but it + # will fail if the info about the session is not complete in the job + # state map (as it was prior to this regression) + Expect("Do you want to submit 'upload to certification' report?"), + ] From f9c2870425575307bf1cac9ae599bf8214a31112 Mon Sep 17 00:00:00 2001 From: Pierre Equoy Date: Wed, 20 Nov 2024 09:57:04 +0100 Subject: [PATCH 8/8] Update metabox/metabox/scenarios/restart/agent_respawn.py --- metabox/metabox/scenarios/restart/agent_respawn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metabox/metabox/scenarios/restart/agent_respawn.py b/metabox/metabox/scenarios/restart/agent_respawn.py index df2a9eb7bd..41f86ac6a7 100644 --- a/metabox/metabox/scenarios/restart/agent_respawn.py +++ b/metabox/metabox/scenarios/restart/agent_respawn.py @@ -207,7 +207,7 @@ class RemoteResumePreservesRejectedJobsStateMap(Scenario): Check that the job_state_map survives both manual closure and restarts This differs from Local because in remote the controller waits for the - agent to come back, we loose the output of the rebooting job and we don't + agent to come back, we lose the output of the rebooting job and we don't need to re-start the controller on reboot """