From ec16fa2c4fb72a466335055f3d9308cca18ca515 Mon Sep 17 00:00:00 2001 From: Brian Warner Date: Sat, 29 Apr 2023 00:38:49 -0700 Subject: [PATCH] fix(swingset): better snapshot scheduling, do BOYD before each This changes the snapshot scheduling logic to be more consistent. We still use `snapshotInitial` to trigger a snapshot shortly after worker initialization, and `snapshotInterval` to trigger periodic ones after that. However the previous code compared `snapshotInitial` to the absolute deliveryNum, which meant it only applied to the first incarnation, and would not attempt to take a snapshot shortly after upgrade, leaving the kernel vulnerable to replaying the long `startVat` delivery for a larger window than we intended. And `snapshotInterval` was compared against the difference between the latest transcript and the latest snapshot, which changed with the addition of the load-worker pseudo-entry. The new code uses `snapshotInitial` whenever there is not an existing snapshot (so the first span of *all* incarnations), and compares it against the length of the current span (so it includes all the pseudo-events). `snapshotInterval` is also compared against the length of the current span. The result is simpler and more predictable set of rules: * in the first span of each incarnation, trigger a snapshot once we have at least `snapshotInterval` entries * in all other spans, trigger once we have at least `snapshotInterval` In addition, when triggering a snapshot, we perform a BringOutYourDead delivery before asking the worker to save a snapshot. This gives us one last chance to shake out any garbage (making the snapshot as small as possible), and reduces the variation we might see forced GC that happens during snapshot write (any FinalizationRegistry callbacks should get run during the BOYD, not the save-snapshot). closes #7553 closes #7504 --- .../SwingSet/src/kernel/state/vatKeeper.js | 17 +- packages/SwingSet/src/kernel/vat-warehouse.js | 31 ++- .../transcript/test-transcript-entries.js | 246 ++++++++++-------- .../vat-warehouse/test-reload-snapshot.js | 42 +-- .../test-stable-bundles.js | 7 +- 5 files changed, 204 insertions(+), 139 deletions(-) diff --git a/packages/SwingSet/src/kernel/state/vatKeeper.js b/packages/SwingSet/src/kernel/state/vatKeeper.js index 7164b86f08e..7b51a3cd8f4 100644 --- a/packages/SwingSet/src/kernel/state/vatKeeper.js +++ b/packages/SwingSet/src/kernel/state/vatKeeper.js @@ -507,11 +507,15 @@ export function makeVatKeeper( return snapStore?.getSnapshotInfo(vatID); } - function transcriptSnapshotStats() { - const totalEntries = getTranscriptEndPosition(); - const snapshotInfo = getSnapshotInfo(); - const snapshottedEntries = snapshotInfo ? snapshotInfo.snapPos : 0; - return { totalEntries, snapshottedEntries }; + /** + * Returns count of deliveries made since initialization or + * load-snapshot + * + * @returns {number} + */ + function transcriptSpanEntries() { + const { startPos, endPos } = transcriptStore.getCurrentSpanBounds(vatID); + return endPos - startPos; } /** @@ -662,7 +666,8 @@ export function makeVatKeeper( deleteCListEntriesForKernelSlots, transcriptSize, getTranscript, - transcriptSnapshotStats, + getTranscriptEndPosition, + transcriptSpanEntries, addToTranscript, vatStats, dumpState, diff --git a/packages/SwingSet/src/kernel/vat-warehouse.js b/packages/SwingSet/src/kernel/vat-warehouse.js index 9a24dc14079..b78cc7a5eb0 100644 --- a/packages/SwingSet/src/kernel/vat-warehouse.js +++ b/packages/SwingSet/src/kernel/vat-warehouse.js @@ -567,14 +567,15 @@ export function makeVatWarehouse({ const vatKeeper = kernelKeeper.provideVatKeeper(vatID); let reason; - const { totalEntries, snapshottedEntries } = - vatKeeper.transcriptSnapshotStats(); - if (snapshotInitial === totalEntries) { + + const hasSnapshot = !!vatKeeper.getSnapshotInfo(); + const deliveriesInSpan = vatKeeper.transcriptSpanEntries(); + + if (!hasSnapshot && deliveriesInSpan >= snapshotInitial) { + // begin snapshot after 'snapshotInitial' deliveries in an incarnation reason = { snapshotInitial }; - } else if ( - totalEntries > snapshotInitial && - totalEntries - snapshottedEntries >= snapshotInterval - ) { + } else if (deliveriesInSpan >= snapshotInterval) { + // begin snapshot after 'snapshotInterval' deliveries in a span reason = { snapshotInterval }; } // console.log('maybeSaveSnapshot: reason:', reason); @@ -582,6 +583,22 @@ export function makeVatWarehouse({ return false; // not time to make a snapshot } + // always do a bringOutYourDead just before a snapshot, to shake + // loose as much garbage as we can, and to minimize the GC + // sensitivity effects of the forced GC that snapshots perform + // TODO: + // * computrons consumed are not reported to the runPolicy + // * computrons consumed are not deducted from the meter + // (not sure we'd want that anyways) + // * vat-fatal errors or self-termination are not processed + // + /** @type { KernelDeliveryObject } */ + const kd = harden(['bringOutYourDead']); + // eslint-disable-next-line no-use-before-define + const vd = kernelDeliveryToVatDelivery(vatID, kd); + const vs = kernelSlog.provideVatSlogger(vatID).vatSlog; + await deliverToVat(vatID, kd, vd, vs); + // in addition to saving the actual snapshot, // vatKeeper.saveSnapshot() pushes a save-snapshot transcript // entry, then starts a new transcript span, then pushes a diff --git a/packages/SwingSet/test/transcript/test-transcript-entries.js b/packages/SwingSet/test/transcript/test-transcript-entries.js index efdefcb329b..832ff4aff2a 100644 --- a/packages/SwingSet/test/transcript/test-transcript-entries.js +++ b/packages/SwingSet/test/transcript/test-transcript-entries.js @@ -95,11 +95,11 @@ test('transcript of new vats', async t => { test('transcript spans', async t => { const config = { - snapshotInitial: 2, + snapshotInitial: 3, // the new pseudo-deliveries ('initialize-worker', - // 'save-snapshot', and 'load-snapshot' all count against the - // snapshotInterval. So setting it to 7 will get us 5 actual - // deliveries between the two snapshot events. + // 'save-snapshot') count against the snapshotInterval. So setting + // it to 7 will get us 6 actual deliveries before the + // boyd/save-snapshot, and a 9-delivery cycle snapshotInterval: 7, defaultReapInterval: 'never', defaultManagerType: 'xsnap', @@ -131,7 +131,6 @@ test('transcript spans', async t => { }; c = await restart(); // actually just starting for the first time - let expectedFull = ['initialize-worker']; c.pinVatRoot('bootstrap'); const vatID = c.vatNameToID('bootstrap'); @@ -153,55 +152,70 @@ test('transcript spans', async t => { const save = ['save-snapshot']; const load = ['load-snapshot']; const notify = ['notify']; - const snap = [].concat(save, load); + const snap = [].concat(boyd, save, load); const boot = ['message']; // bootstrap(), not nothing() const msg = ['message']; // nothing() const msgN = n => Array(n).fill('message'); - const bounds = () => { + const curBounds = () => { const b = transcriptStore.getCurrentSpanBounds(vatID); return [b.startPos, b.endPos]; }; const incarnation = () => transcriptStore.getCurrentSpanBounds(vatID)?.incarnation; - // snapshotInitial=2 and snapshotInterval=7, so we'll see a - // save-snapshot on deliveryNum 2, and when (deliveryNum - - // snapshot.snapPos) = k*7. Until an upgrade causes a span to end - // early, that means save-snapshot on deliveries 2, 9, 16, 23 - - // the full sequence of deliveries will be: - // span 0-2 = [0,3) (ended by snapshot) - // 0 initialize-worker - // 1 startVat - // 2 save-snapshot - // span 3-9 = [3, 10) (ended by snapshot) - // 3 load-snapshot - // 4 boot (message: bootstrap()) - // 5 notify (response to createVatAdminService) - // 6 message (nothing-1) - // 7 message (nothing-2) - // 8 message (nothing-3) - // 9 save-snapshot - // span 10-15 = [10, 16) (ended by upgrade) - // 10 load-snapshot - // 11 message (nothing-4) - // 12 message (nothing-5) - // 13 message (nothing-6) - // 14 bringOutYourDead (for upgrade) - // 15 shutdown-worker (for upgrade) - // (span ended, snapshot removed, span started) - // span 16-19 = [16,20) - // 16 initialize-worker (added by ensureVatOnline) - // 17 startVat - // 18 message (nothing-7) - // (now maybeSaveSnapshot notices) - // 19 save-snapshot - // span 20- = [20,..) (still active at end of test) - // 20 load-snapshot - + // snapshotInitial=3 and snapshotInterval=7, so we'll start on + // deliveryNum d3 (BOYD on d3, save-snapshot on d4), and when there + // are 7 or more deliveries in the current transcript span + + // | what | pseudo | delivery | span | span | + // | | | num | size | bounds | + // |------------+----------+----------+------+-----------------| + // | | init | 0 | 1 | 0-4 / [0,5) | + // | startVat | | 1 | 2 | | + // | bootstrap | | 2 | 3! | | + // | | BOYD | 3 | | | + // | | save | 4 | | | + // | | new-span | | | | + // | | load | 5 | 1 | 5-13 / [5,14) | + // | notify | | 6 | 2 | | + // | nothing-1 | | 7 | 3 | | + // | nothing-2 | | 8 | 4 | | + // | nothing-3 | | 9 | 5 | | + // | nothing-4 | | 10 | 6 | | + // | nothing-5 | | 11 | 7! | | + // | | BOYD | 12 | | | + // | | save | 13 | | | + // | | new-span | | | | + // | | load | 14 | 1 | 14-17 / [14,18) | + // | nothing-6 | | 15 | 2 | | + // | UPGRADE! | | | | | + // | | BOYD | 16 | | | + // | | shutdown | 17 | | | + // | | new-span | | | | + // | | init | 18 | 1 | 18-22 / [18,23) | + // | startVat | | 19 | 2 | | + // | nothing-7 | | 20 | 3! | | + // | | BOYD | 21 | | | + // | | save | 22 | | | + // | | new-span | | | | + // | | load | 23 | 1 | 23-31 / [23,32) | + // | nothing-8 | | 24 | 2 | | + // | nothing-9 | | 25 | 3 | | + // | nothing-10 | | 26 | 4 | | + // | nothing-11 | | 27 | 5 | | + // | nothing-12 | | 28 | 6 | | + // | nothing-13 | | 29 | 7! | | + // | | BOYD | 30 | | | + // | | save | 31 | | | + // | | new-span | | | | + // | | load | 32 | 1 | 32- | + // | nothing-14 | | 33 | 2 | | + // | | | | | | + + let expectedFull = [].concat(init); t.deepEqual(fullSummary(), expectedFull); - t.deepEqual(bounds(), [0, 1]); - t.deepEqual(curSummary(), ['initialize-worker']); + t.deepEqual(curBounds(), [0, 1]); + t.deepEqual(curSummary(), [].concat(init)); t.is(snapInfo(), undefined); t.is(incarnation(), 0); @@ -209,23 +223,23 @@ test('transcript spans', async t => { // initialize-worker c = await restart(); - t.deepEqual(curSummary(), ['initialize-worker']); + t.deepEqual(curSummary(), [].concat(init)); t.is(snapInfo(), undefined); t.is(incarnation(), 0); - // starting the kernel will deliver the 'startVat' delivery to all - // vats. And since snapshotInitial=2, we'll get an immediate - // snapshot cycle after the 'startVat'. Then 'bootstrap()' and the - // notify from createVatAdminService are delivered + // starting the kernel will deliver the 'startVat' and bootstrap() + // messages. Since snapshotInitial=3, we'll get an immediate + // snapshot cycle after the 'bootstrap'. The notify from + // createVatAdminService is delivered after the snapshot completes. await c.run(); - expectedFull = expectedFull.concat('startVat', snap, boot, notify); - // first span is 0-2 = [0,3), snapshot happened on the last event - // "2". current span is 3-5 = [3,6) - t.is(snapInfo(), 2); + expectedFull = expectedFull.concat('startVat', boot, snap, notify); + // first span is 0-4 = [0,5), snapshot happened on the last event + // d4. current span is 5-6 = [5,7) + t.is(snapInfo(), 4); t.is(incarnation(), 0); - t.deepEqual(bounds(), [3, 6]); - t.deepEqual(oldSummary(0), [].concat(start, save)); - t.deepEqual(curSummary(), [].concat(load, boot, notify)); + t.deepEqual(curBounds(), [5, 7]); + t.deepEqual(oldSummary(0), [].concat(start, boot, boyd, save)); + t.deepEqual(curSummary(), [].concat(load, notify)); t.deepEqual(fullSummary(), expectedFull); // all delivery events should record computrons @@ -234,20 +248,20 @@ test('transcript spans', async t => { t.truthy(teStartVat.r.metering); t.is(typeof teStartVat.r.metering.computrons, 'number'); - // do some deliveries to trigger an XS heap snapshot event, creating - // a new span + // do some deliveries to trigger more XS heap snapshots and create + // more spans const doSomeNothing = async () => { const kpid = c.queueToVatRoot('bootstrap', 'nothing', [], 'panic'); await c.run(); t.is(c.kpStatus(kpid), 'fulfilled'); }; - await doSomeNothing(); // d6: nothing-1 + await doSomeNothing(); // d7: nothing-1 expectedFull = expectedFull.concat(msg); - t.is(snapInfo(), 2); + t.is(snapInfo(), 4); t.is(incarnation(), 0); - t.deepEqual(bounds(), [3, 7]); - t.deepEqual(curSummary(), [].concat(load, boot, notify, msgN(1))); + t.deepEqual(curBounds(), [5, 8]); + t.deepEqual(curSummary(), [].concat(load, notify, msgN(1))); t.deepEqual(fullSummary(), expectedFull); const teNothing1 = cur().slice(-1)[0]; @@ -256,46 +270,40 @@ test('transcript spans', async t => { t.is(typeof teNothing1.r.metering.computrons, 'number'); c = await restart(); - t.deepEqual(curSummary(), [].concat(load, boot, notify, msgN(1))); + t.deepEqual(curSummary(), [].concat(load, notify, msgN(1))); - await doSomeNothing(); // d7: nothing-2 + await doSomeNothing(); // d8: nothing-2 expectedFull = expectedFull.concat(msg); - t.deepEqual(curSummary(), [].concat(load, boot, notify, msgN(2))); - await doSomeNothing(); // d8: nothing-3 + t.deepEqual(curSummary(), [].concat(load, notify, msgN(2))); + await doSomeNothing(); // d9: nothing-3 expectedFull = expectedFull.concat(msg); - // 8th delivery hits snapshotInterval, so we get a save/load - // cycle. Spans are [0,3), [3,10), [10,11). There were snapshots on - // 2 and 9. - expectedFull = expectedFull.concat(snap); - t.is(snapInfo(), 9); - t.is(incarnation(), 0); - t.deepEqual(bounds(), [10, 11]); - t.deepEqual(curSummary(), [].concat(load)); + await doSomeNothing(); // d10: nothing-4 + expectedFull = expectedFull.concat(msg); + t.deepEqual(curSummary(), [].concat(load, notify, msgN(4))); t.deepEqual(fullSummary(), expectedFull); - c = await restart(); - t.deepEqual(curSummary(), [].concat(load)); - await doSomeNothing(); // nothing-4 + // the 7th delivery of the span will reach snapshotInterval + await doSomeNothing(); // d11: nothing-5 expectedFull = expectedFull.concat(msg); - t.is(snapInfo(), 9); + // so we get a save/load cycle. Spans are [0,5), [5,14), + // [14,15). There were snapshots on 4 and 13. + expectedFull = expectedFull.concat(snap); + t.is(snapInfo(), 13); t.is(incarnation(), 0); - t.deepEqual(bounds(), [10, 12]); - t.deepEqual(oldSummary(3), [].concat(load, boot, notify, msgN(3), save)); - t.deepEqual(curSummary(), [].concat(load, msg)); + t.deepEqual(curBounds(), [14, 15]); + t.deepEqual(curSummary(), [].concat(load)); t.deepEqual(fullSummary(), expectedFull); c = await restart(); - t.deepEqual(curSummary(), [].concat(load, msg)); + t.deepEqual(curSummary(), [].concat(load)); - // two more for good measure - await doSomeNothing(); // nothing-5 - expectedFull = expectedFull.concat(msg); - await doSomeNothing(); // nothing-6 + // one more for good measure, before the upgrade + await doSomeNothing(); // d15: nothing-5 expectedFull = expectedFull.concat(msg); - t.is(snapInfo(), 9); + t.is(snapInfo(), 13); t.is(incarnation(), 0); - t.deepEqual(bounds(), [10, 14]); - t.deepEqual(curSummary(), [].concat(load, msgN(3))); + t.deepEqual(curBounds(), [14, 16]); + t.deepEqual(curSummary(), [].concat(load, msg)); t.deepEqual(fullSummary(), expectedFull); // Now upgrade the vat, introducing a new incarnation. We do a null @@ -305,15 +313,14 @@ test('transcript spans', async t => { c.upgradeStaticVat('bootstrap', false, bundleID); expectedFull = expectedFull.concat(boyd, shutdown, start); await c.run(); - // a weirdness in the way maybeSaveSnapshot counts means that after - // an upgrade, we'll do a snapshot immediately after the first - // message delivery (because processDeliveryMessage calls - // maybeSaveSnapshot), but not after the startVat done by - // processUpgradeVat - t.is(snapInfo(), undefined); + // upgrade starts with a BOYD in d16, then a shutdown-worker + // pseudo-delivery in d17, then we erase the snapshot and start a + // new span with an initialize-worker and a startVat. + + t.is(snapInfo(), undefined); // no snapshot yet t.is(incarnation(), 1); - t.deepEqual(oldSummary(10), [].concat(load, msgN(3), boyd, shutdown)); - t.deepEqual(bounds(), [16, 18]); + t.deepEqual(oldSummary(14), [].concat(load, msg, boyd, shutdown)); + t.deepEqual(curBounds(), [18, 20]); t.deepEqual(curSummary(), [].concat(start)); t.deepEqual(fullSummary(), expectedFull); @@ -324,13 +331,48 @@ test('transcript spans', async t => { t.is(incarnation(), 1); t.deepEqual(curSummary(), [].concat(start)); - // so this provokes a snapshot, leaving us in an empty span - await doSomeNothing(); // nothing-7 + // one delivery will be enough to reach snapshotInitial + // (init+startVat+message = 3) and provoke the first snapshot of the + // new incarnation (thus boyd on d21 and save-snapshot on d22) + + await doSomeNothing(); // d20: nothing-7 + expectedFull = expectedFull.concat(msg, snap); + t.is(snapInfo(), 22); + t.is(incarnation(), 1); + t.deepEqual(oldSummary(18), [].concat(start, msg, boyd, save)); + t.deepEqual(curBounds(), [23, 24]); + t.deepEqual(curSummary(), [].concat(load)); + t.deepEqual(fullSummary(), expectedFull); + + // we need another six deliveries to reach snapshotInterval + await doSomeNothing(); // d24: nothing-8 + expectedFull = expectedFull.concat(msg); + t.deepEqual(fullSummary(), expectedFull); + + await doSomeNothing(); // d25: nothing-9 + expectedFull = expectedFull.concat(msg); + t.deepEqual(fullSummary(), expectedFull); + + await doSomeNothing(); // d26: nothing-10 + expectedFull = expectedFull.concat(msg); + t.deepEqual(fullSummary(), expectedFull); + + await doSomeNothing(); // d27: nothing-11 + expectedFull = expectedFull.concat(msg); + t.deepEqual(fullSummary(), expectedFull); + + await doSomeNothing(); // d28: nothing-12 + expectedFull = expectedFull.concat(msg); + t.deepEqual(fullSummary(), expectedFull); + t.is(snapInfo(), 22); // no snapshot yet + + // this will trigger snapshotInterval + await doSomeNothing(); // d28: nothing-12 expectedFull = expectedFull.concat(msg, snap); - t.is(snapInfo(), 19); + t.is(snapInfo(), 31); t.is(incarnation(), 1); - t.deepEqual(bounds(), [20, 21]); - t.deepEqual(oldSummary(16), [].concat(init, ['startVat'], msg, save)); + t.deepEqual(oldSummary(23), [].concat(load, msgN(6), boyd, save)); + t.deepEqual(curBounds(), [32, 33]); t.deepEqual(curSummary(), [].concat(load)); t.deepEqual(fullSummary(), expectedFull); }); diff --git a/packages/SwingSet/test/vat-warehouse/test-reload-snapshot.js b/packages/SwingSet/test/vat-warehouse/test-reload-snapshot.js index 04c1b17b981..c09778a2830 100644 --- a/packages/SwingSet/test/vat-warehouse/test-reload-snapshot.js +++ b/packages/SwingSet/test/vat-warehouse/test-reload-snapshot.js @@ -51,50 +51,50 @@ test('vat reload from snapshot', async t => { c1.queueToVatRoot('target', 'count', []); // * 2: message (count=0) // * then we hit snapshotInitial - // * 3: save-snapshot - // * 4: load-snapshot + // * 3: BOYD + // * 4: save-snapshot + // * 5: load-snapshot expected1.push(`count = 0`); await c1.run(); t.deepEqual(c1.dump().log, expected1); - t.deepEqual(getPositions(), [3, 4, 5]); + t.deepEqual(getPositions(), [4, 5, 6]); for (let i = 1; i < 11; i += 1) { c1.queueToVatRoot('target', 'count', []); - // * 5: message (count=1) - // * 6: message (count=2) - // * 7: message (count=3) + // * 6: message (count=1) + // * 7: message (count=2) + // * 8: message (count=3) + // * 9: message (count=4) // * then we hit snapshotInterval - // * 8: save-snapshot - // * 9: load-snapshot - // * 10: message (count=4) - // * 11: message (count=5) - // * 12: message (count=6) - // * then we hit snapshotInterval - // * 13: save-snapshot - // * 14: load-snapshot + // * 9: BOYD + // * 10: save-snapshot + // * 11: load-snapshot + // * 12: message (count=5) + // * 13: message (count=6) // * 15: message (count=7) // * 16: message (count=8) - // * 17: message (count=9) // * then we hit snapshotInterval + // * 17: BOYD // * 18: save-snapshot // * 19: load-snapshot - // * 20: message (count=10) + // * 20: message (count=9) + // * 21: message (count=10) expected1.push(`count = ${i}`); } await c1.run(); t.deepEqual(c1.dump().log, expected1); - t.deepEqual(getPositions(), [18, 19, 21]); + t.deepEqual(getPositions(), [18, 19, 22]); await c1.shutdown(); - // the worker will start with the load-snapshot at d19, and replay d20 + // the worker will start with the load-snapshot at d19, and replay d20+d21 const c2 = await makeSwingsetController(kernelStorage); - const expected2 = [`count = 10`]; + const expected2 = [`count = 9`, `count = 10`]; t.deepEqual(c2.dump().log, expected2); // replayed 4 deliveries c2.queueToVatRoot('target', 'count', []); - // * 21: message(count=11) + // * 22: message(count=11) expected2.push(`count = 11`); await c2.run(); t.deepEqual(c2.dump().log, expected2); // note: *not* 0-11 - t.deepEqual(getPositions(), [18, 19, 22]); + t.deepEqual(getPositions(), [18, 19, 23]); await c2.shutdown(); }); diff --git a/packages/SwingSet/test/xsnap-stable-bundles/test-stable-bundles.js b/packages/SwingSet/test/xsnap-stable-bundles/test-stable-bundles.js index 1632c2ceef8..11298438718 100644 --- a/packages/SwingSet/test/xsnap-stable-bundles/test-stable-bundles.js +++ b/packages/SwingSet/test/xsnap-stable-bundles/test-stable-bundles.js @@ -173,11 +173,12 @@ test('xsnap bundles are stable', async t => { // snapshotInitial: 2 // now allow the bootstrap message to be delivered as deliveryNum=1, - // which should trigger a snapshot, which is recorded in a - // save-snapshot as deliveryNum=2 + // which should trigger a snapshot, which first performs a BOYD in + // deliveryNum=2, then the snapshot is recorded in a save-snapshot + // as deliveryNum=3 t.is(snapStore.getSnapshotInfo('v1'), undefined); await c2.run(); - t.is(snapStore.getSnapshotInfo('v1')?.snapPos, 2); + t.is(snapStore.getSnapshotInfo('v1')?.snapPos, 3); await c2.shutdown; // now that the worker has a snapshot, the vat preload won't fetch