-
Notifications
You must be signed in to change notification settings - Fork 206
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix(swingset): workaround XS garbage collection bugs #6664
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,21 @@ | ||
import djson from '../../lib/djson.js'; | ||
|
||
// Indicate that a syscall is missing from the transcript but is safe to | ||
// perform during replay | ||
const missingSyscall = Symbol('missing transcript syscall'); | ||
|
||
// Indicate that a syscall is recorded in the transcript but can be safely | ||
// ignored / skipped during replay. | ||
const extraSyscall = Symbol('extra transcript syscall'); | ||
|
||
/** @typedef {typeof missingSyscall | typeof extraSyscall | Error | undefined} CompareSyscallsResult */ | ||
|
||
/** | ||
* @param {any} vatID | ||
* @param {object} originalSyscall | ||
* @param {object} newSyscall | ||
* @returns {CompareSyscallsResult} | ||
*/ | ||
export function requireIdentical(vatID, originalSyscall, newSyscall) { | ||
if (djson.stringify(originalSyscall) !== djson.stringify(newSyscall)) { | ||
console.error(`anachrophobia strikes vat ${vatID}`); | ||
|
@@ -10,6 +26,67 @@ export function requireIdentical(vatID, originalSyscall, newSyscall) { | |
return undefined; | ||
} | ||
|
||
const vcSyscallRE = /^vc\.\d+\.\|(?:schemata|label)$/; | ||
|
||
/** | ||
* Liveslots currently has a deficiency which results in [virtual collections | ||
* being sensitive to organic GC](https://github.com/Agoric/agoric-sdk/issues/6360). | ||
* | ||
* XS also has multiple issues causing memory to not be collected identically | ||
* depending on the load from snapshot schedule. This results in organic GC | ||
* triggering at different times based on which snapshot the worker was created | ||
* from. | ||
* | ||
* Combined together, these bugs cause syscalls being emitted by liveslots at | ||
* different times whether the execution occurred in a worker created from a | ||
* more or less recent snapshot. With a strict check during transcript replay, | ||
* this can cause [anachrophobia errors when restarting SwingSet](https://github.com/Agoric/agoric-sdk/issues/6588), | ||
* or potentially when reloading a vat that was paged out. | ||
* | ||
* Thankfully the syscalls issued by liveslots for these virtual collection | ||
* objects are both easily identifiable and stable over time. That means their | ||
* response is always the same regardless when the syscall is made. | ||
* | ||
* This method enhances the basic identical check and returns sentinel values | ||
* (unique symbols), indicating whether a syscall during replay requires to | ||
* skip an entry from the transcript or perform the actual syscall because the | ||
* entry is missing in the transcript. This works in conjunction with | ||
* `simulateSyscall` which then performs the appropriate action. | ||
* | ||
* @param {any} vatID | ||
* @param {object} originalSyscall | ||
* @param {object} newSyscall | ||
* @returns {CompareSyscallsResult} | ||
*/ | ||
export function requireIdenticalExceptStableVCSyscalls( | ||
vatID, | ||
originalSyscall, | ||
newSyscall, | ||
) { | ||
const error = requireIdentical(vatID, originalSyscall, newSyscall); | ||
|
||
if (error) { | ||
if ( | ||
originalSyscall[0] === 'vatstoreGet' && | ||
vcSyscallRE.test(originalSyscall[1]) | ||
) { | ||
// The syscall recorded in the transcript is for a virtual collection | ||
// metadata get. It can be safely skipped. | ||
console.warn(` mitigation: ignoring extra vc syscall`); | ||
return extraSyscall; | ||
} | ||
|
||
if (newSyscall[0] === 'vatstoreGet' && vcSyscallRE.test(newSyscall[1])) { | ||
// The syscall performed by the vat is for a virtual collection metadata | ||
// get. It can be safely performed during replay. | ||
console.warn(` mitigation: falling through to syscall handler`); | ||
return missingSyscall; | ||
} | ||
} | ||
|
||
return error; | ||
} | ||
|
||
export function makeTranscriptManager( | ||
vatKeeper, | ||
vatID, | ||
|
@@ -59,13 +136,44 @@ export function makeTranscriptManager( | |
let replayError; | ||
|
||
function simulateSyscall(newSyscall) { | ||
const s = playbackSyscalls.shift(); | ||
const newReplayError = compareSyscalls(vatID, s.d, newSyscall); | ||
if (newReplayError) { | ||
replayError = newReplayError; | ||
throw replayError; | ||
while (playbackSyscalls.length) { | ||
const compareError = compareSyscalls( | ||
vatID, | ||
playbackSyscalls[0].d, | ||
newSyscall, | ||
); | ||
|
||
if (compareError === missingSyscall) { | ||
// return `undefined` to indicate that this syscall cannot be simulated | ||
// and needs to be performed (virtual collection metadata get) | ||
return undefined; | ||
} | ||
|
||
const s = playbackSyscalls.shift(); | ||
|
||
if (!compareError) { | ||
return s.response; | ||
} else if (compareError !== extraSyscall) { | ||
replayError = compareError; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've walked through the cases and convinced myself this will do the right thing, but it'd be lovely to have a short unit test that demonstrates the different permutations this is prepared to handle. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah I've thought about it, but given it's not something I plan on maintaining long term, I didn't. Happy to think about what some unit tests would look like if you feel strongly about them here. |
||
break; | ||
} | ||
|
||
// Check the next transcript entry, skipping any extra syscalls recorded | ||
// in the transcript (virtual collection metadata get) | ||
} | ||
|
||
if (!replayError) { | ||
// Error if the vat performs a syscall for which we don't have a | ||
// corresponding entry in the transcript. | ||
|
||
// Note that if a vat performed an "allowed" vc metadata get syscall after | ||
// we reach the end of the transcript, we would error instead of | ||
// falling through and performing the syscall. However liveslots does not | ||
// perform vc metadata get syscalls unless it needs to provide an entry | ||
// to the program, which always results in subsequent syscalls. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure that's strictly true, but the chances are low, and I can't think of a clean way to deal with it, so I'm ok with this approach. The case would be something like delivery 1 doing: baggage.set('key', makeBigScalarMapStore(..)) That creates the new collection, creates its initial Representative, uses that to look up the vref and stash it in baggage, then drops the Representative. Then delivery 2 does:
Delivery 2 will send If the original execution still had the I don't think that's going to be a common pattern (and I guess we could scan the existing transcripts for it, and convince ourselves that pismoA doesn't ever create it). But if you wanted to fix it, you might add an extra check after the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you do that, it'd be worth refactoring the "is this the right kind of syscall" check into a single function, called twice from ... oh, except, right, that's not particularly easy to configure/activate as just replacing the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that edge case is not worth fixing if we're not triggering it. Worst case, it gets triggered, some validators hit a new anachrophobia, and we provide a new patch that does the more complicated check. |
||
replayError = new Error(`historical inaccuracy in replay of ${vatID}`); | ||
} | ||
return s.response; | ||
throw replayError; | ||
} | ||
|
||
function finishReplayDelivery(dnum) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Any particular reason to use a Symbol rather than just a string? Or an empty object?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Strings are more forgeable and I have a habit of using unique symbol for sentinel values.