Skip to content

Commit

Permalink
fix: Add consensus-independent vat snapshot/transcript archiving conf…
Browse files Browse the repository at this point in the history
…iguration (#10055)

Fixes #10036

## Description
Adds consensus-independent `vat-{snapshot,transcript}-archive-dir` cosmos-sdk swingset configuration for propagation in AG_COSMOS_INIT. When set, gzip-compressed vat snapshot/transcript files are written to the respective directory (the former when created, the latter when finalized by rollover).

It also includes miscellaneous documentation and code improvements, most significantly a new helper for replacing `const cleanups = []; return aggregateTryFinally(async () => { … cleanups.push(…); … }, async () => { await PromiseAllOrErrors(cleanups.reverse().map(fn => Promise.resolve().then(() => fn()))); });` with `return withDeferredCleanup(async addCleanup => { … addCleanup(…); … });` (i.e., folding in the cleanup registration and ordering).

Best reviewed commit-by-commit.

### Security Considerations
This provides a new interface by which the Swingset kernel can write to its host filesystem, albeit indirectly via callback. But as such, it is a potential vector for filling up a disk or possibly overwriting files, although subject to an operator's own bad configuration and/or elevated permissions.

### Scaling Considerations
When one or both new settings are active and the disk is full or slow, performance may suffer. I considered performing these writes in the background, but decided that silent failures or slow writes would be worse than the new `await`s (and when active, the archive dir will likely be on the same filesystem as the database anyway).

### Documentation Considerations
I think I've covered what I need to, although there might be some other Markdown files worth updating.

### Testing Considerations
New features are covered by unit tests.

### Upgrade Considerations
This is all kernel code that can be used at any node restart (i.e., because the configuration is consensus-independent, it doesn't even need to wait for a chain software upgrade). But we should mention the new cosmos-sdk configuration in release notes, because it won't be added to existing app.toml files already in use.
  • Loading branch information
mergify[bot] authored Sep 11, 2024
2 parents 914bee2 + bf76e00 commit 07f6885
Show file tree
Hide file tree
Showing 23 changed files with 548 additions and 280 deletions.
31 changes: 29 additions & 2 deletions golang/cosmos/x/swingset/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@ import (
)

const (
ConfigPrefix = "swingset"
FlagSlogfile = ConfigPrefix + ".slogfile"
ConfigPrefix = "swingset"
FlagSlogfile = ConfigPrefix + ".slogfile"
FlagVatSnapshotArchiveDir = ConfigPrefix + ".vat-snapshot-archive-dir"
FlagVatTranscriptArchiveDir = ConfigPrefix + ".vat-transcript-archive-dir"

SnapshotRetentionOptionDebug = "debug"
SnapshotRetentionOptionOperational = "operational"
Expand Down Expand Up @@ -73,6 +75,12 @@ vat-snapshot-retention = "{{ .Swingset.VatSnapshotRetention }}"
# * "default": determined by 'pruning' ("archival" if 'pruning' is "nothing",
# otherwise "operational")
vat-transcript-retention = "{{ .Swingset.VatTranscriptRetention }}"
# Archival of gzipped vat snapshots.
vat-snapshot-archive-dir = "{{ .Swingset.VatSnapshotArchiveDir }}"
# Archival of historical (i.e., closed) vat transcript spans to gzipped files.
vat-transcript-archive-dir = "{{ .Swingset.VatTranscriptArchiveDir }}"
`

// SwingsetConfig defines configuration for the SwingSet VM.
Expand Down Expand Up @@ -106,6 +114,13 @@ type SwingsetConfig struct {
// * "default": determined by `pruning` ("archival" if `pruning` is
// "nothing", otherwise "operational")
VatTranscriptRetention string `mapstructure:"vat-transcript-retention" json:"vatTranscriptRetention,omitempty"`

// VatSnapshotArchiveDir controls archival of gzipped vat snapshots.
VatSnapshotArchiveDir string `mapstructure:"vat-snapshot-archive-dir" json:"vatSnapshotArchiveDir,omitempty"`

// VatTranscriptArchiveDir controls archival of historical (i.e., closed) vat
// transcript spans to gzipped files.
VatTranscriptArchiveDir string `mapstructure:"vat-transcript-archive-dir" json:"vatTranscriptArchiveDir,omitempty"`
}

var DefaultSwingsetConfig = SwingsetConfig{
Expand Down Expand Up @@ -202,5 +217,17 @@ func SwingsetConfigFromViper(resolvedConfig servertypes.AppOptions) (*SwingsetCo
}
ssConfig.SlogFile = resolvedSlogFile

resolvedSnapshotDir, err := resolvePath(ssConfig.VatSnapshotArchiveDir, FlagVatSnapshotArchiveDir)
if err != nil {
return nil, err
}
ssConfig.VatSnapshotArchiveDir = resolvedSnapshotDir

resolvedTranscriptDir, err := resolvePath(ssConfig.VatTranscriptArchiveDir, FlagVatTranscriptArchiveDir)
if err != nil {
return nil, err
}
ssConfig.VatTranscriptArchiveDir = resolvedTranscriptDir

return ssConfig, nil
}
21 changes: 13 additions & 8 deletions packages/SwingSet/src/kernel/kernel.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import { assert, Fail } from '@endo/errors';
import { isNat } from '@endo/nat';
import { mustMatch, M } from '@endo/patterns';
import { importBundle } from '@endo/import-bundle';
import { objectMetaMap } from '@agoric/internal';
import { objectMetaMap, PromiseAllOrErrors } from '@agoric/internal';
import { makeUpgradeDisconnection } from '@agoric/internal/src/upgrade-api.js';
import { kser, kslot, makeError } from '@agoric/kmarshal';
import { assertKnownOptions } from '../lib/assertOptions.js';
Expand Down Expand Up @@ -261,8 +261,12 @@ export default function buildKernel(
*/
async function terminateVat(vatID, shouldReject, info) {
console.log(`kernel terminating vat ${vatID} (failure=${shouldReject})`);
let critical = false;
insistCapData(info);
// Note that it's important for much of this work to happen within the
// synchronous prelude. For details, see
// https://github.com/Agoric/agoric-sdk/pull/10055#discussion_r1754918394
let critical = false;
const deferred = [];
// ISSUE: terminate stuff in its own crank like creation?
// TODO: if a static vat terminates, panic the kernel?
// TODO: guard against somebody telling vatAdmin to kill a vat twice
Expand All @@ -287,7 +291,7 @@ export default function buildKernel(
// remove vatID from the list of live vats, and mark for deletion
kernelKeeper.deleteVatID(vatID);
kernelKeeper.markVatAsTerminated(vatID);
kernelKeeper.removeVatFromSwingStoreExports(vatID);
deferred.push(kernelKeeper.removeVatFromSwingStoreExports(vatID));
for (const kpid of deadPromises) {
resolveToError(kpid, makeError('vat terminated'), vatID);
}
Expand All @@ -302,9 +306,7 @@ export default function buildKernel(
// it's going to be a small cost compared to the trouble you're probably
// already in anyway if this happens.
panic(`critical vat ${vatID} failed`, Error(info.body));
return;
}
if (vatAdminRootKref) {
} else if (vatAdminRootKref) {
// static vat termination can happen before vat admin vat exists
notifyTermination(
vatID,
Expand All @@ -319,8 +321,11 @@ export default function buildKernel(
);
}

// worker needs to be stopped, if any
await vatWarehouse.stopWorker(vatID);
// worker, if present, needs to be stopped
// (note that this only applies to ephemeral vats)
deferred.push(vatWarehouse.stopWorker(vatID));

await PromiseAllOrErrors(deferred);
}

function notifyMeterThreshold(meterID) {
Expand Down
4 changes: 2 additions & 2 deletions packages/SwingSet/src/kernel/state/kernelKeeper.js
Original file line number Diff line number Diff line change
Expand Up @@ -930,13 +930,13 @@ export default function makeKernelKeeper(
kvStore.set(`${kernelSlot}.data.slots`, capdata.slots.join(','));
}

function removeVatFromSwingStoreExports(vatID) {
async function removeVatFromSwingStoreExports(vatID) {
// Delete primary swingstore records for this vat, in preparation
// for (slow) deletion. After this, swingstore exports will omit
// this vat. This is called from the kernel's terminateVat, which
// initiates (but does not complete) deletion.
snapStore.stopUsingLastSnapshot(vatID);
transcriptStore.stopUsingTranscript(vatID);
await transcriptStore.stopUsingTranscript(vatID);
}

/**
Expand Down
4 changes: 2 additions & 2 deletions packages/SwingSet/src/kernel/state/vatKeeper.js
Original file line number Diff line number Diff line change
Expand Up @@ -655,7 +655,7 @@ export function makeVatKeeper(
addToTranscript(makeSaveSnapshotItem(snapshotID));

// then start a new transcript span
transcriptStore.rolloverSpan(vatID);
await transcriptStore.rolloverSpan(vatID);

// then push a load-snapshot entry, so that the current span
// always starts with an initialize-worker or load-snapshot
Expand Down Expand Up @@ -715,7 +715,7 @@ export function makeVatKeeper(
return transcriptStore.deleteVatTranscripts(vatID, budget);
}

function beginNewIncarnation() {
async function beginNewIncarnation() {
if (snapStore) {
snapStore.stopUsingLastSnapshot(vatID);
}
Expand Down
3 changes: 3 additions & 0 deletions packages/SwingSet/src/kernel/vat-admin-hooks.js
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,9 @@ export function makeVatAdminHooks(tools) {
// we don't need to incrementRefCount because if terminateVat sends
// 'reason' to vat-admin, it uses notifyTermination / queueToKref /
// doSend, and doSend() does its own incref
// FIXME: This assumes that most work of terminateVat happens in the
// synchronous prelude, which should be made more obvious. For details,
// see https://github.com/Agoric/agoric-sdk/pull/10055#discussion_r1754918394
void terminateVat(vatID, true, marshalledReason);
// TODO: terminateVat is async, result doesn't fire until worker
// is dead. To fix this we'll probably need to move termination
Expand Down
3 changes: 3 additions & 0 deletions packages/SwingSet/test/snapshots/xsnap-store.test.js.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Generated by [AVA](https://avajs.dev).
> initial snapshot
{
archiveWriteSeconds: undefined,
compressSeconds: 0,
dbSaveSeconds: 0,
hash: 'bee3b82eebdde4c5c3774fb95b7efe88382f7dc4afab90b4e0e58add54d6b81c',
Expand All @@ -18,6 +19,7 @@ Generated by [AVA](https://avajs.dev).
> after SES boot - sensitive to SES-shim, XS, and supervisor
{
archiveWriteSeconds: undefined,
compressSeconds: 0,
dbSaveSeconds: 0,
hash: '5433501987ce52b3bd9ab47956195669c5adea89b050e8c787eb9da431ce1a6e',
Expand All @@ -27,6 +29,7 @@ Generated by [AVA](https://avajs.dev).
> after use of harden() - sensitive to SES-shim, XS, and supervisor
{
archiveWriteSeconds: undefined,
compressSeconds: 0,
dbSaveSeconds: 0,
hash: '4cdc352b710f0719bc6f541631315652b5da19093e18ce844ec274340a37efd5',
Expand Down
Binary file modified packages/SwingSet/test/snapshots/xsnap-store.test.js.snap
Binary file not shown.
53 changes: 38 additions & 15 deletions packages/cosmic-swingset/src/chain-main.js
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
// @ts-check

import { resolve as pathResolve } from 'path';
import path from 'node:path';
import v8 from 'node:v8';
import process from 'node:process';
import fs from 'node:fs';
import fsPromises from 'node:fs/promises';
import { performance } from 'perf_hooks';
import { resolve as importMetaResolve } from 'import-meta-resolve';
import tmpfs from 'tmp';
import { performance } from 'node:perf_hooks';
import { fork } from 'node:child_process';
import { resolve as importMetaResolve } from 'import-meta-resolve';
import tmp from 'tmp';

import { Fail, q } from '@endo/errors';
import { E } from '@endo/far';
Expand All @@ -33,6 +33,10 @@ import { makeShutdown } from '@agoric/internal/src/node/shutdown.js';
import * as STORAGE_PATH from '@agoric/internal/src/chain-storage-paths.js';
import * as ActionType from '@agoric/internal/src/action-types.js';
import { BridgeId, CosmosInitKeyToBridgeId } from '@agoric/internal';
import {
makeArchiveSnapshot,
makeArchiveTranscript,
} from '@agoric/swing-store';
import {
makeBufferedStorage,
makeReadCachingStorage,
Expand Down Expand Up @@ -76,6 +80,8 @@ const toNumber = specimen => {
* @property {number} [maxVatsOnline]
* @property {'debug' | 'operational'} [vatSnapshotRetention]
* @property {'archival' | 'operational'} [vatTranscriptRetention]
* @property {string} [vatSnapshotArchiveDir]
* @property {string} [vatTranscriptArchiveDir]
*/
const SwingsetConfigShape = M.splitRecord(
// All known properties are optional, but unknown properties are not allowed.
Expand All @@ -85,6 +91,8 @@ const SwingsetConfigShape = M.splitRecord(
maxVatsOnline: M.number(),
vatSnapshotRetention: M.or('debug', 'operational'),
vatTranscriptRetention: M.or('archival', 'operational'),
vatSnapshotArchiveDir: M.string(),
vatTranscriptArchiveDir: M.string(),
},
{},
);
Expand Down Expand Up @@ -159,8 +167,8 @@ const makePrefixedBridgeStorage = (
return fromBridgeStringValue(ret);
},
set: (key, value) => {
const path = `${prefix}${key}`;
const entry = [path, toBridgeStringValue(value)];
const fullPath = `${prefix}${key}`;
const entry = [fullPath, toBridgeStringValue(value)];
call(
stringify({
method: setterMethod,
Expand All @@ -169,8 +177,8 @@ const makePrefixedBridgeStorage = (
);
},
delete: key => {
const path = `${prefix}${key}`;
const entry = [path];
const fullPath = `${prefix}${key}`;
const entry = [fullPath];
call(
stringify({
method: setterMethod,
Expand Down Expand Up @@ -315,8 +323,13 @@ export default async function main(progname, args, { env, homedir, agcc }) {
/** @type {CosmosSwingsetConfig} */
const swingsetConfig = harden(initAction.resolvedConfig || {});
validateSwingsetConfig(swingsetConfig);
const { slogfile, vatSnapshotRetention, vatTranscriptRetention } =
swingsetConfig;
const {
slogfile,
vatSnapshotRetention,
vatTranscriptRetention,
vatSnapshotArchiveDir,
vatTranscriptArchiveDir,
} = swingsetConfig;
const keepSnapshots = vatSnapshotRetention
? vatSnapshotRetention !== 'operational'
: ['1', 'true'].includes(XSNAP_KEEP_SNAPSHOTS);
Expand Down Expand Up @@ -471,14 +484,14 @@ export default async function main(progname, args, { env, homedir, agcc }) {
const swingStoreTraceFile = processValue.getPath({
envName: 'SWING_STORE_TRACE',
flagName: 'trace-store',
trueValue: pathResolve(stateDBDir, 'store-trace.log'),
trueValue: path.resolve(stateDBDir, 'store-trace.log'),
});

const nodeHeapSnapshots = Number.parseInt(NODE_HEAP_SNAPSHOTS, 10);

let lastCommitTime = 0;
let commitCallsSinceLastSnapshot = NaN;
const snapshotBaseDir = pathResolve(stateDBDir, 'node-heap-snapshots');
const snapshotBaseDir = path.resolve(stateDBDir, 'node-heap-snapshots');

if (nodeHeapSnapshots >= 0) {
fs.mkdirSync(snapshotBaseDir, { recursive: true });
Expand Down Expand Up @@ -514,7 +527,7 @@ export default async function main(progname, args, { env, homedir, agcc }) {
) {
commitCallsSinceLastSnapshot = 0;
heapSnapshot = `Heap-${process.pid}-${Date.now()}.heapsnapshot`;
const snapshotPath = pathResolve(snapshotBaseDir, heapSnapshot);
const snapshotPath = path.resolve(snapshotBaseDir, heapSnapshot);
v8.writeHeapSnapshot(snapshotPath);
heapSnapshotTime = performance.now() - t3;
}
Expand All @@ -537,6 +550,14 @@ export default async function main(progname, args, { env, homedir, agcc }) {
}
};

const fsPowers = { fs, path, tmp };
const archiveSnapshot = vatSnapshotArchiveDir
? makeArchiveSnapshot(vatSnapshotArchiveDir, fsPowers)
: undefined;
const archiveTranscript = vatTranscriptArchiveDir
? makeArchiveTranscript(vatTranscriptArchiveDir, fsPowers)
: undefined;

const s = await launch({
actionQueueStorage,
highPriorityQueueStorage,
Expand All @@ -556,6 +577,8 @@ export default async function main(progname, args, { env, homedir, agcc }) {
swingStoreTraceFile,
keepSnapshots,
keepTranscripts,
archiveSnapshot,
archiveTranscript,
afterCommitCallback,
swingsetConfig,
});
Expand Down Expand Up @@ -608,7 +631,7 @@ export default async function main(progname, args, { env, homedir, agcc }) {
);
return performStateSyncImport(options, {
fs: { ...fs, ...fsPromises },
pathResolve,
pathResolve: path.resolve,
log: null,
});
}
Expand All @@ -632,7 +655,7 @@ export default async function main(progname, args, { env, homedir, agcc }) {
stateSyncExport = exportData;

await new Promise((resolve, reject) => {
tmpfs.dir(
tmp.dir(
{
prefix: `agd-state-sync-${blockHeight}-`,
unsafeCleanup: true,
Expand Down
4 changes: 4 additions & 0 deletions packages/cosmic-swingset/src/launch-chain.js
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,8 @@ export async function launch({
swingStoreExportCallback,
keepSnapshots,
keepTranscripts,
archiveSnapshot,
archiveTranscript,
afterCommitCallback = async () => ({}),
swingsetConfig,
}) {
Expand Down Expand Up @@ -375,6 +377,8 @@ export async function launch({
exportCallback: swingStoreExportSyncCallback,
keepSnapshots,
keepTranscripts,
archiveSnapshot,
archiveTranscript,
});
const { kvStore, commit } = hostStorage;

Expand Down
Loading

0 comments on commit 07f6885

Please sign in to comment.