From 1957eca3b605ab3cced343bf338a11a5f99f1449 Mon Sep 17 00:00:00 2001
From: Peter van der Zee
Date: Tue, 18 Feb 2020 15:49:47 +0100
Subject: [PATCH] fix(gatsby): Chunk nodes when serializing redux to prevent OOM

We are using `v8.serialize` to write and read the redux state, which is
faster than going through `JSON.stringify` / `JSON.parse`. Unfortunately,
as reported in #17233, this can lead to a fatal error when the redux state
is too big to be serialized into a single Buffer (hard max of 2GB). We also
hit this problem on large sites, for example one with a million small md
pages.

The solution is to shard the `nodes` property, which holds all the page
data.

In this change I've added a simple heuristic to determine the max chunk
size (mind you, before this change it was effectively `Infinity`). It
serializes about 11 individual nodes, measures their size, and based on
the biggest node determines how many nodes would fit in 1.5GB.

The serialization process is updated to no longer put the `nodes` in the
main redux file, but rather to shard them over a set of dedicated files.
When reading the state from cache, these files are all read and their
contents are put back together in a single Map. If there are no node
files, this step does nothing, so the change is backwards compatible.
---
 packages/gatsby/src/redux/persist.ts | 75 ++++++++++++++++++++++++++--
 1 file changed, 70 insertions(+), 5 deletions(-)

diff --git a/packages/gatsby/src/redux/persist.ts b/packages/gatsby/src/redux/persist.ts
index bf6b6902e06f1..da1bebffcb831 100644
--- a/packages/gatsby/src/redux/persist.ts
+++ b/packages/gatsby/src/redux/persist.ts
@@ -1,12 +1,77 @@
 import v8 from "v8"
-import { readFileSync, writeFileSync } from "fs-extra"
-import { ICachedReduxState } from "./types"
+import { readFileSync, writeFileSync, unlinkSync } from "fs-extra"
+import { IReduxNode, ICachedReduxState } from "./types"
+import { sync as globSync } from "glob"
 
-const file = (): string => `${process.cwd()}/.cache/redux.state`
+const CWD = process.cwd()
+const file = (): string => CWD + `/.cache/redux.state`
+const chunkFilePrefix = (): string => CWD + `/.cache/redux.node.state_`
 
-export const readFromCache = (): ICachedReduxState =>
-  v8.deserialize(readFileSync(file()))
+export const readFromCache = (): ICachedReduxState => {
+  // The cache is stored in two parts: the nodes in chunks, and the rest.
+  // First we revive the rest, then we inject the nodes into that object (if any).
+  // Each chunk is stored in its own file; this circumvents the max Buffer length
+  // for sites with a _lot_ of content. Since all nodes end up in a Map, the order
+  // in which the chunks are read does not matter.
+
+  const obj: ICachedReduxState = v8.deserialize(readFileSync(file()))
+
+  // Note: at 1M pages this will be 1M/chunkSize chunks (e.g. 1M/10k = 100)
+  const chunks = globSync(chunkFilePrefix() + "*").map(file =>
+    v8.deserialize(readFileSync(file))
+  )
+
+  const nodes: [string, IReduxNode][] = [].concat(...chunks)
+
+  if (chunks.length) {
+    obj.nodes = new Map(nodes)
+  }
+
+  return obj
+}
 
 export const writeToCache = (contents: ICachedReduxState): void => {
+  // Remove the old node files first; we may have fewer nodes than before and
+  // must make sure that no excess files are kept around.
+  globSync(chunkFilePrefix() + "*").forEach(file => unlinkSync(file))
+
+  // Temporarily save the nodes and remove them from the main redux store.
+  // This prevents an OOM when the page nodes collectively contain too much data.
+  const map = contents.nodes
+  contents.nodes = undefined
   writeFileSync(file(), v8.serialize(contents))
+  // Now restore them on the redux store
+  contents.nodes = map
+
+  if (map) {
+    // Now store the nodes separately, with the chunk size determined by a heuristic
+    const values: [string, IReduxNode][] = [...map.entries()]
+    const chunkSize = guessSafeChunkSize(values)
+    const chunks = Math.ceil(values.length / chunkSize)
+
+    for (let i = 0; i < chunks; ++i) {
+      writeFileSync(
+        `${chunkFilePrefix()}${i}`,
+        v8.serialize(values.slice(i * chunkSize, i * chunkSize + chunkSize))
+      )
+    }
+  }
 }
+
+function guessSafeChunkSize(values: [string, IReduxNode][]): number {
+  const valueCount = values.length
+
+  // Sample roughly 11 evenly spaced nodes and measure their serialized size,
+  // then pick a chunk size ceiling based on the worst case. This attempts to
+  // prevent small sites with very large pages from OOMing.
+  let maxSize = 0
+  for (let i = 0; i < valueCount; i += Math.max(1, Math.floor(valueCount / 11))) {
+    const size = v8.serialize(values[i]).length
+    maxSize = Math.max(size, maxSize)
+  }
+
+  // Max size of a Buffer is 2GB (yeah, we're assuming a 64bit system)
+  // https://stackoverflow.com/questions/8974375/whats-the-maximum-size-of-a-node-js-buffer
+  // Use 1.5GB as the target ceiling, allowing for some margin of error
+  return Math.floor((1.5 * 1024 * 1024 * 1024) / maxSize)
+}
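Note: the sketch below is not part of the patch. It is a minimal, standalone TypeScript illustration of the same chunked round-trip, using only Node built-ins (`fs`, `v8`, `os`, `path`) instead of `glob`/`fs-extra`. The directory, file prefix, and helper names are hypothetical and chosen only for this example.

import v8 from "v8"
import fs from "fs"
import os from "os"
import path from "path"

// Hypothetical location and prefix; the real patch writes `.cache/redux.node.state_<n>` files.
const dir = fs.mkdtempSync(path.join(os.tmpdir(), "redux-chunks-"))
const prefix = "node.state_"

function writeChunked(nodes: Map<string, unknown>, chunkSize: number): void {
  const entries = [...nodes.entries()]
  for (let i = 0; i * chunkSize < entries.length; ++i) {
    // Each chunk is serialized into its own Buffer/file, keeping every Buffer
    // well below the 2GB cap instead of serializing the whole Map at once.
    const slice = entries.slice(i * chunkSize, (i + 1) * chunkSize)
    fs.writeFileSync(path.join(dir, `${prefix}${i}`), v8.serialize(slice))
  }
}

function readChunked(): Map<string, unknown> {
  // The order of the chunk files does not matter; every entry lands back in one Map.
  const chunks = fs
    .readdirSync(dir)
    .filter(name => name.startsWith(prefix))
    .map(name => v8.deserialize(fs.readFileSync(path.join(dir, name))))
  const entries: Array<[string, unknown]> = ([] as Array<[string, unknown]>).concat(...chunks)
  return new Map(entries)
}

// Round-trip check: 3 entries written in chunks of 2, read back into one Map.
const input = new Map<string, unknown>([
  ["a", { id: "a" }],
  ["b", { id: "b" }],
  ["c", { id: "c" }],
])
writeChunked(input, 2)
console.log(readChunked().size) // 3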