Fix initialization of the WAL buffer at startup

Heikki Linnakangas · Heikki Linnakangas · commit 63f418221770 · 2025-09-16T18:34:14.000+03:00
There were two bugs: 1. We initialized the page headers on the allocated temporary WAL page even if the end-of-log was precisely at a page boundary. That means we wrote beyond the end of the allocation. That readily gives an assertion failure on debug-enabled builds. Add test case and fix. 2. We initialize the WAL buffer by copying the contents of the WAL read buffer. The idea is that when we stop reading WAL, the last buffer in the reader becomes the new WAL buffer we'll write to. That's how PostgreSQL does it too, see code around comment "Tricky point here" in StartupXLOG(). However, that doesn't work with Neon. The startup procedure is a little different: we don't do normal WAL recovery and we don't read any WAL at startup, except when promoting a read replica. Vanilla PostgreSQL always reads WAL: it reads the last checkpoint record from the WAL if nothing else, but in Neon we don't necessarily read even that. In that case, the xlogreader's read buffer is still uninitialized by the time that we copy it. That's relatively harmless: the only consequence is that the initial WAL segment on local disk can contain garbage before the first WAL record that we write. That's why we haven't noticed until now. Furthermore, it seems that the uninitialized memory just happens to be all-zeros. However, it now caused the test_pg_waldump.py test to fail with the new communicator implementation. That was very coincidental - the new communicator process isn't even running yet when the WAL buffer is initialized. It seems to have changed the memory allocation in such a way that the uninitialized memory is no longer all-zeros. That's normally harmless too, but it makes the pg_waldump test fail: pg_waldump, with the --ignore option, starts reading the WAL from the first non-zero bytes, so when the uninitialized portion was filled with garbage rather than zeros, it fails. 
This little patch to poison the allocated buffer with garbage was helpful while debugging, to make the test fail in a repeatable fashion with or without the new communicator:
```
diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c
index 988be3f..2f4844c2b86 100644
--- a/src/backend/access/transam/xlogreader.c
+++ b/src/backend/access/transam/xlogreader.c
@@ -97,6 +97,7 @@ XLogReaderAllocate(int wal_segment_size, const char *waldir,
 */
 state->readBuf = (char *) palloc_extended(XLOG_BLCKSZ, MCXT_ALLOC_NO_OOM);
+ memset(state->readBuf, 0x7e, XLOG_BLCKSZ);
 if (!state->readBuf)
 {
 pfree(state);
```
diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
@@ -1570,6 +1570,16 @@ FinishWalRecovery(void)
 	Assert(!WalRcvStreaming());
 	StandbyMode = false;
 
+	/*
+	 * We cannot start generating new WAL if we don't have a valid prev-LSN
+	 * to use for the first new WAL record. (Shouldn't happen.)
+	 */
+	if (NeonRecoveryRequested &&!neonWriteOk)
+		ereport(ERROR,
+				(errmsg("cannot start in read-write mode from this base backup")));
+
+	// FIXME: should we unlink neon.signal?
+
 	/*
 	 * Determine where to start writing WAL next.
 	 *
@@ -1633,83 +1643,68 @@ FinishWalRecovery(void)
 		}
 	}
 
-	/*
-	 * When starting from a neon base backup, we don't have WAL. Initialize
-	 * the WAL page where we will start writing new records from scratch,
-	 * instead.
-	 */
-	if (NeonRecoveryRequested)
-	{
-		if (!neonWriteOk)
-		{
-			/*
-			 * We cannot start generating new WAL if we don't have a valid prev-LSN
-			 * to use for the first new WAL record. (Shouldn't happen.)
-			 */
-			ereport(ERROR,
-					(errmsg("cannot start in read-write mode from this base backup")));
-		}
-		else
-		{
-			int			offs = endOfLog % XLOG_BLCKSZ;
-			XLogRecPtr	pageBeginPtr = endOfLog - offs;
-			bool		isLongHeader = (pageBeginPtr % wal_segment_size) == 0;
-			int			lastPageSize = isLongHeader ? SizeOfXLogLongPHD : SizeOfXLogShortPHD;
-			char	   *page = palloc0(offs);
-			XLogPageHeader xlogPageHdr = (XLogPageHeader) page;
-
-			memcpy(page, xlogreader->readBuf, offs);
-			if (xlogPageHdr->xlp_magic != XLOG_PAGE_MAGIC)
-			{
-				xlogPageHdr->xlp_pageaddr = pageBeginPtr;
-				xlogPageHdr->xlp_magic = XLOG_PAGE_MAGIC;
-				xlogPageHdr->xlp_tli = recoveryTargetTLI;
-				xlogPageHdr->xlp_info = 0;
-				/*
-				 * If we start writing with offset from page beginning, pretend in
-				 * page header there is a record ending where actual data will
-				 * start.
-				 */
-				xlogPageHdr->xlp_rem_len = offs - lastPageSize;
-				if (xlogPageHdr->xlp_rem_len > 0)
-					xlogPageHdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
-				readOff = XLogSegmentOffset(pageBeginPtr, wal_segment_size);
-
-				if (isLongHeader)
-				{
-					XLogLongPageHeader longHdr = (XLogLongPageHeader) page;
-
-					longHdr->xlp_sysid = GetSystemIdentifier();
-					longHdr->xlp_seg_size = wal_segment_size;
-					longHdr->xlp_xlog_blcksz = XLOG_BLCKSZ;
+	elog(LOG, "Continue writing WAL at %X/%X", LSN_FORMAT_ARGS(endOfLog));
 
-					xlogPageHdr->xlp_info |= XLP_LONG_HEADER;
-				}
-			}
-			result->lastPageBeginPtr = pageBeginPtr;
-			result->lastPage = page;
-			elog(LOG, "Continue writing WAL at %X/%X", LSN_FORMAT_ARGS(xlogreader->EndRecPtr));
-
-			// FIXME: should we unlink neon.signal?
-		}
-	}
 	/*
 	 * Copy the last partial block to the caller, for initializing the WAL
 	 * buffer for appending new WAL.
 	 */
-	else if (endOfLog % XLOG_BLCKSZ != 0)
+	if (endOfLog % XLOG_BLCKSZ != 0)
 	{
 		char	   *page;
 		int			len;
 		XLogRecPtr	pageBeginPtr;
 
 		pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
-		Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
 
 		/* Copy the valid part of the last block */
 		len = endOfLog % XLOG_BLCKSZ;
 		page = palloc(len);
-		memcpy(page, xlogreader->readBuf, len);
+
+		/*
+		 * With neon, it's possible that we start without having read any WAL
+		 * whatsoever. In that case, initialize the WAL page where we will
+		 * start writing new records from scratch, instead.
+		 */
+		if (NeonRecoveryRequested && endOfLog == RedoStartLSN)
+		{
+			bool		isLongHeader = (pageBeginPtr % wal_segment_size) == 0;
+			int			lastPageSize = isLongHeader ? SizeOfXLogLongPHD : SizeOfXLogShortPHD;
+			XLogPageHeader xlogPageHdr = (XLogPageHeader) page;
+
+			Assert(len >= lastPageSize);
+
+			xlogPageHdr->xlp_pageaddr = pageBeginPtr;
+			xlogPageHdr->xlp_magic = XLOG_PAGE_MAGIC;
+			xlogPageHdr->xlp_tli = recoveryTargetTLI;
+			xlogPageHdr->xlp_info = 0;
+			/*
+			 * If we start writing with offset from page beginning, pretend in
+			 * page header there is a record ending where actual data will
+			 * start.
+			 */
+			xlogPageHdr->xlp_rem_len = len - lastPageSize;
+			if (xlogPageHdr->xlp_rem_len > 0)
+				xlogPageHdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
+			readOff = XLogSegmentOffset(pageBeginPtr, wal_segment_size);
+
+			if (isLongHeader)
+			{
+				XLogLongPageHeader longHdr = (XLogLongPageHeader) page;
+
+				longHdr->xlp_sysid = GetSystemIdentifier();
+				longHdr->xlp_seg_size = wal_segment_size;
+				longHdr->xlp_xlog_blcksz = XLOG_BLCKSZ;
+
+				xlogPageHdr->xlp_info |= XLP_LONG_HEADER;
+			}
+		}
+		else
+		{
+			Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
+
+			memcpy(page, xlogreader->readBuf, len);
+		}
 
 		result->lastPageBeginPtr = pageBeginPtr;
 		result->lastPage = page;