neondatabase · hlinnaka · Feb 2, 2021 · hlinnaka · Mar 2, 2021 · hlinnaka
diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile
@@ -26,12 +26,14 @@ OBJS = \
 	twophase.o \
 	twophase_rmgr.o \
 	varsup.o \
+	walinsertlock.o \
 	xact.o \
 	xlog.o \
 	xlogarchive.o \
 	xlogfuncs.o \
 	xloginsert.o \
 	xlogreader.o \
+	xlogrecord.o \
 	xlogutils.o
 
 include $(top_srcdir)/src/backend/common.mk

diff --git a/src/backend/access/transam/walinsertlock.c b/src/backend/access/transam/walinsertlock.c
@@ -0,0 +1,132 @@
+#include "postgres.h"			/* first include of a backend .c file; pulls in c.h */
+#include "walinsertlock.h"
+#include "storage/proc.h"		/* for MyProc */
+
+/* private copy of XLogCtl->Insert.WALInsertLocks; not assigned in walinsertlock.c */
+WALInsertLockPadded *WALInsertLocks = NULL;
+
+/* Index of the lock taken by WALInsertLockAcquire, and the "all locks held" flag */
+int	MyLockNo = 0;
+bool holdingAllLocks = false;
+
+/*
+ * Grab one of the WAL insertion locks in exclusive mode, so that the caller
+ * can insert to WAL.  The index of the lock taken is recorded in MyLockNo.
+ */
+void
+WALInsertLockAcquire(void)
+{
+	/*
+	 * Remembered across calls: the lock this backend will go for next, or -1
+	 * if none has been chosen yet.  It doesn't matter which insertion lock
+	 * we take, but sticking with the one used last time avoids bouncing
+	 * cache lines between processes when there's no contention, and unless
+	 * the system is particularly busy that lock is probably still available.
+	 */
+	static int	nextLock = -1;
+
+	/*
+	 * First time through in this backend: pick a lock (semi-)randomly, based
+	 * on MyProc->pgprocno.  This allows the locks to be used evenly if you
+	 * have a lot of very short connections.
+	 */
+	if (nextLock == -1)
+		nextLock = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS;
+
+	MyLockNo = nextLock;
+
+	/*
+	 * LWLockAcquire() tells us whether we got the lock immediately.  If so,
+	 * we're done and will try the same lock again next time.  (insertingAt
+	 * is still 0 at this point, as we don't know our insert location yet.)
+	 */
+	if (LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE))
+		return;
+
+	/*
+	 * We had to wait, so try another lock next time.  On a system with more
+	 * insertion locks than concurrent inserters, this causes all the
+	 * inserters to eventually migrate to a lock that no-one else is using;
+	 * with more inserters than locks, it still helps to distribute the
+	 * inserters evenly across the locks.
+	 */
+	nextLock = (nextLock + 1) % NUM_XLOGINSERT_LOCKS;
+}
+
+/*
+ * Take every WAL insertion lock, in ascending index order, which keeps all
+ * other backends from inserting to WAL until WALInsertLockRelease().
+ */
+void
+WALInsertLockAcquireExclusive(void)
+{
+	const int	lastLock = NUM_XLOGINSERT_LOCKS - 1;
+	int			lockno;
+
+	/*
+	 * All locks but the last get insertingAt = PG_UINT64_MAX, higher than
+	 * any real XLogRecPtr value, so that no-one blocks waiting on those.
+	 */
+	for (lockno = 0; lockno < lastLock; lockno++)
+	{
+		WALInsertLock *slot = &WALInsertLocks[lockno].l;
+
+		LWLockAcquire(&slot->lock, LW_EXCLUSIVE);
+		LWLockUpdateVar(&slot->lock, &slot->insertingAt, PG_UINT64_MAX);
+	}
+	/* The last lock's insertingAt is not set here; it is reset to 0 at release */
+	LWLockAcquire(&WALInsertLocks[lastLock].l.lock, LW_EXCLUSIVE);
+
+	holdingAllLocks = true;
+}
+
+/*
+ * Give back what WALInsertLockAcquire() or WALInsertLockAcquireExclusive()
+ * took: the single lock MyLockNo, or every lock if holdingAllLocks is set.
+ *
+ * NB: each released lock's insertingAt is reset to 0, so that it causes
+ * LWLockWaitForVar to block the next time the lock is acquired.
+ */
+void
+WALInsertLockRelease(void)
+{
+	int			lockno;
+
+	/* Holding just one lock?  Then MyLockNo says which. */
+	if (!holdingAllLocks)
+	{
+		WALInsertLock *mine = &WALInsertLocks[MyLockNo].l;
+
+		LWLockReleaseClearVar(&mine->lock, &mine->insertingAt, 0);
+		return;
+	}
+
+	/* We hold them all: release one by one, then drop the flag */
+	for (lockno = 0; lockno < NUM_XLOGINSERT_LOCKS; lockno++)
+		LWLockReleaseClearVar(&WALInsertLocks[lockno].l.lock,
+							  &WALInsertLocks[lockno].l.insertingAt, 0);
+	holdingAllLocks = false;
+}
+
+/*
+ * Publish how far our insertion has got: store insertingAt in the
+ * indicator of the lock we hold, to let others know that we've finished
+ * inserting up to that point.
+ */
+void
+WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
+{
+	WALInsertLock *slot;
+
+	/*
+	 * While holding all the locks, the last lock carries our actual
+	 * position (see comments in WALInsertLockAcquireExclusive); otherwise
+	 * it is the one lock we acquired, MyLockNo.
+	 */
+	if (holdingAllLocks)
+		slot = &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l;
+	else
+		slot = &WALInsertLocks[MyLockNo].l;
+
+	LWLockUpdateVar(&slot->lock, &slot->insertingAt, insertingAt);
+}
diff --git a/src/backend/access/transam/walinsertlock.h b/src/backend/access/transam/walinsertlock.h
@@ -0,0 +1,80 @@
+#ifndef WAL_INSERT_LOCK_H
+#define WAL_INSERT_LOCK_H
+
+#include "postgres.h"
+
+#include "access/xlogdefs.h"
+#include "storage/lwlock.h"
+
+/*
+ * Inserting to WAL is protected by a small fixed number of WAL insertion
+ * locks. To insert to the WAL, you must hold one of the locks - it doesn't
+ * matter which one. To lock out other concurrent insertions, you must hold
+ * all of them. Each WAL insertion lock consists of a lightweight lock, plus
+ * an indicator of how far the insertion has progressed (insertingAt).
+ *
+ * The insertingAt values are read when a process wants to flush WAL from
+ * the in-memory buffers to disk, to check that all the insertions to the
+ * region the process is about to write out have finished. You could simply
+ * wait for all currently in-progress insertions to finish, but the
+ * insertingAt indicator allows you to ignore insertions to later in the WAL,
+ * so that you only wait for the insertions that are modifying the buffers
+ * you're about to write out.
+ *
+ * This isn't just an optimization. If all the WAL buffers are dirty, an
+ * inserter that's holding a WAL insert lock might need to evict an old WAL
+ * buffer, which requires flushing the WAL. If it's possible for an inserter
+ * to block on another inserter unnecessarily, deadlock can arise when two
+ * inserters holding a WAL insert lock wait for each other to finish their
+ * insertion.
+ *
+ * Small WAL records that don't cross a page boundary never update the value,
+ * the WAL record is just copied to the page and the lock is released. But
+ * to avoid the deadlock-scenario explained above, the indicator is always
+ * updated before sleeping while holding an insertion lock.
+ *
+ * lastImportantAt contains the LSN of the last important WAL record inserted
+ * using a given lock. This value is used to detect whether there has been
+ * important WAL activity since the last time some action, like a checkpoint,
+ * was performed, so that the action can be skipped if there was none. The
+ * LSN is updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag
+ * was set. lastImportantAt is never cleared, only overwritten by the LSN of
+ * newer records.  Tracking the WAL activity directly in WALInsertLock has
+ * the advantage of not needing any additional locks to update the value.
+ */
+typedef struct
+{
+	LWLock		lock;			/* the lightweight lock itself */
+	XLogRecPtr	insertingAt;	/* how far the holder's insertion has progressed */
+	XLogRecPtr	lastImportantAt;	/* LSN of last important record inserted with this lock */
+} WALInsertLock;
+
+/*
+ * All the WAL insertion locks are allocated as an array in shared memory. We
+ * force the array stride to be a power of 2, which saves a few cycles in
+ * indexing, but more importantly also ensures that individual slots don't
+ * cross cache line boundaries. (Of course, we have to also ensure that the
+ * array start address is suitably aligned.)
+ */
+typedef union WALInsertLockPadded
+{
+	WALInsertLock l;			/* the lock and its indicators */
+	char		pad[PG_CACHE_LINE_SIZE];	/* makes the union at least PG_CACHE_LINE_SIZE bytes */
+} WALInsertLockPadded;
+
+/*
+ * Number of WAL insertion locks to use. A higher value allows more insertions
+ * to happen concurrently, but adds some CPU overhead to flushing the WAL,
+ * which needs to iterate all the locks.
+ */
+#define NUM_XLOGINSERT_LOCKS  8
+
+void WALInsertLockAcquire(void);			/* take one insertion lock */
+void WALInsertLockAcquireExclusive(void);	/* take all insertion locks */
+void WALInsertLockRelease(void);			/* release the lock(s) taken above */
+void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);	/* publish insert progress */
+
+extern int	MyLockNo;			/* index of the lock taken by WALInsertLockAcquire() */
+extern bool holdingAllLocks;	/* set by WALInsertLockAcquireExclusive(), cleared on release */
+
+#endif /* WAL_INSERT_LOCK_H */