diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index cb3b38113c3..fc3e8076a93 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -8945,6 +8945,17 @@ heap_xlog_visible(XLogReaderState *record)
 
         PageSetAllVisible(page);
 
+        /*
+         * NEON: despite the comment above, we need to update the page LSN here.
+         * See the discussion on hackers: https://www.postgresql.org/message-id/flat/039076d4f6cdd871691686361f83cb8a6913a86a.camel%40j-davis.com#101ba42b004f9988e3d54fce26fb3462
+         * For Neon this assignment is critical: otherwise the last written LSN tracked at the compute node
+         * does not match the page LSN assigned by WAL redo, and as a result the prefetched page is rejected.
+         *
+         * It is fixed upstream in https://github.com/neondatabase/postgres/commit/7bf713dd2d0739fbcd4103971ed69c17ebe677ea
+         * but until that is merged we still need to carry this patch here.
+         */
+        PageSetLSN(page, lsn);
+
         MarkBufferDirty(buffer);
     }
     else if (action == BLK_RESTORED)
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index d414c3f90be..8c1503cc7fb 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -211,7 +211,7 @@ typedef struct LastWrittenLsnCacheEntry
 
 /*
- * Cache of last written LSN for each relation chunk (hash bucket).
+ * Cache of last written LSN for each relation page.
  * Also to provide request LSN for smgrnblocks, smgrexists there is pseudokey=InvalidBlockId which stores LSN of last
  * relation metadata update.
  * Size of the cache is limited by GUC variable lastWrittenLsnCacheSize ("lsn_cache_size"),
@@ -606,8 +606,6 @@ static WALInsertLockPadded *WALInsertLocks = NULL;
  */
 static ControlFileData *ControlFile = NULL;
 
-#define LAST_WRITTEN_LSN_CACHE_BUCKET 1024 /* blocks = 8Mb */
-
 /*
  * Calculate the amount of space left on the page after 'endptr'. Beware
  * multiple evaluation!
@@ -6100,7 +6098,7 @@ GetInsertRecPtr(void)
  * It returns an upper bound for the last written LSN of a given page,
  * either from a cached last written LSN or a global maximum last written LSN.
  * If rnode is InvalidOid then we calculate maximum among all cached LSN and maxLastWrittenLsn.
- * If cache is large enough ,iterting through all hash items may be rather expensive.
+ * If cache is large enough, iterating through all hash items may be rather expensive.
  * But GetLastWrittenLSN(InvalidOid) is used only by zenith_dbsize which is not performance critical.
  */
 XLogRecPtr
@@ -6119,7 +6117,7 @@ GetLastWrittenLSN(RelFileNode rnode, ForkNumber forknum, BlockNumber blkno)
         BufferTag key;
         key.rnode = rnode;
         key.forkNum = forknum;
-        key.blockNum = blkno / LAST_WRITTEN_LSN_CACHE_BUCKET;
+        key.blockNum = blkno;
         entry = hash_search(lastWrittenLsnCache, &key, HASH_FIND, NULL);
         if (entry != NULL)
             lsn = entry->lsn;
@@ -6143,9 +6141,9 @@ GetLastWrittenLSN(RelFileNode rnode, ForkNumber forknum, BlockNumber blkno)
 /*
  * SetLastWrittenLSNForBlockRange -- Set maximal LSN of written page range.
  * We maintain cache of last written LSNs with limited size and LRU replacement
- * policy. To reduce cache size we store max LSN not for each page, but for
- * bucket (1024 blocks). This cache allows to use old LSN when
- * requesting pages of unchanged or appended relations.
+ * policy. Keeping the last written LSN for each page allows an old LSN to be used when
+ * requesting pages of unchanged or appended relations. It is also critical for
+ * efficient prefetching during massive update operations (such as vacuum or delete).
  *
  * rnode.relNode can be InvalidOid, in this case maxLastWrittenLsn is updated.
  * SetLastWrittenLsn with dummy rnode is used by createdb and dbase_redo functions.
@@ -6167,19 +6165,13 @@ SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileNode rnode, ForkNumber for
     LastWrittenLsnCacheEntry* entry;
     BufferTag key;
     bool found;
-    BlockNumber bucket;
-    BlockNumber start_bucket; /* inclusive */
-    BlockNumber end_bucket; /* exclusive */
-
-    start_bucket = from / LAST_WRITTEN_LSN_CACHE_BUCKET;
-    end_bucket = from == REL_METADATA_PSEUDO_BLOCKNO
-        ? start_bucket + 1 : (from + n_blocks + LAST_WRITTEN_LSN_CACHE_BUCKET - 1) / LAST_WRITTEN_LSN_CACHE_BUCKET;
+    BlockNumber i;
 
     key.rnode = rnode;
     key.forkNum = forknum;
-    for (bucket = start_bucket; bucket < end_bucket; bucket++)
+    for (i = 0; i < n_blocks; i++)
     {
-        key.blockNum = bucket;
+        key.blockNum = from + i;
         entry = hash_search(lastWrittenLsnCache, &key, HASH_ENTER, &found);
         if (found)
         {
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index d33b526ef0d..367d6718805 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -2452,11 +2452,11 @@ static struct config_int ConfigureNamesInt[] =
 
     {
         {"lsn_cache_size", PGC_POSTMASTER, UNGROUPED,
-            gettext_noop("Size of las written LSN cache used by Neon."),
+            gettext_noop("Size of last written LSN cache used by Neon."),
             NULL
         },
         &lastWrittenLsnCacheSize,
-        1024, 10, 1000000, /* 1024 is enough to hold 10GB database with 8Mb bucket */
+        128*1024, 1024, INT_MAX,
         NULL, NULL, NULL
     },
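Not part of the patch, but to make the key change concrete: the sketch below is a small, self-contained C program (illustration only, with arbitrary example block numbers) that contrasts the removed bucketed key computation, blkno / LAST_WRITTEN_LSN_CACHE_BUCKET, where one cache entry covers 1024 blocks (8MB), with the new per-page key, which is simply blkno. Under the old scheme every page in a bucket inherited the bucket's maximal LSN; with per-page keys each page keeps its own last written LSN, which is what the prefetch path relies on.

#include <stdio.h>
#include <stdint.h>

/* Old scheme, from the removed #define: 1024 blocks (8MB) per cache bucket. */
#define LAST_WRITTEN_LSN_CACHE_BUCKET 1024

int
main(void)
{
    /* A few example block numbers (arbitrary, for illustration). */
    uint32_t blknos[] = {0, 1, 1023, 1024, 123456};
    int      n = sizeof(blknos) / sizeof(blknos[0]);

    for (int i = 0; i < n; i++)
    {
        uint32_t blkno   = blknos[i];
        uint32_t old_key = blkno / LAST_WRITTEN_LSN_CACHE_BUCKET; /* shared by 1024 pages */
        uint32_t new_key = blkno;                                 /* unique per page */

        printf("blkno=%6u  old bucket key=%4u  new per-page key=%6u\n",
               blkno, old_key, new_key);
    }
    return 0;
}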