diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 5529afc1fed..5cd7578e27f 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -1081,3 +1081,47 @@ item is irrelevant, and need not be stored at all. This arrangement corresponds to the fact that an L&Y non-leaf page has one more pointer than key. Suffix truncation's negative infinity attributes behave in the same way. + +Notes About Index Scan Prefetch +------------------------------- + +Prefetch can significantly improve the speed of OLAP queries. +To be able to perform prefetch, we need to know which pages will +be accessed during the scan. This is trivial for heap and bitmap scans, +but requires more effort for index scans: to implement prefetch for +index scans, we need to find out which leaf pages will be visited next. + +Postgres links all pages at the same level of the B-Tree in a doubly linked list and uses this list for +forward and backward iteration. This list, however, cannot trivially be used for prefetching, because to locate the next page we first need to load the current page. To prefetch more than only the next page, we can utilize the parent page's downlinks instead, as the parent page contains references to most of the target page's sibling pages. + +Because Postgres' nbtree pages have no reference to their parent page, we need to remember the parent page when descending the btree and use it to prefetch subsequent pages. We also follow the sibling links of the parent page (the linked list at the parent's level) so that prefetching can continue past the key range of the parent page. + +We should prefetch not only leaf pages, but also the next parent page. +The trick is to correctly calculate the moment when it will be needed: +we should not wait until prefetch requests for all children of the current parent page have been issued, but issue the request for the next parent page when there are only effective_io_concurrency downlinks left to prefetch from the current parent page. 
+ +Currently there are two different prefetch implementations, one for +index-only scans and one for index scans. An index-only scan doesn't need to access heap tuples, so it prefetches +only B-Tree leaf pages (and their parents). Prefetch for an index-only scan is performed only +if a parallel plan is not used. A parallel index scan uses a critical section for obtaining the next +page for a parallel worker, and the leaf page is loaded inside this critical section. +If most of the time is spent loading the page, this effectively eliminates any concurrency +and makes prefetch useless. For relatively small tables Postgres will not choose a parallel plan in +any case. For large tables a non-parallel plan can be enforced by setting max_parallel_workers_per_gather=0. + +Prefetch for a normal (not index-only) index scan tries to prefetch the heap pages +referenced from the leaf page. The average number of items per leaf page +is about 100, which is comparable with the default value of effective_io_concurrency, +so there is little benefit in also prefetching the next leaf page. + +Since it is difficult to estimate the number of entries traversed by an index scan, +we prefer not to prefetch a large number of pages from the very beginning. +Such useless prefetch can reduce the performance of point lookups. +Instead we start with the smallest prefetch distance and increase it +by INCREASE_PREFETCH_DISTANCE_STEP after processing each item +until it reaches effective_io_concurrency. In the case of an index-only +scan we increase the prefetch distance after processing each leaf page, +and for an index scan after processing each tuple. +The only exception is the case when no key bounds are specified. +In this case we traverse the whole relation and it makes sense +to start with the largest possible prefetch distance from the very beginning. 
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index f6f4af8bfe3..6bb34d2f4f7 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -2157,7 +2157,7 @@ _bt_insert_parent(Relation rel, BlockNumberIsValid(RelationGetTargetBlock(rel)))); /* Find the leftmost page at the next level up */ - pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL); + pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL, NULL); /* Set up a phony stack entry pointing there */ stack = &fakestack; stack->bts_blkno = BufferGetBlockNumber(pbuf); diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 73638dd81c7..1398f2d3b71 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -369,6 +369,7 @@ btbeginscan(Relation rel, int nkeys, int norderbys) so->killedItems = NULL; /* until needed */ so->numKilled = 0; + so->prefetch_maximum = 0; /* disable prefetch */ /* * We don't know yet whether the scan will be index-only, so we do not diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index baab42a9da4..a35d68e395a 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -18,12 +18,14 @@ #include "access/nbtree.h" #include "access/relscan.h" #include "access/xact.h" +#include "catalog/catalog.h" #include "miscadmin.h" +#include "optimizer/cost.h" #include "pgstat.h" #include "storage/predicate.h" #include "utils/lsyscache.h" #include "utils/rel.h" - +#include "utils/spccache.h" static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf); @@ -47,6 +49,7 @@ static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot); static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); static inline void _bt_initialize_more_data(BTScanOpaque so, 
ScanDirection dir); +#define INCREASE_PREFETCH_DISTANCE_STEP 1 /* * _bt_drop_lock_and_maybe_pin() @@ -837,6 +840,70 @@ _bt_compare(Relation rel, return 0; } + +/* + * _bt_read_parent_for_prefetch - read the given parent page and fill so->prefetch_blocks with the downlinks of its children in scan order (reversed for a backward scan). If the page has a sibling in the scan direction, that sibling is remembered in so->next_parent and its block number is inserted into the list so that prefetch_maximum children follow it (at the very front when the page has no more than prefetch_maximum children). Also sets so->n_prefetch_blocks and resets so->last_prefetch_index to 0. + * This function returns the offset of the first data item (P_FIRSTDATAKEY) on the parent page. + */ +static int +_bt_read_parent_for_prefetch(IndexScanDesc scan, BlockNumber parent, ScanDirection dir) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Buffer buf; + Page page; + BTPageOpaque opaque; + OffsetNumber offnum; + OffsetNumber n_child; + int next_parent_prefetch_index; + int i, j; + + buf = _bt_getbuf(rel, parent, BT_READ); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + offnum = P_FIRSTDATAKEY(opaque); + n_child = PageGetMaxOffsetNumber(page) - offnum + 1; + + /* Position where we should insert the prefetch of the next parent page: we intentionally use prefetch_maximum here instead of current_prefetch_distance, + * assuming that it will reach prefetch_maximum before we reach the end of the parent page + */ + next_parent_prefetch_index = (n_child > so->prefetch_maximum) + ? 
n_child - so->prefetch_maximum : 0; + + if (ScanDirectionIsForward(dir)) + { + so->next_parent = opaque->btpo_next; + if (so->next_parent == P_NONE) + next_parent_prefetch_index = -1; + for (i = 0, j = 0; i < n_child; i++) + { + ItemId itemid = PageGetItemId(page, offnum + i); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + if (i == next_parent_prefetch_index) + so->prefetch_blocks[j++] = so->next_parent; /* slot for the next (right sibling) parent page, placed ahead of the remaining children */ + so->prefetch_blocks[j++] = BTreeTupleGetDownLink(itup); + } + } + else + { + so->next_parent = opaque->btpo_prev; + if (so->next_parent == P_NONE) + next_parent_prefetch_index = -1; + for (i = 0, j = 0; i < n_child; i++) + { + ItemId itemid = PageGetItemId(page, offnum + n_child - i - 1); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + if (i == next_parent_prefetch_index) + so->prefetch_blocks[j++] = so->next_parent; /* slot for the next (left sibling) parent page, placed ahead of the remaining children */ + so->prefetch_blocks[j++] = BTreeTupleGetDownLink(itup); + } + } + so->n_prefetch_blocks = j; + so->last_prefetch_index = 0; + _bt_relbuf(rel, buf); + return offnum; +} + /* * _bt_first() -- Find the first item in a scan. * @@ -1096,6 +1163,37 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } } + /* Neon: initialize prefetch state; prefetch_maximum is taken from effective_io_concurrency for catalog relations and from get_tablespace_io_concurrency() for all other relations */ + so->n_prefetch_requests = 0; + so->n_prefetch_blocks = 0; + so->last_prefetch_index = 0; + so->next_parent = P_NONE; + so->prefetch_maximum = IsCatalogRelation(rel) + ? effective_io_concurrency + : get_tablespace_io_concurrency(rel->rd_rel->reltablespace); + + if (scan->xs_want_itup) /* index only scan */ + { + if (enable_indexonlyscan_prefetch) + { + /* We disable prefetch for parallel index-only scan. 
+ * Neon prefetch is efficient only if prefetched blocks are accessed by the same worker + * which issued prefetch request. The logic of splitting pages between parallel workers in + * index scan doesn't allow to satisfy this requirement. 
+ * Also prefetch of leave pages will be useless if expected number of rows fits in one page. + */ + if (scan->parallel_scan) + so->prefetch_maximum = 0; /* disable prefetch: this is a parallel index-only scan */ + } + else + so->prefetch_maximum = 0; /* disable prefetch: enable_indexonlyscan_prefetch is off */ + } + else if (!enable_indexscan_prefetch || !scan->heapRelation) + so->prefetch_maximum = 0; /* disable prefetch: enable_indexscan_prefetch is off or the scan has no heap relation */ + + /* If key bounds are not specified, then we will scan the whole relation and it makes sense to start with the largest possible prefetch distance */ + so->current_prefetch_distance = (keysCount == 0) ? so->prefetch_maximum : 0; + /* * If we found no usable boundary keys, we have to start from one end of * the tree. Walk down that edge to the first or last key, and scan from @@ -1366,6 +1464,21 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) */ stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot); + /* Start prefetching for index only scan: read the page recorded on top of the descent stack (presumably the parent of the leaf we landed on) and issue the first requests from the leaf's position onward. NOTE(review): skip is derived from page offsets only, while _bt_read_parent_for_prefetch may also insert a next-parent entry into prefetch_blocks - confirm the index still lines up in both scan directions */ + if (so->prefetch_maximum > 0 && stack != NULL && scan->xs_want_itup) /* index only scan */ + { + int first_offset = _bt_read_parent_for_prefetch(scan, stack->bts_blkno, dir); + int skip = ScanDirectionIsForward(dir) + ? 
stack->bts_offset - first_offset + : first_offset + so->n_prefetch_blocks - 1 - stack->bts_offset; + Assert(so->n_prefetch_blocks >= skip); + so->current_prefetch_distance = INCREASE_PREFETCH_DISTANCE_STEP; + so->n_prefetch_requests = Min(so->current_prefetch_distance, so->n_prefetch_blocks - skip); + so->last_prefetch_index = skip + so->n_prefetch_requests; + for (int i = skip; i < so->last_prefetch_index; i++) + PrefetchBuffer(rel, MAIN_FORKNUM, so->prefetch_blocks[i]); + } + /* don't need to keep the stack around... 
*/ _bt_freestack(stack); @@ -1505,9 +1618,63 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) /* OK, itemIndex says what to return */ currItem = &so->currPos.items[so->currPos.itemIndex]; scan->xs_heaptid = currItem->heapTid; - if (scan->xs_want_itup) + if (scan->xs_want_itup) /* index-only scan */ + { scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + } + else if (so->prefetch_maximum > 0) + { + int prefetchLimit, prefetchDistance; + + /* Neon: prefetch referenced heap pages. + * Since it is difficult to predict how many items the index scan will return, + * we do not want to prefetch many heap pages from the very beginning because + * they may not be needed. So we are going to increase prefetch distance by INCREASE_PREFETCH_DISTANCE_STEP + * at each index scan iteration until it reaches prefetch_maximum. + */ + + /* Advance prefetch distance until it reaches prefetch_maximum */ + if (so->current_prefetch_distance + INCREASE_PREFETCH_DISTANCE_STEP <= so->prefetch_maximum) + so->current_prefetch_distance += INCREASE_PREFETCH_DISTANCE_STEP; + else + so->current_prefetch_distance = so->prefetch_maximum; + + /* How much we can prefetch: bounded by the current prefetch distance and by the number of items on the current page */ + prefetchLimit = Min(so->current_prefetch_distance, so->currPos.lastItem - so->currPos.firstItem + 1); + + /* Active prefetch requests */ + prefetchDistance = so->n_prefetch_requests; + /* + * Consume one prefetch request (if any) for the item returned by this call + */ + if (prefetchDistance != 0) + prefetchDistance -= 1; + + /* Keep number of active prefetch requests equal to the current prefetch distance. 
+ * When prefetch distance reaches prefetch maximum, this loop performs at most one iteration, + * but at the beginning of index scan it performs up to INCREASE_PREFETCH_DISTANCE_STEP+1 iterations + */ + if (ScanDirectionIsForward(dir)) + { + while (prefetchDistance < prefetchLimit && so->currPos.itemIndex + prefetchDistance <= so->currPos.lastItem) + { + BlockNumber blkno = BlockIdGetBlockNumber(&so->currPos.items[so->currPos.itemIndex + prefetchDistance].heapTid.ip_blkid); + PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM, blkno); + prefetchDistance += 1; + } + } + else + { + while (prefetchDistance < prefetchLimit && so->currPos.itemIndex - prefetchDistance >= so->currPos.firstItem) + { + BlockNumber blkno = BlockIdGetBlockNumber(&so->currPos.items[so->currPos.itemIndex - prefetchDistance].heapTid.ip_blkid); + PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM, blkno); + prefetchDistance += 1; + } + } + so->n_prefetch_requests = prefetchDistance; /* update number of active prefetch requests */ + } return true; } @@ -1914,6 +2081,30 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) so->markItemIndex = -1; } + if (scan->xs_want_itup && so->prefetch_maximum > 0) /* Prefetching of leaf pages for index-only scan */ + { + /* Advance prefetch distance until it reaches prefetch_maximum */ + if (so->current_prefetch_distance + INCREASE_PREFETCH_DISTANCE_STEP <= so->prefetch_maximum) + so->current_prefetch_distance += INCREASE_PREFETCH_DISTANCE_STEP; + + so->n_prefetch_requests -= 1; /* we load next leaf page, so decrement number of active prefetch requests. 
NOTE(review): the decrement is unconditional, so the counter can drop below zero when no request is outstanding - confirm that this is intended */ + + /* Check if there are more children to prefetch at the current parent page */ + if (so->last_prefetch_index == so->n_prefetch_blocks && so->next_parent != P_NONE) + { + /* we have prefetched all items from current parent page, let's move to the next parent page */ + _bt_read_parent_for_prefetch(scan, so->next_parent, dir); + so->n_prefetch_requests -= 1; /* loading parent page consumes one more prefetch request */ + 
} + + /* Try to keep number of active prefetch requests equal to current prefetch distance */ + while (so->n_prefetch_requests < so->current_prefetch_distance && so->last_prefetch_index < so->n_prefetch_blocks) + { + so->n_prefetch_requests += 1; + PrefetchBuffer(scan->indexRelation, MAIN_FORKNUM, so->prefetch_blocks[so->last_prefetch_index++]); + } + } + if (ScanDirectionIsForward(dir)) { /* Walk right to the next page with data */ @@ -2318,6 +2509,7 @@ _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot) */ Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, + BlockNumber* parent, Snapshot snapshot) { Buffer buf; @@ -2326,6 +2518,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, OffsetNumber offnum; BlockNumber blkno; IndexTuple itup; + BlockNumber parent_blocknum = P_NONE; /* * If we are looking for a leaf page, okay to descend from fast root; @@ -2343,6 +2536,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, page = BufferGetPage(buf); TestForOldSnapshot(snapshot, rel, page); opaque = BTPageGetOpaque(page); + blkno = BufferGetBlockNumber(buf); for (;;) { @@ -2381,12 +2575,15 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, offnum = P_FIRSTDATAKEY(opaque); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + parent_blocknum = blkno; blkno = BTreeTupleGetDownLink(itup); buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ); page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); } + if (parent) + *parent = parent_blocknum; return buf; } @@ -2410,13 +2607,13 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) BTPageOpaque opaque; OffsetNumber start; BTScanPosItem *currItem; - + BlockNumber parent; /* * Scan down to the leftmost or rightmost leaf page. This is a simplified * version of _bt_search(). We don't maintain a stack since we know we * won't need it. 
*/ - buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot); + buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), &parent, scan->xs_snapshot); if (!BufferIsValid(buf)) { @@ -2429,6 +2626,15 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) return false; } + /* Start prefetching for index-only scan: load the downlinks of the page reported as parent by _bt_get_endpoint and immediately request up to prefetch_maximum of them */ + if (so->prefetch_maximum > 0 && parent != P_NONE && scan->xs_want_itup) /* index only scan */ + { + _bt_read_parent_for_prefetch(scan, parent, dir); + so->n_prefetch_requests = so->last_prefetch_index = Min(so->prefetch_maximum, so->n_prefetch_blocks); + for (int i = 0; i < so->last_prefetch_index; i++) + PrefetchBuffer(rel, MAIN_FORKNUM, so->prefetch_blocks[i]); + } + PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot); page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 20cde3d0aff..bf67672a743 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -153,6 +153,8 @@ bool enable_parallel_hash = true; bool enable_partition_pruning = true; bool enable_async_append = true; bool enable_seqscan_prefetch = true; +bool enable_indexscan_prefetch = true; +bool enable_indexonlyscan_prefetch = true; typedef struct { diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index bb1c0fbf9bc..6b7ed4a3413 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -1024,6 +1024,26 @@ static struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, + { + {"enable_indexscan_prefetch", PGC_USERSET, RESOURCES_ASYNCHRONOUS, + gettext_noop("Enables prefetching of heap pages in index scans."), + NULL, + GUC_EXPLAIN + }, + &enable_indexscan_prefetch, + true, + NULL, NULL, NULL + }, + { + {"enable_indexonlyscan_prefetch", 
PGC_USERSET, RESOURCES_ASYNCHRONOUS, + gettext_noop("Enables prefetching of leaf pages in index-only scans."), + NULL, + 
GUC_EXPLAIN + }, + &enable_indexonlyscan_prefetch, + true, + NULL, NULL, NULL + }, { {"enable_seqscan", PGC_USERSET, QUERY_TUNING_METHOD, gettext_noop("Enables the planner's use of sequential-scan plans."), diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 3055a5ca071..98a0edac5c6 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1071,6 +1071,22 @@ typedef struct BTScanOpaqueData /* keep these last in struct for efficiency */ BTScanPosData currPos; /* current position data */ BTScanPosData markPos; /* marked position, if any */ + + /* Neon: prefetch state. NOTE(review): these fields are placed after currPos/markPos, which the comment above asks to keep last in the struct - confirm this placement is acceptable */ + int prefetch_maximum; /* maximal number of prefetch requests; 0 disables prefetch */ + + /* Prefetch of referenced heap pages for index scan */ + /* To minimize wasted prefetch requests we start with prefetch distance 0 + * and increase it until it reaches prefetch_maximum + */ + int current_prefetch_distance; + + /* Prefetch of leaf pages of B-Tree for index-only scan */ + int n_prefetch_requests; /* number of active prefetch requests */ + int n_prefetch_blocks; /* number of elements in prefetch_blocks */ + int last_prefetch_index; /* current position in prefetch_blocks (prefetch_blocks[0..last_prefetch_index-1] are already requested) */ + BlockNumber next_parent; /* block number of the next parent page in scan direction, P_NONE if there is none */ + BlockNumber prefetch_blocks[MaxTIDsPerBTreePage + 1]; /* downlinks to leaves + next parent page */ } BTScanOpaqueData; typedef BTScanOpaqueData *BTScanOpaque; @@ -1234,6 +1250,7 @@ extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber extern bool _bt_first(IndexScanDesc scan, ScanDirection dir); extern bool _bt_next(IndexScanDesc scan, ScanDirection dir); extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, + 
BlockNumber* parent, Snapshot snapshot); /* diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index d6a15292da6..f7c33b7e658 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -70,6 +70,9 @@ 
extern PGDLLIMPORT bool enable_parallel_hash; extern PGDLLIMPORT bool enable_partition_pruning; extern PGDLLIMPORT bool enable_async_append; extern PGDLLIMPORT bool enable_seqscan_prefetch; +extern PGDLLIMPORT bool enable_indexscan_prefetch; +extern PGDLLIMPORT bool enable_indexonlyscan_prefetch; + extern PGDLLIMPORT int constraint_exclusion; extern double index_pages_fetched(double tuples_fetched, BlockNumber pages, diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index eef4f7e1d11..231283c5636 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -118,7 +118,9 @@ select name, setting from pg_settings where name like 'enable%'; enable_hashjoin | on enable_incremental_sort | on enable_indexonlyscan | on + enable_indexonlyscan_prefetch | on enable_indexscan | on + enable_indexscan_prefetch | on enable_material | on enable_memoize | on enable_mergejoin | on @@ -132,7 +134,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_seqscan_prefetch | on enable_sort | on enable_tidscan | on -(21 rows) +(23 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail