Skip to content

Commit

Permalink
Skip full index scan during cleanup of B-tree indexes when possible
Browse files Browse the repository at this point in the history
Vacuum of an index consists of two stages: multiple (zero or more) ambulkdelete
calls and one amvacuumcleanup call. When the workload on a particular table
is append-only, autovacuum isn't intended to touch this table. However,
a user may run vacuum manually in order to fill the visibility map and get the
benefits of index-only scans. Then ambulkdelete wouldn't be called for the indexes
of such a table (because no heap tuples were deleted); only amvacuumcleanup would
be called. In this case, amvacuumcleanup would perform a full index scan for
two objectives: to put recyclable pages into the free space map and to update
index statistics.

This patch allows btvacuumcleanup to skip the full index scan when two conditions
are satisfied: no pages are going to be put into the free space map and the index
statistics aren't stale. In order to check the first condition, we store the
oldest btpo_xact in the meta-page. When it precedes RecentGlobalXmin, then
there are some recyclable pages. In order to check the second condition, we store
the number of heap tuples observed during the previous full index scan by cleanup.
If the fraction of newly inserted tuples is less than
vacuum_cleanup_index_scale_factor, then the statistics aren't considered to be
stale. vacuum_cleanup_index_scale_factor can be defined as both a reloption and a GUC (default).

This patch bumps the B-tree meta-page version. The upgrade of the meta-page is
performed "on the fly": during VACUUM the meta-page is rewritten with the new
version. No special handling in pg_upgrade is required.

Author: Masahiko Sawada, Alexander Korotkov
Review by: Peter Geoghegan, Kyotaro Horiguchi, Alexander Korotkov, Yura Sokolov
Discussion: https://www.postgresql.org/message-id/flat/CAD21AoAX+d2oD_nrd9O2YkpzHaFr=uQeGr9s1rKC3O4ENc568g@mail.gmail.com
  • Loading branch information
feodor committed Apr 4, 2018
1 parent eac93e2 commit 857f9c3
Show file tree
Hide file tree
Showing 23 changed files with 458 additions and 45 deletions.
8 changes: 5 additions & 3 deletions contrib/amcheck/verify_nbtree.c
Original file line number Diff line number Diff line change
Expand Up @@ -1500,12 +1500,14 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
errmsg("index \"%s\" meta page is corrupt",
RelationGetRelationName(state->rel))));

if (metad->btm_version != BTREE_VERSION)
if (metad->btm_version < BTREE_MIN_VERSION ||
metad->btm_version > BTREE_VERSION)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("version mismatch in index \"%s\": file version %d, code version %d",
errmsg("version mismatch in index \"%s\": file version %d, "
"current version %d, minimal supported version %d",
RelationGetRelationName(state->rel),
metad->btm_version, BTREE_VERSION)));
metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
}

/*
Expand Down
3 changes: 2 additions & 1 deletion contrib/pageinspect/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ OBJS = rawpage.o heapfuncs.o btreefuncs.o fsmfuncs.o \
brinfuncs.o ginfuncs.o hashfuncs.o $(WIN32RES)

EXTENSION = pageinspect
DATA = pageinspect--1.5.sql pageinspect--1.5--1.6.sql \
DATA = pageinspect--1.6--1.7.sql \
pageinspect--1.5.sql pageinspect--1.5--1.6.sql \
pageinspect--1.4--1.5.sql pageinspect--1.3--1.4.sql \
pageinspect--1.2--1.3.sql pageinspect--1.1--1.2.sql \
pageinspect--1.0--1.1.sql pageinspect--unpackaged--1.0.sql
Expand Down
4 changes: 3 additions & 1 deletion contrib/pageinspect/btreefuncs.c
Original file line number Diff line number Diff line change
Expand Up @@ -511,7 +511,7 @@ bt_metap(PG_FUNCTION_ARGS)
BTMetaPageData *metad;
TupleDesc tupleDesc;
int j;
char *values[6];
char *values[8];
Buffer buffer;
Page page;
HeapTuple tuple;
Expand Down Expand Up @@ -555,6 +555,8 @@ bt_metap(PG_FUNCTION_ARGS)
values[j++] = psprintf("%d", metad->btm_level);
values[j++] = psprintf("%d", metad->btm_fastroot);
values[j++] = psprintf("%d", metad->btm_fastlevel);
values[j++] = psprintf("%u", metad->btm_oldest_btpo_xact);
values[j++] = psprintf("%lf", metad->btm_last_cleanup_num_heap_tuples);

tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
values);
Expand Down
16 changes: 9 additions & 7 deletions contrib/pageinspect/expected/btree.out
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@ INSERT INTO test1 VALUES (72057594037927937, 'text');
CREATE INDEX test1_a_idx ON test1 USING btree (a);
\x
SELECT * FROM bt_metap('test1_a_idx');
-[ RECORD 1 ]-----
magic | 340322
version | 2
root | 1
level | 0
fastroot | 1
fastlevel | 0
-[ RECORD 1 ]-----------+-------
magic | 340322
version | 3
root | 1
level | 0
fastroot | 1
fastlevel | 0
oldest_xact | 0
last_cleanup_num_tuples | -1

SELECT * FROM bt_page_stats('test1_a_idx', 0);
ERROR: block 0 is a meta page
Expand Down
26 changes: 26 additions & 0 deletions contrib/pageinspect/pageinspect--1.6--1.7.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/* contrib/pageinspect/pageinspect--1.6--1.7.sql */

-- complain if script is sourced in psql, rather than via ALTER EXTENSION
\echo Use "ALTER EXTENSION pageinspect UPDATE TO '1.7'" to load this file. \quit

--
-- bt_metap()
--
-- Replace bt_metap() so that it also returns the two B-tree metapage fields
-- added together with this extension version: oldest_xact and
-- last_cleanup_num_tuples.  The set of OUT columns changes, so the function
-- has to be dropped and recreated rather than replaced in place.
--
-- Column types are chosen to match how the C implementation renders the
-- values before they are parsed by each column's input function:
--   * oldest_xact is printed as an unsigned 32-bit number ("%u"), so it is
--     declared as xid; int4 would raise "out of range" for any transaction
--     ID above 2147483647.
--   * last_cleanup_num_tuples is printed as a double ("%lf"), so it is
--     declared as float8; real (single precision) cannot represent tuple
--     counts above 2^24 exactly.
--
DROP FUNCTION bt_metap(IN relname text,
    OUT magic int4,
    OUT version int4,
    OUT root int4,
    OUT level int4,
    OUT fastroot int4,
    OUT fastlevel int4);
CREATE FUNCTION bt_metap(IN relname text,
    OUT magic int4,
    OUT version int4,
    OUT root int4,
    OUT level int4,
    OUT fastroot int4,
    OUT fastlevel int4,
    OUT oldest_xact xid,
    OUT last_cleanup_num_tuples float8)
AS 'MODULE_PATHNAME', 'bt_metap'
LANGUAGE C STRICT PARALLEL SAFE;
2 changes: 1 addition & 1 deletion contrib/pageinspect/pageinspect.control
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# pageinspect extension
comment = 'inspect the contents of database pages at a low level'
default_version = '1.6'
default_version = '1.7'
module_pathname = '$libdir/pageinspect'
relocatable = true
10 changes: 5 additions & 5 deletions contrib/pgstattuple/expected/pgstattuple.out
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ select version, tree_level,
from pgstatindex('test_pkey');
version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation
---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+--------------------
2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
(1 row)

select version, tree_level,
Expand All @@ -58,7 +58,7 @@ select version, tree_level,
from pgstatindex('test_pkey'::text);
version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation
---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+--------------------
2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
(1 row)

select version, tree_level,
Expand All @@ -68,7 +68,7 @@ select version, tree_level,
from pgstatindex('test_pkey'::name);
version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation
---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+--------------------
2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
(1 row)

select version, tree_level,
Expand All @@ -78,7 +78,7 @@ select version, tree_level,
from pgstatindex('test_pkey'::regclass);
version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation
---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+--------------------
2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
(1 row)

select pg_relpages('test');
Expand Down Expand Up @@ -229,7 +229,7 @@ create index test_partition_hash_idx on test_partition using hash (a);
select pgstatindex('test_partition_idx');
pgstatindex
------------------------------
(2,0,8192,0,0,0,0,0,NaN,NaN)
(3,0,8192,0,0,0,0,0,NaN,NaN)
(1 row)

select pgstathashindex('test_partition_hash_idx');
Expand Down
25 changes: 25 additions & 0 deletions doc/src/sgml/config.sgml
Original file line number Diff line number Diff line change
Expand Up @@ -1882,6 +1882,31 @@ include_dir 'conf.d'
</note>
</sect2>

<sect2 id="runtime-config-index-vacuum">
<title>Index Vacuum</title>
<variablelist>
<varlistentry id="guc-vacuum-cleanup-index-scale-factor" xreflabel="vacuum_cleanup_index_scale_factor">
<term><varname>vacuum_cleanup_index_scale_factor</varname> (<type>floating point</type>)
<indexterm>
<primary><varname>vacuum_cleanup_index_scale_factor</varname> configuration parameter</primary>
</indexterm>
</term>
<listitem>
<para>
When no tuples were deleted from the heap, B-tree indexes might still
be scanned during the <command>VACUUM</command> cleanup stage for two
reasons. The first reason is that the B-tree index contains deleted pages
which can be recycled during cleanup. The second reason is that the B-tree
index statistics are stale. The criterion for stale index statistics
is that the number of inserted tuples since the previous statistics collection
is greater than the <varname>vacuum_cleanup_index_scale_factor</varname>
fraction of the total number of heap tuples.
</para>
</listitem>
</varlistentry>
</variablelist>
</sect2>

<sect2 id="runtime-config-resource-background-writer">
<title>Background Writer</title>

Expand Down
16 changes: 9 additions & 7 deletions doc/src/sgml/pageinspect.sgml
Original file line number Diff line number Diff line change
Expand Up @@ -247,13 +247,15 @@ test=# SELECT * FROM heap_page_item_attrs(get_raw_page('pg_class', 0), 'pg_class
index's metapage. For example:
<screen>
test=# SELECT * FROM bt_metap('pg_cast_oid_index');
-[ RECORD 1 ]-----
magic | 340322
version | 2
root | 1
level | 0
fastroot | 1
fastlevel | 0
-[ RECORD 1 ]-----------+-------
magic | 340322
version | 3
root | 1
level | 0
fastroot | 1
fastlevel | 0
oldest_xact | 582
last_cleanup_num_tuples | 1000
</screen>
</para>
</listitem>
Expand Down
15 changes: 15 additions & 0 deletions doc/src/sgml/ref/create_index.sgml
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,21 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] <replaceable class=
</varlistentry>
</variablelist>

<para>
B-tree indexes additionally accept this parameter:
</para>

<variablelist>
<varlistentry>
<term><literal>vacuum_cleanup_index_scale_factor</literal></term>
<listitem>
<para>
Per-index value for <xref linkend="guc-vacuum-cleanup-index-scale-factor"/>.
</para>
</listitem>
</varlistentry>
</variablelist>

<para>
GiST indexes additionally accept this parameter:
</para>
Expand Down
13 changes: 12 additions & 1 deletion src/backend/access/common/reloptions.c
Original file line number Diff line number Diff line change
Expand Up @@ -409,6 +409,15 @@ static relopt_real realRelOpts[] =
},
0, -1.0, DBL_MAX
},
{
{
"vacuum_cleanup_index_scale_factor",
"Number of tuple inserts prior to index cleanup as a fraction of reltuples.",
RELOPT_KIND_BTREE,
ShareUpdateExclusiveLock
},
-1, 0.0, 100.0
},
/* list terminator */
{{NULL}}
};
Expand Down Expand Up @@ -1371,7 +1380,9 @@ default_reloptions(Datum reloptions, bool validate, relopt_kind kind)
{"user_catalog_table", RELOPT_TYPE_BOOL,
offsetof(StdRdOptions, user_catalog_table)},
{"parallel_workers", RELOPT_TYPE_INT,
offsetof(StdRdOptions, parallel_workers)}
offsetof(StdRdOptions, parallel_workers)},
{"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL,
offsetof(StdRdOptions, vacuum_cleanup_index_scale_factor)}
};

options = parseRelOptions(reloptions, validate, kind, &numoptions);
Expand Down
12 changes: 12 additions & 0 deletions src/backend/access/nbtree/nbtinsert.c
Original file line number Diff line number Diff line change
Expand Up @@ -939,6 +939,9 @@ _bt_insertonpg(Relation rel,

if (BufferIsValid(metabuf))
{
/* upgrade meta-page if needed */
if (metad->btm_version < BTREE_VERSION)
_bt_upgrademetapage(metapg);
metad->btm_fastroot = itup_blkno;
metad->btm_fastlevel = lpageop->btpo.level;
MarkBufferDirty(metabuf);
Expand Down Expand Up @@ -997,6 +1000,9 @@ _bt_insertonpg(Relation rel,
xlmeta.level = metad->btm_level;
xlmeta.fastroot = metad->btm_fastroot;
xlmeta.fastlevel = metad->btm_fastlevel;
xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
xlmeta.last_cleanup_num_heap_tuples =
metad->btm_last_cleanup_num_heap_tuples;

XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata));
Expand Down Expand Up @@ -2049,6 +2055,10 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
metapg = BufferGetPage(metabuf);
metad = BTPageGetMeta(metapg);

/* upgrade metapage if needed */
if (metad->btm_version < BTREE_VERSION)
_bt_upgrademetapage(metapg);

/*
* Create downlink item for left page (old root). Since this will be the
* first item in a non-leaf page, it implicitly has minus-infinity key
Expand Down Expand Up @@ -2138,6 +2148,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
md.level = metad->btm_level;
md.fastroot = rootblknum;
md.fastlevel = metad->btm_level;
md.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;

XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));

Expand Down
Loading

0 comments on commit 857f9c3

Please sign in to comment.