Skip to content

Commit

Permalink
Add function cbdb_relation_size (#428)
Browse files Browse the repository at this point in the history
Add function cbdb_relation_size

It can be used to fetch the sizes of a batch of relations, as shown below:

SELECT * FROM
cbdb_relation_size((SELECT array_agg(oid) FROM pg_class));

In this case it performs better than pg_relation_size; for more details,
see the comment on the function.

Co-authored-by: Xiaoran Wang <wangxiaoran@hashdata.cn>
  • Loading branch information
Xiaoran Wang and Xiaoran Wang authored May 13, 2024
1 parent 2b8815b commit 27dc124
Show file tree
Hide file tree
Showing 6 changed files with 372 additions and 1 deletion.
9 changes: 9 additions & 0 deletions src/backend/catalog/system_functions.sql
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,15 @@ CREATE OR REPLACE FUNCTION pg_relation_size(regclass)
PARALLEL UNSAFE STRICT COST 1
RETURN pg_relation_size($1, 'main');

-- Batch counterpart of pg_relation_size(regclass): reports the size of the
-- 'main' fork for every relation in the given oid array, one (reloid, size)
-- row per array element, by delegating to the two-argument variant.
CREATE OR REPLACE FUNCTION
  cbdb_relation_size(IN reloids oid[], OUT reloid oid, OUT size int8)
 RETURNS SETOF record
 LANGUAGE sql
 PARALLEL UNSAFE STRICT COST 1
BEGIN ATOMIC
  SELECT r.reloid, r.size
    FROM cbdb_relation_size($1, 'main') AS r;
END;

CREATE OR REPLACE FUNCTION obj_description(oid, name)
RETURNS text
LANGUAGE sql
Expand Down
274 changes: 274 additions & 0 deletions src/backend/utils/adt/dbsize.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "commands/tablespace.h"
#include "common/relpath.h"
#include "executor/spi.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "storage/fd.h"
#include "utils/acl.h"
Expand Down Expand Up @@ -54,6 +55,7 @@
#define half_rounded(x) (((x) + ((x) < 0 ? -1 : 1)) / 2)

static int64 calculate_total_relation_size(Relation rel);
static HTAB *cbdb_get_size_from_segDBs(const char *cmd, int32 relnum);

/* Hook for plugins to calculate relation size */
relation_size_hook_type relation_size_hook = NULL;
Expand Down Expand Up @@ -1325,3 +1327,275 @@ pg_relation_filepath(PG_FUNCTION_ARGS)

PG_RETURN_TEXT_P(cstring_to_text(path));
}

/*
 * cbdb_relation_size accepts a group of relation
 * oids and returns their sizes.
 * arg0: oid array
 * arg1: fork name
 *
 * cbdb_relation_size is similar to pg_relation_size,
 * but it performs better when the sizes of multiple
 * relations are needed. Each segment computes the sizes
 * of the whole group of relations in one pass, and the
 * dispatcher sums them up. pg_relation_size, by contrast,
 * computes only one relation's size at a time and
 * dispatches a separate SQL command for every relation,
 * so batching saves a lot of work.
 *
 * Duplicated oids in the oid array are not handled
 * specially by cbdb_relation_size at present.
 */
/*
 * One (relation oid, size) pair.
 *
 * Used both as a result row of cbdb_relation_size and as the entry type of
 * the hash table built by cbdb_get_size_from_segDBs.  That hash table is
 * created with keysize = sizeof(Oid), so reloid must remain the first field.
 */
typedef struct
{
/* oid of the relation this entry describes (hash key) */
Oid reloid;
/* accumulated size of the relation; stays 0 if it could not be opened */
int64 size;
} RelSize;

/*
 * Cross-call state of the cbdb_relation_size set-returning function,
 * stored in funcctx->user_fctx.
 */
typedef struct
{
/* position of the next relsize[] element to return */
int32 index;
/* number of elements in relsize[] (the length of the input oid array) */
int32 num_entries;
/* precomputed results, one per input oid, in input order */
RelSize *relsize;
} get_relsize_cxt;

Datum
cbdb_relation_size(PG_FUNCTION_ARGS)
{
FuncCallContext *funcctx;
get_relsize_cxt *cxt;
int32 len = 0; /* the length of oid array */
Relation rel;
StringInfoData oidInfo;
RelSize *result;

ForkNumber forkNumber;
ArrayType *array = PG_GETARG_ARRAYTYPE_P(0);
text *forkName = PG_GETARG_TEXT_PP(1);
Oid *oidArray = (Oid *) ARR_DATA_PTR(array);


if (array_contains_nulls(array))
ereport(ERROR, (errcode(ERRCODE_ARRAY_ELEMENT_ERROR),
errmsg("cannot work with arrays containing NULLs")));

/* caculate all the relation size */
if (SRF_IS_FIRSTCALL())
{
#define RELSIZE_NATTS 2
MemoryContext oldcontext;
/* create a function context for cross-call persistence */
funcctx = SRF_FIRSTCALL_INIT();
len = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
forkNumber = forkname_to_number(text_to_cstring(forkName));
/* Switch to memory context appropriate for multiple function calls */
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
TupleDesc tupdesc = CreateTemplateTupleDesc(RELSIZE_NATTS);
TupleDescInitEntry(tupdesc, (AttrNumber) 1, "reloid", OIDOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 2, "size", INT8OID, -1, 0);
funcctx->tuple_desc = BlessTupleDesc(tupdesc);

result = (RelSize*) palloc0(sizeof(RelSize) * len);

ERROR_ON_ENTRY_DB();

int relnum = 0; /* the num of oid appended to the oidInfo */
for (int i = 0; i< len; i++)
{
result[i].reloid = oidArray[i];

rel = try_relation_open(oidArray[i], AccessShareLock, false);

/*
* Before 9.2, we used to throw an error if the relation didn't exist, but
* that makes queries like "SELECT pg_relation_size(oid) FROM pg_class"
* less robust, because while we scan pg_class with an MVCC snapshot,
* someone else might drop the table. It's better to return NULL for
* already-dropped tables than throw an error and abort the whole query.
*
* For cbdb_relation_size, for rel not existed, just set the size to 0
*/
if (rel == NULL)
{
continue;
}

/* for foreign table, only get its size on the dispatcher */
if (rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE)
{
FdwRoutine *fdwroutine;
bool ok = false;

fdwroutine = GetFdwRoutineForRelation(rel, false);

if (fdwroutine->GetRelationSizeOnSegment != NULL)
ok = fdwroutine->GetRelationSizeOnSegment(rel, &result[i].size);

if (!ok)
ereport(WARNING,
(errmsg("skipping \"%s\" --- cannot calculate this foreign table size",
RelationGetRelationName(rel))));
relation_close(rel, AccessShareLock);
continue;
}

result[i].size = calculate_relation_size(rel, forkNumber);
relation_close(rel, AccessShareLock);

relnum ++;
if (Gp_role == GP_ROLE_DISPATCH)
{
if (relnum == 1)
{
initStringInfo(&oidInfo);
appendStringInfo(&oidInfo, "%u", oidArray[i]);
}
else
appendStringInfo(&oidInfo, ",%u", oidArray[i]);
}
}

if (Gp_role == GP_ROLE_DISPATCH && relnum > 0)
{
char *sql;
HTAB *segsize;
sql = psprintf("select * from pg_catalog.cbdb_relation_size(array[%s]::oid[], '%s')", oidInfo.data,
forkNames[forkNumber]);
segsize = cbdb_get_size_from_segDBs(sql, relnum);
pfree(oidInfo.data);
pfree(sql);

for (int i = 0; i< len; i++)
{
bool found;
RelSize *entry;
Oid oid = result[i].reloid;
entry = hash_search(segsize, &oid, HASH_FIND, &found);
/* some tables may only exist on dispatcher */
if (found)
{
result[i].size += entry->size;
}
}
}

cxt = (get_relsize_cxt *) palloc(sizeof(get_relsize_cxt));
cxt->num_entries = len;
cxt->index = 0;
cxt->relsize = result;

funcctx->user_fctx = cxt;
MemoryContextSwitchTo(oldcontext);
}

funcctx = SRF_PERCALL_SETUP();
cxt = (get_relsize_cxt *) funcctx->user_fctx;

while (cxt->index < cxt->num_entries)
{
RelSize *relsize = &cxt->relsize[cxt->index];
Datum values[RELSIZE_NATTS];
bool nulls[RELSIZE_NATTS];
HeapTuple tuple;
Datum res;

MemSet(nulls, 0, sizeof(nulls));
values[0] = ObjectIdGetDatum(relsize->reloid);
values[1] = Int64GetDatum(relsize->size);
cxt->index++;
tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
res = HeapTupleGetDatum(tuple);

SRF_RETURN_NEXT(funcctx, res);
}

SRF_RETURN_DONE(funcctx);
}

/*
 * Helper function to dispatch a size-returning command.
 *
 * Dispatches the given SQL query to the segments and sums up, per relation,
 * the sizes they report.
 *
 * cmd:		SQL text to dispatch.  Every segment result must consist of
 *			exactly 'relnum' rows with RELSIZE_NATTS columns (reloid, size).
 * relnum:	number of relations named in cmd; also used as the initial size
 *			hint of the returned hash table.
 *
 * Returns a hash table keyed by relation Oid whose RelSize entries hold the
 * size summed over all segment results.  The table is created in
 * CurrentMemoryContext and is owned by the caller.  Rows containing a NULL
 * column are ignored.
 *
 * Raises an ERROR if a segment result does not have PGRES_TUPLES_OK status
 * or has an unexpected number of rows or columns.
 */
static HTAB *
cbdb_get_size_from_segDBs(const char *cmd, int32 relnum)
{
	CdbPgResults cdb_pgresults = {NULL, 0};
	HASHCTL		hctl;
	HTAB	   *res_htab;
	int			i;

	Assert(Gp_role == GP_ROLE_DISPATCH);

	memset(&hctl, 0, sizeof(HASHCTL));
	hctl.keysize = sizeof(Oid);
	hctl.entrysize = sizeof(RelSize);
	hctl.hcxt = CurrentMemoryContext;

	res_htab = hash_create("cbdb_get_size_from_segDBs",
						   relnum,
						   &hctl,
						   HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);

	/* nothing to ask the segments about */
	if (relnum == 0)
		return res_htab;

	CdbDispatchCommand(cmd, DF_WITH_SNAPSHOT, &cdb_pgresults);

	for (i = 0; i < cdb_pgresults.numResults; i++)
	{
		struct pg_result *pgresult = cdb_pgresults.pg_results[i];
		ExecStatusType status;
		int			ntuples;
		int			nfields;
		int			j;

		status = PQresultStatus(pgresult);
		if (status != PGRES_TUPLES_OK)
		{
			cdbdisp_clearCdbPgResults(&cdb_pgresults);
			ereport(ERROR,
					(errmsg("unexpected result from segment: %d",
							status)));
		}

		ntuples = PQntuples(pgresult);
		nfields = PQnfields(pgresult);

		if (ntuples != relnum || nfields != RELSIZE_NATTS)
		{
			cdbdisp_clearCdbPgResults(&cdb_pgresults);
			ereport(ERROR,
					(errmsg("unexpected shape of result from segment (%d rows, %d cols)",
							ntuples, nfields)));
		}

		for (j = 0; j < ntuples; j++)
		{
			bool		found;
			RelSize    *entry;
			Oid			oid;
			int64		size;

			if (PQgetisnull(pgresult, j, 0) || PQgetisnull(pgresult, j, 1))
				continue;

			/* results arrive in text format; parse them with the type input functions */
			oid = DatumGetObjectId(DirectFunctionCall1(oidin,
								   CStringGetDatum(PQgetvalue(pgresult, j, 0))));
			size = DatumGetInt64(DirectFunctionCall1(int8in,
								 CStringGetDatum(PQgetvalue(pgresult, j, 1))));

			entry = hash_search(res_htab, &oid, HASH_ENTER, &found);
			if (!found)
			{
				/* first time this relation is seen: start its running total */
				entry->reloid = oid;
				entry->size = size;
			}
			else
			{
				entry->size += size;
			}
		}
	}

	/*
	 * Release the segment results on the success path as well; the error
	 * paths above already do so before raising.
	 */
	cdbdisp_clearCdbPgResults(&cdb_pgresults);

	return res_htab;
}
15 changes: 15 additions & 0 deletions src/include/catalog/pg_proc.dat
Original file line number Diff line number Diff line change
Expand Up @@ -12518,3 +12518,18 @@
prosrc => 'pg_export_snapshot_def', proexeclocation => 's' },
]

# Cloudberry specific functions

# cbdb_relation_size(oid[]): SQL-language wrapper whose body is defined in
# system_functions.sql; it reports the size of the 'main' fork.
{ oid => '8960',
descr => 'disk space usage for the main fork of a group of tables or indexes',
proname => 'cbdb_relation_size', prolang => 'sql', provolatile => 'v', proparallel => 'u',
prorettype => 'record', prorows => '100', proretset => 't',
proargtypes => '_oid', proallargtypes => '{_oid,oid,int8}',
proargmodes => '{i,o,o}', proargnames => '{reloids,reloid,size}',
prosrc => 'see system_functions.sql' },
# cbdb_relation_size(oid[], text): implemented by the C function
# cbdb_relation_size in src/backend/utils/adt/dbsize.c; the second argument
# names the fork to measure.
{ oid => '8961',
descr => 'disk space usage for the specified fork of a group of tables or indexes',
proname => 'cbdb_relation_size', provolatile => 'v', proparallel => 'u',
prorettype => 'record', prorows => '100', proretset => 't',
proargtypes => '_oid text', proallargtypes => '{_oid,text,oid,int8}',
proargmodes => '{i,i,o,o}', proargnames => '{reloids,forkname,reloid,size}',
prosrc => 'cbdb_relation_size' },
44 changes: 44 additions & 0 deletions src/test/regress/expected/cbdb_db_size_functions.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
-- start_ignore
DROP TABLE IF EXISTS cbdbheapsizetest;
DROP TABLE IF EXISTS cbdbaosizetest;
DROP EXTERNAL TABLE IF EXISTS cbdbsize_t_ext;
-- end_ignore
-- create heap table
CREATE TABLE cbdbheapsizetest(a int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Cloudberry Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
INSERT INTO cbdbheapsizetest select generate_series(1, 1000);
-- create ao table
CREATE TABLE cbdbaosizetest (a int) WITH (appendonly=true, orientation=row);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Cloudberry Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
insert into cbdbaosizetest select generate_series(1, 100000);
-- create EXTERNAL table
CREATE EXTERNAL TABLE cbdbsize_t_ext (a integer) LOCATION ('file://127.0.0.1/tmp/foo') FORMAT 'text';
WITH cbdbrelsize AS (
SELECT *
FROM cbdb_relation_size((SELECT array['cbdbsize_t_ext'::regclass,'cbdbheapsizetest'::regclass, 'cbdbaosizetest'::regclass]))
), pgrelsize AS (
SELECT pg_relation_size(oid) as size, relname, oid FROM pg_class where oid in ('cbdbsize_t_ext'::regclass,'cbdbheapsizetest'::regclass, 'cbdbaosizetest'::regclass)
)
SELECT pgrelsize.relname, pgrelsize.size, cbdbrelsize.size
FROM pgrelsize FULL JOIN cbdbrelsize
ON pgrelsize.oid = cbdbrelsize.reloid
WHERE pgrelsize.size != cbdbrelsize.size;
WARNING: skipping "cbdbsize_t_ext" --- cannot calculate this foreign table size
WARNING: skipping "cbdbsize_t_ext" --- cannot calculate this foreign table size
relname | size | size
---------+------+------
(0 rows)

SELECT * FROM cbdb_relation_size(array[]::oid[], 'main');
reloid | size
--------+------
(0 rows)

SELECT size FROM cbdb_relation_size(array['cbdbheapsizetest'::regclass], 'fsm');
size
------
0
(1 row)

2 changes: 1 addition & 1 deletion src/test/regress/greenplum_schedule
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ test: gp_runtime_filter
#test: olap_window
#test: tpch500GB

test: db_size_functions
test: db_size_functions cbdb_db_size_functions

# FIXME: These tests no longer work, because they try to set
# gp_interconnect_type, which doesn't work:
Expand Down
Loading

0 comments on commit 27dc124

Please sign in to comment.