From f0b0cb0815cc8a434bd9fe29deb0b715181fdca0 Mon Sep 17 00:00:00 2001 From: Cloudberry Date: Mon, 3 Jul 2023 08:27:29 +0000 Subject: [PATCH] Initial Cloudberry code dump2. --- concourse/scripts/dumpdb.bash | 2 + contrib/bloom/blinsert.c | 2 + contrib/bloom/blscan.c | 2 +- deploy/cbdb_deploy.sh | 2 +- doc/src/sgml/config.sgml | 125 +- doc/src/sgml/database-encryption.sgml | 149 ++ doc/src/sgml/filelist.sgml | 1 + doc/src/sgml/installation.sgml | 2 +- doc/src/sgml/monitoring.sgml | 13 + doc/src/sgml/postgres.sgml | 1 + doc/src/sgml/ref/allfiles.sgml | 1 + doc/src/sgml/ref/initdb.sgml | 47 + doc/src/sgml/ref/pg_alterckey.sgml | 210 ++ doc/src/sgml/ref/pg_ctl-ref.sgml | 14 + doc/src/sgml/ref/pgupgrade.sgml | 20 +- doc/src/sgml/ref/postgres-ref.sgml | 13 + doc/src/sgml/reference.sgml | 1 + doc/src/sgml/storage.sgml | 5 + gpMgmt/bin/gpcheckcat | 4 +- gpMgmt/bin/gpinitsystem | 19 +- gpMgmt/bin/gppylib/commands/base.py | 8 +- gpMgmt/bin/lib/gpcreateseg.sh | 6 + gpMgmt/doc/gpinitsystem_help | 4 + gpcontrib/gp_exttable_fdw/gp_exttable_fdw.c | 2 +- hd-ci/compile_cbdb.bash | 4 +- hd-ci/icw_cbdb.bash | 2 +- src/backend/Makefile | 17 +- src/backend/access/aocs/aocs_compaction.c | 2 +- src/backend/access/aocs/aocsam.c | 109 +- src/backend/access/aocs/aocsam_handler.c | 64 +- src/backend/access/aocs/test/aocsam_test.c | 2 + src/backend/access/appendonly/aomd.c | 4 +- .../access/appendonly/appendonly_compaction.c | 2 +- src/backend/access/appendonly/appendonlyam.c | 117 +- .../access/appendonly/appendonlyam_handler.c | 56 +- .../access/appendonly/appendonlywriter.c | 45 +- src/backend/access/bitmap/bitmap.c | 2 + src/backend/access/bitmap/bitmapattutil.c | 10 +- src/backend/access/brin/brin.c | 2 +- src/backend/access/common/reloptions.c | 2 +- src/backend/access/common/session.c | 4 +- src/backend/access/gin/ginget.c | 2 +- src/backend/access/gist/README | 1 - src/backend/access/gist/gistbuild.c | 13 +- src/backend/access/gist/gistget.c | 2 +- src/backend/access/hash/hash.c | 2 +- src/backend/access/hash/hashpage.c | 1 + src/backend/access/heap/rewriteheap.c | 4 + src/backend/access/index/genam.c | 1 + src/backend/access/nbtree/nbtree.c | 4 +- src/backend/access/nbtree/nbtsort.c | 4 +- src/backend/access/rmgrdesc/xlogdesc.c | 6 +- src/backend/access/spgist/spginsert.c | 6 + src/backend/access/spgist/spgscan.c | 2 +- src/backend/access/table/tableam.c | 57 + src/backend/access/transam/parallel.c | 319 +++ src/backend/access/transam/xact.c | 7 + src/backend/access/transam/xlog.c | 52 + src/backend/access/transam/xloginsert.c | 24 + src/backend/bootstrap/bootstrap.c | 32 +- src/backend/catalog/Makefile | 4 +- src/backend/catalog/catalog.c | 23 + src/backend/catalog/dependency.c | 11 +- src/backend/catalog/index.c | 21 +- src/backend/catalog/objectaddress.c | 33 + src/backend/catalog/pg_appendonly.c | 30 + src/backend/catalog/pg_task.c | 230 ++ src/backend/catalog/pg_task_run_history.c | 206 ++ src/backend/catalog/storage.c | 7 +- src/backend/catalog/system_functions.sql | 2 +- src/backend/cdb/cdbappendonlystorageformat.c | 11 + src/backend/cdb/cdbappendonlystorageread.c | 9 +- src/backend/cdb/cdbappendonlystoragewrite.c | 17 +- src/backend/cdb/cdbbufferedappend.c | 2 + src/backend/cdb/cdbbufferedread.c | 6 +- src/backend/cdb/cdbgroupingpaths.c | 149 +- src/backend/cdb/cdbllize.c | 23 +- src/backend/cdb/cdbmutate.c | 14 + src/backend/cdb/cdbpath.c | 1393 +++++++++++- src/backend/cdb/cdbpathlocus.c | 432 +++- src/backend/cdb/cdbpathtoplan.c | 7 + src/backend/cdb/cdbsetop.c | 22 +- 
src/backend/cdb/cdbvarblock.c | 42 +- src/backend/cdb/dispatcher/cdbdisp.c | 27 + src/backend/cdb/dispatcher/cdbgang.c | 2 +- src/backend/cdb/endpoint/cdbendpoint.c | 1 + src/backend/cdb/motion/cdbmotion.c | 19 +- src/backend/cdb/motion/tupleremap.c | 2 +- src/backend/cdb/test/cdbbufferedread_test.c | 8 +- src/backend/commands/Makefile | 2 +- src/backend/commands/alter.c | 1 + src/backend/commands/analyze.c | 61 + src/backend/commands/createas.c | 12 +- src/backend/commands/event_trigger.c | 1 + src/backend/commands/explain.c | 84 +- src/backend/commands/explain_gp.c | 35 +- src/backend/commands/matview.c | 15 +- src/backend/commands/tablecmds.c | 37 +- src/backend/commands/taskcmds.c | 304 +++ src/backend/commands/vacuum.c | 38 + src/backend/commands/vacuum_ao.c | 19 +- src/backend/crypto/Makefile | 20 + src/backend/crypto/README | 231 ++ src/backend/crypto/bufenc.c | 257 +++ src/backend/crypto/ckey_aws.sh.sample | 53 + src/backend/crypto/ckey_direct.sh.sample | 39 + src/backend/crypto/ckey_passphrase.sh.sample | 35 + src/backend/crypto/ckey_piv_nopin.sh.sample | 68 + src/backend/crypto/ckey_piv_pin.sh.sample | 81 + src/backend/crypto/kmgr.c | 445 ++++ src/backend/crypto/sm4.c | 464 ++++ src/backend/crypto/ssl_passphrase.sh.sample | 35 + src/backend/executor/execAmi.c | 2 + src/backend/executor/execMain.c | 19 +- src/backend/executor/execParallel.c | 215 +- src/backend/executor/execUtils.c | 36 +- src/backend/executor/nodeAppend.c | 13 + src/backend/executor/nodeBitmapHeapscan.c | 8 +- src/backend/executor/nodeBitmapIndexscan.c | 19 + src/backend/executor/nodeHash.c | 32 +- src/backend/executor/nodeHashjoin.c | 51 +- src/backend/executor/nodeIndexonlyscan.c | 38 +- src/backend/executor/nodeIndexscan.c | 46 +- src/backend/executor/nodeMotion.c | 125 +- src/backend/executor/nodeSeqscan.c | 70 +- src/backend/executor/nodeShareInputScan.c | 3 +- src/backend/executor/nodeSort.c | 2 +- src/backend/libpq/be-secure-common.c | 14 + src/backend/main/main.c | 3 + src/backend/nodes/copyfuncs.c | 5 + src/backend/nodes/nodeFuncs.c | 20 + src/backend/nodes/outfuncs.c | 3 + src/backend/nodes/outfuncs_common.c | 2 + src/backend/nodes/print.c | 4 + src/backend/nodes/readfast.c | 1 + src/backend/nodes/readfuncs.c | 3 + src/backend/nodes/readfuncs_common.c | 2 + src/backend/optimizer/README.cbdb.parallel | 53 + src/backend/optimizer/geqo/geqo_eval.c | 2 + src/backend/optimizer/path/allpaths.c | 276 ++- src/backend/optimizer/path/costsize.c | 45 +- src/backend/optimizer/path/indxpath.c | 2 +- src/backend/optimizer/path/joinpath.c | 145 +- src/backend/optimizer/plan/createplan.c | 118 +- src/backend/optimizer/plan/joinpartprune.c | 2 + src/backend/optimizer/plan/planmain.c | 3 +- src/backend/optimizer/plan/planner.c | 417 +++- src/backend/optimizer/plan/setrefs.c | 22 +- src/backend/optimizer/plan/subselect.c | 53 +- src/backend/optimizer/prep/prepunion.c | 7 +- src/backend/optimizer/util/pathnode.c | 505 ++++- src/backend/optimizer/util/walkers.c | 1 + src/backend/parser/gram.y | 163 +- src/backend/postmaster/bgworker.c | 7 + src/backend/postmaster/postmaster.c | 23 +- src/backend/replication/basebackup.c | 4 + src/backend/replication/logical/decode.c | 1 + src/backend/storage/buffer/bufmgr.c | 155 +- src/backend/storage/buffer/localbuf.c | 8 +- src/backend/storage/file/copydir.c | 33 +- src/backend/storage/file/reinit.c | 3 +- src/backend/storage/ipc/ipci.c | 10 + src/backend/storage/ipc/procarray.c | 29 +- src/backend/storage/lmgr/lock.c | 45 + src/backend/storage/lmgr/lwlocknames.txt | 2 + 
src/backend/storage/lmgr/proc.c | 34 + src/backend/storage/page/bufpage.c | 51 +- src/backend/task/Makefile | 23 + src/backend/task/entry.c | 447 ++++ src/backend/task/job_metadata.c | 832 +++++++ src/backend/task/misc.c | 164 ++ src/backend/task/pg_cron.c | 2008 +++++++++++++++++ src/backend/task/task_states.c | 180 ++ src/backend/tcop/postgres.c | 34 +- src/backend/tcop/utility.c | 28 + src/backend/utils/activity/wait_event.c | 9 + src/backend/utils/adt/dbsize.c | 38 + src/backend/utils/cache/plancache.c | 17 +- src/backend/utils/cache/syscache.c | 11 + src/backend/utils/cache/typcache.c | 25 +- src/backend/utils/datumstream/datumstream.c | 85 +- .../utils/datumstream/datumstreamblock.c | 111 +- src/backend/utils/init/postinit.c | 11 +- src/backend/utils/misc/guc.c | 61 +- src/backend/utils/misc/guc_gp.c | 163 ++ src/backend/utils/misc/pg_controldata.c | 13 +- src/backend/utils/misc/postgresql.conf.sample | 5 + src/backend/utils/time/combocid.c | 79 +- src/backend/utils/time/snapmgr.c | 121 +- src/bin/Makefile | 1 + src/bin/initdb/initdb.c | 127 +- src/bin/pg_alterckey/.gitignore | 1 + src/bin/pg_alterckey/Makefile | 38 + src/bin/pg_alterckey/README | 24 + src/bin/pg_alterckey/pg_alterckey.c | 788 +++++++ src/bin/pg_controldata/pg_controldata.c | 3 + src/bin/pg_ctl/pg_ctl.c | 59 +- src/bin/pg_resetwal/pg_resetwal.c | 3 + src/bin/pg_rewind/filemap.c | 8 + src/bin/pg_upgrade/check.c | 35 + src/bin/pg_upgrade/controldata.c | 42 +- src/bin/pg_upgrade/file.c | 2 + src/bin/pg_upgrade/option.c | 7 +- src/bin/pg_upgrade/pg_upgrade.h | 7 + src/bin/psql/tab-complete.c | 23 + src/common/Makefile | 3 + src/common/cipher.c | 98 + src/common/cipher_openssl.c | 419 ++++ src/common/kmgr_utils.c | 469 ++++ src/include/Makefile | 2 +- src/include/access/appendonlywriter.h | 24 + src/include/access/gist.h | 5 +- src/include/access/parallel.h | 49 + src/include/access/relscan.h | 11 +- src/include/access/session.h | 2 +- src/include/access/tableam.h | 32 +- src/include/access/xlog.h | 20 +- src/include/access/xloginsert.h | 2 + src/include/catalog/catalog.h | 1 + src/include/catalog/dependency.h | 5 +- src/include/catalog/pg_appendonly.h | 4 + src/include/catalog/pg_control.h | 4 + src/include/catalog/pg_proc.dat | 18 +- src/include/catalog/pg_task.h | 58 + src/include/catalog/pg_task_run_history.h | 62 + src/include/cdb/cdbaocsam.h | 11 +- src/include/cdb/cdbappendonlyam.h | 11 +- src/include/cdb/cdbappendonlystorageread.h | 5 +- src/include/cdb/cdbbufferedread.h | 6 +- src/include/cdb/cdbgroupingpaths.h | 3 +- src/include/cdb/cdbllize.h | 1 - src/include/cdb/cdbmutate.h | 1 + src/include/cdb/cdbpath.h | 37 +- src/include/cdb/cdbpathlocus.h | 111 +- src/include/cdb/cdbvarblock.h | 12 +- src/include/cdb/cdbvars.h | 3 + src/include/commands/explain.h | 1 + src/include/commands/taskcmds.h | 28 + src/include/commands/vacuum.h | 10 + src/include/common/cipher.h | 74 + src/include/common/kmgr_utils.h | 96 + src/include/crypto/bufenc.h | 34 + src/include/crypto/kmgr.h | 27 + src/include/crypto/sm4.h | 62 + src/include/executor/execdesc.h | 6 + src/include/executor/hashjoin.h | 2 + src/include/executor/nodeAppend.h | 2 + src/include/nodes/execnodes.h | 12 + src/include/nodes/nodes.h | 3 + src/include/nodes/parsenodes.h | 32 + src/include/nodes/pathnodes.h | 3 + src/include/nodes/plannodes.h | 14 + src/include/nodes/primnodes.h | 1 + src/include/optimizer/pathnode.h | 3 +- src/include/optimizer/paths.h | 5 +- src/include/parser/kwlist.h | 3 + src/include/postmaster/bgworker.h | 1 + 
src/include/postmaster/postmaster.h | 4 +- src/include/storage/bufpage.h | 15 +- src/include/storage/copydir.h | 2 +- src/include/storage/lock.h | 7 + src/include/storage/lwlock.h | 3 + src/include/storage/proc.h | 16 + src/include/storage/procarray.h | 3 + src/include/storage/shmem.h | 2 +- src/include/task/bitstring.h | 122 + src/include/task/cron.h | 296 +++ src/include/task/job_metadata.h | 66 + src/include/task/pg_cron.h | 36 + src/include/task/task_states.h | 67 + src/include/tcop/cmdtaglist.h | 3 + src/include/utils/datumstream.h | 6 +- src/include/utils/datumstreamblock.h | 25 +- src/include/utils/guc.h | 11 + src/include/utils/guc_tables.h | 3 + src/include/utils/snapmgr.h | 1 + src/include/utils/sync_guc_name.h | 3 + src/include/utils/syscache.h | 1 + src/include/utils/typcache.h | 2 + src/include/utils/unsync_guc_name.h | 13 + src/include/utils/wait_event.h | 3 + src/test/Makefile | 9 + src/test/README | 3 + src/test/crypto/.gitignore | 4 + src/test/crypto/KWP_AD_128.txt | 35 + src/test/crypto/KWP_AD_256.txt | 35 + src/test/crypto/KWP_AE_128.txt | 35 + src/test/crypto/KWP_AE_256.txt | 35 + src/test/crypto/Makefile | 39 + src/test/crypto/README | 33 + src/test/crypto/gcmDecrypt128.rsp | 129 ++ src/test/crypto/gcmDecrypt256.rsp | 129 ++ src/test/crypto/gcmEncryptExtIV128.rsp | 129 ++ src/test/crypto/gcmEncryptExtIV256.rsp | 129 ++ src/test/crypto/t/001_testcrypto.pl | 137 ++ src/test/crypto/t/002_testkwp.pl | 126 ++ src/test/crypto/t/003_clusterkey.pl | 93 + src/test/crypto/t/004_buffers.pl | 157 ++ src/test/crypto/testcrypto.c | 545 +++++ src/test/isolation2/Makefile | 7 + .../expected/crash_recovery_dtm.out | 13 +- src/test/isolation2/expected/gpdispatch.out | 2 + src/test/isolation2/expected/gpdispatch_1.out | 2 + .../parallel_retrieve_cursor/explain.source | 8 +- .../parallel_retrieve_cursor/explain.source | 8 +- .../isolation2/sql/crash_recovery_dtm.sql | 7 +- src/test/isolation2/sql/gpdispatch.sql | 1 + src/test/regress/GNUmakefile | 7 + src/test/regress/atmsort.pm | 6 +- src/test/regress/expected/aggregates.out | 8 +- .../expected/alter_distribution_policy.out | 117 + src/test/regress/expected/bfv_dd.out | 1 + .../regress/expected/bfv_dd_optimizer.out | 3 + .../regress/expected/bfv_partition_plans.out | 4 + .../bfv_partition_plans_optimizer.out | 4 + src/test/regress/expected/explain.out | 6 + src/test/regress/expected/explain_format.out | 15 +- .../expected/explain_format_optimizer.out | 9 +- .../regress/expected/explain_optimizer.out | 6 + src/test/regress/expected/gp_aggregates.out | 2 + src/test/regress/expected/gp_parallel.out | 1549 +++++++++++++ src/test/regress/expected/gporca.out | 88 +- .../regress/expected/gporca_optimizer.out | 88 +- src/test/regress/expected/guc_gp.out | 46 + .../regress/expected/incremental_sort.out | 4 + .../expected/incremental_sort_optimizer.out | 8 +- src/test/regress/expected/insert.out | 2 +- src/test/regress/expected/misc_sanity.out | 13 +- .../expected/misc_sanity_external_fts.out | 13 +- src/test/regress/expected/partition_prune.out | 2 + .../expected/partition_prune_optimizer.out | 2 + src/test/regress/expected/pg_stat.out | 2 + src/test/regress/expected/qp_misc.out | 2 + .../regress/expected/qp_query_execution.out | 1 + .../regress/expected/qp_targeted_dispatch.out | 2 + .../qp_targeted_dispatch_optimizer.out | 2 + src/test/regress/expected/rangefuncs_cdb.out | 6 +- src/test/regress/expected/sanity_check.out | 2 + src/test/regress/expected/segspace.out | 43 + src/test/regress/expected/select_parallel.out | 358 +-- 
.../expected/select_parallel_optimizer.out | 1252 ---------- src/test/regress/expected/shared_scan.out | 78 +- .../expected/shared_scan_optimizer.out | 78 +- src/test/regress/expected/stats_ext.out | 2 + .../regress/expected/stats_ext_optimizer.out | 2 + src/test/regress/expected/subselect.out | 4 +- .../regress/expected/subselect_optimizer.out | 4 +- src/test/regress/expected/sysviews.out | 3 +- src/test/regress/expected/task.out | 83 + src/test/regress/expected/with_clause.out | 2 +- .../expected/workfile/hashagg_spill.out | 7 +- .../expected/workfile/materialize_spill.out | 4 + .../expected/workfile/sisc_mat_sort.out | 2 + .../expected/workfile/sisc_sort_spill.out | 2 + src/test/regress/expected/write_parallel.out | 16 +- src/test/regress/greenplum_schedule | 3 + src/test/regress/init_file | 9 + src/test/regress/input/dispatch.source | 2 + .../regress/input/temp_tablespaces.source | 5 + src/test/regress/output/dispatch.source | 2 + .../regress/output/temp_tablespaces.source | 5 + src/test/regress/parallel_schedule | 5 + src/test/regress/pg_regress.c | 7 +- src/test/regress/serial_schedule | 1 + .../regress/sql/alter_distribution_policy.sql | 77 + src/test/regress/sql/bfv_dd.sql | 19 + src/test/regress/sql/bfv_dd_multicolumn.sql | 2 + src/test/regress/sql/bfv_partition_plans.sql | 4 + src/test/regress/sql/explain.sql | 4 + src/test/regress/sql/explain_format.sql | 6 +- src/test/regress/sql/gp_aggregates.sql | 2 + src/test/regress/sql/gp_parallel.sql | 471 ++++ src/test/regress/sql/gporca.sql | 4 +- src/test/regress/sql/guc_gp.sql | 21 + src/test/regress/sql/incremental_sort.sql | 4 + src/test/regress/sql/insert.sql | 2 +- src/test/regress/sql/partition_prune.sql | 2 + src/test/regress/sql/pg_stat.sql | 2 + src/test/regress/sql/qp_dml_joins.sql | 2 + src/test/regress/sql/qp_misc.sql | 2 + src/test/regress/sql/qp_misc_rio.sql | 3 +- src/test/regress/sql/qp_query_execution.sql | 1 + src/test/regress/sql/qp_targeted_dispatch.sql | 4 + src/test/regress/sql/query_finish_pending.sql | 6 + src/test/regress/sql/rangefuncs_cdb.sql | 6 +- src/test/regress/sql/segspace.sql | 22 + src/test/regress/sql/select_parallel.sql | 15 +- src/test/regress/sql/shared_scan.sql | 31 +- src/test/regress/sql/stats_ext.sql | 2 + src/test/regress/sql/subselect.sql | 2 + src/test/regress/sql/sysviews.sql | 3 +- src/test/regress/sql/task.sql | 65 + .../regress/sql/workfile/hashagg_spill.sql | 7 +- .../sql/workfile/materialize_spill.sql | 4 + .../regress/sql/workfile/sisc_mat_sort.sql | 2 + .../regress/sql/workfile/sisc_sort_spill.sql | 2 + src/test/regress/sql/write_parallel.sql | 7 +- src/tools/msvc/Mkvcbuild.pm | 4 +- 398 files changed, 22038 insertions(+), 2553 deletions(-) create mode 100644 doc/src/sgml/database-encryption.sgml create mode 100644 doc/src/sgml/ref/pg_alterckey.sgml create mode 100644 src/backend/catalog/pg_task.c create mode 100644 src/backend/catalog/pg_task_run_history.c create mode 100644 src/backend/commands/taskcmds.c create mode 100644 src/backend/crypto/Makefile create mode 100644 src/backend/crypto/README create mode 100644 src/backend/crypto/bufenc.c create mode 100644 src/backend/crypto/ckey_aws.sh.sample create mode 100644 src/backend/crypto/ckey_direct.sh.sample create mode 100644 src/backend/crypto/ckey_passphrase.sh.sample create mode 100644 src/backend/crypto/ckey_piv_nopin.sh.sample create mode 100644 src/backend/crypto/ckey_piv_pin.sh.sample create mode 100644 src/backend/crypto/kmgr.c create mode 100644 src/backend/crypto/sm4.c create mode 100644 
src/backend/crypto/ssl_passphrase.sh.sample create mode 100644 src/backend/optimizer/README.cbdb.parallel create mode 100644 src/backend/task/Makefile create mode 100644 src/backend/task/entry.c create mode 100644 src/backend/task/job_metadata.c create mode 100644 src/backend/task/misc.c create mode 100644 src/backend/task/pg_cron.c create mode 100644 src/backend/task/task_states.c create mode 100644 src/bin/pg_alterckey/.gitignore create mode 100644 src/bin/pg_alterckey/Makefile create mode 100644 src/bin/pg_alterckey/README create mode 100644 src/bin/pg_alterckey/pg_alterckey.c create mode 100644 src/common/cipher.c create mode 100644 src/common/cipher_openssl.c create mode 100644 src/common/kmgr_utils.c create mode 100644 src/include/catalog/pg_task.h create mode 100644 src/include/catalog/pg_task_run_history.h create mode 100644 src/include/commands/taskcmds.h create mode 100644 src/include/common/cipher.h create mode 100644 src/include/common/kmgr_utils.h create mode 100644 src/include/crypto/bufenc.h create mode 100644 src/include/crypto/kmgr.h create mode 100644 src/include/crypto/sm4.h create mode 100644 src/include/task/bitstring.h create mode 100644 src/include/task/cron.h create mode 100644 src/include/task/job_metadata.h create mode 100644 src/include/task/pg_cron.h create mode 100644 src/include/task/task_states.h create mode 100644 src/test/crypto/.gitignore create mode 100644 src/test/crypto/KWP_AD_128.txt create mode 100644 src/test/crypto/KWP_AD_256.txt create mode 100644 src/test/crypto/KWP_AE_128.txt create mode 100644 src/test/crypto/KWP_AE_256.txt create mode 100644 src/test/crypto/Makefile create mode 100644 src/test/crypto/README create mode 100644 src/test/crypto/gcmDecrypt128.rsp create mode 100644 src/test/crypto/gcmDecrypt256.rsp create mode 100644 src/test/crypto/gcmEncryptExtIV128.rsp create mode 100644 src/test/crypto/gcmEncryptExtIV256.rsp create mode 100644 src/test/crypto/t/001_testcrypto.pl create mode 100644 src/test/crypto/t/002_testkwp.pl create mode 100644 src/test/crypto/t/003_clusterkey.pl create mode 100644 src/test/crypto/t/004_buffers.pl create mode 100644 src/test/crypto/testcrypto.c create mode 100644 src/test/regress/expected/gp_parallel.out delete mode 100644 src/test/regress/expected/select_parallel_optimizer.out create mode 100644 src/test/regress/expected/task.out create mode 100644 src/test/regress/sql/gp_parallel.sql create mode 100644 src/test/regress/sql/task.sql diff --git a/concourse/scripts/dumpdb.bash b/concourse/scripts/dumpdb.bash index 0b525ce24c3..fdb27ea1c0e 100755 --- a/concourse/scripts/dumpdb.bash +++ b/concourse/scripts/dumpdb.bash @@ -7,6 +7,8 @@ INSTALL_DIR=${INSTALL_DIR:-/usr/local/cloudberry-db-devel} source $INSTALL_DIR/greenplum_path.sh source ./gpdb_src/gpAux/gpdemo/gpdemo-env.sh +# ignore ERR trap +gpstop -qa || : gpstart -a sleep 60 ./gpdb_src/concourse/scripts/ic_start_fts_once.bash diff --git a/contrib/bloom/blinsert.c b/contrib/bloom/blinsert.c index c34a640d1c4..510c1687f71 100644 --- a/contrib/bloom/blinsert.c +++ b/contrib/bloom/blinsert.c @@ -176,6 +176,8 @@ blbuildempty(Relation index) * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record. Therefore, we need * this even when wal_level=minimal. 
*/ + PageEncryptInplace(metapage, INIT_FORKNUM, + BLOOM_METAPAGE_BLKNO); PageSetChecksumInplace(metapage, BLOOM_METAPAGE_BLKNO); smgrwrite(index->rd_smgr, INIT_FORKNUM, BLOOM_METAPAGE_BLKNO, (char *) metapage, true); diff --git a/contrib/bloom/blscan.c b/contrib/bloom/blscan.c index 569f1503e58..729f1fdad5c 100644 --- a/contrib/bloom/blscan.c +++ b/contrib/bloom/blscan.c @@ -97,7 +97,7 @@ blgetbitmap(IndexScanDesc scan, Node **bmNodeP) if (*bmNodeP == NULL) { /* XXX should we use less than work_mem for this? */ - tbm = tbm_create(work_mem * 1024L, NULL); + tbm = tbm_create(work_mem * 1024L, scan->dsa); *bmNodeP = (Node *) tbm; } else if (!IsA(*bmNodeP, TIDBitmap)) diff --git a/deploy/cbdb_deploy.sh b/deploy/cbdb_deploy.sh index c62feafceab..94f441f831e 100755 --- a/deploy/cbdb_deploy.sh +++ b/deploy/cbdb_deploy.sh @@ -55,7 +55,7 @@ function cbdb_build() { #do compile configuration echo "[CBDB build] start to init configuration for code compile..." - CFLAGS=-O0 CXXFLAGS='-O0 -std=c++14' ./configure --prefix=${install_dir}/cbdb --enable-debug --enable-cassert --enable-tap-tests --with-gssapi --with-libxml --with-quicklz --with-pythonsrc-ext + CFLAGS=-O0 CXXFLAGS='-O0 -std=c++14' ./configure --prefix=${install_dir}/cbdb --enable-debug --enable-cassert --enable-tap-tests --with-gssapi --with-libxml --with-quicklz --with-pythonsrc-ext --with-openssl #do compile echo "[CBDB build] start to compile binary file..." diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index e83361b23fd..7eaa9cad9b1 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1542,19 +1542,31 @@ include_dir 'conf.d' mechanism is used. - The command must print the passphrase to the standard output and exit - with code 0. In the parameter value, %p is - replaced by a prompt string. (Write %% for a - literal %.) Note that the prompt string will - probably contain whitespace, so be sure to quote adequately. A single - newline is stripped from the end of the output if present. + The command must print the passphrase to the standard output + and exit with code 0. It can prompt from the terminal if + is used. In the parameter + value, %R is replaced by a file descriptor + number opened to the terminal that started the server. A file + descriptor is only available if enabled at server start via + . If %R is specified and + no file descriptor is available, the server will not start. Value + %p is replaced by a pre-defined prompt string. + (Write %% for a literal %.) + Note that the prompt string will probably contain whitespace, + so be sure to quote its use adequately. Newlines are stripped + from the end of the output if present. + - The command does not actually have to prompt the user for a - passphrase. It can read it from a file, obtain it from a keychain - facility, or similar. It is up to the user to make sure the chosen - mechanism is adequately secure. + Sample scripts can be found in + $SHAREDIR/auth_commands, + where $SHAREDIR means the + PostgreSQL installation's shared-data + directory, often /usr/local/share/postgresql + (use pg_config --sharedir to determine it if + you're not sure). + This parameter can only be set in the postgresql.conf file or on the server command line. @@ -1576,10 +1588,12 @@ include_dir 'conf.d' parameter is off (the default), then ssl_passphrase_command will be ignored during a reload and the SSL configuration will not be reloaded if a passphrase - is needed.
That setting is appropriate for a command that requires a - TTY for prompting, which might not be available when the server is - running. Setting this parameter to on might be appropriate if the - passphrase is obtained from a file, for example. + is needed. This setting is appropriate for a command that requires a + terminal for prompting, which will likely not be available when the server is + running. ( closes the terminal file + descriptor soon after server start.) Setting this parameter to on + might be appropriate, for example, if the passphrase is obtained + from a file. This parameter can only be set in the postgresql.conf @@ -2775,7 +2789,7 @@ include_dir 'conf.d' Note that changing wal_level to minimal makes any base backups taken before unavailable for archive recovery and standby server, which may - lead to data loss. + lead to data loss. Cluster file encryption also does not support a wal_level of minimal. In logical level, the same information is logged as @@ -3129,9 +3143,10 @@ include_dir 'conf.d' - If data checksums are enabled, hint bit updates are always WAL-logged - and this setting is ignored. You can use this setting to test how much - extra WAL-logging would occur if your database had data checksums + If data checksums or cluster file encryption is enabled, + hint bit updates are always WAL-logged and this setting is + ignored. You can use this setting to test how much extra + WAL-logging would occur if your database had data checksums enabled. @@ -8048,6 +8063,64 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv; + + Cluster File Encryption + + + + cluster_key_command (string) + + cluster_key_command configuration parameter + + + + + This option specifies an external command to obtain the cluster-level + key for cluster file encryption during server initialization and + server start. + + + The command must print the cluster key to the standard + output as 64 hexadecimal characters, and exit with code 0. + The command can prompt for the passphrase or PIN from the + terminal if is used. In the + parameter value, %R is replaced by a file + descriptor number opened to the terminal that started the server. + A file descriptor is only available if enabled at server start + via . If %R is specified + and no file descriptor is available, the server will not start. + Value %p is replaced by a pre-defined + prompt string. Value %d is replaced by the + directory containing the keys; this is useful if the command + must create files with the keys, e.g., to store a cluster-level + key encrypted by a key stored in a hardware security module. + (Write %% for a literal %.) + Note that the prompt string will probably contain whitespace, + so be sure to quote its use adequately. Newlines are stripped + from the end of the output if present. + + + + A sample script can be found in + $SHAREDIR/auth_commands, + where $SHAREDIR means the + PostgreSQL installation's shared-data + directory, often /usr/local/share/postgresql + (use pg_config --sharedir to determine it if + you're not sure). A minimal sketch of such a script appears below. + + + + This parameter can only be set by + initdb, in the + postgresql.conf file, or on the server + command line. + + + + + + Client Connection Defaults @@ -10068,6 +10141,22 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir' + + file_encryption_method (string) + + Cluster file encryption method + + + + + Reports the cluster file + encryption method. See for more + information.
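As a concrete illustration of the cluster_key_command contract described above, here is a minimal sketch of a key-retrieval script. The script name and key file path are hypothetical; the ckey_*.sh.sample files installed under $SHAREDIR/auth_commands are the authoritative starting points. The script must print exactly 64 hexadecimal characters to standard output and exit with status 0:

#!/bin/sh
# ckey_file.sh -- hypothetical cluster_key_command
# Assumes the cluster key was previously stored, as 64 hex characters,
# in a file readable only by the postgres user.
KEYFILE=/etc/postgresql/cluster.key    # hypothetical location
[ -r "$KEYFILE" ] || exit 1            # a non-zero exit prevents server startup
cat "$KEYFILE"                         # trailing newline is stripped by the server

The same print-to-standard-output, exit-0 contract applies to ssl_passphrase_command, except that it prints a passphrase rather than a hex key.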
+ + + + + data_directory_mode (integer) diff --git a/doc/src/sgml/database-encryption.sgml b/doc/src/sgml/database-encryption.sgml new file mode 100644 index 00000000000..c4377ada642 --- /dev/null +++ b/doc/src/sgml/database-encryption.sgml @@ -0,0 +1,149 @@ + + + + Cluster File Encryption + + + Cluster File Encryption + + + + The purpose of cluster file encryption is to prevent users with read + access on the directories used to store database files and write-ahead + log files from being able to access the data stored in those files. + For example, when using cluster file encryption, users who have read + access to the cluster directories for backup purposes will not be able + to decrypt the data stored in these files. Read-only access for a group + of users can be enabled using the initdb + option. Cluster file encryption + also provides data-at-rest security, protecting users from data loss + should the physical storage media be stolen or improperly erased before + disposal. + + + + Cluster file encryption does not protect against unauthorized file + system writes. Such writes can allow data decryption if used to weaken + the system's security and the weakened system is later supplied with + the externally-stored cluster encryption key. This also does not always + detect if users with write access remove or modify database files. + + + + This also does not protect against users who have read access to database + process memory because all in-memory data pages and data encryption keys + are stored unencrypted in memory. Therefore, an attacker who is able + to read memory can read the data encryption keys and decrypt the entire + cluster. The Postgres operating system user and the operating system + administrator, e.g., the root user, have such access. + + + + Keys + + + Cluster file encryption uses two levels of encryption — an upper + key which encrypts lower-level keys. The upper-level key is often + referred to as a Key Encryption Key (KEK). This key + is not stored in the file system, but provided at + initdb time and each time the server is started. This + key can be easily changed via pg_alterckey without + requiring any changes to the data files or WAL + files. + + + + The lower-level keys are data encryption keys, specifically for relations + and WAL. The relation key is used to encrypt database + heap and index files. The WAL key is used to encrypt write-ahead log + (WAL) files. Two different keys are used so that primary and standby + servers can use different relation keys, but the same WAL key, so that + these keys can (in a future release) be rotated by switching the + primary to the standby and then changing the WAL key. Eventually, + it will be possible to add encryption to non-encrypted clusters by creating + encrypted replicas and switching over to them. + + + + Postgres stores the data encryption (lower-level) keys in the data + directory encrypted (wrapped) by the key encryption (upper-level) key. + Though the data encryption keys technically exist in the file system, + the key encryption key does not, so the data encryption keys are + securely stored. Data encryption keys are used to securely encrypt + other database files. + + + + + Initialization + + + Cluster file encryption is enabled when + PostgreSQL is built + with --with-openssl and is specified + during initdb. The cluster key + provided by the + option during initdb and the one generated + by in the + postgresql.conf must match for the database + cluster to start.
Note that the cluster key command + passed to initdb must return a key of + 64 hexadecimal characters. For example: +initdb -D datadir --cluster-key-command='ckey_passphrase.sh' + Cluster file encryption does not support a wal_level + of minimal. + + + + Operation + + + During the initdb process, if + is specified, two data-level + encryption keys are created. These two keys are then encrypted with + the key encryption key (KEK) supplied by the cluster key command before + being stored in the database directory. The key or passphrase that + derives the key must be supplied from the terminal or stored in a + trusted key store, such as key vault software or a hardware security + module. + + + + If the PostgreSQL server has + been initialized to require a cluster key, each time the + server starts the postgresql.conf + cluster_key_command command will be executed + and the cluster key retrieved. The data encryption keys in the + pg_cryptokeys directory will then be decrypted + using the supplied key, which is integrity-checked to ensure it matches the + initdb-supplied key. (If this check fails, the server will refuse + to start.) The cluster encryption key will then be removed from + system memory. The decrypted data encryption keys will remain in + shared memory until the server is stopped. + + + + The data encryption keys are randomly generated and can be 128, 192, + or 256 bits in length, depending on whether AES128, + AES192, or AES256 is specified. + They are encrypted by the key encryption key (KEK) using Advanced + Encryption Standard (AES256) encryption in Key + Wrap Padded Mode, which also provides KEK authentication; see RFC 5649. While + 128-bit encryption is sufficient for most sites, 256-bit encryption + is thought to be more immune to future quantum cryptographic attacks. + + + + If you prefer to create the random keys on your own, you can create + an empty directory with a pg_cryptokeys/live + subdirectory, generate the keys there using your tools, and use the + initdb + to copy those keys into the newly-created cluster. + + + diff --git a/doc/src/sgml/filelist.sgml b/doc/src/sgml/filelist.sgml index 459add557bc..ac4c60c46de 100644 --- a/doc/src/sgml/filelist.sgml +++ b/doc/src/sgml/filelist.sgml @@ -49,6 +49,7 @@ + diff --git a/doc/src/sgml/installation.sgml b/doc/src/sgml/installation.sgml index 3b0b55fa096..2a5437ce59a 100644 --- a/doc/src/sgml/installation.sgml +++ b/doc/src/sgml/installation.sgml @@ -1004,7 +1004,7 @@ build-postgresql: Build with support for SSL (encrypted) - connections. The only LIBRARY + connections and cluster file encryption. The only LIBRARY supported is . This requires the OpenSSL package to be installed. configure will check for the required diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 9b43feb8557..9c67c9d1c50 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -1318,6 +1318,19 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser DataFileWrite Waiting for a write to a relation data file. + + KeyFileRead + Waiting for a read of the wrapped data encryption keys. + + + KeyFileWrite + Waiting for a write of the wrapped data encryption keys. + + + KeyFileSync + Waiting for changes to the wrapped data encryption keys to reach + durable storage.
+ LockFileAddToDataDirRead Waiting for a read while adding a line to the data directory lock diff --git a/doc/src/sgml/postgres.sgml b/doc/src/sgml/postgres.sgml index dba9cf413f9..2a3b8f29d4b 100644 --- a/doc/src/sgml/postgres.sgml +++ b/doc/src/sgml/postgres.sgml @@ -171,6 +171,7 @@ break is not needed in a wider output rendering. &wal; &logical-replication; &jit; + &database-encryption; &regress; diff --git a/doc/src/sgml/ref/allfiles.sgml b/doc/src/sgml/ref/allfiles.sgml index c0c3d1d7419..8063efae616 100644 --- a/doc/src/sgml/ref/allfiles.sgml +++ b/doc/src/sgml/ref/allfiles.sgml @@ -199,6 +199,7 @@ Complete list of usable sgml source files in this directory. + diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml index f63a9aef558..6dac9994633 100644 --- a/doc/src/sgml/ref/initdb.sgml +++ b/doc/src/sgml/ref/initdb.sgml @@ -163,6 +163,18 @@ PostgreSQL documentation + + + + + + This option specifies an external command to obtain the cluster-level + key for cluster file encryption during server initialization and + server start; see for details. + + + + @@ -224,6 +236,18 @@ PostgreSQL documentation + + + + + + Specifies the cluster file encryption method. The + valid values are AES128 (the default), AES192, and AES256. + + + + @@ -299,6 +323,17 @@ PostgreSQL documentation + + + + + + Allows the command + to prompt for a passphrase or PIN. + + + + @@ -321,6 +356,18 @@ PostgreSQL documentation + + + + + + Copies cluster file encryption keys from another cluster; required + when using pg_upgrade on a cluster + with cluster file encryption enabled. + + + + diff --git a/doc/src/sgml/ref/pg_alterckey.sgml b/doc/src/sgml/ref/pg_alterckey.sgml new file mode 100644 index 00000000000..867c439b8ec --- /dev/null +++ b/doc/src/sgml/ref/pg_alterckey.sgml @@ -0,0 +1,210 @@ + + + + + pg_alterckey + + + + pg_alterckey + 1 + Application + + + + pg_alterckey + alter the PostgreSQL cluster key + + + + + pg_alterckey + + + + + + + + old_cluster_key_command + new_cluster_key_command + + + + + + + + datadir + + + + + + pg_alterckey + + + + + + + + + + + + + + + + + datadir + + + + + + + Description + + pg_alterckey alters the cluster key used + for cluster file encryption. The cluster key is initially set + during . The command can be run while the + server is running or stopped. The new cluster key must be used the next + time the server is started. + + + + pg_alterckey changes the key encryption key + (KEK) which encrypts the data encryption keys; + it does not change the data encryption keys. It does this by + decrypting each data encryption key using the old_cluster_key_command, + re-encrypting it using the new_cluster_key_command, and then + writing the result back to the cluster directory. + + + + See the documentation for how to define + the old and new passphrase commands. You can use different executables + for these commands, or you can use the same executable with different + arguments to specify retrieval of the old or new key. + + + + pg_alterckey manages data encryption keys, + which are critical to allowing Postgres to access its decrypted + data. For this reason, it is very careful to preserve these + keys under most failure conditions, e.g., operating system + failure during cluster encryption key rotation. + + + + When started, pg_alterckey repairs any files that + remain from previous failures before altering the cluster encryption + key.
During this repair phase, pg_alterckey will + either roll back the cluster key or roll forward the changes that + were previously requested. The server will not start if repair is + needed, though a running server is unaffected by an unrepaired cluster + key configuration. Therefore, if pg_alterckey + fails for any reason, it is recommended that you run the command with + to simply roll back or forward any previous + changes. It will report whether it rolled the cluster key back or forward; + you can then run the command again to change the cluster key if needed. + + + + You can specify the data directory on the command line, or use + the environment variable PGDATA. + + + + + Options + + + + + + + + + Allows the and + commands + to prompt for a passphrase or PIN. + + + + + + + + Other options: + + + + + + + + Print the pg_alterckey version and exit. + + + + + + + + + + Show help about pg_alterckey command line + arguments, and exit. + + + + + + + + + + + Environment + + + + PGDATA + + + + Default data directory location + + + + + + PG_COLOR + + + Specifies whether to use color in diagnostic messages. Possible values + are always, auto and + never. + + + + + + + + See Also + + + + + + + diff --git a/doc/src/sgml/ref/pg_ctl-ref.sgml b/doc/src/sgml/ref/pg_ctl-ref.sgml index 3946fa52eab..0662ae051a3 100644 --- a/doc/src/sgml/ref/pg_ctl-ref.sgml +++ b/doc/src/sgml/ref/pg_ctl-ref.sgml @@ -38,6 +38,7 @@ PostgreSQL documentation options path + @@ -72,6 +73,7 @@ PostgreSQL documentation seconds options + @@ -373,6 +375,18 @@ PostgreSQL documentation + + + + + + Allows or + to prompt for a passphrase + or PIN. + + + + diff --git a/doc/src/sgml/ref/pgupgrade.sgml b/doc/src/sgml/ref/pgupgrade.sgml index 301167c0525..ac2d77f0913 100644 --- a/doc/src/sgml/ref/pgupgrade.sgml +++ b/doc/src/sgml/ref/pgupgrade.sgml @@ -166,6 +166,15 @@ PostgreSQL documentation + + + + allows or + to prompt for a passphrase + or PIN. + + + dir dir @@ -293,7 +302,9 @@ make prefix=/usr/local/pgsql.new install Again, use compatible initdb flags that match the old cluster. Many prebuilt installers do this step automatically. There is no need to - start the new cluster. + start the new cluster. If upgrading a cluster that uses + cluster file encryption, the initdb option + must be specified. + + + @@ -833,6 +844,13 @@ psql --username=postgres --file=script.sql postgres is down. + + If the old cluster uses file encryption, the new cluster must use + the same keys, so pg_upgrade copies them to the + new cluster. It is necessary to initialize the new cluster with + the same cluster_key_command and the same + file encryption method. + diff --git a/doc/src/sgml/ref/postgres-ref.sgml b/doc/src/sgml/ref/postgres-ref.sgml index 4aaa7abe1a2..805da81e073 100644 --- a/doc/src/sgml/ref/postgres-ref.sgml +++ b/doc/src/sgml/ref/postgres-ref.sgml @@ -297,6 +297,19 @@ PostgreSQL documentation + + + + + Makes postgres prompt for a passphrase or PIN + from the specified open numeric file descriptor. The descriptor + is closed after the key is read. The file descriptor number + -1 duplicates standard error for the terminal; + this is useful for single-user mode.
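To illustrate how a key command can use the terminal file descriptor described above: if cluster_key_command is set to something like 'ckey_prompt.sh %R' (a hypothetical script name), the server substitutes the descriptor number for %R, and the script can both prompt and read on that descriptor. A hedged sketch, assuming bash and sha256sum are available (the shipped ckey_*.sh.sample scripts are the authoritative reference):

#!/bin/bash
# ckey_prompt.sh -- hypothetical; invoked as: ckey_prompt.sh %R
FD="$1"                  # terminal file descriptor substituted for %R
[ -n "$FD" ] || exit 1   # no terminal available; the server will not start
printf 'Enter cluster passphrase: ' >&"$FD"    # prompt on the terminal
IFS= read -r -u "$FD" PASSPHRASE               # read the reply from the same fd
# Reduce the passphrase to the required 64 hexadecimal characters;
# hashing with SHA-256 is one way to do this.
printf '%s' "$PASSPHRASE" | sha256sum | awk '{print $1}'

A script like this could equally serve as the old or new key command for pg_alterckey when its prompt option is used.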
+ + + + diff --git a/doc/src/sgml/reference.sgml b/doc/src/sgml/reference.sgml index da421ff24e2..dff7a426452 100644 --- a/doc/src/sgml/reference.sgml +++ b/doc/src/sgml/reference.sgml @@ -240,6 +240,7 @@ + &pgalterckey; &clusterdb; &createdb; &createuser; diff --git a/doc/src/sgml/storage.sgml b/doc/src/sgml/storage.sgml index 7136bbe7a32..96b373f990b 100644 --- a/doc/src/sgml/storage.sgml +++ b/doc/src/sgml/storage.sgml @@ -77,6 +77,11 @@ Item Subdirectory containing transaction commit timestamp data + + pg_cryptokeys + Subdirectory containing file encryption keys + + pg_dynshmem Subdirectory containing files used by the dynamic shared memory diff --git a/gpMgmt/bin/gpcheckcat b/gpMgmt/bin/gpcheckcat index 547bc230d92..3df54dec0ea 100755 --- a/gpMgmt/bin/gpcheckcat +++ b/gpMgmt/bin/gpcheckcat @@ -1529,8 +1529,8 @@ def checkTableInconsistentEntry(cat): if coordinator: return - # Skip gp_segment_configuration - if catname == "gp_segment_configuration": + # Skip gp_segment_configuration or pg_appendonly + if catname == "gp_segment_configuration" or catname == "pg_appendonly": return # skip shared/non-shared tables diff --git a/gpMgmt/bin/gpinitsystem b/gpMgmt/bin/gpinitsystem index 6338783fe83..1600497156d 100755 --- a/gpMgmt/bin/gpinitsystem +++ b/gpMgmt/bin/gpinitsystem @@ -177,6 +177,9 @@ USAGE () { $ECHO " 0 - No problems encountered with requested operation" $ECHO " 1 - Fatal error, instance not created/started, or in an inconsistent state," $ECHO " see log file for failure reason." + $ECHO " -T, cluster file encryption method, such as AES256, SM4" + $ECHO " -C, enable cluster file encryption and set command" + $ECHO " to obtain the cluster key" $ECHO exit $EXIT_STATUS fi @@ -1276,11 +1279,17 @@ CREATE_QD_DB () { # Since we explicitly set each LC_* setting, we do not need to pass a --locale option to initdb LC_ALL_SETTINGS=" $LC_COLLATE_SETTING $LC_CTYPE_SETTING $LC_MESSAGES_SETTING $LC_MONETARY_SETTING $LC_NUMERIC_SETTING $LC_TIME_SETTING" - # build initdb command cmd="$INITDB" cmd="$cmd -E $ENCODING" cmd="$cmd -D $GP_DIR" + if [ x"" != x"$ENCRYPT_METHOD" ]; then + cmd="$cmd -K $ENCRYPT_METHOD" + fi + if [ x"" != x"$CLUSTER_KEY_CMD" ]; then + cmd="$cmd -c $CLUSTER_KEY_CMD -R" + fi + cmd="$cmd $LC_ALL_SETTINGS" cmd="$cmd --max_connections=$COORDINATOR_MAX_CONNECT" cmd="$cmd --shared_buffers=$COORDINATOR_SHARED_BUFFERS" @@ -1449,6 +1458,8 @@ CREATE_SEGMENT () { export QE_MAX_CONNECT export QE_SHARED_BUFFERS export SEG_PREFIX + export ENCRYPT_METHOD + export CLUSTER_KEY_CMD if [ $DEBUG_LEVEL -eq 0 ] && [ x"" != x"$VERBOSE" ];then $NOLINE_ECHO ".\c";fi FLAG="" if [ x"" != x"$PG_CONF_ADD_FILE" ] ; then @@ -2087,7 +2098,7 @@ CHECK_DEPLOYMENT_MODE() { #****************************************************************************** CHECK_DEPLOYMENT_MODE trap 'ERROR_EXIT "[FATAL]:-Received INT or TERM signal"' INT TERM -while getopts ":vaqe:c:l:-:p:m:h:n:s:P:S:b:DB:I:O:E:F:M:U:" opt +while getopts ":vaqe:c:l:-:p:m:h:n:s:P:S:b:DB:I:O:E:F:M:U:T:C:" opt do case $opt in v ) print_version ;; @@ -2111,6 +2122,8 @@ while getopts ":vaqe:c:l:-:p:m:h:n:s:P:S:b:DB:I:O:E:F:M:U:" opt U ) USE_EXTERNAL_FTS=$OPTARG ;; E ) ETCD_HOST_CONFIG=$OPTARG ;; F ) FTS_HOST_CONFIG=$OPTARG ;; + T ) ENCRYPT_METHOD=$OPTARG ;; + C ) CLUSTER_KEY_CMD=$OPTARG ;; - ) # Long options ...
NAME=${OPTARG%%=*} VAL=${OPTARG#*=} @@ -2127,6 +2140,8 @@ while getopts ":vaqe:c:l:-:p:m:h:n:s:P:S:b:DB:I:O:E:F:M:U:" opt "lc-time" ) LCTIME=$VAL ;; "mirror-mode" ) SET_MIRROR_MODE $VAL ;; "standby-datadir" ) STANDBY_DATADIR=$VAL ;; + "encrypt-method" ) ENCRYPT_METHOD=$VAL ;; + "cluster-key-cmd" ) CLUSTER_KEY_CMD=$VAL ;; "help" ) USAGE "print_doc" ;; "version" ) print_version ;; * ) LOG_MSG "[ERROR]:-Unknown option --$NAME" 1; USAGE ;; diff --git a/gpMgmt/bin/gppylib/commands/base.py b/gpMgmt/bin/gppylib/commands/base.py index 951d24f68a9..c7dc4b8e2ca 100755 --- a/gpMgmt/bin/gppylib/commands/base.py +++ b/gpMgmt/bin/gppylib/commands/base.py @@ -509,16 +509,16 @@ def execute(self, cmd, pickled=False): cmdstr=cmd.cmdStr) LocalExecutionContext.execute(self, cmd, pickled=pickled) if (cmd.get_results().stderr.startswith('ssh_exchange_identification: Connection closed by remote host')): - self.__retry(cmd) + self.__retry(cmd, 0, pickled) pass - def __retry(self, cmd, count=0): + def __retry(self, cmd, count, pickled): if count == SSH_MAX_RETRY: return time.sleep(SSH_RETRY_DELAY) - LocalExecutionContext.execute(self, cmd, pickled=pickled) + LocalExecutionContext.execute(self, cmd, pickled) if (cmd.get_results().stderr.startswith('ssh_exchange_identification: Connection closed by remote host')): - self.__retry(cmd, count + 1) + self.__retry(cmd, count + 1, pickled) class Command(object): """ TODO: diff --git a/gpMgmt/bin/lib/gpcreateseg.sh b/gpMgmt/bin/lib/gpcreateseg.sh index e3a916b7423..115369fac56 100755 --- a/gpMgmt/bin/lib/gpcreateseg.sh +++ b/gpMgmt/bin/lib/gpcreateseg.sh @@ -119,6 +119,12 @@ CREATE_QES_PRIMARY () { cmd="$EXPORT_LIB_PATH;$INITDB" cmd="$cmd -E $ENCODING" cmd="$cmd -D $GP_DIR" + if [ x"" != x"$ENCRYPT_METHOD" ]; then + cmd="$cmd -K $ENCRYPT_METHOD" + fi + if [ x"" != x"$CLUSTER_KEY_CMD" ]; then + cmd="$cmd -c $CLUSTER_KEY_CMD" + fi cmd="$cmd --locale=$LOCALE_SETTING" cmd="$cmd $LC_ALL_SETTINGS" cmd="$cmd --max_connections=$QE_MAX_CONNECT" diff --git a/gpMgmt/doc/gpinitsystem_help b/gpMgmt/doc/gpinitsystem_help index ceedbbb3396..aa3f66383cf 100755 --- a/gpMgmt/doc/gpinitsystem_help +++ b/gpMgmt/doc/gpinitsystem_help @@ -22,6 +22,7 @@ gpinitsystem -c [--mirror-mode={group|spread}] [-a] [-q] [-l ] [-D] [-I input_configuration_file] [-O output_configuration_file] + [-T ] gpinitsystem -v @@ -108,6 +109,9 @@ OPTIONS cluster_configuration_file option or the -I input_configuration_file option to gpinitsystem. +-T +enable transparent data encryption and set the cluster file encryption method, +such as AES256 or SM4. See the example below.
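For example (a sketch with hypothetical values; the key script must print the 64-hex-character cluster key to standard output, and the command is passed down to initdb on the coordinator and every segment):

gpinitsystem -c gpinitsystem_config -T AES256 -C 'ckey_passphrase.sh'

The equivalent long options are --encrypt-method and --cluster-key-cmd.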
--locale= | -n diff --git a/gpcontrib/gp_exttable_fdw/gp_exttable_fdw.c b/gpcontrib/gp_exttable_fdw/gp_exttable_fdw.c index 1b6503f79d1..eba09d607ce 100644 --- a/gpcontrib/gp_exttable_fdw/gp_exttable_fdw.c +++ b/gpcontrib/gp_exttable_fdw/gp_exttable_fdw.c @@ -550,7 +550,7 @@ exttable_GetForeignPaths(PlannerInfo *root, NULL, /* no outer rel either */ NULL, /* no extra plan */ list_make1(externalscan_info)); - pathnode->path.locus = cdbpathlocus_from_baserel(root, baserel); + pathnode->path.locus = cdbpathlocus_from_baserel(root, baserel, 0); pathnode->path.motionHazard = false; /* diff --git a/hd-ci/compile_cbdb.bash b/hd-ci/compile_cbdb.bash index 46a1d6b7fd0..b9cbc9a9db1 100755 --- a/hd-ci/compile_cbdb.bash +++ b/hd-ci/compile_cbdb.bash @@ -16,10 +16,10 @@ function download_dependencies() { function compile_cbdb() { fts_mode=$1 export GPDB_SRC_PATH="${SRC_PATH}" - export CONFIGURE_FLAGS="--enable-cassert --enable-tap-tests --enable-debug-extensions" + export CONFIGURE_FLAGS="--enable-cassert --enable-tap-tests --enable-debug-extensions --with-openssl" if [[ ${BUILD_TYPE} == "release" ]]; then export CPPFLAGS="${CXXFLAGS} -Wno-unused-function -Wno-unused-variable" - export CONFIGURE_FLAGS="--disable-cassert --disable-tap-tests --disable-debug-extensions " + export CONFIGURE_FLAGS="--disable-cassert --disable-tap-tests --disable-debug-extensions --with-openssl" fi if [ "${fts_mode}" = "external_fts" ]; then diff --git a/hd-ci/icw_cbdb.bash b/hd-ci/icw_cbdb.bash index f9e1e32ed35..1ca7c84174b 100755 --- a/hd-ci/icw_cbdb.bash +++ b/hd-ci/icw_cbdb.bash @@ -39,7 +39,7 @@ function download_etcd() { icw_cbdb() { fts_mode=$1 # setup ENV before running this script: MAKE_TEST_COMMAND, TEST_OS, TEST_BINARY_SWAP, DUMP_DB - export CONFIGURE_FLAGS="--disable-cassert --enable-tap-tests --enable-debug-extensions" + export CONFIGURE_FLAGS="--disable-cassert --enable-tap-tests --enable-debug-extensions --with-openssl" if [ "${fts_mode}" = "external_fts" ]; then export CONFIGURE_FLAGS="${CONFIGURE_FLAGS} --enable-external-fts" fi diff --git a/src/backend/Makefile b/src/backend/Makefile index c246863b3c5..5b1b2d80230 100644 --- a/src/backend/Makefile +++ b/src/backend/Makefile @@ -21,7 +21,7 @@ SUBDIRS = access bootstrap catalog parser commands executor foreign lib libpq \ main nodes optimizer partitioning port postmaster \ regex replication rewrite \ statistics storage tcop tsearch utils $(top_builddir)/src/timezone \ - jit + jit task crypto SUBDIRS += fts cdb ifeq ($(enable_orca),yes) @@ -227,6 +227,12 @@ endif $(INSTALL_DATA) $(srcdir)/libpq/pg_hba.conf.sample '$(DESTDIR)$(datadir)/pg_hba.conf.sample' $(INSTALL_DATA) $(srcdir)/libpq/pg_ident.conf.sample '$(DESTDIR)$(datadir)/pg_ident.conf.sample' $(INSTALL_DATA) $(srcdir)/utils/misc/postgresql.conf.sample '$(DESTDIR)$(datadir)/postgresql.conf.sample' + $(INSTALL_DATA) $(srcdir)/crypto/ckey_aws.sh.sample '$(DESTDIR)$(datadir)/auth_commands/ckey_aws.sh.sample' + $(INSTALL_DATA) $(srcdir)/crypto/ckey_direct.sh.sample '$(DESTDIR)$(datadir)/auth_commands/ckey_direct.sh.sample' + $(INSTALL_DATA) $(srcdir)/crypto/ckey_passphrase.sh.sample '$(DESTDIR)$(datadir)/auth_commands/ckey_passphrase.sh.sample' + $(INSTALL_DATA) $(srcdir)/crypto/ckey_piv_nopin.sh.sample '$(DESTDIR)$(datadir)/auth_commands/ckey_piv_nopin.sh.sample' + $(INSTALL_DATA) $(srcdir)/crypto/ckey_piv_pin.sh.sample '$(DESTDIR)$(datadir)/auth_commands/ckey_piv_pin.sh.sample' + $(INSTALL_DATA) $(srcdir)/crypto/ssl_passphrase.sh.sample '$(DESTDIR)$(datadir)/auth_commands/ssl_passphrase.sh.sample' ifeq 
($(enable_orca), yes) $(MAKE) -C gporca $@ INSTLOC=$(DESTDIR)$(libdir) $(MAKE) -C gpopt $@ INSTLOC=$(DESTDIR)$(libdir) @@ -259,6 +265,7 @@ endif installdirs: $(MKDIR_P) '$(DESTDIR)$(bindir)' '$(DESTDIR)$(datadir)' + $(MKDIR_P) '$(DESTDIR)$(datadir)' '$(DESTDIR)$(datadir)/auth_commands' ifeq ($(PORTNAME), cygwin) ifeq ($(MAKE_DLL), true) $(MKDIR_P) '$(DESTDIR)$(libdir)' @@ -298,7 +305,13 @@ endif $(MAKE) -C utils uninstall-data rm -f '$(DESTDIR)$(datadir)/pg_hba.conf.sample' \ '$(DESTDIR)$(datadir)/pg_ident.conf.sample' \ - '$(DESTDIR)$(datadir)/postgresql.conf.sample' + '$(DESTDIR)$(datadir)/postgresql.conf.sample' \ + '$(DESTDIR)$(datadir)/auth_commands/ckey_aws.sh.sample' \ + '$(DESTDIR)$(datadir)/auth_commands/ckey_direct.sh.sample' \ + '$(DESTDIR)$(datadir)/auth_commands/ckey_passphrase.sh.sample' \ + '$(DESTDIR)$(datadir)/auth_commands/ckey_piv_nopin.sh.sample' \ + '$(DESTDIR)$(datadir)/auth_commands/ckey_piv_pin.sh.sample' \ + '$(DESTDIR)$(datadir)/auth_commands/ssl_passphrase.sh.sample' ifeq ($(with_llvm), yes) $(call uninstall_llvm_module,postgres) endif diff --git a/src/backend/access/aocs/aocs_compaction.c b/src/backend/access/aocs/aocs_compaction.c index 2c9fe8f8541..bb801acfeb2 100644 --- a/src/backend/access/aocs/aocs_compaction.c +++ b/src/backend/access/aocs/aocs_compaction.c @@ -402,7 +402,7 @@ AOCSCompact(Relation aorel, appendOnlyMetaDataSnapshot); insertDesc->skipModCountIncrement = true; - aocs_insert_finish(insertDesc); + aocs_insert_finish(insertDesc, NULL); } else { diff --git a/src/backend/access/aocs/aocsam.c b/src/backend/access/aocs/aocsam.c index d4f19e4482e..124ce44b4d3 100644 --- a/src/backend/access/aocs/aocsam.c +++ b/src/backend/access/aocs/aocsam.c @@ -56,12 +56,14 @@ static AOCSScanDesc aocs_beginscan_internal(Relation relation, int total_seg, Snapshot snapshot, Snapshot appendOnlyMetaDataSnapshot, + ParallelTableScanDesc parallel_scan, bool *proj, uint32 flags); static void reorder_qual_col(AOCSScanDesc scan); static bool aocs_col_predicate_test(AOCSScanDesc scan, TupleTableSlot *slot, int i, bool sample_phase); static bool aocs_getnext_sample(AOCSScanDesc scan, ScanDirection direction, TupleTableSlot *slot); +static void aocs_insert_finish_guts(AOCSInsertDesc aoInsertDesc); /* Hook for plugins to get control in aocs_delete() */ aocs_delete_hook_type aocs_delete_hook = NULL; @@ -131,6 +133,10 @@ open_ds_write(Relation rel, DatumStreamWrite **ds, TupleDesc relationTupleDesc, { int nvp = relationTupleDesc->natts; StdRdOptions **opts = RelationGetAttributeOptions(rel); + RelFileNodeBackend rnode; + + rnode.node = rel->rd_node; + rnode.backend = rel->rd_backend; /* open datum streams. 
It will open segment file underneath */ for (int i = 0; i < nvp; ++i) @@ -171,7 +177,8 @@ open_ds_write(Relation rel, DatumStreamWrite **ds, TupleDesc relationTupleDesc, attr, RelationGetRelationName(rel), /* title */ titleBuf.data, - XLogIsNeeded() && RelationNeedsWAL(rel)); + XLogIsNeeded() && RelationNeedsWAL(rel), + &rnode); } } @@ -261,7 +268,8 @@ open_ds_read(Relation rel, DatumStreamRead **ds, TupleDesc relationTupleDesc, blksz, attr, RelationGetRelationName(rel), - /* title */ titleBuf.data); + /* title */ titleBuf.data, + &rel->rd_node); } } @@ -336,8 +344,30 @@ initscan_with_colinfo(AOCSScanDesc scan) static int open_next_scan_seg(AOCSScanDesc scan) { - while (++scan->cur_seg < scan->total_seg) + bool isParallel = false; + ParallelBlockTableScanDesc pbscan = NULL; + + if (scan->rs_base.rs_parallel != NULL) + { + isParallel = true; + pbscan = (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel; + } + + while (scan->cur_seg < scan->total_seg) { + if (isParallel) + { + scan->cur_seg = pg_atomic_fetch_add_u64(&pbscan->phs_nallocated, 1); + if (scan->cur_seg >= pbscan->phs_nblocks) + break; + } + else + { + scan->cur_seg = scan->cur_seg + 1; + if (scan->cur_seg >= scan->total_seg) + break; + } + AOCSFileSegInfo *curSegInfo = scan->seginfo[scan->cur_seg]; if (curSegInfo->total_tupcount > 0) @@ -485,12 +515,14 @@ aocs_beginrangescan(Relation relation, snapshot, appendOnlyMetaDataSnapshot, NULL, + NULL, 0); } AOCSScanDesc aocs_beginscan(Relation relation, Snapshot snapshot, + ParallelTableScanDesc pscan, bool *proj, uint32 flags) { @@ -515,6 +547,7 @@ aocs_beginscan(Relation relation, total_seg, snapshot, aocsMetaDataSnapshot, + pscan, proj, flags); } @@ -528,6 +561,7 @@ aocs_beginscan_internal(Relation relation, int total_seg, Snapshot snapshot, Snapshot appendOnlyMetaDataSnapshot, + ParallelTableScanDesc parallel_scan, bool *proj, uint32 flags) { @@ -540,6 +574,7 @@ aocs_beginscan_internal(Relation relation, scan->rs_base.rs_rd = relation; scan->rs_base.rs_snapshot = snapshot; scan->rs_base.rs_flags = flags; + scan->rs_base.rs_parallel = parallel_scan; scan->appendOnlyMetaDataSnapshot = appendOnlyMetaDataSnapshot; scan->seginfo = seginfo; scan->total_seg = total_seg; @@ -644,6 +679,10 @@ aocs_endscan(AOCSScanDesc scan) if (scan->total_seg != 0) AppendOnlyVisimap_Finish(&scan->visibilityMap, AccessShareLock); + /* GPDB should backport this to upstream */ + if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT) + UnregisterSnapshot(scan->rs_base.rs_snapshot); + RelationDecrementReferenceCount(scan->rs_base.rs_rd); pfree(scan); @@ -978,6 +1017,7 @@ aocs_insert_init(Relation rel, int segno) Assert(segno >= 0); desc->cur_segno = segno; + desc->range = 0; GetAppendOnlyEntryAttributes(rel->rd_id, &desc->blocksz, @@ -1027,6 +1067,10 @@ aocs_insert_init(Relation rel, int segno) (FileSegInfo *) desc->fsInfo, desc->lastSequence, rel, segno, tupleDesc->natts, true); + /* should not enable insertMultiFiles if the table is created by its own transaction */ + desc->insertMultiFiles = enable_parallel && + gp_appendonly_insert_files > 1 && + !ShouldUseReservedSegno(rel, CHOOSE_MODE_WRITE); return desc; } @@ -1106,6 +1150,7 @@ aocs_insert_values(AOCSInsertDesc idesc, Datum *d, bool *null, AOTupleId *aoTupl idesc->insertCount++; idesc->lastSequence++; + idesc->range++; if (idesc->numSequences > 0) (idesc->numSequences)--; @@ -1127,13 +1172,14 @@ aocs_insert_values(AOCSInsertDesc idesc, Datum *d, bool *null, AOTupleId *aoTupl idesc->lastSequence + 1, NUM_FAST_SEQUENCES); - Assert(firstSequence ==
idesc->lastSequence + 1); + /* fast sequence could be inconsecutive when inserting into multiple segfiles */ + AssertImply(gp_appendonly_insert_files <= 1, firstSequence == idesc->lastSequence + 1); idesc->numSequences = NUM_FAST_SEQUENCES; } } -void -aocs_insert_finish(AOCSInsertDesc idesc) +static void +aocs_insert_finish_guts(AOCSInsertDesc idesc) { Relation rel = idesc->aoi_rel; int i; @@ -1155,6 +1201,42 @@ aocs_insert_finish(AOCSInsertDesc idesc) close_ds_write(idesc->ds, rel->rd_att->natts); } +/* + * aocs_insert_finish + * + * Use head to traverse the insert descriptors for multiple segment files; + * head is NULL if there is only one segment file. + * Keep the idesc parameter to minimize changes. + */ +void +aocs_insert_finish(AOCSInsertDesc idesc, dlist_head *head) +{ + AOCSInsertDesc next = NULL; + dlist_iter iter; + + /* no multiple-segfile insertion */ + if(!head) + { + aocs_insert_finish_guts(idesc); + pfree(idesc); + return; + } + + Assert(!dlist_is_empty(head)); + + dlist_foreach(iter, head) + { + if(next) + pfree(next); + + next = (AOCSInsertDesc)dlist_container(AOCSInsertDescData, node, iter.cur); + aocs_insert_finish_guts(next); + } + + if(next) + pfree(next); +} + static void positionFirstBlockOfRange(DatumStreamFetchDesc datumStreamFetchDesc) { @@ -1464,7 +1546,8 @@ aocs_fetch_init(Relation relation, blksz, TupleDescAttr(tupleDesc, colno), relation->rd_rel->relname.data, - /* title */ titleBuf.data); + /* title */ titleBuf.data, + &relation->rd_node); } if (opts[colno]) @@ -1775,7 +1858,7 @@ aocs_update_finish(AOCSUpdateDesc desc) AppendOnlyVisimapDelete_Finish(&desc->visiMapDelete); - aocs_insert_finish(desc->insertDesc); + aocs_insert_finish(desc->insertDesc, NULL); desc->insertDesc = NULL; /* Keep lock until the end of transaction */ @@ -1958,7 +2041,8 @@ aocs_begin_headerscan(Relation rel, int colno) opts[colno]->blocksize, RelationGetRelationName(rel), "ALTER TABLE ADD COLUMN scan", - &ao_attr); + &ao_attr, + &rel->rd_node); hdesc->colno = colno; return hdesc; } @@ -2017,6 +2101,10 @@ aocs_addcol_init(Relation rel, int iattr; StringInfoData titleBuf; bool checksum; + RelFileNodeBackend rnode; + + rnode.node = rel->rd_node; + rnode.backend = rel->rd_backend; desc = palloc(sizeof(AOCSAddColumnDescData)); desc->num_newcols = num_newcols; @@ -2053,7 +2141,8 @@ aocs_addcol_init(Relation rel, desc->dsw[i] = create_datumstreamwrite(ct, clvl, checksum, 0, blksz /* safeFSWriteSize */ , attr, RelationGetRelationName(rel), titleBuf.data, - XLogIsNeeded() && RelationNeedsWAL(rel)); + XLogIsNeeded() && RelationNeedsWAL(rel), + &rnode); } return desc; } diff --git a/src/backend/access/aocs/aocsam_handler.c b/src/backend/access/aocs/aocsam_handler.c index 32870f49596..2d389f8c9d3 100644 --- a/src/backend/access/aocs/aocsam_handler.c +++ b/src/backend/access/aocs/aocsam_handler.c @@ -42,6 +42,7 @@ #include "utils/faultinjector.h" #include "utils/lsyscache.h" #include "utils/pg_rusage.h" +#include "utils/guc.h" #define IS_BTREE(r) ((r)->rd_rel->relam == BTREE_AM_OID) @@ -87,6 +88,7 @@ typedef struct AOCODMLState { Oid relationOid; AOCSInsertDesc insertDesc; + dlist_head head; /* Head of the multi-segment-file insertion list */
 AOCSDeleteDesc deleteDesc;
 } AOCODMLState;

@@ -187,6 +189,7 @@ enter_dml_state(const Oid relationOid)

 	state->insertDesc = NULL;
 	state->deleteDesc = NULL;
+	dlist_init(&state->head);

 	Assert(!found);

@@ -291,7 +294,7 @@ aoco_dml_finish(Relation relation, CmdType operation)
 	if (state->insertDesc)
 	{
 		Assert(state->insertDesc->aoi_rel == relation);
-		aocs_insert_finish(state->insertDesc);
+		aocs_insert_finish(state->insertDesc, &state->head);
 		state->insertDesc = NULL;
 	}

@@ -304,19 +307,49 @@ static AOCSInsertDesc
 get_insert_descriptor(const Relation relation)
 {
 	AOCODMLState *state;
+	AOCSInsertDesc next = NULL;

 	state = find_dml_state(RelationGetRelid(relation));

 	if (state->insertDesc == NULL)
 	{
+		List *segments = NIL;
 		MemoryContext oldcxt;

 		oldcxt = MemoryContextSwitchTo(aocoLocal.stateCxt);
 		state->insertDesc = aocs_insert_init(relation,
 											 ChooseSegnoForWrite(relation));
+		dlist_init(&state->head);
+		dlist_head *head = &state->head;
+		dlist_push_tail(head, &state->insertDesc->node);
+
+		if (state->insertDesc->insertMultiFiles)
+		{
+			segments = lappend_int(segments, state->insertDesc->cur_segno);
+			for (int i = 0; i < gp_appendonly_insert_files - 1; i++)
+			{
+				next = aocs_insert_init(relation,
+										ChooseSegnoForWriteMultiFile(relation, segments));
+				dlist_push_tail(head, &next->node);
+				segments = lappend_int(segments, next->cur_segno);
+			}
+			list_free(segments);
+		}
 		MemoryContextSwitchTo(oldcxt);
 	}

+	/* Rotate to the next insert descriptor once 'range' tuples have gone to this one. */
+	if (state->insertDesc->insertMultiFiles &&
+		state->insertDesc->range == gp_appendonly_insert_files_tuples_range)
+	{
+		state->insertDesc->range = 0;
+		if (!dlist_has_next(&state->head, &state->insertDesc->node))
+			next = (AOCSInsertDesc) dlist_container(AOCSInsertDescData, node, dlist_head_node(&state->head));
+		else
+			next = (AOCSInsertDesc) dlist_container(AOCSInsertDescData, node, dlist_next_node(&state->head, &state->insertDesc->node));
+
+		state->insertDesc = next;
+	}
+
 	return state->insertDesc;
 }

@@ -427,7 +460,7 @@ extractcolumns_from_node(Node *expr, bool *cols, AttrNumber natts)
 }

 static TableScanDesc
-aoco_beginscan_extractcolumns(Relation rel, Snapshot snapshot,
+aoco_beginscan_extractcolumns(Relation rel, Snapshot snapshot, ParallelTableScanDesc parallel_scan,
 							  List *targetlist, List *qual,
 							  uint32 flags)
 {
@@ -451,6 +484,7 @@ aoco_beginscan_extractcolumns(Relation rel, Snapshot snapshot,

 	aoscan = aocs_beginscan(rel,
 							snapshot,
+							parallel_scan,
 							cols,
 							flags);

@@ -524,11 +558,9 @@ aoco_beginscan(Relation relation,
 {
 	AOCSScanDesc	aoscan;

-	/* Parallel scan not supported for AO_COLUMN tables */
-	Assert(pscan == NULL);
-
 	aoscan = aocs_beginscan(relation,
 							snapshot,
+							pscan,
 							NULL,
 							flags);

@@ -591,19 +623,32 @@ aoco_getnextslot(TableScanDesc scan, ScanDirection direction, TupleTableSlot *sl
 static Size
 aoco_parallelscan_estimate(Relation rel)
 {
-	elog(ERROR, "parallel SeqScan not implemented for AO_COLUMN tables");
+	return sizeof(ParallelBlockTableScanDescData);
 }

+/*
+ * AOCO uses only some of the fields of ParallelBlockTableScanDesc.
+ */
 static Size
 aoco_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan)
 {
-	elog(ERROR, "parallel SeqScan not implemented for AO_COLUMN tables");
+	ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;
+
+	bpscan->base.phs_relid = RelationGetRelid(rel);
+	bpscan->phs_nblocks = 0;	/* initialized here; filled in later by table_parallelscan_initialize() */
+	pg_atomic_init_u64(&bpscan->phs_nallocated, 0);
+	/* we don't need phs_mutex and phs_startblock for AO, but initialize them anyway.
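+	 * AO parallel scans use only phs_nallocated, as an atomic cursor over the
+	 * segfile array; each backend claims the next segfile with, e.g.,
+	 *     segidx = pg_atomic_fetch_add_u64(&bpscan->phs_nallocated, 1);
+	 * so the block-oriented mutex/startblock machinery is never consulted.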
*/ + SpinLockInit(&bpscan->phs_mutex); + bpscan->phs_startblock = InvalidBlockNumber; + return sizeof(ParallelBlockTableScanDescData); } static void aoco_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan) { - elog(ERROR, "parallel SeqScan not implemented for AO_COLUMN tables"); + ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan; + + pg_atomic_write_u64(&bpscan->phs_nallocated, 0); } static IndexFetchTableData * @@ -1180,6 +1225,7 @@ aoco_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, slot = table_slot_create(OldHeap, NULL); scan = aocs_beginscan(OldHeap, GetActiveSnapshot(), + NULL /* parallel_scan */, NULL /* proj */, 0 /* flags */); @@ -1234,7 +1280,7 @@ aoco_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, tuplesort_end(tuplesort); /* Finish and deallocate insertion */ - aocs_insert_finish(idesc); + aocs_insert_finish(idesc, NULL); } static bool diff --git a/src/backend/access/aocs/test/aocsam_test.c b/src/backend/access/aocs/test/aocsam_test.c index 2d1bc8f5d78..b3c38cd8b77 100644 --- a/src/backend/access/aocs/test/aocsam_test.c +++ b/src/backend/access/aocs/test/aocsam_test.c @@ -96,6 +96,8 @@ test__aocs_addcol_init(void **state) expect_value(create_datumstreamwrite, maxsz, 8192 * 2); expect_value(create_datumstreamwrite, needsWAL, true); expect_value(create_datumstreamwrite, needsWAL, true); + expect_any(create_datumstreamwrite, rnode); + expect_any(create_datumstreamwrite, rnode); expect_any_count(create_datumstreamwrite, attr, 2); expect_any_count(create_datumstreamwrite, relname, 2); expect_any_count(create_datumstreamwrite, title, 2); diff --git a/src/backend/access/appendonly/aomd.c b/src/backend/access/appendonly/aomd.c index a47b75d3dc4..c147562f7f9 100644 --- a/src/backend/access/appendonly/aomd.c +++ b/src/backend/access/appendonly/aomd.c @@ -34,6 +34,7 @@ #include "catalog/pg_appendonly.h" #include "cdb/cdbappendonlystorage.h" #include "cdb/cdbappendonlyxlog.h" +#include "crypto/bufenc.h" #include "common/relpath.h" #include "pgstat.h" #include "storage/sync.h" @@ -417,7 +418,6 @@ copy_file(char *srcsegpath, char *dstsegpath, (errcode_for_file_access(), errmsg("could not write %d bytes to file \"%s\": %m", len, dstsegpath))); - if (use_wal) xlog_ao_insert(dst, segfilenum, offset, buffer, len); @@ -438,6 +438,7 @@ copy_file(char *srcsegpath, char *dstsegpath, struct copy_append_only_data_callback_ctx { char *srcPath; char *dstPath; + RelFileNode src; RelFileNode dst; bool useWal; }; @@ -467,6 +468,7 @@ copy_append_only_data(RelFileNode src, RelFileNode dst, copyFiles.srcPath = srcPath; copyFiles.dstPath = dstPath; + copyFiles.src = src; copyFiles.dst = dst; copyFiles.useWal = useWal; diff --git a/src/backend/access/appendonly/appendonly_compaction.c b/src/backend/access/appendonly/appendonly_compaction.c index a7a4c427250..7aeb7b0382c 100644 --- a/src/backend/access/appendonly/appendonly_compaction.c +++ b/src/backend/access/appendonly/appendonly_compaction.c @@ -798,7 +798,7 @@ AppendOnlyCompact(Relation aorel, appendOnlyMetaDataSnapshot); insertDesc->skipModCountIncrement = true; - appendonly_insert_finish(insertDesc); + appendonly_insert_finish(insertDesc, NULL); } else { diff --git a/src/backend/access/appendonly/appendonlyam.c b/src/backend/access/appendonly/appendonlyam.c index ca63e00db2a..b5993e30f4f 100755 --- a/src/backend/access/appendonly/appendonlyam.c +++ b/src/backend/access/appendonly/appendonlyam.c @@ -45,6 +45,7 @@ #include "access/heaptoast.h" #include "access/valid.h" #include 
"access/xact.h" +#include "access/relscan.h" #include "catalog/catalog.h" #include "catalog/gp_fastsequence.h" #include "catalog/namespace.h" @@ -55,6 +56,7 @@ #include "cdb/cdbappendonlystorageformat.h" #include "cdb/cdbappendonlystoragelayer.h" #include "cdb/cdbvars.h" +#include "crypto/bufenc.h" #include "executor/executor.h" #include "fmgr.h" #include "miscadmin.h" @@ -69,7 +71,7 @@ /* * AppendOnlyDeleteDescData is used for delete data from append-only * relations. It serves an equivalent purpose as AppendOnlyScanDescData - * (relscan.h) only that the later is used for scanning append-only + * (cdbappendonlyam.h) only that the later is used for scanning append-only * relations. */ typedef struct AppendOnlyDeleteDescData @@ -99,7 +101,7 @@ typedef struct AppendOnlyDeleteDescData /* * AppendOnlyUpdateDescData is used to update data from append-only * relations. It serves an equivalent purpose as AppendOnlyScanDescData - * (relscan.h) only that the later is used for scanning append-only + * (cdbappendonlyam.h) only that the later is used for scanning append-only * relations. */ typedef struct AppendOnlyUpdateDescData @@ -146,6 +148,7 @@ static void AppendOnlyExecutorReadBlock_ResetCounts( extern void finishWriteBlock(AppendOnlyInsertDesc aoInsertDesc); extern void setupNextWriteBlock(AppendOnlyInsertDesc aoInsertDesc); +static void appendonly_insert_finish_guts(AppendOnlyInsertDesc aoInsertDesc); /* Hook for plugins to get control in appendonly_delete() */ appendonly_delete_hook_type appendonly_delete_hook = NULL; @@ -190,6 +193,15 @@ SetNextFileSegForRead(AppendOnlyScanDesc scan) int formatversion = -2; /* some invalid value */ bool finished_all_files = true; /* assume */ int32 fileSegNo; + bool isParallel = false; + ParallelBlockTableScanDesc pbscan = NULL; + + if (scan->rs_base.rs_parallel != NULL) + { + isParallel = true; + pbscan = (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel; + } + Assert(scan->aos_need_new_segfile); /* only call me when last segfile * completed */ @@ -207,7 +219,8 @@ SetNextFileSegForRead(AppendOnlyScanDesc scan) scan->usableBlockSize, NameStr(scan->aos_rd->rd_rel->relname), scan->title, - &scan->storageAttributes); + &scan->storageAttributes, + &scan->aos_rd->rd_node); /* * There is no guarantee that the current memory context will be @@ -261,16 +274,27 @@ SetNextFileSegForRead(AppendOnlyScanDesc scan) /* * Do we have more segment files to read or are we done? */ + int idx; /* fetch segfile idx */ while (scan->aos_segfiles_processed < scan->aos_total_segfiles) { - /* still have more segment files to read. get info of the next one */ - FileSegInfo *fsinfo = scan->aos_segfile_arr[scan->aos_segfiles_processed]; + if (isParallel) + { + idx = pg_atomic_fetch_add_u64(&pbscan->phs_nallocated, 1); + if (idx >= pbscan->phs_nblocks) + break; + } + else + { + idx = scan->aos_segfiles_processed; + } + + FileSegInfo *fsinfo = scan->aos_segfile_arr[idx]; segno = fsinfo->segno; formatversion = fsinfo->formatversion; eof = (int64) fsinfo->eof; - scan->aos_segfiles_processed++; + scan->aos_segfiles_processed = idx + 1; /* * If the 'eof' is zero or it's just a lingering dropped segment @@ -641,6 +665,16 @@ AppendOnlyExecutorReadBlock_GetContents(AppendOnlyExecutorReadBlock *executorRea switch (executorReadBlock->executorBlockKind) { case AoExecutorBlockKind_VarBlock: + + /* + * Now use the VarBlock module to extract the items out. 
+ */ + VarBlockReaderInit(&executorReadBlock->varBlockReader, + executorReadBlock->dataBuffer, + executorReadBlock->dataLen, + true /* need decrypt */, + &executorReadBlock->storageRead->relFileNode); + varBlockCheckError = VarBlockIsValid(executorReadBlock->dataBuffer, executorReadBlock->dataLen); if (varBlockCheckError != VarBlockCheckOk) ereport(ERROR, @@ -651,13 +685,6 @@ AppendOnlyExecutorReadBlock_GetContents(AppendOnlyExecutorReadBlock *executorRea errdetail_appendonly_read_storage_content_header(executorReadBlock->storageRead), errcontext_appendonly_read_storage_block(executorReadBlock->storageRead))); - /* - * Now use the VarBlock module to extract the items out. - */ - VarBlockReaderInit(&executorReadBlock->varBlockReader, - executorReadBlock->dataBuffer, - executorReadBlock->dataLen); - executorReadBlock->readerItemCount = VarBlockReaderItemCount(&executorReadBlock->varBlockReader); executorReadBlock->currentItemCount = 0; @@ -693,6 +720,11 @@ AppendOnlyExecutorReadBlock_GetContents(AppendOnlyExecutorReadBlock *executorRea executorReadBlock->singleRow = executorReadBlock->dataBuffer; executorReadBlock->singleRowLen = executorReadBlock->dataLen; + if (FileEncryptionEnabled) + DecryptAOBlock(executorReadBlock->singleRow, + executorReadBlock->singleRowLen, + &executorReadBlock->storageRead->relFileNode); + elogif(Debug_appendonly_print_scan, LOG, "Append-only scan read single row for table '%s' with length %d (block offset in file = " INT64_FORMAT ")", AppendOnlyStorageRead_RelationName(executorReadBlock->storageRead), executorReadBlock->singleRowLen, @@ -1409,7 +1441,8 @@ setupNextWriteBlock(AppendOnlyInsertDesc aoInsertDesc) aoInsertDesc->nonCompressedData, aoInsertDesc->maxDataLen, aoInsertDesc->tempSpace, - aoInsertDesc->tempSpaceLen); + aoInsertDesc->tempSpaceLen, + &aoInsertDesc->storageWrite); aoInsertDesc->bufferCount++; } @@ -1434,13 +1467,15 @@ finishWriteBlock(AppendOnlyInsertDesc aoInsertDesc) return; } - dataLen = VarBlockMakerFinish(&aoInsertDesc->varBlockMaker); + dataLen = VarBlockMakerFinish(&aoInsertDesc->varBlockMaker, + &aoInsertDesc->storageWrite); aoInsertDesc->varblockCount++; if (itemCount == 1) { dataLen = VarBlockCollapseToSingleItem( + /* storageWrite */ &aoInsertDesc->storageWrite, /* target */ aoInsertDesc->nonCompressedData, /* source */ aoInsertDesc->nonCompressedData, /* sourceLen */ dataLen); @@ -1776,6 +1811,10 @@ appendonly_endscan(TableScanDesc scan) aoscan->aofetch = NULL; } + /* GPDB should backport this to upstream */ + if (aoscan->rs_base.rs_flags & SO_TEMP_SNAPSHOT) + UnregisterSnapshot(aoscan->rs_base.rs_snapshot); + pfree(aoscan->aos_filenamepath); pfree(aoscan->title); @@ -2197,7 +2236,8 @@ appendonly_fetch_init(Relation relation, aoFetchDesc->usableBlockSize, NameStr(aoFetchDesc->relation->rd_rel->relname), aoFetchDesc->title, - &aoFetchDesc->storageAttributes); + &aoFetchDesc->storageAttributes, + &relation->rd_node); fns = get_funcs_for_compression(NameStr(aoFormData.compresstype)); @@ -2623,6 +2663,7 @@ appendonly_insert_init(Relation rel, int segno) aoInsertDesc = (AppendOnlyInsertDesc) palloc0(sizeof(AppendOnlyInsertDescData)); aoInsertDesc->aoi_rel = rel; + aoInsertDesc->range = 0; /* * We want to see an up-to-date view of the metadata. 
The target segment's @@ -2810,6 +2851,10 @@ aoInsertDesc->appendOnlyMetaDataSnapshot, //CONCERN:Safe to assume all block dir aoInsertDesc->fsInfo, aoInsertDesc->lastSequence, rel, segno, 1, false); + /* should not enable insertMultiFiles if the table is created by own transaction */ + aoInsertDesc->insertMultiFiles = enable_parallel && + gp_appendonly_insert_files > 1 && + !ShouldUseReservedSegno(rel, CHOOSE_MODE_WRITE); return aoInsertDesc; } @@ -3008,6 +3053,7 @@ appendonly_insert(AppendOnlyInsertDesc aoInsertDesc, aoInsertDesc->insertCount++; aoInsertDesc->lastSequence++; + aoInsertDesc->range++; if (aoInsertDesc->numSequences > 0) (aoInsertDesc->numSequences)--; @@ -3033,7 +3079,8 @@ appendonly_insert(AppendOnlyInsertDesc aoInsertDesc, aoInsertDesc->lastSequence + 1, NUM_FAST_SEQUENCES); - Assert(firstSequence == aoInsertDesc->lastSequence + 1); + /* fast sequence could be inconsecutive when insert multiple segfiles */ + AssertImply(gp_appendonly_insert_files <= 1, firstSequence == aoInsertDesc->lastSequence + 1); aoInsertDesc->numSequences = NUM_FAST_SEQUENCES; } @@ -3055,9 +3102,42 @@ appendonly_insert(AppendOnlyInsertDesc aoInsertDesc, * * when done inserting all the data via appendonly_insert() we need to call * this function to flush all remaining data in the buffer into the file. + * + * Use head to traverse multiple segment files of insertion, NULL if there is + * only one segment file. + * Keep param aoInsertDesc for less changes. */ void -appendonly_insert_finish(AppendOnlyInsertDesc aoInsertDesc) +appendonly_insert_finish(AppendOnlyInsertDesc aoInsertDesc, dlist_head *head) +{ + AppendOnlyInsertDesc next = NULL; + dlist_iter iter; + + /* no mutiple segfiles insertion */ + if(!head) + { + appendonly_insert_finish_guts(aoInsertDesc); + pfree(aoInsertDesc); + return; + } + + Assert(!dlist_is_empty(head)); + + dlist_foreach(iter, head) + { + if(next) + pfree(next); + + next = (AppendOnlyInsertDesc)dlist_container(AppendOnlyInsertDescData, node, iter.cur); + appendonly_insert_finish_guts(next); + } + + if(next) + pfree(next); +} + +static void +appendonly_insert_finish_guts(AppendOnlyInsertDesc aoInsertDesc) { /* * Finish up that last varblock. @@ -3075,7 +3155,6 @@ appendonly_insert_finish(AppendOnlyInsertDesc aoInsertDesc) destroy_memtuple_binding(aoInsertDesc->mt_bind); pfree(aoInsertDesc->title); - pfree(aoInsertDesc); } ExprState * diff --git a/src/backend/access/appendonly/appendonlyam_handler.c b/src/backend/access/appendonly/appendonlyam_handler.c index 52bc1a418f3..bce0a00db2e 100644 --- a/src/backend/access/appendonly/appendonlyam_handler.c +++ b/src/backend/access/appendonly/appendonlyam_handler.c @@ -43,6 +43,7 @@ #include "utils/faultinjector.h" #include "utils/lsyscache.h" #include "utils/pg_rusage.h" +#include "utils/guc.h" #define IS_BTREE(r) ((r)->rd_rel->relam == BTREE_AM_OID) @@ -58,6 +59,7 @@ typedef struct AppendOnlyDMLState { Oid relationOid; AppendOnlyInsertDesc insertDesc; + dlist_head head; // Head of multiple segment files insertion list. 
 AppendOnlyDeleteDesc deleteDesc;
 } AppendOnlyDMLState;

@@ -158,6 +160,7 @@ enter_dml_state(const Oid relationOid)

 	state->insertDesc = NULL;
 	state->deleteDesc = NULL;
+	dlist_init(&state->head);

 	Assert(!found);

@@ -262,7 +265,7 @@ appendonly_dml_finish(Relation relation, CmdType operation)
 	if (state->insertDesc)
 	{
 		Assert(state->insertDesc->aoi_rel == relation);
-		appendonly_insert_finish(state->insertDesc);
+		appendonly_insert_finish(state->insertDesc, &state->head);
 		state->insertDesc = NULL;
 	}
 }

@@ -289,19 +292,49 @@ static AppendOnlyInsertDesc
 get_insert_descriptor(const Relation relation)
 {
 	AppendOnlyDMLState *state;
+	AppendOnlyInsertDesc next = NULL;

 	state = find_dml_state(RelationGetRelid(relation));

 	if (state->insertDesc == NULL)
 	{
+		List *segments = NIL;
 		MemoryContext oldcxt;

 		oldcxt = MemoryContextSwitchTo(appendOnlyLocal.stateCxt);
 		state->insertDesc = appendonly_insert_init(relation,
 												   ChooseSegnoForWrite(relation));
+
+		dlist_init(&state->head);
+		dlist_head *head = &state->head;
+		dlist_push_tail(head, &state->insertDesc->node);
+		if (state->insertDesc->insertMultiFiles)
+		{
+			segments = lappend_int(segments, state->insertDesc->cur_segno);
+			for (int i = 0; i < gp_appendonly_insert_files - 1; i++)
+			{
+				next = appendonly_insert_init(relation,
+											  ChooseSegnoForWriteMultiFile(relation, segments));
+				dlist_push_tail(head, &next->node);
+				segments = lappend_int(segments, next->cur_segno);
+			}
+			list_free(segments);
+		}
 		MemoryContextSwitchTo(oldcxt);
 	}

+	/* Rotate to the next insert descriptor once 'range' tuples have gone to this one. */
+	if (state->insertDesc->insertMultiFiles &&
+		state->insertDesc->range == gp_appendonly_insert_files_tuples_range)
+	{
+		state->insertDesc->range = 0;
+		if (!dlist_has_next(&state->head, &state->insertDesc->node))
+			next = (AppendOnlyInsertDesc) dlist_container(AppendOnlyInsertDescData, node, dlist_head_node(&state->head));
+		else
+			next = (AppendOnlyInsertDesc) dlist_container(AppendOnlyInsertDescData, node, dlist_next_node(&state->head, &state->insertDesc->node));
+
+		state->insertDesc = next;
+	}
+
 	return state->insertDesc;
 }

@@ -397,19 +430,31 @@ appendonly_free_memtuple(MemTuple tuple)
 static Size
 appendonly_parallelscan_estimate(Relation rel)
 {
-	elog(ERROR, "parallel SeqScan not implemented for AO_ROW tables");
+	return sizeof(ParallelBlockTableScanDescData);
 }

+/*
+ * AO_ROW uses only some of the fields of ParallelBlockTableScanDesc.
+ */
 static Size
 appendonly_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan)
 {
-	elog(ERROR, "parallel SeqScan not implemented for AO_ROW tables");
+	ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;
+	bpscan->base.phs_relid = RelationGetRelid(rel);
+	bpscan->phs_nblocks = 0;	/* initialized here; filled in later by table_parallelscan_initialize() */
+	pg_atomic_init_u64(&bpscan->phs_nallocated, 0);
+	/* we don't need phs_mutex and phs_startblock for AO, but initialize them anyway. */
+	SpinLockInit(&bpscan->phs_mutex);
+	bpscan->phs_startblock = InvalidBlockNumber;
+	return sizeof(ParallelBlockTableScanDescData);
 }

 static void
 appendonly_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
 {
-	elog(ERROR, "parallel SeqScan not implemented for AO_ROW tables");
+	ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;
+
+	pg_atomic_write_u64(&bpscan->phs_nallocated, 0);
 }

 /* ------------------------------------------------------------------------
@@ -763,6 +808,7 @@ appendonly_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slo
 	MemTuple	mtuple;
 	TM_Result	result;

+	/* should we disable multi-file insertion for UPDATE?
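+	 * For now, UPDATE takes whatever descriptor get_insert_descriptor()
+	 * returns, the same round-robin as plain INSERT.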
*/ insertDesc = get_insert_descriptor(relation); deleteDesc = get_delete_descriptor(relation, true); @@ -1143,7 +1189,7 @@ appendonly_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, tuplesort_end(tuplesort); /* Finish and deallocate insertion */ - appendonly_insert_finish(aoInsertDesc); + appendonly_insert_finish(aoInsertDesc, NULL); } static bool diff --git a/src/backend/access/appendonly/appendonlywriter.c b/src/backend/access/appendonly/appendonlywriter.c index 64434fd8932..9405c3b1c6c 100644 --- a/src/backend/access/appendonly/appendonlywriter.c +++ b/src/backend/access/appendonly/appendonlywriter.c @@ -45,29 +45,6 @@ #define SEGFILE_CAPACITY_THRESHOLD 0.9 - -/* - * Modes of operation for the choose_segno_internal() function. - */ -typedef enum -{ - /* - * Normal mode; select a segment to insert to, for INSERT or COPY. - */ - CHOOSE_MODE_WRITE, - - /* - * Select a segment to insert surviving rows to, when compacting - * another segfile in VACUUM. - */ - CHOOSE_MODE_COMPACTION_WRITE, - - /* - * Select next segment to compact. - */ - CHOOSE_MODE_COMPACTION_TARGET -} choose_segno_mode; - /* * local functions */ @@ -294,6 +271,26 @@ ChooseSegnoForWrite(Relation rel) return chosen_segno; } +int +ChooseSegnoForWriteMultiFile(Relation rel, List *avoid_segnos) +{ + int chosen_segno; + + if (Debug_appendonly_print_segfile_choice) + ereport(LOG, + (errmsg("ChooseSegnoForWrite: Choosing a segfile for relation \"%s\"", + RelationGetRelationName(rel)))); + + chosen_segno = choose_segno_internal(rel, avoid_segnos, CHOOSE_MODE_WRITE); + + if (chosen_segno == -1) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + (errmsg("could not find segment file to use for inserting into relation \"%s\"", + RelationGetRelationName(rel))))); + return chosen_segno; +} + /* * Select a segfile to write surviving tuples to, when doing VACUUM compaction. */ @@ -329,7 +326,7 @@ ChooseSegnoForCompaction(Relation rel, List *avoid_segnos) * same transaction that created the table. See * InsertInitialFastSequenceEntries for more details. */ -static bool +bool ShouldUseReservedSegno(Relation rel, choose_segno_mode mode) { Relation pg_class; diff --git a/src/backend/access/bitmap/bitmap.c b/src/backend/access/bitmap/bitmap.c index 0d098c72c8b..30d0dd49fa7 100644 --- a/src/backend/access/bitmap/bitmap.c +++ b/src/backend/access/bitmap/bitmap.c @@ -83,6 +83,7 @@ bmhandler(PG_FUNCTION_ARGS) amroutine->amcanbackward = false; amroutine->amcanunique = true; amroutine->amcanmulticol = true; + amroutine->amcanparallel = false; amroutine->amoptionalkey = true; amroutine->amsearcharray = false; amroutine->amsearchnulls = false; @@ -796,6 +797,7 @@ copy_scan_desc(IndexScanDesc scan) s->opaque = palloc(sizeof(BMScanOpaqueData)); s->indexRelation = scan->indexRelation; + s->dsa = scan->dsa; so = (BMScanOpaque)scan->opaque; sp = so->bm_currPos; diff --git a/src/backend/access/bitmap/bitmapattutil.c b/src/backend/access/bitmap/bitmapattutil.c index cf3f03ee04b..0675342c222 100644 --- a/src/backend/access/bitmap/bitmapattutil.c +++ b/src/backend/access/bitmap/bitmapattutil.c @@ -226,7 +226,15 @@ _bitmap_create_lov_heapandindex(Relation rel, indexInfo->ii_ReadyForInserts = true; indexInfo->ii_Concurrent = false; indexInfo->ii_BrokenHotChain = false; - indexInfo->ii_ParallelWorkers = 0; + /* + * GP_PARALLEL_FIXME: temporarily set ii_ParallelWorkers to -1 to disable parallel in bitmap index + * building. That's because that we still hold InterruptHoldoffCount after launch parallel workers. 
+	 * When a parallel worker detaches, its 'X' message does not interrupt
+	 * the leader, yet the leader must wait for all workers to detach, so the
+	 * build can hang.
+	 *
+	 * We should bring this back in the future.
+	 */
+	indexInfo->ii_ParallelWorkers = -1;
 	indexInfo->ii_Am = BTREE_AM_OID;
 	indexInfo->ii_AmCache = NULL;
 	indexInfo->ii_Context = CurrentMemoryContext;
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 2847804f7e3..5d7493e1949 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -469,7 +469,7 @@ bringetbitmap(IndexScanDesc scan, Node **bmNodeP)
 	if (*bmNodeP == NULL)
 	{
 		/* XXX should we use less than work_mem for this? */
-		tbm = tbm_create(work_mem * 1024L, NULL);
+		tbm = tbm_create(work_mem * 1024L, scan->dsa);
 		*bmNodeP = (Node *) tbm;
 	}
 	else if (!IsA(*bmNodeP, TIDBitmap))
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index 3eb98607697..f928fb6e058 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -375,7 +375,7 @@ static relopt_int intRelOpts[] =
 	{
 		"parallel_workers",
 		"Number of parallel processes that can be used per executor node for this relation.",
-		RELOPT_KIND_HEAP,
+		RELOPT_KIND_HEAP | RELOPT_KIND_APPENDOPTIMIZED,
 		ShareUpdateExclusiveLock
 	},
 	-1, 0, 1024
diff --git a/src/backend/access/common/session.c b/src/backend/access/common/session.c
index 61b3206befb..f3a5e62e060 100644
--- a/src/backend/access/common/session.c
+++ b/src/backend/access/common/session.c
@@ -46,7 +46,8 @@
 /* This backend's current session. */
 Session    *CurrentSession = NULL;
-
+/* GP-style parallelism session. */
+Session    *ParallelSession = NULL;
 /*
  * Set up CurrentSession to point to an empty Session object.
  */
@@ -54,6 +55,7 @@ void
 InitializeSession(void)
 {
 	CurrentSession = MemoryContextAllocZero(TopMemoryContext, sizeof(Session));
+	ParallelSession = MemoryContextAllocZero(TopMemoryContext, sizeof(Session));
 }

 /*
diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c
index 653aa506f05..9645b58d604 100644
--- a/src/backend/access/gin/ginget.c
+++ b/src/backend/access/gin/ginget.c
@@ -1932,7 +1932,7 @@ gingetbitmap(IndexScanDesc scan, Node **bmNodeP)
 	if (*bmNodeP == NULL)
 	{
 		/* XXX should we use less than work_mem for this? */
-		tbm = tbm_create(work_mem * 1024L, NULL);
+		tbm = tbm_create(work_mem * 1024L, scan->dsa);
 		*bmNodeP = (Node *) tbm;
 	}
 	else if (!IsA(*bmNodeP, TIDBitmap))
diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README
index 25cab0047b6..4558a5668ba 100644
--- a/src/backend/access/gist/README
+++ b/src/backend/access/gist/README
@@ -461,7 +461,6 @@ value. The page is not recycled, until that XID is no longer visible to
 anyone. That's much more conservative than necessary, but let's keep it
 simple.
- Authors: Teodor Sigaev Oleg Bartunov diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index ec28bfe89f0..d281d89000f 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -48,6 +48,8 @@ #include "utils/rel.h" #include "utils/tuplesort.h" +extern XLogRecPtr LSNForEncryption(bool use_wal_lsn); + /* Step of index tuples for check whether to switch to buffering build mode */ #define BUFFERING_MODE_SWITCH_CHECK_STEP 256 @@ -452,6 +454,8 @@ gist_indexsortbuild(GISTBuildState *state) /* Write out the root */ RelationOpenSmgr(state->indexrel); PageSetLSN(pagestate->page, GistBuildLSN); + PageEncryptInplace(pagestate->page, MAIN_FORKNUM, + GIST_ROOT_BLKNO); PageSetChecksumInplace(pagestate->page, GIST_ROOT_BLKNO); smgrwrite(state->indexrel->rd_smgr, MAIN_FORKNUM, GIST_ROOT_BLKNO, pagestate->page, true); @@ -588,7 +592,14 @@ gist_indexsortbuild_flush_ready_pages(GISTBuildState *state) if (blkno != state->pages_written) elog(ERROR, "unexpected block number to flush GiST sorting build"); - PageSetLSN(page, GistBuildLSN); + PageSetLSN(page, !FileEncryptionEnabled ? GistBuildLSN : + LSNForEncryption(RelationIsPermanent(state->indexrel))); + /* Make sure LSNs are vaild, and if encryption, are not constant. */ + Assert(!XLogRecPtrIsInvalid(PageGetLSN(page)) && + (!FileEncryptionEnabled || + PageGetLSN(page) != GistBuildLSN)); + PageEncryptInplace(page, MAIN_FORKNUM, + blkno); PageSetChecksumInplace(page, blkno); smgrextend(state->indexrel->rd_smgr, MAIN_FORKNUM, blkno, page, true); diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c index f655f997ecf..f19fe0d0d29 100644 --- a/src/backend/access/gist/gistget.c +++ b/src/backend/access/gist/gistget.c @@ -765,7 +765,7 @@ gistgetbitmap(IndexScanDesc scan, Node **bmNodeP) if (*bmNodeP == NULL) { /* XXX should we use less than work_mem for this? */ - tbm = tbm_create(work_mem * 1024L, NULL); + tbm = tbm_create(work_mem * 1024L, scan->dsa); *bmNodeP = (Node *) tbm; } else if (!IsA(*bmNodeP, TIDBitmap)) diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index d152d793e7f..9ab8b829a1e 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -345,7 +345,7 @@ hashgetbitmap(IndexScanDesc scan, Node **bmNodeP) if (*bmNodeP == NULL) { /* XXX should we use less than work_mem for this? 
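 	 * Passing scan->dsa (set up for parallel bitmap scans) makes tbm_create()
 	 * build the bitmap in shared memory so other workers can use it; a NULL
 	 * dsa keeps the bitmap backend-local, as before.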
*/ - tbm = tbm_create(work_mem * 1024L, NULL); + tbm = tbm_create(work_mem * 1024L, scan->dsa); *bmNodeP = (Node *) tbm; } else if (!IsA(*bmNodeP, TIDBitmap)) diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index 49a98677876..7ee505c6bab 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -1025,6 +1025,7 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks) true); RelationOpenSmgr(rel); + PageEncryptInplace(page, MAIN_FORKNUM, lastblock); PageSetChecksumInplace(page, lastblock); smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf.data, false); diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 2865389c697..a1b98c82cfd 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -324,6 +324,8 @@ end_heap_rewrite(RewriteState state) state->rs_buffer, true); + PageEncryptInplace(state->rs_buffer, MAIN_FORKNUM, + state->rs_blockno); PageSetChecksumInplace(state->rs_buffer, state->rs_blockno); RelationOpenSmgr(state->rs_new_rel); @@ -697,6 +699,8 @@ raw_heap_insert(RewriteState state, HeapTuple tup) */ RelationOpenSmgr(state->rs_new_rel); + PageEncryptInplace(page, MAIN_FORKNUM, + state->rs_blockno); PageSetChecksumInplace(page, state->rs_blockno); smgrextend(state->rs_new_rel->rd_smgr, MAIN_FORKNUM, diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index b93288a6fe6..ce226a48a68 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -125,6 +125,7 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) scan->xs_itupdesc = NULL; scan->xs_hitup = NULL; scan->xs_hitupdesc = NULL; + scan->dsa = NULL; return scan; } diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index e36909781d3..906d753d222 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -166,6 +166,8 @@ btbuildempty(Relation index) * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record. Therefore, we need * this even when wal_level=minimal. */ + PageEncryptInplace(metapage, INIT_FORKNUM, + BTREE_METAPAGE); PageSetChecksumInplace(metapage, BTREE_METAPAGE); smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE, (char *) metapage, true); @@ -381,7 +383,7 @@ btgetbitmap(IndexScanDesc scan, Node **bmNodeP) if (*bmNodeP == NULL) { /* XXX should we use less than work_mem for this? 
*/ - tbm = tbm_create(work_mem * 1024L, NULL); + tbm = tbm_create(work_mem * 1024L, scan->dsa); *bmNodeP = (Node *) tbm; } else if (!IsA(*bmNodeP, TIDBitmap)) diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 0ff7e546326..e7f6cfa3412 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -658,13 +658,15 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno) { if (!wstate->btws_zeropage) wstate->btws_zeropage = (Page) palloc0(BLCKSZ); - /* don't set checksum for all-zero page */ + /* don't set checksum or encryption for all-zero page */ smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, wstate->btws_pages_written++, (char *) wstate->btws_zeropage, true); } + PageEncryptInplace(page, MAIN_FORKNUM, + blkno); PageSetChecksumInplace(page, blkno); /* diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index 44d6fac42ec..dacc46a5654 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -145,7 +145,8 @@ xlog_desc(StringInfo buf, XLogReaderState *record) appendStringInfoString(buf, xlrec->rp_name); } - else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT) + else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT || + info == XLOG_ENCRYPTION_LSN) { /* no further information to print */ } @@ -267,6 +268,9 @@ xlog_identify(uint8 info) case XLOG_FPI_FOR_HINT: id = "FPI_FOR_HINT"; break; + case XLOG_ENCRYPTION_LSN: + id = "ENCRYPTION_LSN"; + break; } return id; diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 1af0af7da21..23ecc913af8 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -168,6 +168,8 @@ spgbuildempty(Relation index) * of their existing content when the corresponding create records are * replayed. */ + PageEncryptInplace(page, INIT_FORKNUM, + SPGIST_METAPAGE_BLKNO); PageSetChecksumInplace(page, SPGIST_METAPAGE_BLKNO); smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_METAPAGE_BLKNO, (char *) page, true); @@ -177,6 +179,8 @@ spgbuildempty(Relation index) /* Likewise for the root page. */ SpGistInitPage(page, SPGIST_LEAF); + PageEncryptInplace(page, INIT_FORKNUM, + SPGIST_ROOT_BLKNO); PageSetChecksumInplace(page, SPGIST_ROOT_BLKNO); smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_ROOT_BLKNO, (char *) page, true); @@ -186,6 +190,8 @@ spgbuildempty(Relation index) /* Likewise for the null-tuples root page. */ SpGistInitPage(page, SPGIST_LEAF | SPGIST_NULLS); + PageEncryptInplace(page, INIT_FORKNUM, + SPGIST_NULL_BLKNO); PageSetChecksumInplace(page, SPGIST_NULL_BLKNO); smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_NULL_BLKNO, (char *) page, true); diff --git a/src/backend/access/spgist/spgscan.c b/src/backend/access/spgist/spgscan.c index 63fe2f3bf63..1feea5280d3 100644 --- a/src/backend/access/spgist/spgscan.c +++ b/src/backend/access/spgist/spgscan.c @@ -955,7 +955,7 @@ spggetbitmap(IndexScanDesc scan, Node **bmNodeP) if (*bmNodeP == NULL) { /* XXX should we use less than work_mem for this? 
*/ - tbm = tbm_create(work_mem * 1024L, NULL); + tbm = tbm_create(work_mem * 1024L, scan->dsa); *bmNodeP = (Node *) tbm; } else if (!IsA(*bmNodeP, TIDBitmap)) diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 5ea5bdd8104..5dfc3d659c2 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -30,6 +30,9 @@ #include "storage/shmem.h" #include "storage/smgr.h" +#include "access/aosegfiles.h" +#include "access/aocssegfiles.h" + /* * Constants to control the behavior of block allocation to parallel workers * during a parallel seqscan. Technically these values do not need to be @@ -169,6 +172,60 @@ table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, Assert(snapshot == SnapshotAny); pscan->phs_snapshot_any = true; } + + /* + * GPDB: for AO/AOCO tables, + * we need to fill parallel info which need a snapshot to scan systables. + * We couldn't do it in parallelscan_initialize AM which doesn't have a + * snapshot param. And we should keep same with Upstream. + * But parallelscan_initialize tells us the snapshot offset and we have set + * it in ParallelTableScanDesc, so we could use it directly. + */ + if (RelationIsAoRows(rel)) + { + Snapshot appendOnlyMetaDataSnapshot = snapshot; + ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan; + int segfile_count; + FileSegInfo **seginfo; + if (snapshot == SnapshotAny) + { + /* + * the append-only meta data should never be fetched with + * SnapshotAny as bogus results are returned. + */ + appendOnlyMetaDataSnapshot = GetTransactionSnapshot(); + } + seginfo = GetAllFileSegInfo(rel, appendOnlyMetaDataSnapshot, &segfile_count, NULL); + bpscan->phs_nblocks = segfile_count; + if (seginfo) + { + FreeAllSegFileInfo(seginfo, segfile_count); + pfree(seginfo); + } + } + else if (RelationIsAoCols(rel)) + { + Snapshot appendOnlyMetaDataSnapshot = snapshot; + ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan; + int segfile_count; + AOCSFileSegInfo **seginfo; + if (snapshot == SnapshotAny) + { + /* + * the append-only meta data should never be fetched with + * SnapshotAny as bogus results are returned. 
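+				 * Fall back to a fresh transaction snapshot for the segfile
+				 * lookup in that case; only the segfile count is kept, as
+				 * phs_nblocks.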
+ */ + appendOnlyMetaDataSnapshot = GetTransactionSnapshot(); + } + seginfo = GetAllAOCSFileSegInfo(rel, appendOnlyMetaDataSnapshot, &segfile_count, NULL); + bpscan->phs_nblocks = segfile_count; + if (seginfo) + { + FreeAllAOCSSegFileInfo(seginfo, segfile_count); + pfree(seginfo); + } + + } } TableScanDesc diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c index 8ce95ab190b..a4bf68cea33 100644 --- a/src/backend/access/transam/parallel.c +++ b/src/backend/access/transam/parallel.c @@ -24,19 +24,31 @@ #include "catalog/namespace.h" #include "catalog/pg_enum.h" #include "catalog/storage.h" +#include "cdb/cdbgang.h" +#include "cdb/cdbutil.h" +#include "cdb/cdbvars.h" #include "commands/async.h" #include "executor/execParallel.h" +#include "executor/executor.h" +#include "executor/hashjoin.h" +#include "executor/nodeAppend.h" +#include "executor/nodeHashjoin.h" #include "libpq/libpq.h" #include "libpq/pqformat.h" #include "libpq/pqmq.h" #include "miscadmin.h" +#include "nodes/execnodes.h" +#include "nodes/nodeFuncs.h" #include "optimizer/optimizer.h" +#include "optimizer/walkers.h" #include "pgstat.h" +#include "postmaster/postmaster.h" #include "storage/ipc.h" #include "storage/predicate.h" #include "storage/sinval.h" #include "storage/spin.h" #include "tcop/tcopprot.h" +#include "utils/builtins.h" #include "utils/combocid.h" #include "utils/guc.h" #include "utils/inval.h" @@ -57,6 +69,9 @@ /* Magic number for parallel context TOC. */ #define PARALLEL_MAGIC 0x50477c7c +/* Magic number for gp style parallel context TOC. */ +#define GP_PARALLEL_MAGIC 0x50477d7d + /* * Magic numbers for per-context parallel state sharing. Higher-level code * should use smaller values, leaving these very large ones for use by this @@ -76,6 +91,20 @@ #define PARALLEL_KEY_REINDEX_STATE UINT64CONST(0xFFFFFFFFFFFF000C) #define PARALLEL_KEY_RELMAPPER_STATE UINT64CONST(0xFFFFFFFFFFFF000D) #define PARALLEL_KEY_UNCOMMITTEDENUMS UINT64CONST(0xFFFFFFFFFFFF000E) +#define PARALLEL_KEY_GP_DSA UINT64CONST(0xFFFFFFFFFFFF000F) + + +/* Shared parallel dsm entry table size. estimated number = 100 connections * average 50 slices. */ +#define SHARED_PARALLEL_DSM_TABLE_SIZE 5000 + +/* CDB auxiliary state need to be synced from leader to parallel workers */ +typedef struct CdbParallelAuxState +{ + int session_id; + int num_segments; + int ic_htab_size; + char interconnect_address[NI_MAXHOST]; +} CdbParallelAuxState; /* Fixed-size parallel state. */ typedef struct FixedParallelState @@ -96,6 +125,9 @@ typedef struct FixedParallelState TimestampTz stmt_ts; SerializableXactHandle serializable_xact_handle; + /* CDB auxiliary state that worker must restore. */ + CdbParallelAuxState cdb_aux_state; + /* Mutex protects remaining fields. */ slock_t mutex; @@ -126,6 +158,9 @@ static dlist_head pcxt_list = DLIST_STATIC_INIT(pcxt_list); /* Backend-local copy of data from FixedParallelState. */ static pid_t ParallelLeaderPid; +/* Shared Hashmap to save Parallel Entries for each Query Slice */ +static HTAB *GpParallelDSMHash; + /* * List of internal parallel worker entry points. We need this for * reasons explained in LookupParallelWorkerFunction(), below. @@ -219,6 +254,9 @@ InitializeParallelDSM(ParallelContext *pcxt) Snapshot transaction_snapshot = GetTransactionSnapshot(); Snapshot active_snapshot = GetActiveSnapshot(); + if (gp_select_invisible) + active_snapshot = transaction_snapshot; + /* We might be running in a very short-lived memory context. 
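 	 * so allocations that must survive until the workers finish are made in
 	 * TopTransactionContext just below.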
 */
 	oldcontext = MemoryContextSwitchTo(TopTransactionContext);

@@ -336,6 +374,12 @@ InitializeParallelDSM(ParallelContext *pcxt)
 	fps->last_xlog_end = 0;
 	shm_toc_insert(pcxt->toc, PARALLEL_KEY_FIXED, fps);

+	/* CDB: sync some global state to the workers */
+	fps->cdb_aux_state.session_id = gp_session_id;
+	fps->cdb_aux_state.num_segments = numsegmentsFromQD;
+	strcpy(fps->cdb_aux_state.interconnect_address, interconnect_address);
+	fps->cdb_aux_state.ic_htab_size = ic_htab_size;
+
 	/* We can skip the rest of this if we're not budgeting for any workers. */
 	if (pcxt->nworkers > 0)
 	{
@@ -1335,6 +1379,16 @@ ParallelWorkerMain(Datum main_arg)
 	pq_set_parallel_leader(fps->parallel_leader_pid,
 						   fps->parallel_leader_backend_id);

+	/* CDB: restore the global state synced from the leader */
+	Gp_role = GP_ROLE_EXECUTE;
+	Gp_is_writer = false;
+	gp_session_id = fps->cdb_aux_state.session_id;
+	interconnect_address = fps->cdb_aux_state.interconnect_address;
+	numsegmentsFromQD = fps->cdb_aux_state.num_segments;
+	ic_htab_size = fps->cdb_aux_state.ic_htab_size;
+	MyProc->mppSessionId = gp_session_id;
+	MyProc->mppIsWriter = Gp_is_writer;
+
 	/*
 	 * Send a BackendKeyData message to the process that initiated parallelism
 	 * so that it has access to our PID before it receives any other messages
@@ -1406,8 +1460,16 @@ ParallelWorkerMain(Datum main_arg)
 	/* Restore GUC values from launching backend. */
 	gucspace = shm_toc_lookup(toc, PARALLEL_KEY_GUC, false);
 	RestoreGUCState(gucspace);
+	/* make sure restoring the GUCs didn't set a snapshot */
+	Assert(!FirstSnapshotSet);
 	CommitTransactionCommand();

+	/* CDB: Gp_role and Gp_is_writer are deliberately not restored with the
+	 * GUCs (see can_skip_gucvar()), so the values set above still hold.
+	 */
+	Assert(Gp_role == GP_ROLE_EXECUTE);
+	Assert(!Gp_is_writer);
+
 	/* Crank up a transaction state appropriate to a parallel worker.
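 	 * (By this point Gp_role and the CDB aux state are already in place, so
 	 * the transaction starts with executor-role semantics.)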
*/ tstatespace = shm_toc_lookup(toc, PARALLEL_KEY_TRANSACTION_STATE, false); StartParallelWorkerTransaction(tstatespace); @@ -1583,3 +1645,260 @@ LookupParallelWorkerFunction(const char *libraryname, const char *funcname) return (parallel_worker_main_type) load_external_function(libraryname, funcname, true, NULL); } + +void +InitGpParallelDSMHash(void) +{ + HASHCTL info; + + info.keysize = sizeof(ParallelEntryTag); + info.entrysize = sizeof(GpParallelDSMEntry); + info.num_partitions = NUM_PARALLEL_DSM_PARTITIONS; + + GpParallelDSMHash = ShmemInitHash("Gp Parallel DSM Hash", + SHARED_PARALLEL_DSM_TABLE_SIZE, + SHARED_PARALLEL_DSM_TABLE_SIZE, + &info, + HASH_ELEM | HASH_BLOBS | HASH_PARTITION); +} + +Size +GpParallelDSMHashSize(void) +{ + /* GPDB_PARALLEL_FIXME: limit for max slice */ + return hash_estimate_size(SHARED_PARALLEL_DSM_TABLE_SIZE, + sizeof(GpParallelDSMEntry)); +} + + +void * +GpFetchParallelDSMEntry(ParallelEntryTag tag, int plan_node_id) +{ + GpParallelDSMEntry *entry; + shm_toc *toc; + bool found = false; + + entry = (GpParallelDSMEntry *) + hash_search(GpParallelDSMHash, + &tag, + HASH_FIND, + &found); + Assert(found); + + if (entry->pid == MyProcPid) + { + toc = entry->toc; + } + else + { + Assert(ParallelSession->segment); + toc = shm_toc_attach(GP_PARALLEL_MAGIC, dsm_segment_address(ParallelSession->segment)); + } + + Assert(toc != NULL); + + return shm_toc_lookup(toc, plan_node_id, true); +} + + +void GpDestroyParallelDSMEntry() +{ + GpParallelDSMEntry *entry; + bool found = false; + + ParallelEntryTag tag; + + INIT_PARALLELENTRYTAG(tag, gp_command_count, currentSliceId, gp_session_id); + + LWLockAcquire(GpParallelDSMHashLock, LW_EXCLUSIVE); + + entry = (GpParallelDSMEntry *) + hash_search(GpParallelDSMHash, + &tag, + HASH_FIND, + &found); + + if (entry != NULL && ParallelSession->segment != NULL) + { + entry->reference--; + + Assert(entry->tolaunch >= 0 && entry->reference >= 0); + + /* + * Since we pin the dsa and dsm when we first create it, + * we need to unpin them when we detach in the last parallel worker. 
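+	 * Illustrative trace with parallel_workers = 3: the creator starts with
+	 * reference = 1, tolaunch = 2; each attaching worker does tolaunch--,
+	 * reference++; each detach does reference--, and whichever backend sees
+	 * reference == 0 && tolaunch == 0 unpins and removes the entry.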
+ */ + if (entry->reference == 0 && entry->tolaunch == 0) + { + dsa_unpin(ParallelSession->area); + dsm_unpin_segment(entry->handle); + + hash_search(GpParallelDSMHash, + &tag, + HASH_REMOVE, + &found); + } + + dsa_detach(ParallelSession->area); + dsm_detach(ParallelSession->segment); + + ParallelSession->segment = NULL; + ParallelSession->area = NULL; + } + LWLockRelease(GpParallelDSMHashLock); +} + +void +AtEOXact_GP_Parallel() +{ + GpDestroyParallelDSMEntry(); +} + +void +AtProcExit_GP_Parallel(int code, Datum arg) +{ + AtEOXact_GP_Parallel(); +} + +GpParallelDSMEntry * +GpInsertParallelDSMHash(PlanState *planstate) +{ + GpParallelDSMEntry *entry; + bool found = false; + static bool init = false; + + int localSliceId = LocallyExecutingSliceIndex(planstate->state); + int parallel_workers = 0; + + if (planstate->state->es_plannedstmt && planstate->state->es_plannedstmt->slices) + { + parallel_workers = planstate->state->es_plannedstmt->slices[localSliceId].parallel_workers; + } + + if (parallel_workers <= 1) + return NULL; + + ParallelEntryTag tag; + INIT_PARALLELENTRYTAG(tag, gp_command_count, localSliceId, gp_session_id); + + LWLockAcquire(GpParallelDSMHashLock, LW_EXCLUSIVE); + + entry = (GpParallelDSMEntry *) + hash_search(GpParallelDSMHash, + &tag, + HASH_ENTER_NULL, + &found); + + if (!entry) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("out of cross-slice SHARED_PARALLEL_DSM_TABLE_SIZE slots."))); + + if (!found) + { + shm_toc_estimator *estimator = NULL; + ParallelContext context = { + .nworkers = parallel_workers, + }; + + Size dsa_minsize = dsa_minimum_size(); + estimator = &context.estimator; + shm_toc_initialize_estimator(estimator); + + /* Estimate space for parallel DSA area. */ + shm_toc_estimate_chunk(estimator, dsa_minsize); + shm_toc_estimate_keys(estimator, 1); + + EstimateGpParallelDSMEntrySize(planstate, &context); + + Size segsize = shm_toc_estimate(estimator); + + shm_toc *toc; + dsm_segment* seg = dsm_create(segsize, DSM_CREATE_NULL_IF_MAXSEGMENTS); + + if (seg != NULL) + toc = shm_toc_create(GP_PARALLEL_MAGIC, + dsm_segment_address(seg), + segsize); + else + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("create dsm for gp style parallel workers failed."))); + + BarrierInit(&entry->build_barrier, parallel_workers); + entry->handle = dsm_segment_handle(seg); + entry->toc = toc; + entry->pid = MyProcPid; + entry->reference = 1; + entry->tolaunch = parallel_workers - 1; + entry->parallel_workers = parallel_workers; + entry->temp_worker_id = 0; + + /* Create a DSA area that can be used by the leader and all workers. */ + char *area_space = shm_toc_allocate(entry->toc, dsa_minsize); + shm_toc_insert(entry->toc, PARALLEL_KEY_GP_DSA, area_space); + dsa_area* area = dsa_create_in_place(area_space, + dsa_minsize, + LWTRANCHE_PARALLEL_QUERY_DSA, + seg); + + planstate->state->es_query_dsa = area; + + /* + * We need to pin the segment we created. + * Otherwise, if some of the parallel workers detach the segment soon enough, + * the `dsm_control->item[i].refcnt` will be set to one and the segment will + * be destroyed by dsm_detach. + * + * We need to pin dsa area too for the similar reason. 
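+	 * (dsa_pin() keeps the area alive even while no backend is attached,
+	 * until the matching dsa_unpin() in GpDestroyParallelDSMEntry().)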
+ */ + dsm_pin_segment(seg); + dsa_pin(area); + + ParallelSession->area = area; + ParallelSession->segment = seg; + context.seg = seg; + context.toc = toc; + + InitializeGpParallelDSMEntry(planstate, &context); + if (!init) + { + /* should ensure that no shared memory is pinned before process exist. */ + before_shmem_exit(AtProcExit_GP_Parallel, 0); + init = true; + } + } + else + { + dsm_segment *seg = dsm_attach(entry->handle); + if (seg == NULL) + elog(ERROR, "could not attach to Parallel DSM segment"); + ParallelSession->segment = seg; + + /* Attach to DSA area that can be used by the leader and all workers. */ + shm_toc* toc = shm_toc_attach(GP_PARALLEL_MAGIC, dsm_segment_address(seg)); + char* area_space = shm_toc_lookup(toc, PARALLEL_KEY_GP_DSA, false); + dsa_area* area = dsa_attach_in_place(area_space, seg); + + ParallelSession->area = area; + planstate->state->es_query_dsa = area; + + entry->temp_worker_id = parallel_workers - entry->tolaunch; + entry->tolaunch--; + entry->reference++; + ParallelWorkerContext ctx = { + .seg = seg, + .toc = toc, + .nworkers = parallel_workers, + .worker_id = entry->temp_worker_id, + }; + + InitializeGpParallelWorkers(planstate, &ctx); + } + + LWLockRelease(GpParallelDSMHashLock); + BarrierArriveAndWait(&entry->build_barrier, 0); + return entry; +} diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 616253d8034..c2b72a87e67 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -21,6 +21,7 @@ #include #include "access/commit_ts.h" +#include "access/heapam.h" #include "access/multixact.h" #include "access/parallel.h" #include "access/subtrans.h" @@ -2814,6 +2815,9 @@ CommitTransaction(void) if (IsInParallelMode()) AtEOXact_Parallel(true); + /* Clean up GP style parallel workers which we might have. */ + AtEOXact_GP_Parallel(); + /* Shut down the deferred-trigger manager */ AfterTriggerEndXact(true); @@ -3538,6 +3542,9 @@ AbortTransaction(void) s->parallelModeLevel = 0; } + /* Clean up GP style parallel workers which we might have. 
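	 * (AtEOXact_GP_Parallel() detaches this backend from the per-slice
	 * DSM/DSA and drops its reference; see GpDestroyParallelDSMEntry().)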
*/ + AtEOXact_GP_Parallel(); + /* * do abort processing */ diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 5765ed27fb1..7b3622bf509 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -43,13 +43,17 @@ #include "commands/progress.h" #include "commands/tablespace.h" #include "common/controldata_utils.h" +#include "common/file_utils.h" +#include "crypto/kmgr.h" #include "executor/instrument.h" +#include "crypto/bufenc.h" #include "miscadmin.h" #include "pg_trace.h" #include "pgstat.h" #include "port/atomics.h" #include "port/pg_iovec.h" #include "postmaster/bgwriter.h" +#include "postmaster/postmaster.h" #include "postmaster/startup.h" #include "postmaster/walwriter.h" #include "replication/basebackup.h" @@ -95,6 +99,7 @@ #include "utils/syscache.h" extern uint32 bootstrap_data_checksum_version; +extern int bootstrap_file_encryption_method; /* Unsupported old recovery command file names (relative to $PGDATA) */ #define RECOVERY_COMMAND_FILE "recovery.conf" @@ -124,6 +129,8 @@ int CommitSiblings = 5; /* # concurrent xacts needed to sleep */ int wal_retrieve_retry_interval = 5000; int max_slot_wal_keep_size_mb = -1; bool track_wal_io_timing = false; +/* tde feature enable or not */ +int FileEncryptionEnabled = false; /* GPDB specific */ bool gp_pause_on_restore_point_replay = false; @@ -4715,6 +4722,7 @@ InitControlFile(uint64 sysidentifier) ControlFile->wal_log_hints = wal_log_hints; ControlFile->track_commit_timestamp = track_commit_timestamp; ControlFile->data_checksum_version = bootstrap_data_checksum_version; + ControlFile->file_encryption_method = bootstrap_file_encryption_method; } static void @@ -5002,6 +5010,22 @@ ReadControlFile(void) /* Make the initdb settings visible as GUC variables, too */ SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no", PGC_INTERNAL, PGC_S_OVERRIDE); + + StaticAssertStmt(lengthof(encryption_methods) == NUM_ENCRYPTION_METHODS, + "encryption_methods[] must match NUM_ENCRYPTION_METHODS"); + + if (ControlFile->file_encryption_method < 0 || + ControlFile->file_encryption_method > NUM_ENCRYPTION_METHODS - 1) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with file_encryption_method %d," + "The max value of file_encryption_method is: %d.", + ControlFile->file_encryption_method, NUM_ENCRYPTION_METHODS), + errhint("It looks like you need to recompile or initdb."))); + + SetConfigOption("file_encryption_method", + encryption_methods[ControlFile->file_encryption_method].name, + PGC_INTERNAL, PGC_S_OVERRIDE); } /* @@ -5044,6 +5068,21 @@ DataChecksumsEnabled(void) return (ControlFile->data_checksum_version > 0); } +/* + * Is cluster file encryption enabled? + */ +int +GetFileEncryptionMethod(void) +{ + if (IsBootstrapProcessingMode()) + return bootstrap_file_encryption_method; + else + { + Assert(ControlFile != NULL); + return ControlFile->file_encryption_method; + } +} + /* * Returns a fake LSN for unlogged relations. 
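 * With cluster file encryption, LSNForEncryption(false) also returns one of
 * these, so even unlogged pages get a non-constant LSN for encryption (see
 * the Assert in gist_indexsortbuild_flush_ready_pages()).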
* @@ -5458,6 +5497,15 @@ BootStrapXLOG(void) /* some additional ControlFile fields are set in WriteControlFile() */ WriteControlFile(); + BootStrapKmgr(); + InitializeBufferEncryption(); + + if (terminal_fd != -1) + { + close(terminal_fd); + terminal_fd = -1; + } + /* Bootstrap the commit log, too */ BootStrapCLOG(); BootStrapCommitTs(); @@ -11064,6 +11112,10 @@ xlog_redo(XLogReaderState *record) UnlockReleaseBuffer(buffer); } } + else if (info == XLOG_ENCRYPTION_LSN) + { + /* nothing to do here */ + } else if (info == XLOG_BACKUP_END) { XLogRecPtr startpoint; diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 39421a4c35f..9cd6bf438d0 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -1018,6 +1018,30 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std) return recptr; } +/* + * This function returns either a WAL or fake LSN, for use by encryption. + */ +XLogRecPtr +LSNForEncryption(bool use_wal_lsn) +{ + if (use_wal_lsn) + { + int dummy = 0; + + Assert(FileEncryptionEnabled); + /* + * Records other than SWITCH_WAL must have content. We use an integer 0 to + * follow the restriction. + */ + XLogBeginInsert(); + XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT); + XLogRegisterData((char *) &dummy, sizeof(dummy)); + return XLogInsert(RM_XLOG_ID, XLOG_ENCRYPTION_LSN); + } + else + return GetFakeLSNForUnloggedRel(); +} + /* * Write a WAL record containing a full image of a page. Caller is responsible * for writing the page to disk after calling this routine. diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 499a5fb705b..4bfd33ccc68 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -31,12 +31,14 @@ #include "catalog/storage_tablespace.h" #include "commands/tablespace.h" #include "common/link-canary.h" +#include "crypto/bufenc.h" #include "libpq/pqsignal.h" #include "miscadmin.h" #include "nodes/makefuncs.h" #include "pg_getopt.h" #include "pgstat.h" #include "postmaster/bgwriter.h" +#include "postmaster/postmaster.h" /* TODO: verify we need this still */ #include "postmaster/startup.h" #include "postmaster/walwriter.h" #include "replication/walreceiver.h" @@ -53,7 +55,9 @@ #include "utils/rel.h" #include "utils/relmapper.h" -uint32 bootstrap_data_checksum_version = 0; /* No checksum */ +uint32 bootstrap_data_checksum_version = 0; /* No checksum */ +int bootstrap_file_encryption_method = DISABLED_ENCRYPTION_METHOD; +char *bootstrap_old_key_datadir = NULL; /* disabled */ static void CheckerModeMain(void); @@ -227,7 +231,7 @@ AuxiliaryProcessMain(int argc, char *argv[]) /* If no -x argument, we are a CheckerProcess */ MyAuxProcType = CheckerProcess; - while ((flag = getopt(argc, argv, "B:c:d:D:Fkr:x:X:-:")) != -1) + while ((flag = getopt(argc, argv, "B:c:d:D:FkK:r:x:R:u:X:-:")) != -1) { switch (flag) { @@ -256,12 +260,36 @@ AuxiliaryProcessMain(int argc, char *argv[]) case 'k': bootstrap_data_checksum_version = PG_DATA_CHECKSUM_VERSION; break; + case 'K': + { + int i; + + /* method 0/disabled cannot be specified */ + for (i = DISABLED_ENCRYPTION_METHOD + 1; + i < NUM_ENCRYPTION_METHODS; i++) + if (pg_strcasecmp(optarg, encryption_methods[i].name) == 0) + { + bootstrap_file_encryption_method = i; + break; + } + if (i == NUM_ENCRYPTION_METHODS) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid encryption method specified, optarg:%s, index:%d", optarg, i))); + } + break; case 'r': 
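				/* -r: redirect bootstrap output to the named file */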
strlcpy(OutputFileName, optarg, MAXPGPATH); break; case 'x': MyAuxProcType = atoi(optarg); break; + case 'R': + terminal_fd = atoi(optarg); + break; + case 'u': + bootstrap_old_key_datadir = pstrdup(optarg); + break; case 'X': { int WalSegSz = strtoul(optarg, NULL, 0); diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile index 814ec41ed57..0fe440e8d98 100644 --- a/src/backend/catalog/Makefile +++ b/src/backend/catalog/Makefile @@ -52,7 +52,7 @@ OBJS += pg_extprotocol.o \ pg_appendonly.o \ oid_dispatch.o aocatalog.o storage_tablespace.o storage_database.o \ storage_tablespace_twophase.o storage_tablespace_xact.o \ - gp_partition_template.o + gp_partition_template.o pg_task.o pg_task_run_history.o CATALOG_JSON:= $(addprefix $(top_srcdir)/gpMgmt/bin/gppylib/data/, $(addsuffix .json,$(GP_MAJORVERSION))) @@ -90,7 +90,7 @@ CATALOG_HEADERS := \ pg_default_acl.h pg_init_privs.h pg_seclabel.h pg_shseclabel.h \ pg_collation.h pg_partitioned_table.h pg_range.h pg_transform.h \ pg_sequence.h pg_publication.h pg_publication_rel.h pg_subscription.h \ - pg_subscription_rel.h gp_partition_template.h + pg_subscription_rel.h gp_partition_template.h pg_task.h pg_task_run_history.h USE_INTERNAL_FTS_FOUND := $(if $(findstring USE_INTERNAL_FTS,$(CFLAGS)),true,false) diff --git a/src/backend/catalog/catalog.c b/src/backend/catalog/catalog.c index bb328a2b80f..2ef75e4458f 100644 --- a/src/backend/catalog/catalog.c +++ b/src/backend/catalog/catalog.c @@ -43,11 +43,14 @@ #include "catalog/pg_shseclabel.h" #include "catalog/pg_subscription.h" #include "catalog/pg_tablespace.h" +#include "catalog/pg_task.h" +#include "catalog/pg_task_run_history.h" #include "catalog/pg_type.h" #include "miscadmin.h" #include "storage/fd.h" #include "utils/fmgroids.h" #include "utils/fmgrprotos.h" +#include "utils/lsyscache.h" #include "utils/rel.h" #include "utils/snapmgr.h" #include "utils/syscache.h" @@ -175,6 +178,14 @@ IsSystemClass(Oid relid, Form_pg_class reltuple) IsAoSegmentClass(reltuple)); } +bool +IsSystemClassByRelid(Oid relid) +{ + Oid relnamespace = get_rel_namespace(relid); + return (IsCatalogRelationOid(relid) || IsToastNamespace(relnamespace) || + IsAoSegmentNamespace(relnamespace)); +} + /* * IsCatalogRelation * True iff the relation is a system catalog. 
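 * (GPDB additionally defines IsSystemClassByRelid() above for callers that
 * have only a relation OID in hand.)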
@@ -481,6 +492,18 @@ IsSharedRelation(Oid relationId) return true; } #endif + + /* GPDB added task tables and their indexes */ + if (relationId == TaskRelationId || + relationId == TaskJobNameUserNameIndexId || + relationId == TaskJobIdIndexId || + relationId == TaskRunHistoryRelationId || + relationId == TaskRunHistoryJobIdIndexId || + relationId == TaskRunHistoryRunIdIndexId) + { + return true; + } + return false; } diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index 2918767f122..f5aaf172b30 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -54,6 +54,7 @@ #include "catalog/pg_statistic_ext.h" #include "catalog/pg_subscription.h" #include "catalog/pg_tablespace.h" +#include "catalog/pg_task.h" #include "catalog/pg_transform.h" #include "catalog/pg_trigger.h" #include "catalog/pg_ts_config.h" @@ -73,6 +74,7 @@ #include "commands/schemacmds.h" #include "commands/seclabel.h" #include "commands/sequence.h" +#include "commands/taskcmds.h" #include "commands/trigger.h" #include "commands/typecmds.h" #include "foreign/foreign.h" @@ -195,7 +197,8 @@ static const Oid object_classes[] = { TransformRelationId, /* OCLASS_TRANSFORM */ /* GPDB additions */ - ExtprotocolRelationId /* OCLASS_EXTPROTOCOL */ + ExtprotocolRelationId, /* OCLASS_EXTPROTOCOL */ + TaskRelationId /* OCLASS_TASK */ }; @@ -1536,6 +1539,9 @@ doDeletion(const ObjectAddress *object, int flags) case OCLASS_SCHEMA: RemoveSchemaById(object->objectId); break; + case OCLASS_TASK: + RemoveTaskById(object->objectId); + break; case OCLASS_CAST: case OCLASS_COLLATION: @@ -2938,6 +2944,9 @@ getObjectClass(const ObjectAddress *object) case TransformRelationId: return OCLASS_TRANSFORM; + + case TaskRelationId: + return OCLASS_TASK; } /* shouldn't get here */ diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 8c2fdea787a..715a3ddf4b7 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -1305,7 +1305,15 @@ index_create(Relation heapRelation, } else { - index_build(heapRelation, indexRelation, indexInfo, false, true); + /* + * GP_PARALLEL_FIXME: temporarily set ii_ParallelWorkers to -1 to disable parallel bitmap index + * building. That's because we still hold InterruptHoldoffCount after launching parallel workers, + * so when the parallel workers detach, the 'X' message does not interrupt the leader. The leader + * must nevertheless wait for the workers to detach, which results in a hang. + * + * We should bring this back in the future. + */ + index_build(heapRelation, indexRelation, indexInfo, false, indexInfo->ii_ParallelWorkers != -1); } /* @@ -3052,14 +3060,9 @@ index_build(Relation heapRelation, plan_create_index_workers(RelationGetRelid(heapRelation), RelationGetRelid(indexRelation)); - /* - * GPDB_12_MERGE_FIXME: Parallel CREATE INDEX temporarily disabled. - * In the 'partition_prune' regression test, the parallel worker - * blocked waiting for the main process. I believe there's something - * broken in the lock manager in GPDB with parallel workers. Need - * figure that out first. - */ - indexInfo->ii_ParallelWorkers = 0; + /* If we are the QD or dealing with an AO table, we disable parallelism. 
*/ + if (GP_ROLE_DISPATCH == Gp_role || AMHandlerIsAO(heapRelation->rd_amhandler)) + indexInfo->ii_ParallelWorkers = 0; if (indexInfo->ii_ParallelWorkers == 0) ereport(DEBUG1, diff --git a/src/backend/catalog/objectaddress.c b/src/backend/catalog/objectaddress.c index 33b37792e1e..320d88793a0 100644 --- a/src/backend/catalog/objectaddress.c +++ b/src/backend/catalog/objectaddress.c @@ -56,6 +56,7 @@ #include "catalog/pg_statistic_ext.h" #include "catalog/pg_subscription.h" #include "catalog/pg_tablespace.h" +#include "catalog/pg_task.h" #include "catalog/pg_transform.h" #include "catalog/pg_trigger.h" #include "catalog/pg_ts_config.h" @@ -4009,6 +4010,21 @@ getObjectDescription(const ObjectAddress *object, bool missing_ok) ExtProtocolGetNameByOid(object->objectId)); break; } + + case OCLASS_TASK: + { + char *taskname; + taskname = GetTaskNameById(object->objectId); + if (!taskname) + { + if (!missing_ok) + elog(ERROR, "cache lookup failed for task %u", + object->objectId); + break; + } + appendStringInfo(&buffer, _("task %s"), taskname); + break; + } } /* an empty buffer is equivalent to no object found */ @@ -4568,6 +4584,10 @@ getObjectTypeDescription(const ObjectAddress *object, bool missing_ok) appendStringInfoString(&buffer, "external protocol"); break; + case OCLASS_TASK: + appendStringInfoString(&buffer, "task"); + break; + /* * There's intentionally no default: case here; we want the * compiler to warn if a new OCLASS hasn't been handled above. @@ -5864,6 +5884,19 @@ getObjectIdentityParts(const ObjectAddress *object, } break; + case OCLASS_TASK: + { + char *taskname; + taskname = GetTaskNameById(object->objectId); + if (taskname) + { + appendStringInfoString(&buffer, quote_identifier(taskname)); + if (objname) + *objname = list_make1(taskname); + } + } + break; + /* * There's intentionally no default: case here; we want the * compiler to warn if a new OCLASS hasn't been handled above. 
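With OCLASS_TASK wired into getObjectClass(), doDeletion() and the object-address routines above, a task can be dropped through the regular dependency machinery. The sketch below is illustrative only and not part of the patch; performDeletion() and DROP_RESTRICT are the standard entry points from catalog/dependency.h, while the wrapper name is invented.

/*
 * Hypothetical sketch: drop a task by OID via the dependency machinery,
 * which now routes OCLASS_TASK deletions to RemoveTaskById().
 */
static void
drop_task_by_oid(Oid taskid)
{
	ObjectAddress addr;

	addr.classId = TaskRelationId;	/* maps to OCLASS_TASK */
	addr.objectId = taskid;
	addr.objectSubId = 0;

	performDeletion(&addr, DROP_RESTRICT, 0);
}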
diff --git a/src/backend/catalog/pg_appendonly.c b/src/backend/catalog/pg_appendonly.c index ef04d8755fc..a97a31107a4 100644 --- a/src/backend/catalog/pg_appendonly.c +++ b/src/backend/catalog/pg_appendonly.c @@ -16,11 +16,13 @@ */ #include "postgres.h" +#include "c.h" #include "catalog/pg_appendonly.h" #include "catalog/pg_type.h" #include "catalog/pg_proc.h" #include "catalog/gp_fastsequence.h" +#include "access/appendonlywriter.h" #include "access/genam.h" #include "access/heapam.h" #include "access/table.h" @@ -89,6 +91,7 @@ InsertAppendOnlyEntry(Oid relid, values[Anum_pg_appendonly_compresstype - 1] = NameGetDatum(&compresstype_name); values[Anum_pg_appendonly_columnstore - 1] = BoolGetDatum(columnstore); values[Anum_pg_appendonly_segrelid - 1] = ObjectIdGetDatum(segrelid); + values[Anum_pg_appendonly_segfilecount - 1] = Int16GetDatum(0); values[Anum_pg_appendonly_blkdirrelid - 1] = ObjectIdGetDatum(blkdirrelid); values[Anum_pg_appendonly_blkdiridxid - 1] = ObjectIdGetDatum(blkdiridxid); values[Anum_pg_appendonly_visimaprelid - 1] = ObjectIdGetDatum(visimaprelid); @@ -639,3 +642,34 @@ SwapAppendonlyEntries(Oid entryRelId1, Oid entryRelId2) } } +/* + * GetAppendOnlySegmentFilesCount + * Count the segment files of an AO table by scanning its pg_aoseg + * auxiliary table. + */ +int16 +GetAppendOnlySegmentFilesCount(Relation rel) +{ + Relation pg_aoseg_rel; + HeapTuple tuple; + SysScanDesc aoscan; + int16 result = 0; + Oid segrelid = InvalidOid; + + GetAppendOnlyEntryAuxOids(rel->rd_id, NULL, &segrelid, NULL, + NULL, NULL, NULL); + if (segrelid == InvalidOid) + elog(ERROR, "could not find pg_aoseg aux table for AO table \"%s\"", + RelationGetRelationName(rel)); + + pg_aoseg_rel = table_open(segrelid, AccessShareLock); + aoscan = systable_beginscan(pg_aoseg_rel, InvalidOid, false, NULL, 0, NULL); + while ((tuple = systable_getnext(aoscan)) != NULL) + { + result++; + CHECK_FOR_INTERRUPTS(); + } + Assert(result <= MAX_AOREL_CONCURRENCY); + systable_endscan(aoscan); + table_close(pg_aoseg_rel, AccessShareLock); + return result; +} diff --git a/src/backend/catalog/pg_task.c b/src/backend/catalog/pg_task.c new file mode 100644 index 00000000000..34a9a0b16c0 --- /dev/null +++ b/src/backend/catalog/pg_task.c @@ -0,0 +1,230 @@ +/*------------------------------------------------------------------------- + * + * pg_task.c + * stores all tasks of the pg_cron scheduler. + * + * Portions Copyright (c) 2023-Present Hashdata Inc. + * + * + * IDENTIFICATION + * src/backend/catalog/pg_task.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/htup_details.h" +#include "access/genam.h" +#include "access/table.h" +#include "catalog/catalog.h" +#include "catalog/indexing.h" +#include "catalog/pg_task.h" +#include "postmaster/bgworker.h" +#include "task/pg_cron.h" +#include "utils/builtins.h" +#include "utils/rel.h" + +/* + * TaskCreate + * Create a new task in pg_task. 
+ */ +Oid +TaskCreate(const char *schedule, const char *command, + const char *nodename, int32 nodeport, + const char *database, const char *username, + bool active, const char *jobname) +{ + Relation pg_task; + HeapTuple tup; + Oid jobid; + Datum values[Natts_pg_task]; + bool nulls[Natts_pg_task]; + pid_t cron_pid; + + memset(values, 0, sizeof(values)); + memset(nulls, false, sizeof(nulls)); + + pg_task = table_open(TaskRelationId, RowExclusiveLock); + + jobid = GetNewOidWithIndex(pg_task, TaskJobIdIndexId, Anum_pg_task_jobid); + values[Anum_pg_task_jobid - 1] = ObjectIdGetDatum(jobid); + values[Anum_pg_task_command - 1] = CStringGetTextDatum(command); + values[Anum_pg_task_schedule - 1] = CStringGetTextDatum(schedule); + values[Anum_pg_task_nodename - 1] = CStringGetTextDatum(nodename); + values[Anum_pg_task_nodeport - 1] = Int32GetDatum(nodeport); + values[Anum_pg_task_database - 1] = CStringGetTextDatum(database); + values[Anum_pg_task_username - 1] = CStringGetTextDatum(username); + values[Anum_pg_task_active - 1] = BoolGetDatum(active); + if (jobname) + values[Anum_pg_task_jobname - 1] = CStringGetTextDatum(jobname); + else + nulls[Anum_pg_task_jobname - 1] = true; + + tup = heap_form_tuple(RelationGetDescr(pg_task), values, nulls); + CatalogTupleInsert(pg_task, tup); + heap_freetuple(tup); + + table_close(pg_task, RowExclusiveLock); + + /* Send SIGHUP to pg_cron launcher to reload the task */ + cron_pid = PgCronLauncherPID(); + if (cron_pid == InvalidPid) + elog(ERROR, "could not find pid of pg_cron launcher process"); + if (kill(cron_pid, SIGHUP) < 0) + elog(DEBUG3, "kill(%ld,%d) failed: %m", (long) cron_pid, SIGHUP); + + return jobid; +} + +/* + * TaskUpdate + * Update an existing task in pg_task. + */ +void +TaskUpdate(Oid jobid, const char *schedule, + const char *command, const char *database, + const char *username, bool *active) +{ + Relation pg_task; + HeapTuple tup; + SysScanDesc scanDescriptor = NULL; + ScanKeyData scanKey[1]; + Datum values[Natts_pg_task]; + bool nulls[Natts_pg_task]; + bool doreplace[Natts_pg_task]; + pid_t cron_pid; + + memset(values, 0, sizeof(values)); + memset(nulls, false, sizeof(nulls)); + memset(doreplace, false, sizeof(doreplace)); + + pg_task = table_open(TaskRelationId, RowExclusiveLock); + + /* try to find the task */ + ScanKeyInit(&scanKey[0], Anum_pg_task_jobid, + BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(jobid)); + + scanDescriptor = systable_beginscan(pg_task, TaskJobIdIndexId, + true, NULL, 1, scanKey); + tup = systable_getnext(scanDescriptor); + + if (!HeapTupleIsValid(tup)) + elog(ERROR, "could not find valid entry for job"); + + /* specify the fields that need to be updated */ + if (schedule) + { + values[Anum_pg_task_schedule - 1] = CStringGetTextDatum(schedule); + doreplace[Anum_pg_task_schedule - 1] = true; + } + if (command) + { + values[Anum_pg_task_command - 1] = CStringGetTextDatum(command); + doreplace[Anum_pg_task_command - 1] = true; + } + if (database) + { + values[Anum_pg_task_database - 1] = CStringGetTextDatum(database); + doreplace[Anum_pg_task_database - 1] = true; + } + if (username) + { + values[Anum_pg_task_username - 1] = CStringGetTextDatum(username); + doreplace[Anum_pg_task_username - 1] = true; + } + if (active) + { + values[Anum_pg_task_active - 1] = BoolGetDatum(*active); + doreplace[Anum_pg_task_active - 1] = true; + } + + tup = heap_modify_tuple(tup, RelationGetDescr(pg_task), values, nulls, doreplace); + CatalogTupleUpdate(pg_task, &tup->t_self, tup); + heap_freetuple(tup); + + 
systable_endscan(scanDescriptor); + table_close(pg_task, RowExclusiveLock); + + /* Send SIGHUP to pg_cron launcher to reload the task */ + cron_pid = PgCronLauncherPID(); + if (cron_pid == InvalidPid) + elog(ERROR, "could not find pid of pg_cron launcher process"); + if (kill(cron_pid, SIGHUP) < 0) + elog(DEBUG3, "kill(%ld,%d) failed: %m", (long) cron_pid, SIGHUP); +} + +/* + * GetTaskJobId + * Get the jobid of a task. + */ +Oid +GetTaskJobId(const char *jobname, const char *username) +{ + Relation pg_task; + HeapTuple tup; + SysScanDesc scanDescriptor = NULL; + ScanKeyData scanKey[2]; + Oid jobid = InvalidOid; + Form_pg_task task; + + pg_task = table_open(TaskRelationId, AccessShareLock); + + ScanKeyInit(&scanKey[0], Anum_pg_task_jobname, BTEqualStrategyNumber, + F_TEXTEQ, CStringGetTextDatum(jobname)); + ScanKeyInit(&scanKey[1], Anum_pg_task_username, BTEqualStrategyNumber, + F_TEXTEQ, CStringGetTextDatum(username)); + + scanDescriptor = systable_beginscan(pg_task, TaskJobNameUserNameIndexId, + true, NULL, 2, scanKey); + tup = systable_getnext(scanDescriptor); + + if (HeapTupleIsValid(tup)) + { + task = (Form_pg_task) GETSTRUCT(tup); + jobid = task->jobid; + } + + systable_endscan(scanDescriptor); + table_close(pg_task, AccessShareLock); + + return jobid; +} + +/* + * GetTaskNameById + * Get task name by job id. + */ +char * +GetTaskNameById(Oid jobid) +{ + Relation pg_task; + HeapTuple tup; + SysScanDesc scanDescriptor = NULL; + ScanKeyData scanKey[1]; + char *result = NULL; + TupleDesc tupleDesc = NULL; + bool isNull = false; + + pg_task = table_open(TaskRelationId, AccessShareLock); + + ScanKeyInit(&scanKey[0], Anum_pg_task_jobid, BTEqualStrategyNumber, + F_OIDEQ, ObjectIdGetDatum(jobid)); + + scanDescriptor = systable_beginscan(pg_task, TaskJobIdIndexId, + true, NULL, 1, scanKey); + tup = systable_getnext(scanDescriptor); + + if (HeapTupleIsValid(tup)) + { + tupleDesc = RelationGetDescr(pg_task); + Datum jobname = heap_getattr(tup, Anum_pg_task_jobname, tupleDesc, &isNull); + if (!isNull) + result = TextDatumGetCString(jobname); + } + + systable_endscan(scanDescriptor); + table_close(pg_task, AccessShareLock); + + return result; +} diff --git a/src/backend/catalog/pg_task_run_history.c b/src/backend/catalog/pg_task_run_history.c new file mode 100644 index 00000000000..0b188081a5e --- /dev/null +++ b/src/backend/catalog/pg_task_run_history.c @@ -0,0 +1,206 @@ +/*------------------------------------------------------------------------- + * + * pg_task_run_history.c + * stores the run histories of all pg_cron scheduler tasks. + * + * Portions Copyright (c) 2023-Present Hashdata Inc. + * + * + * IDENTIFICATION + * src/backend/catalog/pg_task_run_history.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/htup_details.h" +#include "access/genam.h" +#include "access/table.h" +#include "catalog/catalog.h" +#include "catalog/indexing.h" +#include "catalog/pg_task_run_history.h" +#include "utils/builtins.h" +#include "utils/rel.h" +#include "utils/timestamp.h" + +/* + * TaskRunHistoryCreate + * Create a new task run history entry in pg_task_run_history. 
+ */ +void +TaskRunHistoryCreate(Oid runid, int64 *jobid, const char *database, const char *username, + const char *command, const char *status) +{ + Relation pg_task_run_history; + HeapTuple tup; + Datum values[Natts_pg_task_run_history]; + bool nulls[Natts_pg_task_run_history]; + + memset(values, 0, sizeof(values)); + memset(nulls, false, sizeof(nulls)); + + pg_task_run_history = table_open(TaskRunHistoryRelationId, RowExclusiveLock); + + values[Anum_pg_task_run_history_runid - 1] = ObjectIdGetDatum(runid); + values[Anum_pg_task_run_history_jobid - 1] = ObjectIdGetDatum((Oid) *jobid); + values[Anum_pg_task_run_history_database - 1] = CStringGetTextDatum(database); + values[Anum_pg_task_run_history_username - 1] = CStringGetTextDatum(username); + values[Anum_pg_task_run_history_command - 1] = CStringGetTextDatum(command); + values[Anum_pg_task_run_history_status - 1] = CStringGetTextDatum(status); + + nulls[Anum_pg_task_run_history_return_message - 1] = true; + nulls[Anum_pg_task_run_history_start_time - 1] = true; + nulls[Anum_pg_task_run_history_end_time - 1] = true; + + tup = heap_form_tuple(RelationGetDescr(pg_task_run_history), values, nulls); + CatalogTupleInsert(pg_task_run_history, tup); + heap_freetuple(tup); + + table_close(pg_task_run_history, RowExclusiveLock); +} + +/* + * TaskRunHistoryUpdate + * Update an existing task run history in pg_task_run_history. + */ +void +TaskRunHistoryUpdate(Oid runid, int32 *job_pid, const char *status, + const char *return_message, TimestampTz *start_time, TimestampTz *end_time) +{ + Relation pg_task_run_history; + HeapTuple tup; + SysScanDesc scanDescriptor = NULL; + ScanKeyData scanKey[1]; + Datum values[Natts_pg_task_run_history]; + bool nulls[Natts_pg_task_run_history]; + bool doreplace[Natts_pg_task_run_history]; + + memset(values, 0, sizeof(values)); + memset(nulls, false, sizeof(nulls)); + memset(doreplace, false, sizeof(doreplace)); + + pg_task_run_history = table_open(TaskRunHistoryRelationId, RowExclusiveLock); + + ScanKeyInit(&scanKey[0], + Anum_pg_task_run_history_runid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(runid)); + + scanDescriptor = systable_beginscan(pg_task_run_history, + TaskRunHistoryRunIdIndexId, + true, NULL, 1, scanKey); + + tup = systable_getnext(scanDescriptor); + if (!HeapTupleIsValid(tup)) + { + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("task run history with runid %u does not exist", runid))); + } + + if (job_pid != NULL) + { + values[Anum_pg_task_run_history_job_pid - 1] = Int32GetDatum(*job_pid); + doreplace[Anum_pg_task_run_history_job_pid - 1] = true; + } + if (status != NULL) + { + values[Anum_pg_task_run_history_status - 1] = CStringGetTextDatum(status); + doreplace[Anum_pg_task_run_history_status - 1] = true; + } + if (return_message != NULL) + { + values[Anum_pg_task_run_history_return_message - 1] = CStringGetTextDatum(return_message); + doreplace[Anum_pg_task_run_history_return_message - 1] = true; + } + if (start_time != NULL) + { + values[Anum_pg_task_run_history_start_time - 1] = TimestampTzGetDatum(*start_time); + doreplace[Anum_pg_task_run_history_start_time - 1] = true; + } + if (end_time != NULL) + { + values[Anum_pg_task_run_history_end_time - 1] = TimestampTzGetDatum(*end_time); + doreplace[Anum_pg_task_run_history_end_time - 1] = true; + } + + tup = heap_modify_tuple(tup, RelationGetDescr(pg_task_run_history), values, nulls, doreplace); + CatalogTupleUpdate(pg_task_run_history, &tup->t_self, tup); + heap_freetuple(tup); + + systable_endscan(scanDescriptor); + 
table_close(pg_task_run_history, RowExclusiveLock); +} + +/* + * MarkRunningTaskAsFailed + * Mark all the running tasks as failed. + */ +void +MarkRunningTaskAsFailed(void) +{ + Relation pg_task_run_history; + HeapTuple tup; + SysScanDesc scanDescriptor = NULL; + ScanKeyData scanKey[1]; + Datum values[Natts_pg_task_run_history]; + bool nulls[Natts_pg_task_run_history]; + bool doreplace[Natts_pg_task_run_history]; + + memset(values, 0, sizeof(values)); + memset(nulls, false, sizeof(nulls)); + memset(doreplace, false, sizeof(doreplace)); + + pg_task_run_history = table_open(TaskRunHistoryRelationId, RowExclusiveLock); + + ScanKeyInit(&scanKey[0], Anum_pg_task_run_history_status, BTEqualStrategyNumber, + F_TEXTEQ, CStringGetTextDatum("running")); + + scanDescriptor = systable_beginscan(pg_task_run_history, InvalidOid, + false, NULL, 1, scanKey); + + while (HeapTupleIsValid(tup = systable_getnext(scanDescriptor))) + { + values[Anum_pg_task_run_history_status - 1] = CStringGetTextDatum("failed"); + doreplace[Anum_pg_task_run_history_status - 1] = true; + values[Anum_pg_task_run_history_return_message - 1] = CStringGetTextDatum("server restarted"); + doreplace[Anum_pg_task_run_history_return_message - 1] = true; + + tup = heap_modify_tuple(tup, RelationGetDescr(pg_task_run_history), values, nulls, doreplace); + CatalogTupleUpdate(pg_task_run_history, &tup->t_self, tup); + heap_freetuple(tup); + } + + systable_endscan(scanDescriptor); + table_close(pg_task_run_history, RowExclusiveLock); +} + +/* + * RemoveTaskRunHistoryByJobId + * Remove all the task run history records for the given jobid. + */ +void +RemoveTaskRunHistoryByJobId(Oid jobid) +{ + Relation pg_task_run_history; + HeapTuple tup; + SysScanDesc scanDescriptor = NULL; + ScanKeyData scanKey[1]; + + pg_task_run_history = table_open(TaskRunHistoryRelationId, RowExclusiveLock); + + ScanKeyInit(&scanKey[0], Anum_pg_task_run_history_jobid, BTEqualStrategyNumber, + F_OIDEQ, ObjectIdGetDatum(jobid)); + + scanDescriptor = systable_beginscan(pg_task_run_history, TaskRunHistoryJobIdIndexId, + true, NULL, 1, scanKey); + + while (HeapTupleIsValid(tup = systable_getnext(scanDescriptor))) + { + CatalogTupleDelete(pg_task_run_history, &tup->t_self); + } + + systable_endscan(scanDescriptor); + table_close(pg_task_run_history, RowExclusiveLock); +} diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index e875d4d62d1..9e24ac4599e 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -475,8 +475,8 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst, smgrread(src, forkNum, blkno, buf.data); - if (!PageIsVerifiedExtended(page, blkno, - PIV_LOG_WARNING | PIV_REPORT_STAT)) + if (!PageIsVerifiedExtended(page, forkNum, + blkno, PIV_LOG_WARNING | PIV_REPORT_STAT)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("invalid page in block %u of relation %s", @@ -484,7 +484,6 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst, relpathbackend(src->smgr_rnode.node, src->smgr_rnode.backend, forkNum)))); - /* * WAL-log the copied page. 
Unfortunately we don't know what kind of a * page this is, so we have to log the full page including any unused @@ -493,6 +492,8 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst, if (use_wal) log_newpage(&dst->smgr_rnode.node, forkNum, blkno, page, false); + PageEncryptInplace(page, forkNum, + blkno); PageSetChecksumInplace(page, blkno); /* diff --git a/src/backend/catalog/system_functions.sql b/src/backend/catalog/system_functions.sql index 55342932d05..51cd0e41939 100644 --- a/src/backend/catalog/system_functions.sql +++ b/src/backend/catalog/system_functions.sql @@ -278,7 +278,7 @@ RETURN pg_sleep(extract(epoch from $1) - CREATE OR REPLACE FUNCTION pg_relation_size(regclass) RETURNS bigint LANGUAGE sql - PARALLEL SAFE STRICT COST 1 + PARALLEL UNSAFE STRICT COST 1 RETURN pg_relation_size($1, 'main'); CREATE OR REPLACE FUNCTION obj_description(oid, name) diff --git a/src/backend/cdb/cdbappendonlystorageformat.c b/src/backend/cdb/cdbappendonlystorageformat.c index e8d40b61f46..972057a0eec 100755 --- a/src/backend/cdb/cdbappendonlystorageformat.c +++ b/src/backend/cdb/cdbappendonlystorageformat.c @@ -13,6 +13,7 @@ */ #include "postgres.h" +#include "access/xlog.h" #include "cdb/cdbappendonlystorage_int.h" #include "cdb/cdbappendonlystorage.h" #include "cdb/cdbappendonlystorageformat.h" @@ -1696,6 +1697,16 @@ AppendOnlyStorageFormat_VerifyBlockChecksum( pg_crc32 *blockChecksumPtr; + + /* + * TODO: when TDE is enabled, the buffer has already been decrypted, so + * the data has changed and the computed checksum will no longer match + * the stored checksum. Just return true for now; we should fix this + * properly in the future. + */ + if (FileEncryptionEnabled) + return true; + Assert(headerPtr != NULL); Assert(storedChecksum != NULL); Assert(computedChecksum != NULL); diff --git a/src/backend/cdb/cdbappendonlystorageread.c b/src/backend/cdb/cdbappendonlystorageread.c index 3eaf6e77827..6918af8a5ec 100755 --- a/src/backend/cdb/cdbappendonlystorageread.c +++ b/src/backend/cdb/cdbappendonlystorageread.c @@ -62,7 +62,8 @@ AppendOnlyStorageRead_Init(AppendOnlyStorageRead *storageRead, int32 maxBufferLen, char *relationName, char *title, - AppendOnlyStorageAttributes *storageAttributes) + AppendOnlyStorageAttributes *storageAttributes, + RelFileNode *relFileNode) { uint8 *memory; int32 memoryLen; @@ -115,7 +116,8 @@ AppendOnlyStorageRead_Init(AppendOnlyStorageRead *storageRead, memoryLen, storageRead->maxBufferLen, storageRead->largeReadLen, - relationName); + relationName, + relFileNode); elogif(Debug_appendonly_print_scan || Debug_appendonly_print_read_block, LOG, "Append-Only Storage Read initialize for table '%s' " @@ -128,7 +130,8 @@ AppendOnlyStorageRead_Init(AppendOnlyStorageRead *storageRead, storageRead->file = -1; storageRead->formatVersion = -1; - + storageRead->relFileNode = *relFileNode; + MemoryContextSwitchTo(oldMemoryContext); storageRead->isActive = true; diff --git a/src/backend/cdb/cdbappendonlystoragewrite.c b/src/backend/cdb/cdbappendonlystoragewrite.c index c5f40488f35..c4e486906c7 100755 --- a/src/backend/cdb/cdbappendonlystoragewrite.c +++ b/src/backend/cdb/cdbappendonlystoragewrite.c @@ -1072,6 +1072,15 @@ AppendOnlyStorageWrite_VerifyWriteBlock(AppendOnlyStorageWrite *storageWrite, VarBlockReader varBlockReader; int readerItemCount; + /* + * Now use the VarBlock module to extract the items out. 
+ */ + VarBlockReaderInit(&varBlockReader, + data, + uncompressedLen, + true, + &storageWrite->relFileNode.node); + varBlockCheckError = VarBlockIsValid(data, uncompressedLen); if (varBlockCheckError != VarBlockCheckOk) ereport(ERROR, @@ -1082,13 +1091,6 @@ AppendOnlyStorageWrite_VerifyWriteBlock(AppendOnlyStorageWrite *storageWrite, errdetail_appendonly_write_storage_block_header(storageWrite), errcontext_appendonly_write_storage_block(storageWrite))); - /* - * Now use the VarBlock module to extract the items out. - */ - VarBlockReaderInit(&varBlockReader, - data, - uncompressedLen); - readerItemCount = VarBlockReaderItemCount(&varBlockReader); if (rowCount != readerItemCount) @@ -1425,7 +1427,6 @@ AppendOnlyStorageWrite_FinishBuffer(AppendOnlyStorageWrite *storageWrite, executorBlockKind, rowCount, compressedLen); - /* * Finish the current buffer by specifying the used length. */ diff --git a/src/backend/cdb/cdbbufferedappend.c b/src/backend/cdb/cdbbufferedappend.c index de6fc949cf6..96a5e7edf41 100644 --- a/src/backend/cdb/cdbbufferedappend.c +++ b/src/backend/cdb/cdbbufferedappend.c @@ -19,8 +19,10 @@ #include /* for write() */ +#include "access/xlog.h" #include "cdb/cdbappendonlyxlog.h" #include "cdb/cdbbufferedappend.h" +#include "crypto/bufenc.h" #include "pgstat.h" #include "utils/guc.h" diff --git a/src/backend/cdb/cdbbufferedread.c b/src/backend/cdb/cdbbufferedread.c index 89546bd4027..00126630077 100644 --- a/src/backend/cdb/cdbbufferedread.c +++ b/src/backend/cdb/cdbbufferedread.c @@ -16,7 +16,9 @@ */ #include "postgres.h" +#include "access/xlog.h" #include "cdb/cdbbufferedread.h" +#include "crypto/bufenc.h" #include "miscadmin.h" #include "pgstat.h" #include "utils/guc.h" @@ -59,7 +61,8 @@ BufferedReadInit( int32 memoryLen, int32 maxBufferLen, int32 maxLargeReadLen, - char *relationName) + char *relationName, + RelFileNode *file_node) { Assert(bufferedRead != NULL); Assert(memory != NULL); @@ -104,6 +107,7 @@ BufferedReadInit( /* start reading from beginning of file */ bufferedRead->fileOff = 0; + bufferedRead->relFileNode = *file_node; /* * Temporary limit support for random reading. 
*/ diff --git a/src/backend/cdb/cdbgroupingpaths.c b/src/backend/cdb/cdbgroupingpaths.c index 8a7e3ca8e74..3598c45d053 100644 --- a/src/backend/cdb/cdbgroupingpaths.c +++ b/src/backend/cdb/cdbgroupingpaths.c @@ -161,7 +161,7 @@ typedef struct } cdb_multi_dqas_info; static void create_two_stage_paths(PlannerInfo *root, cdb_agg_planning_context *ctx, - RelOptInfo *input_rel, RelOptInfo *output_rel); + RelOptInfo *input_rel, RelOptInfo *output_rel, List *partial_pathlist); static List *get_all_rollup_groupclauses(List *rollups); @@ -181,11 +181,13 @@ static void add_second_stage_group_agg_path(PlannerInfo *root, Path *path, bool is_sorted, cdb_agg_planning_context *ctx, - RelOptInfo *output_rel); + RelOptInfo *output_rel, + bool is_partial); static void add_second_stage_hash_agg_path(PlannerInfo *root, Path *path, cdb_agg_planning_context *ctx, - RelOptInfo *output_rel); + RelOptInfo *output_rel, + bool is_partial); static void add_single_dqa_hash_agg_path(PlannerInfo *root, Path *path, cdb_agg_planning_context *ctx, @@ -247,7 +249,8 @@ cdb_create_multistage_grouping_paths(PlannerInfo *root, const AggClauseCosts *agg_final_costs, List *rollups, List *new_rollups, - AggStrategy strat) + AggStrategy strat, + List *partial_pathlist) { Query *parse = root->parse; Path *cheapest_path = input_rel->cheapest_total_path; @@ -331,6 +334,11 @@ cdb_create_multistage_grouping_paths(PlannerInfo *root, } ctx.partial_needed_pathkeys = root->group_pathkeys; ctx.partial_sort_pathkeys = root->group_pathkeys; + /* + * CBDB parallel: Set consider_parallel for costs comparison. + * Else 2-stage agg with lower costs may lose to 1-stage agg. + */ + ctx.partial_rel->consider_parallel = output_rel->consider_parallel; ctx.group_tles = get_common_group_tles(target, parse->groupClause, @@ -449,7 +457,7 @@ cdb_create_multistage_grouping_paths(PlannerInfo *root, /* * All set, generate the two-stage paths. */ - create_two_stage_paths(root, &ctx, input_rel, output_rel); + create_two_stage_paths(root, &ctx, input_rel, output_rel, partial_pathlist); /* * Aggregates with DISTINCT arguments are more complicated, and are not @@ -594,6 +602,11 @@ cdb_create_twostage_distinct_paths(PlannerInfo *root, ctx.agg_final_costs = &zero_agg_costs; ctx.rollups = NIL; ctx.partial_rel = fetch_upper_rel(root, UPPERREL_CDB_FIRST_STAGE_DISTINCT, NULL); + /* + * CBDB parallel: Set consider_parallel for costs comparison. + * Else 2-stage agg with lower costs may lose to 1-stage agg. + */ + ctx.partial_rel->consider_parallel = output_rel->consider_parallel; /* * Set up these fields to look like a query with a GROUP BY on all the @@ -649,7 +662,7 @@ cdb_create_twostage_distinct_paths(PlannerInfo *root, /* * All set, generate the two-stage paths. */ - create_two_stage_paths(root, &ctx, input_rel, output_rel); + create_two_stage_paths(root, &ctx, input_rel, output_rel, NIL); } /* @@ -657,7 +670,7 @@ cdb_create_twostage_distinct_paths(PlannerInfo *root, */ static void create_two_stage_paths(PlannerInfo *root, cdb_agg_planning_context *ctx, - RelOptInfo *input_rel, RelOptInfo *output_rel) + RelOptInfo *input_rel, RelOptInfo *output_rel, List *partial_pathlist) { Path *cheapest_path = input_rel->cheapest_total_path; @@ -665,9 +678,9 @@ create_two_stage_paths(PlannerInfo *root, cdb_agg_planning_context *ctx, * Consider ways to do the first Aggregate stage. * * The first stage's output is Partially Aggregated. The paths are - * collected to the ctx->partial_rel, by calling add_path(). 
We do *not* - use add_partial_path(), these partially aggregated paths are considered - more like MPP paths in Cloudberry in general. + collected to the ctx->partial_rel, by calling add_path(). + These partially aggregated paths are considered + more like MPP paths in Greenplum in general. * * First consider sorted Aggregate paths. */ @@ -716,6 +729,20 @@ create_two_stage_paths(PlannerInfo *root, cdb_agg_planning_context *ctx, add_first_stage_hash_agg_path(root, cheapest_path, ctx); } + if (partial_pathlist) + { + ListCell *lc; + + foreach(lc, partial_pathlist) + { + Path *path = (Path *) lfirst(lc); + + if (cdbpathlocus_collocates_tlist(root, path->locus, ctx->group_tles)) + continue; + add_partial_path(ctx->partial_rel, path); + } + } + /* * We now have partially aggregated paths in ctx->partial_rel. Consider * different ways of performing the Finalize Aggregate stage. @@ -747,14 +774,63 @@ create_two_stage_paths(PlannerInfo *root, cdb_agg_planning_context *ctx, else is_sorted = false; if (path == cheapest_first_stage_path || is_sorted) + { add_second_stage_group_agg_path(root, path, is_sorted, - ctx, output_rel); + ctx, output_rel, false); + } } } if (ctx->can_hash && list_length(ctx->agg_costs->distinctAggrefs) == 0) + { add_second_stage_hash_agg_path(root, cheapest_first_stage_path, - ctx, output_rel); + ctx, output_rel, false); + } + } + + /* + * Same as above, but for the partial paths in partial_rel; these are + * parallel aggregate paths with multiple workers. + */ + if (ctx->partial_rel->partial_pathlist) + { + Path *cheapest_first_stage_path; + + cheapest_first_stage_path = linitial(ctx->partial_rel->partial_pathlist); + + if (ctx->can_sort) + { + ListCell *lc; + + foreach(lc, ctx->partial_rel->partial_pathlist) + { + Path *path = (Path *) lfirst(lc); + bool is_sorted; + + /* + * In two-stage GROUPING SETS paths, the second stage's grouping + * will include GROUPINGSET_ID(), which is not included in + * root->pathkeys. The first stage's sort order does not include + * that, so we know it's not sorted. + */ + if (!root->parse->groupingSets) + is_sorted = pathkeys_contained_in(ctx->final_needed_pathkeys, + path->pathkeys); + else + is_sorted = false; + if (path == cheapest_first_stage_path || is_sorted) + { + add_second_stage_group_agg_path(root, path, is_sorted, + ctx, output_rel, true); + } + } + } + + if (ctx->can_hash && list_length(ctx->agg_costs->distinctAggrefs) == 0) + { + add_second_stage_hash_agg_path(root, cheapest_first_stage_path, + ctx, output_rel, true); + } } } @@ -951,13 +1027,15 @@ static void /* * Create Finalize Aggregate path, from a partially aggregated input. + * If is_partial is true, add the path to partial_pathlist. 
*/ static void add_second_stage_group_agg_path(PlannerInfo *root, Path *initial_agg_path, bool is_sorted, cdb_agg_planning_context *ctx, - RelOptInfo *output_rel) + RelOptInfo *output_rel, + bool is_partial) { Path *path; CdbPathLocus singleQE_locus; @@ -995,7 +1073,9 @@ add_second_stage_group_agg_path(PlannerInfo *root, /* Alternative 1: Redistribute -> Sort -> Agg */ if (CdbPathLocus_IsHashed(group_locus)) { - path = cdbpath_create_motion_path(root, initial_agg_path, NIL, + path = initial_agg_path; + + path = cdbpath_create_motion_path(root, path, NIL, false, group_locus); if (ctx->final_sort_pathkeys) @@ -1018,7 +1098,10 @@ add_second_stage_group_agg_path(PlannerInfo *root, ctx->dNumGroupsTotal); path->pathkeys = strip_gsetid_from_pathkeys(ctx->gsetid_sortref, path->pathkeys); - add_path(output_rel, path, root); + if (!is_partial) + add_path(output_rel, path, root); + else + add_partial_path(output_rel, path); } /* @@ -1052,7 +1135,10 @@ add_second_stage_group_agg_path(PlannerInfo *root, ctx->agg_final_costs, ctx->dNumGroupsTotal); path->pathkeys = strip_gsetid_from_pathkeys(ctx->gsetid_sortref, path->pathkeys); - add_path(output_rel, path, root); + if (!is_partial) + add_path(output_rel, path, root); + else + add_partial_path(output_rel, path); } /* @@ -1083,7 +1169,8 @@ add_first_stage_hash_agg_path(PlannerInfo *root, ctx->new_rollups, ctx->agg_partial_costs); CdbPathLocus_MakeStrewn(&(first_stage_agg_path->locus), - CdbPathLocus_NumSegments(first_stage_agg_path->locus)); + CdbPathLocus_NumSegments(first_stage_agg_path->locus), + path->parallel_workers); add_path(ctx->partial_rel, first_stage_agg_path, root); } else @@ -1106,12 +1193,14 @@ add_first_stage_hash_agg_path(PlannerInfo *root, /* * Create Finalize Aggregate path from a partially aggregated input by hashing. + * If is_partial is true, add the path to partial_pathlist. 
*/ static void add_second_stage_hash_agg_path(PlannerInfo *root, Path *initial_agg_path, cdb_agg_planning_context *ctx, - RelOptInfo *output_rel) + RelOptInfo *output_rel, + bool is_partial) { CdbPathLocus group_locus; bool needs_redistribute; @@ -1140,10 +1229,11 @@ add_second_stage_hash_agg_path(PlannerInfo *root, if (enable_hashagg_disk || hashentrysize * dNumGroups < work_mem * 1024L) { - Path *path; + Path *path = initial_agg_path; - path = cdbpath_create_motion_path(root, initial_agg_path, NIL, false, - group_locus); + if (needs_redistribute) + path = cdbpath_create_motion_path(root, path, NIL, false, + group_locus); path = (Path *) create_agg_path(root, output_rel, @@ -1156,7 +1246,10 @@ add_second_stage_hash_agg_path(PlannerInfo *root, ctx->havingQual, ctx->agg_final_costs, dNumGroups); - add_path(output_rel, path, root); + if (!is_partial) + add_path(output_rel, path, root); + else + add_partial_path(output_rel, path); } /* @@ -1176,9 +1269,9 @@ add_second_stage_hash_agg_path(PlannerInfo *root, hashentrysize = MAXALIGN(initial_agg_path->pathtarget->width) + MAXALIGN(SizeofMinimalTupleHeader); if (hashentrysize * ctx->dNumGroupsTotal <= work_mem * 1024L) { - Path *path; + Path *path = initial_agg_path; - path = cdbpath_create_motion_path(root, initial_agg_path, + path = cdbpath_create_motion_path(root, path, NIL, false, singleQE_locus); @@ -1193,7 +1286,10 @@ add_second_stage_hash_agg_path(PlannerInfo *root, ctx->havingQual, ctx->agg_final_costs, ctx->dNumGroupsTotal); - add_path(output_rel, path, root); + if (!is_partial) + add_path(output_rel, path, root); + else + add_partial_path(output_rel, path); } } } @@ -1907,7 +2003,8 @@ choose_grouping_locus(PlannerInfo *root, Path *path, hash_exprs, hash_opfamilies, hash_sortrefs, - getgpsegmentCount()); + getgpsegmentCount(), + path->parallel_workers); else CdbPathLocus_MakeSingleQE(&locus, getgpsegmentCount()); need_redistribute = true; diff --git a/src/backend/cdb/cdbllize.c b/src/backend/cdb/cdbllize.c index a8a5e434646..425d35d5f0a 100644 --- a/src/backend/cdb/cdbllize.c +++ b/src/backend/cdb/cdbllize.c @@ -166,9 +166,12 @@ get_partitioned_policy_from_path(PlannerInfo *root, Path *path) * Is it a Hashed distribution? * * NOTE: HashedOJ is not OK, because we cannot let the NULLs be stored - multiple segments. + on multiple segments. HashedWorkers is OK. + * GPDB_PARALLEL_FIXME: Is HashedWorkers OK? + * There is no parallel insertion yet, so query->intoPolicy cannot be CdbLocusType_HashedWorkers. */ - if (path->locus.locustype != CdbLocusType_Hashed) + if (!(path->locus.locustype == CdbLocusType_Hashed || + path->locus.locustype == CdbLocusType_HashedWorkers)) { return NULL; } @@ -338,7 +341,7 @@ cdbllize_get_final_locus(PlannerInfo *root, PathTarget *target) { CdbPathLocus locus; - CdbPathLocus_MakeReplicated(&locus, intoPolicy->numsegments); + CdbPathLocus_MakeReplicated(&locus, intoPolicy->numsegments, 0); return locus; } } @@ -509,7 +512,8 @@ cdbllize_adjust_top_path(PlannerInfo *root, Path *best_path, CdbPathLocus replicatedLocus; CdbPathLocus_MakeReplicated(&replicatedLocus, - targetPolicy->numsegments); + targetPolicy->numsegments, + 0); best_path = cdbpath_create_motion_path(root, best_path, @@ -969,7 +973,6 @@ fix_outer_query_motions_mutator(Node *node, decorate_subplans_with_motions_conte return newnode; } - /* * Add a Motion node on top of a Plan if needed, to make the result available * in 'outer_query_flow'. Subroutine of cdbllize_fix_outer_query_motions(). 
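To make the work_mem guards in add_second_stage_hash_agg_path() concrete, here is a rough worked example; the numbers (an average tuple width of 40 bytes, work_mem of 4096 kB) are illustrative only, and the 16-byte figure for MAXALIGN(SizeofMinimalTupleHeader) assumes a typical 64-bit build.

/* Illustrative arithmetic only, mirroring the guard above. */
Size	hashentrysize = MAXALIGN(40) + MAXALIGN(SizeofMinimalTupleHeader);	/* 40 + 16 = 56 bytes */
double	dNumGroupsTotal = 74898;	/* hypothetical number of groups */
long	limit = 4096 * 1024L;		/* work_mem = 4096 kB, in bytes */
bool	fits = hashentrysize * dNumGroupsTotal <= limit;	/* 4194288 <= 4194304: true */

With even one more group the estimated hash table would exceed work_mem, and the single-QE hash aggregate alternative would not be generated.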
@@ -1383,6 +1386,13 @@ typedef struct sanity_result_t int flags; } sanity_result_t; +typedef struct aware_result_t +{ + plan_tree_base_prefix base; /* Required prefix for + * plan_tree_walker/mutator */ + int nnodes; +} aware_result_t; + static bool motion_sanity_walker(Node *node, sanity_result_t *result) { @@ -1569,6 +1579,9 @@ motion_sanity_check(PlannerInfo *root, Plan *plan) static void adjust_top_path_for_parallel_retrieve_cursor(Path *path, PlanSlice *slice) { + /* GPDB_PARALLEL_FIXME: should consider parallel_workers for parallel cursor? */ + Assert(path->locus.parallel_workers == 0); + if (CdbPathLocus_IsSingleQE(path->locus) || CdbPathLocus_IsGeneral(path->locus) || CdbPathLocus_IsEntry(path->locus)) diff --git a/src/backend/cdb/cdbmutate.c b/src/backend/cdb/cdbmutate.c index 67e0c8a3c21..ff341c5b1af 100644 --- a/src/backend/cdb/cdbmutate.c +++ b/src/backend/cdb/cdbmutate.c @@ -146,6 +146,20 @@ make_broadcast_motion(Plan *lefttree) return motion; } +Motion * +make_parallel_broadcast_motion(Plan *lefttree) +{ + Motion *motion; + + motion = make_motion(NULL, lefttree, + 0, NULL, NULL, NULL, NULL); + motion->motionType = MOTIONTYPE_PARALLEL_BROADCAST; + motion->hashExprs = NIL; + motion->hashFuncs = NULL; + + return motion; +} + Plan * make_explicit_motion(PlannerInfo *root, Plan *lefttree, AttrNumber segidColIdx) { diff --git a/src/backend/cdb/cdbpath.c b/src/backend/cdb/cdbpath.c index 3d2a42b75ce..c67c6f86468 100644 --- a/src/backend/cdb/cdbpath.c +++ b/src/backend/cdb/cdbpath.c @@ -55,10 +55,11 @@ typedef struct bool ok_to_replicate; bool require_existing_order; bool has_wts; /* Does the rel have WorkTableScan? */ + bool isouter; /* Is at outer table side? */ } CdbpathMfjRel; static bool try_redistribute(PlannerInfo *root, CdbpathMfjRel *g, - CdbpathMfjRel *o, List *redistribution_clauses); + CdbpathMfjRel *o, List *redistribution_clauses, bool parallel_aware); static SplitUpdatePath *make_splitupdate_path(PlannerInfo *root, Path *subpath, Index rti); @@ -75,23 +76,36 @@ cdbpath_cost_motion(PlannerInfo *root, CdbMotionPath *motionpath) Cost motioncost; double recvrows; double sendrows; - double send_segments; - double recv_segments; + double send_segments = 1; + double recv_segments = 1; double total_rows; - if (CdbPathLocus_IsPartitioned(motionpath->path.locus)) - recv_segments = CdbPathLocus_NumSegments(motionpath->path.locus); - else - recv_segments = 1; + CdbPathLocus sublocus = subpath->locus; + CdbPathLocus motionlocus = motionpath->path.locus; - if (CdbPathLocus_IsPartitioned(subpath->locus)) - send_segments = CdbPathLocus_NumSegments(subpath->locus); - else - send_segments = 1; + int mot_parallel = motionlocus.parallel_workers; + int sub_parallel = sublocus.parallel_workers; + + if (CdbPathLocus_IsPartitioned(motionlocus)) + { + recv_segments = CdbPathLocus_NumSegments(motionlocus); + if (mot_parallel > 0) + recv_segments *= mot_parallel; + } + else if (mot_parallel > 0 && CdbPathLocus_IsReplicatedWorkers(motionlocus)) + recv_segments *= mot_parallel; + + if (CdbPathLocus_IsPartitioned(sublocus)) + { + send_segments = CdbPathLocus_NumSegments(sublocus); + if (sub_parallel > 0) + send_segments *= sub_parallel; + } + else if (sub_parallel > 0 && CdbPathLocus_IsReplicatedWorkers(sublocus)) + send_segments *= sub_parallel; /* * Estimate the total number of rows being sent. - * * The base estimate is computed by multiplying the subpath's rows with * the number of sending segments. 
But in some cases, that leads to too * large estimates, if the subpath's estimate was "clamped" to 1 row. The @@ -125,9 +139,19 @@ cdbpath_cost_motion(PlannerInfo *root, CdbMotionPath *motionpath) cost_per_row = (gp_motion_cost_per_row > 0.0) ? gp_motion_cost_per_row : 2.0 * cpu_tuple_cost; + sendrows = subpath->rows; recvrows = motionpath->path.rows; motioncost = cost_per_row * 0.5 * (sendrows + recvrows); + /* + * GPDB_PARALLEL_FIXME: + * The motion cost may be higher than sendrows + recvrows. + * Example: Broadcast Motion 3:6. + * When broadcasting to parallel workers, each worker receives all of the + * rel's rows (recvrows), but the transferred cost doubles because we + * broadcast to 6 workers. + */ + if (CdbPathLocus_IsReplicated(motionlocus) && mot_parallel > 0) + motioncost *= mot_parallel; motionpath->path.total_cost = motioncost + subpath->total_cost; motionpath->path.startup_cost = subpath->startup_cost; @@ -165,6 +189,15 @@ cdbpath_create_motion_path(PlannerInfo *root, Assert(cdbpathlocus_is_valid(locus) && cdbpathlocus_is_valid(subpath->locus)); + /* + * ISTM a ReplicatedWorkers subpath could only arise from a general join + * with broadcast, and that only happens when we're doing some updating, e.g.: + * `explain update rt3 set b = rt2.b from rt2 where rt3.b = rt2.b;` + * where rt3 and rt2 have different numsegments. + * However, we don't support parallel update yet, so it can never happen. + */ + Assert(!CdbPathLocus_IsReplicatedWorkers(subpath->locus)); + /* * Motion is to change path's locus, if target locus is the * same as the subpath's, there is no need to add motion. @@ -262,7 +295,7 @@ */ pathnode->path.parallel_aware = false; pathnode->path.parallel_safe = subpath->parallel_safe; - pathnode->path.parallel_workers = subpath->parallel_workers; + pathnode->path.parallel_workers = locus.parallel_workers; pathnode->path.pathkeys = pathkeys; pathnode->subpath = subpath; @@ -272,6 +305,7 @@ pathnode->path.total_cost = subpath->total_cost; pathnode->path.memory = subpath->memory; pathnode->path.motionHazard = subpath->motionHazard; + pathnode->path.barrierHazard = subpath->barrierHazard; /* Motion nodes are never rescannable. */ pathnode->path.rescannable = false; } if (CdbPathLocus_IsSegmentGeneral(subpath->locus) || + CdbPathLocus_IsSegmentGeneralWorkers(subpath->locus) || CdbPathLocus_IsReplicated(subpath->locus)) { /* @@ -312,7 +347,7 @@ */ pathnode->path.parallel_aware = false; pathnode->path.parallel_safe = subpath->parallel_safe; - pathnode->path.parallel_workers = subpath->parallel_workers; + pathnode->path.parallel_workers = locus.parallel_workers; pathnode->subpath = subpath; @@ -321,6 +356,7 @@ pathnode->path.total_cost = subpath->total_cost; pathnode->path.memory = subpath->memory; pathnode->path.motionHazard = subpath->motionHazard; + pathnode->path.barrierHazard = subpath->barrierHazard; /* Motion nodes are never rescannable. */ pathnode->path.rescannable = false; @@ -381,7 +417,7 @@ } /* Must be partitioned-->replicated */ - else if (!CdbPathLocus_IsReplicated(locus)) + else if (!CdbPathLocus_IsReplicated(locus) && !CdbPathLocus_IsHashedWorkers(locus) && !CdbPathLocus_IsReplicatedWorkers(locus)) goto invalid_motion_request; /* Fail if caller insists on ordered result or no motion. 
*/ @@ -398,6 +434,10 @@ cdbpath_create_motion_path(PlannerInfo *root, /* If subplan uses no tables, it can run on qDisp or a singleton qExec. */ else if (CdbPathLocus_IsGeneral(subpath->locus)) { + /* + * Parallel replicating is now only happening if both sides are not general. + */ + Assert(!CdbPathLocus_IsReplicatedWorkers(locus)); /* * No motion needed if general-->general or general-->replicated or * general-->segmentGeneral */ @@ -409,6 +449,9 @@ return subpath; } + if (CdbPathLocus_IsSegmentGeneralWorkers(subpath->locus)) + goto invalid_motion_request; + /* Must be general-->partitioned. */ if (!CdbPathLocus_IsPartitioned(locus)) goto invalid_motion_request; @@ -439,7 +482,7 @@ } /* Most motions from SegmentGeneral (replicated table) are disallowed */ - else if (CdbPathLocus_IsSegmentGeneral(subpath->locus)) + else if (CdbPathLocus_IsSegmentGeneral(subpath->locus) || CdbPathLocus_IsSegmentGeneralWorkers(subpath->locus)) { /* * The only allowed case is a SegmentGeneral to Hashed motion, @@ -505,7 +548,7 @@ * * SELECT va FROM v_sourcetable; * - * So, push down the Gather Motion if the SubqueryScan dose not + * So, push down the Gather Motion if the SubqueryScan does not * have pathkey but the SubqueryScan's subpath does. * */ @@ -559,8 +602,23 @@ * assertion failures. */ pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = subpath->parallel_safe; - pathnode->path.parallel_workers = subpath->parallel_workers; + /* + * GPDB_PARALLEL_FIXME: + * We once set parallel_safe by locus type, but almost all locus types are + * parallel safe nowadays. + * In principle, we should set parallel_safe = true if we are in a parallel join. + * TODO: Set parallel_safe to true for all locus types. + */ + pathnode->path.parallel_safe = (locus.parallel_workers > 0 || + CdbPathLocus_IsHashedWorkers(locus) || + CdbPathLocus_IsSingleQE(locus) || + CdbPathLocus_IsEntry(locus) || + CdbPathLocus_IsReplicatedWorkers(locus) || + CdbPathLocus_IsReplicated(locus) || /* CTAS replicated table */ + CdbPathLocus_IsHashed(locus)); + if (!subpath->parallel_safe) + pathnode->path.parallel_safe = false; + pathnode->path.parallel_workers = locus.parallel_workers; pathnode->subpath = subpath; pathnode->is_explicit_motion = false; @@ -570,6 +628,11 @@ /* Tell operators above us that slack may be needed for deadlock safety. */ pathnode->path.motionHazard = true; + /* + * If parallel_workers > 0, a barrier hazard exists for parallel + * hash join. + */ + pathnode->path.barrierHazard = (locus.parallel_workers > 0); pathnode->path.rescannable = false; /* @@ -613,7 +676,7 @@ cdbpath_create_explicit_motion_path(PlannerInfo *root, * assertion failures. */ pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = subpath->parallel_safe; + pathnode->path.parallel_safe = false; pathnode->path.parallel_workers = subpath->parallel_workers; pathnode->subpath = subpath; @@ -624,6 +687,7 @@ /* Tell operators above us that slack may be needed for deadlock safety. 
*/ pathnode->path.motionHazard = true; + pathnode->path.barrierHazard = false; pathnode->path.rescannable = false; return (Path *) pathnode; @@ -642,7 +706,7 @@ cdbpath_create_broadcast_motion_path(PlannerInfo *root, pathnode->path.parent = subpath->parent; /* Motion doesn't project, so use source path's pathtarget */ pathnode->path.pathtarget = subpath->pathtarget; - CdbPathLocus_MakeReplicated(&pathnode->path.locus, numsegments); + CdbPathLocus_MakeReplicated(&pathnode->path.locus, numsegments, subpath->parallel_workers); pathnode->path.rows = subpath->rows; pathnode->path.pathkeys = NIL; @@ -652,7 +716,7 @@ * assertion failures. */ pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = subpath->parallel_safe; + pathnode->path.parallel_safe = false; pathnode->path.parallel_workers = subpath->parallel_workers; pathnode->subpath = subpath; @@ -663,6 +727,7 @@ /* Tell operators above us that slack may be needed for deadlock safety. */ pathnode->path.motionHazard = true; + pathnode->path.barrierHazard = false; pathnode->path.rescannable = false; return (Path *) pathnode; @@ -694,7 +759,7 @@ make_motion_path(PlannerInfo *root, Path *subpath, * assertion failures. */ pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = subpath->parallel_safe; + pathnode->path.parallel_safe = false; pathnode->path.parallel_workers = subpath->parallel_workers; pathnode->subpath = subpath; @@ -707,6 +772,7 @@ /* Tell operators above us that slack may be needed for deadlock safety. */ pathnode->path.motionHazard = true; + pathnode->path.barrierHazard = false; pathnode->path.rescannable = false; return pathnode; @@ -729,6 +795,7 @@ typedef struct List *mergeclause_list; Path *path; CdbPathLocus locus; + CdbPathLocus otherlocus; CdbPathLocus *colocus; bool colocus_eq_locus; } CdbpathMatchPredsContext; @@ -781,7 +848,7 @@ cdbpath_eclass_constant_is_hashable(EquivalenceClass *ec, Oid hashOpFamily) static bool cdbpath_match_preds_to_distkey_tail(CdbpathMatchPredsContext *ctx, - List *list, ListCell *distkeycell) + List *list, ListCell *distkeycell, bool parallel_aware) { DistributionKey *distkey = (DistributionKey *) lfirst(distkeycell); DistributionKey *codistkey; ListCell *cell; @@ -789,8 +856,17 @@ cdbpath_match_preds_to_distkey_tail(CdbpathMatchPredsContext *ctx, ListCell *rcell; Assert(CdbPathLocus_IsHashed(ctx->locus) || + CdbPathLocus_IsHashedWorkers(ctx->locus) || CdbPathLocus_IsHashedOJ(ctx->locus)); + /* + * Try to redistribute one side to match the other. In the + * non-parallel-aware case, HashedWorkers can only work with a replica; + * we can't redistribute one side to match it. + */ + if (!parallel_aware && CdbPathLocus_IsHashedWorkers(ctx->locus)) + return false; + /*---------------- * Is there a " = " predicate? 
* @@ -866,7 +942,7 @@ cdbpath_match_preds_to_distkey_tail(CdbpathMatchPredsContext *ctx, distkeycell = lnext(list, distkeycell); if (distkeycell) { - if (!cdbpath_match_preds_to_distkey_tail(ctx, list, distkeycell)) + if (!cdbpath_match_preds_to_distkey_tail(ctx, list, distkeycell, parallel_aware)) return false; } @@ -877,7 +953,8 @@ *ctx->colocus = ctx->locus; else if (!distkeycell) CdbPathLocus_MakeHashed(ctx->colocus, list_make1(codistkey), - CdbPathLocus_NumSegments(ctx->locus)); + CdbPathLocus_NumSegments(ctx->locus), + ctx->locus.parallel_workers); else { ctx->colocus->distkey = lcons(codistkey, ctx->colocus->distkey); @@ -906,12 +983,25 @@ cdbpath_match_preds_to_distkey(PlannerInfo *root, List *mergeclause_list, Path *path, CdbPathLocus locus, + CdbPathLocus otherlocus, + bool parallel_aware, CdbPathLocus *colocus) /* OUT */ { CdbpathMatchPredsContext ctx; if (!CdbPathLocus_IsHashed(locus) && - !CdbPathLocus_IsHashedOJ(locus)) + !CdbPathLocus_IsHashedOJ(locus) && + !CdbPathLocus_IsHashedWorkers(locus)) return false; + + /* + * Don't bother to redistribute to a non-parallel locus if parallel_aware + * is true; we have already considered a non-parallel join of the same two + * paths before. + */ + if (locus.parallel_workers == 0 && parallel_aware) + return false; + + if (!parallel_aware && CdbPathLocus_IsHashedWorkers(locus)) return false; Assert(cdbpathlocus_is_valid(locus)); @@ -920,10 +1010,11 @@ ctx.mergeclause_list = mergeclause_list; ctx.path = path; ctx.locus = locus; + ctx.otherlocus = otherlocus; ctx.colocus = colocus; ctx.colocus_eq_locus = true; - return cdbpath_match_preds_to_distkey_tail(&ctx, locus.distkey, list_head(locus.distkey)); + return cdbpath_match_preds_to_distkey_tail(&ctx, locus.distkey, list_head(locus.distkey), parallel_aware); } @@ -942,7 +1033,8 @@ static bool cdbpath_match_preds_to_both_distkeys(PlannerInfo *root, List *mergeclause_list, CdbPathLocus outer_locus, - CdbPathLocus inner_locus) + CdbPathLocus inner_locus, + bool parallel_aware) { ListCell *outercell; ListCell *innercell; if (!mergeclause_list || CdbPathLocus_NumSegments(outer_locus) != CdbPathLocus_NumSegments(inner_locus) || outer_locus.distkey == NIL || inner_locus.distkey == NIL || + CdbPathLocus_NumParallelWorkers(outer_locus) != CdbPathLocus_NumParallelWorkers(inner_locus) || list_length(outer_locus.distkey) != list_length(inner_locus.distkey)) return false; Assert(CdbPathLocus_IsHashed(outer_locus) || + CdbPathLocus_IsHashedWorkers(outer_locus) || CdbPathLocus_IsHashedOJ(outer_locus)); Assert(CdbPathLocus_IsHashed(inner_locus) || + CdbPathLocus_IsHashedWorkers(inner_locus) || CdbPathLocus_IsHashedOJ(inner_locus)); + if (!parallel_aware && (CdbPathLocus_IsHashedWorkers(outer_locus) || CdbPathLocus_IsHashedWorkers(inner_locus))) + return false; + outer_distkey = outer_locus.distkey; inner_distkey = inner_locus.distkey; @@ -1040,6 +1138,8 @@ cdbpath_distkeys_from_preds(PlannerInfo *root, List *mergeclause_list, Path *a_path, int numsegments, + int parallel_workers, + bool parallel_aware, CdbPathLocus *a_locus, /* OUT */ CdbPathLocus *b_locus) /* OUT */ { @@ -1186,9 +1286,9 @@ cdbpath_distkeys_from_preds(PlannerInfo *root, if (!a_distkeys) return false; - CdbPathLocus_MakeHashed(a_locus, a_distkeys, numsegments); + CdbPathLocus_MakeHashed(a_locus, a_distkeys, numsegments, parallel_workers); if 
(b_distkeys) - CdbPathLocus_MakeHashed(b_locus, b_distkeys, numsegments); + CdbPathLocus_MakeHashed(b_locus, b_distkeys, numsegments, parallel_workers); else *b_locus = *a_locus; return true; @@ -1259,11 +1359,11 @@ cdbpath_motion_for_join(PlannerInfo *root, bool outer_require_existing_order, bool inner_require_existing_order) { - CdbpathMfjRel outer; - CdbpathMfjRel inner; - int numsegments; - bool join_quals_contain_outer_references; - ListCell *lc; + CdbpathMfjRel outer; + CdbpathMfjRel inner; + int numsegments; + bool join_quals_contain_outer_references; + ListCell *lc; *p_rowidexpr_id = 0; @@ -1279,6 +1379,10 @@ cdbpath_motion_for_join(PlannerInfo *root, Assert(cdbpathlocus_is_valid(outer.locus)); Assert(cdbpathlocus_is_valid(inner.locus)); + /* No parallel paths should get here. */ + Assert(outer.locus.parallel_workers == 0); + Assert(inner.locus.parallel_workers == 0); + /* * Does the join quals contain references to outer query? If so, we must * evaluate them in the outer query's locus. That means pulling both @@ -1331,7 +1435,7 @@ cdbpath_motion_for_join(PlannerInfo *root, */ if (outer.has_wts && inner.locus.distkey != NIL) CdbPathLocus_MakeStrewn(&inner.locus, - CdbPathLocus_NumSegments(inner.locus)); + CdbPathLocus_NumSegments(inner.locus), 0); /* * Caller can specify an ordering for each source path that is the same as @@ -1405,7 +1509,6 @@ cdbpath_motion_for_join(PlannerInfo *root, CdbPathLocus_MakeSingleQE(&outer.locus, CdbPathLocus_NumSegments(inner.locus)); outer.path->locus = outer.locus; - } else if (CdbPathLocus_IsSegmentGeneral(outer.locus)) { @@ -1562,7 +1665,7 @@ cdbpath_motion_for_join(PlannerInfo *root, * add redistribute motion, if fails, we gather other * to singleQE. */ - else if (!try_redistribute(root, general, other, redistribution_clauses)) + else if (!try_redistribute(root, general, other, redistribution_clauses, false)) { /* * FIXME: do we need test other's movable? @@ -1647,7 +1750,7 @@ cdbpath_motion_for_join(PlannerInfo *root, * FIXME: do we need to test inner's movable? */ CdbPathLocus_MakeReplicated(&inner.move_to, - CdbPathLocus_NumSegments(outer.locus)); + CdbPathLocus_NumSegments(outer.locus), 0); use_common = false; } else if ((CdbPathLocus_NumSegments(outer.locus) < @@ -1667,11 +1770,11 @@ cdbpath_motion_for_join(PlannerInfo *root, * FIXME: do we need to test outer's movable? */ CdbPathLocus_MakeReplicated(&outer.move_to, - CdbPathLocus_NumSegments(inner.locus)); + CdbPathLocus_NumSegments(inner.locus), 0); use_common = false; } } - + if (use_common) { /* @@ -1701,7 +1804,7 @@ cdbpath_motion_for_join(PlannerInfo *root, Assert(CdbPathLocus_IsBottleneck(other->locus) || CdbPathLocus_IsPartitioned(other->locus)); - + /* * For UPDATE/DELETE, replicated table can't guarantee a logic row has * same ctid or item pointer on each copy. If we broadcast matched tuples @@ -1721,12 +1824,12 @@ cdbpath_motion_for_join(PlannerInfo *root, * everywhere so that for each segment, we have to collect * all the information of other that is we should broadcast it */ - + /* * FIXME: do we need to test other's movable? 
*/ CdbPathLocus_MakeReplicated(&other->move_to, - CdbPathLocus_NumSegments(segGeneral->locus)); + CdbPathLocus_NumSegments(segGeneral->locus), 0); } else if (CdbPathLocus_IsBottleneck(other->locus)) { @@ -1746,11 +1849,11 @@ cdbpath_motion_for_join(PlannerInfo *root, * hashed, hashoj, strewn */ Assert(CdbPathLocus_IsPartitioned(other->locus)); - + if (!segGeneral->ok_to_replicate) { if (!try_redistribute(root, segGeneral, - other, redistribution_clauses)) + other, redistribution_clauses, false)) { /* * FIXME: do we need to test movable? @@ -1778,7 +1881,7 @@ cdbpath_motion_for_join(PlannerInfo *root, else { if (!try_redistribute(root, segGeneral, - other, redistribution_clauses)) + other, redistribution_clauses, false)) { numsegments = CdbPathLocus_CommonSegments(segGeneral->locus, other->locus); @@ -1800,13 +1903,14 @@ cdbpath_motion_for_join(PlannerInfo *root, */ else if (CdbPathLocus_IsBottleneck(outer.locus) || CdbPathLocus_IsBottleneck(inner.locus)) - { /* singleQE or entry db */ + { /* singleQE or entry db */ CdbpathMfjRel *single = &outer; CdbpathMfjRel *other = &inner; - bool single_immovable = (outer.require_existing_order && - !outer_pathkeys) || outer.has_wts; - bool other_immovable = inner.require_existing_order && - !inner_pathkeys; + bool single_immovable = (outer.require_existing_order && + !outer_pathkeys) || + outer.has_wts; + bool other_immovable = inner.require_existing_order && + !inner_pathkeys; /* * If each of the sources has a single-process locus, then assign both @@ -1857,7 +1961,9 @@ cdbpath_motion_for_join(PlannerInfo *root, redistribution_clauses, other->path, other->locus, - &single->move_to)) /* OUT */ + single->locus, + false, /* parallel_aware */ + &single->move_to)) /* OUT */ { AssertEquivalent(CdbPathLocus_NumSegments(other->locus), CdbPathLocus_NumSegments(single->move_to)); @@ -1868,7 +1974,7 @@ cdbpath_motion_for_join(PlannerInfo *root, (single->bytes * CdbPathLocus_NumSegments(other->locus) < single->bytes + other->bytes)) CdbPathLocus_MakeReplicated(&single->move_to, - CdbPathLocus_NumSegments(other->locus)); + CdbPathLocus_NumSegments(other->locus), 0); /* * Redistribute both rels on equijoin cols. @@ -1881,8 +1987,10 @@ cdbpath_motion_for_join(PlannerInfo *root, redistribution_clauses, single->path, CdbPathLocus_NumSegments(other->locus), - &single->move_to, /* OUT */ - &other->move_to)) /* OUT */ + 0, /* parallel_workers */ + false, /* parallel_aware */ + &single->move_to, /* OUT */ + &other->move_to)) /* OUT */ { /* ok */ } @@ -1893,27 +2001,27 @@ cdbpath_motion_for_join(PlannerInfo *root, single->bytes < other->bytes || other->has_wts)) CdbPathLocus_MakeReplicated(&single->move_to, - CdbPathLocus_NumSegments(other->locus)); + CdbPathLocus_NumSegments(other->locus), 0); /* Last resort: If possible, move all partitions of other rel to single QE. */ else if (!other_immovable) other->move_to = single->locus; else goto fail; - } /* singleQE or entry */ + } /* singleQE or entry */ /* * No motion if partitioned alike and joining on the partitioning keys. */ else if (cdbpath_match_preds_to_both_distkeys(root, redistribution_clauses, - outer.locus, inner.locus)) + outer.locus, inner.locus, false)) return cdbpathlocus_join(jointype, outer.locus, inner.locus); /* * Both sources are partitioned. Redistribute or replicate one or both. 
*/ else - { /* partitioned */ + { /* partitioned */ CdbpathMfjRel *large_rel = &outer; CdbpathMfjRel *small_rel = &inner; @@ -1932,7 +2040,9 @@ cdbpath_motion_for_join(PlannerInfo *root, redistribution_clauses, large_rel->path, large_rel->locus, - &small_rel->move_to)) /* OUT */ + small_rel->locus, + false, /* parallel_aware */ + &small_rel->move_to)) /* OUT */ { AssertEquivalent(CdbPathLocus_NumSegments(large_rel->locus), CdbPathLocus_NumSegments(small_rel->move_to)); @@ -1947,7 +2057,7 @@ cdbpath_motion_for_join(PlannerInfo *root, (small_rel->bytes * CdbPathLocus_NumSegments(large_rel->locus) < large_rel->bytes)) CdbPathLocus_MakeReplicated(&small_rel->move_to, - CdbPathLocus_NumSegments(large_rel->locus)); + CdbPathLocus_NumSegments(large_rel->locus), 0); /* * Replicate larger rel if cheaper than redistributing smaller rel. @@ -1958,7 +2068,7 @@ cdbpath_motion_for_join(PlannerInfo *root, (large_rel->bytes * CdbPathLocus_NumSegments(small_rel->locus) < small_rel->bytes)) CdbPathLocus_MakeReplicated(&large_rel->move_to, - CdbPathLocus_NumSegments(small_rel->locus)); + CdbPathLocus_NumSegments(small_rel->locus), 0); /* If joining on smaller rel's partitioning key, redistribute larger. */ else if (!large_rel->require_existing_order && @@ -1966,7 +2076,9 @@ cdbpath_motion_for_join(PlannerInfo *root, redistribution_clauses, small_rel->path, small_rel->locus, - &large_rel->move_to)) /* OUT */ + large_rel->locus, + false, /* parallel_aware */ + &large_rel->move_to)) /* OUT */ { AssertEquivalent(CdbPathLocus_NumSegments(small_rel->locus), CdbPathLocus_NumSegments(large_rel->move_to)); @@ -1978,7 +2090,7 @@ cdbpath_motion_for_join(PlannerInfo *root, (small_rel->bytes * CdbPathLocus_NumSegments(large_rel->locus) < small_rel->bytes + large_rel->bytes)) CdbPathLocus_MakeReplicated(&small_rel->move_to, - CdbPathLocus_NumSegments(large_rel->locus)); + CdbPathLocus_NumSegments(large_rel->locus), 0); /* Replicate largeer rel if cheaper than redistributing both rels. */ else if (!large_rel->require_existing_order && @@ -1986,7 +2098,7 @@ cdbpath_motion_for_join(PlannerInfo *root, (large_rel->bytes * CdbPathLocus_NumSegments(small_rel->locus) < large_rel->bytes + small_rel->bytes)) CdbPathLocus_MakeReplicated(&large_rel->move_to, - CdbPathLocus_NumSegments(small_rel->locus)); + CdbPathLocus_NumSegments(small_rel->locus), 0); /* * Redistribute both rels on equijoin cols. @@ -2004,6 +2116,8 @@ cdbpath_motion_for_join(PlannerInfo *root, large_rel->path, CdbPathLocus_CommonSegments(large_rel->locus, small_rel->locus), + 0, /* parallel_workers */ + false, /* parallel_aware */ &large_rel->move_to, &small_rel->move_to)) { @@ -2018,11 +2132,11 @@ cdbpath_motion_for_join(PlannerInfo *root, else if (!small_rel->require_existing_order && small_rel->ok_to_replicate) CdbPathLocus_MakeReplicated(&small_rel->move_to, - CdbPathLocus_NumSegments(large_rel->locus)); + CdbPathLocus_NumSegments(large_rel->locus), 0); else if (!large_rel->require_existing_order && large_rel->ok_to_replicate) CdbPathLocus_MakeReplicated(&large_rel->move_to, - CdbPathLocus_NumSegments(small_rel->locus)); + CdbPathLocus_NumSegments(small_rel->locus), 0); /* Last resort: Move both rels to a single qExec. */ else @@ -2032,7 +2146,7 @@ cdbpath_motion_for_join(PlannerInfo *root, CdbPathLocus_MakeSingleQE(&outer.move_to, numsegments); CdbPathLocus_MakeSingleQE(&inner.move_to, numsegments); } - } /* partitioned */ + } /* partitioned */ /* * Move outer. 
@@ -2044,7 +2158,7 @@ cdbpath_motion_for_join(PlannerInfo *root, outer_pathkeys, outer.require_existing_order, outer.move_to); - if (!outer.path) /* fail if outer motion not feasible */ + if (!outer.path) /* fail if outer motion not feasible */ goto fail; if (IsA(outer.path, MaterialPath) && !root->config->may_rescan) @@ -2053,7 +2167,7 @@ cdbpath_motion_for_join(PlannerInfo *root, * If we are the outer path and can never be rescanned, * we could remove the materialize path. */ - MaterialPath *mpath = (MaterialPath *) outer.path; + MaterialPath *mpath = (MaterialPath *)outer.path; outer.path = mpath->subpath; } } @@ -2068,7 +2182,7 @@ cdbpath_motion_for_join(PlannerInfo *root, inner_pathkeys, inner.require_existing_order, inner.move_to); - if (!inner.path) /* fail if inner motion not feasible */ + if (!inner.path) /* fail if inner motion not feasible */ goto fail; } @@ -2081,10 +2195,10 @@ cdbpath_motion_for_join(PlannerInfo *root, /* Tell caller where the join will be done. */ return cdbpathlocus_join(jointype, outer.path->locus, inner.path->locus); -fail: /* can't do this join */ +fail: /* can't do this join */ CdbPathLocus_MakeNull(&outer.move_to); return outer.move_to; -} /* cdbpath_motion_for_join */ +} /* cdbpath_motion_for_join */ /* * Does the path contain WorkTableScan? @@ -2146,15 +2260,19 @@ has_redistributable_clause(RestrictInfo *restrictinfo) */ static bool try_redistribute(PlannerInfo *root, CdbpathMfjRel *g, CdbpathMfjRel *o, - List *redistribution_clauses) + List *redistribution_clauses, bool parallel_aware) { bool g_immovable; bool o_immovable; Assert(CdbPathLocus_IsGeneral(g->locus) || - CdbPathLocus_IsSegmentGeneral(g->locus)); + CdbPathLocus_IsSegmentGeneral(g->locus) || + CdbPathLocus_IsSegmentGeneralWorkers(g->locus)); Assert(CdbPathLocus_IsPartitioned(o->locus)); + if (CdbPathLocus_IsHashedWorkers(o->locus)) + return false; + /* * we cannot add motion if requiring order. * has_wts can be true only for general locus @@ -2185,6 +2303,8 @@ try_redistribute(PlannerInfo *root, CdbpathMfjRel *g, CdbpathMfjRel *o, redistribution_clauses, o->path, o->locus, + g->locus, + parallel_aware, &g->move_to)) return true; else @@ -2204,6 +2324,8 @@ try_redistribute(PlannerInfo *root, CdbpathMfjRel *g, CdbpathMfjRel *o, redistribution_clauses, o->path, numsegments, + Max(o->path->parallel_workers, g->path->parallel_workers), + parallel_aware, &o->move_to, &g->move_to)) { @@ -2230,6 +2352,8 @@ try_redistribute(PlannerInfo *root, CdbpathMfjRel *g, CdbpathMfjRel *o, redistribution_clauses, o->path, numsegments, + Max(o->path->parallel_workers, g->path->parallel_workers), + parallel_aware, &o->move_to, &g->move_to)) { @@ -2263,7 +2387,7 @@ create_motion_path_for_ctas(PlannerInfo *root, GpPolicy *policy, Path *subpath) */ CdbPathLocus targetLocus; - CdbPathLocus_MakeStrewn(&targetLocus, policy->numsegments); + CdbPathLocus_MakeStrewn(&targetLocus, policy->numsegments, 0); return cdbpath_create_motion_path(root, subpath, NIL, false, targetLocus); } else @@ -2302,7 +2426,8 @@ create_motion_path_for_insert(PlannerInfo *root, GpPolicy *policy, /* * If the target table is DISTRIBUTED RANDOMLY, we can insert the * rows anywhere. So if the input path is already partitioned, let - * the insertions happen where they are. + * the insertions happen where they are, unless the GUC gp_force_random_redistribution + * tells us to force a redistribution. * * If you `explain` the query insert into tab_random select * from tab_partition * there is not Motion node in plan.
However, it is not means that the query only @@ -2311,16 +2436,16 @@ create_motion_path_for_insert(PlannerInfo *root, GpPolicy *policy, * But, we need to grant a Motion node if target locus' segnumber is different with * subpath. */ - if(targetLocus.numsegments != subpath->locus.numsegments) + if (gp_force_random_redistribution || targetLocus.numsegments != subpath->locus.numsegments) { - CdbPathLocus_MakeStrewn(&targetLocus, policy->numsegments); + CdbPathLocus_MakeStrewn(&targetLocus, policy->numsegments, 0); subpath = cdbpath_create_motion_path(root, subpath, NIL, false, targetLocus); } } else if (CdbPathLocus_IsNull(targetLocus)) { /* could not create DistributionKeys to represent the distribution keys. */ - CdbPathLocus_MakeStrewn(&targetLocus, policy->numsegments); + CdbPathLocus_MakeStrewn(&targetLocus, policy->numsegments, 0); subpath = (Path *) make_motion_path(root, subpath, targetLocus, false, policy); } @@ -2344,6 +2469,9 @@ create_motion_path_for_insert(PlannerInfo *root, GpPolicy *policy, !contain_volatile_functions((Node *)subpath->pathtarget->exprs) && !contain_volatile_functions((Node *)root->parse->havingQual)) { + /* Parallel insert is not supported yet. */ + Assert(!CdbPathLocus_IsSegmentGeneralWorkers(subpath->locus)); + /* * CdbLocusType_SegmentGeneral is only used by replicated table * right now, so if both input and target are replicated table, @@ -2417,7 +2545,7 @@ create_motion_path_for_upddel(PlannerInfo *root, Index rti, GpPolicy *policy, * * Is "strewn" correct here? Can we do better? */ - CdbPathLocus_MakeStrewn(&targetLocus, policy->numsegments); + CdbPathLocus_MakeStrewn(&targetLocus, policy->numsegments, 0); subpath = cdbpath_create_explicit_motion_path(root, subpath, targetLocus); @@ -2564,7 +2692,9 @@ create_split_update_path(PlannerInfo *root, Index rti, GpPolicy *policy, Path *s Path * turn_volatile_seggen_to_singleqe(PlannerInfo *root, Path *path, Node *node) { - if ((CdbPathLocus_IsSegmentGeneral(path->locus) || CdbPathLocus_IsGeneral(path->locus)) && + if ((CdbPathLocus_IsSegmentGeneral(path->locus) || + CdbPathLocus_IsGeneral(path->locus) || + CdbPathLocus_IsSegmentGeneralWorkers(path->locus)) && (contain_volatile_functions(node) || IsA(path, LimitPath))) { CdbPathLocus singleQE; @@ -2667,9 +2797,1102 @@ can_elide_explicit_motion(PlannerInfo *root, Index rti, Path *subpath, if (!CdbPathLocus_IsStrewn(subpath->locus)) { - CdbPathLocus resultrelation_locus = cdbpathlocus_from_policy(root, rti, policy); + CdbPathLocus resultrelation_locus = cdbpathlocus_from_policy(root, rti, policy, 0); return cdbpathlocus_equal(subpath->locus, resultrelation_locus); } return false; } + +/* + * cdbpath_motion_for_parallel_join + * Sibling of cdbpath_motion_for_join in parallel mode. + * Kept separate from the non-parallel function, as the logic of parallel join is quite different: + * 1. Treating path locus by outer/inner: which side a path sits on matters in a parallel join. + * 2. Still try a Redistribute Motion even if one side could be broadcast. In parallel mode, the cost based on rel size + * might not be better than redistributing one or both sides; let the planner decide which is better. + * 3. Never duplicate the outer_path (parallel_workers=0). That would lead to wrong results, e.g. in a parallel left join. + * Follow upstream until we have a clear answer. + * + * The locus of a path whose workers > 1 could be: + * HashedWorkers: parallel scan on a Hashed locus table. + * ReplicatedWorkers: like Broadcast, replicates data to segments but strews it across workers of the same segment.
+ * SegmentGeneralWorkers: parallel scan on a replicated table. + * Strewn(parallel_workers > 1): parallel scan on a randomly distributed table. + * Hashed(parallel_workers > 1): a special one generated by HashedWorkers with a Redistribute Motion. + * + * When do we add a new xxxWorkers locus? + * ISTM xxxWorkers means: strewn across workers of the same segment, but acting together as a xxx locus on segments, which + * could be joined with another locus as in a non-parallel plan. + * e.g. ReplicatedWorkers: all data are replicated across segments, but strewn across the workers of a segment. + * For Hashed(parallel_workers > 1) it's a little different: data is first hashed across segments, + * then hashed across the parallel workers of a segment, so Hashed(parallel_workers) can join with the same + * locus without any motion. It is not strewn across workers. + * Another special locus is Strewn(parallel_workers > 1). Shall we add a StrewnWorkers too? + * No: it is already strewn across segments, no matter how many processes scan it. + * Another reason is that adding a new locus is complex and expensive: we would have to handle every possible locus + * joined with it. + * + * parallel_aware means parallel hashjoin with a shared hash table. + * + * Incompatible loci can become compatible when parallel_aware, e.g.: + * JOIN + * / \ + * HashedWorkers ParallelHash + * \ + * ReplicatedWorkers + * Both sides are strewn across workers of the same segments, but ParallelHash collects all data from the workers' processes, + * so the outer side can find every matching row. In this example, the join locus is HashedWorkers. + * + * We don't reset a path's parallel_workers now. + * There was once an idea of resetting a path's parallel_workers to avoid + * a Motion when the inner and outer parallel_workers don't match, + * but it raises a lot of issues for which we have no clear answer. + * See https://code.hashdata.xyz/cloudberry/cbdb-postgres-merge/-/issues/43. + * + * We cannot presume the parallel_workers of the outer or inner path. + * A partial path may produce a locus with parallel_workers=0 if needed, e.g. + * GP's parallel two-stage Group Gather Agg path, which generates a + * SingleQE locus in the middle of the plan. Such a path can participate in + * a parallel plan via Motion(1:6), but it still can't be processed by multiple + * workers or be duplicated in every worker as the inner path. + * + * All locus test cases are in gp_parallel, see final join locus examples there.
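+ * + * As a rough editorial sketch (the branches below are authoritative), the strategy mirrors the + * non-parallel case: first try to join with no motion when both sides are already partitioned alike + * on the join keys, then try to redistribute one side onto the other's distribution key, then compare + * the byte volumes of replicating one side against redistributing both, and as a last resort bring + * both sides to a single QE.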
+ */ +CdbPathLocus +cdbpath_motion_for_parallel_join(PlannerInfo *root, + JoinType jointype, /* JOIN_INNER/FULL/LEFT/RIGHT/IN */ + Path **p_outer_path, /* INOUT */ + Path **p_inner_path, /* INOUT */ + int *p_rowidexpr_id, /* OUT */ + List *redistribution_clauses, /* equijoin RestrictInfo list */ + List *restrict_clauses, + List *outer_pathkeys, + List *inner_pathkeys, + bool outer_require_existing_order, + bool inner_require_existing_order, + bool parallel_aware, + bool uninterested_broadcast) +{ + CdbpathMfjRel outer; + CdbpathMfjRel inner; + int numsegments; + bool join_quals_contain_outer_references; + ListCell *lc; + + *p_rowidexpr_id = 0; + + outer.pathkeys = outer_pathkeys; + inner.pathkeys = inner_pathkeys; + outer.path = *p_outer_path; + inner.path = *p_inner_path; + outer.locus = outer.path->locus; + inner.locus = inner.path->locus; + CdbPathLocus_MakeNull(&outer.move_to); + CdbPathLocus_MakeNull(&inner.move_to); + outer.isouter = true; + inner.isouter = false; + + Assert(cdbpathlocus_is_valid(outer.locus)); + Assert(cdbpathlocus_is_valid(inner.locus)); + /* GPDB_PARALLEL_FIXME: reconsider the meaning of parallel_safe in GP parallel? */ + if (!outer.path->parallel_safe || !inner.path->parallel_safe) + goto fail; + + /* + * Do the join quals contain references to the outer query? If so, we must + * evaluate them in the outer query's locus. That means pulling both + * inputs to the outer locus, and performing the join there. + * + * XXX: If there are pseudoconstant quals, they will be executed by a + * gating Result with a One-Time Filter. In that case, the join's inputs + * wouldn't need to be brought to the outer locus. We could execute the + * join normally, and bring the result to the outer locus and put the + * gating Result above the Motion, instead. But for now, we're not smart + * like that. + */ + join_quals_contain_outer_references = false; + foreach(lc, restrict_clauses) + { + RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc); + + if (rinfo->contain_outer_query_references) + { + join_quals_contain_outer_references = true; + break; + } + } + + /* + * Locus type Replicated/ReplicatedWorkers can only be generated + * by a join operation. + * In the function cdbpathlocus_join there is a rule: + * join => any locus type + * By proof by contradiction, when the code arrives here + * it is impossible that either of the two input paths' locus + * is Replicated. So we add asserts here. + */ + Assert(!CdbPathLocus_IsReplicated(outer.locus)); + Assert(!CdbPathLocus_IsReplicated(inner.locus)); + Assert(!CdbPathLocus_IsReplicatedWorkers(outer.locus)); + Assert(!CdbPathLocus_IsReplicatedWorkers(inner.locus)); + + if (CdbPathLocus_IsReplicated(outer.locus) || + CdbPathLocus_IsReplicated(inner.locus) || + CdbPathLocus_IsReplicatedWorkers(outer.locus) || + CdbPathLocus_IsReplicatedWorkers(inner.locus)) + goto fail; + + outer.has_wts = cdbpath_contains_wts(outer.path); + inner.has_wts = cdbpath_contains_wts(inner.path); + + /* For now, the inner path should not contain a WorkTableScan */ + Assert(!inner.has_wts); + + /* + * If the outer rel contains a WorkTableScan and the inner rel is hash distributed, + * unfortunately we have to pretend that the inner rel is randomly distributed; + * otherwise we may end up redistributing the outer rel.
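+ * (Editorial note: a worktable is only visible to the process that runs its RecursiveUnion, so the + * path containing the WorkTableScan must not be moved; pretending the inner rel is Strewn steers the + * planner toward moving the inner side instead.)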
+ */ + /* GPDB_PARALLEL_FIXME: this may cause parallel CTE, not sure if it's right */ + if (outer.has_wts && inner.locus.distkey != NIL) + CdbPathLocus_MakeStrewn(&inner.locus, + CdbPathLocus_NumSegments(inner.locus), + inner.path->parallel_workers); + + /* + * Caller can specify an ordering for each source path that is the same as + * or weaker than the path's existing ordering. Caller may insist that we + * do not add motion that would lose the specified ordering property; + * otherwise the given ordering is preferred but not required. A required + * NIL ordering means no motion is allowed for that path. + */ + outer.require_existing_order = outer_require_existing_order; + inner.require_existing_order = inner_require_existing_order; + + /* + * Don't consider replicating the preserved rel of an outer join, or the + * current-query rel of a join between current query and subquery. + * + * Path that contains WorkTableScan cannot be replicated. + */ + /* ok_to_replicate means broadcast */ + outer.ok_to_replicate = !outer.has_wts; + inner.ok_to_replicate = true; + + /* + * In parallel mode, the join is executed batch by batch. + * It is hard to tell whether a NULL exists in the whole table. + */ + if (parallel_aware && jointype == JOIN_LASJ_NOTIN) + goto fail; + + switch (jointype) + { + case JOIN_INNER: + break; + case JOIN_SEMI: + case JOIN_ANTI: + case JOIN_LEFT: + case JOIN_LASJ_NOTIN: + outer.ok_to_replicate = false; + break; + case JOIN_UNIQUE_OUTER: + case JOIN_UNIQUE_INNER: + case JOIN_RIGHT: + case JOIN_FULL: + case JOIN_DEDUP_SEMI: + case JOIN_DEDUP_SEMI_REVERSE: + /* These join types are not supported in parallel yet. */ + goto fail; + default: + elog(ERROR, "unexpected join type %d", jointype); + } + + /* Get rel sizes. */ + outer.bytes = outer.path->rows * outer.path->pathtarget->width; + inner.bytes = inner.path->rows * inner.path->pathtarget->width; + + if (join_quals_contain_outer_references || + CdbPathLocus_IsOuterQuery(outer.locus) || + CdbPathLocus_IsOuterQuery(inner.locus) || + CdbPathLocus_IsGeneral(outer.locus) || + CdbPathLocus_IsGeneral(inner.locus)) + { + /* + * Not supported in parallel yet. + */ + goto fail; + } + /* SegmentGeneralWorkers join others */ + else if (CdbPathLocus_IsSegmentGeneralWorkers(outer.locus)) + { + CdbpathMfjRel *segGeneral = &outer; + CdbpathMfjRel *other = &inner; + + int outerParallel = outer.locus.parallel_workers; + int innerParallel = inner.locus.parallel_workers; + Assert(outerParallel > 1); + + if (CdbPathLocus_IsSegmentGeneralWorkers(inner.locus)) + { + Assert(innerParallel > 1); + /* We don't handle parallel when expanding segments */ + if (CdbPathLocus_NumSegments(outer.locus) != CdbPathLocus_NumSegments(inner.locus)) + goto fail; + /* + * Can't join without a shared hash table if both are SegmentGeneralWorkers. + * We don't expect a motion for that. + */ + if (!parallel_aware) + goto fail; + if ((outerParallel != innerParallel)) + goto fail; + /* + * A SegmentGeneralWorkers parallel join with SegmentGeneralWorkers, when parallel_aware, + * generates a SegmentGeneralWorkers locus. + * See ex 5_P_5_5 in gp_parallel.sql. + */ + if (outer.ok_to_replicate && inner.ok_to_replicate) + return outer.locus; + goto fail; + } + + if (CdbPathLocus_IsSegmentGeneral(inner.locus)) + { + Assert(innerParallel <= 1); + if (parallel_aware) + goto fail; + + if (CdbPathLocus_NumSegments(outer.locus) != CdbPathLocus_NumSegments(inner.locus)) + goto fail; + /* + * SegmentGeneralWorkers JOIN SegmentGeneral without a shared hash table.
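+ * (Hypothetical shape: N workers scan one replicated table while the inner side is a full, + * single-process scan of another replicated table; every worker sees the whole inner rel, + * so no motion is needed.)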
+ * And the join locus is SegmentGeneralWorkers. + * Then we can return the outer locus, as the join takes its workers from the outer locus. + * See ex 5_4_5 in gp_parallel.sql. + */ + if (outer.ok_to_replicate && inner.ok_to_replicate) + return outer.locus; + goto fail; + } + + if (CdbPathLocus_IsBottleneck(inner.locus)) + { + /* + * A Bottleneck locus can't participate in parallel with SegmentGeneralWorkers at present; this may be enabled later. + * We don't support parallel on the QD yet. If the bottleneck is on a QE, e.g. + * a SingleQE join with SegmentGeneralWorkers(workers:2), we have two ways: + * 1. Motion(1:2) the SingleQE to 2 processes on the local segments. + * There is no such motion now, and we have no clear answer on whether a SingleQE is parallel_safe (I think most + * are not, because SingleQE is always the last resort, which means things must be done in a single process of + * a segment with all the data). + * And what would the join locus of that be, SingleQE with workers = 2? It breaks the rule. + * + * 2. Motion(2:1) the SegmentGeneralWorkers to a single process: + * In GPDB, we usually do not motion a SegmentGeneralxxx locus except to: 1) bring it to a SingleQE; 2) redistribute it + * to Partitioned when XXXGeneral can't stay general, e.g. because it has volatile functions and so on. + * We follow GPDB here. + */ + goto fail; + } + + if (CdbPathLocus_IsPartitioned(inner.locus)) + { + if (CdbPathLocus_NumSegments(outer.locus) != CdbPathLocus_NumSegments(inner.locus)) + goto fail; + + if (!segGeneral->ok_to_replicate) + { + if (!try_redistribute(root, segGeneral, + other, redistribution_clauses, parallel_aware)) + { + if (parallel_aware) + goto fail; + + CdbPathLocus_MakeSingleQE(&segGeneral->move_to, + CdbPathLocus_NumSegments(segGeneral->locus)); + CdbPathLocus_MakeSingleQE(&other->move_to, + CdbPathLocus_NumSegments(other->locus)); + } + } + else + { + /* Parallel HashedOJ is not supported yet */ + if (CdbPathLocus_IsHashedOJ(other->locus)) + goto fail; + + if (parallel_aware) + { + if (innerParallel != outerParallel) + goto fail; + /* + * SegmentGeneralWorkers joins HashedWorkers, Hashed, or Strewn when parallel_aware. + * Let cdbpathlocus_parallel_join decide the join locus. + * That will generate: + * SegmentGeneralWorkers join HashedWorkers generates HashedWorkers (ex 5_P_12_12). + * SegmentGeneralWorkers join Hashed generates HashedWorkers (need to create a case). + * SegmentGeneralWorkers join Strewn generates Strewn (ex 5_P_11_11). + */ + return cdbpathlocus_parallel_join(jointype, segGeneral->locus, other->locus, true); + } + else if (innerParallel == 0 && other->path->pathtype == T_SeqScan) + { + /* + * GPDB_PARALLEL_FIXME: the inner path will be processed redundantly by every worker. + * That requires that the inner path have no descendant Motion paths. + * Requiring a SeqScan here is stricter than necessary, but it will do for now. + * + * SegmentGeneralWorkers(w=N) join inner_locus(w=0). + * That will generate: + * SegmentGeneralWorkers(w=N) join Hashed(w=0) generates HashedWorkers(w=N) (ex 5_9_12). + * SegmentGeneralWorkers(w=N) join Strewn(w=0) generates Strewn(w=N) (ex 5_11_11). + */ + return cdbpathlocus_parallel_join(jointype, segGeneral->locus, other->locus, false); + } + else + { + goto fail; + } + } + } + } + else if (CdbPathLocus_IsSegmentGeneralWorkers(inner.locus)) + { + /* + * This whole branch handles the case where at least + * one of the two loci is SegmentGeneralWorkers. + * It is placed before the SegmentGeneral branch; + * SegmentGeneral joined with SegmentGeneralWorkers is handled here.
+ */ + + CdbpathMfjRel *segGeneral; + CdbpathMfjRel *other; + int outerParallel = outer.locus.parallel_workers; + int innerParallel = inner.locus.parallel_workers; + /* Parallel insert is not supported yet */ + Assert(root->upd_del_replicated_table == 0); + + /* We don't handle parallel when expanding segments */ + if (CdbPathLocus_NumSegments(outer.locus) != CdbPathLocus_NumSegments(inner.locus)) + goto fail; + + if (CdbPathLocus_IsSegmentGeneral(outer.locus) && + CdbPathLocus_IsSegmentGeneralWorkers(inner.locus)) + { + /* + * GPDB_PARALLEL_FIXME: + * We shouldn't get here, as a Path(parallel_workers=1) won't be added to the partial_pathlist. + * If the outer locus is SegmentGeneral, its parallel_workers must be 0. + * We neither want a Motion nor want to change the parallel_workers of a path (this may be enabled + * later in some very restricted scenarios, or by using a path(parallel_workers=1) as a partial_path). + */ + goto fail; + } + else + { + /* SegmentGeneralWorkers with Partitioned or Bottleneck */ + segGeneral = &inner; + segGeneral->isouter = false; + other = &outer; + other->isouter = true; + Assert(innerParallel > 1); + + Assert(CdbPathLocus_IsBottleneck(other->locus) || + CdbPathLocus_IsPartitioned(other->locus)); + + if (CdbPathLocus_IsBottleneck(other->locus)) + { + /* + * A Bottleneck locus can't participate in parallel at present; this may be enabled later if we have a clear answer. + * We don't support parallel on the QD yet. If the bottleneck is on a QE, e.g. + * a SingleQE join with SegmentGeneralWorkers(workers:2), we have two ways: + * 1. Motion(1:2) the SingleQE to 2 processes on the local segments. + * There is no such motion now, and we have no clear answer on whether a SingleQE is parallel_safe (I think most + * are not, because SingleQE is always the last resort, which means things must be done in a single process of + * a segment with all the data). + * And what would the join locus of that be, SingleQE with workers = 2? It breaks the rule. + * + * 2. Motion(2:1) the SegmentGeneralWorkers to a single process: + * In GPDB, we usually do not motion a SegmentGeneralxxx locus except to: 1) bring it to a SingleQE; 2) redistribute it + * to Partitioned when XXXGeneral can't stay general, e.g. because it has volatile functions and so on. + * We follow GPDB here. + * Another reason: we are not sure we could benefit from a plan that parallel-scans a replicated table and gathers the data to a single process, i.e. the cost of a parallel scan plus a Motion of all rows versus a plain scan without motion. And we have no test cases for that. + */ + goto fail; + } + else + { + /* + * This branch handles the partitioned other locus: + * hashed, hashoj, strewn and hashedworkers. + */ + Assert(CdbPathLocus_IsPartitioned(other->locus)); + + if (!segGeneral->ok_to_replicate) + { + if (!try_redistribute(root, segGeneral, + other, redistribution_clauses, parallel_aware)) + { + /* + * FIXME: do we need to test movable?
+ */ + if (parallel_aware) + goto fail; + + CdbPathLocus_MakeSingleQE(&segGeneral->move_to, + CdbPathLocus_NumSegments(segGeneral->locus)); + CdbPathLocus_MakeSingleQE(&other->move_to, + CdbPathLocus_NumSegments(other->locus)); + } + } + else + { + if (parallel_aware) + { + if (innerParallel != outerParallel) + goto fail; + + if (segGeneral->isouter) + return cdbpathlocus_parallel_join(jointype, segGeneral->locus, other->locus, true); + + /* HashedWorkers, Hashed, Strewn JOIN SegmentGeneralWorkers with a shared hash table; return the other locus anyway */ + return other->locus; + } + + /* No shared hash table join */ + /* Can't join if other is on the outer side without a shared hash table */ + Assert(other->isouter); + goto fail; + } + } + } + } + else if (CdbPathLocus_IsSegmentGeneral(outer.locus) || + CdbPathLocus_IsSegmentGeneral(inner.locus)) + { + /* + * This whole branch handles the case where at least + * one of the two loci is SegmentGeneral. The logic + * is: + * - if both are SegmentGeneral: + * 1. if both loci are equal, no motion is needed; simply return + * 2. for update cases: if the resultrelation + * is SegmentGeneral, the update must execute + * on each segment of the resultrelation; if the resultrelation's + * numsegments is larger, the only solution is to broadcast + * the other + * 3. otherwise no motion is needed; change both numsegments to the common value + * - if only one of them is SegmentGeneral: + * 1. consider the update case: if the resultrelation is SegmentGeneral, + * the only solution is to broadcast the other + * 2. if other's locus is SingleQE or Entry, move SegmentGeneral + * to other's locus + * 3. the remaining possibility of other's locus is partitioned + * 3.1 if SegmentGeneral is not ok_to_replicate, try to + * add a redistribute motion; if that fails, gather each to + * a SingleQE + * 3.2 if SegmentGeneral's numsegments is larger, just return + * other's locus + * 3.3 try to add a redistribute motion; if that fails, gather each + * to a SingleQE + */ + CdbpathMfjRel *segGeneral; + CdbpathMfjRel *other; + + if (CdbPathLocus_IsSegmentGeneral(outer.locus) && + CdbPathLocus_IsSegmentGeneral(inner.locus)) + { + int outerParallel = outer.locus.parallel_workers; + int innerParallel = inner.locus.parallel_workers; + Assert(outerParallel == 0); + Assert(innerParallel == 0); + if (innerParallel > 0 || outerParallel > 0) + goto fail; + + /* + * use_common indicates whether we should + * return a segmentgeneral locus with the common + * numsegments.
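+ * (The numsegments of two replicated tables can differ, e.g. when one table was created before a + * cluster expansion and the other after it; this example is illustrative.)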
+ */ + bool use_common = true; + + /* + * Handle the case of two equal loci + */ + if (CdbPathLocus_NumSegments(outer.locus) == CdbPathLocus_NumSegments(inner.locus)) + return inner.locus; + + /* + * Now the two loci's numsegments are not equal. + * We should consider the update resultrelation: + * if updating, + * - if the resultrelation's numsegments is larger, then + * we should broadcast the other + * - otherwise, use the common numsegments + * else: + * use the common numsegments + */ + if (root->upd_del_replicated_table > 0) + { + if ((CdbPathLocus_NumSegments(outer.locus) > + CdbPathLocus_NumSegments(inner.locus)) && + bms_is_member(root->upd_del_replicated_table, + outer.path->parent->relids)) + { + /* + * the updated resultrelation is a replicated table + * and its numsegments is larger, so we should broadcast + * the other path + */ + if (!inner.ok_to_replicate) + goto fail; + + CdbPathLocus_MakeReplicated(&inner.move_to, + CdbPathLocus_NumSegments(outer.locus), + inner.path->parallel_workers); + use_common = false; + } + else if ((CdbPathLocus_NumSegments(outer.locus) < + CdbPathLocus_NumSegments(inner.locus)) && + bms_is_member(root->upd_del_replicated_table, + inner.path->parent->relids)) + { + /* + * the updated resultrelation is a replicated table + * and its numsegments is larger, so we should broadcast + * the other path + */ + if (!outer.ok_to_replicate) + goto fail; + + CdbPathLocus_MakeReplicated(&outer.move_to, + CdbPathLocus_NumSegments(inner.locus), + outer.path->parallel_workers); + use_common = false; + } + } + + if (use_common) + { + /* + * The statement is not an update of a replicated table. + * Just return the segmentgeneral with the smaller numsegments. + */ + numsegments = CdbPathLocus_CommonSegments(inner.locus, + outer.locus); + outer.locus.numsegments = numsegments; + inner.locus.numsegments = numsegments; + + return inner.locus; + } + } + else + { + if (CdbPathLocus_IsSegmentGeneral(outer.locus)) + { + Assert(!CdbPathLocus_HasMultipleWorkers(outer.locus)); + segGeneral = &outer; + segGeneral->isouter = true; + other = &inner; + other->isouter = false; + } + else + { + Assert(!CdbPathLocus_HasMultipleWorkers(inner.locus)); + segGeneral = &inner; + segGeneral->isouter = false; + other = &outer; + other->isouter = true; + } + + Assert(CdbPathLocus_IsBottleneck(other->locus) || + CdbPathLocus_IsPartitioned(other->locus)); + + /* + * For UPDATE/DELETE, a replicated table can't guarantee that a logical row has + * the same ctid or item pointer on each copy. If we broadcast matched tuples + * to all segments, the segments may update the wrong tuples or can't + * find a valid tuple according to ctid or item pointer. + * + * So for UPDATE/DELETE on a replicated table, we broadcast the other path so that + * all target tuples can be selected on all copies and then be updated + * locally.
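+ * (Hypothetical shape: UPDATE r SET ... FROM t WHERE ... with r replicated and t partitioned; + * t is broadcast so that every copy of r can match and update its own tuples by their local ctids.)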
+ */ + if (root->upd_del_replicated_table > 0 && + bms_is_member(root->upd_del_replicated_table, + segGeneral->path->parent->relids)) + { + /* + * For UPDATE on a replicated table, we have to do it + * everywhere, so for each segment we have to collect + * all the information of the other rel, i.e. we should broadcast it + */ + /* parallel insert is not supported yet */ + Assert(other->path->parallel_workers == 0); + + CdbPathLocus_MakeReplicated(&other->move_to, + CdbPathLocus_NumSegments(segGeneral->locus), + 0); + } + else if (CdbPathLocus_IsBottleneck(other->locus)) + { + if (parallel_aware) + goto fail; + Assert(other->locus.parallel_workers == 0); + + /* + * if the locus type is equal and the segment count is unequal, + * we will dispatch the one on more segments to the other + */ + numsegments = CdbPathLocus_CommonSegments(segGeneral->locus, + other->locus); + segGeneral->move_to = other->locus; + segGeneral->move_to.numsegments = numsegments; + } + else + { + /* + * This branch handles the partitioned other locus: + * hashed, hashoj, strewn and hashedworkers. + */ + Assert(CdbPathLocus_IsPartitioned(other->locus)); + + if (!segGeneral->ok_to_replicate) + { + if (!try_redistribute(root, segGeneral, + other, redistribution_clauses, parallel_aware)) + { + /* + * FIXME: do we need to test movable? + */ + if (parallel_aware) + goto fail; + + CdbPathLocus_MakeSingleQE(&segGeneral->move_to, + CdbPathLocus_NumSegments(segGeneral->locus)); + CdbPathLocus_MakeSingleQE(&other->move_to, + CdbPathLocus_NumSegments(other->locus)); + } + } + else + { + /* SegmentGeneral join with Partitioned */ + + /* Can't join with SegmentGeneral using a shared hash table */ + if (parallel_aware) + goto fail; + + if (other->locus.parallel_workers > 1) + { + if (CdbPathLocus_NumSegments(segGeneral->locus) != CdbPathLocus_NumSegments(other->locus)) + goto fail; + if (!other->isouter) + goto fail; /* the partial path must be on the outer side when parallel_aware is false */ + if (segGeneral->ok_to_replicate) + return other->locus; /* Partitioned JOIN SegmentGeneral */ + goto fail; + } + + /* + * If segGeneral is stored on all of other's segments, then no motion + * is needed. + * + * A SQL query to reach here: + * select * from d2 a join r1 b using (c1); + * where d2 is a replicated table on 2 segments, + * r1 is a randomly distributed table on 1 segment. + */ + if (CdbPathLocus_NumSegments(segGeneral->locus) >= + CdbPathLocus_NumSegments(other->locus)) + return other->locus; + else + { + if (!try_redistribute(root, segGeneral, + other, redistribution_clauses, parallel_aware)) + { + if (parallel_aware) + goto fail; + + numsegments = CdbPathLocus_CommonSegments(segGeneral->locus, + other->locus); + /* + * FIXME: do we need to test movable? + */ + CdbPathLocus_MakeSingleQE(&segGeneral->move_to, numsegments); + CdbPathLocus_MakeSingleQE(&other->move_to, numsegments); + } + } + } + } + } + } + /* + * Is either source confined to a single process? NB: Motion to a single + * process (qDisp or qExec) is the only motion in which we may use Merge + * Receive to preserve an existing ordering.
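+ * (Merge Receive merges the pre-sorted input streams from the senders rather than appending them, + * which is why only a many-to-one motion can preserve an ordering; an editorial restatement.)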
+ */ + else if (CdbPathLocus_IsBottleneck(outer.locus) || + CdbPathLocus_IsBottleneck(inner.locus)) + { /* singleQE or entry db */ + CdbpathMfjRel *single = &outer; + CdbpathMfjRel *other = &inner; + bool single_immovable = (outer.require_existing_order && + !outer_pathkeys) || outer.has_wts; + bool other_immovable = inner.require_existing_order && + !inner_pathkeys; + + /* + * If each of the sources has a single-process locus, then assign both + * sources and the join to run in the same process, without motion. + * The slice will be run on the entry db if either source requires it. + */ + if (CdbPathLocus_IsEntry(single->locus)) + { + if (CdbPathLocus_IsBottleneck(other->locus)) + return single->locus; + } + else if (CdbPathLocus_IsSingleQE(single->locus)) + { + if (CdbPathLocus_IsBottleneck(other->locus)) + { + /* + * Can join directly on one of the common segments. + */ + numsegments = CdbPathLocus_CommonSegments(outer.locus, + inner.locus); + + other->locus.numsegments = numsegments; + return other->locus; + } + } + + /* Let 'single' be the source whose locus is singleQE or entry. */ + else + { + CdbSwap(CdbpathMfjRel *, single, other); + CdbSwap(bool, single_immovable, other_immovable); + } + + Assert(CdbPathLocus_IsBottleneck(single->locus)); + Assert(CdbPathLocus_IsPartitioned(other->locus)); + + /* If the bottlenecked rel can't be moved, bring the other rel to it. */ + if (single_immovable) + { + if (other_immovable) + goto fail; + else + other->move_to = single->locus; + } + + /* Redistribute single rel if joining on other rel's partitioning key */ + else if (cdbpath_match_preds_to_distkey(root, + redistribution_clauses, + other->path, + other->locus, + single->locus, + parallel_aware, + &single->move_to)) /* OUT */ + { + AssertEquivalent(CdbPathLocus_NumSegments(other->locus), + CdbPathLocus_NumSegments(single->move_to)); + } + + /* Replicate single rel if cheaper than redistributing both rels. */ + else if (single->ok_to_replicate && + (single->bytes * CdbPathLocus_NumSegments(other->locus) < + single->bytes + other->bytes)) + CdbPathLocus_MakeReplicated(&single->move_to, + CdbPathLocus_NumSegments(other->locus), + single->path->parallel_workers); + + /* + * Redistribute both rels on equijoin cols. + * + * Redistribute both to the same segments; here we choose the + * same segments as other. + */ + else if (!other_immovable && + cdbpath_distkeys_from_preds(root, + redistribution_clauses, + single->path, + CdbPathLocus_NumSegments(other->locus), + Max(single->path->parallel_workers, other->path->parallel_workers), + parallel_aware, + &single->move_to, /* OUT */ + &other->move_to)) /* OUT */ + { + /* ok */ + } + + /* Broadcast the single rel in the cases below. */ + else if (single->ok_to_replicate && + (other_immovable || + single->bytes < other->bytes || + other->has_wts)) + CdbPathLocus_MakeReplicated(&single->move_to, + CdbPathLocus_NumSegments(other->locus), + single->path->parallel_workers); + + /* Last resort: If possible, move all partitions of other rel to single QE. */ + else if (!other_immovable) + other->move_to = single->locus; + else + goto fail; + } /* singleQE or entry */ + + /* + * No motion if partitioned alike and joining on the partitioning keys. + */ + else if (cdbpath_match_preds_to_both_distkeys(root, redistribution_clauses, + outer.locus, inner.locus, parallel_aware)) + return cdbpathlocus_parallel_join(jointype, outer.locus, inner.locus, parallel_aware); + + /* + * Both sources are partitioned. Redistribute or replicate one or both.
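+ * + * The cost heuristics below compare byte volumes to a first order: broadcasting one side ships + * roughly its bytes times the number of receiving segments, while redistributing both sides ships + * about the sum of their bytes; the smaller volume wins (a sketch of the checks that follow).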
+ */ + else + { /* partitioned */ + CdbpathMfjRel *large_rel = &outer; + CdbpathMfjRel *small_rel = &inner; + + /* Consider the locus when parallel_aware. */ + if (parallel_aware) + { + /* can't parallel join if both are Hashed; that case is handled in the non-parallel path */ + if (CdbPathLocus_IsHashed(outer.locus) && + CdbPathLocus_IsHashed(inner.locus)) + goto fail; + } + + /* Which rel is bigger? */ + /* GPDB_PARALLEL_FIXME: should we swap if parallel_aware? */ + if (large_rel->bytes < small_rel->bytes) + CdbSwap(CdbpathMfjRel *, large_rel, small_rel); + + /* Both sides are distributed on 1 segment with no parallelism, so they can join without motion. */ + if (CdbPathLocus_NumSegments(large_rel->locus) == 1 && + CdbPathLocus_NumSegments(small_rel->locus) == 1 && + CdbPathLocus_NumParallelWorkers(large_rel->locus) == 0 && + CdbPathLocus_NumParallelWorkers(small_rel->locus) == 0) + return large_rel->locus; + + /* If joining on larger rel's partitioning key, redistribute smaller. */ + if (!small_rel->require_existing_order && + cdbpath_match_preds_to_distkey(root, + redistribution_clauses, + large_rel->path, + large_rel->locus, + small_rel->locus, + parallel_aware, + &small_rel->move_to)) /* OUT */ + { + AssertEquivalent(CdbPathLocus_NumSegments(large_rel->locus), + CdbPathLocus_NumSegments(small_rel->move_to)); + } + + /* + * Replicate smaller rel if cheaper than redistributing larger rel. + * But don't replicate a rel that is to be preserved in outer join. + */ + else if (!small_rel->require_existing_order && + small_rel->ok_to_replicate && + ((!parallel_aware && (small_rel->bytes * CdbPathLocus_NumSegmentsPlusParallelWorkers(large_rel->locus) < large_rel->bytes)) || + (parallel_aware && !uninterested_broadcast && (small_rel->bytes * CdbPathLocus_NumSegments(large_rel->locus) < large_rel->bytes)))) + { + if (!parallel_aware) + CdbPathLocus_MakeReplicated(&small_rel->move_to, + CdbPathLocus_NumSegments(large_rel->locus), + large_rel->path->parallel_workers); + else + CdbPathLocus_MakeReplicatedWorkers(&small_rel->move_to, + CdbPathLocus_NumSegments(large_rel->locus), + large_rel->path->parallel_workers); + } + + /* + * Replicate larger rel if cheaper than redistributing smaller rel. + * But don't replicate a rel that is to be preserved in outer join. + */ + else if (!large_rel->require_existing_order && + large_rel->ok_to_replicate && + ((!parallel_aware && (large_rel->bytes * CdbPathLocus_NumSegmentsPlusParallelWorkers(small_rel->locus) < small_rel->bytes)) || + (parallel_aware && !uninterested_broadcast && (large_rel->bytes * CdbPathLocus_NumSegments(small_rel->locus) < small_rel->bytes)))) + { + if (!parallel_aware) + CdbPathLocus_MakeReplicated(&large_rel->move_to, + CdbPathLocus_NumSegments(small_rel->locus), + small_rel->path->parallel_workers); + else + CdbPathLocus_MakeReplicatedWorkers(&large_rel->move_to, + CdbPathLocus_NumSegments(small_rel->locus), + small_rel->path->parallel_workers); + } + + /* If joining on smaller rel's partitioning key, redistribute larger. */ + else if (!large_rel->require_existing_order && + (!(large_rel->path->parallel_workers > 0) || parallel_aware) && + cdbpath_match_preds_to_distkey(root, + redistribution_clauses, + small_rel->path, + small_rel->locus, + large_rel->locus, + parallel_aware, + &large_rel->move_to)) /* OUT */ + { + AssertEquivalent(CdbPathLocus_NumSegments(small_rel->locus), + CdbPathLocus_NumSegments(large_rel->move_to)); + } + + /* Replicate smaller rel if cheaper than redistributing both rels.
*/ + else if (!small_rel->require_existing_order && + small_rel->ok_to_replicate && + ((!parallel_aware && (small_rel->bytes * CdbPathLocus_NumSegmentsPlusParallelWorkers(large_rel->locus) < small_rel->bytes + large_rel->bytes)) || + (parallel_aware && !uninterested_broadcast && (small_rel->bytes * CdbPathLocus_NumSegments(large_rel->locus) < small_rel->bytes + large_rel->bytes)))) + { + if (!parallel_aware) + CdbPathLocus_MakeReplicated(&small_rel->move_to, + CdbPathLocus_NumSegments(large_rel->locus), + large_rel->path->parallel_workers); + else + CdbPathLocus_MakeReplicatedWorkers(&small_rel->move_to, + CdbPathLocus_NumSegments(large_rel->locus), + large_rel->path->parallel_workers); + } + + /* Replicate larger rel if cheaper than redistributing both rels. */ + else if (!large_rel->require_existing_order && + large_rel->ok_to_replicate && + ((!parallel_aware && (large_rel->bytes * CdbPathLocus_NumSegmentsPlusParallelWorkers(small_rel->locus) < small_rel->bytes + large_rel->bytes)) || + (parallel_aware && !uninterested_broadcast && (large_rel->bytes * CdbPathLocus_NumSegments(small_rel->locus) < small_rel->bytes + large_rel->bytes)))) + { + if (!parallel_aware) + CdbPathLocus_MakeReplicated(&large_rel->move_to, + CdbPathLocus_NumSegments(small_rel->locus), + small_rel->path->parallel_workers); + else + CdbPathLocus_MakeReplicatedWorkers(&large_rel->move_to, + CdbPathLocus_NumSegments(small_rel->locus), + small_rel->path->parallel_workers); + } + + /* + * Redistribute both rels on equijoin cols. + * + * The two results should all be distributed on the same segments; + * here we make them the same, using the common segments, to be safe. + * TODO: how about distributing them both to ALL segments? + */ + else if (!small_rel->require_existing_order && + !small_rel->has_wts && + !large_rel->require_existing_order && + !large_rel->has_wts && + cdbpath_distkeys_from_preds(root, + redistribution_clauses, + large_rel->path, + CdbPathLocus_CommonSegments(large_rel->locus, + small_rel->locus), + Max(large_rel->path->parallel_workers, small_rel->path->parallel_workers), + parallel_aware, + &large_rel->move_to, + &small_rel->move_to)) + { + /* ok */ + } + + /* + * No usable equijoin preds, or couldn't consider the preferred + * motion. Replicate one rel if possible. MPP TODO: Consider number of + * seg dbs per host. + */ + else if (!small_rel->require_existing_order && + small_rel->ok_to_replicate) + { + if (!parallel_aware) + CdbPathLocus_MakeReplicated(&small_rel->move_to, + CdbPathLocus_NumSegments(large_rel->locus), + large_rel->path->parallel_workers); + else + CdbPathLocus_MakeReplicatedWorkers(&small_rel->move_to, + CdbPathLocus_NumSegments(large_rel->locus), + large_rel->path->parallel_workers); + } + + else if (!large_rel->require_existing_order && + large_rel->ok_to_replicate) + { + if (!parallel_aware) + CdbPathLocus_MakeReplicated(&large_rel->move_to, + CdbPathLocus_NumSegments(small_rel->locus), + small_rel->path->parallel_workers); + else + CdbPathLocus_MakeReplicatedWorkers(&large_rel->move_to, + CdbPathLocus_NumSegments(small_rel->locus), + small_rel->path->parallel_workers); + } + + /* Last resort: Move both rels to a single qExec. */ + else + { + int numsegments = CdbPathLocus_CommonSegments(outer.locus, + inner.locus); + CdbPathLocus_MakeSingleQE(&outer.move_to, numsegments); + CdbPathLocus_MakeSingleQE(&inner.move_to, numsegments); + } + } /* partitioned */ + + /* + * Move outer.
+ */ + if (!CdbPathLocus_IsNull(outer.move_to)) + { + outer.path = cdbpath_create_motion_path(root, + outer.path, + outer_pathkeys, + outer.require_existing_order, + outer.move_to); + if (!outer.path) /* fail if outer motion not feasible */ + goto fail; + + if (IsA(outer.path, MaterialPath) && !root->config->may_rescan) + { + /* + * If we are the outer path and can never be rescanned, + * we could remove the materialize path. + */ + MaterialPath *mpath = (MaterialPath *) outer.path; + outer.path = mpath->subpath; + } + } + + /* + * Move inner. + */ + if (!CdbPathLocus_IsNull(inner.move_to)) + { + inner.path = cdbpath_create_motion_path(root, + inner.path, + inner_pathkeys, + inner.require_existing_order, + inner.move_to); + if (!inner.path) /* fail if inner motion not feasible */ + goto fail; + + if (parallel_aware) + inner.path->motionHazard = true; + } + + /* + * Ok to join. Give modified subpaths to caller. + */ + *p_outer_path = outer.path; + *p_inner_path = inner.path; + + /* Tell caller where the join will be done. */ + return cdbpathlocus_parallel_join(jointype, outer.path->locus, inner.path->locus, parallel_aware); + +fail: /* can't do this join */ + CdbPathLocus_MakeNull(&outer.move_to); + return outer.move_to; +} /* cdbpath_motion_for_parallel_join */ diff --git a/src/backend/cdb/cdbpathlocus.c b/src/backend/cdb/cdbpathlocus.c index 555af3def29..b11134b34c0 100644 --- a/src/backend/cdb/cdbpathlocus.c +++ b/src/backend/cdb/cdbpathlocus.c @@ -37,6 +37,41 @@ static List *cdb_build_distribution_keys(PlannerInfo *root, Index rti, GpPolicy *policy); +bool +cdbpath_distkey_equal(List *a_distkey, List *b_distkey) +{ + ListCell *acell; + ListCell *bcell; + ListCell *a_ec_cell; + ListCell *b_ec_cell; + + forboth(acell, a_distkey, bcell, b_distkey) + { + DistributionKey *adistkey = (DistributionKey *) lfirst(acell); + DistributionKey *bdistkey = (DistributionKey *) lfirst(bcell); + + if (adistkey->dk_opfamily != bdistkey->dk_opfamily) + return false; + + foreach(b_ec_cell, bdistkey->dk_eclasses) + { + EquivalenceClass *b_ec = (EquivalenceClass *) lfirst(b_ec_cell); + + if (!list_member_ptr(adistkey->dk_eclasses, b_ec)) + return false; + } + foreach(a_ec_cell, adistkey->dk_eclasses) + { + EquivalenceClass *a_ec = (EquivalenceClass *) lfirst(a_ec_cell); + + if (!list_member_ptr(bdistkey->dk_eclasses, a_ec)) + return false; + } + } + + return true; +} + /* * cdbpathlocus_equal * @@ -52,16 +87,26 @@ static List *cdb_build_distribution_keys(PlannerInfo *root, bool cdbpathlocus_equal(CdbPathLocus a, CdbPathLocus b) { - ListCell *acell; - ListCell *bcell; - ListCell *a_ec_cell; - ListCell *b_ec_cell; - /* Unless a and b have the same numsegments the result is always false */ if (CdbPathLocus_NumSegments(a) != CdbPathLocus_NumSegments(b)) return false; + /* Unless a and b have the same parallel_workers the result is always false */ + if (CdbPathLocus_NumParallelWorkers(a) != + CdbPathLocus_NumParallelWorkers(b)) + return false; + + /* HashedWorkers will never be equal */ + if (CdbPathLocus_IsHashedWorkers(a) || + CdbPathLocus_IsHashedWorkers(b)) + return false; + + /* SegmentGeneralWorkers will never be equal */ + if (CdbPathLocus_IsSegmentGeneralWorkers(a) || + CdbPathLocus_IsSegmentGeneralWorkers(b)) + return false; + if (CdbPathLocus_IsStrewn(a) || CdbPathLocus_IsStrewn(b)) return false; @@ -76,34 +121,8 @@ cdbpathlocus_equal(CdbPathLocus a, CdbPathLocus b) if ((CdbPathLocus_IsHashed(a) || CdbPathLocus_IsHashedOJ(a)) && (CdbPathLocus_IsHashed(b) || CdbPathLocus_IsHashedOJ(b))) - { - 
forboth(acell, a.distkey, bcell, b.distkey) - { - DistributionKey *adistkey = (DistributionKey *) lfirst(acell); - DistributionKey *bdistkey = (DistributionKey *) lfirst(bcell); - - if (adistkey->dk_opfamily != bdistkey->dk_opfamily) - return false; - - foreach(b_ec_cell, bdistkey->dk_eclasses) - { - EquivalenceClass *b_ec = (EquivalenceClass *) lfirst(b_ec_cell); - - if (!list_member_ptr(adistkey->dk_eclasses, b_ec)) - return false; - } - foreach(a_ec_cell, adistkey->dk_eclasses) - { - EquivalenceClass *a_ec = (EquivalenceClass *) lfirst(a_ec_cell); - - if (!list_member_ptr(bdistkey->dk_eclasses, a_ec)) - return false; - } - } - return true; - } + return cdbpath_distkey_equal(a.distkey, b.distkey); - Assert(false); return false; } /* cdbpathlocus_equal */ @@ -286,11 +305,11 @@ cdbpathlocus_for_insert(PlannerInfo *root, GpPolicy *policy, CdbPathLocus_MakeNull(&targetLocus); } else if (distkeys) - CdbPathLocus_MakeHashed(&targetLocus, distkeys, policy->numsegments); + CdbPathLocus_MakeHashed(&targetLocus, distkeys, policy->numsegments, 0 /* parallel_workers */); else { /* DISTRIBUTED RANDOMLY */ - CdbPathLocus_MakeStrewn(&targetLocus, policy->numsegments); + CdbPathLocus_MakeStrewn(&targetLocus, policy->numsegments, 0); } return targetLocus; @@ -302,7 +321,7 @@ cdbpathlocus_for_insert(PlannerInfo *root, GpPolicy *policy, * Returns a locus describing the distribution of a policy */ CdbPathLocus -cdbpathlocus_from_policy(struct PlannerInfo *root, Index rti, GpPolicy *policy) +cdbpathlocus_from_policy(struct PlannerInfo *root, Index rti, GpPolicy *policy, int parallel_workers) { CdbPathLocus result; @@ -322,24 +341,32 @@ cdbpathlocus_from_policy(struct PlannerInfo *root, Index rti, GpPolicy *policy) policy); if (distkeys) - CdbPathLocus_MakeHashed(&result, distkeys, policy->numsegments); + { + if (parallel_workers > 1) + CdbPathLocus_MakeHashedWorkers(&result, distkeys, policy->numsegments, parallel_workers); + else + CdbPathLocus_MakeHashed(&result, distkeys, policy->numsegments, 0 /* parallel_workers */); + } else { /* * It's possible that we fail to build a DistributionKey * representation for the distribution policy. */ - CdbPathLocus_MakeStrewn(&result, policy->numsegments); + CdbPathLocus_MakeStrewn(&result, policy->numsegments, parallel_workers); } } /* Rows are distributed on an unknown criterion (uniformly, we hope!) 
*/ else - CdbPathLocus_MakeStrewn(&result, policy->numsegments); + CdbPathLocus_MakeStrewn(&result, policy->numsegments, parallel_workers); } else if (GpPolicyIsReplicated(policy)) { - CdbPathLocus_MakeSegmentGeneral(&result, policy->numsegments); + if (parallel_workers <= 1) + CdbPathLocus_MakeSegmentGeneral(&result, policy->numsegments); + else + CdbPathLocus_MakeSegmentGeneralWorkers(&result, policy->numsegments, parallel_workers); } /* Normal catalog access */ else @@ -355,9 +382,10 @@ cdbpathlocus_from_policy(struct PlannerInfo *root, Index rti, GpPolicy *policy) */ CdbPathLocus cdbpathlocus_from_baserel(struct PlannerInfo *root, - struct RelOptInfo *rel) + struct RelOptInfo *rel, + int parallel_workers) { - return cdbpathlocus_from_policy(root, rel->relid, rel->cdbpolicy); + return cdbpathlocus_from_policy(root, rel->relid, rel->cdbpolicy, parallel_workers); } /* cdbpathlocus_from_baserel */ @@ -372,7 +400,8 @@ cdbpathlocus_from_exprs(struct PlannerInfo *root, List *hash_on_exprs, List *hash_opfamilies, List *hash_sortrefs, - int numsegments) + int numsegments, + int parallel_workers) { CdbPathLocus locus; List *distkeys = NIL; @@ -389,7 +418,7 @@ cdbpathlocus_from_exprs(struct PlannerInfo *root, distkeys = lappend(distkeys, distkey); } - CdbPathLocus_MakeHashed(&locus, distkeys, numsegments); + CdbPathLocus_MakeHashed(&locus, distkeys, numsegments, parallel_workers); return locus; } /* cdbpathlocus_from_exprs */ @@ -450,7 +479,7 @@ cdbpathlocus_from_subquery(struct PlannerInfo *root, { /* shouldn't happen, but let's try to do something sane */ Assert(false); - CdbPathLocus_MakeStrewn(&locus, numsegments); + CdbPathLocus_MakeStrewn(&locus, numsegments, subpath->locus.parallel_workers); return locus; } parentrel = root->simple_rel_array[parent_relid]; @@ -524,9 +553,9 @@ cdbpathlocus_from_subquery(struct PlannerInfo *root, } if (failed) - CdbPathLocus_MakeStrewn(&locus, numsegments); + CdbPathLocus_MakeStrewn(&locus, numsegments, subpath->locus.parallel_workers); else if (CdbPathLocus_IsHashed(subpath->locus)) - CdbPathLocus_MakeHashed(&locus, distkeys, numsegments); + CdbPathLocus_MakeHashed(&locus, distkeys, numsegments, subpath->locus.parallel_workers); else { Assert(CdbPathLocus_IsHashedOJ(subpath->locus)); @@ -563,7 +592,7 @@ cdbpathlocus_get_distkey_exprs(CdbPathLocus locus, *exprs_p = NIL; *opfamilies_p = NIL; - if (CdbPathLocus_IsHashed(locus) || CdbPathLocus_IsHashedOJ(locus)) + if (CdbPathLocus_IsHashed(locus) || CdbPathLocus_IsHashedOJ(locus) || CdbPathLocus_IsHashedWorkers(locus)) { foreach(distkeycell, locus.distkey) { @@ -617,13 +646,15 @@ cdbpathlocus_pull_above_projection(struct PlannerInfo *root, Bitmapset *relids, List *targetlist, List *newvarlist, - Index newrelid) + Index newrelid, + bool parallel_aware) { CdbPathLocus newlocus; Assert(cdbpathlocus_is_valid(locus)); - if (CdbPathLocus_IsHashed(locus) || CdbPathLocus_IsHashedOJ(locus)) + if (CdbPathLocus_IsHashed(locus) || CdbPathLocus_IsHashedOJ(locus) || + (CdbPathLocus_IsHashedWorkers(locus) && parallel_aware)) { ListCell *distkeycell; List *newdistkeys = NIL; @@ -673,7 +704,7 @@ cdbpathlocus_pull_above_projection(struct PlannerInfo *root, */ if (!new_ec) { - CdbPathLocus_MakeStrewn(&newlocus, numsegments); + CdbPathLocus_MakeStrewn(&newlocus, numsegments, locus.parallel_workers); return newlocus; } @@ -687,7 +718,12 @@ cdbpathlocus_pull_above_projection(struct PlannerInfo *root, /* Build new locus. 
*/ if (CdbPathLocus_IsHashed(locus)) - CdbPathLocus_MakeHashed(&newlocus, newdistkeys, numsegments); + CdbPathLocus_MakeHashed(&newlocus, newdistkeys, numsegments, locus.parallel_workers); + else if (CdbPathLocus_IsHashedWorkers(locus)) + { + Assert(parallel_aware); + CdbPathLocus_MakeHashedWorkers(&newlocus, newdistkeys, numsegments, locus.parallel_workers); + } else CdbPathLocus_MakeHashedOJ(&newlocus, newdistkeys, numsegments); return newlocus; @@ -696,7 +732,6 @@ cdbpathlocus_pull_above_projection(struct PlannerInfo *root, return locus; } /* cdbpathlocus_pull_above_projection */ - /* * cdbpathlocus_join * @@ -706,14 +741,23 @@ cdbpathlocus_pull_above_projection(struct PlannerInfo *root, CdbPathLocus cdbpathlocus_join(JoinType jointype, CdbPathLocus a, CdbPathLocus b) { - ListCell *acell; - ListCell *bcell; + ListCell *acell; + ListCell *bcell; CdbPathLocus resultlocus = {0}; - int numsegments; + int numsegments; Assert(cdbpathlocus_is_valid(a)); Assert(cdbpathlocus_is_valid(b)); + /* + * Parallel loci never get here. + * There shouldn't be any xxxWorkers locus or parallel_workers > 1 + * (a Hashed locus could have parallel_workers > 0). + */ + Assert(!CdbPathLocus_IsHashedWorkers(a) && !CdbPathLocus_IsSegmentGeneralWorkers(a) && !CdbPathLocus_IsReplicatedWorkers(a)); + Assert(!CdbPathLocus_IsHashedWorkers(b) && !CdbPathLocus_IsSegmentGeneralWorkers(b) && !CdbPathLocus_IsReplicatedWorkers(b)); + Assert(a.parallel_workers == 0 && b.parallel_workers == 0); + /* Do both input rels have same locus? */ if (cdbpathlocus_equal(a, b)) return a; @@ -823,12 +867,12 @@ cdbpathlocus_join(JoinType jointype, CdbPathLocus a, CdbPathLocus b) */ /* Zip the two distkey lists together to make a HashedOJ locus. */ - List *newdistkeys = NIL; + List *newdistkeys = NIL; forboth(acell, a.distkey, bcell, b.distkey) { - DistributionKey *adistkey = (DistributionKey *) lfirst(acell); - DistributionKey *bdistkey = (DistributionKey *) lfirst(bcell); + DistributionKey *adistkey = (DistributionKey *)lfirst(acell); + DistributionKey *bdistkey = (DistributionKey *)lfirst(bcell); DistributionKey *newdistkey; Assert(adistkey->dk_opfamily == bdistkey->dk_opfamily); @@ -844,7 +888,7 @@ cdbpathlocus_join(JoinType jointype, CdbPathLocus a, CdbPathLocus b) } Assert(cdbpathlocus_is_valid(resultlocus)); return resultlocus; -} /* cdbpathlocus_join */ +} /* cdbpathlocus_join */ /* * cdbpathlocus_is_hashed_on_exprs @@ -865,6 +909,9 @@ cdbpathlocus_is_hashed_on_exprs(CdbPathLocus locus, List *exprlist, Assert(cdbpathlocus_is_valid(locus)); + if (CdbPathLocus_IsHashedWorkers(locus)) + return false; + if (CdbPathLocus_IsHashed(locus) || CdbPathLocus_IsHashedOJ(locus)) { foreach(distkeycell, locus.distkey) @@ -927,6 +974,9 @@ cdbpathlocus_is_hashed_on_eclasses(CdbPathLocus locus, List *eclasses, Assert(cdbpathlocus_is_valid(locus)); + if (CdbPathLocus_IsHashedWorkers(locus)) + return false; + if (CdbPathLocus_IsHashed(locus) || CdbPathLocus_IsHashedOJ(locus)) { foreach(distkeycell, locus.distkey) @@ -995,6 +1045,9 @@ cdbpathlocus_is_hashed_on_tlist(CdbPathLocus locus, List *tlist, Assert(cdbpathlocus_is_valid(locus)); + if (CdbPathLocus_IsHashedWorkers(locus)) + return false; + if (CdbPathLocus_IsHashed(locus) || CdbPathLocus_IsHashedOJ(locus)) { foreach(distkeycell, locus.distkey) @@ -1135,10 +1188,10 @@ cdbpathlocus_is_valid(CdbPathLocus locus) if (!CdbLocusType_IsValid(locus.locustype)) goto bad; - if (!CdbPathLocus_IsHashed(locus) && !CdbPathLocus_IsHashedOJ(locus) && locus.distkey != NIL) + if (!CdbPathLocus_IsHashedWorkers(locus) &&
!CdbPathLocus_IsHashed(locus) && !CdbPathLocus_IsHashedOJ(locus) && locus.distkey != NIL) goto bad; - if (CdbPathLocus_IsHashed(locus) || CdbPathLocus_IsHashedOJ(locus)) + if (CdbPathLocus_IsHashedWorkers(locus) || CdbPathLocus_IsHashed(locus) || CdbPathLocus_IsHashedOJ(locus)) { if (locus.distkey == NIL) goto bad; @@ -1168,3 +1221,260 @@ cdbpathlocus_is_valid(CdbPathLocus locus) bad: return false; } /* cdbpathlocus_is_valid */ + +/* + * cdbpathlocus_parallel_join + * This is the parallel version of cdbpathlocus_join. + * We only handle parallel joins here (either path's parallel_workers > 1, or both). + */ +CdbPathLocus +cdbpathlocus_parallel_join(JoinType jointype, CdbPathLocus a, CdbPathLocus b, bool parallel_aware) +{ + ListCell *acell; + ListCell *bcell; + CdbPathLocus resultlocus = {0}; + int numsegments; + int outerParallel PG_USED_FOR_ASSERTS_ONLY = CdbPathLocus_NumParallelWorkers(a); + int innerParallel PG_USED_FOR_ASSERTS_ONLY = CdbPathLocus_NumParallelWorkers(b); + + Assert(cdbpathlocus_is_valid(a)); + Assert(cdbpathlocus_is_valid(b)); + + /* Do both input rels have same locus? */ + if (cdbpathlocus_equal(a, b)) + return a; + + /* + * SingleQE may have different segment counts. + */ + if (CdbPathLocus_IsSingleQE(a) && + CdbPathLocus_IsSingleQE(b)) + { + CdbPathLocus_MakeSingleQE(&resultlocus, + CdbPathLocus_CommonSegments(a, b)); + return resultlocus; + } + + if (CdbPathLocus_IsGeneral(a)) + return b; + + if (CdbPathLocus_IsGeneral(b)) + return a; + + /* + * If one rel is replicated, the result stays with the other rel, + * but we need to ensure the result is on the common segments. + */ + if (CdbPathLocus_IsReplicated(a)) + { + b.numsegments = CdbPathLocus_CommonSegments(a, b); + return b; + } + if (CdbPathLocus_IsReplicatedWorkers(a)) + { + b.numsegments = CdbPathLocus_CommonSegments(a, b); + if (CdbPathLocus_IsHashed(b)) + { + b.locustype = CdbLocusType_HashedWorkers; + } + return b; + } + if (CdbPathLocus_IsReplicated(b) || CdbPathLocus_IsReplicatedWorkers(b)) + { + a.numsegments = CdbPathLocus_CommonSegments(a, b); + return a; + } + + /* + * If one rel is segmentgeneral, the result stays with the other rel, + * but we need to ensure the result is on the common segments. + * + * NB: the checks for SegmentGeneral and Replicated are quite similar, + * but we have to put the SegmentGeneral check below. When one side + * is SegmentGeneral and the other is Replicated, only with this order + * can we be sure that this function never returns a locus of + * Replicated. + * Updating a replicated table joined with a partitioned-locus table + * will reach here. + */ + if (CdbPathLocus_IsSegmentGeneral(a)) + { + Assert(outerParallel == 0); + Assert(innerParallel == 0); + b.numsegments = CdbPathLocus_CommonSegments(a, b); + return b; + } + + /* + * If the inner side is SegmentGeneral, return the other locus anyway.
+ * parallel join + * joinlocus = a JOIN b, joinlocus could be: + * Hashed = Hashed JOIN SegmentGeneral + * HashedWorkers = HashedWorkers JOIN SegmentGeneral + * SegmentGeneralWorkers = SegmentGeneralWorkers JOIN SegmentGeneral + * + * non-parallel join + * OuterLocus = OuterLocus JOIN SegmentGeneral + */ + if (CdbPathLocus_IsSegmentGeneral(b)) + { + Assert(innerParallel == 0); + AssertImply(outerParallel > 1, b.numsegments == a.numsegments); + a.numsegments = CdbPathLocus_CommonSegments(a, b); + return a; + } + + if (CdbPathLocus_IsSegmentGeneralWorkers(a) || CdbPathLocus_IsSegmentGeneralWorkers(b)) + { + /* GPDB parallel join */ + Assert(a.numsegments == b.numsegments); + if (parallel_aware) + { + + Assert(outerParallel == innerParallel); + Assert(outerParallel > 1); + if (CdbPathLocus_IsSegmentGeneralWorkers(a)) + { + /* + * NB: GPDB parallel could generate a HashedWorkers locus beyond base rels. + * SegmentGeneralWorkers Parallel JOIN Hashed(parallel_workers>1) -> joinlocus: HashedWorkers. + * Let the final join's parallel_workers follow the outer side; this is not a parallel_workers change of a path! + */ + if (CdbPathLocus_IsHashed(b)) + { + b.parallel_workers = a.parallel_workers; + b.locustype = CdbLocusType_HashedWorkers; + } + return b; + } + + /* If SegmentGeneralWorkers is the inner side, return the outer side locus */ + if (CdbPathLocus_IsSegmentGeneralWorkers(b)) + return a; + } + else + { + /* + * SegmentGeneralWorkers Parallel Join without a shared hash table. + * The valid join forms should be like: + * SegmentGeneralWorkers Join Hashed(parallel_workers=0). + * SegmentGeneralWorkers Join Strewn(parallel_workers=0). + * + * Couldn't get here if the join is: + * SegmentGeneralWorkers Join Bottleneck. + * SegmentGeneralWorkers Join SegmentGeneral. + * Necessary checks should be done in cdbpath_motion_for_join, + * where such joins are rejected or returned. See more details there. + */ + Assert(outerParallel > 1); + Assert(innerParallel == 0); + if (!CdbPathLocus_IsSegmentGeneralWorkers(a)) + elog(ERROR, "could not construct join locus if SegmentGeneralWorkers is on the inner side without a shared hash table"); + + if (CdbPathLocus_IsHashed(b)) + { + /* + * NB: GPDB parallel could generate a HashedWorkers locus beyond base rels. + * SegmentGeneralWorkers JOIN Hashed(parallel_workers=0) -> joinlocus: HashedWorkers. + * This is a hacky way to make a HashedWorkers locus! + */ + b.parallel_workers = a.parallel_workers; + b.locustype = CdbLocusType_HashedWorkers; + return b; + } + + if (CdbPathLocus_IsStrewn(b)) + { + /* SegmentGeneralWorkers JOIN Strewn(parallel_workers=0) -> joinlocus: Strewn(parallel_workers > 1). */ + b.parallel_workers = a.parallel_workers; + return b; + } + return a; + } + } + + /* + * Both sides must be Hashed (or HashedOJ), then. And the distribution + * keys should be compatible; otherwise the caller should not be building + * a join directly between these two rels (a Motion would be needed).
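+ * For example, a Hashed path keyed on a.id joined with a Hashed path + * keyed on b.id via a.id = b.id is compatible here; joining on some other + * column would have required the caller to add a Redistribute Motion + * first. (Illustrative comment; the column names are hypothetical.)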
+ */ + if (!(CdbPathLocus_IsHashed(a) || CdbPathLocus_IsHashedOJ(a) || CdbPathLocus_IsHashedWorkers(a))) + elog(ERROR, "could not construct join with non-hashed path"); + if (!(CdbPathLocus_IsHashed(b) || CdbPathLocus_IsHashedOJ(b) || CdbPathLocus_IsHashedWorkers(b))) + elog(ERROR, "could not construct join with non-hashed path"); + if ((!CdbPathLocus_IsReplicatedWorkers(a) && !CdbPathLocus_IsReplicatedWorkers(b)) && + (a.distkey == NIL || list_length(a.distkey) != list_length(b.distkey))) + elog(ERROR, "could not construct hashed join locus with incompatible distribution keys"); + if (CdbPathLocus_NumSegments(a) != CdbPathLocus_NumSegments(b)) + elog(ERROR, "could not construct hashed join locus with different number of segments"); + + if (CdbPathLocus_NumParallelWorkers(a) != CdbPathLocus_NumParallelWorkers(b)) + elog(ERROR, "could not construct hashed join locus with different number of parallel workers"); + + /* + * After refactoring the redistribution motion, we allow CdbLocusType_HashedWorkers and CdbLocusType_Hashed parallel joins. + * If the inner is HashedWorkers and the outer is Hashed, the join locus will be Hashed. + * If the outer is HashedWorkers and the inner is Hashed, the join locus will be HashedWorkers. + * So we should just return the outer locus anyway. + */ + if (parallel_aware) + return a; + + numsegments = CdbPathLocus_NumSegments(a); + + /* + * For a LEFT/RIGHT OUTER JOIN, we can use the key of the outer, non-nullable + * side as is. There should not be any more joins with the nullable side + * above this join rel, so the inner side's keys are not interesting above + * this. + */ + if (jointype == JOIN_LEFT || + jointype == JOIN_LASJ_NOTIN || + jointype == JOIN_ANTI) + { + resultlocus = a; + } + else if (jointype == JOIN_RIGHT) + { + resultlocus = b; + } + else + { + /* + * Not a LEFT/RIGHT JOIN. We don't usually get here with INNER JOINs + * either, because if you have an INNER JOIN on an equality predicate, + * they should form an EquivalenceClass, so that the distribution keys + * on both sides of the join refer to the same EquivalenceClass, and + * we exit already at the top of this function, at the + * "if(cdbpathlocus_equal(a, b)" test. The usual case that we get here + * is a FULL JOIN. + * + * I'm not sure what non-FULL corner cases there are that lead here. + * But it's safe to create a HashedOJ locus for them, anyway, because + * the promise of a HashedOJ is weaker than Hashed. + */ + + /* Zip the two distkey lists together to make a HashedOJ locus.
*/ + List *newdistkeys = NIL; + + forboth(acell, a.distkey, bcell, b.distkey) + { + DistributionKey *adistkey = (DistributionKey *) lfirst(acell); + DistributionKey *bdistkey = (DistributionKey *) lfirst(bcell); + DistributionKey *newdistkey; + + Assert(adistkey->dk_opfamily == bdistkey->dk_opfamily); + + newdistkey = makeNode(DistributionKey); + newdistkey->dk_eclasses = list_union_ptr(adistkey->dk_eclasses, + bdistkey->dk_eclasses); + newdistkey->dk_opfamily = adistkey->dk_opfamily; + + newdistkeys = lappend(newdistkeys, newdistkey); + } + + CdbPathLocus_MakeHashedOJ(&resultlocus, newdistkeys, numsegments); + } + Assert(cdbpathlocus_is_valid(resultlocus)); + return resultlocus; +} /* cdbpathlocus_parallel_join */ diff --git a/src/backend/cdb/cdbpathtoplan.c b/src/backend/cdb/cdbpathtoplan.c index 57d96e3493e..b5e64b7415a 100644 --- a/src/backend/cdb/cdbpathtoplan.c +++ b/src/backend/cdb/cdbpathtoplan.c @@ -53,11 +53,18 @@ cdbpathtoplan_create_flow(PlannerInfo *root, flow = makeFlow(FLOW_SINGLETON, locus.numsegments); flow->segindex = 0; } + else if (CdbPathLocus_IsSegmentGeneralWorkers(locus)) + { + flow = makeFlow(FLOW_SINGLETON, locus.numsegments); + flow->segindex = 0; + } else if (CdbPathLocus_IsReplicated(locus)) { + /* GPDB_PARALLEL_FIXME: What if ReplicatedWorkers? */ flow = makeFlow(FLOW_REPLICATED, locus.numsegments); } else if (CdbPathLocus_IsHashed(locus) || + CdbPathLocus_IsHashedWorkers(locus) || CdbPathLocus_IsHashedOJ(locus)) { flow = makeFlow(FLOW_PARTITIONED, locus.numsegments); diff --git a/src/backend/cdb/cdbsetop.c b/src/backend/cdb/cdbsetop.c index 9df24f1e43e..8ee56a1bbb9 100644 --- a/src/backend/cdb/cdbsetop.c +++ b/src/backend/cdb/cdbsetop.c @@ -56,6 +56,7 @@ choose_setop_type(List *pathlist) switch (subpath->locus.locustype) { case CdbLocusType_Hashed: + case CdbLocusType_HashedWorkers: case CdbLocusType_HashedOJ: case CdbLocusType_Strewn: ok_general = false; @@ -77,12 +78,19 @@ choose_setop_type(List *pathlist) ok_general = false; break; + case CdbLocusType_SegmentGeneralWorkers: + ok_general = false; + break; + case CdbLocusType_General: break; case CdbLocusType_Replicated: break; + case CdbLocusType_ReplicatedWorkers: + break; + case CdbLocusType_Null: elog(ERROR, "unexpected Null locus in set operation branch"); break; @@ -133,6 +141,8 @@ adjust_setop_arguments(PlannerInfo *root, List *pathlist, List *tlist_list, GpSe case CdbLocusType_SingleQE: case CdbLocusType_General: case CdbLocusType_SegmentGeneral: + case CdbLocusType_SegmentGeneralWorkers: + case CdbLocusType_HashedWorkers: /* * The setop itself will run on an N-gang, so we need * to arrange for the singleton input to be separately @@ -144,6 +154,7 @@ adjust_setop_arguments(PlannerInfo *root, List *pathlist, List *tlist_list, GpSe case CdbLocusType_Null: case CdbLocusType_Entry: case CdbLocusType_Replicated: + case CdbLocusType_ReplicatedWorkers: case CdbLocusType_OuterQuery: case CdbLocusType_End: ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), @@ -156,6 +167,7 @@ adjust_setop_arguments(PlannerInfo *root, List *pathlist, List *tlist_list, GpSe switch (subpath->locus.locustype) { case CdbLocusType_Hashed: + case CdbLocusType_HashedWorkers: case CdbLocusType_HashedOJ: case CdbLocusType_Strewn: CdbPathLocus_MakeEntry(&locus); @@ -165,6 +177,7 @@ adjust_setop_arguments(PlannerInfo *root, List *pathlist, List *tlist_list, GpSe case CdbLocusType_SingleQE: case CdbLocusType_SegmentGeneral: + case CdbLocusType_SegmentGeneralWorkers: /* * The input was focused on a single QE, but we need it in the QD. 
* It's a bit silly to add a Motion to just move the whole result from @@ -184,6 +197,7 @@ adjust_setop_arguments(PlannerInfo *root, List *pathlist, List *tlist_list, GpSe case CdbLocusType_Null: case CdbLocusType_Replicated: + case CdbLocusType_ReplicatedWorkers: case CdbLocusType_End: ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("unexpected argument locus to set operation"))); @@ -196,6 +210,7 @@ adjust_setop_arguments(PlannerInfo *root, List *pathlist, List *tlist_list, GpSe switch (subpath->locus.locustype) { case CdbLocusType_Hashed: + case CdbLocusType_HashedWorkers: case CdbLocusType_HashedOJ: case CdbLocusType_Strewn: /* Gather to QE. No need to keep ordering. */ @@ -212,6 +227,7 @@ adjust_setop_arguments(PlannerInfo *root, List *pathlist, List *tlist_list, GpSe break; case CdbLocusType_SegmentGeneral: + case CdbLocusType_SegmentGeneralWorkers: /* Gather to QE. No need to keep ordering. */ CdbPathLocus_MakeSingleQE(&locus, getgpsegmentCount()); adjusted_path = cdbpath_create_motion_path(root, subpath, NULL, false, @@ -221,6 +237,7 @@ adjust_setop_arguments(PlannerInfo *root, List *pathlist, List *tlist_list, GpSe case CdbLocusType_Entry: case CdbLocusType_Null: case CdbLocusType_Replicated: + case CdbLocusType_ReplicatedWorkers: case CdbLocusType_End: ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("unexpected argument locus to set operation"))); @@ -291,7 +308,8 @@ make_motion_hash_all_targets(PlannerInfo *root, Path *subpath, List *tlist) hashexprs, hashopfamilies, hashsortrefs, - getgpsegmentCount()); + getgpsegmentCount(), + subpath->parallel_workers); } else { @@ -322,7 +340,7 @@ mark_append_locus(Path *path, GpSetOpType optype) CdbPathLocus_MakeGeneral(&path->locus); break; case PSETOP_PARALLEL_PARTITIONED: - CdbPathLocus_MakeStrewn(&path->locus, getgpsegmentCount()); + CdbPathLocus_MakeStrewn(&path->locus, getgpsegmentCount(), path->parallel_workers); break; case PSETOP_SEQUENTIAL_QD: CdbPathLocus_MakeEntry(&path->locus); diff --git a/src/backend/cdb/cdbvarblock.c b/src/backend/cdb/cdbvarblock.c index 7524eca972a..ea0020b51cb 100644 --- a/src/backend/cdb/cdbvarblock.c +++ b/src/backend/cdb/cdbvarblock.c @@ -18,7 +18,9 @@ #include "postgres.h" +#include "access/xlog.h" #include "cdb/cdbvarblock.h" +#include "crypto/bufenc.h" static VarBlockByteLen VarBlockGetItemLen( VarBlockReader *varBlockReader, @@ -49,11 +51,12 @@ static VarBlockByteOffset VarBlockGetOffset( */ void VarBlockMakerInit( - VarBlockMaker *varBlockMaker, + VarBlockMaker *varBlockMaker, uint8 *buffer, VarBlockByteLen maxBufferLen, uint8 *tempScratchSpace, - int tempScratchSpaceLen) + int tempScratchSpaceLen, + AppendOnlyStorageWrite *storageWrite) { Assert(varBlockMaker != NULL); Assert(buffer != NULL); @@ -263,7 +266,8 @@ VarBlockMakerItemCount( */ VarBlockByteLen VarBlockMakerFinish( - VarBlockMaker *varBlockMaker) + VarBlockMaker *varBlockMaker, + AppendOnlyStorageWrite *storageWrite) { uint8 *buffer; int itemCount; @@ -348,6 +352,15 @@ VarBlockMakerFinish( /* } */ /* #endif */ + /* for a single-row block, we don't encrypt in the var block.
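+ * (A single-item block is instead encrypted as a whole when it is + * collapsed; see the EncryptAOBLock call in VarBlockCollapseToSingleItem + * below.)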
*/ + if (VarBlockMakerItemCount(varBlockMaker) != 1 && FileEncryptionEnabled) + { + int encryptDataOffset = VARBLOCK_HEADER_LEN; + EncryptAOBLock(buffer + encryptDataOffset, + bufferLen - encryptDataOffset, + &storageWrite->relFileNode.node); + } + return bufferLen; } @@ -632,7 +645,9 @@ void VarBlockReaderInit( VarBlockReader *varBlockReader, uint8 *buffer, - VarBlockByteLen bufferLen) + VarBlockByteLen bufferLen, + bool needDecrypt, + RelFileNode *file_node) { VarBlockHeader *header; VarBlockByteLen itemLenSum; @@ -658,6 +673,15 @@ VarBlockReaderInit( */ offsetToOffsetArray = VARBLOCK_HEADER_LEN + ((itemLenSum + 1) / 2) * 2; + + if (FileEncryptionEnabled && needDecrypt) + { + int encryptDataOffset = VARBLOCK_HEADER_LEN; + DecryptAOBlock(buffer + encryptDataOffset, + bufferLen - encryptDataOffset, + file_node); + } + if (VarBlockGet_offsetsAreSmall(header)) { divisor = 2; @@ -829,6 +853,7 @@ VarBlockReaderGetItemPtr( VarBlockByteLen VarBlockCollapseToSingleItem( + AppendOnlyStorageWrite *storageWrite, uint8 *target, uint8 *source, int32 sourceLen) @@ -840,7 +865,9 @@ VarBlockCollapseToSingleItem( VarBlockReaderInit( &varBlockReader, source, - sourceLen); + sourceLen, + false, + &storageWrite->relFileNode.node); Assert(VarBlockReaderItemCount(&varBlockReader) == 1); @@ -861,5 +888,10 @@ VarBlockCollapseToSingleItem( itemPtr, itemLen); + if (FileEncryptionEnabled) + EncryptAOBLock(target, + itemLen, + &storageWrite->relFileNode.node); + return itemLen; } diff --git a/src/backend/cdb/dispatcher/cdbdisp.c b/src/backend/cdb/dispatcher/cdbdisp.c index c0da285a8b5..8254869415e 100644 --- a/src/backend/cdb/dispatcher/cdbdisp.c +++ b/src/backend/cdb/dispatcher/cdbdisp.c @@ -620,9 +620,36 @@ segmentsListToString(const char *prefix, List *segments) return string.data; } +/* Filter duplicated segments due to parallel plans. */ +static List* +filterParallelSegments(List *segments) +{ + int tmp = -2; + ListCell *l; + foreach(l, segments) + { + int segID = lfirst_int(l); + if (tmp == segID) + segments = foreach_delete_current(segments, l); + tmp = segID; + } + return segments; +} + char* segmentsToContentStr(List *segments) { + /* + * GPDB parallel: + * We may direct dispatch to the same segment with parallel workers. + * That would be like: + * INFO: (slice 1) Dispatch command to PARTIAL contents: 2 1 + * INFO: (slice 1) Dispatch command to ALL contents: 2 2 1 1 + * These are confusing, as we are neither PARTIAL nor ALL contents. + * Filter duplicated parallel workers info for regression tests.
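+ * e.g. a dispatched segment list of (2 2 1 1) collapses to (2 1).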
+ */ + + segments = filterParallelSegments(segments); int size = list_length(segments); if (size == 0) diff --git a/src/backend/cdb/dispatcher/cdbgang.c b/src/backend/cdb/dispatcher/cdbgang.c index a436fc09d61..f2349ee58ef 100644 --- a/src/backend/cdb/dispatcher/cdbgang.c +++ b/src/backend/cdb/dispatcher/cdbgang.c @@ -612,7 +612,7 @@ setupCdbProcessList(ExecSlice *slice) Assert(gang->type == GANGTYPE_PRIMARY_WRITER || gang->type == GANGTYPE_PRIMARY_READER || (gang->type == GANGTYPE_ENTRYDB_READER && gang->size == 1) || - (gang->type == GANGTYPE_SINGLETON_READER && gang->size == 1)); + (/* parallel scan replica table */gang->type == GANGTYPE_SINGLETON_READER)); for (i = 0; i < gang->size; i++) diff --git a/src/backend/cdb/endpoint/cdbendpoint.c b/src/backend/cdb/endpoint/cdbendpoint.c index 6da7198527b..6dc24a72650 100644 --- a/src/backend/cdb/endpoint/cdbendpoint.c +++ b/src/backend/cdb/endpoint/cdbendpoint.c @@ -227,6 +227,7 @@ GetParallelCursorEndpointPosition(PlannedStmt *plan) { if (plan->planTree->flow->flotype == FLOW_SINGLETON) { + /* GPDB_PARALLEL_FIXME: CdbLocusType_SegmentGeneralWorkers */ if (plan->planTree->flow->locustype == CdbLocusType_SegmentGeneral) return ENDPOINT_ON_SINGLE_QE; else diff --git a/src/backend/cdb/motion/cdbmotion.c b/src/backend/cdb/motion/cdbmotion.c index 3b712995a6d..27105b68afa 100644 --- a/src/backend/cdb/motion/cdbmotion.c +++ b/src/backend/cdb/motion/cdbmotion.c @@ -17,6 +17,7 @@ #include "postgres.h" #include "access/htup.h" +#include "access/session.h" #include "libpq-fe.h" #include "libpq-int.h" #include "cdb/cdbconn.h" @@ -1249,9 +1250,14 @@ statRecvTuple(MotionNodeEntry *pMNEntry, ChunkSorterEntry *pCSEntry) static bool ShouldSendRecordCache(MotionConn *conn, SerTupInfo *pSerInfo) { + int32 typmod; + + typmod = CurrentSession->shared_typmod_registry == NULL + ? NextRecordTypmod : GetSharedNextRecordTypmod(CurrentSession->shared_typmod_registry); + return pSerInfo->has_record_types && - NextRecordTypmod > 0 && - NextRecordTypmod > conn->sent_record_typmod; + typmod > 0 && + typmod > conn->sent_record_typmod; } /* @@ -1260,5 +1266,12 @@ ShouldSendRecordCache(MotionConn *conn, SerTupInfo *pSerInfo) static void UpdateSentRecordCache(MotionConn *conn) { - conn->sent_record_typmod = NextRecordTypmod; + if (CurrentSession->shared_typmod_registry != NULL) + { + conn->sent_record_typmod = GetSharedNextRecordTypmod(CurrentSession->shared_typmod_registry); + } + else + { + conn->sent_record_typmod = NextRecordTypmod; + } } diff --git a/src/backend/cdb/motion/tupleremap.c b/src/backend/cdb/motion/tupleremap.c index 7f668d43f8f..168e2ee1b08 100644 --- a/src/backend/cdb/motion/tupleremap.c +++ b/src/backend/cdb/motion/tupleremap.c @@ -563,7 +563,7 @@ TRRemapRecord(TupleRemapper *remapper, RecordRemapInfo *remapinfo, remapinfo->rectypmod = typmod; /* Release reference count acquired by lookup_rowtype_tupdesc. */ - DecrTupleDescRefCount(tupledesc); + ReleaseTupleDesc(tupledesc); } /* If transient record, replace remote typmod with local typmod. */ diff --git a/src/backend/cdb/test/cdbbufferedread_test.c b/src/backend/cdb/test/cdbbufferedread_test.c index f518d854ac6..75493ed3622 100644 --- a/src/backend/cdb/test/cdbbufferedread_test.c +++ b/src/backend/cdb/test/cdbbufferedread_test.c @@ -16,12 +16,13 @@ test__BufferedReadInit__IsConsistent(void **state) char *relname = "test"; int32 maxBufferLen = 128; int32 maxLargeReadLen = 128; + RelFileNode file_node = {0}; memset(bufferedRead, 0 , sizeof(BufferedRead)); /* * Call the function so as to set the above values. 
*/ - BufferedReadInit(bufferedRead, memory, memoryLen, maxBufferLen, maxLargeReadLen, relname); + BufferedReadInit(bufferedRead, memory, memoryLen, maxBufferLen, maxLargeReadLen, relname, &file_node); /* * Check for consistency */ @@ -45,12 +46,13 @@ test__BufferedReadUseBeforeBuffer__IsNextReadLenZero(void **state) int32 maxLargeReadLen = 128; int32 nextBufferLen; int32 maxReadAheadLen = 64; - + RelFileNode file_node = {0}; + memset(bufferedRead, 0 , sizeof(BufferedRead)); /* * Initialize the buffer */ - BufferedReadInit(bufferedRead, memory, memoryLen, maxBufferLen, maxLargeReadLen, relname); + BufferedReadInit(bufferedRead, memory, memoryLen, maxBufferLen, maxLargeReadLen, relname, &file_node); /* * filling up the bufferedRead struct */ diff --git a/src/backend/commands/Makefile b/src/backend/commands/Makefile index f4f3c4f2e56..613886dce77 100644 --- a/src/backend/commands/Makefile +++ b/src/backend/commands/Makefile @@ -66,6 +66,6 @@ OBJS = \ view.o OBJS += analyzefuncs.o analyzeutils.o extprotocolcmds.o exttablecmds.o queue.o -OBJS += resgroupcmds.o tablecmds_gp.o vacuum_ao.o +OBJS += resgroupcmds.o tablecmds_gp.o vacuum_ao.o taskcmds.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/commands/alter.c b/src/backend/commands/alter.c index 990021be98b..dad0bcf9fef 100644 --- a/src/backend/commands/alter.c +++ b/src/backend/commands/alter.c @@ -725,6 +725,7 @@ AlterObjectNamespace_oid(Oid classId, Oid objid, Oid nspOid, case OCLASS_SUBSCRIPTION: case OCLASS_TRANSFORM: case OCLASS_EXTPROTOCOL: + case OCLASS_TASK: /* ignore object types that don't have schema-qualified names */ break; diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 2415ee6432d..326a2aed6a0 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -68,6 +68,7 @@ #include "access/detoast.h" #include "access/genam.h" +#include "access/heapam.h" #include "access/multixact.h" #include "access/relation.h" #include "access/sysattr.h" @@ -111,12 +112,14 @@ #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/pg_rusage.h" +#include "utils/rel.h" #include "utils/sampling.h" #include "utils/sortsupport.h" #include "utils/spccache.h" #include "utils/syscache.h" #include "utils/timestamp.h" +#include "access/appendonlywriter.h" #include "catalog/heap.h" #include "catalog/pg_am.h" #include "cdb/cdbappendonlyam.h" @@ -195,6 +198,8 @@ static void analyze_rel_internal(Oid relid, RangeVar *relation, gp_acquire_sample_rows_context *ctx); static void acquire_hll_by_query(Relation onerel, int nattrs, VacAttrStats **attrstats, int elevel); +static int16 AcquireCountOfSegmentFile(Relation onerel); + /* * analyze_rel() -- analyze one relation * @@ -1020,6 +1025,36 @@ do_analyze_rel(Relation onerel, VacuumParams *params, in_outer_xact, false /* isVacuum */); + /* Update pg_appendonly for ao tables */ + if (RelationIsAppendOptimized(onerel)) + { + Relation aorel; + Oid aorelid = RelationGetRelid(onerel); + HeapTuple aotup; + Form_pg_appendonly aoform; + int16 ao_segfile_count = 0; + + aotup = SearchSysCache1(AORELID, ObjectIdGetDatum(aorelid)); + if (!HeapTupleIsValid(aotup)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("appendonly table relid %u does not exist in pg_appendonly", aorelid))); + + aoform = (Form_pg_appendonly) GETSTRUCT(aotup); + if (aoform->segfilecount < MAX_AOREL_CONCURRENCY) + { + ao_segfile_count = AcquireCountOfSegmentFile(onerel); + if (aoform->segfilecount != ao_segfile_count) + { + aorel = 
table_open(AppendOnlyRelationId, RowExclusiveLock); + aoform->segfilecount = ao_segfile_count; + heap_inplace_update(aorel, aotup); + table_close(aorel, RowExclusiveLock); + } + } + ReleaseSysCache(aotup); + } + /* Same for indexes */ for (ind = 0; ind < nindexes; ind++) { @@ -2181,6 +2216,32 @@ acquire_hll_by_query(Relation onerel, int nattrs, VacAttrStats **attrstats, int SPI_finish(); } +/* + * Count AO/AOCO tables segment file number. + */ +static int16 +AcquireCountOfSegmentFile(Relation onerel) +{ + int16 count = 0; + + if (!RelationIsAppendOptimized(onerel)) + return 0; + + if (Gp_role == GP_ROLE_DISPATCH && + onerel->rd_cdbpolicy && !GpPolicyIsEntry(onerel->rd_cdbpolicy)) + { + /* Query the segments using gp_ao_segment_file_count(). */ + char *sql; + sql = psprintf("select pg_catalog.gp_ao_segment_file_count(%u)", RelationGetRelid(onerel)); + count = get_size_from_segDBs(sql)/getgpsegmentCount(); + } + else + { + count = GetAppendOnlySegmentFilesCount(onerel); + } + return count; +} + /* * Compute relation size. * diff --git a/src/backend/commands/createas.c b/src/backend/commands/createas.c index 5ffaba85568..c824f7da2a5 100644 --- a/src/backend/commands/createas.c +++ b/src/backend/commands/createas.c @@ -362,6 +362,10 @@ ExecCreateTableAs(ParseState *pstate, CreateTableAsStmt *stmt, save_nestlevel = NewGUCNestLevel(); } + /* into AO/AOCS ?*/ + char* am = (into && into->accessMethod) ? into->accessMethod : default_table_access_method; + bool intoAO = ((strcmp(am, "ao_row") == 0) || (strcmp(am, "ao_column") == 0)); + { /* * Parse analysis was done already, but we still have to run the rule @@ -380,8 +384,12 @@ ExecCreateTableAs(ParseState *pstate, CreateTableAsStmt *stmt, Assert(query->commandType == CMD_SELECT); /* plan the query */ - plan = pg_plan_query(query, pstate->p_sourcetext, - CURSOR_OPT_PARALLEL_OK, params); + if (!intoAO) + plan = pg_plan_query(query, pstate->p_sourcetext, + CURSOR_OPT_PARALLEL_OK, params); + else + plan = pg_plan_query(query, pstate->p_sourcetext, + CURSOR_OPT_PARALLEL_NOT_OK, params); /*GPDB: Save the target information in PlannedStmt */ /* diff --git a/src/backend/commands/event_trigger.c b/src/backend/commands/event_trigger.c index 1cf37d5aa39..72b3cb74c5e 100644 --- a/src/backend/commands/event_trigger.c +++ b/src/backend/commands/event_trigger.c @@ -1062,6 +1062,7 @@ EventTriggerSupportsObjectClass(ObjectClass objclass) case OCLASS_TRANSFORM: return true; case OCLASS_EXTPROTOCOL: + case OCLASS_TASK: return false; /* diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index ce27a667677..e0e59ca481c 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -182,7 +182,9 @@ static void ExplainJSONLineEnding(ExplainState *es); static void ExplainYAMLLineStarting(ExplainState *es); static void escape_yaml(StringInfo buf, const char *str); -/* Include the Cloudberry EXPLAIN extensions */ +static void Explainlocus(ExplainState *es, CdbLocusType locustype, int parallel); + +/* Include the Greenplum EXPLAIN extensions */ #include "explain_gp.c" @@ -212,6 +214,8 @@ ExplainQuery(ParseState *pstate, ExplainStmt *stmt, es->analyze = defGetBoolean(opt); else if (strcmp(opt->defname, "verbose") == 0) es->verbose = defGetBoolean(opt); + else if (strcmp(opt->defname, "locus") == 0) + es->locus = defGetBoolean(opt); else if (strcmp(opt->defname, "costs") == 0) es->costs = defGetBoolean(opt); else if (strcmp(opt->defname, "buffers") == 0) @@ -1765,6 +1769,7 @@ ExplainNode(PlanState *planstate, List *ancestors, 
Assert(plan->lefttree); motion_snd = list_length(es->currentSlice->segments); + motion_recv = parentSlice == NULL ? 1 : list_length(parentSlice->segments); switch (pMotion->motionType) @@ -1783,6 +1788,9 @@ ExplainNode(PlanState *planstate, List *ancestors, case MOTIONTYPE_BROADCAST: sname = "Broadcast Motion"; break; + case MOTIONTYPE_PARALLEL_BROADCAST: + sname = "Parallel Broadcast Motion"; + break; case MOTIONTYPE_EXPLICIT: sname = "Explicit Redistribute Motion"; break; @@ -2198,6 +2206,12 @@ ExplainNode(PlanState *planstate, List *ancestors, if (es->verbose) show_plan_tlist(planstate, ancestors, es); + /* explain(locus) doesn't support Orca yet */ + if (es->locus && !optimizer) + { + Explainlocus(es, plan->locustype, plan->parallel); + } + /* unique join */ switch (nodeTag(plan)) { @@ -2559,11 +2573,10 @@ ExplainNode(PlanState *planstate, List *ancestors, if (pMotion->motionType == MOTIONTYPE_HASH && pMotion->numHashSegments != motion_recv) { - Assert(pMotion->numHashSegments < motion_recv); - appendStringInfoSpaces(es->str, es->indent * 2); - appendStringInfo(es->str, - "Hash Module: %d\n", - pMotion->numHashSegments); + AssertImply(pMotion->senderSliceInfo && pMotion->senderSliceInfo->parallel_workers <= 1, + pMotion->numHashSegments < motion_recv); + ExplainPropertyInteger("Hash Module", NULL, + pMotion->numHashSegments, es); } } break; @@ -5755,3 +5768,62 @@ escape_yaml(StringInfo buf, const char *str) { escape_json(buf, str); } + +/* + * Explainlocus + * Show the locus type and parallel workers (if > 1) of a plan node. + */ +static void +Explainlocus(ExplainState *es, CdbLocusType locustype, int parallel) +{ + char* locus = NULL; + switch (locustype) + { + case CdbLocusType_Null: + locus = "NULL"; + break; + case CdbLocusType_Entry: + locus = "Entry"; + break; + case CdbLocusType_SingleQE: + locus = "SingleQE"; + break; + case CdbLocusType_General: + locus = "General"; + break; + case CdbLocusType_SegmentGeneral: + locus = "SegmentGeneral"; + break; + case CdbLocusType_SegmentGeneralWorkers: + locus = "SegmentGeneralWorkers"; + break; + case CdbLocusType_OuterQuery: + locus = "OuterQuery"; + break; + case CdbLocusType_Replicated: + locus = "Replicated"; + break; + case CdbLocusType_ReplicatedWorkers: + locus = "ReplicatedWorkers"; + break; + case CdbLocusType_Hashed: + locus = "Hashed"; + break; + case CdbLocusType_HashedOJ: + locus = "HashedOJ"; + break; + case CdbLocusType_Strewn: + locus = "Strewn"; + break; + case CdbLocusType_HashedWorkers: + locus = "HashedWorkers"; + break; + default: + locus = "unknown"; + break; + } + + ExplainPropertyText("Locus", locus, es); + if (parallel > 1) + ExplainPropertyInteger("Parallel Workers", NULL, parallel, es); +} \ No newline at end of file diff --git a/src/backend/commands/explain_gp.c b/src/backend/commands/explain_gp.c index 45c54ec79cb..06a925129b4 100644 --- a/src/backend/commands/explain_gp.c +++ b/src/backend/commands/explain_gp.c @@ -69,6 +69,7 @@ typedef struct CdbExplain_StatInst IncrementalSortGroupInfo prefixsortGroupInfo; /* Prefix sort group info for Incremental Sort node */ int bnotes; /* Offset to beginning of node's extra text */ int enotes; /* Offset to end of node's extra text */ + int nworkers_launched; /* Number of workers launched for this node */ WalUsage walusage; /* add WAL usage */ } CdbExplain_StatInst; @@ -78,6 +79,7 @@ typedef struct CdbExplain_SliceWorker { double peakmemused; /* bytes alloc in per-query mem context tree */ double vmem_reserved; /* vmem reserved by a QE */ + int nworkers_launched; /* Number
of workers launched for this slice */ } CdbExplain_SliceWorker; @@ -771,7 +773,10 @@ cdbexplain_depositSliceStats(CdbExplain_StatHdr *hdr, iworker = hdr->segindex - ss->segindex0; ssw = &ss->workers[iworker]; Assert(iworker >= 0 && iworker < ss->nworker); - Assert(ssw->peakmemused == 0); /* each worker should be seen just once */ + /* GPDB_PARALLEL_FIXME: reuse worker to store the stats of same slice */ +#if 0 + Assert(ssw->peakmemused == 0); /* each worker should be seen just once */ +#endif *ssw = hdr->worker; /* Rollup of per-worker stats into SliceSummary */ @@ -865,6 +870,20 @@ cdbexplain_collectStatsFromNode(PlanState *planstate, CdbExplain_SendStatCtx *ct &incrementalstate->incsort_info.prefixsortGroupInfo, sizeof(IncrementalSortGroupInfo)); } +#if 0 + if (IsA(planstate, GatherState)) + { + GatherState *gatherstate = (GatherState *) planstate; + + si->nworkers_launched = gatherstate->nworkers_launched; + } + if (IsA(planstate, GatherMergeState)) + { + GatherMergeState *gathermergestate = (GatherMergeState *) planstate; + + si->nworkers_launched = gathermergestate->nworkers_launched; + } +#endif } /* cdbexplain_collectStatsFromNode */ @@ -1068,6 +1087,12 @@ cdbexplain_depositStatsToNode(PlanState *planstate, CdbExplain_RecvStatCtx *ctx) /* Update per-slice accumulators. */ cdbexplain_depStatAcc_upd(&peakmemused, rsh->worker.peakmemused, rsh, rsi, nsi); cdbexplain_depStatAcc_upd(&vmem_reserved, rsh->worker.vmem_reserved, rsh, rsi, nsi); +#if 0 + if (IsA(planstate, GatherState) || IsA(planstate, GatherMergeState)) + { + rsh->worker.nworkers_launched = nsi->nworkers_launched; + } +#endif } /* Save per-node accumulated stats in NodeSummary. */ @@ -1954,9 +1979,10 @@ gpexplain_formatSlicesOutput(struct CdbExplain_ShowStatCtx *showstatctx, cdbexplain_formatMemory(avgbuf, sizeof(avgbuf), cdbexplain_agg_avg(&ss->peakmemused)); cdbexplain_formatSeg(segbuf, sizeof(segbuf), ss->peakmemused.imax, ss->nworker); appendStringInfo(es->str, - "Executor memory: %s avg x %d workers, %s max%s.", + "Executor memory: %s avg x %dx(%d) workers, %s max%s.", avgbuf, ss->peakmemused.vcnt, + ss->workers->nworkers_launched, maxbuf, segbuf); } @@ -1965,6 +1991,7 @@ gpexplain_formatSlicesOutput(struct CdbExplain_ShowStatCtx *showstatctx, ExplainOpenGroup("Executor Memory", "Executor Memory", true, es); ExplainPropertyInteger("Average", "kB", cdbexplain_agg_avg(&ss->peakmemused), es); ExplainPropertyInteger("Workers", NULL, ss->peakmemused.vcnt, es); + ExplainPropertyInteger("Subworkers", NULL, ss->workers->nworkers_launched, es); ExplainPropertyInteger("Maximum Memory Used", "kB", ss->peakmemused.vmax, es); ExplainCloseGroup("Executor Memory", "Executor Memory", true, es); } @@ -2010,9 +2037,10 @@ gpexplain_formatSlicesOutput(struct CdbExplain_ShowStatCtx *showstatctx, cdbexplain_formatMemory(avgbuf, sizeof(avgbuf), cdbexplain_agg_avg(&ss->vmem_reserved)); cdbexplain_formatSeg(segbuf, sizeof(segbuf), ss->vmem_reserved.imax, ss->nworker); appendStringInfo(es->str, - " Vmem reserved: %s avg x %d workers, %s max%s.", + " Vmem reserved: %s avg x %dx(%d) workers, %s max%s.", avgbuf, ss->vmem_reserved.vcnt, + ss->workers->nworkers_launched, maxbuf, segbuf); } @@ -2021,6 +2049,7 @@ gpexplain_formatSlicesOutput(struct CdbExplain_ShowStatCtx *showstatctx, ExplainOpenGroup("Virtual Memory", "Virtual Memory", true, es); ExplainPropertyInteger("Average", "kB", cdbexplain_agg_avg(&ss->vmem_reserved), es); ExplainPropertyInteger("Workers", NULL, ss->vmem_reserved.vcnt, es); + ExplainPropertyInteger("Subworkers", NULL, 
ss->workers->nworkers_launched, es); ExplainPropertyInteger("Maximum Memory Used", "kB", ss->vmem_reserved.vmax, es); ExplainCloseGroup("Virtual Memory", "Virtual Memory", true, es); } diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c index 747f3c7965e..61a580873b1 100644 --- a/src/backend/commands/matview.c +++ b/src/backend/commands/matview.c @@ -69,7 +69,7 @@ typedef struct static int matview_maintenance_depth = 0; -static RefreshClause* MakeRefreshClause(bool concurrent, bool skipData, RangeVar *relation); +static RefreshClause* MakeRefreshClause(bool concurrent, bool skipData, RangeVar *relation, bool intoAO); static void transientrel_startup(DestReceiver *self, int operation, TupleDesc typeinfo); static bool transientrel_receive(TupleTableSlot *slot, DestReceiver *self); static void transientrel_shutdown(DestReceiver *self); @@ -125,7 +125,7 @@ SetMatViewPopulatedState(Relation relation, bool newstate) } static RefreshClause* -MakeRefreshClause(bool concurrent, bool skipData, RangeVar *relation) +MakeRefreshClause(bool concurrent, bool skipData, RangeVar *relation, bool intoAO) { RefreshClause *refreshClause; refreshClause = makeNode(RefreshClause); @@ -133,6 +133,7 @@ MakeRefreshClause(bool concurrent, bool skipData, RangeVar *relation) refreshClause->concurrent = concurrent; refreshClause->skipData = skipData; refreshClause->relation = relation; + refreshClause->intoAO = intoAO; return refreshClause; } @@ -339,7 +340,8 @@ ExecRefreshMatView(RefreshMatViewStmt *stmt, const char *queryString, dest = CreateTransientRelDestReceiver(OIDNewHeap, matviewOid, concurrent, relpersistence, stmt->skipData); - refreshClause = MakeRefreshClause(concurrent, stmt->skipData, stmt->relation); + bool intoAO = RelationIsAppendOptimized(matviewRel); + refreshClause = MakeRefreshClause(concurrent, stmt->skipData, stmt->relation, intoAO); /* * Only in dispather role, we should set intoPolicy, else it should remain NULL. @@ -470,7 +472,12 @@ refresh_matview_datafill(DestReceiver *dest, Query *query, CHECK_FOR_INTERRUPTS(); /* Plan the query which will generate data for the refresh. 
*/ - plan = pg_plan_query(query, queryString, CURSOR_OPT_PARALLEL_OK, NULL); + + /* GPDB_PARALLEL_FIXME: hack here, use cursor_option to disable parallel */ + if (!refreshClause->intoAO) + plan = pg_plan_query(query, queryString, CURSOR_OPT_PARALLEL_OK, NULL); + else + plan = pg_plan_query(query, queryString, CURSOR_OPT_PARALLEL_NOT_OK, NULL); plan->refreshClause = refreshClause; diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 2bc9704b5d7..76faa1f343d 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -1822,6 +1822,27 @@ ao_aux_tables_safe_truncate(Relation rel) */ RemoveFastSequenceEntry(aoseg_relid); InsertInitialFastSequenceEntries(aoseg_relid); + + /* GPDB truncate should also update pg_appendonly.segfilecount */ + Relation aorel; + HeapTuple aotup; + Form_pg_appendonly aoform; + + aotup = SearchSysCache1(AORELID, ObjectIdGetDatum(relid)); + if (!HeapTupleIsValid(aotup)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("appendonly table relid %u does not exist in pg_appendonly", relid))); + + aoform = (Form_pg_appendonly) GETSTRUCT(aotup); + if (aoform->segfilecount != 0) + { + aorel = table_open(AppendOnlyRelationId, RowExclusiveLock); + aoform->segfilecount = 0; + heap_inplace_update(aorel, aotup); + table_close(aorel, RowExclusiveLock); + } + ReleaseSysCache(aotup); } /* @@ -13959,6 +13980,7 @@ ATExecAlterColumnType(AlteredTableInfo *tab, Relation rel, case OCLASS_SUBSCRIPTION: case OCLASS_TRANSFORM: case OCLASS_EXTPROTOCOL: + case OCLASS_TASK: /* * We don't expect any of these sorts of objects to depend on @@ -15623,12 +15645,9 @@ ATExecSetRelOptions(Relation rel, List *defList, AlterTableType operation, case RELKIND_AOBLOCKDIR: case RELKIND_AOVISIMAP: if (RelationIsAppendOptimized(rel)) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("altering reloptions for append only tables" - " is not permitted"))); - - (void) heap_reloptions(rel->rd_rel->relkind, newOptions, true); + (void) default_reloptions(newOptions, true, RELOPT_KIND_APPENDOPTIMIZED); + else + (void) heap_reloptions(rel->rd_rel->relkind, newOptions, true); break; case RELKIND_PARTITIONED_TABLE: (void) partitioned_table_reloptions(newOptions, true); @@ -18446,12 +18465,17 @@ ATExecSetDistributedBy(Relation rel, Node *node, AlterTableCmd *cmd) if (need_reorg) { /* + * Make sure the redistribution happens for a randomly distributed table. + * * Force the use of Postgres query optimizer, since Pivotal Optimizer (GPORCA) will not * redistribute the tuples if the current and required distributions * are both RANDOM even when reorganize is set to "true" + * Also set gp_force_random_redistribution to true. 
*/ bool saveOptimizerGucValue = optimizer; + bool saveRedistributeGucValue = gp_force_random_redistribution; optimizer = false; + gp_force_random_redistribution = true; if (saveOptimizerGucValue) ereport(LOG, @@ -18539,6 +18563,7 @@ ATExecSetDistributedBy(Relation rel, Node *node, AlterTableCmd *cmd) PopActiveSnapshot(); optimizer = saveOptimizerGucValue; optimizer_replicated_table_insert = save_optimizer_replicated_table_insert; + gp_force_random_redistribution = saveRedistributeGucValue; CommandCounterIncrement(); /* see the effects of the command */ diff --git a/src/backend/commands/taskcmds.c b/src/backend/commands/taskcmds.c new file mode 100644 index 00000000000..e50db1c0c18 --- /dev/null +++ b/src/backend/commands/taskcmds.c @@ -0,0 +1,304 @@ +/*------------------------------------------------------------------------- + * + * taskcmds.c + * CloudBerry TASK SCHEDULE support code. + * + * Portions Copyright (c) 2023-Present Hashdata Inc. + * + * IDENTIFICATION + * src/backend/commands/taskcmds.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "catalog/dependency.h" +#include "catalog/namespace.h" +#include "catalog/pg_task.h" +#include "catalog/pg_task_run_history.h" +#include "cdb/cdbvars.h" +#include "commands/dbcommands.h" +#include "commands/defrem.h" +#include "commands/taskcmds.h" +#include "task/job_metadata.h" +#include "utils/acl.h" +#include "utils/builtins.h" + +/* + * DefineTask + * Create a new cron task. + */ +ObjectAddress +DefineTask(ParseState *pstate, CreateTaskStmt *stmt) +{ + ObjectAddress address; + char *dbname = NULL; + char *username = NULL; + ListCell *option; + DefElem *d_dbname = NULL; + DefElem *d_username = NULL; + Oid jobid = InvalidOid; + AclResult aclresult; + + /* must have CREATE privilege on database */ + aclresult = pg_database_aclcheck(MyDatabaseId, GetUserId(), ACL_CREATE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_DATABASE, + get_database_name(MyDatabaseId)); + + foreach(option, stmt->options) + { + DefElem *defel = (DefElem *) lfirst(option); + if (strcmp(defel->defname, "dbname") == 0) + { + if (d_dbname) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); + d_dbname = defel; + } + else if (strcmp(defel->defname, "username") == 0) + { + if (d_username) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); + d_username = defel; + } + else + elog(ERROR, "option \"%s\" not recognized", + defel->defname); + } + + /* use the specified or current user */ + if (d_username != NULL && d_username->arg) + { + username = defGetString(d_username); + if (!OidIsValid(get_role_oid(username, false))) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("role \"%s\" does not exist", username))); + } + else + username = GetUserNameFromId(GetUserId(), false); + + /* use the specified or current database */ + if (d_dbname != NULL && d_dbname->arg) + dbname = defGetString(d_dbname); + else + dbname = get_database_name(MyDatabaseId); + + if (!OidIsValid(get_database_oid(dbname, true))) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_DATABASE), + errmsg("database \"%s\" does not exist", dbname))); + + /* check if the task already exists */ + if (stmt->if_not_exists) + { + if (OidIsValid(GetTaskJobId(stmt->taskname, username))) + { + ereport(NOTICE, + 
(errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("task \"%s\" already exists, skipping", + stmt->taskname))); + return InvalidObjectAddress; + } + } + + jobid = ScheduleCronJob(cstring_to_text(stmt->schedule), cstring_to_text(stmt->sql), + cstring_to_text(dbname), cstring_to_text(username), + true, cstring_to_text(stmt->taskname)); + + /* Depend on owner. */ + recordDependencyOnOwner(TaskRelationId, jobid, get_role_oid(username, false)); + + ObjectAddressSet(address, TaskRelationId, jobid); + return address; +} + +/* + * AlterTask + * Alter an existing cron task. + */ +ObjectAddress +AlterTask(ParseState *pstate, AlterTaskStmt *stmt) +{ + ObjectAddress address = InvalidObjectAddress; + char *current_user; + Oid jobid = InvalidOid; + DefElem *d_schedule = NULL; + DefElem *d_dbname = NULL; + DefElem *d_username = NULL; + DefElem *d_active = NULL; + DefElem *d_sql = NULL; + ListCell *option; + char *schedule = NULL; + char *dbname = NULL; + char *username = NULL; + bool active; + char *sql = NULL; + + current_user = GetUserNameFromId(GetUserId(), false); + jobid = GetTaskJobId(stmt->taskname, current_user); + if (!OidIsValid(jobid)) + { + if (stmt->missing_ok) + { + ereport(NOTICE, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("task \"%s\" does not exist, skipping", + stmt->taskname))); + return address; + } + else + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("task \"%s\" does not exist", stmt->taskname))); + } + + foreach(option, stmt->options) + { + DefElem *defel = (DefElem *) lfirst(option); + + if (strcmp(defel->defname, "schedule") == 0) + { + if (d_schedule) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); + d_schedule = defel; + } + else if (strcmp(defel->defname, "dbname") == 0) + { + if (d_dbname) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); + d_dbname = defel; + } + else if (strcmp(defel->defname, "username") == 0) + { + if (d_username) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); + d_username = defel; + } + else if (strcmp(defel->defname, "active") == 0) + { + if (d_active) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); + d_active = defel; + } + else if (strcmp(defel->defname, "sql") == 0) + { + if (d_sql) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"), + parser_errposition(pstate, defel->location))); + d_sql = defel; + } + else + elog(ERROR, "option \"%s\" not recognized", + defel->defname); + } + + if (d_schedule != NULL && d_schedule->arg) + schedule = defGetString(d_schedule); + + if (d_dbname != NULL && d_dbname->arg) + { + dbname = defGetString(d_dbname); + if (!OidIsValid(get_database_oid(dbname, true))) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_DATABASE), + errmsg("database \"%s\" does not exist", dbname))); + } + + if (d_username != NULL && d_username->arg) + { + username = defGetString(d_username); + if (!OidIsValid(get_role_oid(username, true))) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("role \"%s\" does not exist", username))); + } + + if (d_active != NULL) + { + active = intVal(d_active->arg); + Assert(BoolIsValid(active)); + } + + if (d_sql != NULL && d_sql->arg) + sql =
defGetString(d_sql); + + AlterCronJob(jobid, schedule, sql, dbname, username, d_active != NULL ? &active : NULL); + + if (username) + { + /* Update owner dependency reference */ + changeDependencyOnOwner(TaskRelationId, + jobid, + get_role_oid(username, false)); + } + + ObjectAddressSet(address, TaskRelationId, jobid); + return address; +} + +/* + * DropTask + * Drop an existing cron task. + */ +ObjectAddress +DropTask(ParseState *pstate, DropTaskStmt *stmt) +{ + ObjectAddress address = InvalidObjectAddress; + char *username; + Oid jobid = InvalidOid; + + /* current username */ + username = GetUserNameFromId(GetUserId(), false); + + /* delete from pg_task */ + jobid = UnscheduleCronJob(stmt->taskname, username, InvalidOid, stmt->missing_ok); + + /* delete from pg_task_run_history according to the jobid */ + if (OidIsValid(jobid)) + { + RemoveTaskRunHistoryByJobId(jobid); + ObjectAddressSet(address, TaskRelationId, jobid); + /* Clean up dependencies */ + deleteSharedDependencyRecordsFor(TaskRelationId, jobid, 0); + } + + return address; +} + +/* + * RemoveTaskById + * Remove an existing cron task by jobid. + */ +void +RemoveTaskById(Oid jobid) +{ + /* remove the cron task in pg_task */ + UnscheduleCronJob(NULL, NULL, jobid, false); + /* delete from pg_task_run_history according to the jobid */ + if (OidIsValid(jobid)) + { + RemoveTaskRunHistoryByJobId(jobid); + } +} diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 2c3fd147aea..2c560274dd5 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -58,6 +58,7 @@ #include "utils/snapmgr.h" #include "utils/syscache.h" +#include "access/appendonlywriter.h" #include "catalog/catalog.h" #include "catalog/heap.h" #include "catalog/pg_am.h" @@ -65,6 +66,7 @@ #include "catalog/oid_dispatch.h" #include "cdb/cdbdispatchresult.h" #include "cdb/cdbdisp_query.h" +#include "cdb/cdbutil.h" #include "cdb/cdbvars.h" #include "commands/analyzeutils.h" #include "libpq-int.h" @@ -3167,6 +3169,7 @@ vac_update_relstats_from_list(List *updated_stats) { VPgClassStats *stats = (VPgClassStats *) lfirst(lc); Relation rel; + int16 ao_segfile_count = 0; rel = relation_open(stats->relid, AccessShareLock); @@ -3177,6 +3180,41 @@ vac_update_relstats_from_list(List *updated_stats) stats->relallvisible = stats->relallvisible / rel->rd_cdbpolicy->numsegments; } + if (RelationIsAppendOptimized(rel)) + { + /* + * GPDB_PARALLEL_FIXME: This is very hacky! + * relallvisible coming from the AO/AOCO vacuum process carries the segment file count + * of AO/AOCO tables. We use it to update pg_appendonly.segfilecount. + * See ao_vacuum_rel_post_cleanup in vacuum_ao.c. + * relallvisible of AO/AOCO tables should always be 0 in pg_class, though, so we need to reset + * it after we have read the value and before updating the stats in pg_class.
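+ * For example, with getgpsegmentCount() == 3 and each segment reporting + * 4 segment files, the incoming relallvisible is 12 and segfilecount + * becomes 12 / 3 = 4. (Illustrative numbers only.)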
+ */ + ao_segfile_count = stats->relallvisible / getgpsegmentCount(); /* Use rel->rd_cdbpolicy->numsegments instead of getgpsegmentCount()? */ + stats->relallvisible = 0; /* Caution: relallvisible must be set to 0 before updating pg_class */ + + Relation aorel; + Oid aorelid = RelationGetRelid(rel); + HeapTuple aotup; + Form_pg_appendonly aoform; + + aotup = SearchSysCache1(AORELID, ObjectIdGetDatum(aorelid)); + if (!HeapTupleIsValid(aotup)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("appendonly table relid %u does not exist in pg_appendonly", aorelid))); + + aoform = (Form_pg_appendonly) GETSTRUCT(aotup); + if (aoform->segfilecount < MAX_AOREL_CONCURRENCY && (aoform->segfilecount != ao_segfile_count)) + { + aorel = table_open(AppendOnlyRelationId, RowExclusiveLock); + aoform->segfilecount = ao_segfile_count; + heap_inplace_update(aorel, aotup); + table_close(aorel, RowExclusiveLock); + } + ReleaseSysCache(aotup); + } + /* * Pass 'false' for isvacuum, so that the stats are * actually updated. diff --git a/src/backend/commands/vacuum_ao.c b/src/backend/commands/vacuum_ao.c index 39c4c66c12a..ea1809407c0 100644 --- a/src/backend/commands/vacuum_ao.c +++ b/src/backend/commands/vacuum_ao.c @@ -161,7 +161,7 @@ static bool appendonly_tid_reaped(ItemPointer itemptr, void *state); static void vacuum_appendonly_fill_stats(Relation aorel, Snapshot snapshot, int elevel, BlockNumber *rel_pages, double *rel_tuples, - bool *relhasindex); + bool *relhasindex, BlockNumber *total_file_segs); static int vacuum_appendonly_indexes(Relation aoRelation, int options, BufferAccessStrategy bstrategy); static void scan_index(Relation indrel, @@ -220,6 +220,10 @@ ao_vacuum_rel_post_cleanup(Relation onerel, int options, VacuumParams *params, BlockNumber relpages; double reltuples; bool relhasindex; + /* AO/AOCO total file segment number; use type BlockNumber to + * match the type of num_all_visible_pages in libpq. + */ + BlockNumber total_file_segs; int elevel; TransactionId OldestXmin; TransactionId FreezeLimit; @@ -255,7 +259,8 @@ ao_vacuum_rel_post_cleanup(Relation onerel, int options, VacuumParams *params, elevel, &relpages, &reltuples, - &relhasindex); + &relhasindex, + &total_file_segs); vacuum_set_xid_limits(onerel, params->freeze_min_age, @@ -265,11 +270,13 @@ ao_vacuum_rel_post_cleanup(Relation onerel, int options, VacuumParams *params, &OldestXmin, &FreezeLimit, &xidFullScanLimit, &MultiXactCutoff, &mxactFullScanLimit); + /* Caution: AO/AOCO uses relallvisible to represent the total segment file count */ vac_update_relstats(onerel, relpages, reltuples, - 0, /* AO does not currently have an equivalent to - Heap's 'all visible pages' */ + total_file_segs, /* AO/AOCO does not currently have an equivalent to + Heap's 'all visible pages', use this field to represent + AO/AOCO's total segment file count */ relhasindex, FreezeLimit, MultiXactCutoff, @@ -714,11 +721,12 @@ appendonly_tid_reaped(ItemPointer itemptr, void *state) * in pg_class. reltuples is the same as "pg_aoseg_:tupcount" * column and we simulate relpages by subdividing the eof value * ("pg_aoseg_:eof") over the defined page size. + * total_file_segs will be set only for AO/AOCO relations.
*/ static void vacuum_appendonly_fill_stats(Relation aorel, Snapshot snapshot, int elevel, BlockNumber *rel_pages, double *rel_tuples, - bool *relhasindex) + bool *relhasindex, BlockNumber *total_file_segs) { FileSegTotals *fstotal; BlockNumber nblocks; @@ -773,6 +781,7 @@ vacuum_appendonly_fill_stats(Relation aorel, Snapshot snapshot, int elevel, *rel_pages = nblocks; *rel_tuples = num_tuples; *relhasindex = aorel->rd_rel->relhasindex; + *total_file_segs = fstotal->totalfilesegs; ereport(elevel, (errmsg("\"%s\": found %.0f rows in %u pages.", diff --git a/src/backend/crypto/Makefile b/src/backend/crypto/Makefile new file mode 100644 index 00000000000..4bb9ebe5c2a --- /dev/null +++ b/src/backend/crypto/Makefile @@ -0,0 +1,20 @@ +#------------------------------------------------------------------------- +# +# Makefile +# Makefile for src/backend/crypto +# +# IDENTIFICATION +# src/backend/crypto/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/crypto +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + bufenc.o \ + sm4.o \ + kmgr.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/crypto/README b/src/backend/crypto/README new file mode 100644 index 00000000000..be5e5557baa --- /dev/null +++ b/src/backend/crypto/README @@ -0,0 +1,231 @@ +Cluster File Encryption +======================= + +This directory contains support functions and sample scripts to be used +for cluster file encryption. + +Architecture +------------ + +Fundamentally, cluster file encryption must store data in a file system +in such a way that the keys required to decrypt the file system data can +only be accessed using something outside of the file system itself. The +external requirement can be someone typing in a passphrase, getting a +key from a key management server (KMS), or decrypting a key stored in +the file system using a hardware security module (HSM). The current +architecture supports all of these methods, and includes sample scripts +for them. + +The simplest method for accessing data keys using some external +requirement would be to retrieve all data encryption keys from a KMS. +However, retrieved keys would still need to be verified as valid. This +method also introduces unacceptable complexity for simpler use-cases, +like user-supplied passphrases or HSM usage. External key rotation +would also be very hard since it would require re-encrypting all the +file system data with the new externally-stored keys. + +For these reasons, a two-tiered architecture is used, which uses two +types of encryption keys: a key encryption key (KEK) and data encryption +keys (DEK). The KEK should not be present unencrypted in the file system +--- it should be supplied by the user, stored externally (e.g., in a KMS) +or stored in the file system encrypted with an HSM (e.g., a PIV device). +The DEK is used to encrypt database files and is stored in the same file +system as the database but is encrypted using the KEK. Because the DEK +is encrypted, its storage in the file system is no more of a security +weakness than the storage of the encrypted database files in the same +file system. + +Implementation +-------------- + +To enable cluster file encryption, the initdb option +--cluster-key-command must be used, which specifies a command to +retrieve the KEK. initdb records the cluster_key_command in +postgresql.conf.
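+For illustration only (the script path here is hypothetical; any +program that prints the KEK as 64 hex characters will do): + + initdb -D /path/to/data \ + --cluster-key-command='/usr/local/bin/my_kek.sh' +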
Every time the KEK is needed, the command is run and +must return 64 hex characters which are decoded into the KEK. The +command is called twice during initdb, and every time the server starts. +initdb also sets the encryption method in controldata during server +bootstrap. + +initdb runs "postgres --boot", which calls function +kmgr.c::BootStrapKmgr(), which calls the cluster key command. The +cluster key command returns a KEK which is used to encrypt random bytes +for each DEK and writes them to the file system by +kmgr.c::KmgrWriteCryptoKeys() (unless --copy-encryption-keys is used). +Currently the DEK files are 0 and 1 and are stored in +$PGDATA/pg_cryptokeys/live. The wrapped DEK files use Key Wrapping with +Padding which verifies the validity of the KEK. + +initdb also does a non-boot backend start which calls +kmgr.c::InitializeKmgr(), which calls the cluster key command a second +time. This decrypts/unwraps the DEK keys and stores them in the shared +memory structure KmgrShmem. This step also happens every time the server +starts. Later patches will use the keys stored in KmgrShmem to +encrypt/decrypt database files. KmgrShmem is erased via +explicit_bzero() on server shutdown. + +Limitations +----------- + +There doesn't seem to be a reasonable way to detect all malicious data +modification or key extraction if a user has write permission on the +files in PGDATA. It might be possible to limit the key extraction risk +if postgresql.auto.conf were able to be moved to a directory outside of +PGDATA, and if postmaster.opts could be moved or ignored when cluster +file encryption is used. (This file is used by pg_ctl restart.) + +It doesn't appear possible to detect all malicious writes --- even if +you add message authentication code (MAC) checks to encrypted files, +modifying non-encrypted files could still affect encrypted ones, e.g., +modifying files in pg_xact could affect how heap rows are interpreted. +Basically you would need to encrypt all files, and at that point you +might as well just use an encrypted file system. There also doesn't seem +to be a way to prevent key extraction if someone has read permission on +postgres process memory. + +Initialization Vector +--------------------- + +Nonce means "number used once". An Initialization Vector (IV) is a +specific type of nonce. That is, unique but not necessarily random or +secret, as specified by the NIST +(https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38a.pdf). +To generate unique IVs, the NIST recommends two methods: + + The first method is to apply the forward cipher function, under + the same key that is used for the encryption of the plaintext, + to a nonce. The nonce must be a data block that is unique to + each execution of the encryption operation. For example, the + nonce may be a counter, as described in Appendix B, or a message + number. The second method is to generate a random data block + using a FIPS-approved random number generator. + +We will use the first method to generate IVs. That is, select nonce +carefully and use a cipher with the key to make it unique enough to use +as an IV. The nonce selection for buffer encryption and WAL encryption +are described below. + +If the IV was used more than once with the same key (and we only use one +data encryption key), changes in the unencrypted data would be visible +in the encrypted data. 
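To make that risk concrete, here is a minimal standalone C sketch (illustrative only, not part of the patch) of what IV reuse leaks under a keystream mode such as CTR or OFB: XORing two ciphertexts produced with the same key and IV yields the XOR of the two plaintexts, with no key required.

#include <stdio.h>

/* Toy illustration only: a fixed "keystream" standing in for AES-CTR or
 * SM4-OFB output under one key and a reused IV. */
int
main(void)
{
	unsigned char ks[8] = {0x13, 0x57, 0x9b, 0xdf, 0x24, 0x68, 0xac, 0xe0};
	unsigned char p1[8] = "page A!";
	unsigned char p2[8] = "page B!";
	unsigned char c1[8], c2[8];

	for (int i = 0; i < 8; i++)
	{
		c1[i] = p1[i] ^ ks[i];	/* first encryption */
		c2[i] = p2[i] ^ ks[i];	/* IV reuse: same keystream again */
	}

	/* c1 ^ c2 == p1 ^ p2: plaintext relationships leak without the key */
	for (int i = 0; i < 8; i++)
		printf("%02x ", (unsigned char) (c1[i] ^ c2[i]));
	putchar('\n');
	return 0;
}

This is why the nonce selection described below goes to some trouble to keep IVs unique per page.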
+
+IV for Heap/Index Encryption
+- - - - - - - - - - - - - -
+
+To create the 16-byte IV needed by AES for each page version, we will
+use the page LSN (8 bytes) and page number (4 bytes). In the remaining
+four bytes, one bit will be used to indicate if the LSN is WAL (real) or
+fake (see below). The LSN is ideal for use in the IV because it is
+always increasing, and is changed every time a page is updated. The
+same LSN is never used for two relations with different page contents.
+
+However, the same LSN can be used in multiple pages in the same relation
+--- this can happen when a heap update expires an old tuple and adds a
+new tuple to another page. By adding the page number to the IV, we keep
+the IV unique.
+
+By not using the database id in the IV, CREATE DATABASE can copy the
+heap/index files from the old database to a new one without
+decryption/encryption. Both page copies are valid. Once a database
+changes its pages, it gets new LSNs, and hence new IVs. Using only the
+LSN and page number also avoids requiring pg_upgrade to preserve
+database oids, tablespace oids, and relfilenodes.
+
+As part of WAL logging, every change of a WAL-logged page gets a new
+LSN, and therefore a new IV automatically.
+
+However, the LSN must then be visible on encrypted pages, so we will not
+encrypt the LSN on the page. We will also not encrypt the CRC so
+pg_checksums can still check pages offline without access to the keys.
+
+Non-Permanent Relations
+- - - - - - - - - - - -
+
+To avoid the overhead of generating WAL for non-permanent (unlogged and
+temporary) relations, we assign fake LSNs that are derived from a
+counter via xlog.c::GetFakeLSNForUnloggedRel(). (GiST also uses this
+counter for LSNs.) We also set a bit in the IV so the use of the same
+value for WAL (real) and fake LSNs will still generate unique IVs. Only
+main forks are encrypted, not init, vm, or fsm files.
+
+In the code, we need to identify if a page uses WAL or fake LSNs in
+four places, when:
+
+1. Reading a page from the file system and decrypting
+2. Setting the WAL or fake LSN on a page
+3. Hint bit changes requiring new LSNs for the encryption IV
+4. Encrypting and writing a page to the file system
+
+For all these cases, we have access to the fork number and either the
+relation's persistence state or the buffer state. If it is a "main"
+fork and the relation persistence state is RELPERSISTENCE_PERMANENT, or
+if it is an "init" fork, we use a real LSN. If it is a main fork and
+RELPERSISTENCE_PERMANENT is false, we use a fake LSN. The buffer state
+BM_PERMANENT is true if the relation is PERMANENT or is an init fork.
+
+Init Forks
+- - - - -
+
+Init forks for unlogged relations get permanent LSNs because unlogged
+relation creation is WAL logged/crash safe, even though the relation's
+contents are not. When the init fork is copied to represent an empty
+relation during crash recovery, it becomes a non-permanent page and must
+be successfully decrypted as such. Therefore, when it is copied, its
+LSN is changed to a fake LSN and then encrypted. This prevents a real
+LSN from being encrypted with the fake nonce bit.
+
+LSN Assignment, GiST, & Non-Permanent Relations
+- - - - - - - - - - - - - - - - - - - - - - - -
+
+LSN assignment has to be slightly modified for encryption. In normal,
+non-encryption mode, LSNs are assigned to pages following these rules:
+
+1. During GiST builds, some pages are assigned fixed LSNs (GistBuildLSN)
+
+2. 
During GiST builds, non-permanent pages not assigned fixed LSNs in +#1 are assigned fake LSNs, via gistutil.c::gistGetFakeLSN(). + +3. All other permanent pages are assigned WAL-based LSNs based on the +WAL position of their WAL records. + +4. All other non-permanent pages have LSNs of zero. + +When encryption is enabled: + +1. During GiST builds, permanent pages are assigned WAL-based LSNs +generated by xloginsert.c::LSNForEncryption(). + +2. During GiST builds, non-permanent pages are assigned fake LSNs. +(No constant LSNs are used in #1 or #2.) + +3. same as #3 above + +4. All other non-permanent pages are assigned fake LSNs before page +encryption. + +When switching to an encrypted replica from a non-encrypted primary, +GiST indexes will be using fixed LSNs for permanent tables, so it is +recommended to rebuild GiST indexes. Non-permanent relations are not +replicated, so they are not an issue. + +Hint Bits +- - - - - + +For hint bit changes, the LSN normally doesn't change, which is a +problem. By enabling wal_log_hints, you get full page writes to the WAL +after the first hint bit change of the checkpoint. This is useful for +two reasons. First, it generates a new LSN, which is needed for the IV +to be secure. Second, full page images protect against torn pages, +which is an even bigger requirement for encryption because the new LSN +is re-encrypting the entire page, not just the hint bit changes. You +can safely lose the hint bit changes, but you need to use the same LSN +to decrypt the entire page, so a torn page with an LSN change cannot be +decrypted. To prevent this, wal_log_hints guarantees that the +pre-hint-bit version (and previous LSN version) of the page is restored. + +However, if a hint-bit-modified page is written to the file system +during a checkpoint, and there is a later hint bit change switching the +same page from clean to dirty during the same checkpoint, we need a new +LSN, and wal_log_hints doesn't give us a new LSN here. The fix for this +is to update the page LSN by writing a dummy WAL record via +xloginsert.c::LSNForEncryption() in such cases. diff --git a/src/backend/crypto/bufenc.c b/src/backend/crypto/bufenc.c new file mode 100644 index 00000000000..2a183cbc2e4 --- /dev/null +++ b/src/backend/crypto/bufenc.c @@ -0,0 +1,257 @@ +/*------------------------------------------------------------------------- + * + * bufenc.c + * + * Copyright (c) 2020, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/crypto/bufenc.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "miscadmin.h" +#include "lib/stringinfo.h" + +#include "access/gist.h" +#include "access/xlog.h" +#include "crypto/bufenc.h" +#include "crypto/sm4.h" +#include "storage/bufpage.h" +#include "storage/fd.h" +#include "storage/shmem.h" + +extern XLogRecPtr LSNForEncryption(bool use_wal_lsn); + +/* + * We use the page LSN, page number, and permanent-bit to indicate if a fake + * LSN was used to create a nonce for each page. 
+ */
+#define BUFENC_IV_SIZE 16
+
+static unsigned char buf_encryption_iv[BUFENC_IV_SIZE];
+
+void *BufEncCtx = NULL;
+void *BufDecCtx = NULL;
+
+static void set_buffer_encryption_iv(Page page, BlockNumber blkno);
+
+void
+InitializeBufferEncryption(void)
+{
+ const CryptoKey *key;
+
+ if (!FileEncryptionEnabled)
+ return;
+
+ key = KmgrGetKey(KMGR_KEY_ID_REL);
+
+ if (CheckIsSM4Method())
+ {
+ bool found;
+ BufEncCtx = ShmemInitStruct("sm4 encryption method encrypt ctx",
+ sizeof(sm4_ctx), &found);
+
+ BufDecCtx = ShmemInitStruct("sm4 encryption method decrypt ctx",
+ sizeof(sm4_ctx), &found);
+ sm4_setkey_enc((sm4_ctx *) BufEncCtx, (unsigned char *) key->key);
+ sm4_setkey_dec((sm4_ctx *) BufDecCtx, (unsigned char *) key->key);
+ }
+ else
+ {
+ BufEncCtx = (void *) pg_cipher_ctx_create(PG_CIPHER_AES_CTR,
+ (unsigned char *) key->key,
+ (key->klen), true);
+
+ BufDecCtx = (void *) pg_cipher_ctx_create(PG_CIPHER_AES_CTR,
+ (unsigned char *) key->key,
+ (key->klen), false);
+ }
+ if (!BufEncCtx)
+ elog(ERROR, "cannot initialize encryption context");
+
+ if (!BufDecCtx)
+ elog(ERROR, "cannot initialize decryption context");
+}
+
+/* Encrypt the given page with the relation key */
+void
+EncryptPage(Page page, BlockNumber blkno)
+{
+ unsigned char *ptr = (unsigned char *) page + PageEncryptOffset;
+
+ int enclen;
+
+ Assert(BufEncCtx != NULL);
+
+ set_buffer_encryption_iv(page, blkno);
+ if (CheckIsSM4Method())
+ {
+ /* SM4 OFB mode uses the encryption context, not the decryption context */
+ sm4_ofb_cipher(
+ (sm4_ctx *) BufEncCtx,
+ ptr,
+ (const unsigned char *) ptr,
+ SizeOfPageEncryption,
+ buf_encryption_iv);
+ }
+ else
+ {
+ if (unlikely(!pg_cipher_encrypt((PgCipherCtx *) BufEncCtx, PG_CIPHER_AES_CTR,
+ (const unsigned char *) ptr, /* input */
+ SizeOfPageEncryption,
+ ptr, /* output */
+ &enclen, /* resulting length */
+ buf_encryption_iv, /* iv */
+ BUFENC_IV_SIZE,
+ NULL, 0)))
+ elog(ERROR, "cannot encrypt page %u", blkno);
+
+ Assert(enclen == SizeOfPageEncryption);
+ }
+}
+
+/* Decrypt the given page with the relation key */
+void
+DecryptPage(Page page, BlockNumber blkno)
+{
+ unsigned char *ptr = (unsigned char *) page + PageEncryptOffset;
+ int enclen;
+
+ Assert(BufDecCtx != NULL);
+
+ set_buffer_encryption_iv(page, blkno);
+ if (CheckIsSM4Method())
+ {
+ /* SM4 OFB mode uses the encryption context, not the decryption context */
+ sm4_ofb_cipher(
+ (sm4_ctx *) BufEncCtx,
+ ptr,
+ (const unsigned char *) ptr,
+ SizeOfPageEncryption,
+ buf_encryption_iv);
+ }
+ else
+ {
+ if (unlikely(!pg_cipher_decrypt((PgCipherCtx *) BufDecCtx, PG_CIPHER_AES_CTR,
+ (const unsigned char *) ptr, /* input */
+ SizeOfPageEncryption,
+ ptr, /* output */
+ &enclen, /* resulting length */
+ buf_encryption_iv, /* iv */
+ BUFENC_IV_SIZE,
+ NULL, 0)))
+ elog(ERROR, "cannot decrypt page %u", blkno);
+
+ Assert(enclen == SizeOfPageEncryption);
+ }
+}
+
+/* Construct iv for the given page */
+static void
+set_buffer_encryption_iv(Page page, BlockNumber blkno)
+{
+ unsigned char *p = buf_encryption_iv;
+
+ MemSet(buf_encryption_iv, 0, BUFENC_IV_SIZE);
+
+ /* block number (4 bytes) */
+ memcpy(p, &blkno, sizeof(BlockNumber));
+ p += sizeof(BlockNumber);
+
+ /*
+ * fill the remaining 12 bytes
+ */
+ for (int i = 0 ; i < 12; i++)
+ *p++ = 0x80;
+}
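For reference, a minimal standalone sketch of the 16-byte IV that set_buffer_encryption_iv builds: the block number in native byte order in the first four bytes, then twelve 0x80 pad bytes. Note this differs from the LSN-based IV described in the crypto README; build_page_iv and main below are illustrative names, not part of the patch.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define BUFENC_IV_SIZE 16

/* Build the same 16-byte IV as set_buffer_encryption_iv: a 4-byte
 * native-byte-order block number followed by twelve 0x80 pad bytes. */
static void
build_page_iv(uint32_t blkno, unsigned char iv[BUFENC_IV_SIZE])
{
	memset(iv, 0, BUFENC_IV_SIZE);
	memcpy(iv, &blkno, sizeof(blkno));	/* block number (4 bytes) */
	for (int i = sizeof(blkno); i < BUFENC_IV_SIZE; i++)
		iv[i] = 0x80;			/* fixed padding */
}

int
main(void)
{
	unsigned char iv[BUFENC_IV_SIZE];

	build_page_iv(42, iv);
	for (int i = 0; i < BUFENC_IV_SIZE; i++)
		printf("%02x", iv[i]);
	putchar('\n');		/* e.g. 2a000000808080... on little-endian */
	return 0;
}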
+/* Construct iv for the given AO block */
+static void
+set_buffer_encryption_iv_for_ao(RelFileNode *file_node)
+{
+ unsigned char *p = buf_encryption_iv;
+
+ MemSet(buf_encryption_iv, 0, BUFENC_IV_SIZE);
+
+ /* copy the database OID from the file node (4 bytes) */
+ memcpy(p, &file_node->dbNode, sizeof(file_node->dbNode));
+ p += sizeof(file_node->dbNode);
+
+ /*
+ * fill the remaining 12 bytes
+ */
+ for (int i = 0 ; i < 12; i++)
+ *p++ = 0x80;
+}
+
+void
+EncryptAOBLock(unsigned char *data_buf, const int buf_len,
+ RelFileNode *file_node)
+{
+ int enclen;
+ Assert(BufEncCtx != NULL);
+
+ set_buffer_encryption_iv_for_ao(file_node);
+ if (CheckIsSM4Method())
+ {
+ sm4_ofb_cipher(
+ (sm4_ctx *) BufEncCtx,
+ data_buf,
+ (const unsigned char *) data_buf,
+ buf_len,
+ buf_encryption_iv);
+ }
+ else
+ {
+ if (unlikely(!pg_cipher_encrypt((PgCipherCtx *) BufEncCtx, PG_CIPHER_AES_CTR,
+ (const unsigned char *) data_buf, /* input */
+ buf_len,
+ data_buf, /* output */
+ &enclen, /* resulting length */
+ buf_encryption_iv, /* iv */
+ BUFENC_IV_SIZE,
+ NULL, 0)))
+ elog(ERROR, "cannot encrypt AO block");
+
+ Assert(buf_len == enclen);
+ }
+}
+
+/* Decrypt the given AO block with the relation key */
+void
+DecryptAOBlock(unsigned char *data_buf, const int buf_len,
+ RelFileNode *file_node)
+{
+ int enclen;
+ Assert(BufDecCtx != NULL);
+
+ set_buffer_encryption_iv_for_ao(file_node);
+ if (CheckIsSM4Method())
+ {
+ /* SM4 OFB mode uses the encryption context, not the decryption context */
+ sm4_ofb_cipher(
+ (sm4_ctx *) BufEncCtx,
+ data_buf,
+ (const unsigned char *) data_buf,
+ buf_len,
+ buf_encryption_iv);
+ }
+ else
+ {
+ if (unlikely(!pg_cipher_decrypt((PgCipherCtx *) BufDecCtx, PG_CIPHER_AES_CTR,
+ (const unsigned char *) data_buf, /* input */
+ buf_len,
+ data_buf, /* output */
+ &enclen, /* resulting length */
+ buf_encryption_iv, /* iv */
+ BUFENC_IV_SIZE,
+ NULL, 0)))
+ elog(ERROR, "cannot decrypt AO block");
+
+ Assert(enclen == buf_len);
+ }
+}
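A hypothetical caller of the two AO-block routines above might look like the sketch below; ao_write_encrypted and ao_read_decrypted are illustrative names, and the sketch assumes InitializeBufferEncryption has already set up the cipher contexts. Because both AES-CTR and SM4-OFB are keystream modes, encryption and decryption are the same in-place XOR transform, which is why the single IV derived from the database OID round-trips the block.

/* Illustrative only: encrypt an AO block in place before writing it,
 * and decrypt it in place after reading it back. */
static void
ao_write_encrypted(int fd, unsigned char *block, int len, RelFileNode *node)
{
	if (FileEncryptionEnabled)
		EncryptAOBLock(block, len, node);	/* in-place encryption */
	/* ... write(fd, block, len) and error handling elided ... */
}

static void
ao_read_decrypted(int fd, unsigned char *block, int len, RelFileNode *node)
{
	/* ... read(fd, block, len) elided ... */
	if (FileEncryptionEnabled)
		DecryptAOBlock(block, len, node);	/* same IV, derived from dbNode */
}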
diff --git a/src/backend/crypto/ckey_aws.sh.sample b/src/backend/crypto/ckey_aws.sh.sample
new file mode 100644
index 00000000000..d9bee531321
--- /dev/null
+++ b/src/backend/crypto/ckey_aws.sh.sample
@@ -0,0 +1,53 @@
+#!/bin/sh
+
+# This retrieves the cluster key from AWS Secrets Manager using the AWS CLI and OpenSSL.
+# This stores the AWS secret Id in $DIR.
+# Do not create any file with extension "wkey" in $DIR; these are
+# reserved for wrapped data key files.
+
+[ "$#" -ne 1 ] && echo "cluster_key_command usage: $0 \"%d\"" 1>&2 && exit 1
+# No need for %R or -R since we are not prompting
+
+DIR="$1"
+[ ! -e "$DIR" ] && echo "$DIR does not exist" 1>&2 && exit 1
+[ ! -d "$DIR" ] && echo "$DIR is not a directory" 1>&2 && exit 1
+
+# File containing the id of the AWS secret
+AWS_ID_FILE="$DIR/aws-secret.id"
+
+
+# ----------------------------------------------------------------------
+
+
+# Create an AWS Secrets Manager secret?
+if [ ! -e "$AWS_ID_FILE" ]
+then	# The 'postgres' operating system user must have permission to
+	# access the AWS CLI
+
+	# The epoch-time/directory/hostname combination is unique
+	HASH=$(echo -n "$(date '+%s')$DIR$(hostname)" | sha1sum | cut -d' ' -f1)
+	AWS_SECRET_ID="Postgres-cluster-key-$HASH"
+
+	# Use stdin to avoid passing the secret on the command line
+	openssl rand -hex 32 |
+	aws secretsmanager create-secret \
+		--name "$AWS_SECRET_ID" \
+		--description "Postgres cluster file encryption on $(hostname)" \
+		--secret-string 'file:///dev/stdin' \
+		--output text > /dev/null
+	if [ "$?" -ne 0 ]
+	then	echo 'cluster key generation failed' 1>&2
+		exit 1
+	fi
+
+	echo "$AWS_SECRET_ID" > "$AWS_ID_FILE"
+fi
+
+if ! aws secretsmanager get-secret-value \
+	--secret-id "$(cat "$AWS_ID_FILE")" \
+	--output text
+then	echo 'cluster key retrieval failed' 1>&2
+	exit 1
+fi | awk -F'\t' 'NR == 1 {print $4}'
+
+exit 0
diff --git a/src/backend/crypto/ckey_direct.sh.sample b/src/backend/crypto/ckey_direct.sh.sample
new file mode 100644
index 00000000000..492defcffed
--- /dev/null
+++ b/src/backend/crypto/ckey_direct.sh.sample
@@ -0,0 +1,39 @@
+#!/bin/sh
+
+# This uses a 64-character hex key supplied by the user.
+# If OpenSSL is installed, you can generate a pseudo-random key by running:
+#	openssl rand -hex 32
+# To get a true random key, run:
+#	wget -q -O - 'https://www.random.org/cgi-bin/randbyte?nbytes=32&format=h' | tr -d ' \n'; echo
+# Do not create any file with extension "wkey" in $DIR; these are
+# reserved for wrapped data key files.
+
+[ "$#" -lt 1 ] && echo "cluster_key_command usage: $0 %R [%p]" 1>&2 && exit 1
+# Supports environment variable PROMPT
+
+FD="$1"
+[ ! -t "$FD" ] && echo "file descriptor $FD does not refer to a terminal" 1>&2 && exit 1
+
+[ "$2" ] && PROMPT="$2"
+
+
+# ----------------------------------------------------------------------
+
+[ ! "$PROMPT" ] && PROMPT='Enter cluster key as 64 hexadecimal characters: '
+
+stty -echo <&"$FD"
+
+echo 1>&"$FD"
+echo -n "$PROMPT" 1>&"$FD"
+read KEY <&"$FD"
+
+stty echo <&"$FD"
+
+if [ "$(expr "$KEY" : '[0-9a-fA-F]*$')" -ne 64 ]
+then	echo 'invalid; must be 64 hexadecimal characters' 1>&2
+	exit 1
+fi
+
+echo "$KEY"
+
+exit 0
diff --git a/src/backend/crypto/ckey_passphrase.sh.sample b/src/backend/crypto/ckey_passphrase.sh.sample
new file mode 100644
index 00000000000..a5d837b45ec
--- /dev/null
+++ b/src/backend/crypto/ckey_passphrase.sh.sample
@@ -0,0 +1,35 @@
+#!/bin/sh
+
+# This uses a passphrase supplied by the user.
+# Do not create any file with extension "wkey" in $DIR; these are
+# reserved for wrapped data key files.
+
+[ "$#" -lt 1 ] && echo "cluster_key_command usage: $0 %R [\"%p\"]" 1>&2 && exit 1
+
+FD="$1"
+[ ! -t "$FD" ] && echo "file descriptor $FD does not refer to a terminal" 1>&2 && exit 1
+# Supports environment variable PROMPT
+
+[ "$2" ] && PROMPT="$2"
+
+
+# ----------------------------------------------------------------------
+
+[ ! "$PROMPT" ] && PROMPT='Enter cluster passphrase: '
+
+stty -echo <&"$FD"
+
+echo 1>&"$FD"
+echo -n "$PROMPT" 1>&"$FD"
+read PASS <&"$FD"
+
+stty echo <&"$FD"
+
+if [ ! "$PASS" ]
+then	echo 'invalid: empty passphrase' 1>&2
+	exit 1
+fi
+
+echo "$PASS" | sha256sum | cut -d' ' -f1
+
+exit 0
diff --git a/src/backend/crypto/ckey_piv_nopin.sh.sample b/src/backend/crypto/ckey_piv_nopin.sh.sample
new file mode 100644
index 00000000000..e90a579deac
--- /dev/null
+++ b/src/backend/crypto/ckey_piv_nopin.sh.sample
@@ -0,0 +1,68 @@
+#!/bin/sh
+
+# This uses the public/private keys on a PIV device, like a CAC or Yubikey.
+# It uses a PIN stored in a file.
+# It uses OpenSSL with PKCS11 enabled via OpenSC.
+# This stores the cluster encryption key encrypted with the PIV public
+# key in $DIR. This is technically a three-level encryption
+# architecture, with the third level requiring the PIV and PIN.
+# Do not create any file with extension "wkey" in $DIR; these are
+# reserved for wrapped data key files.
+
+[ "$#" -ne 1 ] && echo "cluster_key_command usage: $0 \"%d\"" 1>&2 && exit 1
+# Supports environment variable PIV_PIN_FILE
+# No need for %R or -R since we are not prompting for a PIN
+
+DIR="$1"
+[ ! -e "$DIR" ] && echo "$DIR does not exist" 1>&2 && exit 1
+[ !
-d "$DIR" ] && echo "$DIR is not a directory" 1>&2 && exit 1 + +# Set these here or pass in as environment variables. +# File that stores the PIN to unlock the PIV +#PIV_PIN_FILE='' +# PIV slot 3 is the "Key Management" slot, so we use '0:3' +PIV_SLOT='0:3' + +# File containing the cluster key encrypted with the PIV_SLOT's public key +KEY_FILE="$DIR/pivpass.key" + + +# ---------------------------------------------------------------------- + +[ ! "$PIV_PIN_FILE" ] && echo 'PIV_PIN_FILE undefined' 1>&2 && exit 1 +[ ! -e "$PIV_PIN_FILE" ] && echo "$PIV_PIN_FILE does not exist" 1>&2 && exit 1 +[ -d "$PIV_PIN_FILE" ] && echo "$PIV_PIN_FILE is a directory" 1>&2 && exit 1 + +[ ! "$KEY_FILE" ] && echo 'KEY_FILE undefined' 1>&2 && exit 1 +[ -d "$KEY_FILE" ] && echo "$KEY_FILE is a directory" 1>&2 && exit 1 + +# Create a cluster key encrypted with the PIV_SLOT's public key? +if [ ! -e "$KEY_FILE" ] +then # The 'postgres' operating system user must have permission to + # access the PIV device. + + openssl rand -hex 32 | + if ! openssl rsautl -engine pkcs11 -keyform engine -encrypt \ + -inkey "$PIV_SLOT" -passin file:"$PIV_PIN_FILE" -out "$KEY_FILE" + then echo 'cluster key generation failed' 1>&2 + exit 1 + fi + + # Warn the user to save the cluster key in a safe place + cat 1>&2 <&2 + exit 1 +fi + +exit 0 diff --git a/src/backend/crypto/ckey_piv_pin.sh.sample b/src/backend/crypto/ckey_piv_pin.sh.sample new file mode 100644 index 00000000000..e693ac31ba7 --- /dev/null +++ b/src/backend/crypto/ckey_piv_pin.sh.sample @@ -0,0 +1,81 @@ +#!/bin/sh + +# This uses the public/private keys on a PIV device, like a CAC or Yubikey. +# It requires a user-entered PIN. +# It uses OpenSSL with PKCS11 enabled via OpenSC. +# This stores the cluster encryption key encrypted with the PIV public +# key in $DIR. This is technically a three-level encryption +# architecture, with the third level requiring the PIV and PIN. +# Do not create any fie with extension "wkey" in $DIR; these are +# reserved for wrapped data key files. + +[ "$#" -lt 2 ] && echo "cluster_key_command usage: $0 \"%d\" %R [\"%p\"]" 1>&2 && exit 1 +# Supports environment variable PROMPT + +DIR="$1" +[ ! -e "$DIR" ] && echo "$DIR does not exist" 1>&2 && exit 1 +[ ! -d "$DIR" ] && echo "$DIR is not a directory" 1>&2 && exit 1 + +FD="$2" +[ ! -t "$FD" ] && echo "file descriptor $FD does not refer to a terminal" 1>&2 && exit 1 + +[ "$3" ] && PROMPT="$3" + +# PIV slot 3 is the "Key Management" slot, so we use '0:3' +PIV_SLOT='0:3' + +# File containing the cluster key encrypted with the PIV_SLOT's public key +KEY_FILE="$DIR/pivpass.key" + + +# ---------------------------------------------------------------------- + +[ ! "$PROMPT" ] && PROMPT='Enter PIV PIN: ' + +stty -echo <&"$FD" + +# Create a cluster key encrypted with the PIV_SLOT's public key? +if [ ! -e "$KEY_FILE" ] +then echo 1>&"$FD" + echo -n "$PROMPT" 1>&"$FD" + + # The 'postgres' operating system user must have permission to + # access the PIV device. + + openssl rand -hex 32 | + # 'engine "pkcs11" set.' message confuses prompting + if ! openssl rsautl -engine pkcs11 -keyform engine -encrypt \ + -inkey "$PIV_SLOT" -passin fd:"$FD" -out "$KEY_FILE" 2>&1 + then stty echo <&"$FD" + echo 'cluster key generation failed' 1>&2 + exit 1 + fi | grep -v 'engine "pkcs11" set\.' + + echo 1>&"$FD" + + # Warn the user to save the cluster key in a safe place + cat 1>&"$FD" <&"$FD" +echo -n "$PROMPT" 1>&"$FD" + +# Decrypt the cluster key encrypted with the PIV_SLOT's public key +if ! 
openssl rsautl -engine pkcs11 -keyform engine -decrypt \
+	-inkey "$PIV_SLOT" -passin fd:"$FD" -in "$KEY_FILE" 2>&1
+then	stty echo <&"$FD"
+	echo 'cluster key retrieval failed' 1>&2
+	exit 1
+fi | grep -v 'engine "pkcs11" set\.'
+
+echo 1>&"$FD"
+
+stty echo <&"$FD"
+
+exit 0
diff --git a/src/backend/crypto/kmgr.c b/src/backend/crypto/kmgr.c
new file mode 100644
index 00000000000..32f2366a9a5
--- /dev/null
+++ b/src/backend/crypto/kmgr.c
@@ -0,0 +1,445 @@
+/*-------------------------------------------------------------------------
+ *
+ * kmgr.c
+ * Cluster file encryption routines
+ *
+ * Cluster file encryption is enabled if the user requests it during initdb.
+ * During bootstrap, we generate data encryption keys, wrap them with the
+ * cluster-level key, and store them into each file located at KMGR_DIR.
+ * During startup, we decrypt all internal keys and load them to the shared
+ * memory. Internal keys in the shared memory are read-only. The wrapping
+ * and unwrapping key routines require the OpenSSL library.
+ *
+ * Copyright (c) 2021, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/crypto/kmgr.c
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+
+#include "access/xlog.h"
+#include "common/controldata_utils.h"
+#include "common/file_perm.h"
+#include "common/kmgr_utils.h"
+#include "common/sha2.h"
+#include "crypto/kmgr.h"
+#include "postmaster/postmaster.h"
+#include "storage/copydir.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/shmem.h"
+#include "utils/builtins.h"
+#include "utils/memutils.h"
+
+/* Struct stores file encryption keys in plaintext format */
+typedef struct KmgrShmemData
+{
+ CryptoKey intlKeys[KMGR_NUM_DATA_KEYS];
+} KmgrShmemData;
+
+static KmgrShmemData *KmgrShmem;
+
+/* GUC variables */
+char *cluster_key_command = NULL;
+bool tde_force_switch = true;
+
+CryptoKey bootstrap_keys[KMGR_NUM_DATA_KEYS];
+
+extern char *bootstrap_old_key_datadir;
+extern int bootstrap_file_encryption_method;
+
+static void bzeroKmgrKeys(int status, Datum arg);
+static void KmgrWriteCryptoKeys(const char *dir, unsigned char **keys, int *key_lens);
+static CryptoKey *generate_crypto_key(int len);
+
+static void
+InitializeFileEncryptionStatus(void)
+{
+ FileEncryptionEnabled = tde_force_switch &&
+ (GetFileEncryptionMethod() != DISABLED_ENCRYPTION_METHOD);
+}
+
+/*
+ * This function must be called ONCE during initdb. It creates the DEK
+ * files wrapped with the KEK supplied by kmgr_run_cluster_key_command().
+ * There is also an option for the keys to be copied from another cluster.
+ */
+void
+BootStrapKmgr(void)
+{
+ char live_path[MAXPGPATH];
+ unsigned char *keys_wrap[KMGR_NUM_DATA_KEYS];
+ int key_lens[KMGR_NUM_DATA_KEYS];
+ char cluster_key_hex[ALLOC_KMGR_CLUSTER_KEY_LEN];
+ int cluster_key_hex_len;
+ unsigned char cluster_key[KMGR_CLUSTER_KEY_LEN];
+
+ InitializeFileEncryptionStatus();
+
+ if (!FileEncryptionEnabled)
+ return;
+
+#ifndef USE_OPENSSL
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ (errmsg("cluster file encryption is not supported because OpenSSL is not supported by this build"),
+ errhint("Compile with --with-openssl to use this feature."))));
+#endif
+
+ snprintf(live_path, sizeof(live_path), "%s/%s", DataDir, LIVE_KMGR_DIR);
+
+ /*
+ * Copy cluster file encryption keys from an old cluster?
This is useful + * for pg_upgrade upgrades where the copied database files are already + * encrypted using the old cluster's DEK keys. + */ + if (bootstrap_old_key_datadir != NULL) + { + char old_key_dir[MAXPGPATH]; + + snprintf(old_key_dir, sizeof(old_key_dir), "%s/%s", + bootstrap_old_key_datadir, LIVE_KMGR_DIR); + copydir(old_key_dir, LIVE_KMGR_DIR, true); + } + /* create an empty directory */ + else + { + if (mkdir(LIVE_KMGR_DIR, pg_dir_create_mode) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create cluster file encryption directory \"%s\": %m", + LIVE_KMGR_DIR))); + } + + /* + * Get key encryption key (KEK) from the cluster_key command. The cluster + * key command might need to check for the existence of files in the live + * directory, e.g., PIV, so run this _after_ copying the directory in + * place. + */ + cluster_key_hex_len = kmgr_run_cluster_key_command(cluster_key_command, + cluster_key_hex, + ALLOC_KMGR_CLUSTER_KEY_LEN, + live_path, terminal_fd); + + /* decode supplied hex */ + if (hex_decode(cluster_key_hex, cluster_key_hex_len, + (char *) cluster_key) != + KMGR_CLUSTER_KEY_LEN) + ereport(ERROR, + (errmsg("cluster key must be %d hexadecimal characters", + KMGR_CLUSTER_KEY_LEN * 2))); + + /* We are not in copy mode? Generate new cluster file encryption keys. */ + if (bootstrap_old_key_datadir == NULL) + { + unsigned char *bootstrap_keys_wrap[KMGR_NUM_DATA_KEYS]; + int key_lens[KMGR_NUM_DATA_KEYS]; + PgCipherCtx *cluster_key_ctx; + + /* Create KEK encryption context */ + cluster_key_ctx = pg_cipher_ctx_create(PG_CIPHER_AES_KWP, cluster_key, + KMGR_CLUSTER_KEY_LEN, true); + if (!cluster_key_ctx) + elog(ERROR, "could not initialize encryption context"); + + /* Wrap data encryption keys (DEK) using the key encryption key (KEK) */ + for (int id = 0; id < KMGR_NUM_DATA_KEYS; id++) + { + CryptoKey *key; + int block_size = 0; + + /* generate a DEK */ + key = generate_crypto_key( + encryption_methods[bootstrap_file_encryption_method].bit_length / 8); + + /* output generated random string as hex, for testing */ + { + char str[MAXPGPATH]; + int out_len; + + out_len = hex_encode((char *) (key->key), key->klen, + str); + str[out_len] = '\0'; + } + + block_size = pg_cipher_blocksize(cluster_key_ctx); + elog(LOG, "block_size:%d", block_size); + bootstrap_keys_wrap[id] = palloc0(KMGR_MAX_KEY_LEN_BYTES + + block_size); + + /* wrap DEK with KEK */ + if (!kmgr_wrap_data_key(cluster_key_ctx, key, bootstrap_keys_wrap[id], &(key_lens[id]))) + { + pg_cipher_ctx_free(cluster_key_ctx); + elog(ERROR, "failed to wrap data encryption key"); + } + + /* remove DEK from memory */ + explicit_bzero(key, sizeof(CryptoKey)); + } + + /* Write data encryption keys to the disk */ + KmgrWriteCryptoKeys(LIVE_KMGR_DIR, bootstrap_keys_wrap, key_lens); + + pg_cipher_ctx_free(cluster_key_ctx); + } + + /* + * We are either decrypting keys we copied from an old cluster, or + * decrypting keys we just wrote above --- either way, we decrypt them + * here and store them in a file-scoped variable for use in later + * encrypting during bootstrap mode. 
+ */ + + /* Get the crypto keys from the live directory */ + kmgr_read_wrapped_data_keys(LIVE_KMGR_DIR, keys_wrap, key_lens); + + if (!kmgr_verify_cluster_key(cluster_key, keys_wrap, key_lens, bootstrap_keys)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("supplied cluster key does not match expected cluster_key"))); + + /* bzero DEK on exit */ + on_proc_exit(bzeroKmgrKeys, 0); + + /* bzero KEK */ + explicit_bzero(cluster_key_hex, cluster_key_hex_len); + explicit_bzero(cluster_key, KMGR_CLUSTER_KEY_LEN); +} + +/* Report shared-memory space needed by KmgrShmem */ +Size +KmgrShmemSize(void) +{ + if (!FileEncryptionEnabled) + return 0; + + return MAXALIGN(sizeof(KmgrShmemData)); +} + +/* Allocate and initialize key manager memory */ +void +KmgrShmemInit(void) +{ + bool found; + + if (!FileEncryptionEnabled) + return; + + KmgrShmem = (KmgrShmemData *) ShmemInitStruct("File encryption key manager", + KmgrShmemSize(), &found); + + /* bzero DEK on exit */ + on_shmem_exit(bzeroKmgrKeys, 0); +} + +/* + * Get cluster key and verify it, then get the data encryption keys. + * This function is called by postmaster at startup time. + */ +void +InitializeKmgr(void) +{ + unsigned char *keys_wrap[KMGR_NUM_DATA_KEYS]; + int key_lens[KMGR_NUM_DATA_KEYS]; + char cluster_key_hex[ALLOC_KMGR_CLUSTER_KEY_LEN]; + int cluster_key_hex_len; + struct stat buffer; + char live_path[MAXPGPATH]; + unsigned char cluster_key[KMGR_CLUSTER_KEY_LEN]; + + InitializeFileEncryptionStatus(); + + if (!FileEncryptionEnabled) + return; + +#ifndef USE_OPENSSL + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + (errmsg("cluster file encryption is not supported because OpenSSL is not supported by this build"), + errhint("Compile with --with-openssl to use this feature.")))); +#endif + + elog(DEBUG1, "starting up cluster file encryption manager"); + + if (stat(KMGR_DIR, &buffer) != 0 || !S_ISDIR(buffer.st_mode)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + (errmsg("cluster file encryption directory %s is missing", KMGR_DIR)))); + + if (stat(KMGR_DIR_PID, &buffer) == 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + (errmsg("cluster had a pg_alterckey failure that needs repair or pg_alterckey is running"), + errhint("Run pg_alterckey --repair or wait for it to complete.")))); + + /* + * We want OLD deleted since it allows access to the data encryption keys + * using the old cluster key. If NEW exists, it means either the new + * directory is partly written, or NEW wasn't renamed to LIVE --- in + * either case, it needs to be repaired. See src/bin/pg_alterckey/README + * for more details. + */ + if (stat(OLD_KMGR_DIR, &buffer) == 0 || stat(NEW_KMGR_DIR, &buffer) == 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + (errmsg("cluster had a pg_alterckey failure that needs repair"), + errhint("Run pg_alterckey --repair.")))); + + /* If OLD, NEW, and LIVE do not exist, there is a serious problem. 
*/ + if (stat(LIVE_KMGR_DIR, &buffer) != 0 || !S_ISDIR(buffer.st_mode)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + (errmsg("cluster has no data encryption keys")))); + + /* Get the cluster key (KEK) */ + snprintf(live_path, sizeof(live_path), "%s/%s", DataDir, LIVE_KMGR_DIR); + cluster_key_hex_len = kmgr_run_cluster_key_command(cluster_key_command, + cluster_key_hex, + ALLOC_KMGR_CLUSTER_KEY_LEN, + live_path, terminal_fd); + + /* decode supplied hex */ + if (hex_decode(cluster_key_hex, cluster_key_hex_len, + (char *) cluster_key) != + KMGR_CLUSTER_KEY_LEN) + ereport(ERROR, + (errmsg("cluster key must be %d hexadecimal characters", + KMGR_CLUSTER_KEY_LEN * 2))); + + /* Load wrapped DEKs from their files into an array */ + kmgr_read_wrapped_data_keys(LIVE_KMGR_DIR, keys_wrap, key_lens); + + /* + * Verify cluster key and store the unwrapped data encryption keys in + * shared memory. + */ + if (!kmgr_verify_cluster_key(cluster_key, keys_wrap, key_lens, KmgrShmem->intlKeys)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("supplied cluster key does not match expected cluster key"))); + + /* Check that retrieved key lengths match controldata length. */ + for (int id = 0; id < KMGR_NUM_DATA_KEYS; id++) + if (KmgrShmem->intlKeys[id].klen * 8 != + encryption_methods[GetFileEncryptionMethod()].bit_length) + { + char path[MAXPGPATH]; + + CryptoKeyFilePath(path, DataDir, id); + + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("data encryption key %s of length %d does not match controldata key length %d", + path, KmgrShmem->intlKeys[id].klen * 8, + encryption_methods[GetFileEncryptionMethod()].bit_length))); + } + + /* bzero KEK */ + explicit_bzero(cluster_key_hex, cluster_key_hex_len); + explicit_bzero(cluster_key, KMGR_CLUSTER_KEY_LEN); +} + +static void +bzeroKmgrKeys(int status, Datum arg) +{ + if (IsBootstrapProcessingMode()) + explicit_bzero(bootstrap_keys, sizeof(bootstrap_keys)); + else + explicit_bzero(KmgrShmem->intlKeys, sizeof(KmgrShmem->intlKeys)); +} + +/* return requested DEK */ +const CryptoKey * +KmgrGetKey(int id) +{ + Assert(id < KMGR_NUM_DATA_KEYS); + + return (const CryptoKey *) (IsBootstrapProcessingMode() ? + &(bootstrap_keys[id]) : &(KmgrShmem->intlKeys[id])); +} + +bool CheckIsSM4Method(void) +{ + if (strcmp("SM4", encryption_methods[GetFileEncryptionMethod()].name) == 0) + return true; + + return false; + +} + +/* Generate a DEK inside a CryptoKey */ +static CryptoKey * +generate_crypto_key(int len) +{ + CryptoKey *newkey; + + Assert(len <= KMGR_MAX_KEY_LEN); + newkey = (CryptoKey *) palloc0(sizeof(CryptoKey)); + + newkey->klen = len; + + if (!pg_strong_random(newkey->key, len)) + elog(ERROR, "failed to generate new file encryption key"); + + return newkey; +} + +/* + * Write the DEKs to the disk. 
+ */ +static void +KmgrWriteCryptoKeys(const char *dir, unsigned char **keys, int *key_lens) +{ + elog(DEBUG2, "writing data encryption keys wrapped using the cluster key"); + + for (int i = 0; i < KMGR_NUM_DATA_KEYS; i++) + { + int fd; + char path[MAXPGPATH]; + + CryptoKeyFilePath(path, dir, i); + + if ((fd = BasicOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY)) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + path))); + + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_KEY_FILE_WRITE); + if (write(fd, keys[i], key_lens[i]) != key_lens[i]) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", + path))); + } + pgstat_report_wait_end(); + + pgstat_report_wait_start(WAIT_EVENT_KEY_FILE_SYNC); + if (pg_fsync(fd) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", + path))); + pgstat_report_wait_end(); + + if (close(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", + path))); + } +} diff --git a/src/backend/crypto/sm4.c b/src/backend/crypto/sm4.c new file mode 100644 index 00000000000..68a9a129287 --- /dev/null +++ b/src/backend/crypto/sm4.c @@ -0,0 +1,464 @@ +#include "postgres.h" +#include +#include "crypto/sm4.h" + +static const uint8_t SM4_S[256] = { + 0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, + 0x28, 0xFB, 0x2C, 0x05, 0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, + 0xAA, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99, 0x9C, 0x42, 0x50, 0xF4, + 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54, 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62, + 0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95, 0x80, 0xDF, 0x94, 0xFA, + 0x75, 0x8F, 0x3F, 0xA6, 0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73, 0x17, 0xBA, + 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8, 0x68, 0x6B, 0x81, 0xB2, + 0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35, + 0x1E, 0x24, 0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, + 0x01, 0x21, 0x78, 0x87, 0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52, + 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, 0xC8, 0x9E, 0xEA, 0xBF, 0x8A, 0xD2, + 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE, 0xF9, 0x61, 0x15, 0xA1, + 0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55, 0xAD, 0x93, 0x32, 0x30, + 0xF5, 0x8C, 0xB1, 0xE3, 0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60, + 0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F, 0xD5, 0xDB, 0x37, 0x45, + 0xDE, 0xFD, 0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51, + 0x8D, 0x1B, 0xAF, 0x92, 0xBB, 0xDD, 0xBC, 0x7F, 0x11, 0xD9, 0x5C, 0x41, + 0x1F, 0x10, 0x5A, 0xD8, 0x0A, 0xC1, 0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, + 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0, 0x89, 0x69, 0x97, 0x4A, + 0x0C, 0x96, 0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, 0xC6, 0x84, + 0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, + 0xD7, 0xCB, 0x39, 0x48 +}; + +/* + * SM4_SBOX_T[j] == L(SM4_SBOX[j]). 
+ */ +static const uint32_t SM4_SBOX_T0[256] = { + 0x8ED55B5B, 0xD0924242, 0x4DEAA7A7, 0x06FDFBFB, 0xFCCF3333, 0x65E28787, + 0xC93DF4F4, 0x6BB5DEDE, 0x4E165858, 0x6EB4DADA, 0x44145050, 0xCAC10B0B, + 0x8828A0A0, 0x17F8EFEF, 0x9C2CB0B0, 0x11051414, 0x872BACAC, 0xFB669D9D, + 0xF2986A6A, 0xAE77D9D9, 0x822AA8A8, 0x46BCFAFA, 0x14041010, 0xCFC00F0F, + 0x02A8AAAA, 0x54451111, 0x5F134C4C, 0xBE269898, 0x6D482525, 0x9E841A1A, + 0x1E061818, 0xFD9B6666, 0xEC9E7272, 0x4A430909, 0x10514141, 0x24F7D3D3, + 0xD5934646, 0x53ECBFBF, 0xF89A6262, 0x927BE9E9, 0xFF33CCCC, 0x04555151, + 0x270B2C2C, 0x4F420D0D, 0x59EEB7B7, 0xF3CC3F3F, 0x1CAEB2B2, 0xEA638989, + 0x74E79393, 0x7FB1CECE, 0x6C1C7070, 0x0DABA6A6, 0xEDCA2727, 0x28082020, + 0x48EBA3A3, 0xC1975656, 0x80820202, 0xA3DC7F7F, 0xC4965252, 0x12F9EBEB, + 0xA174D5D5, 0xB38D3E3E, 0xC33FFCFC, 0x3EA49A9A, 0x5B461D1D, 0x1B071C1C, + 0x3BA59E9E, 0x0CFFF3F3, 0x3FF0CFCF, 0xBF72CDCD, 0x4B175C5C, 0x52B8EAEA, + 0x8F810E0E, 0x3D586565, 0xCC3CF0F0, 0x7D196464, 0x7EE59B9B, 0x91871616, + 0x734E3D3D, 0x08AAA2A2, 0xC869A1A1, 0xC76AADAD, 0x85830606, 0x7AB0CACA, + 0xB570C5C5, 0xF4659191, 0xB2D96B6B, 0xA7892E2E, 0x18FBE3E3, 0x47E8AFAF, + 0x330F3C3C, 0x674A2D2D, 0xB071C1C1, 0x0E575959, 0xE99F7676, 0xE135D4D4, + 0x661E7878, 0xB4249090, 0x360E3838, 0x265F7979, 0xEF628D8D, 0x38596161, + 0x95D24747, 0x2AA08A8A, 0xB1259494, 0xAA228888, 0x8C7DF1F1, 0xD73BECEC, + 0x05010404, 0xA5218484, 0x9879E1E1, 0x9B851E1E, 0x84D75353, 0x00000000, + 0x5E471919, 0x0B565D5D, 0xE39D7E7E, 0x9FD04F4F, 0xBB279C9C, 0x1A534949, + 0x7C4D3131, 0xEE36D8D8, 0x0A020808, 0x7BE49F9F, 0x20A28282, 0xD4C71313, + 0xE8CB2323, 0xE69C7A7A, 0x42E9ABAB, 0x43BDFEFE, 0xA2882A2A, 0x9AD14B4B, + 0x40410101, 0xDBC41F1F, 0xD838E0E0, 0x61B7D6D6, 0x2FA18E8E, 0x2BF4DFDF, + 0x3AF1CBCB, 0xF6CD3B3B, 0x1DFAE7E7, 0xE5608585, 0x41155454, 0x25A38686, + 0x60E38383, 0x16ACBABA, 0x295C7575, 0x34A69292, 0xF7996E6E, 0xE434D0D0, + 0x721A6868, 0x01545555, 0x19AFB6B6, 0xDF914E4E, 0xFA32C8C8, 0xF030C0C0, + 0x21F6D7D7, 0xBC8E3232, 0x75B3C6C6, 0x6FE08F8F, 0x691D7474, 0x2EF5DBDB, + 0x6AE18B8B, 0x962EB8B8, 0x8A800A0A, 0xFE679999, 0xE2C92B2B, 0xE0618181, + 0xC0C30303, 0x8D29A4A4, 0xAF238C8C, 0x07A9AEAE, 0x390D3434, 0x1F524D4D, + 0x764F3939, 0xD36EBDBD, 0x81D65757, 0xB7D86F6F, 0xEB37DCDC, 0x51441515, + 0xA6DD7B7B, 0x09FEF7F7, 0xB68C3A3A, 0x932FBCBC, 0x0F030C0C, 0x03FCFFFF, + 0xC26BA9A9, 0xBA73C9C9, 0xD96CB5B5, 0xDC6DB1B1, 0x375A6D6D, 0x15504545, + 0xB98F3636, 0x771B6C6C, 0x13ADBEBE, 0xDA904A4A, 0x57B9EEEE, 0xA9DE7777, + 0x4CBEF2F2, 0x837EFDFD, 0x55114444, 0xBDDA6767, 0x2C5D7171, 0x45400505, + 0x631F7C7C, 0x50104040, 0x325B6969, 0xB8DB6363, 0x220A2828, 0xC5C20707, + 0xF531C4C4, 0xA88A2222, 0x31A79696, 0xF9CE3737, 0x977AEDED, 0x49BFF6F6, + 0x992DB4B4, 0xA475D1D1, 0x90D34343, 0x5A124848, 0x58BAE2E2, 0x71E69797, + 0x64B6D2D2, 0x70B2C2C2, 0xAD8B2626, 0xCD68A5A5, 0xCB955E5E, 0x624B2929, + 0x3C0C3030, 0xCE945A5A, 0xAB76DDDD, 0x867FF9F9, 0xF1649595, 0x5DBBE6E6, + 0x35F2C7C7, 0x2D092424, 0xD1C61717, 0xD66FB9B9, 0xDEC51B1B, 0x94861212, + 0x78186060, 0x30F3C3C3, 0x897CF5F5, 0x5CEFB3B3, 0xD23AE8E8, 0xACDF7373, + 0x794C3535, 0xA0208080, 0x9D78E5E5, 0x56EDBBBB, 0x235E7D7D, 0xC63EF8F8, + 0x8BD45F5F, 0xE7C82F2F, 0xDD39E4E4, 0x68492121 }; + +static uint32_t SM4_SBOX_T1[256] = { + 0x5B8ED55B, 0x42D09242, 0xA74DEAA7, 0xFB06FDFB, 0x33FCCF33, 0x8765E287, + 0xF4C93DF4, 0xDE6BB5DE, 0x584E1658, 0xDA6EB4DA, 0x50441450, 0x0BCAC10B, + 0xA08828A0, 0xEF17F8EF, 0xB09C2CB0, 0x14110514, 0xAC872BAC, 0x9DFB669D, + 0x6AF2986A, 0xD9AE77D9, 0xA8822AA8, 0xFA46BCFA, 0x10140410, 0x0FCFC00F, + 
0xAA02A8AA, 0x11544511, 0x4C5F134C, 0x98BE2698, 0x256D4825, 0x1A9E841A, + 0x181E0618, 0x66FD9B66, 0x72EC9E72, 0x094A4309, 0x41105141, 0xD324F7D3, + 0x46D59346, 0xBF53ECBF, 0x62F89A62, 0xE9927BE9, 0xCCFF33CC, 0x51045551, + 0x2C270B2C, 0x0D4F420D, 0xB759EEB7, 0x3FF3CC3F, 0xB21CAEB2, 0x89EA6389, + 0x9374E793, 0xCE7FB1CE, 0x706C1C70, 0xA60DABA6, 0x27EDCA27, 0x20280820, + 0xA348EBA3, 0x56C19756, 0x02808202, 0x7FA3DC7F, 0x52C49652, 0xEB12F9EB, + 0xD5A174D5, 0x3EB38D3E, 0xFCC33FFC, 0x9A3EA49A, 0x1D5B461D, 0x1C1B071C, + 0x9E3BA59E, 0xF30CFFF3, 0xCF3FF0CF, 0xCDBF72CD, 0x5C4B175C, 0xEA52B8EA, + 0x0E8F810E, 0x653D5865, 0xF0CC3CF0, 0x647D1964, 0x9B7EE59B, 0x16918716, + 0x3D734E3D, 0xA208AAA2, 0xA1C869A1, 0xADC76AAD, 0x06858306, 0xCA7AB0CA, + 0xC5B570C5, 0x91F46591, 0x6BB2D96B, 0x2EA7892E, 0xE318FBE3, 0xAF47E8AF, + 0x3C330F3C, 0x2D674A2D, 0xC1B071C1, 0x590E5759, 0x76E99F76, 0xD4E135D4, + 0x78661E78, 0x90B42490, 0x38360E38, 0x79265F79, 0x8DEF628D, 0x61385961, + 0x4795D247, 0x8A2AA08A, 0x94B12594, 0x88AA2288, 0xF18C7DF1, 0xECD73BEC, + 0x04050104, 0x84A52184, 0xE19879E1, 0x1E9B851E, 0x5384D753, 0x00000000, + 0x195E4719, 0x5D0B565D, 0x7EE39D7E, 0x4F9FD04F, 0x9CBB279C, 0x491A5349, + 0x317C4D31, 0xD8EE36D8, 0x080A0208, 0x9F7BE49F, 0x8220A282, 0x13D4C713, + 0x23E8CB23, 0x7AE69C7A, 0xAB42E9AB, 0xFE43BDFE, 0x2AA2882A, 0x4B9AD14B, + 0x01404101, 0x1FDBC41F, 0xE0D838E0, 0xD661B7D6, 0x8E2FA18E, 0xDF2BF4DF, + 0xCB3AF1CB, 0x3BF6CD3B, 0xE71DFAE7, 0x85E56085, 0x54411554, 0x8625A386, + 0x8360E383, 0xBA16ACBA, 0x75295C75, 0x9234A692, 0x6EF7996E, 0xD0E434D0, + 0x68721A68, 0x55015455, 0xB619AFB6, 0x4EDF914E, 0xC8FA32C8, 0xC0F030C0, + 0xD721F6D7, 0x32BC8E32, 0xC675B3C6, 0x8F6FE08F, 0x74691D74, 0xDB2EF5DB, + 0x8B6AE18B, 0xB8962EB8, 0x0A8A800A, 0x99FE6799, 0x2BE2C92B, 0x81E06181, + 0x03C0C303, 0xA48D29A4, 0x8CAF238C, 0xAE07A9AE, 0x34390D34, 0x4D1F524D, + 0x39764F39, 0xBDD36EBD, 0x5781D657, 0x6FB7D86F, 0xDCEB37DC, 0x15514415, + 0x7BA6DD7B, 0xF709FEF7, 0x3AB68C3A, 0xBC932FBC, 0x0C0F030C, 0xFF03FCFF, + 0xA9C26BA9, 0xC9BA73C9, 0xB5D96CB5, 0xB1DC6DB1, 0x6D375A6D, 0x45155045, + 0x36B98F36, 0x6C771B6C, 0xBE13ADBE, 0x4ADA904A, 0xEE57B9EE, 0x77A9DE77, + 0xF24CBEF2, 0xFD837EFD, 0x44551144, 0x67BDDA67, 0x712C5D71, 0x05454005, + 0x7C631F7C, 0x40501040, 0x69325B69, 0x63B8DB63, 0x28220A28, 0x07C5C207, + 0xC4F531C4, 0x22A88A22, 0x9631A796, 0x37F9CE37, 0xED977AED, 0xF649BFF6, + 0xB4992DB4, 0xD1A475D1, 0x4390D343, 0x485A1248, 0xE258BAE2, 0x9771E697, + 0xD264B6D2, 0xC270B2C2, 0x26AD8B26, 0xA5CD68A5, 0x5ECB955E, 0x29624B29, + 0x303C0C30, 0x5ACE945A, 0xDDAB76DD, 0xF9867FF9, 0x95F16495, 0xE65DBBE6, + 0xC735F2C7, 0x242D0924, 0x17D1C617, 0xB9D66FB9, 0x1BDEC51B, 0x12948612, + 0x60781860, 0xC330F3C3, 0xF5897CF5, 0xB35CEFB3, 0xE8D23AE8, 0x73ACDF73, + 0x35794C35, 0x80A02080, 0xE59D78E5, 0xBB56EDBB, 0x7D235E7D, 0xF8C63EF8, + 0x5F8BD45F, 0x2FE7C82F, 0xE4DD39E4, 0x21684921}; + +static uint32_t SM4_SBOX_T2[256] = { + 0x5B5B8ED5, 0x4242D092, 0xA7A74DEA, 0xFBFB06FD, 0x3333FCCF, 0x878765E2, + 0xF4F4C93D, 0xDEDE6BB5, 0x58584E16, 0xDADA6EB4, 0x50504414, 0x0B0BCAC1, + 0xA0A08828, 0xEFEF17F8, 0xB0B09C2C, 0x14141105, 0xACAC872B, 0x9D9DFB66, + 0x6A6AF298, 0xD9D9AE77, 0xA8A8822A, 0xFAFA46BC, 0x10101404, 0x0F0FCFC0, + 0xAAAA02A8, 0x11115445, 0x4C4C5F13, 0x9898BE26, 0x25256D48, 0x1A1A9E84, + 0x18181E06, 0x6666FD9B, 0x7272EC9E, 0x09094A43, 0x41411051, 0xD3D324F7, + 0x4646D593, 0xBFBF53EC, 0x6262F89A, 0xE9E9927B, 0xCCCCFF33, 0x51510455, + 0x2C2C270B, 0x0D0D4F42, 0xB7B759EE, 0x3F3FF3CC, 0xB2B21CAE, 0x8989EA63, + 0x939374E7, 0xCECE7FB1, 0x70706C1C, 0xA6A60DAB, 0x2727EDCA, 
0x20202808, + 0xA3A348EB, 0x5656C197, 0x02028082, 0x7F7FA3DC, 0x5252C496, 0xEBEB12F9, + 0xD5D5A174, 0x3E3EB38D, 0xFCFCC33F, 0x9A9A3EA4, 0x1D1D5B46, 0x1C1C1B07, + 0x9E9E3BA5, 0xF3F30CFF, 0xCFCF3FF0, 0xCDCDBF72, 0x5C5C4B17, 0xEAEA52B8, + 0x0E0E8F81, 0x65653D58, 0xF0F0CC3C, 0x64647D19, 0x9B9B7EE5, 0x16169187, + 0x3D3D734E, 0xA2A208AA, 0xA1A1C869, 0xADADC76A, 0x06068583, 0xCACA7AB0, + 0xC5C5B570, 0x9191F465, 0x6B6BB2D9, 0x2E2EA789, 0xE3E318FB, 0xAFAF47E8, + 0x3C3C330F, 0x2D2D674A, 0xC1C1B071, 0x59590E57, 0x7676E99F, 0xD4D4E135, + 0x7878661E, 0x9090B424, 0x3838360E, 0x7979265F, 0x8D8DEF62, 0x61613859, + 0x474795D2, 0x8A8A2AA0, 0x9494B125, 0x8888AA22, 0xF1F18C7D, 0xECECD73B, + 0x04040501, 0x8484A521, 0xE1E19879, 0x1E1E9B85, 0x535384D7, 0x00000000, + 0x19195E47, 0x5D5D0B56, 0x7E7EE39D, 0x4F4F9FD0, 0x9C9CBB27, 0x49491A53, + 0x31317C4D, 0xD8D8EE36, 0x08080A02, 0x9F9F7BE4, 0x828220A2, 0x1313D4C7, + 0x2323E8CB, 0x7A7AE69C, 0xABAB42E9, 0xFEFE43BD, 0x2A2AA288, 0x4B4B9AD1, + 0x01014041, 0x1F1FDBC4, 0xE0E0D838, 0xD6D661B7, 0x8E8E2FA1, 0xDFDF2BF4, + 0xCBCB3AF1, 0x3B3BF6CD, 0xE7E71DFA, 0x8585E560, 0x54544115, 0x868625A3, + 0x838360E3, 0xBABA16AC, 0x7575295C, 0x929234A6, 0x6E6EF799, 0xD0D0E434, + 0x6868721A, 0x55550154, 0xB6B619AF, 0x4E4EDF91, 0xC8C8FA32, 0xC0C0F030, + 0xD7D721F6, 0x3232BC8E, 0xC6C675B3, 0x8F8F6FE0, 0x7474691D, 0xDBDB2EF5, + 0x8B8B6AE1, 0xB8B8962E, 0x0A0A8A80, 0x9999FE67, 0x2B2BE2C9, 0x8181E061, + 0x0303C0C3, 0xA4A48D29, 0x8C8CAF23, 0xAEAE07A9, 0x3434390D, 0x4D4D1F52, + 0x3939764F, 0xBDBDD36E, 0x575781D6, 0x6F6FB7D8, 0xDCDCEB37, 0x15155144, + 0x7B7BA6DD, 0xF7F709FE, 0x3A3AB68C, 0xBCBC932F, 0x0C0C0F03, 0xFFFF03FC, + 0xA9A9C26B, 0xC9C9BA73, 0xB5B5D96C, 0xB1B1DC6D, 0x6D6D375A, 0x45451550, + 0x3636B98F, 0x6C6C771B, 0xBEBE13AD, 0x4A4ADA90, 0xEEEE57B9, 0x7777A9DE, + 0xF2F24CBE, 0xFDFD837E, 0x44445511, 0x6767BDDA, 0x71712C5D, 0x05054540, + 0x7C7C631F, 0x40405010, 0x6969325B, 0x6363B8DB, 0x2828220A, 0x0707C5C2, + 0xC4C4F531, 0x2222A88A, 0x969631A7, 0x3737F9CE, 0xEDED977A, 0xF6F649BF, + 0xB4B4992D, 0xD1D1A475, 0x434390D3, 0x48485A12, 0xE2E258BA, 0x979771E6, + 0xD2D264B6, 0xC2C270B2, 0x2626AD8B, 0xA5A5CD68, 0x5E5ECB95, 0x2929624B, + 0x30303C0C, 0x5A5ACE94, 0xDDDDAB76, 0xF9F9867F, 0x9595F164, 0xE6E65DBB, + 0xC7C735F2, 0x24242D09, 0x1717D1C6, 0xB9B9D66F, 0x1B1BDEC5, 0x12129486, + 0x60607818, 0xC3C330F3, 0xF5F5897C, 0xB3B35CEF, 0xE8E8D23A, 0x7373ACDF, + 0x3535794C, 0x8080A020, 0xE5E59D78, 0xBBBB56ED, 0x7D7D235E, 0xF8F8C63E, + 0x5F5F8BD4, 0x2F2FE7C8, 0xE4E4DD39, 0x21216849}; + +static uint32_t SM4_SBOX_T3[256] = { + 0xD55B5B8E, 0x924242D0, 0xEAA7A74D, 0xFDFBFB06, 0xCF3333FC, 0xE2878765, + 0x3DF4F4C9, 0xB5DEDE6B, 0x1658584E, 0xB4DADA6E, 0x14505044, 0xC10B0BCA, + 0x28A0A088, 0xF8EFEF17, 0x2CB0B09C, 0x05141411, 0x2BACAC87, 0x669D9DFB, + 0x986A6AF2, 0x77D9D9AE, 0x2AA8A882, 0xBCFAFA46, 0x04101014, 0xC00F0FCF, + 0xA8AAAA02, 0x45111154, 0x134C4C5F, 0x269898BE, 0x4825256D, 0x841A1A9E, + 0x0618181E, 0x9B6666FD, 0x9E7272EC, 0x4309094A, 0x51414110, 0xF7D3D324, + 0x934646D5, 0xECBFBF53, 0x9A6262F8, 0x7BE9E992, 0x33CCCCFF, 0x55515104, + 0x0B2C2C27, 0x420D0D4F, 0xEEB7B759, 0xCC3F3FF3, 0xAEB2B21C, 0x638989EA, + 0xE7939374, 0xB1CECE7F, 0x1C70706C, 0xABA6A60D, 0xCA2727ED, 0x08202028, + 0xEBA3A348, 0x975656C1, 0x82020280, 0xDC7F7FA3, 0x965252C4, 0xF9EBEB12, + 0x74D5D5A1, 0x8D3E3EB3, 0x3FFCFCC3, 0xA49A9A3E, 0x461D1D5B, 0x071C1C1B, + 0xA59E9E3B, 0xFFF3F30C, 0xF0CFCF3F, 0x72CDCDBF, 0x175C5C4B, 0xB8EAEA52, + 0x810E0E8F, 0x5865653D, 0x3CF0F0CC, 0x1964647D, 0xE59B9B7E, 0x87161691, + 0x4E3D3D73, 0xAAA2A208, 0x69A1A1C8, 
0x6AADADC7, 0x83060685, 0xB0CACA7A, + 0x70C5C5B5, 0x659191F4, 0xD96B6BB2, 0x892E2EA7, 0xFBE3E318, 0xE8AFAF47, + 0x0F3C3C33, 0x4A2D2D67, 0x71C1C1B0, 0x5759590E, 0x9F7676E9, 0x35D4D4E1, + 0x1E787866, 0x249090B4, 0x0E383836, 0x5F797926, 0x628D8DEF, 0x59616138, + 0xD2474795, 0xA08A8A2A, 0x259494B1, 0x228888AA, 0x7DF1F18C, 0x3BECECD7, + 0x01040405, 0x218484A5, 0x79E1E198, 0x851E1E9B, 0xD7535384, 0x00000000, + 0x4719195E, 0x565D5D0B, 0x9D7E7EE3, 0xD04F4F9F, 0x279C9CBB, 0x5349491A, + 0x4D31317C, 0x36D8D8EE, 0x0208080A, 0xE49F9F7B, 0xA2828220, 0xC71313D4, + 0xCB2323E8, 0x9C7A7AE6, 0xE9ABAB42, 0xBDFEFE43, 0x882A2AA2, 0xD14B4B9A, + 0x41010140, 0xC41F1FDB, 0x38E0E0D8, 0xB7D6D661, 0xA18E8E2F, 0xF4DFDF2B, + 0xF1CBCB3A, 0xCD3B3BF6, 0xFAE7E71D, 0x608585E5, 0x15545441, 0xA3868625, + 0xE3838360, 0xACBABA16, 0x5C757529, 0xA6929234, 0x996E6EF7, 0x34D0D0E4, + 0x1A686872, 0x54555501, 0xAFB6B619, 0x914E4EDF, 0x32C8C8FA, 0x30C0C0F0, + 0xF6D7D721, 0x8E3232BC, 0xB3C6C675, 0xE08F8F6F, 0x1D747469, 0xF5DBDB2E, + 0xE18B8B6A, 0x2EB8B896, 0x800A0A8A, 0x679999FE, 0xC92B2BE2, 0x618181E0, + 0xC30303C0, 0x29A4A48D, 0x238C8CAF, 0xA9AEAE07, 0x0D343439, 0x524D4D1F, + 0x4F393976, 0x6EBDBDD3, 0xD6575781, 0xD86F6FB7, 0x37DCDCEB, 0x44151551, + 0xDD7B7BA6, 0xFEF7F709, 0x8C3A3AB6, 0x2FBCBC93, 0x030C0C0F, 0xFCFFFF03, + 0x6BA9A9C2, 0x73C9C9BA, 0x6CB5B5D9, 0x6DB1B1DC, 0x5A6D6D37, 0x50454515, + 0x8F3636B9, 0x1B6C6C77, 0xADBEBE13, 0x904A4ADA, 0xB9EEEE57, 0xDE7777A9, + 0xBEF2F24C, 0x7EFDFD83, 0x11444455, 0xDA6767BD, 0x5D71712C, 0x40050545, + 0x1F7C7C63, 0x10404050, 0x5B696932, 0xDB6363B8, 0x0A282822, 0xC20707C5, + 0x31C4C4F5, 0x8A2222A8, 0xA7969631, 0xCE3737F9, 0x7AEDED97, 0xBFF6F649, + 0x2DB4B499, 0x75D1D1A4, 0xD3434390, 0x1248485A, 0xBAE2E258, 0xE6979771, + 0xB6D2D264, 0xB2C2C270, 0x8B2626AD, 0x68A5A5CD, 0x955E5ECB, 0x4B292962, + 0x0C30303C, 0x945A5ACE, 0x76DDDDAB, 0x7FF9F986, 0x649595F1, 0xBBE6E65D, + 0xF2C7C735, 0x0924242D, 0xC61717D1, 0x6FB9B9D6, 0xC51B1BDE, 0x86121294, + 0x18606078, 0xF3C3C330, 0x7CF5F589, 0xEFB3B35C, 0x3AE8E8D2, 0xDF7373AC, + 0x4C353579, 0x208080A0, 0x78E5E59D, 0xEDBBBB56, 0x5E7D7D23, 0x3EF8F8C6, + 0xD45F5F8B, 0xC82F2FE7, 0x39E4E4DD, 0x49212168}; + +static ossl_inline uint32_t rotl(uint32_t a, uint8_t n) +{ + return (a << n) | (a >> (32 - n)); +} + +static ossl_inline uint32_t load_u32_be(const uint8_t *b, uint32_t n) +{ + return ((uint32_t)b[4 * n] << 24) | + ((uint32_t)b[4 * n + 1] << 16) | + ((uint32_t)b[4 * n + 2] << 8) | + ((uint32_t)b[4 * n + 3]); +} + +static ossl_inline void store_u32_be(uint32_t v, uint8_t *b) +{ + b[0] = (uint8_t)(v >> 24); + b[1] = (uint8_t)(v >> 16); + b[2] = (uint8_t)(v >> 8); + b[3] = (uint8_t)(v); +} + +static ossl_inline uint32_t SM4_T_non_lin_sub(uint32_t X) +{ + uint32_t t = 0; + + t |= ((uint32_t)SM4_S[(uint8_t)(X >> 24)]) << 24; + t |= ((uint32_t)SM4_S[(uint8_t)(X >> 16)]) << 16; + t |= ((uint32_t)SM4_S[(uint8_t)(X >> 8)]) << 8; + t |= SM4_S[(uint8_t)X]; + + return t; +} + +static ossl_inline uint32_t SM4_T_slow(uint32_t X) +{ + uint32_t t = SM4_T_non_lin_sub(X); + + /* + * L linear transform + */ + return t ^ rotl(t, 2) ^ rotl(t, 10) ^ rotl(t, 18) ^ rotl(t, 24); +} + +static ossl_inline uint32_t SM4_T(uint32_t X) +{ + return SM4_SBOX_T0[(uint8_t)(X >> 24)] ^ + SM4_SBOX_T1[(uint8_t)(X >> 16)] ^ + SM4_SBOX_T2[(uint8_t)(X >> 8)] ^ + SM4_SBOX_T3[(uint8_t)X]; +} + +static ossl_inline uint32_t SM4_key_sub(uint32_t X) +{ + uint32_t t = SM4_T_non_lin_sub(X); + + return t ^ rotl(t, 13) ^ rotl(t, 23); +} + +int ossl_sm4_set_key(const uint8_t *key, SM4_KEY *ks) +{ + /* + * Family Key + */ 
+ static const uint32_t FK[4] = + { 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc }; + + /* + * Constant Key + */ + static const uint32_t CK[32] = { + 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269, + 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9, + 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249, + 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9, + 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229, + 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299, + 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209, + 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 + }; + + uint32_t K[4]; + int i; + + K[0] = load_u32_be(key, 0) ^ FK[0]; + K[1] = load_u32_be(key, 1) ^ FK[1]; + K[2] = load_u32_be(key, 2) ^ FK[2]; + K[3] = load_u32_be(key, 3) ^ FK[3]; + + for (i = 0; i < SM4_KEY_SCHEDULE; i = i + 4) { + K[0] ^= SM4_key_sub(K[1] ^ K[2] ^ K[3] ^ CK[i]); + K[1] ^= SM4_key_sub(K[2] ^ K[3] ^ K[0] ^ CK[i + 1]); + K[2] ^= SM4_key_sub(K[3] ^ K[0] ^ K[1] ^ CK[i + 2]); + K[3] ^= SM4_key_sub(K[0] ^ K[1] ^ K[2] ^ CK[i + 3]); + ks->rk[i ] = K[0]; + ks->rk[i + 1] = K[1]; + ks->rk[i + 2] = K[2]; + ks->rk[i + 3] = K[3]; + } + + return 1; +} + +#define SM4_RNDS(k0, k1, k2, k3, F) \ + do { \ + B0 ^= F(B1 ^ B2 ^ B3 ^ ks->rk[k0]); \ + B1 ^= F(B0 ^ B2 ^ B3 ^ ks->rk[k1]); \ + B2 ^= F(B0 ^ B1 ^ B3 ^ ks->rk[k2]); \ + B3 ^= F(B0 ^ B1 ^ B2 ^ ks->rk[k3]); \ + } while(0) + +void ossl_sm4_encrypt(const uint8_t *in, uint8_t *out, const SM4_KEY *ks) +{ + uint32_t B0 = load_u32_be(in, 0); + uint32_t B1 = load_u32_be(in, 1); + uint32_t B2 = load_u32_be(in, 2); + uint32_t B3 = load_u32_be(in, 3); + + /* + * Uses byte-wise sbox in the first and last rounds to provide some + * protection from cache based side channels. + */ + SM4_RNDS( 0, 1, 2, 3, SM4_T_slow); + SM4_RNDS( 4, 5, 6, 7, SM4_T); + SM4_RNDS( 8, 9, 10, 11, SM4_T); + SM4_RNDS(12, 13, 14, 15, SM4_T); + SM4_RNDS(16, 17, 18, 19, SM4_T); + SM4_RNDS(20, 21, 22, 23, SM4_T); + SM4_RNDS(24, 25, 26, 27, SM4_T); + SM4_RNDS(28, 29, 30, 31, SM4_T_slow); + + store_u32_be(B3, out); + store_u32_be(B2, out + 4); + store_u32_be(B1, out + 8); + store_u32_be(B0, out + 12); +} + +void ossl_sm4_decrypt(const uint8_t *in, uint8_t *out, const SM4_KEY *ks) +{ + uint32_t B0 = load_u32_be(in, 0); + uint32_t B1 = load_u32_be(in, 1); + uint32_t B2 = load_u32_be(in, 2); + uint32_t B3 = load_u32_be(in, 3); + + SM4_RNDS(31, 30, 29, 28, SM4_T_slow); + SM4_RNDS(27, 26, 25, 24, SM4_T); + SM4_RNDS(23, 22, 21, 20, SM4_T); + SM4_RNDS(19, 18, 17, 16, SM4_T); + SM4_RNDS(15, 14, 13, 12, SM4_T); + SM4_RNDS(11, 10, 9, 8, SM4_T); + SM4_RNDS( 7, 6, 5, 4, SM4_T); + SM4_RNDS( 3, 2, 1, 0, SM4_T_slow); + + store_u32_be(B3, out); + store_u32_be(B2, out + 4); + store_u32_be(B1, out + 8); + store_u32_be(B0, out + 12); +} + +void sm4_setkey_enc(sm4_ctx *ctx, uint8_t* key) +{ + ossl_sm4_set_key(key, &ctx->rkey); + ctx->encrypt = SM4_ENCRYPT; +} + +void sm4_setkey_dec(sm4_ctx *ctx, uint8_t* key) +{ + ossl_sm4_set_key(key, &ctx->rkey); + ctx->encrypt = SM4_DECRYPT; +} + +#if defined(__GNUC__) && !defined(STRICT_ALIGNMENT) +typedef size_t size_t_aX __attribute((__aligned__(1))); +#else +typedef size_t size_t_aX; +#endif + +/* + * The input and output encrypted as though 128bit ofb mode is being used. 
+ * The extra state information to record how much of the 128bit block we have + * used is contained in *num; + */ +static void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out, + size_t len, void *key, + unsigned char ivec[16], int *num) +{ + unsigned int n; + size_t l = 0; + + if (*num < 0) + { + /* There is no good way to signal an error return from here */ + *num = -1; + return; + } + n = *num; + + do { + while (n && len) + { + *(out++) = *(in++) ^ ivec[n]; + --len; + n = (n + 1) % 16; + } + while (len >= 16) + { + ossl_sm4_encrypt(ivec, ivec, key); + for (; n < 16; n += sizeof(size_t)) + *(size_t_aX *)(out + n) = + *(size_t_aX *)(in + n) + ^ *(size_t_aX *)(ivec + n); + len -= 16; + out += 16; + in += 16; + n = 0; + } + if (len) + { + ossl_sm4_encrypt(ivec, ivec, key); + while (len--) + { + out[n] = in[n] ^ ivec[n]; + ++n; + } + } + *num = n; + return; + } while (0); + + /* the rest would be commonly eliminated by x86* compiler */ + while (l < len) + { + if (n == 0) + { + ossl_sm4_encrypt(ivec, ivec, key); + } + out[l] = in[l] ^ ivec[n]; + ++l; + n = (n + 1) % 16; + } + + *num = n; +} + +int sm4_ofb_cipher(sm4_ctx *ctx, unsigned char *out, + const unsigned char *in, size_t input_len, + unsigned char ivec[16]) +{ + int num = 0; + CRYPTO_ofb128_encrypt(in, out, input_len, &ctx->rkey, + ivec, &num); + return 0; +} \ No newline at end of file diff --git a/src/backend/crypto/ssl_passphrase.sh.sample b/src/backend/crypto/ssl_passphrase.sh.sample new file mode 100644 index 00000000000..efbf5c07201 --- /dev/null +++ b/src/backend/crypto/ssl_passphrase.sh.sample @@ -0,0 +1,35 @@ +#!/bin/sh + +# This uses a passphrase supplied by the user. +# Do not create any fie with extension "wkey" in $DIR; these are +# reserved for wrapped data key files. + +[ "$#" -lt 1 ] && echo "ssl_passphrase_command usage: $0 %R [\"%p\"]" 1>&2 && exit 1 + +FD="$1" +[ ! -t "$FD" ] && echo "file descriptor $FD does not refer to a terminal" 1>&2 && exit 1 +# Supports environment variable PROMPT + +[ "$2" ] && PROMPT="$2" + + +# ---------------------------------------------------------------------- + +[ ! "$PROMPT" ] && PROMPT='Enter cluster passphrase: ' + +stty -echo <&"$FD" + +echo 1>&"$FD" +echo -n "$PROMPT" 1>&"$FD" +read PASS <&"$FD" + +stty echo <&"$FD" + +if [ ! "$PASS" ] +then echo 'invalid: empty passphrase' 1>&2 + exit 1 +fi + +echo "$PASS" + +exit 0 diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c index 4fe69886acc..42e11836d60 100644 --- a/src/backend/executor/execAmi.c +++ b/src/backend/executor/execAmi.c @@ -797,6 +797,8 @@ ExecSquelchNode(PlanState *node) case T_TidRangeScanState: case T_TableFunctionState: case T_SampleScanState: + case T_GatherState: + case T_GatherMergeState: break; /* diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 9b46b6ea265..835d0179600 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -1740,6 +1740,7 @@ InitPlan(QueryDesc *queryDesc, int eflags) start_plan_node = (Plan *) m; ExecSlice *sendSlice = &estate->es_sliceTable->slices[m->motionID]; estate->currentSliceId = sendSlice->parentIndex; + estate->useMppParallelMode = sendSlice->useMppParallelMode; } /* Compute SubPlans' root plan nodes for SubPlans reachable from this plan root */ estate->locallyExecutableSubplans = getLocallyExecutableSubplans(plannedstmt, start_plan_node); @@ -1772,13 +1773,18 @@ InitPlan(QueryDesc *queryDesc, int eflags) /* set our global sliceid variable for elog. 
*/ int save_currentSliceId = estate->currentSliceId; + /* GPDB_PARALLEL_FIXME: Is it necessary to save and recover this? */ + bool save_useMppParallelMode = estate->useMppParallelMode; estate->currentSliceId = estate->es_plannedstmt->subplan_sliceIds[subplan_id - 1]; + /* FIXME: test whether MPP parallel style exists for the subplan case */ + estate->useMppParallelMode = false; Plan *subplan = (Plan *) lfirst(l); subplanstate = ExecInitNode(subplan, estate, sp_eflags); estate->currentSliceId = save_currentSliceId; + estate->useMppParallelMode = save_useMppParallelMode; } estate->es_subplanstates = lappend(estate->es_subplanstates, subplanstate); @@ -2484,13 +2490,21 @@ ExecutePlan(EState *estate, * If the plan might potentially be executed multiple times, we must force * it to run without parallelism, because we might exit early. */ - if (!execute_once) + if (!execute_once || GP_ROLE_DISPATCH == Gp_role) use_parallel_mode = false; estate->es_use_parallel_mode = use_parallel_mode; if (use_parallel_mode) EnterParallelMode(); + /* + * GP-style parallelism does not interfere with the PG-style parallel + * mechanism, so we still proceed here when use_parallel_mode is true, + * i.e. when a Gather/GatherMerge node exists. + */ + if (estate->useMppParallelMode) + GpInsertParallelDSMHash(planstate); + #ifdef FAULT_INJECTOR /* Inject a fault before tuple processing started */ SIMPLE_FAULT_INJECTOR("executor_pre_tuple_processed"); @@ -2606,6 +2620,9 @@ ExecutePlan(EState *estate, if (!(estate->es_top_eflags & EXEC_FLAG_BACKWARD)) (void) ExecShutdownNode(planstate); + if (estate->useMppParallelMode) + GpDestroyParallelDSMEntry(); + if (use_parallel_mode) ExitParallelMode(); }
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index 7810b344312..5399811cac8 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -23,6 +23,7 @@ #include "postgres.h" +#include "access/tableam.h" #include "executor/execParallel.h" #include "executor/executor.h" #include "executor/nodeAgg.h" @@ -204,6 +205,7 @@ ExecSerializePlan(Plan *plan, EState *estate) pstmt->subplans = lappend(pstmt->subplans, subplan); } + pstmt->subplan_sliceIds = estate->es_plannedstmt->subplan_sliceIds; pstmt->rewindPlanIDs = NULL; pstmt->rowMarks = NIL; pstmt->relationOids = NIL; @@ -493,12 +495,9 @@ ExecParallelInitializeDSM(PlanState *planstate, d->pcxt); break; case T_BitmapHeapScanState: - /* GPDB_12_MERGE_FEATURE_NOT_SUPPORTED: the parallel StreamBitmap scan is not implemented */ - /* - * if (planstate->plan->parallel_aware) - * ExecBitmapHeapInitializeDSM((BitmapHeapScanState *) planstate, - * d->pcxt); - */ + if (planstate->plan->parallel_aware) + ExecBitmapHeapInitializeDSM((BitmapHeapScanState *) planstate, + d->pcxt); break; case T_HashJoinState: if (planstate->plan->parallel_aware) @@ -1500,3 +1499,207 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) FreeQueryDesc(queryDesc); receiver->rDestroy(receiver); } + +bool +EstimateGpParallelDSMEntrySize(PlanState *planstate, ParallelContext *pctx) +{ + if (planstate == NULL) + return false; + + switch (nodeTag(planstate)) + { + case T_MotionState: + /* + * If we walk into a MotionState node of the receiving type, return + * immediately, so that we initialize parallel entries only for the + * current slice. + */ + if (((MotionState *) planstate)->mstype == MOTIONSTATE_RECV) + return false; + break; + case T_SeqScanState: + if (planstate->plan->parallel_aware) + ExecSeqScanEstimate((SeqScanState *) planstate, + pctx); + break; + case T_IndexScanState: + if (planstate->plan->parallel_aware) + ExecIndexScanEstimate((IndexScanState *) planstate, + pctx); + break; + case T_IndexOnlyScanState: + if (planstate->plan->parallel_aware) + ExecIndexOnlyScanEstimate((IndexOnlyScanState *) planstate, + pctx); + break; + case T_BitmapHeapScanState: + if (planstate->plan->parallel_aware) + ExecBitmapHeapEstimate((BitmapHeapScanState *) planstate, + pctx); + break; + case T_AppendState: + if (planstate->plan->parallel_aware) + ExecAppendEstimate((AppendState*) planstate, pctx); + break; + case T_HashJoinState: + if (planstate->plan->parallel_aware) + ExecHashJoinEstimate((HashJoinState *) planstate, + pctx); + break; + case T_HashState: + ExecHashEstimate((HashState *) planstate, pctx); + break; + case T_SortState: + ExecSortEstimate((SortState *) planstate, pctx); + break; + default: + break; + + } + + return planstate_tree_walker(planstate, EstimateGpParallelDSMEntrySize, pctx); +} + +bool +InitializeGpParallelWorkers(PlanState *planstate, ParallelWorkerContext *pwcxt) +{ + if (planstate == NULL) + return false; + /* + * GPDB_PARALLEL_FIXME: + * Why do we call PG's xxxInitializeWorker functions for some nodes but + * not for others? + */ + switch (nodeTag(planstate)) + { + case T_MotionState: + /* + * If we walk into a MotionState node of the receiving type, return + * immediately, so that we initialize parallel entries only for the + * current slice. + */ + if (((MotionState *) planstate)->mstype == MOTIONSTATE_RECV) + return false; + break; + case T_SeqScanState: + break; + case T_IndexScanState: + break; + case T_IndexOnlyScanState: + break; + case T_BitmapHeapScanState: + if (planstate->plan->parallel_aware) + ExecBitmapHeapInitializeWorker((BitmapHeapScanState *) planstate, pwcxt); + break; + case T_AppendState: + if (planstate->plan->parallel_aware) + ExecAppendInitializeWorker((AppendState *) planstate, pwcxt); + break; + case T_HashState: + break; + case T_SortState: + break; + case T_HashJoinState: + if (planstate->plan->parallel_aware) + ExecHashJoinInitializeWorker((HashJoinState *) planstate, pwcxt); + break; + default: + break; + } + + return planstate_tree_walker(planstate, InitializeGpParallelWorkers, pwcxt); +} + +bool +InitializeGpParallelDSMEntry(PlanState *planstate, ParallelContext *pctx) +{ + if (planstate == NULL) + return false; + + switch (nodeTag(planstate)) + { + case T_MotionState: + /* + * If we walk into a MotionState node of the receiving type, return + * immediately, so that we initialize parallel entries only for the + * current slice.
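+ *
+ * Lifecycle sketch (added for illustration; inferred from the surrounding
+ * code rather than stated in the original patch):
+ *   EstimateGpParallelDSMEntrySize() - sizes the per-slice shm_toc entries
+ *   InitializeGpParallelDSMEntry()   - the first QE fills in the entries
+ *   InitializeGpParallelWorkers()    - the remaining QEs attach to them
+ * All three walkers stop at a receiving Motion, so each slice sets up only
+ * its own nodes.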
+ */ + if (((MotionState *) planstate)->mstype == MOTIONSTATE_RECV) + return false; + break; + case T_SeqScanState: + if (planstate->plan->parallel_aware) + { + SeqScanState* node = (SeqScanState*) planstate; + + ParallelTableScanDesc pscan; + + pscan = shm_toc_allocate(pctx->toc, node->pscan_len); + + table_parallelscan_initialize(node->ss.ss_currentRelation, pscan, + node->ss.ps.state->es_snapshot); + + Assert(pscan); + + shm_toc_insert(pctx->toc, node->ss.ps.plan->plan_node_id, pscan); + } + break; + case T_IndexScanState: + if (planstate->plan->parallel_aware) + { + IndexScanState* node = (IndexScanState*) planstate; + + ParallelIndexScanDesc piscan; + + piscan = shm_toc_allocate(pctx->toc, node->iss_PscanLen); + + index_parallelscan_initialize(node->ss.ss_currentRelation, + node->iss_RelationDesc, + node->ss.ps.state->es_snapshot, + piscan); + + Assert(piscan); + + shm_toc_insert(pctx->toc, node->ss.ps.plan->plan_node_id, piscan); + } + break; + case T_IndexOnlyScanState: + if (planstate->plan->parallel_aware) + { + IndexOnlyScanState* node = (IndexOnlyScanState*) planstate; + + ParallelIndexScanDesc piscan; + piscan = shm_toc_allocate(pctx->toc, node->ioss_PscanLen); + + index_parallelscan_initialize(node->ss.ss_currentRelation, + node->ioss_RelationDesc, + node->ss.ps.state->es_snapshot, + piscan); + + Assert(piscan); + + shm_toc_insert(pctx->toc, node->ss.ps.plan->plan_node_id, piscan); + } + break; + case T_BitmapHeapScanState: + if (planstate->plan->parallel_aware) + ExecBitmapHeapInitializeDSM((BitmapHeapScanState *) planstate, + pctx); + break; + case T_AppendState: + if (planstate->plan->parallel_aware) + ExecAppendInitializeDSM((AppendState *) planstate, + pctx); + break; + case T_HashJoinState: + if (planstate->plan->parallel_aware) + ExecHashJoinInitializeDSM((HashJoinState *) planstate, pctx); + break; + case T_HashState: + ExecHashInitializeDSM((HashState *) planstate, pctx); + break; + case T_SortState: + ExecSortInitializeDSM((SortState *) planstate, pctx); + break; + default: + break; + } + + return planstate_tree_walker(planstate, InitializeGpParallelDSMEntry, pctx); +} diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index 5f885eabe4b..b5d9cba778a 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -203,6 +203,7 @@ CreateExecutorState(void) estate->dispatcherState = NULL; estate->currentSliceId = 0; + estate->useMppParallelMode = false; estate->eliminateAliens = false; /* @@ -1586,9 +1587,13 @@ ExecPrefetchQual(JoinState *node, bool isJoinQual) static void FillSliceGangInfo(ExecSlice *slice, PlanSlice *ps) { + int factor = ps->parallel_workers ? ps->parallel_workers : 1; int numsegments = ps->numsegments; + DirectDispatchInfo *dd = &ps->directDispatch; + slice->useMppParallelMode = (ps->parallel_workers != 0); + slice->parallel_workers = factor; switch (slice->gangType) { case GANGTYPE_UNALLOCATED: @@ -1596,30 +1601,47 @@ FillSliceGangInfo(ExecSlice *slice, PlanSlice *ps) * It's either the root slice or an InitPlan slice that runs in * the QD process, or really unused slice. */ + /* GPDB_PARALLEL_FIXME: QD process should never be parallel, do we need to plus factor? 
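+ *
+ * Expansion example (an illustrative assumption, not patch text): in the
+ * primary writer/reader case below, numsegments = 3 with factor = 2
+ * yields slice->segments = [0, 0, 1, 1, 2, 2], i.e. two QE processes are
+ * launched per segment content id.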
*/ slice->planNumSegments = 1; break; case GANGTYPE_PRIMARY_WRITER: case GANGTYPE_PRIMARY_READER: - slice->planNumSegments = numsegments; + slice->planNumSegments = numsegments * factor; if (dd->isDirectDispatch) { - slice->segments = list_copy(dd->contentIds); + int i; + ListCell *lc; + + foreach(lc, dd->contentIds) + { + int segment = lfirst_int(lc); + for (i = 0; i < factor; i++) + slice->segments = lappend_int(slice->segments, segment); + } } else { - int i; + int i, j; slice->segments = NIL; for (i = 0; i < numsegments; i++) - slice->segments = lappend_int(slice->segments, i % getgpsegmentCount()); + for (j = 0; j < factor; j++) + slice->segments = lappend_int(slice->segments, i % getgpsegmentCount()); } break; case GANGTYPE_ENTRYDB_READER: + /* GPDB_PARALLEL_FIXME: QD parallel is disabled */ slice->planNumSegments = 1; slice->segments = list_make1_int(-1); break; case GANGTYPE_SINGLETON_READER: - slice->planNumSegments = 1; - slice->segments = list_make1_int(ps->segindex); + /* + * GPDB_PARALLEL_FIXME: + * Could be parallel, e.g. a parallel scan on replicated tables. + */ + slice->planNumSegments = 1 * factor; + int i; + for (i = 0; i < factor; i++) + slice->segments = lappend_int(slice->segments, ps->segindex); break; default: elog(ERROR, "unexpected gang type"); @@ -2152,7 +2174,7 @@ MotionStateFinderWalker(PlanState *node, { Assert(ctx->motionState == NULL); ctx->motionState = ms; - return CdbVisit_Skip; /* don't visit subtree */ + return CdbVisit_Success; /* found it; stop the walk */ } }
diff --git a/src/backend/executor/nodeAppend.c b/src/backend/executor/nodeAppend.c index 98388e3ba19..8c063e1c0cc 100644 --- a/src/backend/executor/nodeAppend.c +++ b/src/backend/executor/nodeAppend.c @@ -57,6 +57,7 @@ #include "postgres.h" +#include "cdb/cdbvars.h" #include "executor/execAsync.h" #include "executor/execdebug.h" #include "executor/execPartition.h" @@ -531,6 +532,7 @@ ExecAppendInitializeDSM(AppendState *node, node->choose_next_subplan = choose_next_subplan_for_leader; } + /* ---------------------------------------------------------------- * ExecAppendReInitializeDSM * @@ -859,8 +861,19 @@ mark_invalid_subplans_as_finished(AppendState *node) /* Only valid to call this while in parallel Append mode */ Assert(node->as_pstate); + /* + * NB: Upstream asserts that node->as_prune_state is not empty. However, + * after the pg12 merge we allow as_valid_subplans and as_prune_state to + * both be empty while node->join_prune_paramids is true. + * + * Since this function is also called when as_valid_subplans is empty, + * the assertion must be removed.
+ */ +#if 0 /* Shouldn't have been called when run-time pruning is not enabled */ Assert(node->as_prune_state); +#endif /* Nothing to do if all plans are valid */ if (bms_num_members(node->as_valid_subplans) == node->as_nplans)
diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index f3331acd833..3f814b1907d 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -1007,14 +1007,18 @@ ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node, { ParallelBitmapHeapState *pstate; Snapshot snapshot; + EState *estate = node->ss.ps.state; Assert(node->ss.ps.state->es_query_dsa != NULL); pstate = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); node->pstate = pstate; - snapshot = RestoreSnapshot(pstate->phs_snapshot_data); - table_scan_update_snapshot(node->ss.ss_currentScanDesc, snapshot); + if (!estate->useMppParallelMode) + { + snapshot = RestoreSnapshot(pstate->phs_snapshot_data); + table_scan_update_snapshot(node->ss.ss_currentScanDesc, snapshot); + } } void
diff --git a/src/backend/executor/nodeBitmapIndexscan.c b/src/backend/executor/nodeBitmapIndexscan.c index 1b966225ade..cae34c46f33 100644 --- a/src/backend/executor/nodeBitmapIndexscan.c +++ b/src/backend/executor/nodeBitmapIndexscan.c @@ -24,6 +24,8 @@ #include "postgres.h" #include "access/genam.h" +#include "access/nbtree.h" +#include "access/relscan.h" #include "executor/execdebug.h" #include "executor/nodeBitmapIndexscan.h" #include "executor/nodeIndexscan.h" @@ -85,6 +87,23 @@ MultiExecBitmapIndexScan(BitmapIndexScanState *node) */ scandesc = node->biss_ScanDesc; + /* + * The bitmap-building logic here differs from upstream: we build the + * bitmap inside `index_getbitmap`, whereas upstream builds it before + * calling `index_getbitmap`. + * + * Thus, to build a shared bitmap in cbdb, we have to pass the dsa_area + * into `index_getbitmap`. + * + * We work around this by adding a dsa field to struct IndexScanDesc, but + * a refactoring may be needed here to realign the code with upstream. + * + */ + if (node->ss.ps.state->es_query_dsa != NULL && ((BitmapIndexScan *)node->ss.ps.plan)->isshared) + { + scandesc->dsa = node->ss.ps.state->es_query_dsa; + } + /* * If we have runtime keys and they've not already been set up, do it now. * Array keys are also treated as runtime keys; note that if ExecReScan
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index d7d2d641986..f61f8f53491 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -643,6 +643,7 @@ ExecHashTableCreate(HashState *state, HashJoinState *hjstate, { ParallelHashJoinState *pstate = hashtable->parallel_state; Barrier *build_barrier; + Barrier *sync_barrier; /* * Attach to the build barrier. The corresponding detach operation is @@ -653,8 +654,13 @@ ExecHashTableCreate(HashState *state, HashJoinState *hjstate, * algorithm), and we'll coordinate that using build_barrier. */ build_barrier = &pstate->build_barrier; + sync_barrier = &pstate->sync_barrier; + BarrierAttach(build_barrier); + if (((Hash *) state->ps.plan)->sync_barrier) + BarrierArriveAndWait(sync_barrier, WAIT_EVENT_PARALLEL_FINISH); + /* * So far we have no idea whether there are any other participants, * and if so, what phase they are working on. The only thing we care @@ -767,12 +773,13 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew, * avoid the need to batch.
If that won't work, it falls back to hash_mem * per worker and tries to process batches in parallel. */ - if (try_combined_hash_mem) + if (try_combined_hash_mem && parallel_workers > 0) { /* Careful, this could overflow size_t */ double newlimit; - newlimit = (double) hash_table_bytes * (double) (parallel_workers + 1); + /* GP_PARALLEL_FIXME: if we enable pg style parallel some day, we should reconsider it. */ + newlimit = (double) hash_table_bytes * (double) parallel_workers; newlimit = Min(newlimit, (double) SIZE_MAX); hash_table_bytes = (size_t) newlimit; } @@ -1566,7 +1573,7 @@ ExecParallelHashRepartitionRest(HashJoinTable hashtable) NthParallelHashJoinBatch(old_batches, i); old_inner_tuples[i] = sts_attach(ParallelHashJoinBatchInner(shared), - ParallelWorkerNumber + 1, + hashtable->hjstate->worker_id, &pstate->fileset); } @@ -2556,7 +2563,8 @@ ExecHashTableExplainEnd(PlanState *planstate, struct StringInfoData *buf) } /* Report workfile I/O statistics. */ - if (hashtable->nbatch > 1) + /* GPDB_PARALLEL_FIXME: ExecHashTableExplainBatches if parallel_aware? */ + if (hashtable->nbatch > 1 && !planstate->plan->parallel_aware) { ExecHashTableExplainBatches(hashtable, buf, 0, 1, "Initial"); ExecHashTableExplainBatches(hashtable, @@ -3247,9 +3255,9 @@ void ExecHashInitializeWorker(HashState *node, ParallelWorkerContext *pwcxt) { SharedHashInfo *shared_info; - + HashJoinState *hjstate = node->hashtable->hjstate; /* don't need this if not instrumenting */ - if (!node->ps.instrument) + if (!node->ps.instrument || !hjstate) return; /* @@ -3259,7 +3267,8 @@ ExecHashInitializeWorker(HashState *node, ParallelWorkerContext *pwcxt) */ shared_info = (SharedHashInfo *) shm_toc_lookup(pwcxt->toc, node->ps.plan->plan_node_id, false); - node->hinstrument = &shared_info->hinstrument[ParallelWorkerNumber]; + Assert(hjstate->worker_id >= 1); + node->hinstrument = &shared_info->hinstrument[hjstate->worker_id - 1]; } /* @@ -3291,6 +3300,7 @@ ExecHashRetrieveInstrumentation(HashState *node) SharedHashInfo *shared_info = node->shared_info; size_t size; + if (shared_info == NULL) return; @@ -3617,7 +3627,7 @@ ExecParallelHashJoinSetUpBatches(HashJoinTable hashtable, int nbatch) accessor->inner_tuples = sts_initialize(ParallelHashJoinBatchInner(shared), pstate->nparticipants, - ParallelWorkerNumber + 1, + hashtable->hjstate->worker_id, sizeof(uint32), SHARED_TUPLESTORE_SINGLE_PASS, &pstate->fileset, @@ -3627,7 +3637,7 @@ ExecParallelHashJoinSetUpBatches(HashJoinTable hashtable, int nbatch) sts_initialize(ParallelHashJoinBatchOuter(shared, pstate->nparticipants), pstate->nparticipants, - ParallelWorkerNumber + 1, + hashtable->hjstate->worker_id, sizeof(uint32), SHARED_TUPLESTORE_SINGLE_PASS, &pstate->fileset, @@ -3709,12 +3719,12 @@ ExecParallelHashEnsureBatchAccessors(HashJoinTable hashtable) accessor->done = false; accessor->inner_tuples = sts_attach(ParallelHashJoinBatchInner(shared), - ParallelWorkerNumber + 1, + hashtable->hjstate->worker_id, &pstate->fileset); accessor->outer_tuples = sts_attach(ParallelHashJoinBatchOuter(shared, pstate->nparticipants), - ParallelWorkerNumber + 1, + hashtable->hjstate->worker_id, &pstate->fileset); } diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c index 3f30559513a..0cfad9f7335 100644 --- a/src/backend/executor/nodeHashjoin.c +++ b/src/backend/executor/nodeHashjoin.c @@ -204,6 +204,8 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel) hashtable = node->hj_HashTable; econtext = node->js.ps.ps_ExprContext; parallel_state = 
hashNode->parallel_state; + /* GPDB_PARALLEL_FIXME: When parallel is true and parallel_state is NULL */ + parallel = parallel && (parallel_state != NULL); /* * Reset per-tuple memory context to free any expression evaluation @@ -803,7 +805,14 @@ ExecInitHashJoin(HashJoin *node, EState *estate, int eflags) * where this function may be replaced with a parallel version, if we * managed to launch a parallel query. */ - hjstate->js.ps.ExecProcNode = ExecHashJoin; + if (node->join.plan.parallel_aware) + { + hjstate->js.ps.ExecProcNode = ExecParallelHashJoin; + } + else + { + hjstate->js.ps.ExecProcNode = ExecHashJoin; + } hjstate->js.jointype = node->join.jointype; /* @@ -968,6 +977,7 @@ ExecInitHashJoin(HashJoin *node, EState *estate, int eflags) hjstate->hj_JoinState = HJ_BUILD_HASHTABLE; hjstate->hj_MatchedOuter = false; hjstate->hj_OuterNotEmpty = false; + hjstate->worker_id = -1; /* Setup the relationship of HashJoin, Hash and RuntimeFilter node. */ hstate = (HashState *) innerPlanState(hjstate); @@ -1386,6 +1396,8 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate) HashJoinTable hashtable = hjstate->hj_HashTable; int start_batchno; int batchno; + Barrier *batch0_barrier = NULL; + ParallelHashJoinState *pstate = hashtable->parallel_state; /* * If we started up so late that the batch tracking array has been freed @@ -1425,9 +1437,19 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate) SharedTuplestoreAccessor *inner_tuples; Barrier *batch_barrier = &hashtable->batches[batchno].shared->batch_barrier; + int phase = BarrierAttach(batch_barrier); - switch (BarrierAttach(batch_barrier)) + if (hashtable->nbatch == 1 && batchno == 0 && ((HashJoin *)hjstate->js.ps.plan)->batch0_barrier) { + Assert(phase == PHJ_BATCH_PROBING); + + batch0_barrier = &pstate->batch0_barrier; + BarrierArriveAndWait(batch0_barrier, WAIT_EVENT_PARALLEL_FINISH); + } + + switch (phase) + { + case PHJ_BATCH_ELECTING: /* One backend allocates the hash table. */ @@ -1783,7 +1805,7 @@ isNotDistinctJoin(List *qualList) static void ExecEagerFreeHashJoin(HashJoinState *node) { - if (node->hj_HashTable != NULL && !node->hj_HashTable->eagerlyReleased) + if (node->hj_HashTable != NULL && !node->hj_HashTable->eagerlyReleased && !node->hj_HashTable->parallel_state) { ReleaseHashTable(node); } @@ -1998,7 +2020,7 @@ ExecHashJoinInitializeDSM(HashJoinState *state, ParallelContext *pcxt) int plan_node_id = state->js.ps.plan->plan_node_id; HashState *hashNode; ParallelHashJoinState *pstate; - + EState *estate = state->js.ps.state; /* * Disable shared hash table mode if we failed to create a real DSM * segment, because that means that we don't have a DSA area to work with. @@ -2028,7 +2050,11 @@ ExecHashJoinInitializeDSM(HashJoinState *state, ParallelContext *pcxt) pstate->growth = PHJ_GROWTH_OK; pstate->chunk_work_queue = InvalidDsaPointer; pg_atomic_init_u32(&pstate->distributor, 0); - pstate->nparticipants = pcxt->nworkers + 1; + if (estate->useMppParallelMode) + pstate->nparticipants = pcxt->nworkers; + else + pstate->nparticipants = pcxt->nworkers + 1; + pstate->total_tuples = 0; LWLockInitialize(&pstate->lock, LWTRANCHE_PARALLEL_HASH_JOIN); @@ -2036,8 +2062,12 @@ ExecHashJoinInitializeDSM(HashJoinState *state, ParallelContext *pcxt) BarrierInit(&pstate->grow_batches_barrier, 0); BarrierInit(&pstate->grow_buckets_barrier, 0); + BarrierInit(&pstate->sync_barrier, pcxt->nworkers); + BarrierInit(&pstate->batch0_barrier, pcxt->nworkers); + /* Set up the space we'll use for shared temporary files. 
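 *
 * Participant accounting, for illustration (an assumption drawn from the
 * code above, not original patch text): under useMppParallelMode all QEs
 * are peers, so nparticipants, sync_barrier and batch0_barrier are all
 * sized with pcxt->nworkers; in PG-style parallelism the leader takes
 * part as well, hence the nworkers + 1 variant.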
*/ SharedFileSetInit(&pstate->fileset, pcxt->seg); + state->worker_id = 0; /* First worker process */ /* Initialize the shared state in the hash node. */ hashNode = (HashState *) innerPlanState(state); @@ -2088,6 +2118,7 @@ ExecHashJoinInitializeWorker(HashJoinState *state, ParallelWorkerContext *pwcxt) { HashState *hashNode; + EState *estate = state->js.ps.state; int plan_node_id = state->js.ps.plan->plan_node_id; ParallelHashJoinState *pstate = shm_toc_lookup(pwcxt->toc, plan_node_id, false); @@ -2098,6 +2129,12 @@ ExecHashJoinInitializeWorker(HashJoinState *state, /* Attach to the shared state in the hash node. */ hashNode = (HashState *) innerPlanState(state); hashNode->parallel_state = pstate; - - ExecSetExecProcNode(&state->js.ps, ExecParallelHashJoin); + if (estate->useMppParallelMode) + state->worker_id = pwcxt->worker_id; + else + { + Assert(ParallelWorkerNumber >= 0); + state->worker_id = ParallelWorkerNumber + 1; + ExecSetExecProcNode(&state->js.ps, ExecParallelHashJoin); + } } diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 8fee958135c..b413011e9aa 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -35,6 +35,7 @@ #include "access/tableam.h" #include "access/tupdesc.h" #include "access/visibilitymap.h" +#include "cdb/cdbvars.h" #include "executor/execdebug.h" #include "executor/nodeIndexonlyscan.h" #include "executor/nodeIndexscan.h" @@ -85,16 +86,33 @@ IndexOnlyNext(IndexOnlyScanState *node) if (scandesc == NULL) { - /* - * We reach here if the index only scan is not parallel, or if we're - * serially executing an index only scan that was planned to be - * parallel. - */ - scandesc = index_beginscan(node->ss.ss_currentRelation, - node->ioss_RelationDesc, - estate->es_snapshot, - node->ioss_NumScanKeys, - node->ioss_NumOrderByKeys); + if (node->ss.ps.plan->parallel_aware && estate->useMppParallelMode) + { + ParallelIndexScanDesc piscan; + ParallelEntryTag tag; + int localSliceId = LocallyExecutingSliceIndex(estate); + INIT_PARALLELENTRYTAG(tag, gp_command_count, localSliceId, gp_session_id); + piscan = GpFetchParallelDSMEntry(tag, node->ss.ps.plan->plan_node_id); + Assert(piscan); + scandesc = index_beginscan_parallel(node->ss.ss_currentRelation, + node->ioss_RelationDesc, + node->ioss_NumScanKeys, + node->ioss_NumOrderByKeys, + piscan); + } + else + { + /* + * We reach here if the index only scan is not parallel, or if we're + * serially executing an index only scan that was planned to be + * parallel. + */ + scandesc = index_beginscan(node->ss.ss_currentRelation, + node->ioss_RelationDesc, + estate->es_snapshot, + node->ioss_NumScanKeys, + node->ioss_NumOrderByKeys); + } node->ioss_ScanDesc = scandesc; diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 3e7bb07a564..9875987d0a4 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -30,10 +30,10 @@ #include "postgres.h" #include "access/nbtree.h" -#include "cdb/cdbvars.h" #include "access/relscan.h" #include "access/tableam.h" #include "catalog/pg_am.h" +#include "cdb/cdbvars.h" #include "executor/execdebug.h" #include "executor/nodeIndexscan.h" #include "lib/pairingheap.h" @@ -106,17 +106,35 @@ IndexNext(IndexScanState *node) if (scandesc == NULL) { - /* - * We reach here if the index scan is not parallel, or if we're - * serially executing an index scan that was planned to be parallel. 
- */ - scandesc = index_beginscan(node->ss.ss_currentRelation, - node->iss_RelationDesc, - estate->es_snapshot, - node->iss_NumScanKeys, - node->iss_NumOrderByKeys); - - node->iss_ScanDesc = scandesc; + if (node->ss.ps.plan->parallel_aware && estate->useMppParallelMode) + { + ParallelIndexScanDesc piscan; + ParallelEntryTag tag; + int localSliceId = LocallyExecutingSliceIndex(estate); + INIT_PARALLELENTRYTAG(tag, gp_command_count, localSliceId, gp_session_id); + piscan = GpFetchParallelDSMEntry(tag, node->ss.ps.plan->plan_node_id); + Assert(piscan); + scandesc = index_beginscan_parallel(node->ss.ss_currentRelation, + node->iss_RelationDesc, + node->iss_NumScanKeys, + node->iss_NumOrderByKeys, + piscan); + node->iss_ScanDesc = scandesc; + } + else + { + /* + * We reach here if the index scan is not parallel, or if we're + * serially executing an index scan that was planned to be parallel. + */ + scandesc = index_beginscan(node->ss.ss_currentRelation, + node->iss_RelationDesc, + estate->es_snapshot, + node->iss_NumScanKeys, + node->iss_NumOrderByKeys); + + node->iss_ScanDesc = scandesc; + } /* * If no run-time keys to calculate or they are ready, go ahead and @@ -124,8 +142,8 @@ IndexNext(IndexScanState *node) */ if (node->iss_NumRuntimeKeys == 0 || node->iss_RuntimeKeysReady) index_rescan(scandesc, - node->iss_ScanKeys, node->iss_NumScanKeys, - node->iss_OrderByKeys, node->iss_NumOrderByKeys); + node->iss_ScanKeys, node->iss_NumScanKeys, + node->iss_OrderByKeys, node->iss_NumOrderByKeys); } /* diff --git a/src/backend/executor/nodeMotion.c b/src/backend/executor/nodeMotion.c index 17ba977b19e..d615625eb1a 100644 --- a/src/backend/executor/nodeMotion.c +++ b/src/backend/executor/nodeMotion.c @@ -16,6 +16,7 @@ #include "postgres.h" #include "access/heapam.h" +#include "access/parallel.h" #include "nodes/execnodes.h" /* Slice, SliceTable */ #include "cdb/cdbmotion.h" #include "cdb/cdbutil.h" @@ -27,6 +28,7 @@ #include "executor/nodeMotion.h" #include "lib/binaryheap.h" #include "utils/tuplesort.h" +#include "utils/wait_event.h" #include "miscadmin.h" #include "utils/memutils.h" @@ -217,6 +219,7 @@ execMotionSender(MotionState *node) motion->motionType == MOTIONTYPE_GATHER_SINGLE || motion->motionType == MOTIONTYPE_HASH || motion->motionType == MOTIONTYPE_BROADCAST || + motion->motionType == MOTIONTYPE_PARALLEL_BROADCAST || (motion->motionType == MOTIONTYPE_EXPLICIT && motion->segidColIdx > 0)); Assert(node->ps.state->interconnect_context); @@ -312,6 +315,7 @@ execMotionUnsortedReceiver(MotionState *node) motion->motionType == MOTIONTYPE_GATHER_SINGLE || motion->motionType == MOTIONTYPE_HASH || motion->motionType == MOTIONTYPE_BROADCAST || + motion->motionType == MOTIONTYPE_PARALLEL_BROADCAST || (motion->motionType == MOTIONTYPE_EXPLICIT && motion->segidColIdx > 0)); Assert(node->ps.state->motionlayer_context); @@ -681,7 +685,8 @@ ExecInitMotion(Motion *node, EState *estate, int eflags) errmsg("EvalPlanQual can not handle subPlan with Motion node"))); Assert(node->motionID > 0); - Assert(node->motionID < sliceTable->numSlices); + AssertImply(node->senderSliceInfo && node->senderSliceInfo->parallel_workers <= 1, + node->motionID < sliceTable->numSlices); AssertImply(node->motionType == MOTIONTYPE_HASH, node->numHashSegments > 0); parentIndex = estate->currentSliceId; @@ -698,6 +703,7 @@ ExecInitMotion(Motion *node, EState *estate, int eflags) motionstate->stopRequested = false; motionstate->hashExprs = NIL; motionstate->cdbhash = NULL; + motionstate->cdbhashworkers = NULL; /* Look up the sending 
and receiving gang's slice table entries. */ sendSlice = &sliceTable->slices[node->motionID]; @@ -770,6 +776,11 @@ ExecInitMotion(Motion *node, EState *estate, int eflags) motionstate->stopRequested = false; motionstate->numInputSegs = list_length(sendSlice->segments); + /* It should have been set to 1 in FillSliceGangInfo if parallel_workers == 0 */ + Assert(recvSlice->parallel_workers); + + motionstate->parallel_workers = recvSlice->parallel_workers; + /* * Miscellaneous initialization * @@ -799,6 +810,7 @@ ExecInitMotion(Motion *node, EState *estate, int eflags) tupDesc = ExecGetResultType(&motionstate->ps); motionstate->ps.ps_ProjInfo = NULL; + /* numHashSegments is the target locus.numsegments, not multiplied by parallel_workers */ motionstate->numHashSegments = node->numHashSegments; /* Set up motion send data structures */ @@ -807,7 +819,8 @@ { int nkeys; Assert(node->numHashSegments > 0); - Assert(node->numHashSegments <= recvSlice->planNumSegments); + AssertImply(node->senderSliceInfo && node->senderSliceInfo->parallel_workers <= 1, + node->numHashSegments <= recvSlice->planNumSegments); nkeys = list_length(node->hashExprs); if (nkeys > 0) @@ -817,9 +830,23 @@ /* * Create hash API reference */ - motionstate->cdbhash = makeCdbHash(motionstate->numHashSegments, - nkeys, - node->hashFuncs); + motionstate->cdbhash = makeCdbHash( + motionstate->numHashSegments, + nkeys, + node->hashFuncs); + + /* + * Create a hash API reference for workers. + * During a redistribute motion we first hash modulo the number of segments + * to pick the destination segment, and then modulo the number of workers + * to pick the worker within that segment. + */ + if (motionstate->parallel_workers >= 2) + { + motionstate->cdbhashworkers = makeCdbHash( + motionstate->numHashSegments * motionstate->parallel_workers, + nkeys, + node->hashFuncs); + } } /* @@ -977,6 +1004,11 @@ ExecEndMotion(MotionState *node) pfree(node->cdbhash); node->cdbhash = NULL; } + if (node->cdbhashworkers != NULL) + { + pfree(node->cdbhashworkers); + node->cdbhashworkers = NULL; + } /* * Free up this motion node's resources in the Motion Layer. @@ -1147,9 +1179,16 @@ doSendEndOfStream(Motion *motion, MotionState *node) void doSendTuple(Motion *motion, MotionState *node, TupleTableSlot *outerTupleSlot) { - int16 targetRoute; - SendReturnCode sendRC; + int16 targetRoute = 0; + SendReturnCode sendRC = STOP_SENDING; ExprContext *econtext = node->ps.ps_ExprContext; + int parallel_workers; + int i; + + int parentIndex = node->ps.state->currentSliceId; + ExecSlice *recvSlice = &node->ps.state->es_sliceTable->slices[parentIndex]; + parallel_workers = recvSlice->parallel_workers; + Assert(parallel_workers != 0); /* We got a tuple from the child-plan.
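 *
 * Routing illustration (an assumed example, not from the patch): with
 * numHashSegments = 3 and parallel_workers = 2 there are 6 receiving
 * processes, numbered segIdx * parallel_workers + workerIdx, so a tuple
 * hashed to segIdx = 1 with workerIdx = 1 is sent on route 3; a parallel
 * broadcast instead sends one copy to a randomly chosen worker of each
 * segment.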
*/ node->numTuplesFromChild++; @@ -1170,24 +1209,80 @@ doSendTuple(Motion *motion, MotionState *node, TupleTableSlot *outerTupleSlot) { targetRoute = BROADCAST_SEGIDX; } + else if (motion->motionType == MOTIONTYPE_PARALLEL_BROADCAST) + { + int numSegments = recvSlice->planNumSegments; + + Assert(numSegments != 0); + Assert(numSegments % parallel_workers == 0); + + for (i = 0; i < numSegments / parallel_workers; i++) + { + targetRoute = i * parallel_workers + random() % parallel_workers; + + CheckAndSendRecordCache(node->ps.state->motionlayer_context, + node->ps.state->interconnect_context, + motion->motionID, + targetRoute); + sendRC = SendTuple(node->ps.state->motionlayer_context, + node->ps.state->interconnect_context, + motion->motionID, + outerTupleSlot, + targetRoute); + + Assert(sendRC == SEND_COMPLETE || sendRC == STOP_SENDING); + + if (sendRC == STOP_SENDING) + break; + +#ifdef CDB_MOTION_DEBUG + if (sendRC == SEND_COMPLETE && node->numTuplesToAMS <= 20) + { + StringInfoData buf; + + initStringInfo(&buf); + appendStringInfo(&buf, " motion%-3d snd->%-3d, %5d.", + motion->motionID, + targetRoute, + node->numTuplesToAMS); + formatTuple(&buf, outerTupleSlot, node->outputFunArray); + elog(DEBUG3, "%s", buf.data); + pfree(buf.data); + } +#endif + } + + Assert(sendRC == SEND_COMPLETE || sendRC == STOP_SENDING); + + if (sendRC == SEND_COMPLETE) + node->numTuplesToAMS++; + else + node->stopRequested = true; + + return; + } else if (motion->motionType == MOTIONTYPE_HASH) /* Redistribute */ { - uint32 hval = 0; + uint32 segIdx = 0; + uint32 workerIdx = 0; econtext->ecxt_outertuple = outerTupleSlot; + segIdx = evalHashKey(econtext, node->hashExprs, node->cdbhash); - hval = evalHashKey(econtext, node->hashExprs, node->cdbhash); + if (parallel_workers >= 2) + { + workerIdx = evalHashKey(econtext, node->hashExprs, node->cdbhashworkers) / node->numHashSegments; + } #ifdef USE_ASSERT_CHECKING - Assert(hval < node->numHashSegments && + Assert(segIdx < node->numHashSegments && "redistribute destination outside segment array"); #endif /* USE_ASSERT_CHECKING */ - /* - * hashSegIdx takes our uint32 and maps it to an int, and here we - * assign it to an int16. See below. - */ - targetRoute = hval; + if (parallel_workers >= 2) + targetRoute = segIdx * parallel_workers + workerIdx; + else + targetRoute = segIdx; /* * see MPP-2099, let's not run into this one again! NOTE: the diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c index 727435db44b..f0324a4817e 100644 --- a/src/backend/executor/nodeSeqscan.c +++ b/src/backend/executor/nodeSeqscan.c @@ -27,15 +27,19 @@ */ #include "postgres.h" +#include "access/heapam.h" #include "access/relscan.h" +#include "access/session.h" #include "access/tableam.h" #include "executor/execdebug.h" #include "executor/nodeSeqscan.h" #include "utils/rel.h" +#include "utils/builtins.h" #include "nodes/nodeFuncs.h" #include "cdb/cdbaocsam.h" #include "cdb/cdbappendonlyam.h" +#include "cdb/cdbvars.h" static TupleTableSlot *SeqNext(SeqScanState *node); @@ -68,19 +72,58 @@ SeqNext(SeqScanState *node) if (scandesc == NULL) { /* - * We reach here if the scan is not parallel, or if we're serially - * executing a scan that was planned to be parallel. + * parallel scan could be: + * normal mode(Heap, AO, AOCO) and AOCO extract columns mode. */ - /* - * GPDB: we are using table_beginscan_es in order to also initialize the - * scan state with the column info needed for AOCO relations. Check the - * comment in table_beginscan_es() for more info. 
- */ - scandesc = table_beginscan_es(node->ss.ss_currentRelation, - estate->es_snapshot, - node->ss.ps.plan->targetlist, - node->ss.ps.plan->qual); - node->ss.ss_currentScanDesc = scandesc; + if (node->ss.ps.plan->parallel_aware && estate->useMppParallelMode) + { + ParallelTableScanDesc pscan; + ParallelEntryTag tag; + int localSliceId = LocallyExecutingSliceIndex(estate); + INIT_PARALLELENTRYTAG(tag, gp_command_count, localSliceId, gp_session_id); + pscan = GpFetchParallelDSMEntry(tag, node->ss.ps.plan->plan_node_id); + Assert(pscan); + + /* + * GPDB: we are using table_beginscan_es in order to also initialize the + * scan state with the column info needed for AOCO relations. Check the + * comment in table_beginscan_es() for more info. + * table_beginscan_es could also be parallel. + * We need target_list and qual for AOCO extract columns. + * This is a little awkward because of duplicated checks in both the + * scan_begin_extractcolumns AM and the table_beginscan_parallel AM, + * but we shouldn't change upstream's API. + */ + if (node->ss.ss_currentRelation->rd_tableam->scan_begin_extractcolumns) + { + /* try parallel mode for AOCO extract columns */ + scandesc = table_beginscan_es(node->ss.ss_currentRelation, + estate->es_snapshot, + pscan, + node->ss.ps.plan->targetlist, + node->ss.ps.plan->qual); + } + else + { + /* normal parallel mode */ + scandesc = table_beginscan_parallel(node->ss.ss_currentRelation, pscan); + } + + node->ss.ss_currentScanDesc = scandesc; + } + else + { + /* + * We reach here if the scan is not parallel, or if we're serially + * executing a scan that was planned to be parallel. + */ + scandesc = table_beginscan_es(node->ss.ss_currentRelation, + estate->es_snapshot, + NULL, + node->ss.ps.plan->targetlist, + node->ss.ps.plan->qual); + node->ss.ss_currentScanDesc = scandesc; + } if (gp_enable_predicate_pushdown) { if (RelationIsAoRows(node->ss.ss_currentRelation)) @@ -312,6 +355,9 @@ ExecSeqScanInitializeDSM(SeqScanState *node, ParallelTableScanDesc pscan; pscan = shm_toc_allocate(pcxt->toc, node->pscan_len); + + Assert(pscan); + table_parallelscan_initialize(node->ss.ss_currentRelation, pscan, estate->es_snapshot);
diff --git a/src/backend/executor/nodeShareInputScan.c b/src/backend/executor/nodeShareInputScan.c index 840aba17c2c..9c1a31527b5 100644 --- a/src/backend/executor/nodeShareInputScan.c +++ b/src/backend/executor/nodeShareInputScan.c @@ -944,7 +944,8 @@ shareinput_writer_notifyready(shareinput_Xslice_reference *ref) shareinput_Xslice_state *state = ref->xslice_state; uint32 old_ready PG_USED_FOR_ASSERTS_ONLY = pg_atomic_exchange_u32(&state->ready, 1); - Assert(old_ready == 0); + if (old_ready) + elog(ERROR, "shareinput_writer_notifyready() called twice for the same tuplestore"); #ifdef FAULT_INJECTOR SIMPLE_FAULT_INJECTOR("shareinput_writer_notifyready");
diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c index 9937a06bbf4..2eef4161d11 100644 --- a/src/backend/executor/nodeSort.c +++ b/src/backend/executor/nodeSort.c @@ -484,8 +484,8 @@ ExecSquelchSort(SortState *node) if (!node->delayEagerFree) { ExecEagerFreeSort(node); - ExecSquelchNode(outerPlanState(node)); } + ExecSquelchNode(outerPlanState(node)); } /* ----------------------------------------------------------------
diff --git a/src/backend/libpq/be-secure-common.c b/src/backend/libpq/be-secure-common.c index 7d082d78887..b29916ff6b9 100644 --- a/src/backend/libpq/be-secure-common.c +++ b/src/backend/libpq/be-secure-common.c @@ -22,6 +22,7 @@ #include #include +#include
"postmaster/postmaster.h" #include "common/string.h" #include "libpq/libpq.h" #include "storage/fd.h" @@ -61,6 +62,19 @@ run_ssl_passphrase_command(const char *prompt, bool is_server_start, char *buf, appendStringInfoString(&command, prompt); p++; break; + case 'R': + { + char fd_str[20]; + + if (terminal_fd == -1) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("ssl_passphrase_command referenced %%R, but -R not specified"))); + p++; + snprintf(fd_str, sizeof(fd_str), "%d", terminal_fd); + appendStringInfoString(&command, fd_str); + break; + } case '%': appendStringInfoChar(&command, '%'); p++; diff --git a/src/backend/main/main.c b/src/backend/main/main.c index 4892203b430..c018a695eb8 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -336,6 +336,7 @@ help(const char *progname) #endif printf(_(" -N MAX-CONNECT maximum number of allowed connections\n")); printf(_(" -p PORT port number to listen on\n")); + printf(_(" -R fd prompt for the cluster key\n")); printf(_(" -s show statistics after each query\n")); printf(_(" -S WORK-MEM set amount of memory for sorts (in kB)\n")); printf(_(" -V, --version output version information, then exit\n")); @@ -371,8 +372,10 @@ help(const char *progname) printf(_("\nOptions for bootstrapping mode:\n")); printf(_(" --boot selects bootstrapping mode (must be first argument)\n")); printf(_(" DBNAME database name (mandatory argument in bootstrapping mode)\n")); + printf(_(" -K LEN enable cluster file encryption with specified key bit length\n")); printf(_(" -r FILENAME send stdout and stderr to given file\n")); printf(_(" -x NUM internal use\n")); + printf(_(" -u DATADIR copy encryption keys from datadir\n")); printf(_("\nPlease read the documentation for the complete list of run-time\n" "configuration settings and how to set them on the command line or in\n" diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index b1305ae395d..5a40a9484ce 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -138,6 +138,7 @@ _copyPlannedStmt(const PlannedStmt *from) COPY_SCALAR_FIELD(slices[i].parentIndex); COPY_SCALAR_FIELD(slices[i].gangType); COPY_SCALAR_FIELD(slices[i].numsegments); + COPY_SCALAR_FIELD(slices[i].parallel_workers); COPY_SCALAR_FIELD(slices[i].segindex); COPY_SCALAR_FIELD(slices[i].directDispatch.isDirectDispatch); COPY_NODE_FIELD(slices[i].directDispatch.contentIds); @@ -1125,6 +1126,7 @@ _copyHashJoin(const HashJoin *from) COPY_NODE_FIELD(hashcollations); COPY_NODE_FIELD(hashkeys); COPY_NODE_FIELD(hashqualclauses); + COPY_SCALAR_FIELD(batch0_barrier); return newnode; } @@ -1405,6 +1407,7 @@ _copyHash(const Hash *from) COPY_SCALAR_FIELD(skewColumn); COPY_SCALAR_FIELD(skewInherit); COPY_SCALAR_FIELD(rows_total); + COPY_SCALAR_FIELD(sync_barrier); return newnode; } @@ -5421,6 +5424,8 @@ _copySliceTable(const SliceTable *from) COPY_SCALAR_FIELD(slices[i].planNumSegments); COPY_SCALAR_FIELD(slices[i].gangType); COPY_NODE_FIELD(slices[i].segments); + COPY_SCALAR_FIELD(slices[i].useMppParallelMode); + COPY_SCALAR_FIELD(slices[i].parallel_workers); newnode->slices[i].primaryGang = from->slices[i].primaryGang; COPY_SCALAR_FIELD(slices[i].parentIndex); diff --git a/src/backend/nodes/nodeFuncs.c b/src/backend/nodes/nodeFuncs.c index 1aa824c05fb..b30c764d02f 100644 --- a/src/backend/nodes/nodeFuncs.c +++ b/src/backend/nodes/nodeFuncs.c @@ -1997,6 +1997,8 @@ expression_tree_walker(Node *node, case T_AggExprId: case T_RowIdExpr: case T_CTESearchClause: + case T_Gather: + case 
T_GatherMerge: /* primitive node types with no expression subnodes */ break; case T_WithCheckOption: @@ -3540,6 +3542,24 @@ expression_tree_mutator(Node *node, return (Node *) newnode; } break; + case T_Gather: + { + Gather *gather = (Gather *) node; + Gather *newgather; + + FLATCOPY(newgather, gather, Gather); + return (Node *) newgather; + } + break; + case T_GatherMerge: + { + GatherMerge *gathermerge = (GatherMerge *) node; + GatherMerge *newgathermerge; + + FLATCOPY(newgathermerge, gathermerge, GatherMerge); + return (Node *) newgathermerge; + } + break; default: elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 8f9ffb0a808..62b798013be 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -368,6 +368,7 @@ _outPlannedStmt(StringInfo str, const PlannedStmt *node) WRITE_INT_FIELD(slices[i].parentIndex); WRITE_INT_FIELD(slices[i].gangType); WRITE_INT_FIELD(slices[i].numsegments); + WRITE_INT_FIELD(slices[i].parallel_workers); WRITE_INT_FIELD(slices[i].segindex); WRITE_BOOL_FIELD(slices[i].directDispatch.isDirectDispatch); WRITE_NODE_FIELD(slices[i].directDispatch.contentIds); @@ -861,6 +862,7 @@ _outHashJoin(StringInfo str, const HashJoin *node) WRITE_NODE_FIELD(hashcollations); WRITE_NODE_FIELD(hashkeys); WRITE_NODE_FIELD(hashqualclauses); + WRITE_BOOL_FIELD(batch0_barrier); } static void @@ -996,6 +998,7 @@ _outHash(StringInfo str, const Hash *node) WRITE_BOOL_FIELD(skewInherit); WRITE_FLOAT_FIELD(rows_total, "%.0f"); WRITE_BOOL_FIELD(rescannable); /*CDB*/ + WRITE_BOOL_FIELD(sync_barrier); } static void diff --git a/src/backend/nodes/outfuncs_common.c b/src/backend/nodes/outfuncs_common.c index b310fa4d4c2..70a7624ab04 100644 --- a/src/backend/nodes/outfuncs_common.c +++ b/src/backend/nodes/outfuncs_common.c @@ -1394,6 +1394,8 @@ _outSliceTable(StringInfo str, const SliceTable *node) WRITE_NODE_FIELD(slices[i].children); /* List of int index */ WRITE_ENUM_FIELD(slices[i].gangType, GangType); WRITE_NODE_FIELD(slices[i].segments); /* List of int */ + WRITE_BOOL_FIELD(slices[i].useMppParallelMode); + WRITE_INT_FIELD(slices[i].parallel_workers); WRITE_DUMMY_FIELD(slices[i].primaryGang); WRITE_NODE_FIELD(slices[i].primaryProcesses); /* List of (CDBProcess *) */ WRITE_BITMAPSET_FIELD(slices[i].processesMap); diff --git a/src/backend/nodes/print.c b/src/backend/nodes/print.c index 3c863854fea..c5223413071 100644 --- a/src/backend/nodes/print.c +++ b/src/backend/nodes/print.c @@ -582,6 +582,10 @@ plannode_type(Plan *p) return "FOREIGNSCAN"; case T_SplitUpdate: return "SPLITUPDATE"; + case T_Gather: + return "GATHER"; + case T_GatherMerge: + return "GATHERMERGE"; default: return "UNKNOWN"; } diff --git a/src/backend/nodes/readfast.c b/src/backend/nodes/readfast.c index d0d755c7ad3..7a2610ce781 100644 --- a/src/backend/nodes/readfast.c +++ b/src/backend/nodes/readfast.c @@ -979,6 +979,7 @@ _readMotion(void) local_node->motionType == MOTIONTYPE_GATHER_SINGLE || local_node->motionType == MOTIONTYPE_HASH || local_node->motionType == MOTIONTYPE_BROADCAST || + local_node->motionType == MOTIONTYPE_PARALLEL_BROADCAST || local_node->motionType == MOTIONTYPE_EXPLICIT); READ_BOOL_FIELD(sendSorted); diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 6f2cc58b2c3..b8654fb8e9f 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -1642,6 +1642,7 @@ _readPlannedStmt(void) READ_INT_FIELD(slices[i].parentIndex); 
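/*
 * Illustrative reminder (added note, not in the original patch): every new
 * PlanSlice or SliceTable field, such as parallel_workers here, must be
 * kept in sync across _copyPlannedStmt, _outPlannedStmt and
 * _readPlannedStmt, or plans serialized on the QD will not deserialize
 * correctly on the QEs.
 */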
READ_INT_FIELD(slices[i].gangType); READ_INT_FIELD(slices[i].numsegments); + READ_INT_FIELD(slices[i].parallel_workers); READ_INT_FIELD(slices[i].segindex); READ_BOOL_FIELD(slices[i].directDispatch.isDirectDispatch); READ_NODE_FIELD(slices[i].directDispatch.contentIds); @@ -2267,6 +2268,7 @@ _readHashJoin(void) READ_NODE_FIELD(hashcollations); READ_NODE_FIELD(hashkeys); READ_NODE_FIELD(hashqualclauses); + READ_BOOL_FIELD(batch0_barrier); READ_DONE(); } @@ -2500,6 +2502,7 @@ _readHash(void) READ_BOOL_FIELD(skewInherit); READ_FLOAT_FIELD(rows_total); READ_BOOL_FIELD(rescannable); /*CDB*/ + READ_BOOL_FIELD(sync_barrier); READ_DONE(); }
diff --git a/src/backend/nodes/readfuncs_common.c b/src/backend/nodes/readfuncs_common.c index f46e4834166..03dec67f781 100644 --- a/src/backend/nodes/readfuncs_common.c +++ b/src/backend/nodes/readfuncs_common.c @@ -1519,6 +1519,8 @@ _readSliceTable(void) READ_NODE_FIELD(slices[i].children); /* List of int index */ READ_ENUM_FIELD(slices[i].gangType, GangType); READ_NODE_FIELD(slices[i].segments); /* List of int index */ + READ_BOOL_FIELD(slices[i].useMppParallelMode); + READ_INT_FIELD(slices[i].parallel_workers); local_node->slices[i].primaryGang = NULL; READ_NODE_FIELD(slices[i].primaryProcesses); /* List of (CDBProcess *) */ READ_BITMAPSET_FIELD(slices[i].processesMap);
diff --git a/src/backend/optimizer/README.cbdb.parallel b/src/backend/optimizer/README.cbdb.parallel new file mode 100644 index 00000000000..6cdc0d6d56d --- /dev/null +++ b/src/backend/optimizer/README.cbdb.parallel @@ -0,0 +1,53 @@ +src/backend/optimizer/README.cbdb.parallel + +Cloudberry parallel query is based on Postgres parallel query. +Most mechanisms are the same; refer to "Parallel Query and Partial Paths" in src/backend/optimizer/README. +The main difference is that Postgres has a Gather/GatherMerge node, whose leader process launches any number +of workers to execute a plan (PG style), while Cloudberry has no such node. + +Cloudberry treats all workers equally. They execute a plan node together, with synchronization where needed +to keep things correct, e.g. when building a shared hash table. +That is called GP style. +GP style launches workers exactly as a non-parallel plan would, except that the gang size is expanded by a +factor when a top path node has parallel_workers > 1. + +The reasons we chose GP style rather than PG style, or a mix of the two, are complex. +We encountered many problems when mixing them, did not have enough time to enable both, and could not tell +how much benefit that would bring. +1. A PG style Gather/GatherMerge node launches worker processes that lack GP's QE context: distributed +transactions, distributed snapshots, GP roles and much more. If we mixed the styles, normal QE processes +would run alongside worker processes launched by a Gather node, unaware of each other. +2. PG style Gather/GatherMerge has a locus issue. +The locus of a Gather node may differ from its child's. For example, a parallel scan on a hash-distributed +table has all data hashed to the same segments when taken as a whole (Hashed locus), but each process holds +only partial data (HashedWorkers locus). The Gather node should have the Hashed locus in that situation, but +things become complex when joining with other loci, or when there is a Motion node below. +3. GP style can parallelize the plan as late as possible, up to the final Gather (to the QD, or to a QE in +the middle), but PG style gathers workers in apply_scanjoin_target_to_path because it cannot compute the +final scan/join target in parallel workers. That is PG's last opportunity to use any partial paths that +exist: it empties partial_pathlist and moves all paths to pathlist, so a path there cannot participate in a +later parallel join as the outer path, e.g. a parallel-aware hash join with a shared table. GP style can +keep partial paths in partial_pathlist because a Gather Motion sits on top. + + +Parallel locus. +Making loci compatible in parallel mode is more complex than in a non-parallel plan. +We may have to add Motions even when the distributions are the same, because the paths have different +parallel_workers. For example: +create table t1(b int) with(parallel_workers=3) distributed by (b); +create table t2(a int) with(parallel_workers=2) distributed by (a); +gpadmin=# explain(costs off) select * from t1 right join t2 on t1.b = t2.a; + QUERY PLAN +------------------------------------------------------------------ + Gather Motion 6:1 (slice1; segments: 6) + -> Parallel Hash Left Join + Hash Cond: (t2.a = t1.b) + -> Parallel Seq Scan on t2 + -> Parallel Hash + -> Redistribute Motion 9:6 (slice2; segments: 9) + Hash Key: t1.b + Hash Module: 3 + -> Parallel Seq Scan on t1 + +See function cdb_motion_for_parallel_join() for details.
diff --git a/src/backend/optimizer/geqo/geqo_eval.c b/src/backend/optimizer/geqo/geqo_eval.c index 2ecba83490f..d39b6566b42 100644 --- a/src/backend/optimizer/geqo/geqo_eval.c +++ b/src/backend/optimizer/geqo/geqo_eval.c @@ -274,7 +274,9 @@ merge_clump(PlannerInfo *root, List *clumps, Clump *new_clump, int num_gene, * grouping_planner). */ if (old_clump->size + new_clump->size < num_gene) + #if 0 generate_useful_gather_paths(root, joinrel, false); + #endif /* Find and save the cheapest paths for this joinrel */ set_cheapest(joinrel);
diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index c2e23b6ecef..14f622d08ed 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -22,6 +22,7 @@ #include "access/sysattr.h" #include "access/tsmapi.h" +#include "catalog/catalog.h" #include "catalog/pg_class.h" #include "catalog/pg_operator.h" #include "catalog/pg_proc.h" @@ -52,6 +53,8 @@ #include "rewrite/rewriteManip.h" #include "utils/guc.h" #include "utils/lsyscache.h" +#include "utils/rel.h" +#include "utils/syscache.h" #include "cdb/cdbmutate.h" /* cdbmutate_warn_ctid_without_segid */ #include "cdb/cdbpath.h" /* cdbpath_rows() */ @@ -519,6 +522,8 @@ bring_to_outer_query(PlannerInfo *root, RelOptInfo *rel, List *outer_quals) rel->cheapest_unique_path = NULL; rel->cheapest_parameterized_paths = NIL; rel->pathlist = NIL; + /* GPDB_PARALLEL_FIXME: Need to clear partial_pathlist before we enable the OuterQuery locus in parallel mode */ + rel->partial_pathlist = NIL; foreach(lc, origpathlist) { @@ -646,6 +651,17 @@ bring_to_singleQE(PlannerInfo *root, RelOptInfo *rel) add_path(rel, path, root); } + /* + * GP_PARALLEL_FIXME: + * If we need to bring a path to a single QE, which is common in lateral + * joins with GROUP BY or LIMIT, we had better set the partial pathlist + * to NIL to make sure the SingleQE locus is satisfied in upper paths. + * + * It's not trivial to apply the SingleQE locus constraint to parallel + * paths in the current code. We should revisit this later.
+ */ + rel->partial_pathlist = NIL; set_cheapest(rel); } @@ -772,7 +788,10 @@ set_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, (*set_rel_pathlist_hook) (root, rel, rti, rte); if (rel->upperrestrictinfo) + { bring_to_outer_query(root, rel, rel->upperrestrictinfo); + /* GP_PARALLEL_FIXME: enable parallel outer query? */ + } else if (root->config->force_singleQE) { /* @@ -800,9 +819,11 @@ set_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, * we postpone gathering until the final scan/join targetlist is available * (see grouping_planner). */ +#if 0 if (rel->reloptkind == RELOPT_BASEREL && bms_membership(root->all_baserels) != BMS_SINGLETON) generate_useful_gather_paths(root, rel, false); +#endif /* Now find the cheapest of the paths for this rel */ set_cheapest(rel); @@ -898,6 +919,12 @@ set_rel_consider_parallel(PlannerInfo *root, RelOptInfo *rel, return; } + /* + * GP_PARALLEL_FIXME: GPDB doesn't allow parallelism for system catalog relations. + */ + if (IsSystemClassByRelid(rte->relid)) + return; + /* * There are additional considerations for appendrels, which we'll * deal with in set_append_rel_size and set_append_rel_pathlist. @@ -1056,13 +1083,16 @@ create_plain_partial_paths(PlannerInfo *root, RelOptInfo *rel) { int parallel_workers; - parallel_workers = compute_parallel_worker(rel, rel->pages, -1, + parallel_workers = compute_parallel_worker(root, rel, rel->pages, -1, max_parallel_workers_per_gather); /* If any limit was set to zero, the user doesn't want a parallel scan. */ - if (parallel_workers <= 0) + /* In GPDB parallel mode, parallel_workers <= 1 is pointless. */ + if (parallel_workers <= 1) return; + /* GPDB_PARALLEL_FIXME: update locus.parallel_workers? */ + /* Add an unordered partial path based on a parallel sequential scan. */ add_partial_path(rel, create_seqscan_path(root, rel, NULL, parallel_workers)); } @@ -1638,6 +1668,12 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, accumulate_append_subpath(nppath, &pa_nonpartial_subpaths, NULL); + /* + * GPDB_PARALLEL_FIXME: we can't use parallel append if the subpath + * is not parallel safe. + */ + if (!nppath->parallel_safe) + pa_subpaths_valid = false; } } @@ -1733,7 +1769,14 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, parallel_workers = Max(parallel_workers, path->parallel_workers); } + /* + * GPDB_PARALLEL_FIXME: this assertion still cannot be re-enabled after + * handling Append, because we currently allow paths without + * parallel_workers to be added to the partial pathlist. + */ +#if 0 Assert(parallel_workers > 0); +#endif /* * If the use of parallel append is permitted, always request at least @@ -1751,22 +1794,33 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, parallel_workers = Min(parallel_workers, max_parallel_workers_per_gather); } - Assert(parallel_workers > 0); - - /* Generate a partial append path. */ - appendpath = create_append_path(root, rel, NIL, partial_subpaths, - NIL, NULL, parallel_workers, - enable_parallel_append, - -1); - /* - * Make sure any subsequent partial paths use the same row count - * estimate. + * GPDB_PARALLEL_FIXME: this assertion still cannot be re-enabled after + * handling Append, because we currently allow paths without + * parallel_workers to be added to the partial pathlist. */ - partial_rows = appendpath->path.rows; +#if 0 + Assert(parallel_workers > 0); +#endif - /* Add the path. */ - add_partial_path(rel, (Path *) appendpath); + /* In GPDB parallel mode, parallel_workers <= 1 is pointless. */ + if (parallel_workers > 1) + { + /* Generate a partial append path.
*/ + appendpath = create_append_path(root, rel, NIL, partial_subpaths, + NIL, NULL, parallel_workers, + enable_parallel_append, + -1); + + /* + * Make sure any subsequent partial paths use the same row count + * estimate. + */ + partial_rows = appendpath->path.rows; + /* Add the path if subpath has not Motion.*/ + if (appendpath->path.parallel_safe && appendpath->path.motionHazard == false) + add_partial_path(rel, (Path *)appendpath); + } } /* @@ -1803,11 +1857,16 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, max_parallel_workers_per_gather); Assert(parallel_workers > 0); - appendpath = create_append_path(root, rel, pa_nonpartial_subpaths, - pa_partial_subpaths, - NIL, NULL, parallel_workers, true, - partial_rows); - add_partial_path(rel, (Path *) appendpath); + /* GPDB parallel, parallel_workers <= 1 is bogus */ + if (parallel_workers > 1) + { + appendpath = create_append_path(root, rel, pa_nonpartial_subpaths, + pa_partial_subpaths, + NIL, NULL, parallel_workers, true, + partial_rows); + if (appendpath->path.parallel_safe && appendpath->path.motionHazard == false) + add_partial_path(rel, (Path *) appendpath); + } } /* @@ -1895,7 +1954,9 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, NIL, NULL, path->parallel_workers, true, partial_rows); - add_partial_path(rel, (Path *) appendpath); + + if (appendpath->path.parallel_safe && appendpath->path.motionHazard == false) + add_partial_path(rel, (Path *) appendpath); } } } @@ -2518,7 +2579,7 @@ set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel, CdbPathLocus locus; if (forceDistRand) - CdbPathLocus_MakeStrewn(&locus, getgpsegmentCount()); + CdbPathLocus_MakeStrewn(&locus, getgpsegmentCount(), subpath->parallel_workers); else locus = cdbpathlocus_from_subquery(root, rel, subpath); @@ -2557,11 +2618,13 @@ set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel, foreach(lc, sub_final_rel->partial_pathlist) { Path *subpath = (Path *) lfirst(lc); + Path *path; List *pathkeys; + List *l; CdbPathLocus locus; if (forceDistRand) - CdbPathLocus_MakeStrewn(&locus, getgpsegmentCount()); + CdbPathLocus_MakeStrewn(&locus, getgpsegmentCount(), subpath->parallel_workers); else locus = cdbpathlocus_from_subquery(root, rel, subpath); @@ -2572,11 +2635,14 @@ set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel, make_tlist_from_pathtarget(subpath->pathtarget)); /* Generate outer path using this subpath */ - add_partial_path(rel, (Path *) - create_subqueryscan_path(root, rel, subpath, + path = (Path *) create_subqueryscan_path(root, rel, subpath, pathkeys, locus, - required_outer)); + required_outer); + /* turn into SingleQE if needed */ + l = lappend(list_make1(subquery->havingQual), subpath->pathtarget->exprs); + path = turn_volatile_seggen_to_singleqe(root, path, (Node *) l); + add_partial_path(rel, path); } } } @@ -3179,6 +3245,7 @@ set_worktable_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) void generate_gather_paths(PlannerInfo *root, RelOptInfo *rel, bool override_rows) { + Assert(false); Path *cheapest_partial_path; Path *simple_gather_path; ListCell *lc; @@ -3199,6 +3266,10 @@ generate_gather_paths(PlannerInfo *root, RelOptInfo *rel, bool override_rows) * of partial_pathlist because of the way add_partial_path works. 
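 *
 * Note for illustration (an assumption, not original patch text): in
 * Cloudberry this function is expected to be dead code (it now starts
 * with Assert(false) and its call sites are compiled out), because the
 * final gather is performed by a Gather Motion instead, as described in
 * README.cbdb.parallel.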
 	cheapest_partial_path = linitial(rel->partial_pathlist);
+
+	if (!cheapest_partial_path->parallel_safe)
+		return;
+
 	rows =
 		cheapest_partial_path->rows * cheapest_partial_path->parallel_workers;
 	simple_gather_path = (Path *)
@@ -3317,6 +3388,7 @@ get_useful_pathkeys_for_relation(PlannerInfo *root, RelOptInfo *rel,
 void
 generate_useful_gather_paths(PlannerInfo *root, RelOptInfo *rel, bool override_rows)
 {
+	Assert(false);
 	ListCell   *lc;
 	double		rows;
 	double	   *rowsp = NULL;
@@ -3622,6 +3694,7 @@ make_rel_from_joinlist(PlannerInfo *root, List *joinlist)
 		 * already.
 		 */
 		bring_to_outer_query(root, rel, NIL);
+		/* GP_PARALLEL_FIXME: enable parallel outer query? */
 	}
 
 	return rel;
@@ -3734,12 +3807,15 @@ standard_join_search(PlannerInfo *root, int levels_needed, List *initial_rels)
 			 * partial paths.  We'll do the same for the topmost scan/join rel
 			 * once we know the final targetlist (see grouping_planner).
 			 */
+#if 0
 			if (lev < levels_needed)
 				generate_useful_gather_paths(root, rel, false);
+#endif
 
 			if (bms_equal(rel->relids, root->all_baserels) &&
 				root->is_correlated_subplan)
 			{
 				bring_to_outer_query(root, rel, NIL);
+				/* GP_PARALLEL_FIXME: enable parallel outer query? */
 			}
 
 			/* Find and save the cheapest paths for this rel */
@@ -4517,10 +4593,17 @@ create_partial_bitmap_paths(PlannerInfo *root, RelOptInfo *rel,
 	pages_fetched = compute_bitmap_pages(root, rel, bitmapqual, 1.0,
 										 NULL, NULL);
 
-	parallel_workers = compute_parallel_worker(rel, pages_fetched, -1,
+
+	/*
+	 * Parallel BitmapScan is not supported for AO/AOCS tables.
+	 */
+	if (rel->reloptkind == RELOPT_BASEREL && (AMHandlerIsAO(rel->amhandler)))
+		return;
+
+	parallel_workers = compute_parallel_worker(root, rel, pages_fetched, -1,
 											   max_parallel_workers_per_gather);
 
-	if (parallel_workers <= 0)
+	if (parallel_workers <= 1)
 		return;
 
 	add_partial_path(rel, (Path *) create_bitmap_heap_path(root, rel,
@@ -4543,7 +4626,7 @@ create_partial_bitmap_paths(PlannerInfo *root, RelOptInfo *rel,
 * comes from a GUC.
 */
int
-compute_parallel_worker(RelOptInfo *rel, double heap_pages, double index_pages,
+compute_parallel_worker(PlannerInfo *root, RelOptInfo *rel, double heap_pages, double index_pages,
						int max_workers)
{
	int			parallel_workers = 0;
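The hunk below replaces the page-count heuristic with a segment-file-count rule for AO/AOCO tables, since AO page counts are hard to estimate. A condensed standalone sketch of that rule (the Min() macro mirrors PostgreSQL's; all names here are illustrative, not the patch's code):

#include <stdbool.h>
#include <stdio.h>

#define Min(x, y) ((x) < (y) ? (x) : (y))

/* One worker per AO segment file, capped by the caller's limit; index scans
 * and single-worker results are disabled entirely. */
static int
ao_parallel_workers(int segfilecount, int max_workers, bool uses_index)
{
	int			workers = Min(segfilecount, max_workers);

	if (workers == 1 || uses_index)
		workers = 0;
	return workers;
}

int
main(void)
{
	printf("%d\n", ao_parallel_workers(8, 4, false));	/* 4 */
	printf("%d\n", ao_parallel_workers(1, 4, false));	/* 0: one worker is pointless */
	printf("%d\n", ao_parallel_workers(8, 4, true));	/* 0: index scan involved */
	return 0;
}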
@@ -4557,67 +4640,108 @@ compute_parallel_worker(RelOptInfo *rel, double heap_pages, double index_pages,
 	else
 	{
 		/*
-		 * If the number of pages being scanned is insufficient to justify a
-		 * parallel scan, just return zero ... unless it's an inheritance
-		 * child.  In that case, we want to generate a parallel path here
-		 * anyway.  It might not be worthwhile just for this relation, but
-		 * when combined with all of its inheritance siblings it may well pay
-		 * off.
+		 * We need to reconsider parallel workers for AO/AOCO tables, because
+		 * the page count of an AO table is quite hard to estimate.  Instead,
+		 * parallelism for AO/AOCO tables is based on the segment file count
+		 * in pg_appendonly, which is updated by analyze/vacuum/truncate.
 		 */
-	if (rel->reloptkind == RELOPT_BASEREL &&
-		((heap_pages >= 0 && heap_pages < min_parallel_table_scan_size) ||
-		 (index_pages >= 0 && index_pages < min_parallel_index_scan_size)))
-		return 0;
-
-	if (heap_pages >= 0)
+	if (rel->reloptkind == RELOPT_BASEREL && (AMHandlerIsAO(rel->amhandler)))
 	{
-		int			heap_parallel_threshold;
-		int			heap_parallel_workers = 1;
+		Oid			aorelid = root->simple_rte_array[rel->relid]->relid;
+		HeapTuple	aotup;
+		Form_pg_appendonly aoform;
+
+		aotup = SearchSysCache1(AORELID, ObjectIdGetDatum(aorelid));
+		if (!HeapTupleIsValid(aotup))
+			ereport(ERROR,
+					(errcode(ERRCODE_UNDEFINED_OBJECT),
+					 errmsg("appendonly table relid %u does not exist in pg_appendonly", aorelid)));
+
+		aoform = (Form_pg_appendonly) GETSTRUCT(aotup);
+		Assert(aoform->segfilecount >= 0);
+		parallel_workers = Min(aoform->segfilecount, max_workers);
+		ReleaseSysCache(aotup);
 
 		/*
-		 * Select the number of workers based on the log of the size of
-		 * the relation.  This probably needs to be a good deal more
-		 * sophisticated, but we need something here for now.  Note that
-		 * the upper limit of the min_parallel_table_scan_size GUC is
-		 * chosen to prevent overflow here.
+		 * Disable parallelism for AO/AOCO when:
+		 * 1. an IndexScan/IndexOnlyScan is involved, since we support neither
+		 *    the parallel nor the non-parallel variants there, or
+		 * 2. parallel_workers is 1, which is pointless in GP parallel mode.
 		 */
-		heap_parallel_threshold = Max(min_parallel_table_scan_size, 1);
-		while (heap_pages >= (BlockNumber) (heap_parallel_threshold * 3))
-		{
-			heap_parallel_workers++;
-			heap_parallel_threshold *= 3;
-			if (heap_parallel_threshold > INT_MAX / 3)
-				break;			/* avoid overflow */
-		}
-
-		parallel_workers = heap_parallel_workers;
+		if (parallel_workers == 1 || index_pages >= 0)
+			parallel_workers = 0;
 	}
-
-	if (index_pages >= 0)
+	else
 	{
-		int			index_parallel_workers = 1;
-		int			index_parallel_threshold;
+		/*
+		 * If the number of pages being scanned is insufficient to justify a
+		 * parallel scan, just return zero ... unless it's an inheritance
+		 * child.  In that case, we want to generate a parallel path here
+		 * anyway.  It might not be worthwhile just for this relation, but
+		 * when combined with all of its inheritance siblings it may well pay
+		 * off.
+		 */
+		if (rel->reloptkind == RELOPT_BASEREL &&
+			((heap_pages >= 0 && heap_pages < min_parallel_table_scan_size) ||
+			 (index_pages >= 0 && index_pages < min_parallel_index_scan_size)))
+			return 0;
 
-		/* same calculation as for heap_pages above */
-		index_parallel_threshold = Max(min_parallel_index_scan_size, 1);
-		while (index_pages >= (BlockNumber) (index_parallel_threshold * 3))
+		if (heap_pages >= 0)
 		{
-			index_parallel_workers++;
-			index_parallel_threshold *= 3;
-			if (index_parallel_threshold > INT_MAX / 3)
-				break;			/* avoid overflow */
+			int			heap_parallel_threshold;
+			int			heap_parallel_workers = 1;
+
+			/*
+			 * Select the number of workers based on the log of the size of
+			 * the relation.  This probably needs to be a good deal more
+			 * sophisticated, but we need something here for now.  Note that
+			 * the upper limit of the min_parallel_table_scan_size GUC is
+			 * chosen to prevent overflow here.
+			 */
+			heap_parallel_threshold = Max(min_parallel_table_scan_size, 1);
+			while (heap_pages >= (BlockNumber) (heap_parallel_threshold * 3))
+			{
+				heap_parallel_workers++;
+				heap_parallel_threshold *= 3;
+				if (heap_parallel_threshold > INT_MAX / 3)
+					break;		/* avoid overflow */
+			}
+
+			parallel_workers = heap_parallel_workers;
 		}
 
-		if (parallel_workers > 0)
-			parallel_workers = Min(parallel_workers, index_parallel_workers);
-		else
-			parallel_workers = index_parallel_workers;
+		if (index_pages >= 0)
+		{
+			int			index_parallel_workers = 1;
+			int			index_parallel_threshold;
+
+			/* same calculation as for heap_pages above */
+			index_parallel_threshold = Max(min_parallel_index_scan_size, 1);
+			while (index_pages >= (BlockNumber) (index_parallel_threshold * 3))
+			{
+				index_parallel_workers++;
+				index_parallel_threshold *= 3;
+				if (index_parallel_threshold > INT_MAX / 3)
+					break;		/* avoid overflow */
+			}
+
+			if (parallel_workers > 0)
+				parallel_workers = Min(parallel_workers, index_parallel_workers);
+			else
+				parallel_workers = index_parallel_workers;
+		}
 	}
 }
 
 	/* In no case use more than caller supplied maximum number of workers */
 	parallel_workers = Min(parallel_workers, max_workers);
 
+	/*
+	 * GPDB parallel mode doesn't have a leader process.  parallel_workers=1
+	 * may produce a CdbLocusType_HashedWorkers locus, which makes the plan
+	 * generate a Motion node that is not necessary.  So we disable
+	 * parallel_workers=1 in GPDB parallel mode.
+	 */
+	if (parallel_workers == 1)
+		parallel_workers = 0;
+
 	return parallel_workers;
 }
@@ -4964,12 +5088,18 @@ print_path(PlannerInfo *root, Path *path, int indent)
 			case CdbLocusType_SegmentGeneral:
 				ltype = "SegmentGeneral";
 				break;
+			case CdbLocusType_SegmentGeneralWorkers:
+				ltype = "SegmentGeneralWorkers";
+				break;
 			case CdbLocusType_Replicated:
 				ltype = "Replicated";
 				break;
 			case CdbLocusType_Hashed:
 				ltype = "Hashed";
 				break;
+			case CdbLocusType_HashedWorkers:
+				ltype = "HashedWorkers";
+				break;
 			case CdbLocusType_HashedOJ:
 				ltype = "HashedOJ";
 				break;
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index d7bad89a240..2758c2cdb88 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -825,7 +825,7 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count,
 		 * sequential as for parallel scans the pages are accessed in random
 		 * order.
 		 */
-		path->path.parallel_workers = compute_parallel_worker(baserel_orig,
+		path->path.parallel_workers = compute_parallel_worker(root, baserel_orig,
 											rand_heap_pages, index_pages,
 											max_parallel_workers_per_gather);
 
@@ -835,7 +835,7 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count,
 		 * such a case this path will be rejected.  So there is no benefit in
 		 * doing extra computation.
 		 */
-		if (path->path.parallel_workers <= 0)
+		if (path->path.parallel_workers <= 1)
 			return;
 
 		path->path.parallel_aware = true;
@@ -1540,9 +1540,11 @@ cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root,
 	Assert(baserel->relid > 0);
 	Assert(baserel->rtekind == RTE_SUBQUERY);
 
-	/* Adjust row count if this runs in multiple segments */
+	/* Adjust row count if this runs in multiple segments and in parallel mode */
 	if (CdbPathLocus_IsPartitioned(path->path.locus))
+	{
 		numsegments = CdbPathLocus_NumSegments(path->path.locus);
+	}
 	else
 		numsegments = 1;
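The worker-growth loops kept in compute_parallel_worker() above implement upstream's logarithmic rule: one worker, plus one more each time the page count triples past the scan-size threshold, i.e. roughly 1 + log3(pages / threshold). A self-contained demonstration (illustrative only; the real code also guards against integer overflow with INT_MAX / 3):

#include <stdio.h>

/* Workers grow by one each time the page count triples past the threshold. */
static int
workers_for_pages(long pages, long threshold)
{
	int			workers = 1;

	while (pages >= threshold * 3)
	{
		workers++;
		threshold *= 3;
	}
	return workers;
}

int
main(void)
{
	long		threshold = 1024;	/* stand-in for min_parallel_table_scan_size */
	long		pages;

	for (pages = 1024; pages <= 1024L * 81; pages *= 3)
		printf("%6ld pages -> %d workers\n",
			   pages, workers_for_pages(pages, threshold));
	return 0;					/* prints 1, 2, 3, 4, 5 workers */
}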
@@ -4072,7 +4074,30 @@ initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace,
 	 * number, so we need to undo the division.
 	 */
 	if (parallel_hash)
-		inner_path_rows_total *= get_parallel_divisor(inner_path);
+	{
+		/*
+		 * GPDB
+		 * For GP style parallel, the inner path's locus could be
+		 * ReplicatedWorkers:
+		 *
+		 *    Join
+		 *    /  \
+		 * Outer  ParallelHash
+		 *           \
+		 *   ParallelBroadcastMotion
+		 *             \
+		 *          origin_inner
+		 *
+		 * In this case, inner_path.rows has already taken parallelism into
+		 * account.  We shouldn't apply the parallel_divisor again, or the
+		 * hash table estimate ends up much larger than it really is.  The
+		 * side effects would be:
+		 * 1. Estimating and allocating far more memory for the shared hash
+		 *    table than needed.
+		 * 2. Choosing MergeJoin instead of HashJoin whenever the planner
+		 *    considers the inner table too big.
+		 */
+		if (!CdbPathLocus_IsReplicatedWorkers(inner_path->locus))
+			inner_path_rows_total *= get_parallel_divisor(inner_path);
+	}
 
 	/*
 	 * Get hash table size that executor would use for inner relation.
@@ -6780,6 +6805,17 @@ get_parallel_divisor(Path *path)
 	 * its time servicing each worker, and the remainder executing the
 	 * parallel plan.
 	 */
+	/*
+	 * GPDB parallel: we don't have a leader process like upstream.  The
+	 * parallel_divisor is used to estimate rows, so leave it equal to the
+	 * path's parallel_workers, which may sometimes be 0; return 1 in that
+	 * case so callers don't end up dividing by zero and getting INF.  This
+	 * has no impact on the parallel_workers of path nodes.
+	 */
+	if (parallel_divisor == 0)
+		parallel_divisor = 1;
+#if 0
 	if (parallel_leader_participation)
 	{
 		double		leader_contribution;
@@ -6788,6 +6824,7 @@ get_parallel_divisor(Path *path)
 		if (leader_contribution > 0)
 			parallel_divisor += leader_contribution;
 	}
+#endif
 
 	return parallel_divisor;
 }
diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c
index 48aefe6cb7e..cdb17fcdcd9 100644
--- a/src/backend/optimizer/path/indxpath.c
+++ b/src/backend/optimizer/path/indxpath.c
@@ -355,7 +355,7 @@ create_index_paths(PlannerInfo *root, RelOptInfo *rel)
 			add_path(rel, (Path *) bpath, root);
 
 		/* create a partial bitmap heap path */
-		if (rel->consider_parallel && rel->lateral_relids == NULL)
+		if (rel->consider_parallel && bitmapqual->parallel_safe && rel->lateral_relids == NULL)
 			create_partial_bitmap_paths(root, rel, bitmapqual);
 	}
diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c
index 99cc9cd9b04..5e8a613f952 100644
--- a/src/backend/optimizer/path/joinpath.c
+++ b/src/backend/optimizer/path/joinpath.c
@@ -29,6 +29,7 @@
 #include "optimizer/tlist.h"
 #include "utils/lsyscache.h"
 #include "utils/typcache.h"
+#include "utils/guc.h"
 #include "executor/nodeHash.h"	/* ExecHashRowSize() */
 #include "cdb/cdbpath.h"		/* cdbpath_rows() */
@@ -780,6 +781,7 @@ try_partial_nestloop_path(PlannerInfo *root,
 						  JoinPathExtraData *extra)
 {
 	JoinCostWorkspace workspace;
+	Path	   *nestloop_path;
 
 	/*
 	 * If the inner path is parameterized, the parameterization must be fully
@@ -834,19 +836,26 @@ try_partial_nestloop_path(PlannerInfo *root,
 	}
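To see what the leaderless get_parallel_divisor() above changes, compare the totals reconstructed from a per-worker row estimate. Upstream adds a fractional leader contribution of 1 - 0.3 * workers; GPDB's divisor is just the worker count, floored at 1 to avoid dividing by zero. A small worked example (standalone, illustrative values only):

#include <stdio.h>

int
main(void)
{
	double		rows_per_worker = 1000.0;
	int			workers = 3;

	/* Upstream: divisor = workers + leader share, leader = 1 - 0.3 * workers. */
	double		leader = 1.0 - 0.3 * workers;
	double		upstream_divisor = workers + (leader > 0 ? leader : 0);

	/* GPDB: no leader; the divisor is just the worker count, floored at 1. */
	double		gpdb_divisor = workers > 0 ? workers : 1;

	printf("upstream total: %.0f\n", rows_per_worker * upstream_divisor);	/* 3100 */
	printf("gpdb total:     %.0f\n", rows_per_worker * gpdb_divisor);		/* 3000 */
	return 0;
}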
 	/* Might be good enough to be worth trying, so let's try it. */
-	add_partial_path(joinrel, (Path *)
-					 create_nestloop_path(root,
-										  joinrel,
-										  jointype,
-										  orig_jointype,
-										  &workspace,
-										  extra,
-										  outer_path,
-										  inner_path,
-										  extra->restrictlist,
-										  extra->redistribution_clauses,
-										  pathkeys,
-										  NULL));
+	nestloop_path = create_nestloop_path(root,
+										 joinrel,
+										 jointype,
+										 orig_jointype,
+										 &workspace,
+										 extra,
+										 outer_path,
+										 inner_path,
+										 extra->restrictlist,
+										 extra->redistribution_clauses,
+										 pathkeys,
+										 NULL);
+
+	/*
+	 * The final path might not be parallel safe because of added Motions.
+	 */
+	if (nestloop_path && nestloop_path->parallel_safe)
+	{
+		add_partial_path(joinrel, nestloop_path);
+	}
 }
 
 /*
@@ -1003,23 +1012,24 @@ try_partial_mergejoin_path(PlannerInfo *root,
 	if (!add_partial_path_precheck(joinrel, workspace.total_cost, pathkeys))
 		return;
 
+	Path	   *path = create_mergejoin_path(root,
+											 joinrel,
+											 jointype,
+											 orig_jointype,
+											 &workspace,
+											 extra,
+											 outer_path,
+											 inner_path,
+											 extra->restrictlist,
+											 pathkeys,
+											 NULL,
+											 mergeclauses,
+											 extra->redistribution_clauses,
+											 outersortkeys,
+											 innersortkeys);
+
 	/* Might be good enough to be worth trying, so let's try it. */
-	add_partial_path(joinrel, (Path *)
-					 create_mergejoin_path(root,
-										   joinrel,
-										   jointype,
-										   orig_jointype,
-										   &workspace,
-										   extra,
-										   outer_path,
-										   inner_path,
-										   extra->restrictlist,
-										   pathkeys,
-										   NULL,
-										   mergeclauses,
-										   extra->redistribution_clauses,
-										   outersortkeys,
-										   innersortkeys));
+	if (path && path->parallel_safe)
+		add_partial_path(joinrel, (Path *) path);
 }
 
 /*
@@ -1078,7 +1088,8 @@ try_hashjoin_path(PlannerInfo *root,
 									  extra->restrictlist,
 									  required_outer,
 									  extra->redistribution_clauses,
-									  hashclauses),
+									  hashclauses,
+									  false),
				 root);
 	}
 	else
@@ -1109,6 +1120,7 @@ try_partial_hashjoin_path(PlannerInfo *root,
 						  bool parallel_hash)
 {
 	JoinCostWorkspace workspace;
+	Path	   *hashpath;
 
 	/*
 	 * If the inner path is parameterized, the parameterization must be fully
@@ -1134,21 +1146,57 @@ try_partial_hashjoin_path(PlannerInfo *root,
 	if (!add_partial_path_precheck(joinrel, workspace.total_cost, NIL))
 		return;
 
-	/* Might be good enough to be worth trying, so let's try it. */
-	add_partial_path(joinrel, (Path *)
-					 create_hashjoin_path(root,
-										  joinrel,
-										  jointype,
-										  orig_jointype,
-										  &workspace,
-										  extra,
-										  outer_path,
-										  inner_path,
-										  parallel_hash,
-										  extra->restrictlist,
-										  NULL,
-										  extra->redistribution_clauses,
-										  hashclauses));
+	/*
+	 * GPDB_PARALLEL_FIXME
+	 * Customers hit an issue where, with parallel hash, broadcasting the
+	 * smaller table can be worse than redistributing the big one.  So we
+	 * also add a path which doesn't try broadcast where possible, and let
+	 * the path costs decide which one is better.
+	 */
+	if (parallel_hash)
+	{
+		hashpath = create_hashjoin_path(root,
+										joinrel,
+										jointype,
+										orig_jointype,
+										&workspace,
+										extra,
+										outer_path,
+										inner_path,
+										true,
+										extra->restrictlist,
+										NULL,
+										extra->redistribution_clauses,
+										hashclauses,
+										true);	/* don't use broadcast */
+		if (hashpath && hashpath->parallel_safe)
+			add_partial_path(joinrel, hashpath);
+	}
+
+	/*
+	 * GPDB_PARALLEL_FIXME:
+	 * For parallel hash, only consider the broadcast-capable variant when
+	 * the GUC parallel_hash_enable_motion_broadcast is on.
+	 */
+	if (parallel_hash && !parallel_hash_enable_motion_broadcast)
+		return;
+
+	hashpath = create_hashjoin_path(root,
+									joinrel,
+									jointype,
+									orig_jointype,
+									&workspace,
+									extra,
+									outer_path,
+									inner_path,
+									parallel_hash,
+									extra->restrictlist,
+									NULL,
+									extra->redistribution_clauses,
+									hashclauses,
+									false);
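The two create_hashjoin_path() calls above deliberately race a redistribute-only variant against a broadcast-capable one; add_partial_path() keeps whichever costs less. A standalone sketch of just that control flow (the helper names are hypothetical; the second boolean stands in for the parallel_hash_enable_motion_broadcast GUC):

#include <stdbool.h>
#include <stdio.h>

static void
make_variant(bool avoid_broadcast)
{
	printf("added hash join variant: %s\n",
		   avoid_broadcast ? "redistribute only" : "broadcast allowed");
}

/* The non-broadcast variant is always tried for parallel hash; the
 * broadcast-capable one only when the GUC allows it. */
static void
try_partial_hashjoin(bool parallel_hash, bool guc_allow_broadcast)
{
	if (parallel_hash)
		make_variant(true);
	if (parallel_hash && !guc_allow_broadcast)
		return;
	make_variant(false);
}

int
main(void)
{
	try_partial_hashjoin(true, false);	/* only the redistribute variant */
	try_partial_hashjoin(true, true);	/* both variants; cost decides */
	return 0;
}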
+	/* Might be good enough to be worth trying, and there's no Motion, so let's try it. */
+	if (hashpath && hashpath->parallel_safe)
+		add_partial_path(joinrel, hashpath);
 }
 
 /*
@@ -1268,6 +1316,8 @@ sort_inner_and_outer(PlannerInfo *root,
 	if (joinrel->consider_parallel &&
 		save_jointype != JOIN_UNIQUE_OUTER &&
 		save_jointype != JOIN_FULL &&
+		save_jointype != JOIN_DEDUP_SEMI &&
+		save_jointype != JOIN_DEDUP_SEMI_REVERSE &&
 		save_jointype != JOIN_RIGHT &&
 		outerrel->partial_pathlist != NIL &&
 		bms_is_empty(joinrel->lateral_relids))
@@ -1875,6 +1925,8 @@ match_unsorted_outer(PlannerInfo *root,
 	if (joinrel->consider_parallel &&
 		save_jointype != JOIN_UNIQUE_OUTER &&
 		save_jointype != JOIN_FULL &&
+		save_jointype != JOIN_DEDUP_SEMI &&
+		save_jointype != JOIN_DEDUP_SEMI_REVERSE &&
 		save_jointype != JOIN_RIGHT &&
 		outerrel->partial_pathlist != NIL &&
 		bms_is_empty(joinrel->lateral_relids))
@@ -2246,6 +2298,9 @@ hash_inner_and_outer(PlannerInfo *root,
 		save_jointype != JOIN_UNIQUE_OUTER &&
 		save_jointype != JOIN_FULL &&
 		save_jointype != JOIN_RIGHT &&
+		save_jointype != JOIN_LASJ_NOTIN &&
+		save_jointype != JOIN_DEDUP_SEMI &&
+		save_jointype != JOIN_DEDUP_SEMI_REVERSE &&
 		outerrel->partial_pathlist != NIL &&
 		bms_is_empty(joinrel->lateral_relids))
 	{
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 3c18a774504..b26ab32a54e 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -161,6 +161,7 @@ static BitmapHeapScan *create_bitmap_scan_plan(PlannerInfo *root,
 								List *tlist, List *scan_clauses);
 static Plan *create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual,
 								   List **qual, List **indexqual, List **indexECs);
+static void bitmap_subplan_mark_shared(Plan *plan);
 static TidScan *create_tidscan_plan(PlannerInfo *root, TidPath *best_path,
 									List *tlist, List *scan_clauses);
 static TidRangeScan *create_tidrangescan_plan(PlannerInfo *root,
@@ -275,7 +276,7 @@ static HashJoin *make_hashjoin(List *tlist,
 							   List *hashoperators, List *hashcollations,
 							   List *hashkeys,
 							   Plan *lefttree, Plan *righttree,
-							   JoinType jointype, bool inner_unique);
+							   JoinType jointype, bool inner_unique, bool batch0_barrier);
 static Hash *make_hash(Plan *lefttree,
 					   List *hashkeys,
 					   Oid skewTable,
@@ -617,6 +618,10 @@ create_plan_recurse(PlannerInfo *root, Path *best_path, int flags)
 			break;
 	}
 
+	Assert(best_path->parallel_workers == best_path->locus.parallel_workers);
+	plan->locustype = best_path->locus.locustype;
+	plan->parallel = best_path->locus.parallel_workers;
+
 	return plan;
 }
@@ -3352,12 +3357,14 @@ create_motion_plan(PlannerInfo *root, CdbMotionPath *path)
 			/* cannot motion from Entry DB */
 			sendSlice->gangType = GANGTYPE_ENTRYDB_READER;
 			sendSlice->numsegments = 1;
+			sendSlice->parallel_workers = subpath->locus.parallel_workers;
 			sendSlice->segindex = -1;
 			break;
 
 		case CdbLocusType_SingleQE:
 			sendSlice->gangType = GANGTYPE_SINGLETON_READER;
 			sendSlice->numsegments = 1;
+			sendSlice->parallel_workers = subpath->locus.parallel_workers;
 			sendSlice->segindex = gp_session_id % subpath->locus.numsegments;
 			break;
 
@@ -3365,12 +3372,15 @@ create_motion_plan(PlannerInfo *root, CdbMotionPath *path)
 			/* */
 			sendSlice->gangType = GANGTYPE_SINGLETON_READER;
 			sendSlice->numsegments = 1;
+			sendSlice->parallel_workers = subpath->locus.parallel_workers;
 			sendSlice->segindex = gp_session_id % getgpsegmentCount();
 			break;
 
 		case CdbLocusType_SegmentGeneral:
+		case CdbLocusType_SegmentGeneralWorkers:
 			sendSlice->gangType = GANGTYPE_SINGLETON_READER;
 			sendSlice->numsegments = subpath->locus.numsegments;
+			sendSlice->parallel_workers = subpath->locus.parallel_workers;
 			sendSlice->segindex = gp_session_id % subpath->locus.numsegments;
 			break;
 
@@ -3378,6 +3388,13 @@ create_motion_plan(PlannerInfo *root, CdbMotionPath *path)
 			// is probably writer, set already
 			//sendSlice->gangType == GANGTYPE_PRIMARY_READER;
 			sendSlice->numsegments = subpath->locus.numsegments;
+			sendSlice->parallel_workers = subpath->locus.parallel_workers;
+			sendSlice->segindex = 0;
+			break;
+
+		case CdbLocusType_ReplicatedWorkers:
+			sendSlice->numsegments = subpath->locus.numsegments;
+			sendSlice->parallel_workers = subpath->locus.parallel_workers;
 			sendSlice->segindex = 0;
 			break;
 
@@ -3386,11 +3403,13 @@ create_motion_plan(PlannerInfo *root, CdbMotionPath *path)
 			break;
 
 		case CdbLocusType_Hashed:
+		case CdbLocusType_HashedWorkers:
 		case CdbLocusType_HashedOJ:
 		case CdbLocusType_Strewn:
 			// might be writer, set already
 			//sendSlice->gangType == GANGTYPE_PRIMARY_READER;
 			sendSlice->numsegments = subpath->locus.numsegments;
+			sendSlice->parallel_workers = subpath->locus.parallel_workers;
 			sendSlice->segindex = 0;
 			break;
 
@@ -3888,9 +3907,12 @@ create_bitmap_scan_plan(PlannerInfo *root,
 											   &indexECs);
 	/* GPDB_12_MERGE_FEATURE_NOT_SUPPORTED: the parallel StreamBitmap scan is not implemented */
 	/*
-	 * if (best_path->path.parallel_aware)
-	 *	   bitmap_subplan_mark_shared(bitmapqualplan);
+	 * FIXME:
+	 * It's still unclear whether this code will break in some cases.  We
+	 * re-enable it here to make parallel bitmap scans work.
 	 */
+	if (best_path->path.parallel_aware)
+		bitmap_subplan_mark_shared(bitmapqualplan);
 
 	/*
 	 * The qpqual list must contain all restrictions not automatically handled
@@ -5463,6 +5485,8 @@ create_mergejoin_plan(PlannerInfo *root,
 		label_sort_with_costsize(root, sort, -1.0);
 		outer_plan = (Plan *) sort;
 		outerpathkeys = best_path->outersortkeys;
+		outer_plan->locustype = outer_path->locus.locustype;
+		outer_plan->parallel = outer_path->locus.parallel_workers;
 	}
 	else
 		outerpathkeys = best_path->jpath.outerjoinpath->pathkeys;
@@ -5477,6 +5501,8 @@ create_mergejoin_plan(PlannerInfo *root,
 		label_sort_with_costsize(root, sort, -1.0);
 		inner_plan = (Plan *) sort;
 		innerpathkeys = best_path->innersortkeys;
+		inner_plan->locustype = inner_path->locus.locustype;
+		inner_plan->parallel = inner_path->locus.parallel_workers;
 	}
 	else
 		innerpathkeys = best_path->jpath.innerjoinpath->pathkeys;
@@ -5756,10 +5782,16 @@ create_hashjoin_plan(PlannerInfo *root,
 	Oid			skewTable = InvalidOid;
 	AttrNumber	skewColumn = InvalidAttrNumber;
 	bool		skewInherit = false;
-	bool		partition_selectors_created;
+	bool		partition_selectors_created = false;
 	ListCell   *lc;
 
-	push_partition_selector_candidate_for_join(root, &best_path->jpath);
+	/*
+	 * GP_PARALLEL_FIXME:
+	 * PartitionSelector is not parallel-aware, so disable it temporarily.
+	 * In the future, once partition prune info can be merged in shared
+	 * memory, PartitionSelector could work in parallel mode.
+	 */
+	if (!best_path->jpath.path.parallel_aware)
+		push_partition_selector_candidate_for_join(root, &best_path->jpath);
 
 	/*
 	 * HashJoin can project, so we don't have to demand exact tlists from the
@@ -5775,8 +5807,9 @@ create_hashjoin_plan(PlannerInfo *root,
 	 * If the outer side contained Append nodes that can do partition pruning,
 	 * inject Partition Selectors to the inner side.
 	 */
-	partition_selectors_created =
-		pop_and_inject_partition_selectors(root, &best_path->jpath);
+	if (!best_path->jpath.path.parallel_aware)
+		partition_selectors_created =
+			pop_and_inject_partition_selectors(root, &best_path->jpath);
 
 	inner_plan = create_plan_recurse(root, best_path->jpath.innerjoinpath,
 									 CP_SMALL_TLIST);
@@ -5885,6 +5918,41 @@ create_hashjoin_plan(PlannerInfo *root,
 						  skewColumn,
 						  skewInherit);
 
+	/*
+	 * GPDB parallel: EXPLAIN (locus) shows the Hash table's locus, which can
+	 * differ from the inner table's locus when parallel_aware; e.g. the
+	 * xxxWorkers loci gather to their plain xxx counterparts where possible.
+	 */
+	hash_plan->plan.locustype = inner_plan->locustype;
+	hash_plan->plan.parallel = inner_plan->parallel;
+	if (best_path->jpath.path.parallel_aware)
+	{
+		hash_plan->plan.parallel = 0;
+		switch (inner_plan->locustype)
+		{
+			case CdbLocusType_ReplicatedWorkers:
+				hash_plan->plan.locustype = CdbLocusType_Replicated;
+				break;
+			case CdbLocusType_SegmentGeneralWorkers:
+				hash_plan->plan.locustype = CdbLocusType_SegmentGeneral;
+				break;
+			case CdbLocusType_HashedWorkers:
+				hash_plan->plan.locustype = CdbLocusType_Hashed;
+				break;
+			case CdbLocusType_Hashed:
+			case CdbLocusType_Strewn:
+				/* cases kept here to set parallel = 0 */
+				break;
+			default:
+				/* restore parallel if no case matched */
+				hash_plan->plan.parallel = inner_plan->parallel;
+		}
+	}
+
+	if (best_path->jpath.path.parallel_aware &&
+		best_path->jpath.innerjoinpath->barrierHazard)
+		hash_plan->sync_barrier = true;
+
 	/*
 	 * Set Hash node's startup & total costs equal to total cost of input
 	 * plan; this only affects EXPLAIN display not decisions.
@@ -5913,7 +5981,8 @@ create_hashjoin_plan(PlannerInfo *root,
 							  outer_plan,
 							  (Plan *) hash_plan,
 							  best_path->jpath.jointype,
-							  best_path->jpath.inner_unique);
+							  best_path->jpath.inner_unique,
+							  best_path->batch0_barrier);
 
 	/*
 	 * MPP-4635.  best_path->jpath.outerjoinpath may be NULL.
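The switch above only rewrites the display locus of the Hash node. Isolated from the planner structs, the mapping looks like this (the enum is a stand-in for CdbLocusType, not the real definition):

#include <stdio.h>

typedef enum
{
	LocusHashed,
	LocusHashedWorkers,
	LocusReplicated,
	LocusReplicatedWorkers,
	LocusSegmentGeneral,
	LocusSegmentGeneralWorkers,
	LocusStrewn
} Locus;

/* A parallel-aware Hash gathers the *Workers loci to their plain forms. */
static Locus
display_locus(Locus inner)
{
	switch (inner)
	{
		case LocusReplicatedWorkers:		return LocusReplicated;
		case LocusSegmentGeneralWorkers:	return LocusSegmentGeneral;
		case LocusHashedWorkers:			return LocusHashed;
		default:							return inner;
	}
}

int
main(void)
{
	printf("%d -> %d\n", LocusHashedWorkers, display_locus(LocusHashedWorkers));
	return 0;
}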
@@ -6544,7 +6613,6 @@ label_sort_with_costsize(PlannerInfo *root, Sort *plan, double limit_tuples)
 	plan->plan.parallel_safe = lefttree->parallel_safe;
 }
 
-#if 0
 /*
  * bitmap_subplan_mark_shared
  *	 Set isshared flag in bitmap subplan so that it will be created in
@@ -6565,7 +6633,6 @@ bitmap_subplan_mark_shared(Plan *plan)
 	else
 		elog(ERROR, "unrecognized node type: %d", nodeTag(plan));
 }
-#endif
 
 /*****************************************************************************
 *
@@ -7090,7 +7157,8 @@ make_hashjoin(List *tlist,
 			  Plan *lefttree,
 			  Plan *righttree,
 			  JoinType jointype,
-			  bool inner_unique)
+			  bool inner_unique,
+			  bool batch0_barrier)
 {
 	HashJoin   *node = makeNode(HashJoin);
 	Plan	   *plan = &node->join.plan;
@@ -7106,6 +7174,7 @@ make_hashjoin(List *tlist,
 	node->join.jointype = jointype;
 	node->join.inner_unique = inner_unique;
 	node->join.joinqual = joinclauses;
+	node->batch0_barrier = batch0_barrier;
 
 	return node;
 }
@@ -7131,6 +7200,7 @@ make_hash(Plan *lefttree,
 	node->skewInherit = skewInherit;
 
 	node->rescannable = false;	/* CDB (unused for now) */
+	node->sync_barrier = false;
 
 	return node;
 }
@@ -8592,6 +8662,11 @@ cdbpathtoplan_create_motion_plan(PlannerInfo *root,
 	int			numHashSegments;
 
 	numHashSegments = CdbPathLocus_NumSegments(path->path.locus);
+	if (path->path.locus.parallel_workers > 0)
+	{
+		root->glob->parallelModeNeeded = true;
+	}
+
 	if (path->is_explicit_motion)
 	{
 		TargetEntry *segmentid_tle;
@@ -8704,6 +8779,9 @@ cdbpathtoplan_create_motion_plan(PlannerInfo *root,
 	else if (CdbPathLocus_IsReplicated(path->path.locus))
 		motion = make_broadcast_motion(subplan);
 
+	else if (CdbPathLocus_IsReplicatedWorkers(path->path.locus))
+		motion = make_parallel_broadcast_motion(subplan);
+
 	/* Hashed redistribution to all QEs in gang above... */
 	else if (CdbPathLocus_IsHashed(path->path.locus) ||
 			 CdbPathLocus_IsHashedOJ(path->path.locus))
@@ -8723,6 +8801,24 @@ cdbpathtoplan_create_motion_plan(PlannerInfo *root,
 									hashOpfamilies,
 									numHashSegments);
 	}
+	else if (CdbPathLocus_IsHashedWorkers(path->path.locus))
+	{
+		List	   *hashExprs;
+		List	   *hashOpfamilies;
+
+		cdbpathlocus_get_distkey_exprs(path->path.locus,
+									   path->path.parent->relids,
+									   subplan->targetlist,
+									   &hashExprs, &hashOpfamilies);
+		if (!hashExprs)
+			elog(ERROR, "could not find hash distribution key expressions in target list");
+
+		motion = make_hashed_motion(subplan,
+									hashExprs,
+									hashOpfamilies,
+									numHashSegments);
+	}
+
 	/* Hashed redistribution to all QEs in gang above... */
 	else if (CdbPathLocus_IsStrewn(path->path.locus))
 	{
diff --git a/src/backend/optimizer/plan/joinpartprune.c b/src/backend/optimizer/plan/joinpartprune.c
index a9ea585a54f..1446b86d83b 100644
--- a/src/backend/optimizer/plan/joinpartprune.c
+++ b/src/backend/optimizer/plan/joinpartprune.c
@@ -278,7 +278,9 @@ create_partition_selector_path(PlannerInfo *root,
 	pathnode->path.pathkeys = subpath->pathkeys;
 
 	pathnode->path.locus = subpath->locus;
+	pathnode->path.parallel_workers = subpath->parallel_workers;
 	pathnode->path.motionHazard = subpath->motionHazard;
+	pathnode->path.barrierHazard = subpath->barrierHazard;
 	pathnode->path.rescannable = subpath->rescannable;
 	pathnode->path.sameslice_relids = subpath->sameslice_relids;
 
diff --git a/src/backend/optimizer/plan/planmain.c b/src/backend/optimizer/plan/planmain.c
index 3460b50b655..4d007cae92d 100644
--- a/src/backend/optimizer/plan/planmain.c
+++ b/src/backend/optimizer/plan/planmain.c
@@ -172,7 +172,8 @@ query_planner(PlannerInfo *root,
 				CdbPathLocus_MakeEntry(&result_path->locus);
 			else if (exec_location == PROEXECLOCATION_ALL_SEGMENTS)
 				CdbPathLocus_MakeStrewn(&result_path->locus,
-										getgpsegmentCount());
+										getgpsegmentCount(),
+										0);
 		}
 		else
 			CdbPathLocus_MakeEntry(&result_path->locus);
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 523e7f40fc2..a32eacffaa8 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -89,7 +89,7 @@
 /* GUC parameters */
 double		cursor_tuple_fraction = DEFAULT_CURSOR_TUPLE_FRACTION;
 int			force_parallel_mode = FORCE_PARALLEL_OFF;
-bool		parallel_leader_participation = true;
+bool		parallel_leader_participation = false;
 
 /* Hook for plugins to get control in planner() */
 planner_hook_type planner_hook = NULL;
@@ -252,7 +252,9 @@ static RelOptInfo *create_partial_grouping_paths(PlannerInfo *root,
 												 grouping_sets_data *gd,
 												 GroupPathExtraData *extra,
 												 bool force_rel_creation);
+#if 0
 static void gather_grouping_paths(PlannerInfo *root, RelOptInfo *rel);
+#endif
 static bool can_partial_agg(PlannerInfo *root);
 static void apply_scanjoin_target_to_paths(PlannerInfo *root,
 										   RelOptInfo *rel,
@@ -450,9 +452,6 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions,
 	 * restriction, but for now it seems best not to have parallel workers
 	 * trying to create their own parallel workers.
 	 */
-	/* GPDB_96_MERGE_FIXME: disable parallel workers for now */
-	glob->parallelModeOK = false;
-#if 0
 	if ((cursorOptions & CURSOR_OPT_PARALLEL_OK) != 0 &&
 		IsUnderPostmaster &&
 		parse->commandType == CMD_SELECT &&
@@ -470,7 +469,12 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions,
 		glob->maxParallelHazard = PROPARALLEL_UNSAFE;
 		glob->parallelModeOK = false;
 	}
-#endif
+
+	/*
+	 * GPDB: honor the GUC that decides whether parallelism may be used.
+	 */
+	if (!enable_parallel)
+		glob->parallelModeOK = false;
 
 	/*
 	 * glob->parallelModeNeeded is normally set to false here and changed to
@@ -539,6 +543,22 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions,
 	/* Select best Path and turn it into a Plan */
 	final_rel = fetch_upper_rel(root, UPPERREL_FINAL, NULL);
+
+	/*
+	 * GPDB parallel:
+	 * Unlike upstream, a partial path is valid in GP without a Gather node.
+	 * We keep the two pathlists separate until the end; now is the time to
+	 * choose the best path.
+	 * GPDB_PARALLEL_FIXME:
+	 * Fold GP-specific cases (e.g. aggregation) into the partial_pathlist too.
+	 */
+	if (final_rel->partial_pathlist != NIL)
+	{
+		Path	   *cheapest_partial_path;
+
+		cheapest_partial_path = linitial(final_rel->partial_pathlist);
+		add_path(final_rel, cheapest_partial_path, root);
+		set_cheapest(final_rel);
+	}
 	best_path = get_cheapest_fractional_path(final_rel, tuple_fraction);
 
 	if (Gp_role == GP_ROLE_DISPATCH)
@@ -565,6 +585,7 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions,
 	}
 #endif
 
+#if 0
 	/*
 	 * Optionally add a Gather node for testing purposes, provided this is
 	 * actually a safe thing to do.
@@ -613,7 +634,7 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions,
 
 		top_plan = &gather->plan;
 	}
-
+#endif
 	/*
 	 * If any Params were generated, run through the plan tree and compute
 	 * each plan node's extParam/allParam sets.  Ideally we'd merge this into
@@ -2194,6 +2215,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 	 * Gather Motion, which will be added below.
 	 */
 	if (parse->limitCount && limit_needed(parse) &&
+		gp_enable_multiphase_limit &&
 		!contain_volatile_functions(parse->limitOffset) &&
 		!contain_volatile_functions(parse->limitCount))
 	{
@@ -2440,14 +2462,104 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 	 * Generate partial paths for final_rel, too, if outer query levels might
 	 * be able to make use of them.
 	 */
-	if (final_rel->consider_parallel && root->query_level > 1 &&
-		!limit_needed(parse))
+	/*
+	 * GPDB_PARALLEL_FIXME: should we keep query_level > 1 in GPDB?  Doing so
+	 * loses parallel paths, e.g. a plain parallel scan.  PG has a Gather
+	 * node, but GP delays partial paths until the Gather Motion.
+	 *
+	 * Limit parallel:
+	 * PG doesn't have to handle LIMIT here because all partial paths have
+	 * been gathered into the pathlist, and the subpath of the Limit node may
+	 * be parallel.  In GP style we have no Gather node and keep partial
+	 * paths in the partial_pathlist until the last step if possible.
+	 * When we generate a two-phase limit path, or the limit has a partial
+	 * subpath, the Limit node on the QEs can run in parallel.
+	 * Ex: select * from t1 limit 1;
+	 * Two-phase Limit, parallel Limit on QEs under the Limit on the QD:
+	 *   Limit
+	 *   -> Gather
+	 *      -> Limit
+	 *         -> Parallel Seq Scan on t1
+	 *
+	 * One-phase Limit, parallel plan on QEs under the Limit on the QD:
+	 *   Limit
+	 *   -> Gather
+	 *      -> Parallel Seq Scan on t1
+	 */
+	if (final_rel->consider_parallel/* && root->query_level > 1 && !limit_needed(parse)*/)
 	{
 		Assert(!parse->rowMarks && parse->commandType == CMD_SELECT);
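The two-phase LIMIT described above pays off because each of the S x W parallel streams can stop after at most `offset + count` rows before the Gather Motion, instead of shipping everything to the QD. A small model of the row counts involved (standalone, illustrative numbers only):

#include <stdio.h>

/* Rows crossing the Gather Motion with and without a preliminary Limit. */
int
main(void)
{
	double		table_rows = 1e6;	/* rows spread over all streams */
	int			segments = 4;
	int			workers = 3;		/* parallel workers per segment */
	long		limit = 10;			/* LIMIT 10, no OFFSET */

	long		streams = (long) segments * workers;
	double		one_phase = table_rows;					/* every row is gathered */
	double		two_phase = (double) streams * limit;	/* <= limit rows per stream */

	printf("streams: %ld\n", streams);
	printf("rows gathered, one-phase: %.0f\n", one_phase);
	printf("rows gathered, two-phase: %.0f\n", two_phase);
	return 0;
}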
+
+		/* GPDB_PARALLEL_FIXME: support parallel SCATTER BY? */
+		if (parse->scatterClause)
+		{
+			current_rel->partial_pathlist = NIL;
+			final_rel->partial_pathlist = NIL;
+		}
+
 		foreach(lc, current_rel->partial_pathlist)
 		{
 			Path	   *partial_path = (Path *) lfirst(lc);
 
+			if (CdbPathLocus_IsPartitioned(partial_path->locus) &&
+				(limit_needed(parse) || must_gather))
+			{
+				CdbPathLocus locus;
+				List	   *pathkeys;
+
+				if (parse->limitCount && limit_needed(parse) &&
+					gp_enable_multiphase_limit &&
+					!contain_volatile_functions(parse->limitOffset) &&
+					!contain_volatile_functions(parse->limitCount))
+				{
+					partial_path = (Path *) create_preliminary_limit_path(root, final_rel, partial_path,
+																		  parse->limitOffset,
+																		  parse->limitCount,
+																		  parse->limitOption,
+																		  offset_est, count_est);
+				}
+
+				pathkeys =
+					cdbpullup_truncatePathKeysForTargetList(partial_path->pathkeys,
+															make_tlist_from_pathtarget(partial_path->pathtarget));
+
+				CdbPathLocus_MakeSingleQE(&locus, getgpsegmentCount());
+				partial_path = cdbpath_create_motion_path(root, partial_path, pathkeys, false, locus);
+			}
+			else if ((CdbPathLocus_IsHashed(root->final_locus) ||
+					  CdbPathLocus_IsSingleQE(root->final_locus) ||
+					  CdbPathLocus_IsEntry(root->final_locus) ||
+					  CdbPathLocus_IsReplicated(root->final_locus)) &&
+					 !root->glob->is_parallel_cursor)
+			{
+				/*
+				 * GPDB PARALLEL
+				 * This differs slightly from how the Limit node is inserted
+				 * for the pathlist: we must gather the partial results
+				 * before the Limit on the QD.
+				 */
+				Path	   *orig_path = partial_path;
+
+				partial_path = cdbpath_create_motion_path(root, orig_path,
+														  root->sort_pathkeys,
+														  false,
+														  root->final_locus);
+				if (!partial_path)
+					partial_path = orig_path;
+			}
+
+			/*
+			 * If there is a LIMIT/OFFSET clause, add the LIMIT node.
+			 */
+			if (limit_needed(parse))
+			{
+				partial_path = (Path *) create_limit_path(root, final_rel, partial_path,
+														  parse->limitOffset,
+														  parse->limitCount,
+														  parse->limitOption,
+														  offset_est, count_est);
+			}
+
 			add_partial_path(final_rel, partial_path);
 		}
 	}
@@ -4209,16 +4321,15 @@ create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel,
 		return;
 	}
 
+#if 0
 	/* Gather any partially grouped partial paths. */
 	if (partially_grouped_rel && partially_grouped_rel->partial_pathlist)
+	{
 		gather_grouping_paths(root, partially_grouped_rel);
-
-	/*
-	 * The non-partial paths can come either from the Gather above or from
-	 * aggregate push-down.
-	 */
-	if (partially_grouped_rel && partially_grouped_rel->pathlist)
-		set_cheapest(partially_grouped_rel);
+		if (partially_grouped_rel->pathlist)
+			set_cheapest(partially_grouped_rel);
+	}
+#endif
 
 	/*
 	 * Estimate number of groups.
@@ -4226,7 +4337,11 @@ create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel,
 	double		num_total_input_rows;
 
 	if (CdbPathLocus_IsPartitioned(cheapest_path->locus))
+	{
 		num_total_input_rows = cheapest_path->rows * CdbPathLocus_NumSegments(cheapest_path->locus);
+		if (cheapest_path->locus.parallel_workers > 1)
+			num_total_input_rows *= cheapest_path->locus.parallel_workers;
+	}
 	else
 		num_total_input_rows = cheapest_path->rows;
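The num_total_input_rows adjustment above and the dNumGroups hunks that follow all apply the same scaling: a cluster-wide estimate is divided by the number of segments and, when parallel, by the workers per segment. A standalone sketch of that arithmetic (hypothetical helper name; the floor of 1 mirrors clamp_row_est):

#include <stdio.h>

/* Groups expected in one worker of one segment, never rounded below 1. */
static double
groups_per_stream(double total_groups, int segments, int workers)
{
	double		d = total_groups / segments;

	if (workers > 1)
		d /= workers;
	return d < 1.0 ? 1.0 : d;
}

int
main(void)
{
	/* 100000 groups, 4 segments, 3 workers per segment -> ~8333 per stream. */
	printf("%.0f\n", groups_per_stream(100000.0, 4, 3));
	return 0;
}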
@@ -4316,8 +4431,12 @@ consider_groupingsets_paths(PlannerInfo *root,
 	 * is only a fraction of the total.
 	 */
 	if (CdbPathLocus_IsPartitioned(path->locus))
+	{
 		dNumGroups = clamp_row_est(dNumGroupsTotal /
 								   CdbPathLocus_NumSegments(path->locus));
+		if (path->locus.parallel_workers > 1)
+			dNumGroups /= path->locus.parallel_workers;
+	}
 	else
 		dNumGroups = dNumGroupsTotal;
 
@@ -4353,6 +4472,23 @@ consider_groupingsets_paths(PlannerInfo *root,
 										  parse->groupClause,
 										  srd->new_rollups);
 
+	// GPDB_12_MERGE_FIXME: fix computation of dNumGroups
+#if 0
+	/*
+	 * dNumGroupsTotal is the total number of groups across all segments. If the
+	 * Aggregate is distributed, then the number of groups in one segment
+	 * is only a fraction of the total.
+	 */
+	if (CdbPathLocus_IsPartitioned(path->locus))
+	{
+		dNumGroups = clamp_row_est(dNumGroupsTotal /
+								   CdbPathLocus_NumSegments(path->locus));
+		if (path->locus.parallel_workers > 1)
+			dNumGroups /= path->locus.parallel_workers;
+	}
+	else
+		dNumGroups = dNumGroupsTotal;
+#endif
 
 		add_path(grouped_rel, (Path *)
 				 create_groupingsets_path(root,
@@ -4391,8 +4527,15 @@ consider_groupingsets_paths(PlannerInfo *root,
 	 * is only a fraction of the total.
 	 */
 	if (CdbPathLocus_IsPartitioned(path->locus))
-		dNumGroups = clamp_row_est(dNumGroupsTotal /
+	{
+		if (path->locus.parallel_workers > 1)
+			dNumGroups = clamp_row_est(dNumGroupsTotal /
+									   path->locus.parallel_workers /
+									   CdbPathLocus_NumSegments(path->locus));
+		else
+			dNumGroups = clamp_row_est(dNumGroupsTotal /
 								   CdbPathLocus_NumSegments(path->locus));
+	}
 	else
 		dNumGroups = dNumGroupsTotal;
@@ -4816,7 +4959,11 @@ create_distinct_paths(PlannerInfo *root,
 	distinct_rel->exec_location = input_rel->exec_location;
 
 	if (CdbPathLocus_IsPartitioned(cheapest_input_path->locus))
+	{
 		numInputRowsTotal = cheapest_input_path->rows * CdbPathLocus_NumSegments(cheapest_input_path->locus);
+		if (cheapest_input_path->locus.parallel_workers > 1)
+			numInputRowsTotal *= cheapest_input_path->locus.parallel_workers;
+	}
 	else
 		numInputRowsTotal = cheapest_input_path->rows;
@@ -4892,7 +5039,12 @@ create_distinct_paths(PlannerInfo *root,
 		/* On how many segments will the distinct result reside? */
 		if (CdbPathLocus_IsPartitioned(path->locus))
+		{
+			/* GPDB_PARALLEL_FIXME: should we consider parallelism in distinct paths? */
 			numDistinctRows = numDistinctRowsTotal / CdbPathLocus_NumSegments(path->locus);
+			if (path->locus.parallel_workers > 1)
+				numDistinctRows /= path->locus.parallel_workers;
+		}
 		else
 			numDistinctRows = numDistinctRowsTotal;
@@ -4931,7 +5083,12 @@ create_distinct_paths(PlannerInfo *root,
 												NIL);
 
 				if (CdbPathLocus_IsPartitioned(path->locus))
+				{
+					/* GPDB_PARALLEL_FIXME: should we consider parallelism in distinct paths? */
 					numDistinctRows = numDistinctRowsTotal / CdbPathLocus_NumSegments(path->locus);
+					if (path->locus.parallel_workers > 1)
+						numDistinctRows /= path->locus.parallel_workers;
+				}
 				else
 					numDistinctRows = numDistinctRowsTotal;
@@ -4975,7 +5132,12 @@ create_distinct_paths(PlannerInfo *root,
 											NIL);
 
 			if (CdbPathLocus_IsPartitioned(path->locus))
+			{
+				/* GPDB_PARALLEL_FIXME: should we consider parallelism in distinct paths? */
 				numDistinctRows = clamp_row_est(numDistinctRowsTotal / CdbPathLocus_NumSegments(path->locus));
+				if (path->locus.parallel_workers > 1)
+					numDistinctRows /= path->locus.parallel_workers;
+			}
 			else
 				numDistinctRows = numDistinctRowsTotal;
@@ -5178,7 +5340,7 @@ create_ordered_paths(PlannerInfo *root,
 		Path	   *cheapest_partial_path;
 
 		cheapest_partial_path = linitial(input_rel->partial_pathlist);
-
+		Path	   *sorted_path = cheapest_partial_path;
 		/*
 		 * If cheapest partial path doesn't need a sort, this is redundant
 		 * with what's already been tried.
@@ -5187,14 +5349,16 @@ create_ordered_paths(PlannerInfo *root,
 								  cheapest_partial_path->pathkeys))
 		{
 			Path	   *path;
+#if 0
 			double		total_groups;
+#endif
 
 			path = (Path *) create_sort_path(root,
 											 ordered_rel,
 											 cheapest_partial_path,
 											 root->sort_pathkeys,
 											 limit_tuples);
-
+#if 0
 			total_groups = cheapest_partial_path->rows *
 				cheapest_partial_path->parallel_workers;
 			path = (Path *)
@@ -5204,12 +5368,22 @@ create_ordered_paths(PlannerInfo *root,
 								 root->sort_pathkeys, NULL,
 								 &total_groups);
+#endif
 
 			/* Add projection step if needed */
 			if (path->pathtarget != target)
 				path = apply_projection_to_path(root, ordered_rel,
 												path, target);
 
-			add_path(ordered_rel, path, root);
+			add_partial_path(ordered_rel, path);
+		}
+		else
+		{
+			/* Use the input path as is, but add a projection step if needed */
+			if (sorted_path->pathtarget != target)
+				sorted_path = apply_projection_to_path(root, ordered_rel,
+													   sorted_path, target);
+
+			add_partial_path(ordered_rel, sorted_path);
 		}
 
 		/*
@@ -5229,7 +5403,9 @@ create_ordered_paths(PlannerInfo *root,
 			Path	   *sorted_path;
 			bool		is_sorted;
 			int			presorted_keys;
+#if 0
 			double		total_groups;
+#endif
 
 			/*
 			 * We don't care if this is the cheapest partial path - we
@@ -5256,6 +5432,7 @@ create_ordered_paths(PlannerInfo *root,
 												 root->sort_pathkeys,
 												 presorted_keys,
 												 limit_tuples);
+#if 0
 			total_groups = input_path->rows *
 				input_path->parallel_workers;
 			sorted_path = (Path *)
@@ -5264,13 +5441,13 @@ create_ordered_paths(PlannerInfo *root,
 								 sorted_path->pathtarget,
 								 root->sort_pathkeys, NULL,
 								 &total_groups);
-
+#endif
 			/* Add projection step if needed */
 			if (sorted_path->pathtarget != target)
 				sorted_path = apply_projection_to_path(root, ordered_rel,
 													   sorted_path, target);
 
-			add_path(ordered_rel, sorted_path, root);
+			add_partial_path(ordered_rel, sorted_path);
 		}
 	}
 }
@@ -5308,7 +5485,7 @@ create_scatter_path(PlannerInfo *root, List *scatterClause, Path *path)
 	/* Deal with the special case of SCATTER RANDOMLY */
 	if (list_length(scatterClause) == 1 && linitial(scatterClause) == NULL)
 	{
-		CdbPathLocus_MakeStrewn(&locus, getgpsegmentCount());
+		CdbPathLocus_MakeStrewn(&locus, getgpsegmentCount(), path->parallel_workers);
 	}
 	else
 	{
@@ -5331,7 +5508,7 @@ create_scatter_path(PlannerInfo *root, List *scatterClause, Path *path)
 		locus = cdbpathlocus_from_exprs(root, path->parent,
 										scatterClause,
-										opfamilies, sortrefs, getgpsegmentCount());
+										opfamilies, sortrefs, getgpsegmentCount(), path->locus.parallel_workers);
 	}
 
 	/*
@@ -6760,7 +6937,7 @@ plan_create_index_workers(Oid tableOid, Oid indexOid)
 	 * Determine number of workers to scan the heap relation using generic
 	 * model
 	 */
-	parallel_workers = compute_parallel_worker(rel, heap_blocks, -1,
+	parallel_workers = compute_parallel_worker(root, rel, heap_blocks, -1,
 											   max_parallel_maintenance_workers);
 
 	/*
@@ -6846,8 +7023,15 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 	 * is only a fraction of the total.
 	 */
 	if (CdbPathLocus_IsPartitioned(path->locus))
-		dNumGroups = clamp_row_est(dNumGroupsTotal /
-								   CdbPathLocus_NumSegments(path->locus));
+	{
+		if (path->locus.parallel_workers > 1)
+			dNumGroups = clamp_row_est(dNumGroupsTotal /
+									   path->locus.parallel_workers /
+									   CdbPathLocus_NumSegments(path->locus));
+		else
+			dNumGroups = clamp_row_est(dNumGroupsTotal /
+									   CdbPathLocus_NumSegments(path->locus));
+	}
 	else
 		dNumGroups = dNumGroupsTotal;
@@ -6953,8 +7137,15 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 	 * is only a fraction of the total.
 	 */
 	if (CdbPathLocus_IsPartitioned(path->locus))
-		dNumGroups = clamp_row_est(dNumGroupsTotal /
-								   CdbPathLocus_NumSegments(path->locus));
+	{
+		if (path->locus.parallel_workers > 1)
+			dNumGroups = clamp_row_est(dNumGroupsTotal /
+									   path->locus.parallel_workers /
+									   CdbPathLocus_NumSegments(path->locus));
+		else
+			dNumGroups = clamp_row_est(dNumGroupsTotal /
+									   CdbPathLocus_NumSegments(path->locus));
+	}
 	else
 		dNumGroups = dNumGroupsTotal;
 
@@ -7009,7 +7200,78 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 				Assert(false);
 			}
 		}
+		if (grouped_rel->consider_parallel)
+		{
+			foreach(lc, input_rel->partial_pathlist)
+			{
+				Path	   *path = (Path *) lfirst(lc);
+				bool		is_sorted;
+				int			presorted_keys;
+				double		dNumGroups;
+
+				if (!CdbPathLocus_IsPartitioned(path->locus))
+					continue;
+
+				is_sorted = pathkeys_count_contained_in(root->group_pathkeys,
+														path->pathkeys,
+														&presorted_keys);
+
+				path = cdb_prepare_path_for_sorted_agg(root,
+													   is_sorted,
+													   0, /* presorted_keys */
+													   grouped_rel,
+													   path,
+													   path->pathtarget,
+													   root->group_pathkeys,
+													   -1.0,
+													   parse->groupClause,
+													   gd ? gd->rollups : NIL);
+
+				/*
+				 * dNumGroupsTotal is the total number of groups across all segments. If the
+				 * Aggregate is distributed, then the number of groups in one segment
+				 * is only a fraction of the total.
+				 */
+				if (CdbPathLocus_IsPartitioned(path->locus))
+				{
+					if (path->locus.parallel_workers > 1)
+						dNumGroups = clamp_row_est(dNumGroupsTotal /
+												   path->locus.parallel_workers /
+												   CdbPathLocus_NumSegments(path->locus));
+					else
+						dNumGroups = clamp_row_est(dNumGroupsTotal /
												   CdbPathLocus_NumSegments(path->locus));
+				}
+				else
+					dNumGroups = dNumGroupsTotal;
+
+				/* Now decide what to stick atop it */
+				if (parse->groupingSets)
+				{
+					/* do nothing; grouping sets are not supported in parallel yet */
+				}
+				else if (parse->hasAggs || parse->groupClause)
+				{
+					/*
+					 * We have aggregation, possibly with plain GROUP BY. Make an
+					 * AggPath.
+					 */
+					add_partial_path(grouped_rel, (Path *)
+									 create_agg_path(root,
+													 grouped_rel,
+													 path,
+													 grouped_rel->reltarget,
+													 parse->groupClause ? AGG_SORTED : AGG_PLAIN,
+													 AGGSPLIT_SIMPLE,
+													 false, /* streaming */
+													 parse->groupClause,
+													 havingQual,
+													 agg_costs,
+													 dNumGroups));
+				}
+			}
+		}
 
 		/*
 		 * Instead of operating directly on the input relation, we can
 		 * consider finalizing a partially aggregated path.
@@ -7054,8 +7316,15 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 	 * is only a fraction of the total.
 	 */
 	if (CdbPathLocus_IsPartitioned(path->locus))
-		dNumGroups = clamp_row_est(dNumGroupsTotal /
-								   CdbPathLocus_NumSegments(path->locus));
+	{
+		if (path->locus.parallel_workers > 1)
+			dNumGroups = clamp_row_est(dNumGroupsTotal /
+									   path->locus.parallel_workers /
+									   CdbPathLocus_NumSegments(path->locus));
+		else
+			dNumGroups = clamp_row_est(dNumGroupsTotal /
+									   CdbPathLocus_NumSegments(path->locus));
+	}
 	else
 		dNumGroups = dNumGroupsTotal;
 
@@ -7194,8 +7463,15 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 	 * is only a fraction of the total.
 	 */
 	if (CdbPathLocus_IsPartitioned(path->locus))
-		dNumGroups = clamp_row_est(dNumGroupsTotal /
-								   CdbPathLocus_NumSegments(path->locus));
+	{
+		if (path->locus.parallel_workers > 1)
+			dNumGroups = clamp_row_est(dNumGroupsTotal /
+									   path->locus.parallel_workers /
+									   CdbPathLocus_NumSegments(path->locus));
+		else
+			dNumGroups = clamp_row_est(dNumGroupsTotal /
+									   CdbPathLocus_NumSegments(path->locus));
+	}
 	else
 		dNumGroups = dNumGroupsTotal;
 
@@ -7228,6 +7504,55 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 											dNumGroups),
 					 root);
 		}
+		if (input_rel->partial_pathlist && grouped_rel->consider_parallel)
+		{
+			Path	   *path = linitial(input_rel->partial_pathlist);
+			double		dNumGroups;
+
+			path = cdb_prepare_path_for_hashed_agg(root,
+												   path,
+												   path->pathtarget,
+												   parse->groupClause,
+												   NIL);
+
+			/*
+			 * dNumGroupsTotal is the total number of groups across all segments. If the
+			 * Aggregate is distributed, then the number of groups in one segment
+			 * is only a fraction of the total.
+			 */
+			if (CdbPathLocus_IsPartitioned(path->locus))
+			{
+				if (path->locus.parallel_workers > 1)
+					dNumGroups = clamp_row_est(dNumGroupsTotal /
+											   path->locus.parallel_workers /
+											   CdbPathLocus_NumSegments(path->locus));
+				else
+					dNumGroups = clamp_row_est(dNumGroupsTotal /
											   CdbPathLocus_NumSegments(path->locus));
+
+				hashaggtablesize = estimate_hashagg_tablesize(root, path,
+															  agg_costs,
+															  dNumGroups);
+
+				if (enable_hashagg_disk ||
+					hashaggtablesize < work_mem * 1024L)
+				{
+					add_partial_path(grouped_rel, (Path *)
+									 create_agg_path(root,
													 grouped_rel,
													 path,
													 grouped_rel->reltarget,
													 AGG_HASHED,
													 AGGSPLIT_SIMPLE,
													 false,
													 parse->groupClause,
													 havingQual,
													 agg_costs,
													 dNumGroups));
+				}
+			}
+		}
 	}
 
 	/*
@@ -7251,8 +7576,15 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 	 * is only a fraction of the total.
 	 */
 	if (CdbPathLocus_IsPartitioned(path->locus))
-		dNumGroups = clamp_row_est(dNumGroupsTotal /
-								   CdbPathLocus_NumSegments(path->locus));
+	{
+		if (path->locus.parallel_workers > 1)
+			dNumGroups = clamp_row_est(dNumGroupsTotal /
+									   path->locus.parallel_workers /
+									   CdbPathLocus_NumSegments(path->locus));
+		else
+			dNumGroups = clamp_row_est(dNumGroupsTotal /
+									   CdbPathLocus_NumSegments(path->locus));
+	}
 	else
 		dNumGroups = dNumGroupsTotal;
 
@@ -7286,8 +7618,10 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 	 * consider a path for grouped_rel consisting of a Parallel Append of
 	 * non-partial paths from each child.
 	 */
+#if 0
 	if (grouped_rel->partial_pathlist != NIL)
 		gather_grouping_paths(root, grouped_rel);
+#endif
 
 	/*
 	 * Add GPDB two-and three-stage agg plans
@@ -7396,7 +7730,8 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel,
 									 &extra->agg_final_costs,
 									 gd ? gd->rollups : NIL,
 									 new_rollups,
-									 strat);
+									 strat,
+									 partially_grouped_rel ? partially_grouped_rel->partial_pathlist : NIL);
 	}
 }
 
@@ -7863,6 +8198,7 @@ create_partial_grouping_paths(PlannerInfo *root,
 	return partially_grouped_rel;
 }
 
+#if 0
 /*
 * Generate Gather and Gather Merge paths for a grouping relation or partial
 * grouping relation.
@@ -7879,6 +8215,7 @@ create_partial_grouping_paths(PlannerInfo *root,
 static void
 gather_grouping_paths(PlannerInfo *root, RelOptInfo *rel)
 {
+	Assert(false);
 	ListCell   *lc;
 	Path	   *cheapest_partial_path;
 
@@ -7957,6 +8294,7 @@ gather_grouping_paths(PlannerInfo *root, RelOptInfo *rel)
 			add_path(rel, path, root);
 		}
 	}
}
+#endif
 
 /*
 * can_partial_agg
@@ -8060,7 +8398,12 @@ apply_scanjoin_target_to_paths(PlannerInfo *root,
 		 * paths by doing it after the final scan/join target has been
 		 * applied.
 		 */
-		generate_useful_gather_paths(root, rel, false);
+		if (rel->upperrestrictinfo)
+			rel->consider_parallel = is_parallel_safe(root, (Node *) rel->upperrestrictinfo);
+#if 0
+		if (rel->consider_parallel)
+			generate_useful_gather_paths(root, rel, false);
+#endif
 
 		/* Can't use parallel query above this level. */
 		rel->partial_pathlist = NIL;
@@ -8213,8 +8556,10 @@ apply_scanjoin_target_to_paths(PlannerInfo *root,
 	 * this after all paths have been generated and before set_cheapest, since
 	 * one of the generated paths may turn out to be the cheapest one.
 	 */
+#if 0
 	if (rel->consider_parallel && !IS_OTHER_REL(rel))
 		generate_useful_gather_paths(root, rel, false);
+#endif
 
 	/*
 	 * Reassess which paths are the cheapest, now that we've potentially added
diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c
index bd82b6bed4c..c80e61c1170 100644
--- a/src/backend/optimizer/plan/setrefs.c
+++ b/src/backend/optimizer/plan/setrefs.c
@@ -3483,13 +3483,21 @@ fix_upper_expr_mutator(Node *node, fix_upper_expr_context *context)
 	if (IsA(node, Var))
 	{
 		Var		   *var = (Var *) node;
-
-		newvar = search_indexed_tlist_for_var(var,
-											  context->subplan_itlist,
-											  context->newvarno,
-											  context->rtoffset);
-		if (!newvar)
-			elog(ERROR, "variable not found in subplan target list");
+		if (context->subplan_itlist->has_non_vars)
+		{
+			newvar = search_indexed_tlist_for_non_var((Expr *) node,
+													  context->subplan_itlist,
+													  context->newvarno);
+		}
+		else
+		{
+			newvar = search_indexed_tlist_for_var(var,
+												  context->subplan_itlist,
+												  context->newvarno,
+												  context->rtoffset);
+			if (!newvar)
+				elog(ERROR, "variable not found in subplan target list");
+		}
 		return (Node *) newvar;
 	}
 	if (IsA(node, PlaceHolderVar))
diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c
index 3d319139345..3304cd9d5b3 100644
--- a/src/backend/optimizer/plan/subselect.c
+++ b/src/backend/optimizer/plan/subselect.c
@@ -332,15 +332,22 @@ make_subplan(PlannerInfo *root, Query *orig_subquery,
 	}
 
 	if (Gp_role == GP_ROLE_DISPATCH)
+	{
 		config->is_under_subplan = true;
 
-	if (Gp_role == GP_ROLE_DISPATCH)
-	{
-		config->gp_cte_sharing = IsSubqueryCorrelated(subquery) ||
-			!(subLinkType == ROWCOMPARE_SUBLINK ||
-			  subLinkType == ARRAY_SUBLINK ||
-			  subLinkType == EXPR_SUBLINK ||
-			  subLinkType == EXISTS_SUBLINK);
+		/*
+		 * Disable CTE sharing in subplans.
+		 *
+		 * fixup_subplans() copies a duplicate subplan (a subplan with the
+		 * same plan_id), but doesn't copy the subroot.  If CTE sharing were
+		 * enabled here, the lengths of the subplans and subroots lists would
+		 * no longer match, and apply_shareinput_xslice() cannot correct that
+		 * when a shared scan is inside a subplan; an assertion failure (or
+		 * PANIC) would then occur in init_tuplestore_state().
+		 *
+		 * See github issue: https://github.com/greenplum-db/gpdb/issues/12701
+		 */
+		config->gp_cte_sharing = false;
 	}
 
 	/*
 	 * Strictly speaking, the order of rows in a subquery doesn't matter.
@@ -377,6 +384,16 @@ make_subplan(PlannerInfo *root, Query *orig_subquery,
 	 * seems no reason to postpone doing that.
 	 */
 	final_rel = fetch_upper_rel(subroot, UPPERREL_FINAL, NULL);
+
+	/*
+	 * CBDB parallel: add the cheapest partial path to the final_rel.
+	 */
+	if (final_rel->partial_pathlist != NIL)
+	{
+		Path	   *cheapest_partial_path;
+
+		cheapest_partial_path = linitial(final_rel->partial_pathlist);
+		add_path(final_rel, cheapest_partial_path, root);
+		set_cheapest(final_rel);
+	}
 	best_path = get_cheapest_fractional_path(final_rel, tuple_fraction);
 
 	/*
@@ -2554,12 +2571,18 @@ SS_charge_for_initplans(PlannerInfo *root, RelOptInfo *final_rel)
 	}
 
 	/*
-	 * Forget about any partial paths and clear consider_parallel, too;
-	 * they're not usable if we attached an initPlan.
+	 * Now adjust the costs of the partial paths, too.
 	 */
-	final_rel->partial_pathlist = NIL;
-	final_rel->consider_parallel = false;
+	if (final_rel->partial_pathlist)
+	{
+		foreach(lc, final_rel->partial_pathlist)
+		{
+			Path	   *path = (Path *) lfirst(lc);
+
+			path->startup_cost += initplan_cost;
+			path->total_cost += initplan_cost;
+		}
+	}
 
 	/* We needn't do set_cheapest() here, caller will do it */
 }
@@ -2701,10 +2724,10 @@ finalize_plan(PlannerInfo *root, Plan *plan,
 	 */
 	if (plan->parallel_aware)
 	{
-		if (gather_param < 0)
-			elog(ERROR, "parallel-aware plan node is not below a Gather");
-		context.paramids =
-			bms_add_member(context.paramids, gather_param);
+		/* make GPDB-style parallelism work */
+		if (gather_param >= 0)
+			context.paramids =
+				bms_add_member(context.paramids, gather_param);
 	}
 
 	/* Check additional node-type-specific fields */
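This is the same promotion standard_planner() now performs for its final_rel: before the overall best path is chosen, the cheapest surviving partial path competes directly with the ordinary pathlist. Reduced to plain C (hypothetical structs and names, not the planner's):

#include <stdio.h>

typedef struct Path
{
	const char *name;
	double		total_cost;
} Path;

/* Pick the cheaper of the best ordinary path and the best partial path,
 * mirroring the promotion done for final_rel in the hunks above. */
static const Path *
choose_best(const Path *pathlist_best, const Path *partial_best)
{
	if (partial_best && (!pathlist_best ||
						 partial_best->total_cost < pathlist_best->total_cost))
		return partial_best;
	return pathlist_best;
}

int
main(void)
{
	Path		serial = { "serial seqscan", 1000.0 };
	Path		partial = { "parallel seqscan", 400.0 };

	printf("chosen: %s\n", choose_best(&serial, &partial)->name);
	return 0;
}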
diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c
index d63da034583..468d2ba8810 100644
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -498,7 +498,7 @@ generate_recursion_path(SetOperationStmt *setOp, PlannerInfo *root,
 	 * merge, and things seem to be working with this much simpler thing, but
 	 * I'm not sure if the logic is 100% correct now.
 	 */
-	if (CdbPathLocus_IsSegmentGeneral(lpath->locus))
+	if (CdbPathLocus_IsSegmentGeneral(lpath->locus) || CdbPathLocus_IsSegmentGeneralWorkers(lpath->locus))
 	{
 		CdbPathLocus gather_locus;
@@ -590,7 +590,7 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
 	ListCell   *lc;
 	List	   *pathlist = NIL;
 	List	   *partial_pathlist = NIL;
-	bool		partial_paths_valid = true;
+	bool		partial_paths_valid = false;	/* GPDB_PARALLEL_FIXME: temporarily disable partial paths */
 	bool		consider_parallel = true;
 	List	   *rellist;
 	List	   *tlist_list;
@@ -749,9 +749,12 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
 									NIL, NULL,
 									parallel_workers, enable_parallel_append,
 									-1);
+		/* GPDB_PARALLEL_FIXME: we disable the PG-style Gather/GatherMerge nodes */
+#if 0
 		ppath = (Path *)
 			create_gather_path(root, result_rel, ppath,
 							   result_rel->reltarget, NULL, NULL);
+#endif
 		if (!op->all)
 			ppath = make_union_unique(op, ppath, tlist, root);
 		add_path(result_rel, ppath, root);
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index 037932351b7..90e2f8044ee 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -38,6 +38,7 @@
 #include "utils/selfuncs.h"
 
 #include "optimizer/tlist.h"
+#include "catalog/pg_am.h"
 #include "catalog/pg_operator.h"
 #include "catalog/pg_proc.h"
 #include "cdb/cdbhash.h"		/* cdb_default_distribution_opfamily_for_type() */
@@ -74,7 +75,7 @@ static List *reparameterize_pathlist_by_child(PlannerInfo *root,
 											  RelOptInfo *child_rel);
 static void set_append_path_locus(PlannerInfo *root, Path *pathnode, RelOptInfo *rel,
-								  List *pathkeys);
+								  List *pathkeys, int parallel_workers, bool parallel_aware);
 static CdbPathLocus adjust_modifytable_subpath(PlannerInfo *root, CmdType operation,
 											   int resultRelationRTI, Path **pSubpath,
@@ -1031,11 +1032,12 @@ create_seqscan_path(PlannerInfo *root, RelOptInfo *rel,
 											  required_outer);
 	pathnode->parallel_aware = parallel_workers > 0 ? true : false;
 	pathnode->parallel_safe = rel->consider_parallel;
-	pathnode->parallel_workers = parallel_workers;
 	pathnode->pathkeys = NIL;	/* seqscan has unordered result */
 
-	pathnode->locus = cdbpathlocus_from_baserel(root, rel);
+	pathnode->locus = cdbpathlocus_from_baserel(root, rel, parallel_workers);
+	pathnode->parallel_workers = pathnode->locus.parallel_workers;
 	pathnode->motionHazard = false;
+	pathnode->barrierHazard = false;
 	pathnode->rescannable = true;
 	pathnode->sameslice_relids = rel->relids;
 
@@ -1063,8 +1065,9 @@ create_samplescan_path(PlannerInfo *root, RelOptInfo *rel, Relids required_outer
 	pathnode->parallel_workers = 0;
 	pathnode->pathkeys = NIL;	/* samplescan has unordered result */
 
-	pathnode->locus = cdbpathlocus_from_baserel(root, rel);
+	pathnode->locus = cdbpathlocus_from_baserel(root, rel, 0);
 	pathnode->motionHazard = false;
+	pathnode->barrierHazard = false;
 	pathnode->rescannable = true;
 	pathnode->sameslice_relids = rel->relids;
 
@@ -1118,7 +1121,8 @@ create_index_path(PlannerInfo *root,
 	pathnode->path.param_info = get_baserel_parampathinfo(root, rel,
 														   required_outer);
 	pathnode->path.parallel_aware = false;
-	pathnode->path.parallel_safe = rel->consider_parallel;
+	/* GPDB_12_MERGE_FEATURE_NOT_SUPPORTED: the parallel StreamBitmap scan is not implemented */
+	pathnode->path.parallel_safe = rel->consider_parallel && (index->relam != BITMAP_AM_OID);
 	pathnode->path.parallel_workers = 0;
 	pathnode->path.pathkeys = pathkeys;
 
@@ -1129,13 +1133,15 @@ create_index_path(PlannerInfo *root,
 	pathnode->indexscandir = indexscandir;
 
 	/* Distribution is same as the base table. */
-	pathnode->path.locus = cdbpathlocus_from_baserel(root, rel);
 	pathnode->path.motionHazard = false;
+	pathnode->path.barrierHazard = false;
 	pathnode->path.rescannable = true;
 	pathnode->path.sameslice_relids = rel->relids;
 
 	cost_index(pathnode, root, loop_count, partial_path);
 
+	pathnode->path.locus = cdbpathlocus_from_baserel(root, rel, partial_path ? pathnode->path.parallel_workers : 0);
+
 	return pathnode;
 }
@@ -1168,12 +1174,13 @@ create_bitmap_heap_path(PlannerInfo *root,
 														   required_outer);
 	pathnode->path.parallel_aware = parallel_degree > 0 ? true : false;
 	pathnode->path.parallel_safe = rel->consider_parallel;
-	pathnode->path.parallel_workers = parallel_degree;
 	pathnode->path.pathkeys = NIL;	/* always unordered */
 
 	/* Distribution is same as the base table. */
-	pathnode->path.locus = cdbpathlocus_from_baserel(root, rel);
+	pathnode->path.locus = cdbpathlocus_from_baserel(root, rel, parallel_degree);
+	pathnode->path.parallel_workers = pathnode->path.locus.parallel_workers;
 	pathnode->path.motionHazard = false;
+	pathnode->path.barrierHazard = false;
 	pathnode->path.rescannable = true;
 	pathnode->path.sameslice_relids = rel->relids;
 
@@ -1198,6 +1205,7 @@ create_bitmap_and_path(PlannerInfo *root,
 	BitmapAndPath *pathnode = makeNode(BitmapAndPath);
 	Relids		required_outer = NULL;
 	ListCell   *lc;
+	bool		parallel_safe = true;
 
 	pathnode->path.pathtype = T_BitmapAnd;
 	pathnode->path.parent = rel;
@@ -1214,6 +1222,9 @@ create_bitmap_and_path(PlannerInfo *root,
 		required_outer = bms_add_members(required_outer,
 										 PATH_REQ_OUTER(bitmapqual));
+
+		if (!bitmapqual->parallel_safe)
+			parallel_safe = false;
+
 	}
 	pathnode->path.param_info = get_baserel_parampathinfo(root, rel,
 														  required_outer);
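The `parallel_safe` bookkeeping added to create_bitmap_and_path() here (and to create_bitmap_or_path() below) is a plain AND-fold over the children: one unsafe bitmap qual makes the whole Bitmap And/Or path unsafe. The same idea, standalone (illustrative types only):

#include <stdbool.h>
#include <stdio.h>

/* A BitmapAnd/BitmapOr path is parallel safe only if every child is. */
static bool
all_parallel_safe(const bool *children, int n)
{
	bool		safe = true;

	for (int i = 0; i < n; i++)
		if (!children[i])
			safe = false;
	return safe;
}

int
main(void)
{
	bool		kids[] = { true, false, true };

	printf("parallel safe: %s\n", all_parallel_safe(kids, 3) ? "yes" : "no");
	return 0;
}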
*/ pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel; + pathnode->path.parallel_safe = rel->consider_parallel && parallel_safe; pathnode->path.parallel_workers = 0; pathnode->path.pathkeys = NIL; /* always unordered */ @@ -1250,6 +1261,7 @@ create_bitmap_or_path(PlannerInfo *root, BitmapOrPath *pathnode = makeNode(BitmapOrPath); Relids required_outer = NULL; ListCell *lc; + bool parallel_safe = true; pathnode->path.pathtype = T_BitmapOr; pathnode->path.parent = rel; @@ -1266,6 +1278,9 @@ create_bitmap_or_path(PlannerInfo *root, required_outer = bms_add_members(required_outer, PATH_REQ_OUTER(bitmapqual)); + if (!bitmapqual->parallel_safe) + parallel_safe = false; + } pathnode->path.param_info = get_baserel_parampathinfo(root, rel, required_outer); @@ -1277,7 +1292,7 @@ create_bitmap_or_path(PlannerInfo *root, * without actually iterating over the list of children. */ pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = rel->consider_parallel; + pathnode->path.parallel_safe = rel->consider_parallel && parallel_safe; pathnode->path.parallel_workers = 0; pathnode->path.pathkeys = NIL; /* always unordered */ @@ -1317,8 +1332,9 @@ create_tidscan_path(PlannerInfo *root, RelOptInfo *rel, List *tidquals, pathnode->tidquals = tidquals; /* Distribution is same as the base table. */ - pathnode->path.locus = cdbpathlocus_from_baserel(root, rel); + pathnode->path.locus = cdbpathlocus_from_baserel(root, rel, 0); pathnode->path.motionHazard = false; + pathnode->path.barrierHazard = false; pathnode->path.rescannable = true; pathnode->path.sameslice_relids = rel->relids; @@ -1352,8 +1368,9 @@ create_tidrangescan_path(PlannerInfo *root, RelOptInfo *rel, pathnode->tidrangequals = tidrangequals; /* Distribution is same as the base table. */ - pathnode->path.locus = cdbpathlocus_from_baserel(root, rel); + pathnode->path.locus = cdbpathlocus_from_baserel(root, rel, 0); pathnode->path.motionHazard = false; + pathnode->path.barrierHazard = false; pathnode->path.rescannable = true; pathnode->path.sameslice_relids = rel->relids; @@ -1382,7 +1399,15 @@ create_append_path(PlannerInfo *root, AppendPath *pathnode = makeNode(AppendPath); ListCell *l; + /* + * GPDB_PARALLEL_FIXME: this assertion still cannot be re-enabled after we + * deal with Append, because we currently allow paths with zero + * parallel_workers to be added to partial_path. + */ +#if 0 Assert(!parallel_aware || parallel_workers > 0); +#endif + pathnode->path.pathtype = T_Append; pathnode->path.parent = rel; @@ -1411,6 +1436,7 @@ create_append_path(PlannerInfo *root, pathnode->path.pathkeys = pathkeys; pathnode->path.motionHazard = false; + pathnode->path.barrierHazard = false; pathnode->path.rescannable = true; /* @@ -1446,7 +1472,7 @@ create_append_path(PlannerInfo *root, else pathnode->limit_tuples = -1.0; - set_append_path_locus(root, (Path *) pathnode, rel, NIL); + set_append_path_locus(root, (Path *)pathnode, rel, NIL, parallel_workers, parallel_aware); foreach(l, pathnode->subpaths) { @@ -1457,9 +1483,18 @@ create_append_path(PlannerInfo *root, /* All child paths must have same parameterization */ Assert(bms_equal(PATH_REQ_OUTER(subpath), required_outer)); + + if (subpath->barrierHazard) + pathnode->path.barrierHazard = true; } + /* + * set_append_path_locus() may add Motion nodes to pathnode->subpaths, + * in which case the Append path is not parallel safe; the assert is + * removed here.
+ */ +#if 0 Assert(!parallel_aware || pathnode->path.parallel_safe); +#endif /* * If there's exactly one child path, the Append is a no-op and will be @@ -1572,7 +1607,7 @@ create_merge_append_path(PlannerInfo *root, * Add Motions to the child nodes as needed, and determine the locus * of the MergeAppend itself. */ - set_append_path_locus(root, (Path *) pathnode, rel, pathkeys); + set_append_path_locus(root, (Path *) pathnode, rel, pathkeys, 0, false); /* * Add up the sizes and costs of the input paths. @@ -1643,7 +1678,7 @@ create_merge_append_path(PlannerInfo *root, */ static void set_append_path_locus(PlannerInfo *root, Path *pathnode, RelOptInfo *rel, - List *pathkeys) + List *pathkeys, int parallel_workers, bool parallel_aware) { ListCell *l; CdbLocusType targetlocustype; @@ -1748,6 +1783,36 @@ set_append_path_locus(PlannerInfo *root, Path *pathnode, RelOptInfo *rel, { CdbLocusType_General, CdbLocusType_General, CdbLocusType_General }, }; + + static const struct + { + CdbLocusType a; + CdbLocusType b; + CdbLocusType result; + } parallel_append_locus_compatibility_table[] = + { + /* + * Cases for CdbLocusType_SegmentGeneralWorkers. + * If it's a mix of partitioned and GeneralWorkers, we still consider the + * result as partitioned, but the general part will be restricted to + * produce rows on only a single QE. + */ + { CdbLocusType_SegmentGeneralWorkers, CdbLocusType_SegmentGeneralWorkers, CdbLocusType_SegmentGeneralWorkers }, + { CdbLocusType_SegmentGeneralWorkers, CdbLocusType_SegmentGeneral, CdbLocusType_SegmentGeneralWorkers }, + { CdbLocusType_SegmentGeneralWorkers, CdbLocusType_General, CdbLocusType_SegmentGeneralWorkers }, + { CdbLocusType_SegmentGeneralWorkers, CdbLocusType_Strewn, CdbLocusType_Strewn }, + + /* + * GPDB_PARALLEL_FIXME: the following three loci do not consider + * parallelism for now. We might need to revisit this in the future. + */ + { CdbLocusType_SegmentGeneralWorkers, CdbLocusType_OuterQuery, CdbLocusType_OuterQuery}, + { CdbLocusType_SegmentGeneralWorkers, CdbLocusType_Entry, CdbLocusType_Entry}, + { CdbLocusType_SegmentGeneralWorkers, CdbLocusType_SingleQE, CdbLocusType_SingleQE}, + + /* GPDB_PARALLEL_FIXME: is there any chance that a replicated-workers locus exists in an Append subpath?
*/ + }; + targetlocustype = CdbLocusType_General; foreach(l, subpaths) { @@ -1781,6 +1846,19 @@ set_append_path_locus(PlannerInfo *root, Path *pathnode, RelOptInfo *rel, break; } } + + for (i = 0; i < lengthof(parallel_append_locus_compatibility_table); i++) + { + if ((parallel_append_locus_compatibility_table[i].a == targetlocustype && + parallel_append_locus_compatibility_table[i].b == subtype) || + (parallel_append_locus_compatibility_table[i].a == subtype && + parallel_append_locus_compatibility_table[i].b == targetlocustype)) + { + targetlocustype = parallel_append_locus_compatibility_table[i].result; + break; + } + } + if (i == lengthof(append_locus_compatibility_table)) elog(ERROR, "could not determine target locus for Append"); } @@ -1806,7 +1884,8 @@ set_append_path_locus(PlannerInfo *root, Path *pathnode, RelOptInfo *rel, } else if (targetlocustype == CdbLocusType_SingleQE || targetlocustype == CdbLocusType_Replicated || - targetlocustype == CdbLocusType_SegmentGeneral) + targetlocustype == CdbLocusType_SegmentGeneral || + targetlocustype == CdbLocusType_SegmentGeneralWorkers) { /* By default put Append node on all the segments */ numsegments = getgpsegmentCount(); @@ -1826,7 +1905,11 @@ set_append_path_locus(PlannerInfo *root, Path *pathnode, RelOptInfo *rel, CdbPathLocus_NumSegments(subpath->locus)); } } - CdbPathLocus_MakeSimple(&targetlocus, targetlocustype, numsegments); + if (targetlocustype == CdbLocusType_SegmentGeneralWorkers || + (targetlocustype == CdbLocusType_SegmentGeneral && parallel_workers > 1)) + CdbPathLocus_MakeSegmentGeneralWorkers(&targetlocus, numsegments, parallel_workers); + else + CdbPathLocus_MakeSimple(&targetlocus, targetlocustype, numsegments); } else if (targetlocustype == CdbLocusType_Strewn) { @@ -1841,10 +1924,11 @@ set_append_path_locus(PlannerInfo *root, Path *pathnode, RelOptInfo *rel, CdbPathLocus projectedlocus; if (CdbPathLocus_IsGeneral(subpath->locus) || - CdbPathLocus_IsSegmentGeneral(subpath->locus)) + CdbPathLocus_IsSegmentGeneral(subpath->locus) || + CdbPathLocus_IsSegmentGeneralWorkers(subpath->locus)) { /* Afterwards, General/SegmentGeneral will be projected as Strewn */ - CdbPathLocus_MakeStrewn(&projectedlocus, numsegments); + CdbPathLocus_MakeStrewn(&projectedlocus, numsegments, pathnode->parallel_workers); } else { @@ -1854,17 +1938,20 @@ set_append_path_locus(PlannerInfo *root, Path *pathnode, RelOptInfo *rel, /* Transform subpath locus into the appendrel's space for comparison. 
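 * For example, a child hashed by (child.a) whose distribution key is
 * exposed as (parent.a) in the appendrel's target list pulls up to
 * Hashed(parent.a); a HashedWorkers child keeps its workers only when
 * parallel_aware, per the checks below.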
*/ if (subpath->parent->reloptkind == RELOPT_OTHER_MEMBER_REL && subpath->parent != rel && - (CdbPathLocus_IsHashed(subpath->locus) || CdbPathLocus_IsHashedOJ(subpath->locus))) + (CdbPathLocus_IsHashed(subpath->locus) || CdbPathLocus_IsHashedOJ(subpath->locus) || + (CdbPathLocus_IsHashedWorkers(subpath->locus) && parallel_aware))) { CdbPathLocus l; l = cdbpathlocus_pull_above_projection(root, - subpath->locus, - subpath->parent->relids, - subpath->parent->reltarget->exprs, - rel->reltarget->exprs, - rel->relid); - if (CdbPathLocus_IsHashed(l) || CdbPathLocus_IsHashedOJ(l)) + subpath->locus, + subpath->parent->relids, + subpath->parent->reltarget->exprs, + rel->reltarget->exprs, + rel->relid, + parallel_aware); + if (CdbPathLocus_IsHashed(l) || CdbPathLocus_IsHashedOJ(l) || + (CdbPathLocus_IsHashedWorkers(l) && parallel_aware)) projectedlocus = l; } } @@ -1887,8 +1974,52 @@ set_append_path_locus(PlannerInfo *root, Path *pathnode, RelOptInfo *rel, } else if (cdbpathlocus_equal(targetlocus, projectedlocus)) { + /* compatible */ } + else if (cdbpath_distkey_equal(targetlocus.distkey, projectedlocus.distkey) && parallel_aware) + { + if (CdbPathLocus_IsHashedWorkers(targetlocus) && + CdbPathLocus_IsHashedWorkers(projectedlocus)) + { + /* projectedlocus compatible with targetlocus */ + if (CdbPathLocus_NumParallelWorkers(targetlocus) < CdbPathLocus_NumParallelWorkers(projectedlocus)) + targetlocus = projectedlocus; + } + else if (CdbPathLocus_IsHashedWorkers(targetlocus) && + CdbPathLocus_IsHashed(projectedlocus)) + { + /* projectedlocus compatible with targetlocus */ + if (CdbPathLocus_NumParallelWorkers(targetlocus) < CdbPathLocus_NumParallelWorkers(projectedlocus)) + targetlocus = projectedlocus; + } + else if (CdbPathLocus_IsHashed(targetlocus) && + CdbPathLocus_IsHashed(projectedlocus)) + { + /* projectedlocus compatible with targetlocus */ + if (CdbPathLocus_NumParallelWorkers(targetlocus) < CdbPathLocus_NumParallelWorkers(projectedlocus)) + targetlocus = projectedlocus; + } + else if (CdbPathLocus_IsHashed(targetlocus) && + CdbPathLocus_IsHashedWorkers(projectedlocus)) + { + /* targetlocus compatible with projectedlocus */ + targetlocus = projectedlocus; + } + else + { + /* + * subpaths have different distribution policies; mark the result + * as randomly distributed and set numsegments to the maximum over + * all subpaths so that no tuples are missed. + * + * max_numsegments is computed in the first deduction loop; even + * though we use projectedlocus here, the numsegments never change. + */ + CdbPathLocus_MakeStrewn(&targetlocus, max_numsegments, pathnode->parallel_workers); + break; + } + } else { /* @@ -1899,7 +2030,7 @@ set_append_path_locus(PlannerInfo *root, Path *pathnode, RelOptInfo *rel, * max_numsegments is computed in the first deduction loop, * even here we use projectedlocus, the numsegments never change.
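 * For example, Hashed(a) mixed with HashedWorkers(a, workers=3) is
 * handled by the branches above and resolves to HashedWorkers(a, 3),
 * while Hashed(a) mixed with Hashed(b) has a different distkey and
 * falls through to this Strewn fallback.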
*/ - CdbPathLocus_MakeStrewn(&targetlocus, max_numsegments); + CdbPathLocus_MakeStrewn(&targetlocus, max_numsegments, pathnode->parallel_workers); break; } } @@ -1916,10 +2047,11 @@ set_append_path_locus(PlannerInfo *root, Path *pathnode, RelOptInfo *rel, if (CdbPathLocus_IsPartitioned(targetlocus)) { if (CdbPathLocus_IsGeneral(subpath->locus) || - CdbPathLocus_IsSegmentGeneral(subpath->locus)) + CdbPathLocus_IsSegmentGeneral(subpath->locus) || + CdbPathLocus_IsSegmentGeneralWorkers(subpath->locus)) { /* - * If a General/SegmentGeneral is mixed with other Strewn's, + * If a General/SegmentGeneral/SegmentGeneralWorkers is mixed with other Strewn's, * add a projection path with cdb_restrict_clauses, so that only * a single QE will actually produce rows. */ @@ -1957,14 +2089,19 @@ set_append_path_locus(PlannerInfo *root, Path *pathnode, RelOptInfo *rel, ((ProjectionPath *) subpath)->direct_dispath_contentIds = list_make1_int(gp_session_id % numsegments); CdbPathLocus_MakeStrewn(&(subpath->locus), - numsegments); + numsegments, + subpath->parallel_workers); } /* we already determined that all the loci are compatible */ Assert(CdbPathLocus_IsPartitioned(subpath->locus)); } - else + else if (!CdbPathLocus_IsSegmentGeneralWorkers(targetlocus)) { + /* + * A SegmentGeneralWorkers target locus can only occur when every + * subpath's locus is general, so no Motion needs to be added there. + * For all other cases, we add Motions for the subpaths. + */ if (pathkeys_contained_in(pathkeys, subpath->pathkeys)) subpath = cdbpath_create_motion_path(root, subpath, pathkeys, false, targetlocus); else @@ -1976,12 +2113,46 @@ set_append_path_locus(PlannerInfo *root, Path *pathnode, RelOptInfo *rel, if (subpath->motionHazard) pathnode->motionHazard = true; + if (subpath->barrierHazard) + pathnode->barrierHazard = true; + if (!subpath->rescannable) pathnode->rescannable = false; new_subpaths = lappend(new_subpaths, subpath); } + + if (parallel_aware && + (CdbPathLocus_IsHashed(targetlocus) || + CdbPathLocus_IsHashedOJ(targetlocus) || + CdbPathLocus_IsHashedWorkers(targetlocus)) && + parallel_workers > 0) + { + /* + * Reset targetlocus to HashedWorkers anyway if parallel_workers > 0, + * because Hashed can have parallel_workers > 0 for now; to be fixed + * later. + */ + targetlocus.locustype = CdbLocusType_HashedWorkers; + targetlocus.parallel_workers = Max(parallel_workers, CdbPathLocus_NumParallelWorkers(targetlocus)); + } + pathnode->locus = targetlocus; + /* + * GPDB_PARALLEL_FIXME: + * Workaround for assertions in create_plan; otherwise we would get a + * wrong plan, e.g. a General locus with parallel_workers > 1. + * Reconsider this after the Append locus handling is fixed. + */ + /* + * The Append issue is partially fixed, but several loci still cannot + * be parallel, so we cannot handle them yet.
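+ * Specifically, Entry, OuterQuery, General and SingleQE loci may keep
+ * parallel_workers <= 1 even when the Append was planned with
+ * parallel_workers > 1; the AssertImply below encodes exactly those
+ * exemptions.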
+ */ + AssertImply(parallel_workers > 1 && + !CdbPathLocus_IsEntry(targetlocus) && + !CdbPathLocus_IsOuterQuery(targetlocus) && + !CdbPathLocus_IsGeneral(targetlocus) && + !CdbPathLocus_IsSingleQE(targetlocus), targetlocus.parallel_workers > 1); + pathnode->parallel_workers = targetlocus.parallel_workers; *subpaths_out = new_subpaths; } @@ -2036,6 +2207,7 @@ create_group_result_path(PlannerInfo *root, RelOptInfo *rel, /* Result can be on any segments */ CdbPathLocus_MakeGeneral(&pathnode->path.locus); pathnode->path.motionHazard = false; + pathnode->path.barrierHazard = false; pathnode->path.rescannable = true; return pathnode; @@ -2065,6 +2237,7 @@ create_material_path(RelOptInfo *rel, Path *subpath) pathnode->path.locus = subpath->locus; pathnode->path.motionHazard = subpath->motionHazard; + pathnode->path.barrierHazard = subpath->barrierHazard; pathnode->cdb_strict = false; pathnode->path.rescannable = true; /* Independent of sub-path */ pathnode->path.sameslice_relids = subpath->sameslice_relids; @@ -2204,7 +2377,7 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, locus = cdbpathlocus_from_exprs(root, subpath->parent, - sjinfo->semi_rhs_exprs, opfamilies, sortrefs, numsegments); + sjinfo->semi_rhs_exprs, opfamilies, sortrefs, numsegments, subpath->parallel_workers); subpath = cdbpath_create_motion_path(root, subpath, NIL, false, locus); /* * We probably add agg/sort node above the added motion node, but it is @@ -2419,6 +2592,7 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, { /* hybrid hash agg is not rescannable, and may present a motion hazard */ pathnode->path.motionHazard = subpath->motionHazard; + pathnode->path.barrierHazard = subpath->barrierHazard; pathnode->path.rescannable = false; } else @@ -2428,6 +2602,7 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, * existing ordering; but Unique sort is never optimized away at present.) */ pathnode->path.motionHazard = subpath->motionHazard; + pathnode->path.barrierHazard = subpath->barrierHazard; /* Same reasoning applies to rescannability. If no actual sort is placed * in the plan, then rescannable is set correctly to the subpath value. @@ -2550,7 +2725,8 @@ create_unique_rowid_path(PlannerInfo *root, list_make1(rowidexpr), list_make1_oid(cdb_default_distribution_opfamily_for_type(INT8OID)), list_make1_int(0), - numsegments); + numsegments, + subpath->parallel_workers); subpath = cdbpath_create_motion_path(root, subpath, NIL, false, locus); if (!subpath) return NULL; @@ -2566,7 +2742,7 @@ create_unique_rowid_path(PlannerInfo *root, * Unique will care about the row id expression, so it's OK to forget * that the rows are currently hashed by the row id. */ - CdbPathLocus_MakeStrewn(&locus, numsegments); + CdbPathLocus_MakeStrewn(&locus, numsegments, subpath->parallel_workers); } else { @@ -2687,6 +2863,7 @@ create_unique_rowid_path(PlannerInfo *root, { /* hybrid hash agg is not rescannable, and may present a motion hazard */ pathnode->path.motionHazard = subpath->motionHazard; + pathnode->path.barrierHazard = subpath->barrierHazard; pathnode->path.rescannable = false; } else @@ -2696,6 +2873,7 @@ create_unique_rowid_path(PlannerInfo *root, * existing ordering; but Unique sort is never optimized away at present.) */ pathnode->path.motionHazard = subpath->motionHazard; + pathnode->path.barrierHazard = subpath->barrierHazard; /* Same reasoning applies to rescannability. If no actual sort is placed * in the plan, then rescannable is set correctly to the subpath value.
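/*
 * Illustration (not part of the patch): the hunks above apply one pattern
 * over and over -- a single-input path node inherits both hazard flags
 * from its subpath.  A hypothetical helper capturing that pattern could
 * look like this:
 *
 *     static inline void
 *     inherit_hazards(Path *path, Path *subpath)
 *     {
 *         path->motionHazard = subpath->motionHazard;   // Motion deadlock hazard
 *         path->barrierHazard = subpath->barrierHazard; // parallel-hash barrier hazard
 *     }
 */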
@@ -2719,6 +2897,7 @@ create_gather_merge_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, PathTarget *target, List *pathkeys, Relids required_outer, double *rows) { + Assert(false); GatherMergePath *pathnode = makeNode(GatherMergePath); Cost input_startup_cost = 0; Cost input_total_cost = 0; @@ -2809,6 +2988,7 @@ GatherPath * create_gather_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, PathTarget *target, Relids required_outer, double *rows) { + Assert(false); GatherPath *pathnode = makeNode(GatherPath); Assert(subpath->parallel_safe); @@ -2836,9 +3016,6 @@ create_gather_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, cost_gather(pathnode, root, rel, pathnode->path.param_info, rows); - /* GPDB_96_MERGE_FIXME: how do data distribution locus and parallelism work together? */ - pathnode->path.locus = subpath->locus; - return pathnode; } @@ -2867,6 +3044,7 @@ create_subqueryscan_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, pathnode->path.locus = locus; pathnode->path.motionHazard = subpath->motionHazard; + pathnode->path.barrierHazard = subpath->barrierHazard; pathnode->path.rescannable = false; pathnode->path.sameslice_relids = NULL; @@ -3033,7 +3211,8 @@ create_functionscan_path(PlannerInfo *root, RelOptInfo *rel, if (contain_outer_params) elog(ERROR, "cannot execute EXECUTE ON ALL SEGMENTS function in a subquery with arguments from outer query"); CdbPathLocus_MakeStrewn(&pathnode->locus, - getgpsegmentCount()); + getgpsegmentCount(), + 0); break; default: elog(ERROR, "unrecognized proexeclocation '%c'", exec_location); @@ -3043,6 +3222,7 @@ create_functionscan_path(PlannerInfo *root, RelOptInfo *rel, CdbPathLocus_MakeEntry(&pathnode->locus); pathnode->motionHazard = false; + pathnode->barrierHazard = false; /* * FunctionScan is always rescannable. It uses a tuplestore to @@ -3090,6 +3270,7 @@ create_tablefunction_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, pathnode->subpath = subpath; pathnode->path.motionHazard = true; /* better safe than sorry */ + pathnode->path.barrierHazard = true; /* better safe than sorry */ pathnode->path.rescannable = false; /* better safe than sorry */ /* @@ -3105,7 +3286,7 @@ create_tablefunction_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, /* Mark the output as random if the input is partitioned */ if (CdbPathLocus_IsPartitioned(pathnode->path.locus)) CdbPathLocus_MakeStrewn(&pathnode->path.locus, - CdbPathLocus_NumSegments(pathnode->path.locus)); + CdbPathLocus_NumSegments(pathnode->path.locus), 0); pathnode->path.sameslice_relids = NULL; cost_tablefunction(pathnode, root, rel, pathnode->path.param_info); @@ -3178,6 +3359,7 @@ create_valuesscan_path(PlannerInfo *root, RelOptInfo *rel, } pathnode->motionHazard = false; + pathnode->barrierHazard = false; pathnode->rescannable = true; pathnode->sameslice_relids = NULL; @@ -3221,6 +3403,7 @@ create_ctescan_path(PlannerInfo *root, RelOptInfo *rel, * shared cte */ pathnode->motionHazard = true; + pathnode->barrierHazard = true; pathnode->rescannable = false; pathnode->sameslice_relids = NULL; @@ -3237,6 +3420,8 @@ create_ctescan_path(PlannerInfo *root, RelOptInfo *rel, pathnode->rows = clamp_row_est(rel->rows / numsegments); pathnode->startup_cost = subpath->startup_cost; pathnode->total_cost = subpath->total_cost; + /* GPDB_PARALLEL_FIXME: Is it correct to set parallel workers here? 
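+ * (We simply inherit the subpath's value below; a shared CTE scan is
+ * expected to run alongside its producer, so this looks consistent, but
+ * it has not been verified.)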
*/ + pathnode->parallel_workers = subpath->parallel_workers; ctepath->subpath = subpath; } @@ -3309,13 +3494,19 @@ create_resultscan_path(PlannerInfo *root, RelOptInfo *rel, char exec_location; exec_location = check_execute_on_functions((Node *) rel->reltarget->exprs); - /* - * A function with EXECUTE ON { COORDINATOR | ALL SEGMENTS } attribute - * must be a set-returning function, a subquery has set-returning - * functions in tlist can't be pulled up as RTE_RESULT relation. - */ - Assert(exec_location == PROEXECLOCATION_ANY); - CdbPathLocus_MakeGeneral(&pathnode->locus); + if (exec_location == PROEXECLOCATION_COORDINATOR) + CdbPathLocus_MakeEntry(&pathnode->locus); + else if (exec_location == PROEXECLOCATION_ALL_SEGMENTS) + { + /* GPDB_PARALLEL_FIXME: I'm not sure if this makes sense. This + * would return multiple rows, one for each segment, but usually + * a "SELECT func()" is expected to return just one row. + */ + CdbPathLocus_MakeStrewn(&pathnode->locus, + getgpsegmentCount(), 0); + } + else + CdbPathLocus_MakeGeneral(&pathnode->locus); } cost_resultscan(pathnode, root, rel, pathnode->param_info); @@ -3357,7 +3548,7 @@ create_worktablescan_path(PlannerInfo *root, RelOptInfo *rel, "segmentgeneral locus."); break; default: - CdbPathLocus_MakeStrewn(&ctelocus, numsegments); + CdbPathLocus_MakeStrewn(&ctelocus, numsegments, 0); break; } @@ -3373,6 +3564,7 @@ create_worktablescan_path(PlannerInfo *root, RelOptInfo *rel, pathnode->locus = ctelocus; pathnode->motionHazard = false; + pathnode->barrierHazard = false; pathnode->rescannable = true; pathnode->sameslice_relids = rel->relids; @@ -3454,7 +3646,7 @@ create_foreignscan_path(PlannerInfo *root, RelOptInfo *rel, CdbPathLocus_MakeGeneral(&(pathnode->path.locus)); break; case FTEXECLOCATION_ALL_SEGMENTS: - CdbPathLocus_MakeStrewn(&(pathnode->path.locus), rel->num_segments); + CdbPathLocus_MakeStrewn(&(pathnode->path.locus), rel->num_segments, 0); break; case FTEXECLOCATION_COORDINATOR: CdbPathLocus_MakeEntry(&(pathnode->path.locus)); @@ -3519,7 +3711,7 @@ create_foreign_join_path(PlannerInfo *root, RelOptInfo *rel, CdbPathLocus_MakeGeneral(&(pathnode->path.locus)); break; case FTEXECLOCATION_ALL_SEGMENTS: - CdbPathLocus_MakeStrewn(&(pathnode->path.locus), rel->num_segments); + CdbPathLocus_MakeStrewn(&(pathnode->path.locus), rel->num_segments, 0); break; case FTEXECLOCATION_COORDINATOR: CdbPathLocus_MakeEntry(&(pathnode->path.locus)); @@ -3579,7 +3771,7 @@ create_foreign_upper_path(PlannerInfo *root, RelOptInfo *rel, CdbPathLocus_MakeGeneral(&(pathnode->path.locus)); break; case FTEXECLOCATION_ALL_SEGMENTS: - CdbPathLocus_MakeStrewn(&(pathnode->path.locus), getgpsegmentCount()); + CdbPathLocus_MakeStrewn(&(pathnode->path.locus), getgpsegmentCount(), 0); break; case FTEXECLOCATION_COORDINATOR: CdbPathLocus_MakeEntry(&(pathnode->path.locus)); @@ -3687,19 +3879,40 @@ create_nestloop_path(PlannerInfo *root, Relids inner_req_outer = PATH_REQ_OUTER(inner_path); bool inner_must_be_local = !bms_is_empty(inner_req_outer); int rowidexpr_id; + bool isParallel = (outer_path->locus.parallel_workers > 1 || inner_path->locus.parallel_workers > 1); + + if (!isParallel) + { + /* Add motion nodes above subpaths and decide where to join. 
*/ + join_locus = cdbpath_motion_for_join(root, + orig_jointype, + &outer_path, /* INOUT */ + &inner_path, /* INOUT */ + &rowidexpr_id, /* OUT */ + redistribution_clauses, + restrict_clauses, + pathkeys, + NIL, + outer_must_be_local, + inner_must_be_local); + } + else + { + join_locus = cdbpath_motion_for_parallel_join(root, + orig_jointype, + &outer_path, /* INOUT */ + &inner_path, /* INOUT */ + &rowidexpr_id, /* OUT */ + redistribution_clauses, + restrict_clauses, + pathkeys, + NIL, + outer_must_be_local, + inner_must_be_local, + false, + false); + } - /* Add motion nodes above subpaths and decide where to join. */ - join_locus = cdbpath_motion_for_join(root, - orig_jointype, - &outer_path, /* INOUT */ - &inner_path, /* INOUT */ - &rowidexpr_id, /* OUT */ - redistribution_clauses, - restrict_clauses, - pathkeys, - NIL, - outer_must_be_local, - inner_must_be_local); if (CdbPathLocus_IsNull(join_locus)) return NULL; @@ -3744,7 +3957,7 @@ create_nestloop_path(PlannerInfo *root, if (inner_path->motionHazard && outer_path->motionHazard) { matinner->cdb_strict = true; - matinner->path.motionHazard = false; + matinner->path.barrierHazard = false; } inner_path = (Path *) matinner; @@ -3792,8 +4005,12 @@ create_nestloop_path(PlannerInfo *root, pathnode->path.parallel_aware = false; pathnode->path.parallel_safe = joinrel->consider_parallel && outer_path->parallel_safe && inner_path->parallel_safe; +#if 0 /* This is a foolish way to estimate parallel_workers, but for now... */ pathnode->path.parallel_workers = outer_path->parallel_workers; +#endif + /* GPDB parallel, use join locus parallel_workers as we may add Motion path above inner and outer */ + pathnode->path.parallel_workers = join_locus.parallel_workers; pathnode->path.pathkeys = pathkeys; pathnode->jointype = jointype; pathnode->inner_unique = extra->inner_unique; @@ -3803,6 +4020,7 @@ create_nestloop_path(PlannerInfo *root, pathnode->path.locus = join_locus; pathnode->path.motionHazard = outer_path->motionHazard || inner_path->motionHazard; + pathnode->path.barrierHazard = outer_path->barrierHazard || inner_path->barrierHazard; /* we're only as rescannable as our child plans */ pathnode->path.rescannable = outer_path->rescannable && inner_path->rescannable; @@ -3942,17 +4160,39 @@ create_mergejoin_path(PlannerInfo *root, preserve_outer_ordering = preserve_outer_ordering || !bms_is_empty(PATH_REQ_OUTER(outer_path)); preserve_inner_ordering = preserve_inner_ordering || !bms_is_empty(PATH_REQ_OUTER(inner_path)); - join_locus = cdbpath_motion_for_join(root, - orig_jointype, - &outer_path, /* INOUT */ - &inner_path, /* INOUT */ - &rowidexpr_id, - redistribution_clauses, - restrict_clauses, - outermotionkeys, - innermotionkeys, - preserve_outer_ordering, - preserve_inner_ordering); + bool isParallel = (outer_path->locus.parallel_workers > 1 || inner_path->locus.parallel_workers > 1); + + if (!isParallel) + { + join_locus = cdbpath_motion_for_join(root, + orig_jointype, + &outer_path, /* INOUT */ + &inner_path, /* INOUT */ + &rowidexpr_id, + redistribution_clauses, + restrict_clauses, + outermotionkeys, + innermotionkeys, + preserve_outer_ordering, + preserve_inner_ordering); + } + else + { + join_locus = cdbpath_motion_for_parallel_join(root, + orig_jointype, + &outer_path, /* INOUT */ + &inner_path, /* INOUT */ + &rowidexpr_id, + redistribution_clauses, + restrict_clauses, + outermotionkeys, + innermotionkeys, + preserve_outer_ordering, + preserve_inner_ordering, + false, + false); + } + if (CdbPathLocus_IsNull(join_locus)) return NULL; @@ 
-3981,13 +4221,18 @@ create_mergejoin_path(PlannerInfo *root, pathnode->jpath.path.parallel_aware = false; pathnode->jpath.path.parallel_safe = joinrel->consider_parallel && outer_path->parallel_safe && inner_path->parallel_safe; +#if 0 /* This is a foolish way to estimate parallel_workers, but for now... */ pathnode->jpath.path.parallel_workers = outer_path->parallel_workers; +#endif + /* GPDB parallel, use join locus parallel_workers as we may add Motion path above inner and outer */ + pathnode->jpath.path.parallel_workers = join_locus.parallel_workers; pathnode->jpath.path.pathkeys = pathkeys; pathnode->jpath.path.locus = join_locus; pathnode->jpath.path.motionHazard = outer_path->motionHazard || inner_path->motionHazard; + pathnode->jpath.path.barrierHazard = outer_path->barrierHazard || inner_path->barrierHazard; pathnode->jpath.path.rescannable = outer_path->rescannable && inner_path->rescannable; pathnode->jpath.path.sameslice_relids = bms_union(inner_path->sameslice_relids, outer_path->sameslice_relids); @@ -4076,7 +4321,7 @@ Path * create_hashjoin_path(PlannerInfo *root, RelOptInfo *joinrel, JoinType jointype, - JoinType orig_jointype, /* CDB */ + JoinType orig_jointype, /* CDB */ JoinCostWorkspace *workspace, JoinPathExtraData *extra, Path *outer_path, @@ -4084,8 +4329,9 @@ create_hashjoin_path(PlannerInfo *root, bool parallel_hash, List *restrict_clauses, Relids required_outer, - List *redistribution_clauses, /* CDB */ - List *hashclauses) + List *redistribution_clauses, /* CDB */ + List *hashclauses, + bool uninterested_broadcast) /* GPDB parallel */ { HashPath *pathnode; CdbPathLocus join_locus; @@ -4093,18 +4339,47 @@ create_hashjoin_path(PlannerInfo *root, bool inner_must_be_local = !bms_is_empty(PATH_REQ_OUTER(inner_path)); int rowidexpr_id; - /* Add motion nodes above subpaths and decide where to join. */ - join_locus = cdbpath_motion_for_join(root, - orig_jointype, - &outer_path, /* INOUT */ - &inner_path, /* INOUT */ - &rowidexpr_id, - redistribution_clauses, - restrict_clauses, - NIL, /* don't care about ordering */ - NIL, - outer_must_be_local, - inner_must_be_local); + /* + * GPDB_PARALLEL_FIXME: + * We still get outer_path with parallel_workers=0 when parallel_aware is + * true, as we try more partial hash join paths than upstream. + * Are they reasonable? Better to remove them until we have a clear answer. + */ + bool isParallel = (outer_path->locus.parallel_workers > 1 || inner_path->locus.parallel_workers > 1); + + if (!isParallel) + { + /* Add motion nodes above subpaths and decide where to join.
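+ * This branch handles only workers <= 1 on both sides; e.g. with outer
+ * locus parallel_workers = 4, isParallel is true above and
+ * cdbpath_motion_for_parallel_join() below decides the join locus and
+ * any Motions instead.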
*/ + join_locus = cdbpath_motion_for_join(root, + orig_jointype, + &outer_path, /* INOUT */ + &inner_path, /* INOUT */ + &rowidexpr_id, + redistribution_clauses, + restrict_clauses, + NIL, /* don't care about ordering */ + NIL, + outer_must_be_local, + inner_must_be_local); + } + else + { + /* Parallel join logic */ + join_locus = cdbpath_motion_for_parallel_join(root, + orig_jointype, + &outer_path, /* INOUT */ + &inner_path, /* INOUT */ + &rowidexpr_id, + redistribution_clauses, + restrict_clauses, + NIL, /* don't care about ordering */ + NIL, + outer_must_be_local, + inner_must_be_local, + parallel_hash, + uninterested_broadcast); + } + if (CdbPathLocus_IsNull(join_locus)) return NULL; @@ -4118,13 +4393,13 @@ create_hashjoin_path(PlannerInfo *root, */ if (jointype == JOIN_INNER && gp_enable_hashjoin_size_heuristic) { - double outersize; - double innersize; + double outersize; + double innersize; outersize = ExecHashRowSize(outer_path->parent->reltarget->width) * - outer_path->rows; + outer_path->rows; innersize = ExecHashRowSize(inner_path->parent->reltarget->width) * - inner_path->rows; + inner_path->rows; if (innersize > outersize) return NULL; @@ -4153,8 +4428,12 @@ create_hashjoin_path(PlannerInfo *root, joinrel->consider_parallel && parallel_hash; pathnode->jpath.path.parallel_safe = joinrel->consider_parallel && outer_path->parallel_safe && inner_path->parallel_safe; +#if 0 /* This is a foolish way to estimate parallel_workers, but for now... */ pathnode->jpath.path.parallel_workers = outer_path->parallel_workers; +#endif + /* GPDB parallel, use join locus parallel_workers as we may add Motion path above inner and outer */ + pathnode->jpath.path.parallel_workers = join_locus.parallel_workers; /* * A hashjoin never has pathkeys, since its output ordering is @@ -4175,6 +4454,7 @@ create_hashjoin_path(PlannerInfo *root, pathnode->jpath.outerjoinpath = outer_path; pathnode->jpath.innerjoinpath = inner_path; pathnode->jpath.joinrestrictinfo = restrict_clauses; + pathnode->path_hashclauses = hashclauses; /* final_cost_hashjoin will fill in pathnode->num_batches */ @@ -4187,10 +4467,30 @@ create_hashjoin_path(PlannerInfo *root, pathnode->jpath.path.rescannable = outer_path->rescannable && inner_path->rescannable; /* see the comment above; we may have a motion hazard on our inner ?! */ - if (pathnode->jpath.path.rescannable) + if (pathnode->jpath.path.rescannable && !parallel_hash) + { pathnode->jpath.path.motionHazard = outer_path->motionHazard; + pathnode->jpath.path.barrierHazard = outer_path->barrierHazard; + } else + { pathnode->jpath.path.motionHazard = outer_path->motionHazard || inner_path->motionHazard; + pathnode->jpath.path.barrierHazard = outer_path->barrierHazard || inner_path->barrierHazard; + } + + /* + * Parallel hash presents a motion hazard. If there is a parallel hash + * join in the outer child, do not use parallel hash. + * GPDB_PARALLEL_FIXME: at the very least, this should not affect + * non-parallel path generation.
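+ * In short, given an outer child with barrierHazard set: without
+ * parallel_hash (and enable_parallel on) the path is rejected below by
+ * returning NULL; with parallel_hash it is kept and batch0_barrier is
+ * set to true.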
+ */ + if (enable_parallel && outer_path->barrierHazard && !parallel_hash) + return NULL; + + if (parallel_hash && outer_path->barrierHazard) + pathnode->batch0_barrier = true; + else + pathnode->batch0_barrier = false; + pathnode->jpath.path.sameslice_relids = bms_union(inner_path->sameslice_relids, outer_path->sameslice_relids); /* @@ -4282,6 +4582,8 @@ create_projection_path_with_quals(PlannerInfo *root, pathnode->path.pathkeys = subpath->pathkeys; pathnode->path.locus = subpath->locus; pathnode->path.sameslice_relids = subpath->sameslice_relids; + pathnode->path.motionHazard = subpath->motionHazard; + pathnode->path.barrierHazard = subpath->barrierHazard; pathnode->subpath = subpath; @@ -4592,6 +4894,8 @@ create_sort_path(PlannerInfo *root, pathnode->path.parallel_workers = subpath->parallel_workers; pathnode->path.pathkeys = pathkeys; pathnode->path.locus = subpath->locus; + pathnode->path.motionHazard = subpath->motionHazard; + pathnode->path.barrierHazard = subpath->barrierHazard; pathnode->subpath = subpath; @@ -4758,6 +5062,7 @@ create_agg_path(PlannerInfo *root, pathnode->path.pathkeys = subpath->pathkeys; /* preserves order */ else pathnode->path.pathkeys = NIL; /* output is unordered */ + pathnode->path.barrierHazard = subpath->barrierHazard; pathnode->subpath = subpath; pathnode->streaming = streaming; @@ -4767,6 +5072,8 @@ create_agg_path(PlannerInfo *root, pathnode->transitionSpace = aggcosts ? aggcosts->transitionSpace : 0; pathnode->groupClause = groupClause; pathnode->qual = qual; + pathnode->path.motionHazard = subpath->motionHazard; + pathnode->path.barrierHazard = subpath->barrierHazard; cost_agg(&pathnode->path, root, aggstrategy, aggcosts, @@ -4829,7 +5136,8 @@ create_tup_split_path(PlannerInfo *root, subpath->rows); CdbPathLocus_MakeStrewn(&pathnode->path.locus, - subpath->locus.numsegments); + subpath->locus.numsegments, + subpath->parallel_workers); return pathnode; } @@ -5152,7 +5460,8 @@ create_groupingsets_path(PlannerInfo *root, */ if (CdbPathLocus_IsPartitioned(subpath->locus)) CdbPathLocus_MakeStrewn(&pathnode->path.locus, - CdbPathLocus_NumSegments(subpath->locus)); + CdbPathLocus_NumSegments(subpath->locus), + pathnode->path.parallel_workers); else pathnode->path.locus = subpath->locus; @@ -5728,7 +6037,7 @@ adjust_modifytable_subpath(PlannerInfo *root, CmdType operation, Assert(numsegments >= 0); - CdbPathLocus_MakeReplicated(&resultLocus, numsegments); + CdbPathLocus_MakeReplicated(&resultLocus, numsegments, 0); return resultLocus; } else @@ -5737,7 +6046,7 @@ adjust_modifytable_subpath(PlannerInfo *root, CmdType operation, Assert(numsegments >= 0); - CdbPathLocus_MakeStrewn(&resultLocus, numsegments); + CdbPathLocus_MakeStrewn(&resultLocus, numsegments, 0); return resultLocus; } diff --git a/src/backend/optimizer/util/walkers.c b/src/backend/optimizer/util/walkers.c index 3472ecf6de5..c94d64625ea 100644 --- a/src/backend/optimizer/util/walkers.c +++ b/src/backend/optimizer/util/walkers.c @@ -407,6 +407,7 @@ plan_tree_walker(Node *node, break; case T_Gather: + case T_GatherMerge: if (walk_plan_node_fields((Plan *) node, walker, context)) return true; /* Other fields are simple items. 
*/ diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 840053087b6..70cff912d51 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -314,7 +314,7 @@ static void check_expressions_in_partition_key(PartitionSpec *spec, core_yyscan_ CreateMatViewStmt RefreshMatViewStmt CreateAmStmt CreatePublicationStmt AlterPublicationStmt CreateSubscriptionStmt AlterSubscriptionStmt DropSubscriptionStmt - RetrieveStmt + RetrieveStmt CreateTaskStmt AlterTaskStmt DropTaskStmt /* GPDB-specific commands */ %type <node> AlterQueueStmt AlterResourceGroupStmt @@ -537,6 +537,10 @@ static void check_expressions_in_partition_key(PartitionSpec *spec, core_yyscan_ %type <list> OptSeqOptList SeqOptList OptParenthesizedSeqOptList %type <defelt> SeqOptElem +%type <list> OptTaskOptList TaskOptList AlterTaskOptList +%type <defelt> TaskOptElem AlterTaskElem +%type <str> task_schedule task_command + %type <node> insert_rest %type <infer> opt_conf_expr %type <onconflict> opt_on_conflict @@ -770,7 +774,7 @@ static void check_expressions_in_partition_key(PartitionSpec *spec, core_yyscan_ LABEL LANGUAGE LARGE_P LAST_P LATERAL_P LEADING LEAKPROOF LEAST LEFT LEVEL LIKE LIMIT LISTEN LOAD LOCAL - LOCALTIME LOCALTIMESTAMP LOCATION LOCK_P LOCKED LOGGED + LOCALTIME LOCALTIMESTAMP LOCATION LOCK_P LOCKED LOCUS LOGGED MAPPING MATCH MATERIALIZED MAXVALUE MEMORY_LIMIT MEMORY_SHARED_QUOTA MEMORY_SPILL_RATIO METHOD MINUTE_P MINVALUE MODE MONTH_P MOVE @@ -861,6 +865,8 @@ static void check_expressions_in_partition_key(PartitionSpec *spec, core_yyscan_ SCATTER SEGMENT SEGMENTS SPLIT SUBPARTITION + TASK SCHEDULE + THRESHOLD VALIDATION @@ -1137,6 +1143,7 @@ static void check_expressions_in_partition_key(PartitionSpec *spec, core_yyscan_ %nonassoc ROLLBACK %nonassoc RULE %nonassoc SAVEPOINT + %nonassoc SCHEDULE %nonassoc SCHEMA %nonassoc SCROLL %nonassoc SEARCH @@ -1163,6 +1170,7 @@ static void check_expressions_in_partition_key(PartitionSpec *spec, core_yyscan_ %nonassoc SYSTEM_P %nonassoc STRICT_P %nonassoc TABLESPACE + %nonassoc TASK %nonassoc TEMP %nonassoc TEMPLATE %nonassoc TEMPORARY @@ -1368,6 +1376,7 @@ stmt: | AlterObjectSchemaStmt | AlterOwnerStmt | AlterOperatorStmt + | AlterTaskStmt | AlterTypeStmt | AlterPolicyStmt | AlterQueueStmt @@ -1421,6 +1430,7 @@ stmt: | CreateSubscriptionStmt | CreateStatsStmt | CreateTableSpaceStmt + | CreateTaskStmt | CreateTransformStmt | CreateTrigStmt | CreateEventTrigStmt @@ -1443,6 +1453,7 @@ stmt: | DropStmt | DropSubscriptionStmt | DropTableSpaceStmt + | DropTaskStmt | DropTransformStmt | DropRoleStmt | DropUserMappingStmt @@ -6988,6 +6999,148 @@ OptTableSpaceOwner: OWNER RoleSpec { $$ = $2; } | /*EMPTY */ { $$ = NULL; } ; +/***************************************************************************** + * + * QUERY: + * CREATE TASK [IF NOT EXISTS] <name> SCHEDULE = '{ <count> SECONDS | <cron> }' + * [ DATABASE <dbname> ] + * [ USER <username> ] + * AS <sql> + * + * + *****************************************************************************/ + +CreateTaskStmt: + CREATE TASK name SCHEDULE task_schedule OptTaskOptList AS task_command + { + CreateTaskStmt *n = makeNode(CreateTaskStmt); + n->taskname = $3; + n->schedule = $5; + n->options = $6; + n->sql = $8; + n->if_not_exists = false; + $$ = (Node *) n; + } + | CREATE TASK IF_P NOT EXISTS name SCHEDULE task_schedule OptTaskOptList AS task_command + { + CreateTaskStmt *n = makeNode(CreateTaskStmt); + n->taskname = $6; + n->schedule = $8; + n->options = $9; + n->sql = $11; + n->if_not_exists = true; + $$ = (Node *) n; + } + ; + +OptTaskOptList: TaskOptList { $$ = $1; } + | /*EMPTY*/ { $$ = NIL; } + ; + +TaskOptList: TaskOptElem {
$$ = list_make1($1); } | TaskOptList TaskOptElem { $$ = lappend($1, $2); } ; + +TaskOptElem: DATABASE name + { + $$ = makeDefElem("dbname", (Node *)makeString($2), @1); + } + | USER name + { + $$ = makeDefElem("username", (Node *)makeString($2), @1); + } + ; + +task_schedule: + Sconst { $$ = $1; } + | NULL_P { $$ = NULL; } + ; + +task_command: + Sconst { $$ = $1; } + | NULL_P { $$ = NULL; } + ; + +/***************************************************************************** + * + * ALTER TASK + * + *****************************************************************************/ + +AlterTaskStmt: + ALTER TASK name AlterTaskOptList + { + AlterTaskStmt *n = makeNode(AlterTaskStmt); + n->taskname = $3; + n->options = $4; + n->missing_ok = false; + $$ = (Node *) n; + } + | ALTER TASK IF_P EXISTS name AlterTaskOptList + { + AlterTaskStmt *n = makeNode(AlterTaskStmt); + n->taskname = $5; + n->options = $6; + n->missing_ok = true; + $$ = (Node *) n; + } + ; + +AlterTaskOptList: AlterTaskElem { $$ = list_make1($1); } + | AlterTaskOptList AlterTaskElem { $$ = lappend($1, $2); } + ; + +AlterTaskElem: + SCHEDULE task_schedule + { + $$ = makeDefElem("schedule", (Node *)makeString($2), @1); + } + | DATABASE name + { + $$ = makeDefElem("dbname", (Node *)makeString($2), @1); + } + | USER name + { + $$ = makeDefElem("username", (Node *)makeString($2), @1); + } + | ACTIVE + { + $$ = makeDefElem("active", (Node *)makeInteger(true), @1); + } + | NOT ACTIVE + { + $$ = makeDefElem("active", (Node *)makeInteger(false), @1); + } + | AS task_command + { + $$ = makeDefElem("sql", (Node *)makeString($2), @1); + } + ; + +/***************************************************************************** + * + * QUERY: + * DROP TASK [ IF EXISTS ] <name> + * + *****************************************************************************/ + +DropTaskStmt: + DROP TASK name + { + DropTaskStmt *n = makeNode(DropTaskStmt); + n->taskname = $3; + n->missing_ok = false; + $$ = (Node *) n; + } + | DROP TASK IF_P EXISTS name + { + DropTaskStmt *n = makeNode(DropTaskStmt); + n->taskname = $5; + n->missing_ok = true; + $$ = (Node *) n; + } + ; + /***************************************************************************** * * QUERY : @@ -18456,6 +18609,7 @@ unreserved_keyword: | LOCATION | LOCK_P | LOCKED + | LOCUS | LOGGED | MAPPING | MASTER @@ -18566,6 +18720,7 @@ unreserved_keyword: | ROWS | RULE | SAVEPOINT + | SCHEDULE | SCHEMA | SCHEMAS | SCROLL @@ -18606,6 +18761,7 @@ unreserved_keyword: | SYSTEM_P | TABLES | TABLESPACE + | TASK | TEMP | TEMPLATE | TEMPORARY @@ -19396,6 +19552,7 @@ bare_label_keyword: | LOCATION | LOCK_P | LOCKED + | LOCUS | LOG_P | LOGGED | MAPPING @@ -19528,6 +19685,7 @@ bare_label_keyword: | ROWS | RULE | SAVEPOINT + | SCHEDULE | SCHEMA | SCHEMAS | SCROLL @@ -19577,6 +19735,7 @@ bare_label_keyword: | TABLES | TABLESAMPLE | TABLESPACE + | TASK | TEMP | TEMPLATE | TEMPORARY diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index feef706b69c..250ee3982df 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -35,6 +35,7 @@ #include "storage/proc.h" #include "storage/procsignal.h" #include "storage/shmem.h" +#include "task/pg_cron.h" #include "tcop/tcopprot.h" #include "utils/ascii.h" #include "utils/ps_status.h" @@ -156,6 +157,12 @@ static const struct { "BackoffSweeperMain", BackoffSweeperMain }, + { + "PgCronLauncherMain", PgCronLauncherMain + }, + { + "CronBackgroundWorker", CronBackgroundWorker + }, #ifdef ENABLE_IC_PROXY { "ICProxyMain",
ICProxyMain diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 2678e7ec748..fff6213b465 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -108,6 +108,8 @@ #include "common/file_perm.h" #include "common/ip.h" #include "common/string.h" +#include "crypto/bufenc.h" +#include "crypto/kmgr.h" #include "lib/ilist.h" #include "libpq/auth.h" #include "libpq/libpq.h" @@ -135,6 +137,7 @@ #include "storage/pmsignal.h" #include "storage/proc.h" #include "storage/procsignal.h" +#include "task/pg_cron.h" #include "tcop/tcopprot.h" #include "utils/builtins.h" #include "utils/datetime.h" @@ -261,6 +264,7 @@ static int SendStop = false; /* still more option variables */ bool EnableSSL = false; +int terminal_fd = -1; int PreAuthDelay = 0; int AuthenticationTimeout = 60; @@ -427,6 +431,13 @@ static BackgroundWorker PMAuxProcList[MaxPMAuxProc] = "postgres", "BackoffSweeperMain", 0, {0}, 0, BackoffSweeperStartRule}, + {"pg_cron launcher", "pg_cron launcher", + BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION, + BgWorkerStart_RecoveryFinished, + 1, + "postgres", "PgCronLauncherMain", 0, {0}, 0, + PgCronStartRule}, + #ifdef ENABLE_IC_PROXY {"ic proxy process", "ic proxy process", 0, @@ -813,7 +824,7 @@ PostmasterMain(int argc, char *argv[]) * tcop/postgres.c (the option sets should not conflict) and with the * common help() function in main/main.c. */ - while ((opt = getopt(argc, argv, "B:bc:C:D:d:EeFf:h:ijk:lMmN:nOo:Pp:r:S:sTt:W:-:")) != -1) + while ((opt = getopt(argc, argv, "B:bc:C:D:d:EeFf:h:ijk:lMmN:nOo:Pp:r:R:S:sTt:W:-:")) != -1) { switch (opt) { @@ -926,6 +937,10 @@ PostmasterMain(int argc, char *argv[]) /* only used by single-user backend */ break; + case 'R': + terminal_fd = atoi(optarg); + break; + case 'S': SetConfigOption("work_mem", optarg, PGC_POSTMASTER, PGC_S_ARGV); break; @@ -1499,6 +1514,12 @@ PostmasterMain(int argc, char *argv[]) } #endif + InitializeKmgr(); + InitializeBufferEncryption(); + + if (terminal_fd != -1) + close(terminal_fd); + /* * check that we have some socket to listen on */ diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c index 3b8a8b15e7d..c4dd626353c 100644 --- a/src/backend/replication/basebackup.c +++ b/src/backend/replication/basebackup.c @@ -19,6 +19,7 @@ #include "access/xlog_internal.h" /* for pg_start/stop_backup */ #include "catalog/pg_type.h" #include "common/file_perm.h" +#include "common/kmgr_utils.h" #include "commands/progress.h" #include "lib/stringinfo.h" #include "libpq/libpq.h" @@ -173,6 +174,9 @@ struct exclude_list_item */ static const char *const excludeDirContents[] = { + /* Skip temporary crypto key directories */ + NEW_KMGR_DIR, + OLD_KMGR_DIR, /* * Skip temporary statistics files. PG_STAT_TMP_DIR must be skipped even * when stats_temp_directory is set because PGSS_TEXT_FILE is always diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 00e59599e3b..891282bcf85 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -238,6 +238,7 @@ DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) /* GPDB_14_MERGE_FIXME: see pg_control.h, Compatible, Figure out whether 0xC0 already used? 
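 * (XLOG_ENCRYPTION_LSN records exist only to advance the LSN for the
 * page-encryption nonce; they carry no logical change, so decoding
 * skips them below, like XLOG_OVERWRITE_CONTRECORD.)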
*/ case XLOG_NEXTRELFILENODE: case XLOG_OVERWRITE_CONTRECORD: + case XLOG_ENCRYPTION_LSN: break; default: elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 617cda8ae0f..e4deb326e7d 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -42,6 +42,8 @@ #include "access/xlog.h" #include "catalog/catalog.h" #include "catalog/storage.h" +#include "catalog/storage_xlog.h" +#include "crypto/bufenc.h" #include "executor/instrument.h" #include "lib/binaryheap.h" #include "miscadmin.h" @@ -1111,7 +1113,8 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, } /* check for garbage data */ - if (!PageIsVerifiedExtended((Page) bufBlock, blockNum, + if (!PageIsVerifiedExtended((Page) bufBlock, forkNum, + blockNum, PIV_LOG_WARNING | PIV_REPORT_STAT)) { if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages) @@ -3041,12 +3044,24 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln) */ bufBlock = BufHdrGetBlock(buf); + if (FileEncryptionEnabled) + { + /* + * Technically BM_PERMANENT could indicate an init fork, but that's + * okay since forkNum would also tell us not to encrypt init forks. + */ + bufToWrite = PageEncryptCopy((Page) bufBlock, buf->tag.forkNum, + buf->tag.blockNum); + PageSetChecksumInplace((Page) bufToWrite, buf->tag.blockNum); + } + else + bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum); + /* * Update page checksum if desired. Since we have only shared lock on the * buffer, other processes might be updating hint bits in it, so we must * copy the page to private storage if we do checksumming. */ - bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum); /* * bufToWrite is either the shared buffer or a copy, as appropriate. @@ -3733,6 +3748,9 @@ FlushRelationBuffers(Relation rel) errcallback.previous = error_context_stack; error_context_stack = &errcallback; + /* XXX should we be writing a copy of the page here? */ + PageEncryptInplace(localpage, bufHdr->tag.forkNum, + bufHdr->tag.blockNum); PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); smgrwrite(rel->rd_smgr, @@ -4025,11 +4043,12 @@ IncrBufferRefCount(Buffer buffer) * This is essentially the same as MarkBufferDirty, except: * * 1. The caller does not write WAL; so if checksums are enabled, we may need - * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages. + * to write an XLOG_FPI_FOR_HINT record to protect against torn pages, or + * XLOG_ENCRYPTION_LSN to generate a new LSN for the page. * 2. The caller might have only share-lock instead of exclusive-lock on the - * buffer's content lock. + * buffer's content lock. * 3. This function does not guarantee that the buffer is always marked dirty - * (due to a race condition), so it cannot be used for important changes. + * (due to a race condition), so it cannot be used for important changes. */ void MarkBufferDirtyHint(Buffer buffer, bool buffer_std) @@ -4075,54 +4094,112 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) * If we need to protect hint bit updates from torn writes, WAL-log a * full page image of the page. This full page image is only necessary * if the hint bit update is the first change to the page since the - * last checkpoint. + * last checkpoint. If cluster file encryption is enabled, we also + * need to generate new page LSNs for all other cases of page writes. 
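+ * (The page LSN feeds the per-page encryption nonce, so any rewrite of
+ * a page whose bits changed needs a fresh LSN; see the
+ * FileEncryptionEnabled branch below.)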
* * We don't check full_page_writes here because that logic is included * when we call XLogInsert() since the value changes dynamically. */ - if (XLogHintBitIsNeeded() && - (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT)) + if (XLogHintBitIsNeeded()) { /* - * If we must not write WAL, due to a relfilenode-specific - * condition or being in recovery, don't dirty the page. We can - * set the hint, just not dirty the page as a result so the hint - * is lost when we evict the page or shutdown. + * If we must not write WAL because we are in recovery, don't dirty + * the page. We can still set the hint, just not dirty the page as a + * result, so the hint is lost when we evict the page or shut down. * * See src/backend/storage/page/README for longer discussion. */ if (RecoveryInProgress() || IsInitProcessingMode() || - RelFileNodeSkippingWAL(bufHdr->tag.rnode)) + (RelFileNodeSkippingWAL(bufHdr->tag.rnode) && + !FileEncryptionEnabled)) return; - /* - * If the block is already dirty because we either made a change - * or set a hint already, then we don't need to write a full page - * image. Note that aggressive cleaning of blocks dirtied by hint - * bit setting would increase the call rate. Bulk setting of hint - * bits would reduce the call rate... - * - * We must issue the WAL record before we mark the buffer dirty. - * Otherwise we might write the page before we write the WAL. That - * causes a race condition, since a checkpoint might occur between - * writing the WAL record and marking the buffer dirty. We solve - * that with a kluge, but one that is already in use during - * transaction commit to prevent race conditions. Basically, we - * simply prevent the checkpoint WAL record from being written - * until we have marked the buffer dirty. We don't start the - * checkpoint flush until we have marked dirty, so our checkpoint - * must flush the change to disk successfully or the checkpoint - * never gets written, so crash recovery will fix. - * - * It's possible we may enter here without an xid, so it is - * essential that CreateCheckpoint waits for virtual transactions - * rather than full transactionids. + /* + * Non-BM_PERMANENT objects don't need full page images because + * they are not restored. WAL-skipped relfilenodes should never + * have full page images generated. */ - Assert(!MyProc->delayChkpt); - MyProc->delayChkpt = true; - delayChkpt = true; - lsn = XLogSaveBufferForHint(buffer, buffer_std); + if (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT && + !RelFileNodeSkippingWAL(bufHdr->tag.rnode)) + { + /* + * If the block is already dirty because we either made a change + * or set a hint already, then we don't need to write a full page + * image. Note that aggressive cleaning of blocks dirtied by hint + * bit setting would increase the call rate. Bulk setting of hint + * bits would reduce the call rate... + * + * We must issue the WAL record before we mark the buffer dirty. + * Otherwise we might write the page before we write the WAL. That + * causes a race condition, since a checkpoint might occur between + * writing the WAL record and marking the buffer dirty. We solve + * that with a kluge, but one that is already in use during + * transaction commit to prevent race conditions. Basically, we + * simply prevent the checkpoint WAL record from being written + * until we have marked the buffer dirty.
We don't start the + * checkpoint flush until we have marked dirty, so our checkpoint + * must flush the change to disk successfully or the checkpoint + * never gets written, so crash recovery will fix. + * + * It's possible we may enter here without an xid, so it is + * essential that CreateCheckpoint waits for virtual transactions + * rather than full transactionids. + */ + Assert(!MyProc->delayChkpt); + MyProc->delayChkpt = true; + delayChkpt = true; + lsn = XLogSaveBufferForHint(buffer, buffer_std); + } + + /* + * Above, for hint bit changes, we might have generated a new page + * LSN and a full-page WAL record for a page's first-clean-to-dirty + * during a checkpoint for permanent, non-WAL-skipped relfilenodes. + * If we didn't (the lsn variable is invalid), and we are + * doing cluster file encryption, we must generate a new + * page LSN here for either non-permanent relations or page + * non-first-clean-to-dirty during a checkpoint. (Cluster file + * encryption does not support WAL-skip relfilenodes.) We must + * update the page LSN even if the page with the hint bit change is + * later overwritten in the file system with an earlier version of + * the page during crash recovery. + * + * XXX Can we rely on the full page write above with no lock being + * held to avoid torn pages? Above, the LSN and page image are + * tied together, but here is just the page LSN update. + */ + if (XLogRecPtrIsInvalid(lsn) && FileEncryptionEnabled) + { + /* + * For cluster file encryption we need a new page LSN because + * the LSN is used, with the page number and permanent flag, as + * part of the nonce, and the nonce must be unique for every + * page write. If we reencrypt a page with hint bit changes + * using the same nonce as previous writes, it would expose the + * hint bit change locations. To avoid this, we write a simple + * WAL record to advance the lsn, which can then be assigned to + * the page below. + * + * Above we are relying on the full page writes to revert + * any partial page writes caused by this LSN change for + * permanent, non-WAL-skip relfilenodes. Non-permanent + * relations crash-recover as empty. WAL-skip + * relfilenodes recover with their original contents, so + * that works too. + */ + /* XXX Do we need the checkpoint delay here? */ + MyProc->delayChkpt |= DELAY_CHKPT_START; + delayChkpt = true; + /* + * XXX We probably don't need to replay this WAL on the primary + * since the full page image is restored, but do we have + * to replay this on the replicas (for relations that are + * replicated)? + */ + lsn = LSNForEncryption( + pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT); + } } buf_state = LockBufHdr(bufHdr); diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 887d0026f59..56afad01085 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -166,7 +166,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, } return bufHdr; } - + #ifdef LBDEBUG fprintf(stderr, "LB ALLOC (%lu,%d,%d) %d\n", smgr->smgr_rnode.node.relNode, forkNum, blockNum, @@ -226,6 +226,12 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, // GPDB_93_MERGE_FIXME: is this TODO comment still relevant? // UNDONE: Unfortunately, I think we write temp relations to the mirror... + /* + * Technically BM_PERMANENT could indicate an init fork, but that's + * okay since forkNum would also tell us not to encrypt init forks.
diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c
index da8b7cbeca3..67ca0858d3b 100644
--- a/src/backend/storage/file/copydir.c
+++ b/src/backend/storage/file/copydir.c
@@ -22,11 +22,15 @@
 #include <fcntl.h>
 #include <unistd.h>

+#include "crypto/bufenc.h"
+#include "common/file_utils.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "storage/copydir.h"
 #include "storage/fd.h"

+extern XLogRecPtr LSNForEncryption(bool use_wal_lsn);
+
 /*
  * copydir: copy a directory
  *
@@ -74,7 +78,7 @@ copydir(char *fromdir, char *todir, bool recurse)
 			copydir(fromfile, tofile, true);
 		}
 		else if (S_ISREG(fst.st_mode))
-			copy_file(fromfile, tofile);
+			copy_file(fromfile, tofile, false);
 	}
 	FreeDir(xldir);

@@ -124,7 +128,7 @@
  * copy one file
  */
 void
-copy_file(char *fromfile, char *tofile)
+copy_file(char *fromfile, char *tofile, bool encrypt_init_file)
 {
 	char	   *buffer;
 	int			srcfd;
@@ -132,9 +136,8 @@
 	int			nbytes;
 	off_t		offset;
 	off_t		flush_offset;

-	/* Size of copy buffer (read and write requests) */
-#define COPY_BUF_SIZE (8 * BLCKSZ)
+	int			copy_buf_size = (encrypt_init_file) ? BLCKSZ : 8 * BLCKSZ;

 	/*
 	 * Size of data flush requests.  It seems beneficial on most platforms to
@@ -149,7 +152,7 @@
 #endif

 	/* Use palloc to ensure we get a maxaligned buffer */
-	buffer = palloc(COPY_BUF_SIZE);
+	buffer = palloc(copy_buf_size);

 	/*
	 * Open the files
@@ -187,7 +190,7 @@
 		}

 		pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_READ);
-		nbytes = read(srcfd, buffer, COPY_BUF_SIZE);
+		nbytes = read(srcfd, buffer, copy_buf_size);
 		pgstat_report_wait_end();
 		if (nbytes < 0)
 			ereport(ERROR,
@@ -195,6 +198,24 @@
 					 errmsg("could not read file \"%s\": %m", fromfile)));
 		if (nbytes == 0)
 			break;
+		/*
+		 * When we copy an init fork page to be part of an empty unlogged
+		 * relation, its real LSN must be replaced with a fake one, and the
+		 * page encrypted.
+		 */
+		if (encrypt_init_file)
+		{
+			Page		page = (Page) buffer;
+
+			if (nbytes != BLCKSZ)
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("read %d bytes from file \"%s\", expected block size %d",
+								nbytes, fromfile, BLCKSZ)));
+			PageSetLSN(page, LSNForEncryption(false));
+			PageEncryptInplace(page, MAIN_FORKNUM, offset / BLCKSZ);
+			PageSetChecksumInplace(page, offset / BLCKSZ);
+		}
+
 		errno = 0;
 		pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_WRITE);
 		if ((int) write(dstfd, buffer, nbytes) != nbytes)
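With encrypt_init_file set, copy_file above shrinks its buffer to one block so that every read is exactly one page, which can then be stamped with a fake LSN, re-encrypted, and checksummed before being written out. A standalone sketch of that per-block loop, with a stub in place of the PageSetLSN/PageEncryptInplace/PageSetChecksumInplace sequence:

    #include <stdio.h>
    #include <stdlib.h>

    #define BLOCK_SZ 8192           /* stands in for BLCKSZ */

    /* stub for the fake-LSN + encrypt + checksum sequence in copy_file() */
    static void
    reencrypt_block(char *buf, long blkno)
    {
        (void) buf;
        (void) blkno;
    }

    static int
    copy_by_block(const char *from, const char *to)
    {
        FILE       *src = fopen(from, "rb");
        FILE       *dst = fopen(to, "wb");
        char       *buf = malloc(BLOCK_SZ);
        long        blkno = 0;
        size_t      n;

        if (!src || !dst || !buf)
            return -1;

        while ((n = fread(buf, 1, BLOCK_SZ, src)) > 0)
        {
            /* an init fork must consist of whole pages */
            if (n != BLOCK_SZ)
                return -1;
            reencrypt_block(buf, blkno++);
            if (fwrite(buf, 1, n, dst) != n)
                return -1;
        }

        free(buf);
        fclose(src);
        fclose(dst);
        return 0;
    }

    int
    main(int argc, char **argv)
    {
        if (argc == 3 && copy_by_block(argv[1], argv[2]) == 0)
            puts("copied");
        return 0;
    }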
diff --git a/src/backend/storage/file/reinit.c b/src/backend/storage/file/reinit.c
index 16769697526..7a34cdceb81 100644
--- a/src/backend/storage/file/reinit.c
+++ b/src/backend/storage/file/reinit.c
@@ -16,6 +16,7 @@
 #include <unistd.h>

+#include "access/xlog.h"
 #include "catalog/catalog.h"
 #include "catalog/pg_tablespace.h"
 #include "cdb/cdbvars.h"
@@ -303,7 +304,7 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
 			/* OK, we're ready to perform the actual copy. */
 			elog(DEBUG2, "copying %s to %s", srcpath, dstpath);
-			copy_file(srcpath, dstpath);
+			copy_file(srcpath, dstpath, FileEncryptionEnabled);
 		}

 	FreeDir(dbspace_dir);
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index e2392516535..91d4dc8bf17 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -21,6 +21,7 @@
 #include "access/heapam.h"
 #include "access/multixact.h"
 #include "access/nbtree.h"
+#include "access/parallel.h"
 #include "access/subtrans.h"
 #include "access/syncscan.h"
 #include "access/twophase.h"
@@ -28,6 +29,7 @@
 #include "cdb/cdblocaldistribxact.h"
 #include "cdb/cdbvars.h"
 #include "commands/async.h"
+#include "crypto/kmgr.h"
 #include "executor/nodeShareInputScan.h"
 #include "miscadmin.h"
 #include "pgstat.h"
@@ -52,6 +54,7 @@
 #include "storage/procsignal.h"
 #include "storage/sinvaladt.h"
 #include "storage/spin.h"
+#include "task/pg_cron.h"
 #include "utils/backend_cancel.h"
 #include "utils/resource_manager.h"
 #include "utils/faultinjector.h"
@@ -148,6 +151,7 @@ CreateSharedMemoryAndSemaphores(void)
 												 sizeof(ShmemIndexEnt)));
 		size = add_size(size, dsm_estimate_size());
 		size = add_size(size, BufferShmemSize());
+		size = add_size(size, GpParallelDSMHashSize());
 		size = add_size(size, LockShmemSize());
 		size = add_size(size, PredicateLockShmemSize());
@@ -187,10 +191,12 @@
 		size = add_size(size, PgArchShmemSize());
 		size = add_size(size, ApplyLauncherShmemSize());
 		size = add_size(size, FTSReplicationStatusShmemSize());
+		size = add_size(size, PgCronLauncherShmemSize());
 		size = add_size(size, SnapMgrShmemSize());
 		size = add_size(size, BTreeShmemSize());
 		size = add_size(size, SyncScanShmemSize());
 		size = add_size(size, AsyncShmemSize());
+		size = add_size(size, KmgrShmemSize());
 #ifdef EXEC_BACKEND
 		size = add_size(size, ShmemBackendArraySize());
 #endif
@@ -303,6 +309,8 @@
 	tmShmemInit();
 	InitBufferPool();

+	InitGpParallelDSMHash();
+
 	/*
 	 * Set up lock manager
 	 */
@@ -363,6 +371,7 @@
 	PgArchShmemInit();
 	ApplyLauncherShmemInit();
 	FTSReplicationStatusShmemInit();
+	PgCronLauncherShmemInit();

 #ifdef FAULT_INJECTOR
 	FaultInjector_ShmemInit();
@@ -386,6 +395,7 @@
 	InstrShmemInit();

 	GpExpandVersionShmemInit();
+	KmgrShmemInit();

 #ifdef EXEC_BACKEND
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 64f44bb9aca..72276cc3989 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -80,6 +80,8 @@
 #define UINT32_ACCESS_ONCE(var)		 ((uint32)(*((volatile uint32 *)&(var))))

+CountDBSession_hook_type CountDBSession_hook = NULL;
+
 /* Our shared memory area */
 typedef struct ProcArrayStruct
 {
@@ -2334,10 +2336,10 @@ copyLocalSnapshot(Snapshot snapshot)
 	memcpy(snapshot->xip, SharedLocalSnapshotSlot->snapshot.xip,
 		   snapshot->xcnt * sizeof(TransactionId));
 	snapshot->curcid = SharedLocalSnapshotSlot->snapshot.curcid;
-	snapshot->subxcnt = -1;
+	snapshot->subxcnt = 0;

 	if (TransactionIdPrecedes(snapshot->xmin, TransactionXmin))
-		TransactionXmin = snapshot->xmin;
+		MyProc->xmin = TransactionXmin = snapshot->xmin;

 	ereport((Debug_print_snapshot_dtm ? LOG : DEBUG5),
 			(errmsg("Reader qExec setting shared local snapshot to: xmin: %d xmax: %d curcid: %d",
@@ -4748,6 +4750,13 @@ CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared)

 	LWLockRelease(ProcArrayLock);

+	/*
+	 * Checking only the local procArray may not be enough in a distributed
+	 * environment; use this hook to extend the check to other nodes.
+	 */
+	if (CountDBSession_hook)
+		found = found || (*CountDBSession_hook)(databaseId);
+
 	if (!found)
 		return false;			/* no conflicting backends, so done */

@@ -6381,3 +6390,19 @@ ResGroupSignalMoveQuery(int sessionId, void *slot, Oid groupId)
 	}
 	LWLockRelease(ProcArrayLock);
 }
+
+void
+LoopBackendProc(BackendProcCallbackFunction func, void *args)
+{
+	uint32		i;
+
+	ProcArrayStruct *arrayP = procArray;
+
+	LWLockAcquire(ProcArrayLock, LW_SHARED);
+	for (i = 0; i < arrayP->numProcs; i++)
+	{
+		volatile PGPROC *proc = &allProcs[arrayP->pgprocnos[i]];
+		(*func)(proc, args);
+	}
+	LWLockRelease(ProcArrayLock);
+}
\ No newline at end of file
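CountDBSession_hook lets an extension extend CountOtherDBBackends beyond the local procArray. A standalone sketch of the call pattern, with a simplified Oid typedef and a hypothetical hook that would consult the other segments:

    #include <stdbool.h>
    #include <stdio.h>

    typedef unsigned int Oid;   /* simplified stand-in for the real typedef */

    typedef bool (*CountDBSession_hook_type)(Oid databaseId);
    static CountDBSession_hook_type CountDBSession_hook = NULL;

    /* hypothetical cluster-wide check an extension might install */
    static bool
    count_db_sessions_everywhere(Oid databaseId)
    {
        /* a real implementation would ask every segment; stubbed here */
        printf("checking sessions for database %u on all segments\n", databaseId);
        return false;
    }

    int
    main(void)
    {
        bool        found = false;

        CountDBSession_hook = count_db_sessions_everywhere;

        /* mirrors the call site in CountOtherDBBackends() */
        if (CountDBSession_hook)
            found = found || (*CountDBSession_hook)(16384);

        printf("found: %d\n", found);
        return 0;
    }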
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
index 45ddb8e5ce0..fe77016bcab 100644
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -54,6 +54,17 @@
 #include "cdb/cdbvars.h"

+/*
+ * ActivateLock_hook is called after a locallock has been granted.  The hook
+ * can be used to activate the acquired lock; it returns whether activation
+ * was successful.  If it was not, we release the acquired lock and treat
+ * the situation as if the lock were unavailable.
+ * DeactivateLock_hook is called before decreasing the owner's lock count.
+ * releaseAll indicates whether we will decrease the owner's lock count
+ * to 0.
+ */
+ActivateLock_hook_type ActivateLock_hook = NULL;
+DeactivateLock_hook_type DeactivateLock_hook = NULL;

 /* This configuration variable is used to set the lock table size */
 int			max_locks_per_xact; /* set by guc.c */
@@ -895,6 +906,14 @@ LockAcquireExtended(const LOCKTAG *locktag,
 	if (locallock->nLocks > 0)
 	{
 		GrantLockLocal(locallock, owner);
+
+		if (ActivateLock_hook &&
+			!(*ActivateLock_hook)(locktag, lockmode, sessionLock, dontWait))
+		{
+			LockRelease(locktag, lockmode, sessionLock);
+			return LOCKACQUIRE_NOT_AVAIL;
+		}
+
 		if (locallock->lockCleared)
 			return LOCKACQUIRE_ALREADY_CLEAR;
 		else
@@ -1018,6 +1037,14 @@
 			locallock->lock = NULL;
 			locallock->proclock = NULL;
 			GrantLockLocal(locallock, owner);
+
+			if (ActivateLock_hook &&
+				!(*ActivateLock_hook)(locktag, lockmode, sessionLock, dontWait))
+			{
+				LockRelease(locktag, lockmode, sessionLock);
+				return LOCKACQUIRE_NOT_AVAIL;
+			}
+
 			return LOCKACQUIRE_OK;
 		}
 	}
@@ -1279,6 +1306,13 @@
 	LWLockRelease(partitionLock);

+	if (ActivateLock_hook &&
+		!(*ActivateLock_hook)(locktag, lockmode, sessionLock, dontWait))
+	{
+		LockRelease(locktag, lockmode, sessionLock);
+		return LOCKACQUIRE_NOT_AVAIL;
+	}
+
 	/*
 	 * Emit a WAL record if acquisition of this lock needs to be replayed in a
 	 * standby server.
@@ -2296,6 +2330,9 @@ LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock)
 	 */
 	locallock->nLocks--;

+	if (DeactivateLock_hook)
+		(*DeactivateLock_hook)(locktag, lockmode, sessionLock, false);
+
 	if (locallock->nLocks > 0)
 		return true;

@@ -2539,6 +2576,14 @@ LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks)
 		if (LOCALLOCK_LOCKMETHOD(*locallock) != lockmethodid)
 			continue;

+		if (DeactivateLock_hook)
+		{
+			/* Deactivate all transaction locks */
+			(*DeactivateLock_hook)(&(locallock->tag.lock), locallock->tag.mode, false, true);
+			/* Deactivate all session locks */
+			if (allLocks)
+				(*DeactivateLock_hook)(&(locallock->tag.lock), locallock->tag.mode, true, true);
+		}
+
 		/*
 		 * If we are asked to release all locks, we can just zap the entry.
 		 * Otherwise, must scan to see if there are session locks.  We assume
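The ActivateLock_hook pattern above is: grant the lock locally first, then give the hook a veto; if activation fails, the just-granted lock is released and the acquisition reports LOCKACQUIRE_NOT_AVAIL. A standalone sketch of that contract, with simplified stand-ins for the lock.c types:

    #include <stdbool.h>
    #include <stdio.h>

    /* Simplified stand-ins for the real lock.c types; illustration only. */
    typedef struct LOCKTAG { int id; } LOCKTAG;
    typedef int LOCKMODE;

    typedef bool (*ActivateLock_hook_type)(const LOCKTAG *tag, LOCKMODE mode,
                                           bool sessionLock, bool dontWait);
    static ActivateLock_hook_type ActivateLock_hook = NULL;

    static bool lock_released = false;
    static void LockRelease_stub(const LOCKTAG *tag) { (void) tag; lock_released = true; }

    /* A hypothetical hook that vetoes activation for one particular tag. */
    static bool
    veto_lock(const LOCKTAG *tag, LOCKMODE mode, bool sessionLock, bool dontWait)
    {
        (void) mode; (void) sessionLock; (void) dontWait;
        return tag->id != 42;       /* pretend tag 42 cannot be activated */
    }

    /* Mirrors the patch: grant locally, ask the hook, roll back on veto. */
    static bool
    acquire(const LOCKTAG *tag, LOCKMODE mode)
    {
        /* ... GrantLockLocal() would run here ... */
        if (ActivateLock_hook && !(*ActivateLock_hook)(tag, mode, false, false))
        {
            LockRelease_stub(tag);
            return false;           /* LOCKACQUIRE_NOT_AVAIL */
        }
        return true;                /* LOCKACQUIRE_OK */
    }

    int
    main(void)
    {
        LOCKTAG     ok = {1}, bad = {42};

        ActivateLock_hook = veto_lock;
        printf("tag 1: %s\n", acquire(&ok, 3) ? "acquired" : "unavailable");
        printf("tag 42: %s (released=%d)\n",
               acquire(&bad, 3) ? "acquired" : "unavailable", lock_released);
        return 0;
    }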
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
index a533cf8e743..c83f1588529 100644
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -69,3 +69,5 @@ ParallelCursorEndpointLock	59
 SharedSnapshotLock			60
 DistributedLogControlLock	61
 CdbConfigCacheLock			62
+KmgrFileLock				63
+GpParallelDSMHashLock		64
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index d45b76f8f4e..f717a8fc15a 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -75,6 +75,8 @@
 #include "utils/resscheduler.h"
 #include "utils/session_state.h"

+AllocSessionId_hook_type AllocSessionId_hook = NULL;
+
 /* GUC variables */
 int			DeadlockTimeout = 1000;
 int			StatementTimeout = 0;
@@ -602,6 +604,18 @@ InitProcess(void)
 	 */
 	InitLWLockAccess();
 	InitDeadLockChecking();
+
+	/*
+	 * We only overwrite gp_session_id if it was set above.
+	 */
+	if (gp_session_id == mppLocalProcessSerial &&
+		IS_QUERY_DISPATCHER() &&
+		Gp_role == GP_ROLE_DISPATCH &&
+		AllocSessionId_hook)
+	{
+		gp_session_id = (*AllocSessionId_hook)(false);
+		MyProc->mppSessionId = gp_session_id;
+	}
 }

 /*
@@ -2561,6 +2575,12 @@ void ProcNewMppSessionId(int *newSessionId)
 	*newSessionId = MyProc->mppSessionId =
 		pg_atomic_add_fetch_u32((pg_atomic_uint32 *)&ProcGlobal->mppLocalProcessCounter, 1);

+	/* overwrite it to minimize conflicts */
+	if (AllocSessionId_hook)
+	{
+		*newSessionId = (*AllocSessionId_hook)(true);
+		MyProc->mppSessionId = *newSessionId;
+	}
 	/*
 	 * Make sure that our SessionState entry correctly records our
 	 * new session id.
@@ -2654,3 +2674,17 @@ BecomeLockGroupMember(PGPROC *leader, int pid)

 	return ok;
 }
+
+void
+LoopAuxProc(AuxProcCallbackFunction func, void *args)
+{
+	uint8		index;
+
+	SpinLockAcquire(ProcStructLock);
+	for (index = 0; index < NUM_AUXILIARY_PROCS; index++)
+	{
+		volatile PGPROC *proc = &AuxiliaryProcs[index];
+		(*func)(proc, args);
+	}
+	SpinLockRelease(ProcStructLock);
+}
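LoopBackendProc above and LoopAuxProc here follow the same shape: take the protecting lock, invoke a caller-supplied callback for every PGPROC, release the lock. A standalone sketch of that callback iteration, with a simplified PGPROC and the locking reduced to comments:

    #include <stdio.h>

    /* Simplified PGPROC stand-in; the real loops hold ProcArrayLock or
     * ProcStructLock around the iteration. */
    typedef struct PGPROC { int pid; int mppSessionId; } PGPROC;

    typedef void (*ProcCallback)(volatile PGPROC *proc, void *args);

    static PGPROC procs[] = {{101, 1}, {102, 2}, {103, 2}};

    static void
    loop_procs(ProcCallback func, void *args)
    {
        /* LWLockAcquire(ProcArrayLock, LW_SHARED) in the real code */
        for (int i = 0; i < 3; i++)
            (*func)(&procs[i], args);
        /* LWLockRelease(ProcArrayLock) */
    }

    /* Example callback: count procs belonging to one session. */
    static void
    count_session(volatile PGPROC *proc, void *args)
    {
        int *state = (int *) args;  /* state[0] = session id, state[1] = count */

        if (proc->mppSessionId == state[0])
            state[1]++;
    }

    int
    main(void)
    {
        int state[2] = {2, 0};

        loop_procs(count_session, state);
        printf("session %d has %d procs\n", state[0], state[1]);
        return 0;
    }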
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 82ca91f5977..2e1927cfd6d 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -17,6 +17,7 @@
 #include "access/htup_details.h"
 #include "access/itup.h"
 #include "access/xlog.h"
+#include "crypto/bufenc.h"
 #include "pgstat.h"
 #include "storage/checksum.h"
 #include "utils/memdebug.h"
@@ -85,7 +86,8 @@ PageInit(Page page, Size pageSize, Size specialSize)
  * to pgstat.
  */
 bool
-PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags)
+PageIsVerifiedExtended(Page page, ForkNumber forknum,
+					   BlockNumber blkno, int flags)
 {
 	PageHeader	p = (PageHeader) page;
 	size_t	   *pagebytes;
@@ -108,6 +110,8 @@
 			checksum_failure = true;
 	}

+	PageDecryptInplace(page, forknum, blkno);
+
 	/*
 	 * The following checks don't prove the header is correct, only that
 	 * it looks sane enough to allow into the buffer pool.  Later usage of
@@ -1537,3 +1541,48 @@ PageSetChecksumInplace(Page page, BlockNumber blkno)

 	((PageHeader) page)->pd_checksum = pg_checksum_page((char *) page, blkno);
 }
+
+char *
+PageEncryptCopy(Page page, ForkNumber forknum,
+				BlockNumber blkno)
+{
+	static char *pageCopy = NULL;
+
+	/* If we don't need encryption, just return the passed-in data */
+	if (PageIsNew(page) || !PageNeedsToBeEncrypted(forknum))
+		return (char *) page;
+
+	/*
+	 * We allocate the copy space once and use it over on each subsequent
+	 * call.  The point of palloc'ing here, rather than having a static char
+	 * array, is first to ensure adequate alignment for the encryption code
+	 * and second to avoid wasting space in processes that never call this.
+	 */
+	if (pageCopy == NULL)
+		pageCopy = MemoryContextAlloc(TopMemoryContext, BLCKSZ);
+
+	memcpy(pageCopy, (char *) page, BLCKSZ);
+	EncryptPage(pageCopy, blkno);
+	return pageCopy;
+}
+
+void
+PageEncryptInplace(Page page, ForkNumber forknum,
+				   BlockNumber blkno)
+{
+	if (PageIsNew(page) || !PageNeedsToBeEncrypted(forknum))
+		return;
+
+	EncryptPage(page, blkno);
+}
+
+
+void
+PageDecryptInplace(Page page, ForkNumber forknum,
+				   BlockNumber blkno)
+{
+	if (PageIsNew(page) || !PageNeedsToBeEncrypted(forknum))
+		return;
+
+	DecryptPage(page, blkno);
+}
diff --git a/src/backend/task/Makefile b/src/backend/task/Makefile
new file mode 100644
index 00000000000..8fcfec794aa
--- /dev/null
+++ b/src/backend/task/Makefile
@@ -0,0 +1,23 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+#    Makefile for src/backend/task
+#
+# IDENTIFICATION
+#    src/backend/task/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/task
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+override CPPFLAGS := -I$(libpq_srcdir) -Wno-implicit-fallthrough $(CPPFLAGS)
+
+OBJS = \
+	entry.o \
+	job_metadata.o \
+	misc.o \
+	pg_cron.o \
+	task_states.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/task/entry.c b/src/backend/task/entry.c
new file mode 100644
index 00000000000..3d0037ae2aa
--- /dev/null
+++ b/src/backend/task/entry.c
@@ -0,0 +1,447 @@
+/*
+ * Copyright 1988,1990,1993,1994 by Paul Vixie
+ * All rights reserved
+ *
+ * Distribute freely, except: don't remove my name from the source or
+ * documentation (don't take credit for my work), mark your changes (don't
+ * get me blamed for your possible bugs), don't alter or remove this
+ * notice.  May be sold if buildable source is provided to buyer.  No
+ * warrantee of any kind, express or implied, is included with this
+ * software; use at your own risk, responsibility for damages (if any) to
+ * anyone resulting from the use of this software rests entirely with the
+ * user.
+ *
+ * Send bug reports, bug fixes, enhancements, requests, flames, etc., and
+ * I'll try to keep a version up to date.  I can be reached as follows:
+ * Paul Vixie            uunet!decwrl!vixie!paul
+ */
+
+/*
+ * marco 04sep16 [integrated into pg_cron]
+ * vix 26jan87 [RCS'd; rest of log is in RCS file]
+ * vix 01jan87 [added line-level error recovery]
+ * vix 31dec86 [added /step to the from-to range, per bob@acornrc]
+ * vix 30dec86 [written]
+ */
+
+#include "postgres.h"
+
+#include "stdlib.h"
+#include "string.h"
+#include "task/cron.h"
+
+typedef enum ecode {
+	e_none, e_minute, e_hour, e_dom, e_month, e_dow,
+	e_cmd, e_timespec, e_username, e_cmd_len
+} ecode_e;
+
+static int	get_list(bitstr_t *, int, int, char *[], int, FILE *),
+			get_range(bitstr_t *, int, int, char *[], int, FILE *),
+			get_number(int *, int, char *[], int, FILE *);
+static int	set_element(bitstr_t *, int, int, int);
+
+void
+free_entry(entry *e)
+{
+	free(e);
+}
+
+/*
+ * return NULL if eof or syntax error occurs;
+ * otherwise return a pointer to a new entry.
+ *
+ * Note: This function is a modified version of load_entry in Vixie
+ * cron.  It only parses the schedule part of a cron entry and uses
+ * an in-memory buffer.
+ */
+entry *
+parse_cron_entry(char *schedule)
+{
+	/*
+	 * this function reads one crontab entry -- the next -- from a file.
+	 * it skips any leading blank lines, ignores comments, and returns
+	 * EOF if for any reason the entry can't be read and parsed.
+	 *
+	 * the entry is also parsed here.
+	 *
+	 * syntax:
+	 *   user crontab:
+	 *	minutes hours doms months dows cmd\n
+	 *   system crontab (/etc/crontab):
+	 *	minutes hours doms months dows USERNAME cmd\n
+	 */
+
+	ecode_e		ecode = e_none;
+	entry	   *e = (entry *) calloc(sizeof(entry), sizeof(char));
+	int			ch = 0;
+	char		cmd[MAX_COMMAND];
+	file_buffer buffer = {{},0,0,{},0};
+	FILE	   *file = (FILE *) &buffer;
+
+	int			scheduleLength = strlen(schedule);
+
+	if (scheduleLength >= MAX_FILE_BUFFER_LENGTH)
+	{
+		ch = EOF;
+		ecode = e_cmd_len;
+		goto eof;
+	}
+
+	strcpy(buffer.data, schedule);
+	buffer.length = scheduleLength;
+	buffer.pointer = 0;
+
+	Debug(DPARS, ("load_entry()...about to eat comments\n"))
+
+	skip_comments(file);
+
+	ch = get_char(file);
+	if (ch == EOF)
+	{
+		free_entry(e);
+		return NULL;
+	}
+
+	/*
+	 * ch is now the first useful character of a useful line.
+	 * it may be an @special or it may be the first character
+	 * of a list of minutes.
+	 */
+
+	if (ch == '@') {
+		/*
+		 * all of these should be flagged and load-limited; i.e.,
+		 * instead of @hourly meaning "0 * * * *" it should mean
+		 * "close to the front of every hour but not 'til the
+		 * system load is low".  Problems are: how do you know
+		 * what "low" means? (save me from /etc/cron.conf!) and:
+		 * how to guarantee low variance (how low is low?), which
+		 * means how do we run roughly every hour -- seems like
+		 * we need to keep a history or let the first hour set
+		 * the schedule, which means we aren't load-limited
+		 * anymore.  too much for my overloaded brain. (vix, jan90)
+		 * HINT
+		 */
+		ch = get_string(cmd, MAX_COMMAND, file, " \t\n");
+		if (!strcmp("reboot", cmd) || !strcmp("restart", cmd)) {
+			e->flags |= WHEN_REBOOT;
+		} else if (!strcmp("yearly", cmd) || !strcmp("annually", cmd)){
+			bit_set(e->minute, 0);
+			bit_set(e->hour, 0);
+			bit_set(e->dom, 0);
+			bit_set(e->month, 0);
+			bit_nset(e->dow, 0, (LAST_DOW-FIRST_DOW+1));
+			e->flags |= DOW_STAR;
+		} else if (!strcmp("monthly", cmd)) {
+			bit_set(e->minute, 0);
+			bit_set(e->hour, 0);
+			bit_set(e->dom, 0);
+			bit_nset(e->month, 0, (LAST_MONTH-FIRST_MONTH+1));
+			bit_nset(e->dow, 0, (LAST_DOW-FIRST_DOW+1));
+			e->flags |= DOW_STAR;
+		} else if (!strcmp("weekly", cmd)) {
+			bit_set(e->minute, 0);
+			bit_set(e->hour, 0);
+			bit_nset(e->dom, 0, (LAST_DOM-FIRST_DOM+1));
+			e->flags |= DOM_STAR;
+			bit_nset(e->month, 0, (LAST_MONTH-FIRST_MONTH+1));
+			bit_nset(e->dow, 0,0);
+		} else if (!strcmp("daily", cmd) || !strcmp("midnight", cmd)) {
+			bit_set(e->minute, 0);
+			bit_set(e->hour, 0);
+			bit_nset(e->dom, 0, (LAST_DOM-FIRST_DOM+1));
+			bit_nset(e->month, 0, (LAST_MONTH-FIRST_MONTH+1));
+			bit_nset(e->dow, 0, (LAST_DOW-FIRST_DOW+1));
+		} else if (!strcmp("hourly", cmd)) {
+			bit_set(e->minute, 0);
+			bit_nset(e->hour, 0, (LAST_HOUR-FIRST_HOUR+1));
+			bit_nset(e->dom, 0, (LAST_DOM-FIRST_DOM+1));
+			bit_nset(e->month, 0, (LAST_MONTH-FIRST_MONTH+1));
+			bit_nset(e->dow, 0, (LAST_DOW-FIRST_DOW+1));
+			e->flags |= HR_STAR;
+		} else {
+			ecode = e_timespec;
+			goto eof;
+		}
+	} else {
+		Debug(DPARS, ("load_entry()...about to parse numerics\n"))
+
+		if (ch == '*')
+			e->flags |= MIN_STAR;
+		ch = get_list(e->minute, FIRST_MINUTE, LAST_MINUTE,
+					  PPC_NULL, ch, file);
+		if (ch == EOF) {
+			ecode = e_minute;
+			goto eof;
+		}
+
+		/* hours */
+		if (ch == '*')
+			e->flags |= HR_STAR;
+		ch = get_list(e->hour, FIRST_HOUR, LAST_HOUR,
+					  PPC_NULL, ch, file);
+		if (ch == EOF) {
+			ecode = e_hour;
+			goto eof;
+		}
+
+		/* DOM (days of month) */
+		if (ch == '*')
+			e->flags |= DOM_STAR;
+		ch = get_list(e->dom, FIRST_DOM, LAST_DOM,
+					  PPC_NULL, ch, file);
+		if (ch == EOF) {
+			ecode = e_dom;
+			goto eof;
+		}
+
+		/* month */
+		ch = get_list(e->month, FIRST_MONTH, LAST_MONTH,
+					  MonthNames, ch, file);
+		if (ch == EOF) {
+			ecode = e_month;
+			goto eof;
+		}
+
+		/* DOW (days of week) */
+		if (ch == '*')
+			e->flags |= DOW_STAR;
+		ch = get_list(e->dow, FIRST_DOW, LAST_DOW,
+					  DowNames, ch, file);
+		if (ch == EOF) {
+			ecode = e_dow;
+			goto eof;
+		}
+	}
+
+	/* make sundays equivalent */
+	if (bit_test(e->dow, 0) || bit_test(e->dow, 7)) {
+		bit_set(e->dow, 0);
+		bit_set(e->dow, 7);
+	}
+
+	/* success, fini, return pointer to the entry we just created... */
+	return e;
+
+ eof:
+	elog(DEBUG1, "failed to parse entry %d", ecode);
+	free_entry(e);
+	while (ch != EOF && ch != '\n')
+		ch = get_char(file);
+	return NULL;
+}
+
+static int
+get_list(bits, low, high, names, ch, file)
+	bitstr_t   *bits;		/* one bit per flag, default=FALSE */
+	int			low, high;	/* bounds, impl. offset for bitstr */
+	char	   *names[];	/* NULL or *[] of names for these elements */
+	int			ch;			/* current character being processed */
+	FILE	   *file;		/* file being read */
+{
+	register int done;
+
+	/*
+	 * we know that we point to a non-blank character here;
+	 * must do a Skip_Blanks before we exit, so that the
+	 * next call (or the code that picks up the cmd) can
+	 * assume the same thing.
+	 */
+
+	Debug(DPARS|DEXT, ("get_list()...entered\n"))
+
+	/* list = range {"," range} */
+
+	/* clear the bit string, since the default is 'off'. */
+	bit_nclear(bits, 0, (high-low+1));
+
+	/* process all ranges */
+	done = FALSE;
+	while (!done) {
+		ch = get_range(bits, low, high, names, ch, file);
+		if (ch == ',')
+			ch = get_char(file);
+		else
+			done = TRUE;
+	}
+
+	/* exiting.  skip to some blanks, then skip over the blanks. */
+	Skip_Nonblanks(ch, file)
+	Skip_Blanks(ch, file)
+
+	Debug(DPARS|DEXT, ("get_list()...exiting w/ %02x\n", ch))
+
+	return ch;
+}
+
+static int
+get_range(bits, low, high, names, ch, file)
+	bitstr_t   *bits;		/* one bit per flag, default=FALSE */
+	int			low,
+				high;		/* bounds, impl. offset for bitstr */
+	char	   *names[];	/* NULL or names of elements */
+	int			ch;			/* current character being processed */
+	FILE	   *file;		/* file being read */
+{
+	/* range = number | number "-" number [ "/" number ] */
+	register int i;
+	auto int	num1,
+				num2,
+				num3;
+
+	Debug(DPARS|DEXT, ("get_range()...entering, exit won't show\n"))
+
+	if (ch == '*') {
+		/* '*' means "first-last" but can still be modified by /step */
+		num1 = low;
+		num2 = high;
+		ch = get_char(file);
+		if (ch == EOF)
+			return EOF;
+	} else {
+		if (EOF == (ch = get_number(&num1, low, names, ch, file)))
+			return EOF;
+
+		if (ch != '-') {
+			/* not a range, it's a single number. */
+
+			/*
+			 * Unsupported syntax: Step specified without range,
+			 * eg:   1/20 * * * * /bin/echo "this fails"
+			 */
+			if (ch == '/')
+				return EOF;
+
+			if (EOF == set_element(bits, low, high, num1))
+				return EOF;
+			return ch;
+		} else {
+			/* eat the dash */
+			ch = get_char(file);
+			if (ch == EOF)
+				return EOF;
+
+			/* get the number following the dash */
+			ch = get_number(&num2, low, names, ch, file);
+			if (ch == EOF)
+				return EOF;
+		}
+	}
+
+	/* check for step size */
+	if (ch == '/') {
+		/* eat the slash */
+		ch = get_char(file);
+		if (ch == EOF)
+			return EOF;
+
+		/*
+		 * get the step size -- note: we don't pass the
+		 * names here, because the number is not an
+		 * element id, it's a step size.  'low' is
+		 * sent as a 0 since there is no offset either.
+		 */
+		ch = get_number(&num3, 0, PPC_NULL, ch, file);
+		if (ch == EOF || num3 <= 0)
+			return EOF;
+	} else {
+		/* no step.  default==1. */
+		num3 = 1;
+	}
+
+	/*
+	 * Explicitly check for sane values.  Certain combinations of ranges and
+	 * steps which should return EOF don't get picked up by the code below,
+	 * eg:
+	 *	5-64/30 * * * *	touch /dev/null
+	 *
+	 * Code adapted from set_elements() where this error was probably intended
+	 * to be caught.
+	 */
+	if (num1 < low || num1 > high || num2 < low || num2 > high)
+		return EOF;
+
+	/*
+	 * range.  set all elements from num1 to num2, stepping
+	 * by num3.  (the step is a downward-compatible extension
+	 * proposed conceptually by bob@acornrc, syntactically
+	 * designed then implemented by paul vixie).
+	 */
+	for (i = num1; i <= num2; i += num3)
+		if (EOF == set_element(bits, low, high, i))
+			return EOF;

+	return ch;
+}
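get_range accepts a single number, number-number, and number-number/step, with an explicit bounds check that rejects out-of-range combinations such as 5-64/30 for the minute field. A standalone sketch of the expansion step, assuming the same low/high bounds and the default step of 1:

    #include <stdio.h>

    /* Expands num1-num2/num3 the way get_range() does once it has parsed
     * the three numbers: walk from num1 to num2 in steps of num3.  Bounds
     * outside low..high are rejected, matching the sanity check above. */
    static int
    expand_range(int num1, int num2, int num3, int low, int high)
    {
        if (num1 < low || num1 > high || num2 < low || num2 > high || num3 <= 0)
            return -1;              /* get_range() returns EOF here */

        for (int i = num1; i <= num2; i += num3)
            printf("%d ", i);
        printf("\n");
        return 0;
    }

    int
    main(void)
    {
        expand_range(1, 10, 3, 0, 59);          /* "1-10/3" => 1 4 7 10 */
        if (expand_range(5, 64, 30, 0, 59) < 0) /* "5-64/30" is out of bounds */
            puts("rejected, as in the comment above");
        return 0;
    }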
+static int
+get_number(numptr, low, names, ch, file)
+	int		   *numptr;		/* where does the result go? */
+	int			low;		/* offset applied to result if symbolic enum used */
+	char	   *names[];	/* symbolic names, if any, for enums */
+	int			ch;			/* current character */
+	FILE	   *file;		/* source */
+{
+	char		temp[MAX_TEMPSTR], *pc;
+	int			len, i, all_digits;
+
+	/* collect alphanumerics into our fixed-size temp array */
+	pc = temp;
+	len = 0;
+	all_digits = TRUE;
+	while (isalnum(ch)) {
+		if (++len >= MAX_TEMPSTR)
+			return EOF;
+
+		*pc++ = ch;
+
+		if (!isdigit(ch))
+			all_digits = FALSE;
+
+		ch = get_char(file);
+	}
+	*pc = '\0';
+
+	if (len == 0) {
+		return EOF;
+	}
+
+	/* try to find the name in the name list */
+	if (names) {
+		for (i = 0; names[i] != NULL; i++) {
+			Debug(DPARS|DEXT,
+				  ("get_num, compare(%s,%s)\n", names[i], temp))
+			if (!strcasecmp(names[i], temp)) {
+				*numptr = i+low;
+				return ch;
+			}
+		}
+	}
+
+	/*
+	 * no name list specified, or there is one and our string isn't
+	 * in it.  either way: if it's all digits, use its magnitude.
+	 * otherwise, it's an error.
+	 */
+	if (all_digits) {
+		*numptr = atoi(temp);
+		return ch;
+	}
+
+	return EOF;
+}
+
+static int
+set_element(bits, low, high, number)
+	bitstr_t   *bits;		/* one bit per flag, default=FALSE */
+	int			low;
+	int			high;
+	int			number;
+{
+	Debug(DPARS|DEXT, ("set_element(?,%d,%d,%d)\n", low, high, number))
+
+	if (number < low || number > high)
+		return EOF;
+
+	bit_set(bits, (number-low));
+	return OK;
+}
diff --git a/src/backend/task/job_metadata.c b/src/backend/task/job_metadata.c
new file mode 100644
index 00000000000..e5c4da74ad5
--- /dev/null
+++ b/src/backend/task/job_metadata.c
@@ -0,0 +1,832 @@
+/*-------------------------------------------------------------------------
+ *
+ * src/job_metadata.c
+ *
+ * Functions for reading and manipulating pg_cron metadata.
+ *
+ * Copyright (c) 2016, Citus Data, Inc.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "fmgr.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/hash.h"
+#include "access/heapam.h"
+#include "access/htup_details.h"
+#include "access/skey.h"
+#include "access/xact.h"
+#include "access/xlog.h"
+
+#include "catalog/catalog.h"
+#include "catalog/indexing.h"
+#include "catalog/namespace.h"
+#include "catalog/pg_authid.h"
+#include "catalog/pg_extension.h"
+#include "catalog/pg_task.h"
+#include "catalog/pg_task_run_history.h"
+#include "catalog/pg_type.h"
+
+#include "commands/dbcommands.h"
+#include "commands/extension.h"
+#include "commands/sequence.h"
+#include "commands/trigger.h"
+#include "executor/spi.h"
+#include "postmaster/postmaster.h"
+#include "postmaster/bgworker.h"
+#include "storage/lock.h"
+#include "task/job_metadata.h"
+#include "task/pg_cron.h"
+#include "utils/acl.h"
+#include "utils/builtins.h"
+#include "utils/fmgroids.h"
+#include "utils/formatting.h"
+#include "utils/inval.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+#include "utils/relcache.h"
+#include "utils/snapmgr.h"
+#include "utils/syscache.h"
+#include "utils/varlena.h"
+
+static HTAB *CreateCronJobHash(void);
+
+static void EnsureDeletePermission(Relation cronJobsTable, HeapTuple heapTuple);
+static void InvalidateJobCacheCallback(Datum argument, Oid relationId);
+static void InvalidateJobCache(void);
+static CronJob *TupleToCronJob(TupleDesc tupleDescriptor, HeapTuple heapTuple);
+static Oid	GetRoleOidIfCanLogin(char *username);
+static entry *ParseSchedule(char *scheduleText);
+static bool TryParseInterval(char *scheduleText, uint32 *secondsInterval);
+
+/* GUC settings */
+bool		task_enable_superuser_jobs = true;
+
+/* global variables */
+static MemoryContext CronJobContext = NULL;
+static HTAB *CronJobHash = NULL;
+static Oid	CachedCronJobRelationId = InvalidOid;
+bool		CronJobCacheValid = false;
+
+/*
+ * InitializeJobMetadataCache initializes the data structures for caching
+ * job metadata.
+ */
+void
+InitializeJobMetadataCache(void)
+{
+	/* watch for invalidation events */
+	CacheRegisterRelcacheCallback(InvalidateJobCacheCallback, (Datum) 0);
+
+	CronJobContext = AllocSetContextCreate(CurrentMemoryContext,
+										   "pg_cron job context",
+										   ALLOCSET_DEFAULT_MINSIZE,
+										   ALLOCSET_DEFAULT_INITSIZE,
+										   ALLOCSET_DEFAULT_MAXSIZE);
+
+	CronJobHash = CreateCronJobHash();
+}
+
+/*
+ * ResetJobMetadataCache resets the job metadata cache to its initial
+ * state.
+ */
+void
+ResetJobMetadataCache(void)
+{
+	MemoryContextResetAndDeleteChildren(CronJobContext);
+
+	CronJobHash = CreateCronJobHash();
+}
+
+/*
+ * CreateCronJobHash creates the hash for caching job metadata.
+ */
+static HTAB *
+CreateCronJobHash(void)
+{
+	HTAB	   *taskHash = NULL;
+	HASHCTL		info;
+	int			hashFlags = 0;
+
+	memset(&info, 0, sizeof(info));
+	info.keysize = sizeof(int64);
+	info.entrysize = sizeof(CronJob);
+	info.hash = tag_hash;
+	info.hcxt = CronJobContext;
+	hashFlags = (HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+
+	taskHash = hash_create("pg_cron jobs", 32, &info, hashFlags);
+
+	return taskHash;
+}
+
+/*
+ * GetCronJob gets the cron job with the given id.
+ */
+CronJob *
+GetCronJob(int64 jobId)
+{
+	CronJob    *job = NULL;
+	int64		hashKey = jobId;
+	bool		isPresent = false;
+
+	job = hash_search(CronJobHash, &hashKey, HASH_FIND, &isPresent);
+
+	return job;
+}
+
+/*
+ * ScheduleCronJob schedules a cron job with the given name.
+ */
+int64
+ScheduleCronJob(text *scheduleText, text *commandText, text *databaseText,
+				text *usernameText, bool active, text *jobnameText)
+{
+	entry	   *parsedSchedule = NULL;
+	char	   *schedule;
+	char	   *command;
+	char	   *database_name = NULL;
+	char	   *jobName = NULL;
+	char	   *username = NULL;
+	AclResult	aclresult;
+	Oid			userIdcheckacl;
+
+	int64		jobId = 0;
+	Oid			savedUserId = InvalidOid;
+	int			savedSecurityContext = 0;
+
+	userIdcheckacl = GetUserId();
+
+	/* check schedule is valid */
+	schedule = text_to_cstring(scheduleText);
+	parsedSchedule = ParseSchedule(schedule);
+
+	if (parsedSchedule == NULL)
+	{
+		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						errmsg("invalid schedule: %s", schedule),
+						errhint("Use cron format (e.g. 5 4 * * *), or interval "
+								"format '[1-59] seconds'")));
+	}
+
+	free_entry(parsedSchedule);
+
+	command = text_to_cstring(commandText);
+
+	if (jobnameText != NULL)
+	{
+		jobName = text_to_cstring(jobnameText);
+	}
+
+	/* username has been provided */
+	if (usernameText != NULL)
+	{
+		username = text_to_cstring(usernameText);
+		userIdcheckacl = GetRoleOidIfCanLogin(username);
+	}
+
+	/* database has been provided */
+	if (databaseText != NULL)
+		database_name = text_to_cstring(databaseText);
+
+	/* first do a crude check to see whether superuser jobs are allowed */
+	if (!task_enable_superuser_jobs && superuser_arg(userIdcheckacl))
+	{
+		ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+						errmsg("cannot schedule jobs as superuser"),
+						errdetail("Scheduling jobs as superuser is disallowed when "
+								  "task_enable_superuser_jobs is set to off.")));
+	}
+
+	/* ensure the user that is used in the job can connect to the database */
+	aclresult = pg_database_aclcheck(get_database_oid(database_name, false),
+									 userIdcheckacl, ACL_CONNECT);
+	if (aclresult != ACLCHECK_OK)
+		elog(ERROR, "User %s does not have CONNECT privilege on %s",
+			 GetUserNameFromId(userIdcheckacl, false), database_name);
+
+	GetUserIdAndSecContext(&savedUserId, &savedSecurityContext);
+
+	/* insert task into pg_catalog.pg_task table */
+	jobId = TaskCreate(schedule, command, task_host_addr, PostPortNumber,
+					   database_name, username, active, jobName);
+
+	SetUserIdAndSecContext(savedUserId, savedSecurityContext);
+
+	InvalidateJobCache();
+
+	return jobId;
+}
+
+/*
+ * GetRoleOidIfCanLogin
+ *		Checks that the user exists and can log in
+ */
+static Oid
+GetRoleOidIfCanLogin(char *username)
+{
+	HeapTuple	roletup;
+	Form_pg_authid rform;
+	Oid			roleOid = InvalidOid;
+
+	roletup = SearchSysCache1(AUTHNAME, PointerGetDatum(username));
+	if (!HeapTupleIsValid(roletup))
+		ereport(ERROR,
+				(errmsg("role \"%s\" does not exist",
+						username)));
+
+	rform = (Form_pg_authid) GETSTRUCT(roletup);
+
+	if (!rform->rolcanlogin)
+		ereport(ERROR,
+				(errmsg("role \"%s\" cannot log in",
+						username),
+				 errdetail("Jobs may only be run by roles that have the LOGIN attribute.")));
+
+	roleOid = rform->oid;
+
+	ReleaseSysCache(roletup);
+	return roleOid;
+}
+
+/*
+ * NextRunId draws a new run ID from GetNewOidWithIndex.
+ */
+int64
+NextRunId(void)
+{
+	Relation	pg_task_run_history;
+	int64		runId = 0;
+	MemoryContext originalContext = CurrentMemoryContext;
+
+	StartTransactionCommand();
+	PushActiveSnapshot(GetTransactionSnapshot());
+
+	pg_task_run_history = table_open(TaskRunHistoryRelationId, RowExclusiveLock);
+	runId = GetNewOidWithIndex(pg_task_run_history, TaskRunHistoryRunIdIndexId,
+							   Anum_pg_task_run_history_runid);
+	table_close(pg_task_run_history, RowExclusiveLock);
+
+	PopActiveSnapshot();
+	CommitTransactionCommand();
+	MemoryContextSwitchTo(originalContext);
+
+	return runId;
+}
+
+Oid
+UnscheduleCronJob(const char *jobname, const char *username, Oid taskid, bool missing_ok)
+{
+	Relation	pg_task = NULL;
+	SysScanDesc scanDescriptor = NULL;
+	HeapTuple	heapTuple = NULL;
+	pid_t		cron_pid;
+	Form_pg_task task = NULL;
+	Oid			jobid = InvalidOid;
+
+	pg_task = table_open(TaskRelationId, RowExclusiveLock);
+
+	if (OidIsValid(taskid))
+	{
+		ScanKeyData scanKey[1];
+
+		ScanKeyInit(&scanKey[0], Anum_pg_task_jobid,
+					BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(taskid));
+
+		scanDescriptor = systable_beginscan(pg_task, TaskJobIdIndexId, false,
+											NULL, 1, scanKey);
+	}
+	else
+	{
+		ScanKeyData scanKey[2];
+
+		ScanKeyInit(&scanKey[0], Anum_pg_task_jobname,
+					BTEqualStrategyNumber, F_TEXTEQ, CStringGetTextDatum(jobname));
+		ScanKeyInit(&scanKey[1], Anum_pg_task_username,
+					BTEqualStrategyNumber, F_TEXTEQ, CStringGetTextDatum(username));
+
+		scanDescriptor = systable_beginscan(pg_task, TaskJobNameUserNameIndexId, false,
+											NULL, 2, scanKey);
+	}
+
+	heapTuple = systable_getnext(scanDescriptor);
+	if (!HeapTupleIsValid(heapTuple))
+	{
+		if (missing_ok)
+		{
+			systable_endscan(scanDescriptor);
+			table_close(pg_task, RowExclusiveLock);
+			ereport(NOTICE,
+					(errmsg("task \"%s\" does not exist, skipping",
+							jobname)));
+			return InvalidOid;
+		}
+		else
+			ereport(ERROR,
+					(errcode(ERRCODE_UNDEFINED_OBJECT),
+					 errmsg("task \"%s\" does not exist", jobname)));
+	}
+
+	task = (Form_pg_task) GETSTRUCT(heapTuple);
+	jobid = task->jobid;
+
+	EnsureDeletePermission(pg_task, heapTuple);
+
+	simple_heap_delete(pg_task, &heapTuple->t_self);
+
+	systable_endscan(scanDescriptor);
+	table_close(pg_task, RowExclusiveLock);
+
+	CommandCounterIncrement();
+	InvalidateJobCache();
+
+	/* Send SIGHUP to the pg_cron launcher so it reloads the task list */
+	cron_pid = PgCronLauncherPID();
+	if (cron_pid == InvalidPid)
+		elog(ERROR, "could not find pid of pg_cron launcher process");
+	if (kill(cron_pid, SIGHUP) < 0)
+		elog(DEBUG3, "kill(%ld,%d) failed: %m", (long) cron_pid, SIGHUP);
+
+	return jobid;
+}
+
+/*
+ * EnsureDeletePermission throws an error if the current user does
+ * not have permission to delete the given pg_task tuple.
+ */
+static void
+EnsureDeletePermission(Relation cronJobsTable, HeapTuple heapTuple)
+{
+	TupleDesc	tupleDescriptor = RelationGetDescr(cronJobsTable);
+
+	/* check if the current user owns the row */
+	Oid			userId = GetUserId();
+	char	   *userName = GetUserNameFromId(userId, false);
+
+	bool		isNull = false;
+	Datum		ownerNameDatum = heap_getattr(heapTuple, Anum_pg_task_username,
+											  tupleDescriptor, &isNull);
+	char	   *ownerName = TextDatumGetCString(ownerNameDatum);
+
+	if (pg_strcasecmp(userName, ownerName) != 0)
+	{
+		/* otherwise, allow if the user has DELETE permission */
+		AclResult	aclResult = pg_class_aclcheck(TaskRelationId, GetUserId(),
+												  ACL_DELETE);
+
+		if (aclResult != ACLCHECK_OK)
+		{
+			aclcheck_error(aclResult,
+						   OBJECT_TABLE,
+						   get_rel_name(TaskRelationId));
+		}
+	}
+}
+
+/*
+ * InvalidateJobCache ensures the job cache is reloaded on the next
+ * iteration of pg_cron.
+ */
+static void
+InvalidateJobCache(void)
+{
+	HeapTuple	classTuple = NULL;
+
+	classTuple = SearchSysCache1(RELOID, ObjectIdGetDatum(TaskRelationId));
+	if (HeapTupleIsValid(classTuple))
+	{
+		CacheInvalidateRelcacheByTuple(classTuple);
+		ReleaseSysCache(classTuple);
+	}
+}
+
+/*
+ * InvalidateJobCacheCallback invalidates the job cache in response to
+ * an invalidation event.
+ */
+static void
+InvalidateJobCacheCallback(Datum argument, Oid relationId)
+{
+	if (relationId == CachedCronJobRelationId ||
+		CachedCronJobRelationId == InvalidOid)
+	{
+		CronJobCacheValid = false;
+		CachedCronJobRelationId = InvalidOid;
+	}
+}
+
+/*
+ * LoadCronJobList loads the current list of jobs from the pg_task catalog
+ * and adds each job to the CronJobHash.
+ */
+List *
+LoadCronJobList(void)
+{
+	List	   *jobList = NIL;
+
+	Relation	cronJobTable = NULL;
+
+	SysScanDesc scanDescriptor = NULL;
+	ScanKeyData scanKey[1];
+	int			scanKeyCount = 0;
+	HeapTuple	heapTuple = NULL;
+	TupleDesc	tupleDescriptor = NULL;
+	MemoryContext originalContext = CurrentMemoryContext;
+
+	SetCurrentStatementStartTimestamp();
+	StartTransactionCommand();
+	PushActiveSnapshot(GetTransactionSnapshot());
+
+	/*
+	 * If we are on a hot standby, the job table is treated as being empty.
+	 */
+	if (RecoveryInProgress())
+	{
+		PopActiveSnapshot();
+		CommitTransactionCommand();
+		MemoryContextSwitchTo(originalContext);
+
+		return NIL;
+	}
+
+	cronJobTable = table_open(TaskRelationId, AccessShareLock);
+
+	scanDescriptor = systable_beginscan(cronJobTable,
+										InvalidOid, false,
+										NULL, scanKeyCount, scanKey);
+
+	tupleDescriptor = RelationGetDescr(cronJobTable);
+
+	heapTuple = systable_getnext(scanDescriptor);
+	while (HeapTupleIsValid(heapTuple))
+	{
+		MemoryContext oldContext = NULL;
+		CronJob    *job = NULL;
+		Oid			jobOwnerId = InvalidOid;
+
+		oldContext = MemoryContextSwitchTo(CronJobContext);
+
+		job = TupleToCronJob(tupleDescriptor, heapTuple);
+
+		jobOwnerId = get_role_oid(job->userName, false);
+		if (!task_enable_superuser_jobs && superuser_arg(jobOwnerId))
+		{
+			/*
+			 * Someone inserted a superuser into the metadata.  Skip over the
+			 * job when task_enable_superuser_jobs is disabled.  The memory
+			 * will be cleaned up when CronJobContext is reset.
+			 */
+			ereport(WARNING, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+							  errmsg("skipping job " INT64_FORMAT " since superuser jobs "
+									 "are currently disallowed",
+									 job->jobId)));
+		}
+		else
+		{
+			jobList = lappend(jobList, job);
+		}
+
+		MemoryContextSwitchTo(oldContext);
+
+		heapTuple = systable_getnext(scanDescriptor);
+	}
+
+	systable_endscan(scanDescriptor);
+	table_close(cronJobTable, AccessShareLock);
+
+	PopActiveSnapshot();
+	CommitTransactionCommand();
+	MemoryContextSwitchTo(originalContext);
+
+	return jobList;
+}
+
+/*
+ * TupleToCronJob takes a heap tuple and converts it into a CronJob
+ * struct.
+ */
+static CronJob *
+TupleToCronJob(TupleDesc tupleDescriptor, HeapTuple heapTuple)
+{
+	CronJob    *job = NULL;
+	int64		jobKey = 0;
+	bool		isNull = false;
+	bool		isPresent = false;
+	entry	   *parsedSchedule = NULL;
+
+	Datum		jobId = heap_getattr(heapTuple, Anum_pg_task_jobid,
+									 tupleDescriptor, &isNull);
+	Datum		schedule = heap_getattr(heapTuple, Anum_pg_task_schedule,
+										tupleDescriptor, &isNull);
+	Datum		command = heap_getattr(heapTuple, Anum_pg_task_command,
+									   tupleDescriptor, &isNull);
+	Datum		nodeName = heap_getattr(heapTuple, Anum_pg_task_nodename,
+										tupleDescriptor, &isNull);
+	Datum		nodePort = heap_getattr(heapTuple, Anum_pg_task_nodeport,
+										tupleDescriptor, &isNull);
+	Datum		database = heap_getattr(heapTuple, Anum_pg_task_database,
+										tupleDescriptor, &isNull);
+	Datum		userName = heap_getattr(heapTuple, Anum_pg_task_username,
+										tupleDescriptor, &isNull);
+
+	jobKey = DatumGetInt64(jobId);
+	job = hash_search(CronJobHash, &jobKey, HASH_ENTER, &isPresent);
+
+	job->jobId = DatumGetInt64(jobId);
+	job->scheduleText = TextDatumGetCString(schedule);
+	job->command = TextDatumGetCString(command);
+	job->nodeName = TextDatumGetCString(nodeName);
+	job->nodePort = DatumGetInt32(nodePort);
+	job->userName = TextDatumGetCString(userName);
+	job->database = TextDatumGetCString(database);
+
+	if (HeapTupleHeaderGetNatts(heapTuple->t_data) >= Anum_pg_task_active)
+	{
+		Datum		active = heap_getattr(heapTuple, Anum_pg_task_active,
+										  tupleDescriptor, &isNull);
+
+		Assert(!isNull);
+		job->active = DatumGetBool(active);
+	}
+	else
+	{
+		job->active = true;
+	}
+
+	if (tupleDescriptor->natts >= Anum_pg_task_jobname)
+	{
+		bool		isJobNameNull = false;
+		Datum		jobName = heap_getattr(heapTuple, Anum_pg_task_jobname,
+										   tupleDescriptor, &isJobNameNull);
+
+		if (!isJobNameNull)
+		{
+			job->jobName = TextDatumGetCString(jobName);
+		}
+		else
+		{
+			job->jobName = NULL;
+		}
+	}
+
+	parsedSchedule = ParseSchedule(job->scheduleText);
+	if (parsedSchedule != NULL)
+	{
+		/* copy the schedule and free the allocated memory immediately */
+		job->schedule = *parsedSchedule;
+		free_entry(parsedSchedule);
+	}
+	else
+	{
+		ereport(LOG, (errmsg("invalid pg_cron schedule for job " INT64_FORMAT ": %s",
+							 job->jobId, job->scheduleText)));
+
+		/* a zeroed out schedule never runs */
+		memset(&job->schedule, 0, sizeof(entry));
+	}
+
+	return job;
+}
+
+void
+InsertJobRunDetail(int64 runId, int64 *jobId, char *database, char *username, char *command, char *status)
+{
+	MemoryContext originalContext = CurrentMemoryContext;
+
+	SetCurrentStatementStartTimestamp();
+	StartTransactionCommand();
+	PushActiveSnapshot(GetTransactionSnapshot());
+
+	if (RecoveryInProgress())
+	{
+		PopActiveSnapshot();
+		CommitTransactionCommand();
+		MemoryContextSwitchTo(originalContext);
+		return;
+	}
+
+	TaskRunHistoryCreate(runId, jobId, database, username, command, status);
+
+	PopActiveSnapshot();
+	CommitTransactionCommand();
+	MemoryContextSwitchTo(originalContext);
+}
+
+void
+UpdateJobRunDetail(int64 runId, int32 *job_pid, char *status, char *return_message,
+				   TimestampTz *start_time, TimestampTz *end_time)
+{
+	MemoryContext originalContext = CurrentMemoryContext;
+
+	SetCurrentStatementStartTimestamp();
+	StartTransactionCommand();
+	PushActiveSnapshot(GetTransactionSnapshot());
+
+	if (RecoveryInProgress())
+	{
+		PopActiveSnapshot();
+		CommitTransactionCommand();
+		MemoryContextSwitchTo(originalContext);
+		return;
+	}
+
+	TaskRunHistoryUpdate(runId, job_pid, status, return_message, start_time, end_time);
+
+	PopActiveSnapshot();
+	CommitTransactionCommand();
+	MemoryContextSwitchTo(originalContext);
+}
+
+void
+AlterCronJob(int64 jobId, char *schedule, char *command,
+			 char *database_name, char *username, bool *active)
+{
+	AclResult	aclresult;
+	Oid			userIdcheckacl;
+	Oid			savedUserId;
+	int			savedSecurityContext;
+	entry	   *parsedSchedule = NULL;
+
+	userIdcheckacl = GetUserId();
+
+	savedUserId = InvalidOid;
+	savedSecurityContext = 0;
+
+	if (RecoveryInProgress())
+	{
+		return;
+	}
+
+	/* username has been provided */
+	if (username != NULL)
+	{
+		if (!superuser())
+			elog(ERROR, "must be superuser to alter username");
+
+		userIdcheckacl = GetRoleOidIfCanLogin(username);
+	}
+
+	if (!task_enable_superuser_jobs && superuser_arg(userIdcheckacl))
+	{
+		ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+						errmsg("cannot schedule jobs as superuser"),
+						errdetail("Scheduling jobs as superuser is disallowed when "
+								  "task_enable_superuser_jobs is set to off.")));
+	}
+
+	/* database has been provided */
+	if (database_name != NULL)
+	{
+		/* ensure the user that is used in the job can connect to the database */
+		aclresult = pg_database_aclcheck(get_database_oid(database_name, false),
+										 userIdcheckacl, ACL_CONNECT);
+
+		if (aclresult != ACLCHECK_OK)
+			elog(ERROR, "User %s does not have CONNECT privilege on %s",
+				 GetUserNameFromId(userIdcheckacl, false), database_name);
+	}
+
+	/* ensure schedule is valid */
+	if (schedule != NULL)
+	{
+		parsedSchedule = ParseSchedule(schedule);
+
+		if (parsedSchedule == NULL)
+		{
+			ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+							errmsg("invalid schedule: %s", schedule),
+							errhint("Use cron format (e.g. 5 4 * * *), or interval "
+									"format '[1-59] seconds'")));
+		}
+
+		free_entry(parsedSchedule);
+	}
+
+	GetUserIdAndSecContext(&savedUserId, &savedSecurityContext);
+
+	TaskUpdate(jobId, schedule, command, database_name, username, active);
+
+	SetUserIdAndSecContext(savedUserId, savedSecurityContext);
+	InvalidateJobCache();
+}
+
+void
+MarkPendingRunsAsFailed(void)
+{
+	MemoryContext originalContext = CurrentMemoryContext;
+
+	SetCurrentStatementStartTimestamp();
+	StartTransactionCommand();
+	PushActiveSnapshot(GetTransactionSnapshot());
+
+	if (RecoveryInProgress())
+	{
+		PopActiveSnapshot();
+		CommitTransactionCommand();
+		MemoryContextSwitchTo(originalContext);
+		return;
+	}
+
+	MarkRunningTaskAsFailed();
+
+	PopActiveSnapshot();
+	CommitTransactionCommand();
+	MemoryContextSwitchTo(originalContext);
+}
+
+char *
+GetCronStatus(CronStatus cronstatus)
+{
+	char	   *statusDesc = "unknown status";
+
+	switch (cronstatus)
+	{
+		case CRON_STATUS_STARTING:
+			statusDesc = "starting";
+			break;
+		case CRON_STATUS_RUNNING:
+			statusDesc = "running";
+			break;
+		case CRON_STATUS_SENDING:
+			statusDesc = "sending";
+			break;
+		case CRON_STATUS_CONNECTING:
+			statusDesc = "connecting";
+			break;
+		case CRON_STATUS_SUCCEEDED:
+			statusDesc = "succeeded";
+			break;
+		case CRON_STATUS_FAILED:
+			statusDesc = "failed";
+			break;
+		default:
+			break;
+	}
+	return statusDesc;
+}
+
+/*
+ * ParseSchedule attempts to parse a cron schedule or an interval in seconds.
+ * The returned pointer is heap-allocated and should be freed by the caller.
+ */
+static entry *
+ParseSchedule(char *scheduleText)
+{
+	uint32		secondsInterval = 0;
+
+	/*
+	 * First try to parse as a cron schedule.
+	 */
+	entry	   *schedule = parse_cron_entry(scheduleText);
+
+	if (schedule != NULL)
+	{
+		/* valid cron schedule */
+		return schedule;
+	}
+
+	/*
+	 * Parse as an interval in seconds.
+	 */
+	if (TryParseInterval(scheduleText, &secondsInterval))
+	{
+		entry	   *schedule = calloc(sizeof(entry), sizeof(char));
+
+		schedule->secondsInterval = secondsInterval;
+		return schedule;
+	}
+
+	elog(LOG, "failed to parse schedule: %s", scheduleText);
+	return NULL;
+}
+
+
+/*
+ * TryParseInterval returns whether scheduleText is of the form
+ * <positive number> second[s].
+ */
+static bool
+TryParseInterval(char *scheduleText, uint32 *secondsInterval)
+{
+	char		lastChar = '\0';
+	char		plural = '\0';
+	char		extra = '\0';
+	char	   *lowercaseSchedule = asc_tolower(scheduleText, strlen(scheduleText));
+
+	int			numParts = sscanf(lowercaseSchedule, " %u secon%c%c %c", secondsInterval,
+								  &lastChar, &plural, &extra);
+
+	if (lastChar != 'd')
+	{
+		/* value did not have a "second" suffix */
+		return false;
+	}
+
+	if (numParts == 2)
+	{
+		/* second (allow "2 second") */
+		return 0 < *secondsInterval && *secondsInterval < 60;
+	}
+	else if (numParts == 3 && plural == 's')
+	{
+		/* seconds (allow "1 seconds") */
+		return 0 < *secondsInterval && *secondsInterval < 60;
+	}
+
+	return false;
+}
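TryParseInterval leans entirely on a single sscanf format, " %u secon%c%c %c", which captures the digits, the 'd' of "second", an optional plural 's', and any trailing junk. A standalone replica of just that trick, useful for seeing which inputs pass (the real function lower-cases the input first via asc_tolower):

    #include <stdbool.h>
    #include <stdio.h>

    /* Standalone replica of the sscanf trick in TryParseInterval(), for a
     * lower-cased input string. */
    static bool
    try_parse_interval(const char *s, unsigned *seconds)
    {
        char    lastChar = '\0', plural = '\0', extra = '\0';
        int     n = sscanf(s, " %u secon%c%c %c", seconds,
                           &lastChar, &plural, &extra);

        if (lastChar != 'd')
            return false;                   /* no "second" suffix */
        if (n == 2)                         /* "30 second" */
            return 0 < *seconds && *seconds < 60;
        if (n == 3 && plural == 's')        /* "30 seconds" */
            return 0 < *seconds && *seconds < 60;
        return false;                       /* trailing junk or bad count */
    }

    int
    main(void)
    {
        const char *tests[] = {"30 seconds", "1 second", "90 seconds",
                               "10 seconds foo", "10 minutes"};
        unsigned    sec;

        for (int i = 0; i < 5; i++)
            printf("%-16s => %s\n", tests[i],
                   try_parse_interval(tests[i], &sec) ? "valid" : "invalid");
        return 0;
    }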
diff --git a/src/backend/task/misc.c b/src/backend/task/misc.c
new file mode 100644
index 00000000000..abc2f8a7f62
--- /dev/null
+++ b/src/backend/task/misc.c
@@ -0,0 +1,164 @@
+/*
+ * Copyright 1988,1990,1993,1994 by Paul Vixie
+ * All rights reserved
+ *
+ * Distribute freely, except: don't remove my name from the source or
+ * documentation (don't take credit for my work), mark your changes (don't
+ * get me blamed for your possible bugs), don't alter or remove this
+ * notice.  May be sold if buildable source is provided to buyer.  No
+ * warrantee of any kind, express or implied, is included with this
+ * software; use at your own risk, responsibility for damages (if any) to
+ * anyone resulting from the use of this software rests entirely with the
+ * user.
+ *
+ * Send bug reports, bug fixes, enhancements, requests, flames, etc., and
+ * I'll try to keep a version up to date.  I can be reached as follows:
+ * Paul Vixie            uunet!decwrl!vixie!paul
+ */
+
+/*
+ * marco 07nov16 [removed code not needed by pg_cron]
+ * marco 04sep16 [integrated into pg_cron]
+ * vix 26jan87 [RCS has the rest of the log]
+ * vix 30dec86 [written]
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include "task/cron.h"
+
+/*
+ * get_char(file) : like getc() but increment LineNumber on newlines
+ */
+int
+get_char(file)
+	FILE	   *file;
+{
+	int			ch;
+
+	/*
+	 * Sneaky hack: we wrapped an in-memory buffer into a FILE *
+	 * to minimize changes to cron.c.
+	 *
+	 * This code replaces:
+	 * ch = getc(file);
+	 */
+	file_buffer *buffer = (file_buffer *) file;
+
+	if (buffer->unget_count > 0)
+	{
+		ch = buffer->unget_data[--buffer->unget_count];
+	}
+	else if (buffer->pointer == buffer->length)
+	{
+		ch = '\0';
+		buffer->pointer++;
+	}
+	else if (buffer->pointer > buffer->length)
+	{
+		ch = EOF;
+	}
+	else
+	{
+		ch = buffer->data[buffer->pointer++];
+	}
+
+	if (ch == '\n')
+		Set_LineNum(LineNumber + 1);
+	return ch;
+}
+
+/*
+ * unget_char(ch, file) : like ungetc but do LineNumber processing
+ */
+void
+unget_char(ch, file)
+	int			ch;
+	FILE	   *file;
+{
+	/*
+	 * Sneaky hack: we wrapped an in-memory buffer into a FILE *
+	 * to minimize changes to cron.c.
+	 *
+	 * This code replaces:
+	 * ungetc(ch, file);
+	 */
+	file_buffer *buffer = (file_buffer *) file;
+
+	if (buffer->unget_count >= 1024)
+	{
+		perror("ungetc limit exceeded");
+		exit(ERROR_EXIT);
+	}
+
+	buffer->unget_data[buffer->unget_count++] = ch;
+
+	if (ch == '\n')
+		Set_LineNum(LineNumber - 1);
+}
+
+/*
+ * get_string(str, max, file, termstr) : like fgets() but
+ *		(1) has terminator string which should include \n
+ *		(2) will always leave room for the null
+ *		(3) uses get_char() so LineNumber will be accurate
+ *		(4) returns EOF or terminating character, whichever
+ */
+int
+get_string(string, size, file, terms)
+	char	   *string;
+	int			size;
+	FILE	   *file;
+	char	   *terms;
+{
+	int			ch;
+
+	while (EOF != (ch = get_char(file)) && !strchr(terms, ch)) {
+		if (size > 1) {
+			*string++ = (char) ch;
+			size--;
+		}
+	}
+
+	if (size > 0)
+		*string = '\0';
+
+	return ch;
+}
+
+/*
+ * skip_comments(file) : read past comment (if any)
+ */
+void
+skip_comments(file)
+	FILE	   *file;
+{
+	int			ch;
+
+	while (EOF != (ch = get_char(file))) {
+		/* ch is now the first character of a line. */
+		while (ch == ' ' || ch == '\t')
+			ch = get_char(file);
+
+		if (ch == EOF)
+			break;
+
+		/* ch is now the first non-blank character of a line. */
+		if (ch != '\n' && ch != '#')
+			break;
+
+		/*
+		 * ch must be a newline or comment as first non-blank
+		 * character on a line.
+		 */
+		while (ch != '\n' && ch != EOF)
+			ch = get_char(file);
+
+		/* ch is now the newline of a line which we're going to ignore. */
+	}
+	if (ch != EOF)
+		unget_char(ch, file);
+}
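misc.c works because a file_buffer is laid out so it can be passed around as a FILE * while get_char/unget_char reinterpret it as an in-memory buffer with a small pushback stack. A standalone miniature of that idea (smaller buffers; the real struct lives in task/cron.h):

    #include <stdio.h>
    #include <string.h>

    /* A minimal replica of the file_buffer idea: an in-memory buffer read
     * through a getc()-like interface, with a pushback stack. */
    typedef struct mini_buffer
    {
        char    data[128];
        int     length;
        int     pointer;
        char    unget_data[8];
        int     unget_count;
    } mini_buffer;

    static int
    mini_get_char(mini_buffer *b)
    {
        if (b->unget_count > 0)
            return b->unget_data[--b->unget_count];
        if (b->pointer == b->length)
        {
            b->pointer++;
            return '\0';        /* one NUL at end of buffer, then EOF */
        }
        if (b->pointer > b->length)
            return EOF;
        return b->data[b->pointer++];
    }

    static void
    mini_unget_char(mini_buffer *b, int ch)
    {
        b->unget_data[b->unget_count++] = (char) ch;
    }

    int
    main(void)
    {
        mini_buffer b = {0};
        int         ch;

        strcpy(b.data, "5 4 * * *");
        b.length = (int) strlen(b.data);

        ch = mini_get_char(&b);     /* '5' */
        mini_unget_char(&b, ch);    /* push it back */
        while ((ch = mini_get_char(&b)) != EOF && ch != '\0')
            putchar(ch);
        putchar('\n');
        return 0;
    }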
diff --git a/src/backend/task/pg_cron.c b/src/backend/task/pg_cron.c
new file mode 100644
index 00000000000..808b81d406e
--- /dev/null
+++ b/src/backend/task/pg_cron.c
@@ -0,0 +1,2008 @@
+/*-------------------------------------------------------------------------
+ *
+ * src/pg_cron.c
+ *
+ * Implementation of the pg_cron task scheduler.
+ *
+ * Wording:
+ * - A job is a scheduling definition of a task
+ * - A task is what is actually executed within the database engine
+ *
+ * Copyright (c) 2016, Citus Data, Inc.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include
+
+#include "fmgr.h"
+#include "pgstat.h"
+#include "postgres.h"
+
+/* these are always necessary for a bgworker */
+#include "miscadmin.h"
+#include "postmaster/bgworker.h"
+#include "storage/ipc.h"
+#include "storage/latch.h"
+#include "storage/lwlock.h"
+#include "storage/proc.h"
+#include "storage/shm_mq.h"
+#include "storage/shm_toc.h"
+#include "storage/shmem.h"
+
+/* these headers are used by this particular worker's code */
+
+#define MAIN_PROGRAM
+
+#ifdef HAVE_POLL_H
+#include <poll.h>
+#elif defined(HAVE_SYS_POLL_H)
+#include <sys/poll.h>
+#endif
+
+#include "sys/time.h"
+#include "time.h"
+
+#include "access/genam.h"
+#include "access/heapam.h"
+#include "access/htup_details.h"
+#include "access/printtup.h"
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "catalog/pg_extension.h"
+#include "catalog/indexing.h"
+#include "catalog/namespace.h"
+#include "cdb/cdbvars.h"
+#include "commands/async.h"
+#include "commands/dbcommands.h"
+#include "commands/extension.h"
+#include "commands/sequence.h"
+#include "commands/trigger.h"
+#if (PG_VERSION_NUM >= 160000)
+#include "utils/guc_hooks.h"
+#else
+#include "commands/variable.h"
+#endif
+#include "lib/stringinfo.h"
+#include "libpq-fe.h"
+#include "libpq/pqformat.h"
+#include "libpq/pqmq.h"
+#include "libpq/pqsignal.h"
+#include "mb/pg_wchar.h"
+#include "nodes/nodes.h"
+#include "parser/analyze.h"
+#include "postmaster/postmaster.h"
+#include "task/pg_cron.h"
+#include "task/task_states.h"
+#include "tcop/pquery.h"
+#include "tcop/utility.h"
+#include "utils/builtins.h"
+#include "utils/fmgroids.h"
+#include "utils/inval.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/portal.h"
+#include "utils/ps_status.h"
+#include "utils/rel.h"
+#include "utils/snapmgr.h"
+#include "utils/syscache.h"
+#include "utils/timeout.h"
+#include "utils/timestamp.h"
+#include "utils/varlena.h"
+
+#ifndef MAXINT8LEN
+#define MAXINT8LEN 20
+#endif
+
+/* Table-of-contents constants for our dynamic shared memory segment. */
+#define PG_CRON_MAGIC			0x51028080
+#define PG_CRON_KEY_DATABASE	0
+#define PG_CRON_KEY_USERNAME	1
+#define PG_CRON_KEY_COMMAND		2
+#define PG_CRON_KEY_QUEUE		3
+#define PG_CRON_NKEYS			4
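These keys name entries in a dynamic shared memory table of contents. For illustration only (this fragment is not taken from the patch), backend code typically fills such a segment with the standard shm_toc API along these lines, using a hypothetical stash_database_name helper:

    #include "postgres.h"
    #include "storage/dsm.h"
    #include "storage/shm_toc.h"

    /* Hypothetical illustration: stash a database name in a DSM segment
     * under PG_CRON_KEY_DATABASE so a worker can read it back with
     * shm_toc_attach() + shm_toc_lookup(). */
    static dsm_segment *
    stash_database_name(const char *database)
    {
        shm_toc_estimator e;
        Size        segsize;
        dsm_segment *seg;
        shm_toc    *toc;
        char       *dbptr;

        /* estimate space for one chunk under one key */
        shm_toc_initialize_estimator(&e);
        shm_toc_estimate_chunk(&e, strlen(database) + 1);
        shm_toc_estimate_keys(&e, 1);
        segsize = shm_toc_estimate(&e);

        seg = dsm_create(segsize, 0);
        toc = shm_toc_create(PG_CRON_MAGIC, dsm_segment_address(seg), segsize);

        dbptr = shm_toc_allocate(toc, strlen(database) + 1);
        strcpy(dbptr, database);
        shm_toc_insert(toc, PG_CRON_KEY_DATABASE, dbptr);

        return seg;
    }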
+/* ways in which the clock can change between main loop iterations */
+typedef enum
+{
+	CLOCK_JUMP_BACKWARD = 0,
+	CLOCK_PROGRESSED = 1,
+	CLOCK_JUMP_FORWARD = 2,
+	CLOCK_CHANGE = 3
+} ClockProgress;
+
+static void pg_cron_sigterm(SIGNAL_ARGS);
+static void pg_cron_sighup(SIGNAL_ARGS);
+
+static void StartAllPendingRuns(List *taskList, TimestampTz currentTime);
+static void StartPendingRuns(CronTask *task, ClockProgress clockProgress,
+							 TimestampTz lastMinute, TimestampTz currentTime);
+static int	MinutesPassed(TimestampTz startTime, TimestampTz stopTime);
+static TimestampTz TimestampMinuteStart(TimestampTz time);
+static TimestampTz TimestampMinuteEnd(TimestampTz time);
+static bool ShouldRunTask(entry *schedule, TimestampTz currentMinute,
+						  bool doWild, bool doNonWild);
+
+static void WaitForCronTasks(List *taskList);
+static void WaitForLatch(int timeoutMs);
+static void PollForTasks(List *taskList);
+static bool CanStartTask(CronTask *task);
+static void ManageCronTasks(List *taskList, TimestampTz currentTime);
+static void ManageCronTask(CronTask *task, TimestampTz currentTime);
+static void ExecuteSqlString(const char *sql);
+static void GetTaskFeedback(PGresult *result, CronTask *task);
+static void ProcessBgwTaskFeedback(CronTask *task, bool running);
+
+static bool jobCanceled(CronTask *task);
+static bool jobStartupTimeout(CronTask *task, TimestampTz currentTime);
+static char *pg_cron_cmdTuples(char *msg);
+static void bgw_generate_returned_message(StringInfoData *display_msg, ErrorData edata);
+
+/* GUC settings */
+bool		task_log_statement = true;
+bool		task_log_run = true;
+bool		task_use_background_worker = false;
+char	   *task_timezone = "GMT";
+int			max_running_tasks = 5;
+char	   *task_host_addr = "127.0.0.1";
+
+/* flags set by signal handlers */
+static volatile sig_atomic_t got_sigterm = false;
+
+/* global variables */
+static int	CronTaskStartTimeout = 10000;	/* maximum connection time */
+static const int MaxWait = 1000;	/* maximum time in ms that poll() can block */
+static bool RebootJobsScheduled = false;
+static int	RunningTaskCount = 0;
+static char *CronTableDatabaseName = "postgres";
+static PgCronData *PgCron = NULL;
+
+/*
+ * Signal handler for SIGTERM
+ * Set a flag to tell the main loop to terminate, and set our latch to wake it up.
+ */
+static void
+pg_cron_sigterm(SIGNAL_ARGS)
+{
+	got_sigterm = true;
+
+	if (MyProc != NULL)
+	{
+		SetLatch(&MyProc->procLatch);
+	}
+}
+
+/*
+ * Signal handler for SIGHUP
+ * Set a flag to tell the main loop to reload the cron jobs.
+ */
+static void
+pg_cron_sighup(SIGNAL_ARGS)
+{
+	CronJobCacheValid = false;
+
+	if (MyProc != NULL)
+	{
+		SetLatch(&MyProc->procLatch);
+	}
+}
+
+/*
+ * pg_cron_cmdTuples -
+ *	mainly copy/pasted from PQcmdTuples
+ *	If the last command was INSERT/UPDATE/DELETE/MOVE/FETCH/COPY, return
+ *	a string containing the number of inserted/affected tuples.  If not, return "".
+ *	XXX: this should probably return an int
+ */
+
+static char *
+pg_cron_cmdTuples(char *msg)
+{
+	char	   *p,
+			   *c;
+
+	if (!msg)
+		return "";
+
+	if (strncmp(msg, "INSERT ", 7) == 0)
+	{
+		p = msg + 7;
+		/* INSERT: skip oid and space */
+		while (*p && *p != ' ')
+			p++;
+		if (*p == 0)
+			goto interpret_error;	/* no space? */
+/*
+ * pg_cron_cmdTuples -
+ *	mainly copy/pasted from PQcmdTuples
+ *	If the last command was INSERT/UPDATE/DELETE/MOVE/FETCH/COPY, return
+ *	a string containing the number of inserted/affected tuples. If not, return "".
+ *	XXX: this should probably return an int
+ */
+static char *
+pg_cron_cmdTuples(char *msg)
+{
+	char	   *p,
+			   *c;
+
+	if (!msg)
+		return "";
+
+	if (strncmp(msg, "INSERT ", 7) == 0)
+	{
+		p = msg + 7;
+		/* INSERT: skip oid and space */
+		while (*p && *p != ' ')
+			p++;
+		if (*p == 0)
+			goto interpret_error;	/* no space? */
+		p++;
+	}
+	else if (strncmp(msg, "SELECT ", 7) == 0 ||
+			 strncmp(msg, "DELETE ", 7) == 0 ||
+			 strncmp(msg, "UPDATE ", 7) == 0)
+		p = msg + 7;
+	else if (strncmp(msg, "FETCH ", 6) == 0)
+		p = msg + 6;
+	else if (strncmp(msg, "MOVE ", 5) == 0 ||
+			 strncmp(msg, "COPY ", 5) == 0)
+		p = msg + 5;
+	else
+		return "";
+
+	/* check that we have an integer (at least one digit, nothing else) */
+	for (c = p; *c; c++)
+	{
+		if (!isdigit((unsigned char) *c))
+			goto interpret_error;
+	}
+	if (c == p)
+		goto interpret_error;
+
+	return p;
+
+interpret_error:
+	ereport(LOG, (errmsg("could not interpret result from server: %s", msg)));
+	return "";
+}
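Editorial note: the tag parsing above can be exercised in isolation. This sketch reimplements just the INSERT branch against strings shaped like libpq command tags (the tag values are made up for the demo; the real function also handles SELECT/UPDATE/DELETE/FETCH/MOVE/COPY):

    #include <ctype.h>
    #include <stdio.h>
    #include <string.h>

    /* Return the affected-row count from an "INSERT <oid> <rows>" tag, or "". */
    static const char *
    insert_tuples(const char *msg)
    {
        const char *p;
        const char *c;

        if (strncmp(msg, "INSERT ", 7) != 0)
            return "";
        p = msg + 7;
        while (*p && *p != ' ')     /* skip the oid column */
            p++;
        if (*p == '\0')
            return "";
        p++;
        for (c = p; *c; c++)        /* only digits may follow */
            if (!isdigit((unsigned char) *c))
                return "";
        return (c == p) ? "" : p;
    }

    int
    main(void)
    {
        printf("'%s'\n", insert_tuples("INSERT 0 5"));  /* prints '5' */
        printf("'%s'\n", insert_tuples("INSERT 0"));    /* prints ''  */
        return 0;
    }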
+/*
+ * bgw_generate_returned_message -
+ *	generates the message to be inserted into the job_run_details table;
+ *	the first part comes from error_severity (elog.c)
+ */
+static void
+bgw_generate_returned_message(StringInfoData *display_msg, ErrorData edata)
+{
+	const char *prefix;
+
+	switch (edata.elevel)
+	{
+		case DEBUG1:
+		case DEBUG2:
+		case DEBUG3:
+		case DEBUG4:
+		case DEBUG5:
+			prefix = gettext_noop("DEBUG");
+			break;
+		case LOG:
+		case LOG_SERVER_ONLY:
+			prefix = gettext_noop("LOG");
+			break;
+		case INFO:
+			prefix = gettext_noop("INFO");
+			break;
+		case NOTICE:
+			prefix = gettext_noop("NOTICE");
+			break;
+		case WARNING:
+			prefix = gettext_noop("WARNING");
+			break;
+		case ERROR:
+			prefix = gettext_noop("ERROR");
+			break;
+		case FATAL:
+			prefix = gettext_noop("FATAL");
+			break;
+		case PANIC:
+			prefix = gettext_noop("PANIC");
+			break;
+		default:
+			prefix = "???";
+			break;
+	}
+
+	appendStringInfo(display_msg, "%s: %s", prefix, edata.message);
+
+	if (edata.detail != NULL)
+		appendStringInfo(display_msg, "\nDETAIL: %s", edata.detail);
+
+	if (edata.hint != NULL)
+		appendStringInfo(display_msg, "\nHINT: %s", edata.hint);
+
+	if (edata.context != NULL)
+		appendStringInfo(display_msg, "\nCONTEXT: %s", edata.context);
+}
+
+bool
+PgCronStartRule(Datum main_arg)
+{
+	return (Gp_role == GP_ROLE_DISPATCH);
+}
+
+pid_t
+PgCronLauncherPID(void)
+{
+	return PgCron->cron_pid;
+}
+
+Size
+PgCronLauncherShmemSize(void)
+{
+	Size		size = 0;
+
+	size = add_size(size, sizeof(PgCronData));
+
+	return size;
+}
+
+/* Allocate and initialize pg_cron related shared memory */
+void
+PgCronLauncherShmemInit(void)
+{
+	bool		found;
+
+	PgCron = (PgCronData *)
+		ShmemInitStruct("Cron Data", PgCronLauncherShmemSize(), &found);
+
+	if (!found)
+	{
+		/* First time through, so initialize */
+		MemSet(PgCron, 0, PgCronLauncherShmemSize());
+		PgCron->cron_pid = 0;
+	}
+}
+
+/*
+ * PgCronLauncherMain is the main entry-point for the background worker
+ * that performs tasks.
+ */
+void
+PgCronLauncherMain(Datum arg)
+{
+	PgCron->cron_pid = MyProcPid;
+
+	MemoryContext CronLoopContext = NULL;
+
+	/* Establish signal handlers before unblocking signals. */
+	pqsignal(SIGHUP, pg_cron_sighup);
+	pqsignal(SIGINT, SIG_IGN);
+	pqsignal(SIGTERM, pg_cron_sigterm);
+
+	/* We're now ready to receive signals */
+	BackgroundWorkerUnblockSignals();
+
+	/* Connect to our database */
+	BackgroundWorkerInitializeConnection(CronTableDatabaseName, NULL, 0);
+
+	/* Make pg_cron recognisable in pg_stat_activity */
+	pgstat_report_appname("pg_cron scheduler");
+
+	/*
+	 * Mark anything that was in progress before the database restarted as
+	 * failed.
+	 */
+	MarkPendingRunsAsFailed();
+
+	CronLoopContext = AllocSetContextCreate(CurrentMemoryContext,
+											"pg_cron loop context",
+											ALLOCSET_DEFAULT_MINSIZE,
+											ALLOCSET_DEFAULT_INITSIZE,
+											ALLOCSET_DEFAULT_MAXSIZE);
+	InitializeJobMetadataCache();
+	InitializeTaskStateHash();
+
+	ereport(LOG, (errmsg("pg_cron scheduler started")));
+
+	MemoryContextSwitchTo(CronLoopContext);
+
+	while (!got_sigterm)
+	{
+		List	   *taskList = NIL;
+		TimestampTz currentTime = 0;
+
+		AcceptInvalidationMessages();
+
+		if (!CronJobCacheValid)
+		{
+			RefreshTaskHash();
+		}
+
+		taskList = CurrentTaskList();
+		currentTime = GetCurrentTimestamp();
+
+		StartAllPendingRuns(taskList, currentTime);
+
+		WaitForCronTasks(taskList);
+		ManageCronTasks(taskList, currentTime);
+
+		MemoryContextReset(CronLoopContext);
+	}
+
+	ereport(LOG, (errmsg("pg_cron scheduler shutting down")));
+
+	proc_exit(0);
+}
+
+/*
+ * StartAllPendingRuns goes through the list of tasks and kicks off
+ * runs for tasks that should start, taking clock changes into
+ * consideration.
+ */
+static void
+StartAllPendingRuns(List *taskList, TimestampTz currentTime)
+{
+	static TimestampTz lastMinute = 0;
+
+	int			minutesPassed = 0;
+	ListCell   *taskCell = NULL;
+	ClockProgress clockProgress;
+
+	if (!RebootJobsScheduled)
+	{
+		/* find jobs with @reboot as a schedule */
+		foreach(taskCell, taskList)
+		{
+			CronTask   *task = (CronTask *) lfirst(taskCell);
+			CronJob    *cronJob = GetCronJob(task->jobId);
+			entry	   *schedule = &cronJob->schedule;
+
+			if (schedule->flags & WHEN_REBOOT &&
+				task->isActive)
+			{
+				task->pendingRunCount += 1;
+			}
+		}
+
+		RebootJobsScheduled = true;
+	}
+
+	foreach(taskCell, taskList)
+	{
+		CronTask   *task = (CronTask *) lfirst(taskCell);
+
+		if (task->secondsInterval > 0 && task->isActive)
+		{
+			/*
+			 * For interval jobs, if a task takes longer than the interval,
+			 * we only queue up once. So if a task that is supposed to run
+			 * every 30 seconds takes 5 minutes, we start another run
+			 * immediately after 5 minutes, but then return to the regular
+			 * cadence.
+			 */
+			if (task->pendingRunCount == 0 &&
+				TimestampDifferenceExceeds(task->lastStartTime, currentTime,
+										   task->secondsInterval * 1000))
+			{
+				task->pendingRunCount += 1;
+			}
+		}
+	}
+
+	if (lastMinute == 0)
+	{
+		lastMinute = TimestampMinuteStart(currentTime);
+	}
+
+	minutesPassed = MinutesPassed(lastMinute, currentTime);
+	if (minutesPassed == 0)
+	{
+		/* wait for new minute */
+		return;
+	}
+
+	/* use Vixie cron logic for clock jumps */
+	if (minutesPassed > (3 * MINUTE_COUNT))
+	{
+		/* clock jumped forward by more than 3 hours */
+		clockProgress = CLOCK_CHANGE;
+	}
+	else if (minutesPassed > 5)
+	{
+		/* clock went forward by more than 5 minutes (DST?) */
+		clockProgress = CLOCK_JUMP_FORWARD;
+	}
+	else if (minutesPassed > 0)
+	{
+		/* clock went forward by 1-5 minutes */
+		clockProgress = CLOCK_PROGRESSED;
+	}
+	else if (minutesPassed > -(3 * MINUTE_COUNT))
+	{
+		/* clock jumped backwards by less than 3 hours (DST?) */
+		clockProgress = CLOCK_JUMP_BACKWARD;
+	}
+	else
+	{
+		/* clock jumped backwards 3 hours or more */
+		clockProgress = CLOCK_CHANGE;
+	}
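Editorial note: the thresholds above mirror Vixie cron's handling of wall-clock changes. Assuming MINUTE_COUNT is 60 (an assumption; its definition lives in the cron headers, outside this hunk), the classification can be checked standalone:

    #include <stdio.h>

    #define MINUTE_COUNT 60     /* assumed, per the upstream cron headers */

    typedef enum
    {
        CLOCK_JUMP_BACKWARD = 0,
        CLOCK_PROGRESSED = 1,
        CLOCK_JUMP_FORWARD = 2,
        CLOCK_CHANGE = 3
    } ClockProgress;

    static ClockProgress
    classify(int minutesPassed)
    {
        if (minutesPassed > 3 * MINUTE_COUNT)
            return CLOCK_CHANGE;            /* forward by more than 3 hours */
        if (minutesPassed > 5)
            return CLOCK_JUMP_FORWARD;      /* DST-style forward jump */
        if (minutesPassed > 0)
            return CLOCK_PROGRESSED;        /* normal progress, 1-5 minutes */
        if (minutesPassed > -(3 * MINUTE_COUNT))
            return CLOCK_JUMP_BACKWARD;     /* DST-style backward jump */
        return CLOCK_CHANGE;                /* backward by 3 hours or more */
    }

    int
    main(void)
    {
        printf("%d %d %d\n", classify(1), classify(90), classify(-30));
        /* prints: 1 2 0 (PROGRESSED, JUMP_FORWARD, JUMP_BACKWARD) */
        return 0;
    }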
+	foreach(taskCell, taskList)
+	{
+		CronTask   *task = (CronTask *) lfirst(taskCell);
+
+		if (!task->isActive)
+		{
+			/*
+			 * The job has been unscheduled, so we should not schedule
+			 * new runs. The task will be safely removed on the next call
+			 * to ManageCronTask.
+			 */
+			continue;
+		}
+
+		StartPendingRuns(task, clockProgress, lastMinute, currentTime);
+	}
+
+	/*
+	 * If the clock jumped backwards, we avoid repeating the fixed-time
+	 * tasks by preserving the last minute from before the clock jump,
+	 * until the clock has caught up (clockProgress will be
+	 * CLOCK_JUMP_BACKWARD until then).
+	 */
+	if (clockProgress != CLOCK_JUMP_BACKWARD)
+	{
+		lastMinute = TimestampMinuteStart(currentTime);
+	}
+}
+
+/*
+ * StartPendingRuns kicks off pending runs for a task if it
+ * should start, taking clock changes into consideration.
+ */
+static void
+StartPendingRuns(CronTask *task, ClockProgress clockProgress,
+				 TimestampTz lastMinute, TimestampTz currentTime)
+{
+	CronJob    *cronJob = GetCronJob(task->jobId);
+	entry	   *schedule = &cronJob->schedule;
+	TimestampTz virtualTime = lastMinute;
+	TimestampTz currentMinute = TimestampMinuteStart(currentTime);
+
+	switch (clockProgress)
+	{
+		case CLOCK_PROGRESSED:
+		{
+			/*
+			 * case 1: minutesPassed is a small positive number;
+			 * run jobs for each virtual minute until caught up.
+			 */
+			do
+			{
+				virtualTime = TimestampTzPlusMilliseconds(virtualTime,
+														  60 * 1000);
+
+				if (ShouldRunTask(schedule, virtualTime, true, true))
+				{
+					task->pendingRunCount += 1;
+				}
+			}
+			while (virtualTime < currentMinute);
+
+			break;
+		}
+
+		case CLOCK_JUMP_FORWARD:
+		{
+			/*
+			 * case 2: minutesPassed is a medium-sized positive number,
+			 * for example because we went to DST. Run wildcard jobs once,
+			 * then run any fixed-time jobs that would otherwise be
+			 * skipped. If we use up our minute (possible, if there are a
+			 * lot of jobs to run), go around the loop again so that
+			 * wildcard jobs have a chance to run, and we do our
+			 * housekeeping.
+			 */
+
+			/* run fixed-time jobs for each minute missed */
+			do
+			{
+				virtualTime = TimestampTzPlusMilliseconds(virtualTime,
+														  60 * 1000);
+
+				if (ShouldRunTask(schedule, virtualTime, false, true))
+				{
+					task->pendingRunCount += 1;
+				}
+			} while (virtualTime < currentMinute);
+
+			/* run wildcard jobs for current minute */
+			if (ShouldRunTask(schedule, currentMinute, true, false))
+			{
+				task->pendingRunCount += 1;
+			}
+
+			break;
+		}
+
+		case CLOCK_JUMP_BACKWARD:
+		{
+			/*
+			 * case 3: the time difference is a small or medium-sized
+			 * negative number, e.g. because of DST ending. Just run the
+			 * wildcard jobs; the fixed-time jobs probably have already run
+			 * and should not be repeated. Virtual time does not change
+			 * until we are caught up.
+			 */
+			if (ShouldRunTask(schedule, currentMinute, true, false))
+			{
+				task->pendingRunCount += 1;
+			}
+
+			break;
+		}
+
+		default:
+		{
+			/*
+			 * other: time has changed a *lot*, skip over any
+			 * intermediate fixed-time jobs and go back to
+			 * normal operation.
+			 */
+			if (ShouldRunTask(schedule, currentMinute, true, true))
+			{
+				task->pendingRunCount += 1;
+			}
+		}
+	}
+}
+
+/*
+ * MinutesPassed returns the number of minutes between startTime and
+ * stopTime rounded down to the closest integer.
+ */
+static int
+MinutesPassed(TimestampTz startTime, TimestampTz stopTime)
+{
+	int			microsPassed = 0;
+	long		secondsPassed = 0;
+	int			minutesPassed = 0;
+
+	TimestampDifference(startTime, stopTime,
+						&secondsPassed, &microsPassed);
+
+	minutesPassed = secondsPassed / 60;
+
+	return minutesPassed;
+}
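Editorial note: MinutesPassed above floors the elapsed time to whole minutes, and the TimestampMinuteStart/End helpers that follow truncate a timestamp to its minute boundary. With 64-bit microsecond timestamps (the HAVE_INT64_TIMESTAMP branch), that is plain integer arithmetic, e.g.:

    #include <inttypes.h>
    #include <stdio.h>

    typedef int64_t TimestampTz;    /* microseconds, as in the int64 branch */

    #define USECS_PER_MINUTE INT64_C(60000000)

    static TimestampTz
    minute_start(TimestampTz t)
    {
        return t - t % USECS_PER_MINUTE;    /* truncate to minute boundary */
    }

    static TimestampTz
    minute_end(TimestampTz t)
    {
        return minute_start(t) + USECS_PER_MINUTE;  /* start of next minute */
    }

    int
    main(void)
    {
        TimestampTz t = INT64_C(90) * USECS_PER_MINUTE + 123456;

        printf("%" PRId64 " %" PRId64 "\n",
               minute_start(t) / USECS_PER_MINUTE,  /* 90 */
               minute_end(t) / USECS_PER_MINUTE);   /* 91 */
        return 0;
    }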
+/*
+ * TimestampMinuteStart returns the timestamp at the start of the
+ * current minute for the given time.
+ */
+static TimestampTz
+TimestampMinuteStart(TimestampTz time)
+{
+	TimestampTz result = 0;
+
+#ifdef HAVE_INT64_TIMESTAMP
+	result = time - time % 60000000;
+#else
+	result = (long) time - (long) time % 60;
+#endif
+
+	return result;
+}
+
+/*
+ * TimestampMinuteEnd returns the timestamp at the start of the
+ * next minute from the given time.
+ */
+static TimestampTz
+TimestampMinuteEnd(TimestampTz time)
+{
+	TimestampTz result = TimestampMinuteStart(time);
+
+#ifdef HAVE_INT64_TIMESTAMP
+	result += 60000000;
+#else
+	result += 60;
+#endif
+
+	return result;
+}
+
+/*
+ * ShouldRunTask returns whether a job should run in the current
+ * minute according to its schedule.
+ */
+static bool
+ShouldRunTask(entry *schedule, TimestampTz currentTime, bool doWild,
+			  bool doNonWild)
+{
+	pg_time_t	currentTime_t = timestamptz_to_time_t(currentTime);
+	struct pg_tm *tm = pg_localtime(&currentTime_t, pg_tzset(task_timezone));
+
+	int			minute = tm->tm_min - FIRST_MINUTE;
+	int			hour = tm->tm_hour - FIRST_HOUR;
+	int			dayOfMonth = tm->tm_mday - FIRST_DOM;
+	int			month = tm->tm_mon + 1 - FIRST_MONTH;
+	int			dayOfWeek = tm->tm_wday - FIRST_DOW;
+
+	if (bit_test(schedule->minute, minute) &&
+		bit_test(schedule->hour, hour) &&
+		bit_test(schedule->month, month) &&
+		(((schedule->flags & DOM_STAR) || (schedule->flags & DOW_STAR))
+		 ? (bit_test(schedule->dow, dayOfWeek) && bit_test(schedule->dom, dayOfMonth))
+		 : (bit_test(schedule->dow, dayOfWeek) || bit_test(schedule->dom, dayOfMonth))))
+	{
+		if ((doNonWild && !(schedule->flags & (MIN_STAR | HR_STAR)))
+			|| (doWild && (schedule->flags & (MIN_STAR | HR_STAR))))
+		{
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/*
+ * WaitForCronTasks blocks waiting for any active task for at most
+ * 1 second.
+ */
+static void
+WaitForCronTasks(List *taskList)
+{
+	int			taskCount = list_length(taskList);
+
+	if (taskCount > 0)
+	{
+		PollForTasks(taskList);
+	}
+	else
+	{
+		WaitForLatch(MaxWait);
+	}
+}
+
+/*
+ * WaitForLatch waits for the given number of milliseconds unless a signal
+ * is received or the postmaster shuts down.
+ */
+static void
+WaitForLatch(int timeoutMs)
+{
+	int			rc = 0;
+	int			waitFlags = WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_TIMEOUT;
+
+	/* nothing to do, wait for new jobs */
+	rc = WaitLatch(MyLatch, waitFlags, timeoutMs, PG_WAIT_EXTENSION);
+
+	ResetLatch(MyLatch);
+
+	CHECK_FOR_INTERRUPTS();
+
+	if (rc & WL_POSTMASTER_DEATH)
+	{
+		/* postmaster died and we should bail out immediately */
+		proc_exit(1);
+	}
+}
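Editorial note: PollForTasks below multiplexes over one socket per task with poll(). The same POSIX call, reduced to a single descriptor and a bounded timeout, looks like this standalone (watching stdin for readability; pg_cron would use PQsocket() of each connection instead):

    #include <poll.h>
    #include <stdio.h>

    int
    main(void)
    {
        struct pollfd pfd;
        int rc;

        pfd.fd = 0;                     /* stdin */
        pfd.events = POLLIN | POLLERR;
        pfd.revents = 0;

        /* never block longer than 1000 ms, mirroring MaxWait above */
        rc = poll(&pfd, 1, 1000);
        if (rc < 0)
            perror("poll");
        else if (rc == 0)
            printf("timeout, no events\n");
        else if (pfd.revents & POLLIN)
            printf("descriptor is readable\n");
        return 0;
    }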
+/*
+ * PollForTasks calls poll() for the sockets of all tasks. It checks for
+ * read or write events based on the pollingStatus of the task.
+ */
+static void
+PollForTasks(List *taskList)
+{
+	TimestampTz currentTime = 0;
+	TimestampTz nextEventTime = 0;
+	int			pollTimeout = 0;
+	long		waitSeconds = 0;
+	int			waitMicros = 0;
+	CronTask  **polledTasks = NULL;
+	struct pollfd *pollFDs = NULL;
+	int			pollResult = 0;
+
+	int			taskIndex = 0;
+	int			taskCount = list_length(taskList);
+	int			activeTaskCount = 0;
+	ListCell   *taskCell = NULL;
+
+	polledTasks = (CronTask **) palloc0(taskCount * sizeof(CronTask *));
+	pollFDs = (struct pollfd *) palloc0(taskCount * sizeof(struct pollfd));
+
+	currentTime = GetCurrentTimestamp();
+
+	/*
+	 * At the latest, wake up when the next minute starts.
+	 */
+	nextEventTime = TimestampMinuteEnd(currentTime);
+
+	foreach(taskCell, taskList)
+	{
+		CronTask   *task = (CronTask *) lfirst(taskCell);
+		PostgresPollingStatusType pollingStatus = task->pollingStatus;
+		struct pollfd *pollFileDescriptor = &pollFDs[activeTaskCount];
+
+		if (activeTaskCount >= max_running_tasks)
+		{
+			/* already polling the maximum number of tasks */
+			break;
+		}
+
+		if (task->state == CRON_TASK_ERROR || task->state == CRON_TASK_DONE ||
+			CanStartTask(task))
+		{
+			/* there is work to be done, don't wait */
+			pfree(polledTasks);
+			pfree(pollFDs);
+			return;
+		}
+
+		if (task->state == CRON_TASK_WAITING && task->pendingRunCount == 0)
+		{
+			/*
+			 * Make sure we do not wait past the next run time of an interval
+			 * job.
+			 */
+			if (task->secondsInterval > 0)
+			{
+				TimestampTz nextRunTime =
+					TimestampTzPlusMilliseconds(task->lastStartTime,
+												task->secondsInterval * 1000);
+
+				if (TimestampDifferenceExceeds(nextRunTime, nextEventTime, 0))
+				{
+					nextEventTime = nextRunTime;
+				}
+			}
+
+			/* don't poll idle tasks */
+			continue;
+		}
+
+		if (task->state == CRON_TASK_CONNECTING ||
+			task->state == CRON_TASK_SENDING)
+		{
+			/*
+			 * We need to wake up when a timeout expires.
+			 * Take the minimum of nextEventTime and task->startDeadline.
+			 */
+			if (TimestampDifferenceExceeds(task->startDeadline, nextEventTime, 0))
+			{
+				nextEventTime = task->startDeadline;
+			}
+		}
+
+		/* we plan to poll this task */
+		pollFileDescriptor = &pollFDs[activeTaskCount];
+		polledTasks[activeTaskCount] = task;
+
+		if (task->state == CRON_TASK_CONNECTING ||
+			task->state == CRON_TASK_SENDING ||
+			task->state == CRON_TASK_BGW_RUNNING ||
+			task->state == CRON_TASK_RUNNING)
+		{
+			PGconn	   *connection = task->connection;
+			int			pollEventMask = 0;
+
+			/*
+			 * Set the appropriate mask for poll, based on the current polling
+			 * status of the task, controlled by ManageCronTask.
+			 */
+			if (pollingStatus == PGRES_POLLING_READING)
+			{
+				pollEventMask = POLLERR | POLLIN;
+			}
+			else if (pollingStatus == PGRES_POLLING_WRITING)
+			{
+				pollEventMask = POLLERR | POLLOUT;
+			}
+
+			pollFileDescriptor->fd = PQsocket(connection);
+			pollFileDescriptor->events = pollEventMask;
+		}
+		else
+		{
+			/*
+			 * Task is not running.
+			 */
+			pollFileDescriptor->fd = -1;
+			pollFileDescriptor->events = 0;
+		}
+
+		pollFileDescriptor->revents = 0;
+
+		activeTaskCount++;
+	}
+
+	/*
+	 * Find the first time-based event, which is either the start of a new
+	 * minute or a timeout.
+	 */
+	TimestampDifference(currentTime, nextEventTime, &waitSeconds, &waitMicros);
+
+	pollTimeout = waitSeconds * 1000 + waitMicros / 1000;
+	if (pollTimeout <= 0)
+	{
+		/*
+		 * Interval jobs might frequently be overdue; inject a small
+		 * 1 ms wait to avoid getting into a tight loop.
+		 */
+		pollTimeout = 1;
+	}
+	else if (pollTimeout > MaxWait)
+	{
+		/*
+		 * We never wait more than 1 second; this gives us a chance to react
+		 * to external events like a TERM signal and job changes.
+		 */
+		pollTimeout = MaxWait;
+	}
+
+	if (activeTaskCount == 0)
+	{
+		/* turns out there's nothing to do, just wait for something to happen */
+		WaitForLatch(pollTimeout);
+
+		pfree(polledTasks);
+		pfree(pollFDs);
+		return;
+	}
+
+	pollResult = poll(pollFDs, activeTaskCount, pollTimeout);
+	if (pollResult < 0)
+	{
+		/*
+		 * This typically happens in case of a signal, though we should
+		 * probably check errno in case something bad happened.
+		 */
+		pfree(polledTasks);
+		pfree(pollFDs);
+		return;
+	}
+
+	for (taskIndex = 0; taskIndex < activeTaskCount; taskIndex++)
+	{
+		CronTask   *task = polledTasks[taskIndex];
+		struct pollfd *pollFileDescriptor = &pollFDs[taskIndex];
+
+		task->isSocketReady = pollFileDescriptor->revents &
+			pollFileDescriptor->events;
+	}
+
+	pfree(polledTasks);
+	pfree(pollFDs);
+}
+
+/*
+ * CanStartTask determines whether a task is ready to be started because
+ * it has pending runs and we are running less than max_running_tasks.
+ */
+static bool
+CanStartTask(CronTask *task)
+{
+	return task->state == CRON_TASK_WAITING && task->pendingRunCount > 0 &&
+		RunningTaskCount < max_running_tasks;
+}
+
+/*
+ * ManageCronTasks advances the state machines of the given list of tasks.
+ */
+static void
+ManageCronTasks(List *taskList, TimestampTz currentTime)
+{
+	ListCell   *taskCell = NULL;
+
+	foreach(taskCell, taskList)
+	{
+		CronTask   *task = (CronTask *) lfirst(taskCell);
+
+		ManageCronTask(task, currentTime);
+	}
+}
+
+/*
+ * ManageCronTask implements the cron task state machine.
+ */
+static void
+ManageCronTask(CronTask *task, TimestampTz currentTime)
+{
+	CronTaskState checkState = task->state;
+	int64		jobId = task->jobId;
+	CronJob    *cronJob = GetCronJob(jobId);
+	PGconn	   *connection = task->connection;
+	ConnStatusType connectionStatus = CONNECTION_BAD;
+	TimestampTz start_time;
+
+	switch (checkState)
+	{
+		case CRON_TASK_WAITING:
+		{
+			/* check if job has been removed */
+			if (!task->isActive)
+			{
+				/* remove task as well */
+				RemoveTask(jobId);
+				break;
+			}
+
+			if (!CanStartTask(task))
+			{
+				break;
+			}
+
+			task->pendingRunCount -= 1;
+			if (task_use_background_worker)
+				task->state = CRON_TASK_BGW_START;
+			else
+				task->state = CRON_TASK_START;
+
+			task->lastStartTime = currentTime;
+
+			RunningTaskCount++;
+
+			/* Add a new entry to the audit table. */
+			task->runId = NextRunId();
+			if (task_log_run)
+				InsertJobRunDetail(task->runId, &cronJob->jobId,
+								   cronJob->database, cronJob->userName,
+								   cronJob->command, GetCronStatus(CRON_STATUS_STARTING));
+		}
+
+		case CRON_TASK_START:
+		{
+			/*
+			 * The previous case deliberately has no break at the end, so
+			 * that starting the run does not cost an extra second; check
+			 * the worker mode again here before using libpq.
+			 */
+			if (!task_use_background_worker)
+			{
+				const char *clientEncoding = GetDatabaseEncodingName();
+				char		nodePortString[12];
+				TimestampTz startDeadline = 0;
+
+				const char *keywordArray[] = {
+					"host",
+					"port",
+					"fallback_application_name",
+					"client_encoding",
+					"dbname",
+					"user",
+					NULL
+				};
+				const char *valueArray[] = {
+					cronJob->nodeName,
+					nodePortString,
+					"pg_cron",
+					clientEncoding,
+					cronJob->database,
+					cronJob->userName,
+					NULL
+				};
+
+				sprintf(nodePortString, "%d", cronJob->nodePort);
+
+				Assert(sizeof(keywordArray) == sizeof(valueArray));
+
+				if (task_log_statement)
+				{
+					char	   *command = cronJob->command;
+
+					ereport(LOG, (errmsg("cron job " INT64_FORMAT " %s: %s",
+										 jobId, GetCronStatus(CRON_STATUS_STARTING), command)));
+				}
+
+				connection = PQconnectStartParams(keywordArray, valueArray, false);
+				PQsetnonblocking(connection, 1);
+
+				connectionStatus = PQstatus(connection);
+				if (connectionStatus == CONNECTION_BAD)
+				{
+					/* make sure we call PQfinish on the connection */
+					task->connection = connection;
+
+					task->errorMessage = "connection failed";
+					task->pollingStatus = 0;
+					task->state = CRON_TASK_ERROR;
+					break;
+				}
+
+				startDeadline = TimestampTzPlusMilliseconds(currentTime, CronTaskStartTimeout);
+
+				task->startDeadline = startDeadline;
+				task->connection = connection;
+				task->pollingStatus = PGRES_POLLING_WRITING;
+				task->state = CRON_TASK_CONNECTING;
+
+				if (task_log_run)
+					UpdateJobRunDetail(task->runId, NULL, GetCronStatus(CRON_STATUS_CONNECTING), NULL, NULL, NULL);
+
+				break;
+			}
+		}
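Editorial note: the CRON_TASK_START case above uses libpq's nonblocking connection API: PQconnectStartParams returns immediately and PQconnectPoll is then driven from poll() readiness. A minimal skeleton of that handshake, compilable against libpq (the connection parameters are placeholders, and the polite busy-loop stands in for the poll()-driven waiting done above):

    #include <stdio.h>
    #include <libpq-fe.h>

    int
    main(void)
    {
        const char *const keywords[] = {"host", "port", "dbname", NULL};
        const char *const values[]   = {"127.0.0.1", "5432", "postgres", NULL};
        PGconn *conn;
        PostgresPollingStatusType st;

        conn = PQconnectStartParams(keywords, values, 0);
        if (conn == NULL)
        {
            fprintf(stderr, "out of memory\n");
            return 1;
        }
        if (PQstatus(conn) == CONNECTION_BAD)
        {
            fprintf(stderr, "startup failed: %s\n", PQerrorMessage(conn));
            PQfinish(conn);
            return 1;
        }
        PQsetnonblocking(conn, 1);

        while ((st = PQconnectPoll(conn)) != PGRES_POLLING_OK)
        {
            if (st == PGRES_POLLING_FAILED)
            {
                fprintf(stderr, "connect failed: %s\n", PQerrorMessage(conn));
                PQfinish(conn);
                return 1;
            }
            /* PGRES_POLLING_READING/WRITING: wait on PQsocket(conn) here */
        }

        printf("connected\n");
        PQfinish(conn);
        return 0;
    }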
+		case CRON_TASK_BGW_START:
+		{
+			BackgroundWorker worker;
+			pid_t		pid;
+			shm_toc_estimator e;
+			shm_toc    *toc;
+			char	   *database;
+			char	   *username;
+			char	   *command;
+			MemoryContext oldcontext;
+			shm_mq	   *mq;
+			Size		segsize;
+			BackgroundWorkerHandle *handle;
+			BgwHandleStatus status;
+			bool		registered;
+			TimestampTz startDeadline = 0;
+
+			/*
+			 * The break in the previous case has not been reached;
+			 * checking just for extra precaution.
+			 */
+			Assert(task_use_background_worker);
+
+#define QUEUE_SIZE ((Size) 65536)
+
+			/*
+			 * Create the shared memory that we will pass to the background
+			 * worker process. We use DSM_CREATE_NULL_IF_MAXSEGMENTS so that we
+			 * do not ERROR here. This way, we can mark the job as failed and
+			 * keep the launcher process running normally.
+			 */
+			shm_toc_initialize_estimator(&e);
+			shm_toc_estimate_chunk(&e, strlen(cronJob->database) + 1);
+			shm_toc_estimate_chunk(&e, strlen(cronJob->userName) + 1);
+			shm_toc_estimate_chunk(&e, strlen(cronJob->command) + 1);
+			shm_toc_estimate_chunk(&e, QUEUE_SIZE);
+			shm_toc_estimate_keys(&e, PG_CRON_NKEYS);
+			segsize = shm_toc_estimate(&e);
+
+			task->seg = dsm_create(segsize, DSM_CREATE_NULL_IF_MAXSEGMENTS);
+			if (task->seg == NULL)
+			{
+				task->state = CRON_TASK_ERROR;
+				task->errorMessage = "unable to create a DSM segment; more "
+					"details may be available in the server log";
+
+				ereport(WARNING,
+						(errmsg("max number of DSM segments may have been reached")));
+
+				break;
+			}
+
+			toc = shm_toc_create(PG_CRON_MAGIC, dsm_segment_address(task->seg), segsize);
+
+			database = shm_toc_allocate(toc, strlen(cronJob->database) + 1);
+			strcpy(database, cronJob->database);
+			shm_toc_insert(toc, PG_CRON_KEY_DATABASE, database);
+
+			username = shm_toc_allocate(toc, strlen(cronJob->userName) + 1);
+			strcpy(username, cronJob->userName);
+			shm_toc_insert(toc, PG_CRON_KEY_USERNAME, username);
+
+			command = shm_toc_allocate(toc, strlen(cronJob->command) + 1);
+			strcpy(command, cronJob->command);
+			shm_toc_insert(toc, PG_CRON_KEY_COMMAND, command);
+
+			mq = shm_mq_create(shm_toc_allocate(toc, QUEUE_SIZE), QUEUE_SIZE);
+			shm_toc_insert(toc, PG_CRON_KEY_QUEUE, mq);
+			shm_mq_set_receiver(mq, MyProc);
+
+			/*
+			 * Attach the queue before launching a worker, so that we'll automatically
+			 * detach the queue if we error out. (Otherwise, the worker might sit
+			 * there trying to write the queue long after we've gone away.)
+			 */
+			oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+			task->sharedMemoryQueue = shm_mq_attach(mq, task->seg, NULL);
+			MemoryContextSwitchTo(oldcontext);
+
+			/* Prepare the background worker. */
+			memset(&worker, 0, sizeof(BackgroundWorker));
+			worker.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION;
+			worker.bgw_start_time = BgWorkerStart_ConsistentState;
+			worker.bgw_restart_time = BGW_NEVER_RESTART;
+			sprintf(worker.bgw_library_name, "postgres");
+			sprintf(worker.bgw_function_name, "CronBackgroundWorker");
+			snprintf(worker.bgw_type, BGW_MAXLEN, "pg_cron");
+			snprintf(worker.bgw_name, BGW_MAXLEN, "pg_cron worker");
+			worker.bgw_main_arg = UInt32GetDatum(dsm_segment_handle(task->seg));
+			worker.bgw_notify_pid = MyProcPid;
+
+			/*
+			 * Start the worker process.
+			 */
+			if (task_log_statement)
+			{
+				ereport(LOG, (errmsg("cron job " INT64_FORMAT " %s: %s",
+									 jobId, GetCronStatus(CRON_STATUS_STARTING), command)));
+			}
+
+			/*
+			 * If no background worker slots are currently available,
+			 * keep trying until we reach the startup deadline.
+			 */
+			startDeadline = TimestampTzPlusMilliseconds(currentTime,
+														CronTaskStartTimeout);
+			task->startDeadline = startDeadline;
+			do
+			{
+				registered = RegisterDynamicBackgroundWorker(&worker, &handle);
+			}
+			while (!registered && !jobStartupTimeout(task, GetCurrentTimestamp()));
+
+			if (!registered)
+			{
+				dsm_detach(task->seg);
+				task->seg = NULL;
+				task->state = CRON_TASK_ERROR;
+				task->errorMessage = "could not start background process; more "
+					"details may be available in the server log";
+				ereport(WARNING,
+						(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
+						 errmsg("out of background worker slots"),
+						 errhint("You might need to increase max_worker_processes.")));
+				break;
+			}
+
+			task->startDeadline = 0;
+			task->handle = *handle;
+			status = WaitForBackgroundWorkerStartup(&task->handle, &pid);
+			if (status != BGWH_STARTED && status != BGWH_STOPPED)
+			{
+				dsm_detach(task->seg);
+				task->seg = NULL;
+				task->state = CRON_TASK_ERROR;
+				task->errorMessage = "could not start background process; more "
+					"details may be available in the server log";
+				break;
+			}
+
+			start_time = GetCurrentTimestamp();
+
+			if (task_log_run)
+				UpdateJobRunDetail(task->runId, &pid, GetCronStatus(CRON_STATUS_RUNNING), NULL, &start_time, NULL);
+
+			task->state = CRON_TASK_BGW_RUNNING;
+			break;
+		}
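Editorial note: worker registration above retries until a deadline instead of failing on the first full worker table. A generic version of that retry-until-deadline loop, standalone with a monotonic clock (try_register is a stand-in for RegisterDynamicBackgroundWorker):

    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>

    static bool
    try_register(void)
    {
        return false;           /* stand-in: pretend all slots are taken */
    }

    static long long
    now_ms(void)
    {
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return ts.tv_sec * 1000LL + ts.tv_nsec / 1000000;
    }

    int
    main(void)
    {
        const long long deadline = now_ms() + 100;  /* 100 ms budget */
        bool registered;

        do
        {
            registered = try_register();
        } while (!registered && now_ms() < deadline);

        printf(registered ? "registered\n" : "gave up at deadline\n");
        return 0;
    }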
+		case CRON_TASK_CONNECTING:
+		{
+			PostgresPollingStatusType pollingStatus = 0;
+
+			Assert(!task_use_background_worker);
+
+			/* check if job has been removed */
+			if (jobCanceled(task))
+				break;
+
+			/* check if timeout has been reached */
+			if (jobStartupTimeout(task, currentTime))
+				break;
+
+			/* check if connection is still alive */
+			connectionStatus = PQstatus(connection);
+			if (connectionStatus == CONNECTION_BAD)
+			{
+				task->errorMessage = "connection failed";
+				task->pollingStatus = 0;
+				task->state = CRON_TASK_ERROR;
+				break;
+			}
+
+			/* check if socket is ready */
+			if (!task->isSocketReady)
+			{
+				break;
+			}
+
+			/* check whether a connection has been established */
+			pollingStatus = PQconnectPoll(connection);
+			if (pollingStatus == PGRES_POLLING_OK)
+			{
+				pid_t		pid;
+
+				/* wait for socket to be ready to send a query */
+				task->pollingStatus = PGRES_POLLING_WRITING;
+
+				task->state = CRON_TASK_SENDING;
+
+				pid = (pid_t) PQbackendPID(connection);
+				if (task_log_run)
+					UpdateJobRunDetail(task->runId, &pid, GetCronStatus(CRON_STATUS_SENDING), NULL, NULL, NULL);
+			}
+			else if (pollingStatus == PGRES_POLLING_FAILED)
+			{
+				task->errorMessage = "connection failed";
+				task->pollingStatus = 0;
+				task->state = CRON_TASK_ERROR;
+			}
+			else
+			{
+				/*
+				 * Connection is still being established.
+				 *
+				 * On the next WaitForCronTasks round, we wait for reading or
+				 * writing based on the status returned by PQconnectPoll, see:
+				 * https://www.postgresql.org/docs/9.5/static/libpq-connect.html
+				 */
+				task->pollingStatus = pollingStatus;
+			}
+
+			break;
+		}
+
+		case CRON_TASK_SENDING:
+		{
+			char	   *command = cronJob->command;
+			int			sendResult = 0;
+
+			Assert(!task_use_background_worker);
+
+			/* check if job has been removed */
+			if (jobCanceled(task))
+				break;
+
+			/* check if timeout has been reached */
+			if (jobStartupTimeout(task, currentTime))
+				break;
+
+			/* check if socket is ready to send */
+			if (!task->isSocketReady)
+			{
+				break;
+			}
+
+			/* check if connection is still alive */
+			connectionStatus = PQstatus(connection);
+			if (connectionStatus == CONNECTION_BAD)
+			{
+				task->errorMessage = "connection lost";
+				task->pollingStatus = 0;
+				task->state = CRON_TASK_ERROR;
+				break;
+			}
+
+			sendResult = PQsendQuery(connection, command);
+			if (sendResult == 1)
+			{
+				/* wait for socket to be ready to receive results */
+				task->pollingStatus = PGRES_POLLING_READING;
+
+				/* command is underway, stop using timeout */
+				task->startDeadline = 0;
+				task->state = CRON_TASK_RUNNING;
+
+				start_time = GetCurrentTimestamp();
+				if (task_log_run)
+					UpdateJobRunDetail(task->runId, NULL, GetCronStatus(CRON_STATUS_RUNNING), NULL, &start_time, NULL);
+			}
+			else
+			{
+				/* not yet ready to send */
+			}
+
+			break;
+		}
+
+		case CRON_TASK_RUNNING:
+		{
+			int			connectionBusy = 0;
+			PGresult   *result = NULL;
+
+			Assert(!task_use_background_worker);
+
+			/* check if job has been removed */
+			if (jobCanceled(task))
+				break;
+
+			/* check if connection is still alive */
+			connectionStatus = PQstatus(connection);
+			if (connectionStatus == CONNECTION_BAD)
+			{
+				task->errorMessage = "connection lost";
+				task->pollingStatus = 0;
+				task->state = CRON_TASK_ERROR;
+				break;
+			}
+
+			/* check if socket is ready to read */
+			if (!task->isSocketReady)
+			{
+				break;
+			}
+
+			PQconsumeInput(connection);
+
+			connectionBusy = PQisBusy(connection);
+			if (connectionBusy)
+			{
+				/* still waiting for results */
+				break;
+			}
+
+			while ((result = PQgetResult(connection)) != NULL)
+			{
+				GetTaskFeedback(result, task);
+			}
+
+			PQfinish(connection);
+
+			task->connection = NULL;
+			task->pollingStatus = 0;
+			task->isSocketReady = false;
+
+			task->state = CRON_TASK_DONE;
+			RunningTaskCount--;
+
+			break;
+		}
+
+		case CRON_TASK_BGW_RUNNING:
+		{
+			pid_t		pid;
+
+			Assert(task_use_background_worker);
+
+			/* check if job has been removed */
+			if (jobCanceled(task))
+			{
+				TerminateBackgroundWorker(&task->handle);
+				WaitForBackgroundWorkerShutdown(&task->handle);
+				dsm_detach(task->seg);
+				task->seg = NULL;
+
+				break;
+			}
+
+			/* still waiting for job to complete */
+			if (GetBackgroundWorkerPid(&task->handle, &pid) != BGWH_STOPPED)
+			{
+				bool		isRunning = true;
+
+				/* process notices and warnings */
+				ProcessBgwTaskFeedback(task, isRunning);
+			}
+			else
+			{
+				bool		isRunning = false;
+
+				/* process remaining notices and final task result */
+				ProcessBgwTaskFeedback(task, isRunning);
+
+				task->state = CRON_TASK_DONE;
+
+				dsm_detach(task->seg);
+
+				task->seg = NULL;
+				RunningTaskCount--;
+			}
+
+			break;
+		}
INT64_FORMAT " %s", + jobId, task->errorMessage))); + + + if (task->freeErrorMessage) + { + free(task->errorMessage); + } + } + else + { + ereport(LOG, (errmsg("cron job " INT64_FORMAT " %s", jobId, GetCronStatus(CRON_STATUS_FAILED)))); + } + + task->startDeadline = 0; + task->isSocketReady = false; + task->state = CRON_TASK_DONE; + + RunningTaskCount--; + + /* fall through to CRON_TASK_DONE */ + } + + case CRON_TASK_DONE: + default: + { + int currentPendingRunCount = task->pendingRunCount; + CronJob *job = GetCronJob(jobId); + + /* + * It may happen that job was unscheduled during task execution. + * In this case we keep task as-is. Otherwise, we should + * re-initialize task, i.e. reset fields to initial values including + * status. + */ + if (job != NULL && job->active) + InitializeCronTask(task, jobId); + else + task->state = CRON_TASK_WAITING; + + /* + * We keep the number of runs that should have started while + * the task was still running. If >0, this will trigger another + * run immediately. + */ + task->pendingRunCount = currentPendingRunCount; + } + } +} + +static void +GetTaskFeedback(PGresult *result, CronTask *task) +{ + + TimestampTz end_time; + ExecStatusType executionStatus; + + end_time = GetCurrentTimestamp(); + executionStatus = PQresultStatus(result); + + switch (executionStatus) + { + case PGRES_COMMAND_OK: + { + char *cmdStatus = PQcmdStatus(result); + char *cmdTuples = PQcmdTuples(result); + + if (task_log_run) + UpdateJobRunDetail(task->runId, NULL, GetCronStatus(CRON_STATUS_SUCCEEDED), cmdStatus, NULL, &end_time); + + if (task_log_statement) + { + ereport(LOG, (errmsg("cron job " INT64_FORMAT " COMMAND completed: %s %s", + task->jobId, cmdStatus, cmdTuples))); + } + + break; + } + + case PGRES_BAD_RESPONSE: + case PGRES_FATAL_ERROR: + { + task->errorMessage = strdup(PQresultErrorMessage(result)); + task->freeErrorMessage = true; + task->pollingStatus = 0; + task->state = CRON_TASK_ERROR; + + if (task_log_run) + UpdateJobRunDetail(task->runId, NULL, GetCronStatus(CRON_STATUS_FAILED), task->errorMessage, NULL, &end_time); + + PQclear(result); + + return; + } + + case PGRES_COPY_IN: + case PGRES_COPY_OUT: + case PGRES_COPY_BOTH: + { + /* cannot handle COPY input/output */ + task->errorMessage = "COPY not supported"; + task->pollingStatus = 0; + task->state = CRON_TASK_ERROR; + + if (task_log_run) + UpdateJobRunDetail(task->runId, NULL, GetCronStatus(CRON_STATUS_FAILED), task->errorMessage, NULL, &end_time); + + PQclear(result); + + return; + } + + case PGRES_TUPLES_OK: + case PGRES_EMPTY_QUERY: + case PGRES_SINGLE_TUPLE: + case PGRES_NONFATAL_ERROR: + default: + { + int tupleCount = PQntuples(result); + char *rowString = ngettext("row", "rows", + tupleCount); + char rows[MAXINT8LEN + 1]; + char outputrows[MAXINT8LEN + 4 + 1]; + + pg_lltoa(tupleCount, rows); + snprintf(outputrows, sizeof(outputrows), "%s %s", rows, rowString); + + if (task_log_run) + UpdateJobRunDetail(task->runId, NULL, GetCronStatus(CRON_STATUS_SUCCEEDED), outputrows, NULL, &end_time); + + if (task_log_statement) + { + ereport(LOG, (errmsg("cron job " INT64_FORMAT " completed: " + "%d %s", + task->jobId, tupleCount, + rowString))); + } + + break; + } + + } + + PQclear(result); +} + +/* + * ProcessBgwTaskFeedback reads messages from a shared memory queue associated + * with the background worker that is executing a given task. If the task is + * still running, the function does not block if the queue is empty. Otherwise, + * it reads until the end of the queue. 
+/*
+ * ProcessBgwTaskFeedback reads messages from a shared memory queue associated
+ * with the background worker that is executing a given task. If the task is
+ * still running, the function does not block if the queue is empty. Otherwise,
+ * it reads until the end of the queue.
+ */
+static void
+ProcessBgwTaskFeedback(CronTask *task, bool running)
+{
+	shm_mq_handle *responseq = task->sharedMemoryQueue;
+	TimestampTz end_time;
+
+	Size		nbytes;
+	void	   *data;
+	char		msgtype;
+	StringInfoData msg;
+	shm_mq_result res;
+
+	end_time = GetCurrentTimestamp();
+
+	/*
+	 * Message-parsing routines operate on a null-terminated StringInfo,
+	 * so we must construct one.
+	 */
+	for (;;)
+	{
+		/* do not wait if the task is running */
+		bool		nowait = running;
+
+		/* Get next message. */
+		res = shm_mq_receive(responseq, &nbytes, &data, nowait);
+
+		if (res != SHM_MQ_SUCCESS)
+			break;
+
+		initStringInfo(&msg);
+		resetStringInfo(&msg);
+		enlargeStringInfo(&msg, nbytes);
+		msg.len = nbytes;
+		memcpy(msg.data, data, nbytes);
+		msg.data[nbytes] = '\0';
+
+		msgtype = pq_getmsgbyte(&msg);
+		switch (msgtype)
+		{
+			case 'N':
+			case 'E':
+			{
+				ErrorData	edata;
+				StringInfoData display_msg;
+
+				pq_parse_errornotice(&msg, &edata);
+				initStringInfo(&display_msg);
+				bgw_generate_returned_message(&display_msg, edata);
+
+				if (task_log_run)
+				{
+					if (edata.elevel >= ERROR)
+						UpdateJobRunDetail(task->runId, NULL, GetCronStatus(CRON_STATUS_FAILED), display_msg.data, NULL, &end_time);
+					else if (running)
+						UpdateJobRunDetail(task->runId, NULL, NULL, display_msg.data, NULL, NULL);
+					else
+						UpdateJobRunDetail(task->runId, NULL, GetCronStatus(CRON_STATUS_SUCCEEDED), display_msg.data, NULL, &end_time);
+				}
+
+				ereport(LOG, (errmsg("cron job " INT64_FORMAT ": %s",
+									 task->jobId, display_msg.data)));
+				pfree(display_msg.data);
+
+				break;
+			}
+			case 'T':
+				break;
+			case 'C':
+			{
+				const char *tag = pq_getmsgstring(&msg);
+				char	   *nonconst_tag;
+				char	   *cmdTuples;
+
+				nonconst_tag = strdup(tag);
+
+				if (task_log_run)
+					UpdateJobRunDetail(task->runId, NULL, GetCronStatus(CRON_STATUS_SUCCEEDED), nonconst_tag, NULL, &end_time);
+
+				if (task_log_statement)
+				{
+					cmdTuples = pg_cron_cmdTuples(nonconst_tag);
+					ereport(LOG, (errmsg("cron job " INT64_FORMAT " COMMAND completed: %s %s",
+										 task->jobId, nonconst_tag, cmdTuples)));
+				}
+
+				free(nonconst_tag);
+				break;
+			}
+			case 'A':
+			case 'D':
+			case 'G':
+			case 'H':
+			case 'W':
+			case 'Z':
+				break;
+			default:
+				elog(WARNING, "unknown message type: %c (%zu bytes)",
+					 msg.data[0], nbytes);
+				break;
+		}
+		pfree(msg.data);
+	}
+}
+
+/*
+ * Background worker logic.
+ */
+void
+CronBackgroundWorker(Datum main_arg)
+{
+	dsm_segment *seg;
+	shm_toc    *toc;
+	char	   *database;
+	char	   *username;
+	char	   *command;
+	shm_mq	   *mq;
+	shm_mq_handle *responseq;
+
+	/* handle SIGTERM like a regular backend */
+	pqsignal(SIGTERM, die);
+	BackgroundWorkerUnblockSignals();
+
+	/* Set up a memory context and resource owner. */
+	Assert(CurrentResourceOwner == NULL);
+	CurrentResourceOwner = ResourceOwnerCreate(NULL, "pg_cron");
+	CurrentMemoryContext = AllocSetContextCreate(TopMemoryContext,
+												 "pg_cron worker",
+												 ALLOCSET_DEFAULT_MINSIZE,
+												 ALLOCSET_DEFAULT_INITSIZE,
+												 ALLOCSET_DEFAULT_MAXSIZE);
+	/* Set up a dynamic shared memory segment. */
+	seg = dsm_attach(DatumGetInt32(main_arg));
+	if (seg == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("unable to map dynamic shared memory segment")));
+
+	toc = shm_toc_attach(PG_CRON_MAGIC, dsm_segment_address(seg));
+	if (toc == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("bad magic number in dynamic shared memory segment")));
+
+	database = shm_toc_lookup(toc, PG_CRON_KEY_DATABASE, false);
+	username = shm_toc_lookup(toc, PG_CRON_KEY_USERNAME, false);
+	command = shm_toc_lookup(toc, PG_CRON_KEY_COMMAND, false);
+	mq = shm_toc_lookup(toc, PG_CRON_KEY_QUEUE, false);
+
+	shm_mq_set_sender(mq, MyProc);
+	responseq = shm_mq_attach(mq, seg, NULL);
+	pq_redirect_to_shm_mq(seg, responseq);
+
+	BackgroundWorkerInitializeConnection(database, username, 0);
+
+	/* Prepare to execute the query. */
+	SetCurrentStatementStartTimestamp();
+	debug_query_string = command;
+	pgstat_report_activity(STATE_RUNNING, command);
+	StartTransactionCommand();
+	if (StatementTimeout > 0)
+		enable_timeout_after(STATEMENT_TIMEOUT, StatementTimeout);
+	else
+		disable_timeout(STATEMENT_TIMEOUT, false);
+
+	/* Execute the query. */
+	ExecuteSqlString(command);
+
+	/* Post-execution cleanup. */
+	disable_timeout(STATEMENT_TIMEOUT, false);
+	CommitTransactionCommand();
+	pgstat_report_activity(STATE_IDLE, command);
+	pgstat_report_stat(true);
+
+	/* Signal that we are done. */
+	ReadyForQuery(DestRemote);
+
+	dsm_detach(seg);
+	proc_exit(0);
+}
+
+/*
+ * Execute a given SQL string without SPI or a libpq session.
+ */
+static void
+ExecuteSqlString(const char *sql)
+{
+	List	   *raw_parsetree_list;
+	ListCell   *lc1;
+	bool		isTopLevel;
+	int			commands_remaining;
+	MemoryContext parsecontext;
+	MemoryContext oldcontext;
+
+	/*
+	 * Parse the SQL string into a list of raw parse trees.
+	 *
+	 * Because we allow statements that perform internal transaction control,
+	 * we can't do this in TopTransactionContext; the parse trees might get
+	 * blown away before we're done executing them.
+	 */
+	parsecontext = AllocSetContextCreate(TopMemoryContext,
+										 "pg_cron parse/plan",
+										 ALLOCSET_DEFAULT_MINSIZE,
+										 ALLOCSET_DEFAULT_INITSIZE,
+										 ALLOCSET_DEFAULT_MAXSIZE);
+	oldcontext = MemoryContextSwitchTo(parsecontext);
+	raw_parsetree_list = pg_parse_query(sql);
+	commands_remaining = list_length(raw_parsetree_list);
+	isTopLevel = commands_remaining == 1;
+	MemoryContextSwitchTo(oldcontext);
+
+	/*
+	 * Do parse analysis, rule rewrite, planning, and execution for each raw
+	 * parsetree. We must fully execute each query before beginning parse
+	 * analysis on the next one, since there may be interdependencies.
+	 */
+	foreach(lc1, raw_parsetree_list)
+	{
+		RawStmt    *parsetree = (RawStmt *) lfirst(lc1);
+		CommandTag	commandTag;
+		QueryCompletion qc;
+		List	   *querytree_list;
+		List	   *plantree_list;
+		bool		snapshot_set = false;
+		Portal		portal;
+		DestReceiver *receiver;
+		int16		format = 1;
+
+		/*
+		 * We don't allow transaction-control commands like COMMIT and ABORT
+		 * here. The entire SQL statement is executed as a single transaction
+		 * which commits if no errors are encountered.
+		 */
+		if (IsA(parsetree, TransactionStmt))
+			ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("transaction control statements are not allowed in pg_cron")));
+		/*
+		 * Get the command name for use in status display (it also becomes the
+		 * default completion tag, down inside PortalRun). Set ps_status and
+		 * do any special start-of-SQL-command processing needed by the
+		 * destination.
+		 */
+		commandTag = CreateCommandTag(parsetree->stmt);
+
+		set_ps_display(GetCommandTagName(commandTag));
+
+		BeginCommand(commandTag, DestNone);
+
+		/* Set up a snapshot if parse analysis/planning will need one. */
+		if (analyze_requires_snapshot(parsetree))
+		{
+			PushActiveSnapshot(GetTransactionSnapshot());
+			snapshot_set = true;
+		}
+
+		/*
+		 * OK to analyze, rewrite, and plan this query.
+		 *
+		 * As with parsing, we need to make sure this data outlives the
+		 * transaction, because of the possibility that the statement might
+		 * perform internal transaction control.
+		 */
+		oldcontext = MemoryContextSwitchTo(parsecontext);
+#if PG_VERSION_NUM >= 150000
+		querytree_list = pg_analyze_and_rewrite_fixedparams(parsetree, sql, NULL, 0, NULL);
+#elif PG_VERSION_NUM >= 100000
+		querytree_list = pg_analyze_and_rewrite(parsetree, sql, NULL, 0, NULL);
+#else
+		querytree_list = pg_analyze_and_rewrite(parsetree, sql, NULL, 0);
+#endif
+
+		plantree_list = pg_plan_queries(querytree_list, sql, 0, NULL);
+
+		/* Done with the snapshot used for parsing/planning */
+		if (snapshot_set)
+			PopActiveSnapshot();
+
+		/* If we got a cancel signal in analysis or planning, quit */
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * Execute the query using the unnamed portal.
+		 */
+		portal = CreatePortal("", true, true);
+		/* Don't display the portal in pg_cursors */
+		portal->visible = false;
+		PortalDefineQuery(portal, NULL, sql, nodeTag(parsetree->stmt), commandTag, plantree_list, NULL);
+		PortalStart(portal, NULL, 0, InvalidSnapshot, NULL);
+		PortalSetResultFormat(portal, 1, &format);	/* binary format */
+
+		--commands_remaining;
+		receiver = CreateDestReceiver(DestNone);
+
+		/*
+		 * Only once the portal and destreceiver have been established can
+		 * we return to the transaction context. All that stuff needs to
+		 * survive an internal commit inside PortalRun!
+		 */
+		MemoryContextSwitchTo(oldcontext);
+
+		/* Here's where we actually execute the command. */
+		(void) PortalRun(portal, FETCH_ALL, isTopLevel, true, receiver, receiver, &qc);
+
+		/* Clean up the receiver. */
+		(*receiver->rDestroy) (receiver);
+
+		/*
+		 * Send a CommandComplete message even if we suppressed the query
+		 * results. The user backend will report these in the absence of
+		 * any true query results.
+		 */
+		EndCommand(&qc, DestRemote, false);
+
+		/* Clean up the portal. */
+		PortalDrop(portal, false);
+	}
+
+	/* Be sure to advance the command counter after the last script command */
+	CommandCounterIncrement();
+}
+
+/*
+ * If a task is not marked as active, set an appropriate error state on the
+ * task and return true. Note that this should only be called after a task has
+ * already been launched.
+ */
+static bool
+jobCanceled(CronTask *task)
+{
+	Assert(task->state == CRON_TASK_CONNECTING || \
+		   task->state == CRON_TASK_SENDING || \
+		   task->state == CRON_TASK_BGW_RUNNING || \
+		   task->state == CRON_TASK_RUNNING);
+
+	if (task->isActive)
+		return false;
+	else
+	{
+		/* Use the American spelling for consistency with PG code. */
+		task->errorMessage = "job canceled";
+		task->state = CRON_TASK_ERROR;
+
+		/*
+		 * Technically, pollingStatus is only used when
+		 * task_use_background_worker is false, but there is no harm in
+		 * setting it in both cases.
+		 */
+		task->pollingStatus = 0;
+		return true;
+	}
+}
+
+/*
+ * If a task has hit its startup deadline, set an appropriate error state on
+ * the task and return true. Note that this should only be called after a task
+ * has already been launched.
+ */
+static bool
+jobStartupTimeout(CronTask *task, TimestampTz currentTime)
+{
+	Assert(task->state == CRON_TASK_CONNECTING || \
+		   task->state == CRON_TASK_SENDING || \
+		   task->state == CRON_TASK_BGW_START);
+
+	if (TimestampDifferenceExceeds(task->startDeadline, currentTime, 0))
+	{
+		task->errorMessage = "job startup timeout";
+		task->pollingStatus = 0;
+		task->state = CRON_TASK_ERROR;
+		return true;
+	}
+	else
+		return false;
+}
diff --git a/src/backend/task/task_states.c b/src/backend/task/task_states.c
new file mode 100644
index 00000000000..92a771a29d8
--- /dev/null
+++ b/src/backend/task/task_states.c
@@ -0,0 +1,180 @@
+/*-------------------------------------------------------------------------
+ *
+ * src/task_states.c
+ *
+ * Logic for storing and manipulating cron task states.
+ *
+ * Copyright (c) 2016, Citus Data, Inc.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "fmgr.h"
+#include "miscadmin.h"
+#include "postgres.h"
+
+#include "access/hash.h"
+#include "task/pg_cron.h"
+#include "task/task_states.h"
+#include "utils/hsearch.h"
+#include "utils/memutils.h"
+
+/* forward declarations */
+static HTAB *CreateCronTaskHash(void);
+static CronTask *GetCronTask(int64 jobId);
+
+/* global variables */
+static MemoryContext CronTaskContext = NULL;
+static HTAB *CronTaskHash = NULL;
+
+/*
+ * InitializeTaskStateHash initializes the hash for storing task states.
+ */
+void
+InitializeTaskStateHash(void)
+{
+	CronTaskContext = AllocSetContextCreate(CurrentMemoryContext,
+											"pg_cron task context",
+											ALLOCSET_DEFAULT_MINSIZE,
+											ALLOCSET_DEFAULT_INITSIZE,
+											ALLOCSET_DEFAULT_MAXSIZE);
+
+	CronTaskHash = CreateCronTaskHash();
+}
+
+/*
+ * CreateCronTaskHash creates the hash for storing cron task states.
+ */
+static HTAB *
+CreateCronTaskHash(void)
+{
+	HTAB	   *taskHash = NULL;
+	HASHCTL		info;
+	int			hashFlags = 0;
+
+	memset(&info, 0, sizeof(info));
+	info.keysize = sizeof(int64);
+	info.entrysize = sizeof(CronTask);
+	info.hash = tag_hash;
+	info.hcxt = CronTaskContext;
+	hashFlags = (HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+
+	taskHash = hash_create("pg_cron tasks", 32, &info, hashFlags);
+
+	return taskHash;
+}
+
+/*
+ * RefreshTaskHash reloads the cron jobs from the cron.job table.
+ * If a job that has an active task has been removed, the task
+ * is marked as inactive by this function.
+ */
+void
+RefreshTaskHash(void)
+{
+	List	   *jobList = NIL;
+	ListCell   *jobCell = NULL;
+	CronTask   *task = NULL;
+	HASH_SEQ_STATUS status;
+
+	ResetJobMetadataCache();
+
+	hash_seq_init(&status, CronTaskHash);
+
+	/* mark all tasks as inactive */
+	while ((task = hash_seq_search(&status)) != NULL)
+	{
+		task->isActive = false;
+	}
+
+	jobList = LoadCronJobList();
+
+	/* mark tasks that still have a job as active */
+	foreach(jobCell, jobList)
+	{
+		CronJob    *job = (CronJob *) lfirst(jobCell);
+
+		task = GetCronTask(job->jobId);
+		task->isActive = job->active;
+		task->secondsInterval = job->schedule.secondsInterval;
+	}
+
+	CronJobCacheValid = true;
+}
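Editorial note: RefreshTaskHash uses a mark-and-sweep style reconciliation: every cached task is first marked inactive, then re-marked from the authoritative job list, so tasks whose jobs vanished stay inactive and get retired later. The idea in miniature (standalone; the fixed-size array stands in for the dynahash):

    #include <stdbool.h>
    #include <stdio.h>

    #define NTASKS 3

    typedef struct { long jobId; bool isActive; } Task;

    int
    main(void)
    {
        Task tasks[NTASKS] = {{1, true}, {2, true}, {3, true}};
        long liveJobs[] = {1, 3};   /* job 2 was unscheduled */

        /* phase 1: mark everything inactive */
        for (int i = 0; i < NTASKS; i++)
            tasks[i].isActive = false;

        /* phase 2: re-mark tasks whose job still exists */
        for (int j = 0; j < 2; j++)
            for (int i = 0; i < NTASKS; i++)
                if (tasks[i].jobId == liveJobs[j])
                    tasks[i].isActive = true;

        for (int i = 0; i < NTASKS; i++)
            printf("job %ld: %s\n", tasks[i].jobId,
                   tasks[i].isActive ? "active" : "inactive");
        return 0;
    }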
+/*
+ * GetCronTask gets the current task with the given job ID.
+ */
+static CronTask *
+GetCronTask(int64 jobId)
+{
+	CronTask   *task = NULL;
+	int64		hashKey = jobId;
+	bool		isPresent = false;
+
+	task = hash_search(CronTaskHash, &hashKey, HASH_ENTER, &isPresent);
+	if (!isPresent)
+	{
+		InitializeCronTask(task, jobId);
+
+		/*
+		 * We only initialize the last run time when entering into the hash.
+		 * The net effect is that the timer for the first run of an
+		 * interval job starts when pg_cron first learns about the job.
+		 */
+		task->lastStartTime = GetCurrentTimestamp();
+	}
+
+	return task;
+}
+
+/*
+ * InitializeCronTask initializes a CronTask struct.
+ */
+void
+InitializeCronTask(CronTask *task, int64 jobId)
+{
+	task->runId = 0;
+	task->jobId = jobId;
+	task->state = CRON_TASK_WAITING;
+	task->pendingRunCount = 0;
+	task->connection = NULL;
+	task->pollingStatus = 0;
+	task->startDeadline = 0;
+	task->isSocketReady = false;
+	task->isActive = true;
+	task->errorMessage = NULL;
+	task->freeErrorMessage = false;
+}
+
+/*
+ * CurrentTaskList extracts the current list of tasks from the
+ * cron task hash.
+ */
+List *
+CurrentTaskList(void)
+{
+	List	   *taskList = NIL;
+	CronTask   *task = NULL;
+	HASH_SEQ_STATUS status;
+
+	hash_seq_init(&status, CronTaskHash);
+
+	while ((task = hash_seq_search(&status)) != NULL)
+	{
+		taskList = lappend(taskList, task);
+	}
+
+	return taskList;
+}
+
+/*
+ * RemoveTask removes the task for the given job ID.
+ */
+void
+RemoveTask(int64 jobId)
+{
+	bool		isPresent = false;
+
+	hash_search(CronTaskHash, &jobId, HASH_REMOVE, &isPresent);
+}
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index cc97f6022e0..c8b8e77c814 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -46,6 +46,8 @@
 #include "commands/async.h"
 #include "commands/prepare.h"
 #include "commands/extension.h"
+#include "crypto/bufenc.h"
+#include "crypto/kmgr.h"
 #include "executor/spi.h"
 #include "jit/jit.h"
 #include "libpq/libpq.h"
@@ -1413,7 +1415,7 @@ exec_mpp_query(const char *query_string,
 	(void) PortalRun(portal,
 					 FETCH_ALL,
 					 true,	/* Effectively always top level. */
-					 portal->run_once,
+					 true,
 					 receiver,
 					 receiver,
 					 &qc);
@@ -4598,7 +4600,7 @@ process_postgres_switches(int argc, char *argv[], GucContext ctx,
 	 * postmaster/postmaster.c (the option sets should not conflict) and with
	 * the common help() function in main/main.c.
	 */
-	while ((flag = getopt(argc, argv, "B:bc:C:D:d:EeFf:h:ijk:lMm:N:nOPp:r:S:sTt:v:W:-:")) != -1)
+	while ((flag = getopt(argc, argv, "B:bc:C:D:d:EeFf:h:ijk:lMm:N:nOPp:r:R:S:sTt:v:W:-:")) != -1)
 	{
 		switch (flag)
 		{
@@ -4713,6 +4715,19 @@ process_postgres_switches(int argc, char *argv[], GucContext ctx,
 				strlcpy(OutputFileName, optarg, MAXPGPATH);
 				break;
 
+			case 'R':
+				terminal_fd = atoi(optarg);
+				if (terminal_fd == -1)
+				{
+					/*
+					 * Allow file descriptor closing to be bypassed via -1.
+					 * We just duplicate stderr. This is useful for
+					 * single-user mode.
+					 */
+					terminal_fd = dup(2);
+				}
+				break;
+
 			case 'S':
 				SetConfigOption("work_mem", optarg, ctx, gucsource);
 				break;
@@ -5034,6 +5049,21 @@ PostgresMain(int argc, char *argv[],
 	/* Early initialization */
 	BaseInit();
 
+	if (!IsUnderPostmaster)
+	{
+		/*
+		 * Initialize kmgr for cluster encryption. Since kmgr needs to attach
+		 * to shared memory, the initialization must be called after
+		 * BaseInit(). We also need some information from the control file,
+		 * so InitializeKmgr must be called after LocalProcessControlFile.
+		 */
+		InitializeKmgr();
+		InitializeBufferEncryption();
+
+		if (terminal_fd != -1)
+			close(terminal_fd);
+	}
+
 	/*
 	 * Create a per-backend PGPROC struct in shared memory, except in the
 	 * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do
diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c
index f1013b41b75..67981f82514 100644
--- a/src/backend/tcop/utility.c
+++ b/src/backend/tcop/utility.c
@@ -56,6 +56,7 @@
 #include "commands/subscriptioncmds.h"
 #include "commands/tablecmds.h"
 #include "commands/tablespace.h"
+#include "commands/taskcmds.h"
 #include "commands/trigger.h"
 #include "commands/typecmds.h"
 #include "commands/user.h"
@@ -234,6 +235,9 @@ ClassifyUtilityCommandAsReadOnly(Node *parsetree)
 		case T_AlterResourceGroupStmt:
 		case T_CreateQueueStmt:
 		case T_CreateResourceGroupStmt:
+		case T_CreateTaskStmt:
+		case T_AlterTaskStmt:
+		case T_DropTaskStmt:
 		case T_DropQueueStmt:
 		case T_DropResourceGroupStmt:
 		case T_CreateExternalStmt:
@@ -1770,6 +1774,18 @@ ProcessUtilitySlow(ParseState *pstate,
 				}
 			}
 			break;
+
+		case T_CreateTaskStmt:
+			address = DefineTask(pstate, (CreateTaskStmt *) parsetree);
+			break;
+
+		case T_AlterTaskStmt:
+			address = AlterTask(pstate, (AlterTaskStmt *) parsetree);
+			break;
+
+		case T_DropTaskStmt:
+			address = DropTask(pstate, (DropTaskStmt *) parsetree);
+			break;
 
 		case T_CreateExternalStmt:
 			{
@@ -3235,6 +3251,18 @@ CreateCommandTag(Node *parsetree)
 		case T_CreateEnumStmt:
 			tag = CMDTAG_CREATE_TYPE;
 			break;
+
+		case T_CreateTaskStmt:
+			tag = CMDTAG_CREATE_TASK;
+			break;
+
+		case T_AlterTaskStmt:
+			tag = CMDTAG_ALTER_TASK;
+			break;
+
+		case T_DropTaskStmt:
+			tag = CMDTAG_DROP_TASK;
+			break;
 
 		case T_CreateRangeStmt:
 			tag = CMDTAG_CREATE_TYPE;
diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c
index dfcaa94def8..4661a4884a2 100644
--- a/src/backend/utils/activity/wait_event.c
+++ b/src/backend/utils/activity/wait_event.c
@@ -608,6 +608,15 @@ pgstat_get_wait_io(WaitEventIO w)
 		case WAIT_EVENT_DSM_FILL_ZERO_WRITE:
 			event_name = "DSMFillZeroWrite";
 			break;
+		case WAIT_EVENT_KEY_FILE_READ:
+			event_name = "KeyFileRead";
+			break;
+		case WAIT_EVENT_KEY_FILE_WRITE:
+			event_name = "KeyFileWrite";
+			break;
+		case WAIT_EVENT_KEY_FILE_SYNC:
+			event_name = "KeyFileSync";
+			break;
 		case WAIT_EVENT_LOCK_FILE_ADDTODATADIR_READ:
 			event_name = "LockFileAddToDataDirRead";
 			break;
diff --git a/src/backend/utils/adt/dbsize.c b/src/backend/utils/adt/dbsize.c
index f790254dbec..36f40637b95 100644
--- a/src/backend/utils/adt/dbsize.c
+++ b/src/backend/utils/adt/dbsize.c
@@ -38,6 +38,7 @@
 #include "utils/relmapper.h"
 #include "utils/syscache.h"
 
+#include "access/appendonlywriter.h"
 #include "access/tableam.h"
 #include "catalog/pg_appendonly.h"
 #include "libpq-fe.h"
@@ -45,6 +46,7 @@
 #include "cdb/cdbdisp_query.h"
 #include "cdb/cdbdispatchresult.h"
 #include "cdb/cdbvars.h"
+#include "cdb/cdbutil.h"
 #include "utils/snapmgr.h"
 
 /* Divide by two and round away from zero */
@@ -259,6 +261,42 @@ pg_database_size_name(PG_FUNCTION_ARGS)
 	PG_RETURN_INT64(size);
 }
 
+/*
+ * GPDB: get the segment file count of AO/AOCO tables.
+ * Could the segment file count differ between segments?
+ * We take the average of the per-segment counts; when they are all the
+ * same, the average is exactly that common count.
+ */
+Datum
+gp_ao_segment_file_count(PG_FUNCTION_ARGS)
+{
+	Oid			relOid = PG_GETARG_OID(0);
+	Relation	rel;
+	int16		count = 0;
+
+	ERROR_ON_ENTRY_DB();
+
+	rel = try_relation_open(relOid, AccessShareLock, false);
+	if (rel == NULL)
+		PG_RETURN_NULL();
+
+	if (!RelationIsAppendOptimized(rel))
+	{
+		relation_close(rel, AccessShareLock);
+		PG_RETURN_NULL();
+	}
+
+	if (Gp_role == GP_ROLE_DISPATCH)
+	{
+		char	   *sql;
+
+		sql = psprintf("select pg_catalog.gp_ao_segment_file_count(%u)", relOid);
+		count = get_size_from_segDBs(sql) / getgpsegmentCount();
+	}
+	else
+	{
+		count = GetAppendOnlySegmentFilesCount(rel);
+	}
+	Assert(count <= MAX_AOREL_CONCURRENCY);
+	relation_close(rel, AccessShareLock);
+	PG_RETURN_INT16(count);
+}
 
 /*
  * Calculate total size of tablespace. Returns -1 if the tablespace directory
diff --git a/src/backend/utils/cache/plancache.c b/src/backend/utils/cache/plancache.c
index 98006984ee3..743a52e19ab 100644
--- a/src/backend/utils/cache/plancache.c
+++ b/src/backend/utils/cache/plancache.c
@@ -57,6 +57,7 @@
 #include <limits.h>
 
 #include "access/transam.h"
+#include "access/tableam.h"
 #include "catalog/namespace.h"
 #include "executor/executor.h"
 #include "miscadmin.h"
@@ -962,11 +963,23 @@ BuildCachedPlan(CachedPlanSource *plansource, List *qlist,
 		snapshot_set = true;
 	}
 
+	/*
+	 * GP_PARALLEL_FIXME:
+	 * GPDB hack here for IntoClause, see GetCachedPlan().
+	 * Disable parallel plans when selecting into an AO/AOCS table.
+	 */
+	char	   *am = (intoClause && intoClause->accessMethod) ?
+		intoClause->accessMethod : default_table_access_method;
+	bool		intoAO = ((strcmp(am, "ao_row") == 0) ||
+						  (strcmp(am, "ao_column") == 0));
+
 	/*
 	 * Generate the plan.
 	 */
-	plist = pg_plan_queries(qlist, plansource->query_string,
-							plansource->cursor_options, boundParams);
+	if (!intoAO)
+		plist = pg_plan_queries(qlist, plansource->query_string,
+								plansource->cursor_options, boundParams);
+	else
+		plist = pg_plan_queries(qlist, plansource->query_string,
+								plansource->cursor_options & ~CURSOR_OPT_PARALLEL_OK,
+								boundParams);
 
 	/* Release snapshot if we got one */
 	if (snapshot_set)
diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c
index 36b40d6e600..f25e05b28f4 100644
--- a/src/backend/utils/cache/syscache.c
+++ b/src/backend/utils/cache/syscache.c
@@ -516,6 +516,17 @@ static const struct cachedesc cacheinfo[] = {
 		},
 		1024
 	},
+	{AppendOnlyRelationId,		/* AORELID */
+		AppendOnlyRelidIndexId,
+		1,
+		{
+			Anum_pg_appendonly_relid,
+			0,
+			0,
+			0
+		},
+		1024
+	},
 	{IndexRelationId,			/* INDEXRELID */
 		IndexRelidIndexId,
 		1,
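Editorial note: the plancache hunk above opts statements out of parallelism by clearing a single bit in the cursor-options word. The same flag manipulation, isolated (the flag values here are illustrative, not PostgreSQL's actual constants):

    #include <stdio.h>

    #define CURSOR_OPT_SCROLL       0x0001  /* illustrative values */
    #define CURSOR_OPT_PARALLEL_OK  0x0100

    int
    main(void)
    {
        int options = CURSOR_OPT_SCROLL | CURSOR_OPT_PARALLEL_OK;
        int intoAO = 1;             /* pretend the target is an AO table */

        if (intoAO)
            options &= ~CURSOR_OPT_PARALLEL_OK;     /* strip just this bit */

        printf("parallel allowed: %s\n",
               (options & CURSOR_OPT_PARALLEL_OK) ? "yes" : "no");
        return 0;
    }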
diff --git a/src/backend/utils/cache/typcache.c b/src/backend/utils/cache/typcache.c
index fd20bfbade2..56bb76cdfef 100644
--- a/src/backend/utils/cache/typcache.c
+++ b/src/backend/utils/cache/typcache.c
@@ -2085,6 +2085,15 @@ SharedRecordTypmodRegistryEstimate(void)
 	return sizeof(SharedRecordTypmodRegistry);
 }
 
+/*
+ * Return the next typmod from the shared registry.
+ */
+uint32
+GetSharedNextRecordTypmod(SharedRecordTypmodRegistry *registry)
+{
+	return pg_atomic_read_u32(&registry->next_typmod);
+}
+
 /*
  * Initialize 'registry' in a pre-existing shared memory region, which must be
  * maximally aligned and have space for SharedRecordTypmodRegistryEstimate()
@@ -2455,7 +2464,7 @@ build_tuple_node_list(int start)
 	List	   *transientTypeList = NIL;
 	int			i = start;
 
-	if (NextRecordTypmod == 0)
+	if (NextRecordTypmod == 0 && CurrentSession->shared_typmod_registry == NULL)
 		return transientTypeList;
 
 	for (; i < NextRecordTypmod; i++)
@@ -2469,6 +2478,20 @@ build_tuple_node_list(int start)
 		transientTypeList = lappend(transientTypeList, node);
 	}
 
+	if (CurrentSession->shared_typmod_registry != NULL)
+	{
+		for (; i < GetSharedNextRecordTypmod(CurrentSession->shared_typmod_registry); i++)
+		{
+			TupleDesc	tmp = RecordCacheArray[i];
+
+			TupleDescNode *node = palloc0(sizeof(TupleDescNode));
+
+			node->type = T_TupleDescNode;
+			node->natts = tmp->natts;
+			node->tuple = CreateTupleDescCopy(tmp);
+			transientTypeList = lappend(transientTypeList, node);
+		}
+	}
+
 	return transientTypeList;
 }
diff --git a/src/backend/utils/datumstream/datumstream.c b/src/backend/utils/datumstream/datumstream.c
index eb4c18b569e..2c0cd1a7c4a 100644
--- a/src/backend/utils/datumstream/datumstream.c
+++ b/src/backend/utils/datumstream/datumstream.c
@@ -23,13 +23,14 @@
 #include "access/detoast.h"
 #include "access/heaptoast.h"
 #include "access/tupmacs.h"
-
+#include "access/xlog.h"
 #include "catalog/pg_attribute_encoding.h"
 #include "cdb/cdbappendonlyam.h"
 #include "cdb/cdbappendonlyblockdirectory.h"
 #include "cdb/cdbappendonlystoragelayer.h"
 #include "cdb/cdbappendonlystorageread.h"
 #include "cdb/cdbappendonlystoragewrite.h"
+#include "crypto/bufenc.h"
 #include "utils/datumstream.h"
 #include "utils/guc.h"
 #include "catalog/pg_compression.h"
@@ -497,7 +498,8 @@ create_datumstreamwrite(
 	Form_pg_attribute attr,
 	char *relname,
 	char *title,
-	bool needsWAL)
+	bool needsWAL,
+	RelFileNodeBackend *rnode)
 {
 	DatumStreamWrite *acc = palloc0(sizeof(DatumStreamWrite));
@@ -572,6 +574,7 @@ create_datumstreamwrite(
 	acc->ao_write.compressionState = compressionState;
 	acc->ao_write.verifyWriteCompressionState = verifyBlockCompressionState;
 	acc->title = title;
+	acc->ao_write.relFileNode = *rnode;
 
 	/*
 	 * Temporarily set the firstRowNum for the block so that we can
@@ -625,7 +628,8 @@ create_datumstreamwrite(
 	/* errdetailCallback */ datumstreamwrite_detail_callback,
 	/* errdetailArg */ (void *) acc,
 	/* errcontextCallback */ datumstreamwrite_context_callback,
-	/* errcontextArg */ (void *) acc);
+	/* errcontextArg */ (void *) acc,
+	&acc->ao_write.relFileNode.node);
 
 	return acc;
 }
@@ -639,7 +643,8 @@ create_datumstreamread(
 	int32 maxsz,
 	Form_pg_attribute attr,
 	char *relname,
-	char *title)
+	char *title,
+	RelFileNode *relFileNode)
 {
 	DatumStreamRead *acc = palloc0(sizeof(DatumStreamRead));
@@ -695,7 +700,8 @@ create_datumstreamread(
 	acc->maxAoBlockSize,
 	relname,
 	title,
-	&acc->ao_attr);
+	&acc->ao_attr,
+	relFileNode);
 
 	acc->ao_read.compression_functions = compressionFunctions;
 	acc->ao_read.compressionState = compressionState;
@@ -889,7 +895,8 @@ datumstreamwrite_block_orig(DatumStreamWrite * acc)
 	writesz = DatumStreamBlockWrite_Block(
 										  &acc->blockWrite,
-										  buffer);
+										  buffer,
+										  &acc->ao_write.relFileNode.node);
 
 	acc->ao_write.logicalBlockStartOffset =
 		BufferedAppendNextBufferPosition(&(acc->ao_write.bufferedAppend));
@@ -943,7 +950,8 @@ datumstreamwrite_block_dense(DatumStreamWrite * acc)
 	writesz = DatumStreamBlockWrite_Block(
 										  &acc->blockWrite,
-										  buffer);
+										  buffer,
+										  &acc->ao_write.relFileNode.node);
 
 	acc->ao_write.logicalBlockStartOffset =
 		BufferedAppendNextBufferPosition(&(acc->ao_write.bufferedAppend));
@@ -1024,6 +1032,8 @@ datumstreamwrite_lob(DatumStreamWrite * acc,
 {
 	uint8	   *p;
 	int32		varLen;
+	uint8	   *content;
+	int32		contentLen;
 
 	Assert(acc);
 	Assert(acc->datumStreamVersion == DatumStreamVersion_Original ||
@@ -1061,14 +1071,38 @@ datumstreamwrite_lob(DatumStreamWrite * acc,
 						 p);
 	}
 
+	content = p;
+	contentLen = varLen;
+
+	if (FileEncryptionEnabled)
+	{
+		int32		alignedHeaderSize;
+		int32		encryptLen;
+		char	   *encryptData;
+
+		alignedHeaderSize = MAXALIGN(sizeof(uint16));
+		contentLen += alignedHeaderSize;
+		content = palloc(contentLen);
+
+		encryptData = VARDATA_ANY(p);
+		encryptLen = VARSIZE_ANY_EXHDR(p);
+
+		EncryptAOBLock((unsigned char *) encryptData,
+					   encryptLen,
+					   &acc->ao_write.relFileNode.node);
+
+		*(uint16 *) content = 1;
+		memcpy(content + alignedHeaderSize, p, varLen);
+	}
+
 	/* Set the BlockFirstRowNum */
 	AppendOnlyStorageWrite_SetFirstRowNum(&acc->ao_write,
 										  acc->blockFirstRowNum);
 
 	AppendOnlyStorageWrite_Content(
 								   &acc->ao_write,
-								   p,
-								   varLen,
+								   content,
+								   contentLen,
 								   AOCSBK_BLOB,
 								   /* rowCount */ 1);
@@ -1081,6 +1115,10 @@ datumstreamwrite_lob(DatumStreamWrite * acc,
 					1,	/* itemCount -- always just the lob just inserted */
 					addColAction);
 
+	if (FileEncryptionEnabled)
+		pfree(content);
+
 	return varLen;
 }
@@ -1145,7 +1183,8 @@ datumstreamread_block_get_ready(DatumStreamRead * acc)
 											   acc->getBlockInfo.firstRow,
 											   acc->getBlockInfo.rowCnt,
 											   &hadToAdjustRowCount,
-											   &adjustedRowCount);
+											   &adjustedRowCount,
+											   &acc->ao_read.relFileNode);
 		if (hadToAdjustRowCount)
 		{
 			acc->blockRowCount = adjustedRowCount;
@@ -1154,6 +1193,32 @@ datumstreamread_block_get_ready(DatumStreamRead * acc)
 	else if (acc->getBlockInfo.execBlockKind == AOCSBK_BLOB)
 	{
 		Assert(acc->buffer_beginp == acc->large_object_buffer);
+		if (FileEncryptionEnabled)
+		{
+			int32		alignedHeaderSize;
+			struct varlena *va;
+			char	   *decryptData;
+			int32		decryptLen;
+			uint16		encrypted;
+
+			Assert(acc->buffer_beginp == acc->large_object_buffer);
+			encrypted = *(uint16 *) acc->buffer_beginp;
+			if (encrypted)
+			{
+				/* set the flag to 0 to mark that the block has been decrypted */
+				*(uint16 *) acc->buffer_beginp = 0;
+
+				alignedHeaderSize = MAXALIGN(sizeof(uint16));
+				acc->buffer_beginp += alignedHeaderSize;
+
+				va = (struct varlena *) acc->buffer_beginp;
+				decryptData = VARDATA_ANY(va);
+				decryptLen = VARSIZE_ANY_EXHDR(va);
+				DecryptAOBlock((unsigned char *) decryptData,
+							   decryptLen,
+							   &acc->ao_read.relFileNode);
+			}
+		}
 	}
 	else
 	{
errdetail_datumstreamblockread_callback, - /* errdetailArg */ (void *) dsr, - /* errcontextCallback */ errcontext_datumstreamblockread_callback, - /* errcontextArg */ (void *) dsr); - } - dsr->logical_row_count = blockOrig->ndatum; dsr->physical_datum_count = 0; @@ -340,6 +329,28 @@ DatumStreamBlockRead_GetReadyOrig( #endif dsr->datump = dsr->datum_beginp; + + if (FileEncryptionEnabled && blockOrig->encrypted) + { + DecryptAOBlock(dsr->datump, + dsr->physical_data_size, + node); + blockOrig->encrypted = 0; + } + + if (!minimalIntegrityChecks) + { + DatumStreamBlock_IntegrityCheckOrig( + buffer, + bufferSize, + minimalIntegrityChecks, + rowCount, + &dsr->typeInfo, + /* errdetailCallback */ errdetail_datumstreamblockread_callback, + /* errdetailArg */ (void *) dsr, + /* errcontextCallback */ errcontext_datumstreamblockread_callback, + /* errcontextArg */ (void *) dsr); + } } void @@ -620,7 +631,8 @@ DatumStreamBlockRead_GetReadyDense( int64 firstRowNum, int32 rowCount, bool *hadToAdjustRowCount, - int32 * adjustedRowCount) + int32 * adjustedRowCount, + RelFileNode *node) { uint8 *p; @@ -666,16 +678,6 @@ DatumStreamBlockRead_GetReadyDense( p = dsr->buffer_beginp; Assert(p == buffer); - DatumStreamBlock_IntegrityCheckDense( - buffer, - bufferSize, - minimalIntegrityChecks, - rowCount, - &dsr->typeInfo, - /* errdetailCallback */ errdetail_datumstreamblockread_callback, - /* errdetailArg */ (void *) dsr, - /* errcontextCallback */ errcontext_datumstreamblockread_callback, - /* errcontextArg */ (void *) dsr); blockDense = (DatumStreamBlock_Dense *) p; @@ -944,6 +946,26 @@ DatumStreamBlockRead_GetReadyDense( } } dsr->datump = dsr->datum_beginp; + if (FileEncryptionEnabled && (blockDense->orig_4_bytes.flags & DSB_HAS_ENCRYPTION) != 0) + { + DecryptAOBlock(dsr->datump, + dsr->physical_data_size, + node); + + /* reset the flag, mark the block has been decrypted */ + blockDense->orig_4_bytes.flags = blockDense->orig_4_bytes.flags & ~ DSB_HAS_ENCRYPTION; + } + + DatumStreamBlock_IntegrityCheckDense( + buffer, + bufferSize, + minimalIntegrityChecks, + rowCount, + &dsr->typeInfo, + /* errdetailCallback */ errdetail_datumstreamblockread_callback, + /* errdetailArg */ (void *) dsr, + /* errcontextCallback */ errcontext_datumstreamblockread_callback, + /* errcontextArg */ (void *) dsr); } static int @@ -3644,7 +3666,8 @@ DatumStreamBlockWrite_GetReady( static int64 DatumStreamBlockWrite_BlockOrig( DatumStreamBlockWrite * dsw, - uint8 * buffer) + uint8 * buffer, + RelFileNode *node) { uint8 *p; DatumStreamBlock_Orig block; @@ -3659,7 +3682,10 @@ DatumStreamBlockWrite_BlockOrig( block.version = DatumStreamVersion_Original; block.flags = dsw->has_null ? DSB_HAS_NULLBITMAP : 0; block.ndatum = dsw->nth; - block.unused = 0; + block.encrypted = 0; + + if (FileEncryptionEnabled) + block.encrypted = 1; /* NOTE:Unfortunately, this was not zeroed in the earlier releases of the code. 
*/ /* compress null bitmaps */ @@ -3761,13 +3787,21 @@ DatumStreamBlockWrite_BlockOrig( /* errcontextCallback */ errcontext_datumstreamblockwrite_callback, /* errcontextArg */ (void *) dsw); + if (FileEncryptionEnabled) + { + EncryptAOBLock(p - block.sz, + block.sz, + node); + } + return writesz; } static int64 DatumStreamBlockWrite_BlockDense( DatumStreamBlockWrite * dsw, - uint8 * buffer) + uint8 * buffer, + RelFileNode *node) { int64 writesz = 0; uint8 *p = NULL; @@ -3815,6 +3849,11 @@ DatumStreamBlockWrite_BlockDense( dense.orig_4_bytes.flags |= DSB_HAS_DELTA_COMPRESSION; } + if (FileEncryptionEnabled) + { + dense.orig_4_bytes.flags |= DSB_HAS_ENCRYPTION; + } + dense.logical_row_count = dsw->nth; dense.physical_datum_count = dsw->physical_datum_count; dense.physical_data_size = dsw->datump - dsw->datum_buffer; @@ -4197,13 +4236,18 @@ DatumStreamBlockWrite_BlockDense( /* errcontextCallback */ errcontext_datumstreamblockwrite_callback, /* errcontextArg */ (void *) dsw); + if (FileEncryptionEnabled) + EncryptAOBLock(buffer + metadataMaxAlignSize, + dense.physical_data_size, + node); return writesz; } int64 DatumStreamBlockWrite_Block( DatumStreamBlockWrite * dsw, - uint8 * buffer) + uint8 * buffer, + RelFileNode *node) { if (strncmp(dsw->eyecatcher, DatumStreamBlockWrite_Eyecatcher, DatumStreamBlockWrite_EyecatcherLen) != 0) elog(FATAL, "DatumStreamBlockWrite data structure not valid (eyecatcher)"); @@ -4211,11 +4255,11 @@ DatumStreamBlockWrite_Block( switch (dsw->datumStreamVersion) { case DatumStreamVersion_Original: - return DatumStreamBlockWrite_BlockOrig(dsw, buffer); + return DatumStreamBlockWrite_BlockOrig(dsw, buffer, node); case DatumStreamVersion_Dense: case DatumStreamVersion_Dense_Enhanced: - return DatumStreamBlockWrite_BlockDense(dsw, buffer); + return DatumStreamBlockWrite_BlockDense(dsw, buffer, node); default: ereport(FATAL, @@ -4241,7 +4285,8 @@ DatumStreamBlockWrite_Init( int (*errdetailCallback) (void *errdetailArg), void *errdetailArg, int (*errcontextCallback) (void *errcontextArg), - void *errcontextArg) + void *errcontextArg, + RelFileNode *relFileNode) { memcpy(dsw->eyecatcher, DatumStreamBlockWrite_Eyecatcher, DatumStreamBlockWrite_EyecatcherLen); diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 9868a1fef80..ec786175ab5 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -103,6 +103,7 @@ extern void InitGPOPT(); extern void TerminateGPOPT(); #endif +NoticeSessionDB_hook_type NoticeSessionDB_hook = NULL; /*** InitPostgres support ***/ @@ -682,7 +683,8 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); - InitGPOPT(); + if (!bootstrap && Gp_role == GP_ROLE_DISPATCH) + InitGPOPT(); #endif /* @@ -1100,6 +1102,13 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username, */ MyProc->databaseId = MyDatabaseId; + /* + * Mark PGPROC entry with the database ID maybe not enough in a distributed + * environment. Use this hook to record it. 
+	 */
+	if (NoticeSessionDB_hook)
+		(*NoticeSessionDB_hook)(MyDatabaseId);
+
 	/*
 	 * We established a catalog snapshot while reading pg_authid and/or
 	 * pg_database; but until we have set up MyDatabaseId, we won't react to
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 5a09028b116..9b6c8e93752 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -53,6 +53,7 @@
 #include "commands/vacuum.h"
 #include "commands/variable.h"
 #include "common/string.h"
+#include "crypto/kmgr.h"
 #include "funcapi.h"
 #include "jit/jit.h"
 #include "libpq/auth.h"
@@ -671,7 +672,7 @@ static char *recovery_target_string;
 static char *recovery_target_xid_string;
 static char *recovery_target_name_string;
 static char *recovery_target_lsn_string;
-
+static char *file_encryption_method_str;
 /* should be static, but commands/variable.c needs to get at this */
 char	   *role_string;
@@ -799,6 +800,8 @@ const char *const config_group_names[] =
 	gettext_noop("Statistics / Monitoring"),
 	/* STATS_COLLECTOR */
 	gettext_noop("Statistics / Query and Index Statistics Collector"),
+	/* ENCRYPTION */
+	gettext_noop("Encryption"),
 	/* AUTOVACUUM */
 	gettext_noop("Autovacuum"),
 	/* CLIENT_CONN_STATEMENT */
@@ -835,6 +838,8 @@
 	gettext_noop("Customized Options"),
 	/* DEVELOPER_OPTIONS */
 	gettext_noop("Developer Options"),
+	/* TASK_SCHEDULE_OPTIONS */
+	gettext_noop("Task Schedule Options"),
 	/* DEPRECATED_OPTIONS */
 	gettext_noop("Deprecated Options"),
@@ -845,7 +850,7 @@ const char *const config_group_names[] =
 	NULL
 };
 
-StaticAssertDecl(lengthof(config_group_names) == (DEVELOPER_OPTIONS + 4),
+StaticAssertDecl(lengthof(config_group_names) == (TASK_SCHEDULE_OPTIONS + 4),
 				 "array length mismatch");
@@ -1268,6 +1273,7 @@ static struct config_bool ConfigureNamesBool[] =
 		true,
 		NULL, NULL, NULL
 	},
+
 	{
 		{"fsync", PGC_SIGHUP, WAL_SETTINGS,
 			gettext_noop("Forces synchronization of updates to disk."),
@@ -2058,7 +2064,7 @@ static struct config_bool ConfigureNamesBool[] =
 			GUC_EXPLAIN
 		},
 		&parallel_leader_participation,
-		true,
+		false,
 		NULL, NULL, NULL
 	},
@@ -2158,6 +2164,15 @@ static struct config_bool ConfigureNamesBool[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"tde_force_switch", PGC_POSTMASTER, ENCRYPTION,
+			gettext_noop("Whether to enable the TDE (transparent data encryption) feature."),
+		},
+		&tde_force_switch,
+		true,
+		NULL, NULL, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
@@ -4652,6 +4667,27 @@ static struct config_string ConfigureNamesString[] =
 		check_backtrace_functions, assign_backtrace_functions, NULL
 	},
 
+	{
+		{"cluster_key_command", PGC_SIGHUP, ENCRYPTION,
+			gettext_noop("Command to obtain the cluster key for cluster file encryption."),
+			NULL
+		},
+		&cluster_key_command,
+		"",
+		NULL, NULL, NULL
+	},
+
+	{
+		{"file_encryption_method", PGC_INTERNAL, PRESET_OPTIONS,
+			gettext_noop("Shows the cluster file encryption method."),
+			NULL,
+			GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE
+		},
+		&file_encryption_method_str,
+		"",
+		NULL, NULL, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, NULL, NULL, NULL, NULL
@@ -10894,7 +10930,8 @@ can_skip_gucvar(struct config_generic *gconf)
 	 */
 	return gconf->context == PGC_POSTMASTER ||
 		gconf->context == PGC_INTERNAL || gconf->source == PGC_S_DEFAULT ||
-		strcmp(gconf->name, "role") == 0;
+		strcmp(gconf->name, "role") == 0 || strcmp(gconf->name, "gp_role") == 0 ||
+		strcmp(gconf->name, "gp_is_writer") == 0;
 }
 
 /*
@@ -11375,6 +11412,21 @@ RestoreGUCState(void *gucstate)
 		error_context_name_and_value[0] = varname;
 		error_context_name_and_value[1] = varvalue;
 		error_context_callback.arg = &error_context_name_and_value[0];
+
+		/*
+		 * GPDB: Skip any GUC whose assignment would change FirstSnapshotSet;
+		 * currently that is only gp_write_shared_snapshot.  Restoring it
+		 * calls assign_gp_write_shared_snapshot(), which would set
+		 * FirstSnapshotSet before GUCs such as transaction_deferrable have
+		 * been restored, and that is not allowed.
+		 *
+		 * We must EnterParallelMode() after RestoreGUCState(), so at this
+		 * point we cannot tell whether we are a parallel worker.  In
+		 * principle, though, only parallel workers launched by PG-style
+		 * nodes reach this path, so simply skipping the GUC is reasonable.
+		 */
+		if (strcmp(varname, "gp_write_shared_snapshot") == 0)
+			continue;
 		result = set_config_option(varname, varvalue, varscontext, varsource,
 								   GUC_ACTION_SET, true, ERROR, true);
 		if (result <= 0)
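The can_skip_gucvar() and RestoreGUCState() hunks above implement one policy: some variables must not make the round trip to a parallel worker, either because the worker cannot meaningfully apply them or because, as with gp_write_shared_snapshot, their assign hooks have side effects that would fire too early. The standalone sketch below (invented names and values, not the patch's code) shows that restore-with-skip-list shape:

    /* Standalone illustration; compiles with any C99 compiler. */
    #include <stdio.h>
    #include <string.h>

    /* GUCs that are never replayed into a worker */
    static const char *skip_on_restore[] = {
        "role", "gp_role", "gp_is_writer", "gp_write_shared_snapshot"
    };

    static int
    must_skip(const char *name)
    {
        for (size_t i = 0; i < sizeof(skip_on_restore) / sizeof(*skip_on_restore); i++)
            if (strcmp(name, skip_on_restore[i]) == 0)
                return 1;
        return 0;
    }

    int
    main(void)
    {
        const char *names[] = {"work_mem", "gp_write_shared_snapshot", "datestyle"};
        const char *values[] = {"4MB", "true", "ISO, MDY"};

        for (int i = 0; i < 3; i++)
        {
            if (must_skip(names[i]))
                continue;       /* the worker derives this state another way */
            printf("SET %s = %s\n", names[i], values[i]);
        }
        return 0;
    }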
diff --git a/src/backend/utils/misc/guc_gp.c b/src/backend/utils/misc/guc_gp.c
index d973f967f4c..538d1b03af5 100644
--- a/src/backend/utils/misc/guc_gp.c
+++ b/src/backend/utils/misc/guc_gp.c
@@ -35,6 +35,7 @@
 #include "cdb/memquota.h"
 #include "commands/defrem.h"
 #include "commands/vacuum.h"
+#include "commands/variable.h"
 #include "miscadmin.h"
 #include "optimizer/cost.h"
 #include "optimizer/planmain.h"
@@ -42,8 +43,10 @@
 #include "parser/scansup.h"
 #include "postmaster/syslogger.h"
 #include "postmaster/fts.h"
+#include "postmaster/postmaster.h"
 #include "replication/walsender.h"
 #include "storage/proc.h"
+#include "task/pg_cron.h"
 #include "tcop/idle_resource_cleaner.h"
 #include "utils/builtins.h"
 #include "utils/gdd.h"
@@ -96,6 +99,7 @@ static bool check_pljava_classpath_insecure(bool *newval, void **extra, GucSourc
 static void assign_pljava_classpath_insecure(bool newval, void *extra);
 static bool check_gp_resource_group_bypass(bool *newval, void **extra, GucSource source);
 static int	guc_array_compare(const void *a, const void *b);
+static bool check_max_running_tasks(int *newval, void **extra, GucSource source);
 
 extern int	listenerBacklog;
 
@@ -137,6 +141,9 @@
 bool		gp_appendonly_verify_block_checksums = true;
 bool		gp_appendonly_verify_write_block = false;
 bool		gp_appendonly_compaction = true;
 int			gp_appendonly_compaction_threshold = 0;
+bool		enable_parallel = false;
+int			gp_appendonly_insert_files = 0;
+int			gp_appendonly_insert_files_tuples_range = 0;
 bool		gp_heap_require_relhasoids_match = true;
 bool		gp_local_distributed_cache_stats = false;
 bool		debug_xlog_record_read = false;
@@ -246,6 +253,7 @@
 bool		gp_enable_hashjoin_size_heuristic = false;
 bool		gp_enable_predicate_propagation = false;
 bool		gp_enable_minmax_optimization = true;
 bool		gp_enable_multiphase_agg = true;
+bool		gp_enable_multiphase_limit = true;
 bool		gp_enable_preunique = true;
 bool		gp_enable_agg_distinct = true;
 bool		gp_enable_dqa_pruning = true;
@@ -256,6 +264,7 @@
 bool		gp_cte_sharing = false;
 bool		gp_enable_relsize_collection = false;
 bool		gp_recursive_cte = true;
 bool		gp_eager_two_phase_agg = false;
+bool		gp_force_random_redistribution = false;
 
 /* Optimizer related gucs */
 bool		optimizer;
@@ -293,6 +302,7 @@
 bool		optimizer_enable_indexjoin;
 bool		optimizer_enable_motions_masteronly_queries;
 bool		optimizer_enable_motions;
 bool		optimizer_enable_motion_broadcast;
+bool		parallel_hash_enable_motion_broadcast;
 bool		optimizer_enable_motion_gather;
 bool		optimizer_enable_motion_redistribute;
 bool		optimizer_enable_sort;
@@ -680,6 +690,16 @@ struct config_bool ConfigureNamesBool_gp[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"gp_enable_multiphase_limit", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("Enable the planner's use of two-phase limit plans."),
+			gettext_noop("Allows a partial limit to be applied on the QEs.")
+		},
+		&gp_enable_multiphase_limit,
+		true,
+		NULL, NULL, NULL
+	},
+
 	{
 		{"gp_enable_preunique", PGC_USERSET, QUERY_TUNING_METHOD,
 			gettext_noop("Enable 2-phase duplicate removal."),
@@ -1789,6 +1809,16 @@ struct config_bool ConfigureNamesBool_gp[] =
 		false,
 		NULL, NULL
 	},
+
+	{
+		{"gp_force_random_redistribution", PGC_USERSET, CUSTOM_OPTIONS,
+			gettext_noop("Force redistribution of inserts into randomly-distributed tables."),
+			NULL,
+			GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE
+		},
+		&gp_force_random_redistribution,
+		false,
+		NULL, NULL, NULL
+	},
 
 	{
 		{"optimizer", PGC_USERSET, QUERY_TUNING_METHOD,
 			gettext_noop("Enable GPORCA."),
@@ -2059,6 +2089,16 @@ struct config_bool ConfigureNamesBool_gp[] =
 		true,
 		NULL, NULL, NULL
 	},
+	{
+		{"parallel_hash_enable_motion_broadcast", PGC_USERSET, DEVELOPER_OPTIONS,
+			gettext_noop("Enable plans with Motion Broadcast operators in parallel hash join."),
+			NULL,
+			GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE
+		},
+		&parallel_hash_enable_motion_broadcast,
+		true,
+		NULL, NULL, NULL
+	},
 	{
 		{"optimizer_enable_motion_gather", PGC_USERSET, DEVELOPER_OPTIONS,
 			gettext_noop("Enable plans with Motion Gather operators in the optimizer."),
@@ -2878,6 +2918,47 @@ struct config_bool ConfigureNamesBool_gp[] =
 		false,
 		NULL, NULL, NULL
 	},
+	/* for task schedule */
+	{
+		{"task_use_background_worker", PGC_POSTMASTER, TASK_SCHEDULE_OPTIONS,
+			gettext_noop("Use background workers instead of client sessions."),
+			NULL,
+			GUC_SUPERUSER_ONLY
+		},
+		&task_use_background_worker,
+		false,
+		NULL, NULL, NULL
+	},
+	{
+		{"task_log_run", PGC_POSTMASTER, TASK_SCHEDULE_OPTIONS,
+			gettext_noop("Log all task runs into the pg_task_run_history table."),
+			NULL,
+			GUC_SUPERUSER_ONLY
+		},
+		&task_log_run,
+		true,
+		NULL, NULL, NULL
+	},
+	{
+		{"task_enable_superuser_jobs", PGC_POSTMASTER, TASK_SCHEDULE_OPTIONS,
+			gettext_noop("Allow tasks to be scheduled as superuser."),
+			NULL,
+			GUC_SUPERUSER_ONLY
+		},
+		&task_enable_superuser_jobs,
+		true,
+		NULL, NULL, NULL
+	},
+	{
+		{"task_log_statement", PGC_POSTMASTER, TASK_SCHEDULE_OPTIONS,
+			gettext_noop("Log all cron task statements prior to execution."),
+			NULL,
+			GUC_SUPERUSER_ONLY
+		},
+		&task_log_statement,
+		true,
+		NULL, NULL, NULL
+	},
 #ifndef USE_INTERNAL_FTS
 	{
 		{"gp_etcd_enable_cache", PGC_SUSET, CUSTOM_OPTIONS,
@@ -2890,6 +2971,17 @@ struct config_bool ConfigureNamesBool_gp[] =
 		NULL, NULL, NULL
 	},
 #endif
+	{
+		{"enable_parallel", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("Allow the use of parallel query facilities."),
+			NULL,
+			GUC_EXPLAIN
+		},
+		&enable_parallel,
+		false,
+		NULL, NULL, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL
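Every entry above follows guc_gp.c's table-driven registration: a braced description record, the address of the backing C variable, a boot value, and optional check/assign/show hooks, with an all-NULL sentinel terminating the array. The standalone model below (simplified types, invented names; not the server's actual GUC machinery) shows why the sentinel and the variable pointers make lookup-and-set generic:

    /* Standalone illustration; compiles with any C99 compiler. */
    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    bool enable_parallel = false;
    bool gp_enable_multiphase_limit = true;

    struct bool_guc
    {
        const char *name;
        bool       *variable;
        bool        boot_val;
    };

    static struct bool_guc bool_gucs[] = {
        {"enable_parallel", &enable_parallel, false},
        {"gp_enable_multiphase_limit", &gp_enable_multiphase_limit, true},
        {NULL, NULL, false}     /* end-of-list marker */
    };

    static bool
    set_bool_guc(const char *name, bool value)
    {
        for (struct bool_guc *g = bool_gucs; g->name != NULL; g++)
            if (strcmp(g->name, name) == 0)
            {
                *g->variable = value;
                return true;
            }
        return false;           /* unknown GUC */
    }

    int
    main(void)
    {
        set_bool_guc("enable_parallel", true);
        printf("enable_parallel = %d\n", (int) enable_parallel);
        return 0;
    }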
+ " will be useful for appendonly table parallel scan."), + NULL + }, + &gp_appendonly_insert_files, + 4, 0, 127, + NULL, NULL, NULL + }, + + { + {"gp_appendonly_insert_files_tuples_range", PGC_USERSET, DEVELOPER_OPTIONS, + gettext_noop("Range number of tuples files to switch between segment files for appendonly" + "table insertion with multiple segment files within a transaction."), + NULL + }, + &gp_appendonly_insert_files_tuples_range, + 100000, 0, INT_MAX, + NULL, NULL, NULL + }, + { {"gp_workfile_max_entries", PGC_POSTMASTER, RESOURCES, gettext_noop("Sets the maximum number of entries that can be stored in the workfile directory"), @@ -4124,6 +4238,18 @@ struct config_int ConfigureNamesInt_gp[] = 0, 0, MAX_GP_DISPATCH_KEEPALIVES_COUNT, NULL, NULL, NULL }, + + { + {"max_running_tasks", PGC_POSTMASTER, TASK_SCHEDULE_OPTIONS, + gettext_noop("Maximum number of tasks that can be running at the same time."), + NULL, + GUC_SUPERUSER_ONLY, + }, + &max_running_tasks, + 5, 1, MAX_BACKENDS, + check_max_running_tasks, NULL, NULL + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL @@ -4412,6 +4538,28 @@ struct config_string ConfigureNamesString_gp[] = check_gp_default_storage_options, assign_gp_default_storage_options, NULL }, + { + {"task_timezone", PGC_POSTMASTER, TASK_SCHEDULE_OPTIONS, + gettext_noop("Specify timezone used for cron task schedule."), + NULL, + GUC_SUPERUSER_ONLY + }, + &task_timezone, + "GMT", + check_timezone, assign_timezone, show_timezone + }, + + { + {"task_host_addr", PGC_POSTMASTER, TASK_SCHEDULE_OPTIONS, + gettext_noop("Host address to connect to CloudBerry database."), + NULL, + GUC_SUPERUSER_ONLY + }, + &task_host_addr, + "127.0.0.1", + NULL, NULL, NULL + }, + { /* Can't be set in postgresql.conf */ {"gp_server_version", PGC_INTERNAL, PRESET_OPTIONS, @@ -4915,6 +5063,21 @@ check_gp_hashagg_default_nbatches(int *newval, void **extra, GucSource source) } } +bool +check_max_running_tasks(int *newval, void **extra, GucSource source) +{ + if (*newval < 0) + { + GUC_check_errmsg("max_running_tasks must be greater than or equal to 0"); + return false; + } + if (task_use_background_worker && *newval >= max_worker_processes) + return false; + if (*newval >= MaxConnections) + return false; + return true; +} + /* * Malloc a new string representing current storage_opts. */ diff --git a/src/backend/utils/misc/pg_controldata.c b/src/backend/utils/misc/pg_controldata.c index 209a20a8827..908179cdf8b 100644 --- a/src/backend/utils/misc/pg_controldata.c +++ b/src/backend/utils/misc/pg_controldata.c @@ -28,6 +28,8 @@ #include "utils/pg_lsn.h" #include "utils/timestamp.h" +#define CONTROL_FILE_COLS 12 + Datum pg_control_system(PG_FUNCTION_ARGS) { @@ -263,8 +265,8 @@ pg_control_recovery(PG_FUNCTION_ARGS) Datum pg_control_init(PG_FUNCTION_ARGS) { - Datum values[11]; - bool nulls[11]; + Datum values[CONTROL_FILE_COLS]; + bool nulls[CONTROL_FILE_COLS]; TupleDesc tupdesc; HeapTuple htup; ControlFileData *ControlFile; @@ -274,7 +276,7 @@ pg_control_init(PG_FUNCTION_ARGS) * Construct a tuple descriptor for the result row. This must match this * function's pg_proc entry! 
*/ - tupdesc = CreateTemplateTupleDesc(11); + tupdesc = CreateTemplateTupleDesc(12); TupleDescInitEntry(tupdesc, (AttrNumber) 1, "max_data_alignment", INT4OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 2, "database_block_size", @@ -297,6 +299,8 @@ pg_control_init(PG_FUNCTION_ARGS) BOOLOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 11, "data_page_checksum_version", INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 12, "file_encryption_method", + INT4OID, -1, 0); tupdesc = BlessTupleDesc(tupdesc); /* read the control file */ @@ -338,6 +342,9 @@ pg_control_init(PG_FUNCTION_ARGS) values[10] = Int32GetDatum(ControlFile->data_checksum_version); nulls[10] = false; + values[11] = Int32GetDatum(ControlFile->file_encryption_method); + nulls[11] = false; + htup = heap_form_tuple(tupdesc, values, nulls); PG_RETURN_DATUM(HeapTupleGetDatum(htup)); diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 396bd410b65..2ffcacf00cd 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -675,6 +675,11 @@ optimizer_analyze_root_partition = on # stats collection on root partitions # autovacuum, -1 means use # vacuum_cost_limit +#------------------------------------------------------------------------------ +# ENCRYPTION +#------------------------------------------------------------------------------ + +#cluster_key_command = '' #------------------------------------------------------------------------------ # CLIENT CONNECTION DEFAULTS #------------------------------------------------------------------------------ diff --git a/src/backend/utils/time/combocid.c b/src/backend/utils/time/combocid.c index 17555c83151..f191fc1e04d 100644 --- a/src/backend/utils/time/combocid.c +++ b/src/backend/utils/time/combocid.c @@ -95,6 +95,7 @@ static int sizeComboCids = 0; /* allocated size of array */ /* prototypes for internal functions */ static CommandId GetComboCommandId(CommandId cmin, CommandId cmax); +static CommandId PGGetComboCommandId(CommandId cmin, CommandId cmax); static CommandId GetRealCmin(CommandId combocid); static CommandId GetRealCmax(CommandId combocid); @@ -317,6 +318,82 @@ GetComboCommandId(CommandId cmin, CommandId cmax) return combocid; } +/* this is for PG parallel workers. */ +static CommandId +PGGetComboCommandId(CommandId cmin, CommandId cmax) +{ + CommandId combocid; + ComboCidKeyData key; + ComboCidEntry entry; + bool found; + + /* + * Create the hash table and array the first time we need to use combo + * cids in the transaction. + */ + if (comboHash == NULL) + { + HASHCTL hash_ctl; + + /* Make array first; existence of hash table asserts array exists */ + comboCids = (ComboCidKeyData *) + MemoryContextAlloc(TopTransactionContext, + sizeof(ComboCidKeyData) * CCID_ARRAY_SIZE); + sizeComboCids = CCID_ARRAY_SIZE; + usedComboCids = 0; + + hash_ctl.keysize = sizeof(ComboCidKeyData); + hash_ctl.entrysize = sizeof(ComboCidEntryData); + hash_ctl.hcxt = TopTransactionContext; + + comboHash = hash_create("Combo CIDs", + CCID_HASH_SIZE, + &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + } + + /* + * Grow the array if there's not at least one free slot. We must do this + * before possibly entering a new hashtable entry, else failure to + * repalloc would leave a corrupt hashtable entry behind. 
+ */ + if (usedComboCids >= sizeComboCids) + { + int newsize = sizeComboCids * 2; + + comboCids = (ComboCidKeyData *) + repalloc(comboCids, sizeof(ComboCidKeyData) * newsize); + sizeComboCids = newsize; + } + + /* Lookup or create a hash entry with the desired cmin/cmax */ + + /* We assume there is no struct padding in ComboCidKeyData! */ + key.cmin = cmin; + key.cmax = cmax; + entry = (ComboCidEntry) hash_search(comboHash, + (void *) &key, + HASH_ENTER, + &found); + + if (found) + { + /* Reuse an existing combo CID */ + return entry->combocid; + } + + /* We have to create a new combo CID; we already made room in the array */ + combocid = usedComboCids; + + comboCids[combocid].cmin = cmin; + comboCids[combocid].cmax = cmax; + usedComboCids++; + + entry->combocid = combocid; + + return combocid; +} + static CommandId GetRealCmin(CommandId combocid) { @@ -413,7 +490,7 @@ RestoreComboCIDState(char *comboCIDstate) /* Use GetComboCommandId to restore each combo CID. */ for (i = 0; i < num_elements; i++) { - cid = GetComboCommandId(keydata[i].cmin, keydata[i].cmax); + cid = PGGetComboCommandId(keydata[i].cmin, keydata[i].cmax); /* Verify that we got the expected answer. */ if (cid != i) diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index d41ca5a213b..4e054a498bd 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -195,6 +195,19 @@ typedef struct SerializedSnapshotData CommandId curcid; TimestampTz whenTaken; XLogRecPtr lsn; + + bool haveDistribSnapshot; + + /* for cdb distribute snapshot */ + TransactionId minCachedLocalXid; + TransactionId maxCachedLocalXid; + int32 currentLocalXidsCount; + + DistributedTransactionId ds_xminAllDistributedSnapshots; + DistributedSnapshotId ds_distribSnapshotId; + DistributedTransactionId ds_xmin; + DistributedTransactionId ds_xmax; + int32 ds_count; } SerializedSnapshotData; Size @@ -2190,7 +2203,7 @@ EstimateSnapshotSpace(Snapshot snap) Size size; Assert(snap != InvalidSnapshot); - Assert(snap->snapshot_type == SNAPSHOT_MVCC); + Assert(snap->snapshot_type == SNAPSHOT_MVCC || gp_select_invisible); /* We allocate any XID arrays needed in the same palloc block. 
*/ size = add_size(sizeof(SerializedSnapshotData), @@ -2200,9 +2213,26 @@ EstimateSnapshotSpace(Snapshot snap) size = add_size(size, mul_size(snap->subxcnt, sizeof(TransactionId))); + if (snap->haveDistribSnapshot && snap->distribSnapshotWithLocalMapping.ds.count > 0) + { + size = add_size(size, + mul_size(snap->distribSnapshotWithLocalMapping.ds.count, sizeof(DistributedTransactionId))); + if (snap->distribSnapshotWithLocalMapping.currentLocalXidsCount > 0) + { + size = add_size(size, + mul_size(snap->distribSnapshotWithLocalMapping.currentLocalXidsCount, sizeof(TransactionId))); + } + } + return size; } +Size +EstimateSnapshotDataSpace(void) +{ + return sizeof(SerializedSnapshotData); +} + /* * SerializeSnapshot * Dumps the serialized snapshot (extracted from given snapshot) onto the @@ -2212,6 +2242,9 @@ void SerializeSnapshot(Snapshot snapshot, char *start_address) { SerializedSnapshotData serialized_snapshot; + Size subxipoff = sizeof(SerializedSnapshotData) + snapshot->xcnt * sizeof(TransactionId); + Size dsoff = subxipoff + snapshot->subxcnt * sizeof(TransactionId); + Size dslmoff = dsoff + snapshot->distribSnapshotWithLocalMapping.ds.count * sizeof(DistributedTransactionId); Assert(snapshot->subxcnt >= 0); @@ -2226,6 +2259,19 @@ SerializeSnapshot(Snapshot snapshot, char *start_address) serialized_snapshot.whenTaken = snapshot->whenTaken; serialized_snapshot.lsn = snapshot->lsn; + serialized_snapshot.haveDistribSnapshot = snapshot->haveDistribSnapshot; + + /* Copy fields for cdb distribute snapshot */ + serialized_snapshot.minCachedLocalXid = snapshot->distribSnapshotWithLocalMapping.minCachedLocalXid; + serialized_snapshot.maxCachedLocalXid = snapshot->distribSnapshotWithLocalMapping.maxCachedLocalXid; + serialized_snapshot.currentLocalXidsCount = snapshot->distribSnapshotWithLocalMapping.currentLocalXidsCount; + + serialized_snapshot.ds_xminAllDistributedSnapshots = snapshot->distribSnapshotWithLocalMapping.ds.xminAllDistributedSnapshots; + serialized_snapshot.ds_distribSnapshotId = snapshot->distribSnapshotWithLocalMapping.ds.distribSnapshotId; + serialized_snapshot.ds_xmin = snapshot->distribSnapshotWithLocalMapping.ds.xmin; + serialized_snapshot.ds_xmax = snapshot->distribSnapshotWithLocalMapping.ds.xmax; + serialized_snapshot.ds_count = snapshot->distribSnapshotWithLocalMapping.ds.count; + /* * Ignore the SubXID array if it has overflowed, unless the snapshot was * taken during recovery - in that case, top-level XIDs are in subxip as @@ -2252,12 +2298,26 @@ SerializeSnapshot(Snapshot snapshot, char *start_address) */ if (serialized_snapshot.subxcnt > 0) { - Size subxipoff = sizeof(SerializedSnapshotData) + - snapshot->xcnt * sizeof(TransactionId); - memcpy((TransactionId *) (start_address + subxipoff), snapshot->subxip, snapshot->subxcnt * sizeof(TransactionId)); } + + if (snapshot->haveDistribSnapshot && + snapshot->distribSnapshotWithLocalMapping.ds.count > 0) + { + memcpy((DistributedTransactionId*) (start_address + dsoff), + snapshot->distribSnapshotWithLocalMapping.ds.inProgressXidArray, + snapshot->distribSnapshotWithLocalMapping.ds.count * + sizeof(DistributedTransactionId)); + + if (snapshot->distribSnapshotWithLocalMapping.currentLocalXidsCount > 0) + { + memcpy((TransactionId*) (start_address + dslmoff), + snapshot->distribSnapshotWithLocalMapping.inProgressMappedLocalXids, + snapshot->distribSnapshotWithLocalMapping.currentLocalXidsCount * + sizeof(TransactionId)); + } + } } /* @@ -2271,6 +2331,8 @@ Snapshot RestoreSnapshot(char *start_address) { 
SerializedSnapshotData serialized_snapshot; + Size dsoff = 0; + Size dslmoff = 0; Size size; Snapshot snapshot; TransactionId *serialized_xids; @@ -2284,6 +2346,17 @@ RestoreSnapshot(char *start_address) size = sizeof(SnapshotData) + serialized_snapshot.xcnt * sizeof(TransactionId) + serialized_snapshot.subxcnt * sizeof(TransactionId); + dslmoff = dsoff = size; + + if (serialized_snapshot.haveDistribSnapshot && + serialized_snapshot.ds_count > 0) + { + size += serialized_snapshot.ds_count * + sizeof(DistributedTransactionId); + dslmoff = size; + size += serialized_snapshot.ds_count * + sizeof(TransactionId); + } /* Copy all required fields */ snapshot = (Snapshot) MemoryContextAlloc(TopTransactionContext, size); @@ -2300,6 +2373,18 @@ RestoreSnapshot(char *start_address) snapshot->whenTaken = serialized_snapshot.whenTaken; snapshot->lsn = serialized_snapshot.lsn; snapshot->snapXactCompletionCount = 0; + snapshot->haveDistribSnapshot = serialized_snapshot.haveDistribSnapshot; + + /* Copy all fields for cdb distributed snapshot */ + snapshot->distribSnapshotWithLocalMapping.minCachedLocalXid = serialized_snapshot.minCachedLocalXid; + snapshot->distribSnapshotWithLocalMapping.maxCachedLocalXid = serialized_snapshot.maxCachedLocalXid; + snapshot->distribSnapshotWithLocalMapping.currentLocalXidsCount = serialized_snapshot.currentLocalXidsCount; + + snapshot->distribSnapshotWithLocalMapping.ds.xminAllDistributedSnapshots = serialized_snapshot.ds_xminAllDistributedSnapshots; + snapshot->distribSnapshotWithLocalMapping.ds.distribSnapshotId = serialized_snapshot.ds_distribSnapshotId; + snapshot->distribSnapshotWithLocalMapping.ds.xmin = serialized_snapshot.ds_xmin; + snapshot->distribSnapshotWithLocalMapping.ds.xmax = serialized_snapshot.ds_xmax; + snapshot->distribSnapshotWithLocalMapping.ds.count = serialized_snapshot.ds_count; /* Copy XIDs, if present. */ if (serialized_snapshot.xcnt > 0) @@ -2323,6 +2408,34 @@ RestoreSnapshot(char *start_address) snapshot->active_count = 0; snapshot->copied = true; + snapshot->distribSnapshotWithLocalMapping.ds.inProgressXidArray = NULL; + snapshot->distribSnapshotWithLocalMapping.inProgressMappedLocalXids = NULL; + /* Copy distribSnapshotWithLocalMapping. 
*/ + if (serialized_snapshot.haveDistribSnapshot && + serialized_snapshot.ds_count > 0) + { + snapshot->distribSnapshotWithLocalMapping.ds.inProgressXidArray = + (DistributedTransactionId*) ((char *) snapshot + dsoff); + snapshot->distribSnapshotWithLocalMapping.inProgressMappedLocalXids = + (TransactionId*) ((char *) snapshot + dslmoff); + + memcpy(snapshot->distribSnapshotWithLocalMapping.ds.inProgressXidArray, + (DistributedTransactionId*) (start_address + dsoff), + serialized_snapshot.ds_count * + sizeof(DistributedTransactionId)); + + if (serialized_snapshot.currentLocalXidsCount > 0) + { + memset(snapshot->distribSnapshotWithLocalMapping.inProgressMappedLocalXids, + 0, + serialized_snapshot.ds_count * sizeof(TransactionId)); + memcpy(snapshot->distribSnapshotWithLocalMapping.inProgressMappedLocalXids, + (TransactionId*) (start_address + dslmoff), + serialized_snapshot.currentLocalXidsCount * + sizeof(TransactionId)); + } + } + return snapshot; } diff --git a/src/bin/Makefile b/src/bin/Makefile index 85728aac4e1..8baea7d1404 100644 --- a/src/bin/Makefile +++ b/src/bin/Makefile @@ -22,6 +22,7 @@ SUBDIRS = \ gpfts \ pg_amcheck \ pg_archivecleanup \ + pg_alterckey \ pg_basebackup \ pg_checksums \ pg_config \ diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 9f2cfed2c81..56ab1561b6e 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -65,6 +65,7 @@ #include "catalog/pg_collation_d.h" #include "common/file_perm.h" #include "common/file_utils.h" +#include "common/kmgr_utils.h" #include "common/logging.h" #include "common/restricted_token.h" #include "common/string.h" @@ -144,11 +145,16 @@ static bool noclean = false; static bool noinstructions = false; static bool do_sync = true; static bool sync_only = false; +static bool pass_terminal_fd = false; +static char *term_fd_opt = NULL; +static int file_encryption_method = DISABLED_ENCRYPTION_METHOD; static bool show_setting = false; static bool data_checksums = false; static char *xlog_dir = NULL; static char *str_wal_segment_size_mb = NULL; static int wal_segment_size_mb; +static char *cluster_key_cmd = NULL; +static char *old_key_datadir = NULL; /* internal vars */ @@ -215,6 +221,7 @@ static const char *const subdirs[] = { "global", "pg_wal/archive_status", "pg_commit_ts", + "pg_cryptokeys", "pg_dynshmem", "pg_notify", "pg_serial", @@ -982,12 +989,13 @@ test_config_settings(void) test_buffs = n_buffers; snprintf(cmd, sizeof(cmd), - "\"%s\" --boot -x0 %s %s " + "\"%s\" --boot -x0 %s %s %s " "-c max_connections=%d " "-c shared_buffers=%d " "-c dynamic_shared_memory_type=%s " "< \"%s\" > \"%s\" 2>&1", backend_exec, boot_options, extra_options, + term_fd_opt ? term_fd_opt : "", test_conns, test_buffs, dynamic_shared_memory_type, DEVNULL, DEVNULL); @@ -1021,12 +1029,13 @@ test_config_settings(void) } snprintf(cmd, sizeof(cmd), - "\"%s\" --boot -x0 %s %s " + "\"%s\" --boot -x0 %s %s %s " "-c max_connections=%d " "-c shared_buffers=%d " "-c dynamic_shared_memory_type=%s " "< \"%s\" > \"%s\" 2>&1", backend_exec, boot_options, extra_options, + term_fd_opt ? 
term_fd_opt : "", n_connections, test_buffs, dynamic_shared_memory_type, DEVNULL, DEVNULL); @@ -1231,6 +1240,13 @@ setup_config(void) "password_encryption = md5"); } + if (cluster_key_cmd) + { + snprintf(repltok, sizeof(repltok), "cluster_key_command = '%s'", + escape_quotes(cluster_key_cmd)); + conflines = replace_token(conflines, "#cluster_key_command = ''", repltok); + } + /* * If group access has been enabled for the cluster then it makes sense to * ensure that the log files also allow group access. Otherwise a backup @@ -1453,11 +1469,17 @@ bootstrap_template1(void) unsetenv("PGCLIENTENCODING"); snprintf(cmd, sizeof(cmd), - "\"%s\" --boot -x1 -X %u %s %s %s %s", + "\"%s\" --boot -x1 -X %u %s %s %s %s %s %s %s %s %s", backend_exec, wal_segment_size_mb * (1024 * 1024), data_checksums ? "-k" : "", - boot_options, extra_options, + cluster_key_cmd ? "-K" : "", + cluster_key_cmd ? encryption_methods[file_encryption_method].name : "", + old_key_datadir ? "-u" : "", + old_key_datadir ? old_key_datadir : "", + boot_options, + extra_options, + term_fd_opt ? term_fd_opt : "", debug ? "-d 5" : ""); @@ -2509,7 +2531,7 @@ usage(const char *progname) printf(_(" -T, --text-search-config=CFG\n" " default text search configuration\n")); printf(_(" -U, --username=NAME database superuser name\n")); - printf(_(" -W, --pwprompt prompt for a password for the new superuser\n")); + printf(_(" -W, --pwprompt prompt for the new superuser password\n")); printf(_(" -X, --waldir=WALDIR location for the write-ahead log directory\n")); printf(_(" --wal-segsize=SIZE size of WAL segments, in megabytes\n")); printf(_("\nShared memory allocation:\n")); @@ -2517,14 +2539,22 @@ usage(const char *progname) printf(_(" --shared_buffers=NBUFFERS number of shared buffers; or, amount of memory for\n" " shared buffers if kB/MB/GB suffix is appended\n")); printf(_("\nLess commonly used options:\n")); + printf(_(" -c, --cluster-key-command=COMMAND\n" + " enable cluster file encryption and set command\n" + " to obtain the cluster key\n")); printf(_(" -d, --debug generate lots of debugging output\n")); printf(_(" --discard-caches set debug_discard_caches=1\n")); + printf(_(" -K, --file-encryption-method=METHOD\n" + " cluster file encryption method\n")); printf(_(" -L DIRECTORY where to find the input files\n")); printf(_(" -n, --no-clean do not clean up after errors\n")); printf(_(" -N, --no-sync do not wait for changes to be written safely to disk\n")); + printf(_(" -R, --authprompt prompt for a passphrase or PIN\n")); printf(_(" --no-instructions do not print instructions for next steps\n")); printf(_(" -s, --show show internal settings\n")); printf(_(" -S, --sync-only only sync data directory\n")); + printf(_(" -u, --copy-encryption-keys=DATADIR\n" + " copy the file encryption key from another cluster\n")); printf(_("\nOther options:\n")); printf(_(" -V, --version output version information, then exit\n")); printf(_(" --gp-version output Cloudberry version information, then exit\n")); @@ -3104,6 +3134,23 @@ initialize_data_directory(void) /* Top level PG_VERSION is checked by bootstrapper, so make it first */ write_version_file(NULL); + if (pass_terminal_fd) + { +#ifndef WIN32 + int terminal_fd = open("/dev/tty", O_RDWR, 0); +#else + int terminal_fd = open("CONOUT$", O_RDWR, 0); +#endif + + if (terminal_fd < 0) + { + pg_log_error(_("%s: could not open terminal: %s"), + progname, strerror(errno)); + exit(1); + } + term_fd_opt = psprintf("-R %d", terminal_fd); + } + /* Select suitable configuration settings */ 
set_null_conf("postgresql.conf"); set_null_conf(GP_INTERNAL_AUTO_CONF_FILE_NAME); @@ -3128,8 +3175,9 @@ initialize_data_directory(void) fflush(stdout); snprintf(cmd, sizeof(cmd), - "\"%s\" %s %s template1 >%s", + "\"%s\" %s %s %s template1 >%s", backend_exec, backend_options, extra_options, + term_fd_opt ? term_fd_opt : "", DEVNULL); PG_CMD_OPEN; @@ -3216,8 +3264,11 @@ main(int argc, char *argv[]) {"data-checksums", no_argument, NULL, 'k'}, {"max_connections", required_argument, NULL, 1001}, /*CDB*/ {"shared_buffers", required_argument, NULL, 1003}, /*CDB*/ + {"file-encryption-method", required_argument, NULL, 'K'}, {"allow-group-access", no_argument, NULL, 'g'}, {"discard-caches", no_argument, NULL, 14}, + {"cluster-key-command", required_argument, NULL, 'c'}, + {"copy-encryption-keys", required_argument, NULL, 'u'}, {NULL, 0, NULL, 0} }; @@ -3264,7 +3315,7 @@ main(int argc, char *argv[]) /* process command-line options */ - while ((c = getopt_long(argc, argv, "A:dD:E:gkL:nNsST:U:WX:", long_options, &option_index)) != -1) + while ((c = getopt_long(argc, argv, "A:c:dD:E:gkK:L:nNRsST:u:U:WX:", long_options, &option_index)) != -1) { switch (c) { @@ -3310,6 +3361,28 @@ main(int argc, char *argv[]) case 'N': do_sync = false; break; + case 'R': + pass_terminal_fd = true; + break; + case 'K': + { + int i; + + /* method 0/disabled cannot be specified */ + for (i = DISABLED_ENCRYPTION_METHOD + 1; + i < NUM_ENCRYPTION_METHODS; i++) + if (pg_strcasecmp(optarg, encryption_methods[i].name) == 0) + { + file_encryption_method = i; + break; + } + if (i == NUM_ENCRYPTION_METHODS) + { + fprintf(stderr, _("invalid cluster encryption method, method_name:%s, index:%d \n"), optarg, i); + exit(1); + } + } + break; case 'S': sync_only = true; break; @@ -3346,6 +3419,12 @@ main(int argc, char *argv[]) case 9: pwfilename = pg_strdup(optarg); break; + case 'c': + cluster_key_cmd = pg_strdup(optarg); + break; + case 'u': + old_key_datadir = pg_strdup(optarg); + break; case 's': show_setting = true; break; @@ -3430,6 +3509,35 @@ main(int argc, char *argv[]) exit(1); } +#ifndef USE_OPENSSL + if (cluster_key_cmd) + { + pg_log_error("cluster file encryption is not supported because OpenSSL is not supported by this build"); + exit(1); + } +#endif + + if (old_key_datadir != NULL && cluster_key_cmd == NULL) + { + pg_log_error("copying encryption keys requires the cluster key command to be specified"); + exit(1); + } + + if (file_encryption_method != DISABLED_ENCRYPTION_METHOD && + cluster_key_cmd == NULL) + { + /* + * If we have set the file_encryption_method, but cluster_key_cmd is null, + * we use default cluster key command. 
+ */ + cluster_key_cmd = DEFAULT_CLUSTER_KEY_COMMAND; + } + + /* set the default */ + if (file_encryption_method == DISABLED_ENCRYPTION_METHOD && + cluster_key_cmd != NULL) + file_encryption_method = DEFAULT_ENABLED_ENCRYPTION_METHOD; + check_authmethod_unspecified(&authmethodlocal); check_authmethod_unspecified(&authmethodhost); @@ -3497,6 +3605,11 @@ main(int argc, char *argv[]) else printf(_("Data page checksums are disabled.\n")); + if (cluster_key_cmd) + printf(_("Cluster file encryption is enabled.\n")); + else + printf(_("Cluster file encryption is disabled.\n")); + if (pwprompt || pwfilename) get_su_pwd(); diff --git a/src/bin/pg_alterckey/.gitignore b/src/bin/pg_alterckey/.gitignore new file mode 100644 index 00000000000..4c4f39f2ccc --- /dev/null +++ b/src/bin/pg_alterckey/.gitignore @@ -0,0 +1 @@ +/pg_alterckey diff --git a/src/bin/pg_alterckey/Makefile b/src/bin/pg_alterckey/Makefile new file mode 100644 index 00000000000..2133a4ba063 --- /dev/null +++ b/src/bin/pg_alterckey/Makefile @@ -0,0 +1,38 @@ +#------------------------------------------------------------------------- +# +# Makefile for src/bin/pg_alterckey +# +# Copyright (c) 1998-2021, PostgreSQL Global Development Group +# +# src/bin/pg_alterckey/Makefile +# +#------------------------------------------------------------------------- + +PGFILEDESC = "pg_alterckey - alter the cluster key" +PGAPPICON=win32 + +subdir = src/bin/pg_alterckey +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + $(WIN32RES) \ + pg_alterckey.o + +all: pg_alterckey + +pg_alterckey: $(OBJS) | submake-libpgport + $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X) + +install: all installdirs + $(INSTALL_PROGRAM) pg_alterckey$(X) '$(DESTDIR)$(bindir)/pg_alterckey$(X)' + +installdirs: + $(MKDIR_P) '$(DESTDIR)$(bindir)' + +uninstall: + rm -f '$(DESTDIR)$(bindir)/pg_alterckey$(X)' + +clean distclean maintainer-clean: + rm -f pg_alterckey$(X) $(OBJS) + rm -rf tmp_check diff --git a/src/bin/pg_alterckey/README b/src/bin/pg_alterckey/README new file mode 100644 index 00000000000..6db3739271c --- /dev/null +++ b/src/bin/pg_alterckey/README @@ -0,0 +1,24 @@ +pg_alterckey +============ + +This directory contains the code to generate the pg_alterckey binary. + +Architecture +------------ + +pg_alterckey allows altering of the cluster encryption key (key +encryption key or KEK) which is stored outside of the file system; see +src/backend/crypto/README for more details. This must be done in a +crash-safe manner since the keys are critical to reading an encrypted +cluster. The active data encryption keys (DEK) are encrypted/wrapped by +the KEK and stored in PGDATA/pg_cryptokeys/live as separate files, +currently files 0 and 1. + +This process can be interrupted at anytime; the new execution of +pg_alterckey will repair any previously interrupted execution of +pg_alterckey. + +pg_alterckey should never be run concurrently. A lock file prevents +almost all concurrent execution. pg_alterckey can be run if the +database server is running or stopped, so it can't use database locking +that is only available when the server is running. 
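The rotation the README describes maps onto four filesystem steps, each leaving pg_cryptokeys in a state recover_failure() can classify and repair. The standalone sketch below shows only the step ordering and its crash-safety argument; the paths and helper names are illustrative, not the tool's actual code. rename() within one filesystem is atomic, which is what makes steps 2 and 3 safe restart points.

    /* Standalone illustration; compiles with any C99 compiler. */
    #include <stdio.h>

    /* stand-in for rewrapping every DEK file from "live" into "new" */
    static int
    rewrap_keys(const char *live, const char *new_dir)
    {
        printf("rewrap keys: %s -> %s\n", live, new_dir);
        return 0;
    }

    /* stand-in for recursively removing a key directory */
    static int
    remove_tree(const char *path)
    {
        printf("remove: %s\n", path);
        return 0;
    }

    static int
    rotate(const char *live, const char *new_dir, const char *old_dir)
    {
        /* 1. populate "new"; a crash here leaves "live" intact */
        if (rewrap_keys(live, new_dir) != 0)
            return -1;
        /* 2. demote "live"; a crash here leaves "new" plus "old" (install new) */
        if (rename(live, old_dir) != 0)
            return -1;
        /* 3. promote "new"; a crash here leaves "live" plus "old" (remove old) */
        if (rename(new_dir, live) != 0)
            return -1;
        /* 4. discard the keys wrapped with the old KEK */
        return remove_tree(old_dir);
    }

    int
    main(void)
    {
        return rotate("pg_cryptokeys/live", "pg_cryptokeys/new",
                      "pg_cryptokeys/old") ? 1 : 0;
    }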
diff --git a/src/bin/pg_alterckey/pg_alterckey.c b/src/bin/pg_alterckey/pg_alterckey.c new file mode 100644 index 00000000000..6449e5b896d --- /dev/null +++ b/src/bin/pg_alterckey/pg_alterckey.c @@ -0,0 +1,788 @@ +/*------------------------------------------------------------------------- + * + * pg_alterckey.c + * A utility to change the cluster key (key encryption key, KEK) + * used for cluster file encryption. The KEK wrap data encryption + * keys (DEK). + * + * The theory of operation is fairly simple: + * 1. Create lock file + * 2. Retrieve current and new cluster key using the supplied + * commands. + * 3. Revert any failed alter operation. + * 4. Create a "new" directory + * 5. Unwrap each DEK in "live" using the old KEK + * 6. Wrap each DEK using the new KEK and write it to "new" + * 7. Rename "live" to "old" + * 8. Rename "new" to "live" + * 9. Remove "old" + * 10. Remove lock file + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/bin/pg_alterckey/pg_alterckey.c + * + *------------------------------------------------------------------------- + */ + + +#define FRONTEND 1 + +#include "postgres_fe.h" + +#include +#include +#include +#include + +#include "common/file_perm.h" +#include "common/file_utils.h" +#include "common/restricted_token.h" +#include "common/logging.h" +#include "crypto/kmgr.h" +#include "getopt_long.h" +#include "pg_getopt.h" + +typedef enum +{ + SUCCESS_EXIT = 0, + ERROR_EXIT, + RMDIR_EXIT, + REPAIR_EXIT +} exit_action; + +#define MAX_WRAPPED_KEY_LENGTH 64 + +static int lock_fd = -1; +static bool pass_terminal_fd = false; +int terminal_fd = -1; +static bool repair_mode = false; +static char *old_cluster_key_cmd = NULL, + *new_cluster_key_cmd = NULL; +static char old_cluster_key[KMGR_CLUSTER_KEY_LEN], + new_cluster_key[KMGR_CLUSTER_KEY_LEN]; +static CryptoKey data_key; +unsigned char in_key[MAX_WRAPPED_KEY_LENGTH], + out_key[MAX_WRAPPED_KEY_LENGTH]; +int in_klen, + out_klen; +static char top_path[MAXPGPATH], + pid_path[MAXPGPATH], + live_path[MAXPGPATH], + new_path[MAXPGPATH], + old_path[MAXPGPATH]; + +static char *DataDir = NULL; +static const char *progname; + +static void create_lockfile(void); +static void recover_failure(void); +static void retrieve_cluster_keys(void); +static void bzero_keys_and_exit(exit_action action); +static void reencrypt_data_keys(void); +static void install_new_keys(void); + +static uint64 hex_decode(const char *src, size_t len, char *dst); + + +static void +usage(const char *progname) +{ + printf(_("%s changes the cluster key of a PostgreSQL database cluster.\n\n"), progname); + printf(_("Usage:\n")); + printf(_(" %s [OPTION] old_cluster_key_command new_cluster_key_command [DATADIR]\n"), progname); + printf(_(" %s [repair_option] [DATADIR]\n"), progname); + printf(_("\nOptions:\n")); + printf(_(" -R, --authprompt prompt for a passphrase or PIN\n")); + printf(_(" [-D, --pgdata=]DATADIR data directory\n")); + printf(_(" -V, --version output version information, then exit\n")); + printf(_(" -?, --help show this help, then exit\n")); + printf(_("\nRepair options:\n")); + printf(_(" -r, --repair repair previous failure\n")); + printf(_("\nIf no data directory (DATADIR) is specified, " + "the environment variable PGDATA\nis used.\n\n")); + printf(_("Report bugs to <%s>.\n"), PACKAGE_BUGREPORT); + printf(_("%s home page: <%s>\n"), PACKAGE_NAME, PACKAGE_URL); +} + + +int +main(int argc, char *argv[]) +{ + static struct option 
long_options[] = {
+		{"authprompt", no_argument, NULL, 'R'},
+		{"repair", no_argument, NULL, 'r'},
+		{"pgdata", required_argument, NULL, 'D'},
+		{NULL, 0, NULL, 0}
+	};
+
+	int			c;
+
+	pg_logging_init(argv[0]);
+	set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_alterckey"));
+	progname = get_progname(argv[0]);
+
+	if (argc > 1)
+	{
+		if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
+		{
+			usage(progname);
+			exit(0);
+		}
+		if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
+		{
+			puts("pg_alterckey (PostgreSQL) " PG_VERSION);
+			exit(0);
+		}
+	}
+
+	/* check for -r/-R */
+	while ((c = getopt_long(argc, argv, "D:rR", long_options, NULL)) != -1)
+	{
+		switch (c)
+		{
+			case 'D':
+				DataDir = optarg;
+				break;
+			case 'r':
+				repair_mode = true;
+				break;
+			case 'R':
+				pass_terminal_fd = true;
+				break;
+			default:
+				fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
+				exit(1);
+		}
+	}
+
+	if (!repair_mode)
+	{
+		/* get cluster key commands */
+		if (optind < argc)
+			old_cluster_key_cmd = argv[optind++];
+		else
+		{
+			pg_log_error("missing old_cluster_key_command");
+			fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
+					progname);
+			exit(1);
+		}
+
+		if (optind < argc)
+			new_cluster_key_cmd = argv[optind++];
+		else
+		{
+			pg_log_error("missing new_cluster_key_command");
+			fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
+					progname);
+			exit(1);
+		}
+	}
+
+	if (DataDir == NULL)
+	{
+		if (optind < argc)
+			DataDir = argv[optind++];
+		else
+			DataDir = getenv("PGDATA");
+
+		/* If no DataDir was specified, and none could be found, error out */
+		if (DataDir == NULL)
+		{
+			pg_log_error("no data directory specified");
+			fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
+			exit(1);
+		}
+	}
+
+	/* Complain if any arguments remain */
+	if (optind < argc)
+	{
+		pg_log_error("too many command-line arguments (first is \"%s\")",
+					 argv[optind]);
+		fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
+				progname);
+		exit(1);
+	}
+
+	/*
+	 * Disallow running as root because we create directories in PGDATA
+	 */
+#ifndef WIN32
+	if (geteuid() == 0)
+	{
+		pg_log_error("%s: cannot be run as root\n"
+					 "Please log in (using, e.g., \"su\") as the "
+					 "(unprivileged) user that will\n"
+					 "own the server process.\n",
+					 progname);
+		exit(1);
+	}
+#endif
+
+	get_restricted_token();
+
+	/* Set mask based on PGDATA permissions */
+	if (!GetDataDirectoryCreatePerm(DataDir))
+	{
+		pg_log_error("could not read permissions of directory \"%s\": %m",
+					 DataDir);
+		exit(1);
+	}
+
+	umask(pg_mode_mask);
+
+	snprintf(top_path, sizeof(top_path), "%s/%s", DataDir, KMGR_DIR);
+	snprintf(pid_path, sizeof(pid_path), "%s/%s", DataDir, KMGR_DIR_PID);
+	snprintf(live_path, sizeof(live_path), "%s/%s", DataDir, LIVE_KMGR_DIR);
+	snprintf(new_path, sizeof(new_path), "%s/%s", DataDir, NEW_KMGR_DIR);
+	snprintf(old_path, sizeof(old_path), "%s/%s", DataDir, OLD_KMGR_DIR);
+
+	create_lockfile();
+
+	recover_failure();
+
+	if (!repair_mode)
+	{
+		retrieve_cluster_keys();
+		reencrypt_data_keys();
+		install_new_keys();
+	}
+
+#ifndef WIN32 + /* remove file system reference to file */ + if (unlink(pid_path) < 0) + { + pg_log_error("could not delete lock file \"%s\": %m", KMGR_DIR_PID); + exit(1); + } +#endif + + close(lock_fd); + + bzero_keys_and_exit(SUCCESS_EXIT); +} + +/* Create a lock file; this prevents almost all cases of concurrent access */ +void +create_lockfile(void) +{ + struct stat buffer; + char lock_pid_str[20]; + + if (stat(top_path, &buffer) != 0 || !S_ISDIR(buffer.st_mode)) + { + pg_log_error("cluster file encryption directory \"%s\" is missing; is it enabled?", KMGR_DIR_PID); + fprintf(stderr, _("Exiting with no changes made.\n")); + exit(1); + } + + /* Does a lockfile exist? */ + if ((lock_fd = open(pid_path, O_RDONLY, 0)) != -1) + { + int lock_pid; + int len; + + /* read the PID */ + if ((len = read(lock_fd, lock_pid_str, sizeof(lock_pid_str) - 1)) == 0) + { + pg_log_error("cannot read pid from lock file \"%s\": %m", KMGR_DIR_PID); + fprintf(stderr, _("Exiting with no changes made.\n")); + exit(1); + } + lock_pid_str[len] = '\0'; + + if ((lock_pid = atoi(lock_pid_str)) == 0) + { + pg_log_error("invalid pid in lock file \"%s\": %m", KMGR_DIR_PID); + fprintf(stderr, _("Exiting with no changes made.\n")); + exit(1); + } + + /* Is the PID running? */ + if (kill(lock_pid, 0) == 0) + { + pg_log_error("active process %d currently holds a lock on this operation, recorded in \"%s\"", + lock_pid, KMGR_DIR_PID); + fprintf(stderr, _("Exiting with no changes made.\n")); + exit(1); + } + + close(lock_fd); + + if (repair_mode) + printf("old lock file removed\n"); + + /* ---------- + * pid is no longer running, so remove the lock file. + * This is not 100% safe from concurrent access, e.g.: + * + * process 1 exits and leaves stale lock file + * process 2 checks stale lock file of process 1 + * process 3 checks stale lock file of process 1 + * process 2 remove the lock file of process 1 + * process 4 creates a lock file + * process 3 remove the lock file of process 4 + * process 5 creates a lock file + * + * The sleep(2) helps with this since it reduces the likelihood + * a process that did an unlock will interfere with another unlock + * process. We could ask users to remove the lock, but that seems + * even more error-prone, especially since this might happen + * on server start. Many PG tools seem to have problems with + * concurrent access. + * ---------- + */ + unlink(pid_path); + + /* Sleep to reduce the likelihood of concurrent unlink */ + pg_usleep(2000000L); /* 2 seconds */ + } + + /* Create our own lockfile? */ +#ifndef WIN32 + lock_fd = open(pid_path, O_RDWR | O_CREAT | O_EXCL, pg_file_create_mode); +#else + /* delete on close */ + lock_fd = open(pid_path, O_RDWR | O_CREAT | O_EXCL | O_TEMPORARY, + pg_file_create_mode); +#endif + + if (lock_fd == -1) + { + if (errno == EEXIST) + pg_log_error("an active process currently holds a lock on this operation, recorded in \"%s\"", + KMGR_DIR_PID); + else + pg_log_error("unable to create lock file \"%s\": %m", KMGR_DIR_PID); + fprintf(stderr, _("Exiting with no changes made.\n")); + exit(1); + } + + snprintf(lock_pid_str, sizeof(lock_pid_str), "%d\n", getpid()); + if (write(lock_fd, lock_pid_str, strlen(lock_pid_str)) != strlen(lock_pid_str)) + { + pg_log_error("could not write pid to lock file \"%s\": %m", KMGR_DIR_PID); + fprintf(stderr, _("Exiting with no changes made.\n")); + exit(1); + } +} + +/* + * ---------- + * recover_failure + * + * A previous pg_alterckey might have failed, so it might need recovery. + * The normal operation is: + * 1. 
+ * 1. reencrypt LIVE_KMGR_DIR -> NEW_KMGR_DIR
+ * 2. rename LIVE_KMGR_DIR -> OLD_KMGR_DIR
+ * 3. rename NEW_KMGR_DIR -> LIVE_KMGR_DIR
+ * 4. remove OLD_KMGR_DIR
+ *
+ * There are eight possible directory configurations:
+ *
+ *                        LIVE_KMGR_DIR  NEW_KMGR_DIR  OLD_KMGR_DIR
+ *
+ * Normal:
+ * 0. normal                    X
+ * 1. remove new                X              X
+ * 2. install new                              X              X
+ * 3. remove old                X                             X
+ *
+ * Abnormal:
+ *    fatal
+ *    restore old                                             X
+ *    install new                              X
+ *    remove old and new        X              X              X
+ *
+ * We don't handle the abnormal cases, just report an error.
+ * ----------
+ */
+static void
+recover_failure(void)
+{
+	struct stat buffer;
+	bool		is_live,
+				is_new,
+				is_old;
+
+	is_live = !stat(live_path, &buffer);
+	is_new = !stat(new_path, &buffer);
+	is_old = !stat(old_path, &buffer);
+
+	/* normal #0 */
+	if (is_live && !is_new && !is_old)
+	{
+		if (repair_mode)
+			printf("repair unnecessary\n");
+		return;
+	}
+	/* remove new #1 */
+	else if (is_live && is_new && !is_old)
+	{
+		if (!rmtree(new_path, true))
+		{
+			pg_log_error("unable to remove new directory \"%s\": %m", NEW_KMGR_DIR);
+			fprintf(stderr, _("Exiting with no changes made.\n"));
+			exit(1);
+		}
+		printf(_("Removed files created during previously aborted alter operation\n"));
+		return;
+	}
+	/* install new #2 */
+	else if (!is_live && is_new && is_old)
+	{
+		if (rename(new_path, live_path) != 0)
+		{
+			pg_log_error("unable to rename directory \"%s\" to \"%s\": %m",
+						 NEW_KMGR_DIR, LIVE_KMGR_DIR);
+			fprintf(stderr, _("Exiting with no changes made.\n"));
+			exit(1);
+		}
+		printf(_("Installed new cluster key supplied in previous alter operation\n"));
+		return;
+	}
+	/* remove old #3 */
+	else if (is_live && !is_new && is_old)
+	{
+		if (!rmtree(old_path, true))
+		{
+			pg_log_error("unable to remove old directory \"%s\": %m", OLD_KMGR_DIR);
+			fprintf(stderr, _("Exiting with no changes made.\n"));
+			exit(1);
+		}
+		printf(_("Removed old files invalidated during previous alter operation\n"));
+		return;
+	}
+	else
+	{
+		pg_log_error("cluster file encryption directory \"%s\" is in an abnormal state and cannot be processed",
+					 KMGR_DIR);
+		fprintf(stderr, _("Exiting with no changes made.\n"));
+		exit(1);
+	}
+}
+
+/* Retrieve old and new cluster keys */
+void
+retrieve_cluster_keys(void)
+{
+	int			cluster_key_len;
+	char		cluster_key_hex[ALLOC_KMGR_CLUSTER_KEY_LEN];
+
+	/*
+	 * If we have been asked to pass an open file descriptor for the user's
+	 * terminal to the commands, set one up.
+	 */
+	if (pass_terminal_fd)
+	{
+#ifndef WIN32
+		terminal_fd = open("/dev/tty", O_RDWR, 0);
+#else
+		terminal_fd = open("CONOUT$", O_RDWR, 0);
+#endif
+		if (terminal_fd < 0)
+		{
+			pg_log_error("could not open terminal: %m");
+			exit(1);
+		}
+	}
+
+	/* Get old key encryption key from the cluster key command */
+	cluster_key_len = kmgr_run_cluster_key_command(old_cluster_key_cmd,
+												   (char *) cluster_key_hex,
+												   ALLOC_KMGR_CLUSTER_KEY_LEN,
+												   live_path, terminal_fd);
+	if (hex_decode(cluster_key_hex, cluster_key_len,
+				   (char *) old_cluster_key) !=
+		KMGR_CLUSTER_KEY_LEN)
+	{
+		pg_log_error("cluster key must be %d hex bytes", KMGR_CLUSTER_KEY_LEN);
+		bzero_keys_and_exit(ERROR_EXIT);
+	}
+
+	/*
+	 * Create new key directory here in case the new cluster key command
+	 * needs it to exist.
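	 *
	 * (Editor's note, illustrative: this directory is what gets substituted
	 * for %d when the new-key command runs, just as live_path was
	 * substituted into the old-key command above, so a hypothetical command
	 * string such as "sh ckey.sh %d" would receive the new directory here.)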
+	 */
+	if (mkdir(new_path, pg_dir_create_mode) != 0)
+	{
+		pg_log_error("unable to create new cluster key directory \"%s\": %m", NEW_KMGR_DIR);
+		bzero_keys_and_exit(ERROR_EXIT);
+	}
+
+	/* Get new key */
+	cluster_key_len = kmgr_run_cluster_key_command(new_cluster_key_cmd,
+												   (char *) cluster_key_hex,
+												   ALLOC_KMGR_CLUSTER_KEY_LEN,
+												   new_path, terminal_fd);
+	if (hex_decode(cluster_key_hex, cluster_key_len,
+				   (char *) new_cluster_key) !=
+		KMGR_CLUSTER_KEY_LEN)
+	{
+		pg_log_error("cluster key must be %d hex bytes", KMGR_CLUSTER_KEY_LEN);
+		bzero_keys_and_exit(ERROR_EXIT);
+	}
+
+	if (pass_terminal_fd)
+		close(terminal_fd);
+
+	/* output newline */
+	puts("");
+
+	if (memcmp(old_cluster_key, new_cluster_key, KMGR_CLUSTER_KEY_LEN) == 0)
+	{
+		pg_log_error("cluster keys are identical, exiting");
+		bzero_keys_and_exit(RMDIR_EXIT);
+	}
+}
+
+/* Decrypt keys wrapped with the old cluster key and rewrap them with the new one */
+void
+reencrypt_data_keys(void)
+{
+	PgCipherCtx *old_ctx,
+			   *new_ctx;
+
+	old_ctx = pg_cipher_ctx_create(PG_CIPHER_AES_KWP,
+								   (unsigned char *) old_cluster_key,
+								   KMGR_CLUSTER_KEY_LEN, false);
+	if (!old_ctx)
+	{
+		pg_log_error("could not initialize encryption context");
+		bzero_keys_and_exit(RMDIR_EXIT);
+	}
+
+	new_ctx = pg_cipher_ctx_create(PG_CIPHER_AES_KWP,
+								   (unsigned char *) new_cluster_key,
+								   KMGR_CLUSTER_KEY_LEN, true);
+	if (!new_ctx)
+	{
+		pg_log_error("could not initialize encryption context");
+		bzero_keys_and_exit(RMDIR_EXIT);
+	}
+
+	for (int id = 0; id < KMGR_NUM_DATA_KEYS; id++)
+	{
+		char		src_path[MAXPGPATH],
+					dst_path[MAXPGPATH];
+		int			src_fd,
+					dst_fd;
+		int			len;
+		struct stat st;
+
+		CryptoKeyFilePath(src_path, live_path, id);
+		CryptoKeyFilePath(dst_path, new_path, id);
+
+		if ((src_fd = open(src_path, O_RDONLY | PG_BINARY, 0)) < 0)
+		{
+			pg_log_error("could not open file \"%s\": %m", src_path);
+			bzero_keys_and_exit(RMDIR_EXIT);
+		}
+
+		if ((dst_fd = open(dst_path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
+						   pg_file_create_mode)) < 0)
+		{
+			pg_log_error("could not open file \"%s\": %m", dst_path);
+			bzero_keys_and_exit(RMDIR_EXIT);
+		}
+
+		if (fstat(src_fd, &st))
+		{
+			pg_log_error("could not stat file \"%s\": %m", src_path);
+			bzero_keys_and_exit(RMDIR_EXIT);
+		}
+
+		in_klen = st.st_size;
+
+		if (in_klen > MAX_WRAPPED_KEY_LENGTH)
+		{
+			pg_log_error("invalid wrapped key length (%d) for file \"%s\"", in_klen, src_path);
+			bzero_keys_and_exit(RMDIR_EXIT);
+		}
+
+		/* Read the source key */
+		len = read(src_fd, in_key, in_klen);
+		if (len != in_klen)
+		{
+			if (len < 0)
+				pg_log_error("could not read file \"%s\": %m", src_path);
+			else
+				pg_log_error("could not read file \"%s\": read %d of %u",
+							 src_path, len, in_klen);
+			bzero_keys_and_exit(RMDIR_EXIT);
+		}
+
+		/* decrypt with old key */
+		if (!kmgr_unwrap_data_key(old_ctx, in_key, in_klen, &data_key))
+		{
+			pg_log_error("incorrect old key specified");
+			bzero_keys_and_exit(RMDIR_EXIT);
+		}
+
+		if (KMGR_MAX_KEY_LEN_BYTES + pg_cipher_blocksize(new_ctx) > MAX_WRAPPED_KEY_LENGTH)
+		{
+			pg_log_error("invalid max wrapped key length");
+			bzero_keys_and_exit(RMDIR_EXIT);
+		}
+
+		/* encrypt with new key */
+		if (!kmgr_wrap_data_key(new_ctx, &data_key, out_key, &out_klen))
+		{
+			pg_log_error("could not encrypt new key");
+			bzero_keys_and_exit(RMDIR_EXIT);
+		}
+
+		/* Write the new wrapped key */
+		len = write(dst_fd, out_key, out_klen);
+		if (len != out_klen)
+		{
+			pg_log_error("could not write file \"%s\"", dst_path);
+			bzero_keys_and_exit(RMDIR_EXIT);
+		}
+
+		close(src_fd);
+		close(dst_fd);
+	}
+
+	/* Free the cipher contexts */
+	pg_cipher_ctx_free(old_ctx);
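/*
 * Editor's note (illustrative, not part of the patch): AES key wrap with
 * padding (KWP, RFC 5649) emits more bytes than it consumes, which is why
 * out_klen can exceed in_klen in the loop above and why the sanity check
 * compares KMGR_MAX_KEY_LEN_BYTES plus one cipher block against
 * MAX_WRAPPED_KEY_LENGTH instead of the plain key length.
 */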
pg_cipher_ctx_free(new_ctx); +} + +/* Install new keys */ +void +install_new_keys(void) +{ + /* + * Issue fsync's so key rotation is less likely to be left in an + * inconsistent state in case of a crash during this operation. + */ + + if (rename(live_path, old_path) != 0) + { + pg_log_error("unable to rename directory \"%s\" to \"%s\": %m", + LIVE_KMGR_DIR, OLD_KMGR_DIR); + bzero_keys_and_exit(RMDIR_EXIT); + } + fsync_dir_recurse(top_path); + + if (rename(new_path, live_path) != 0) + { + pg_log_error("unable to rename directory \"%s\" to \"%s\": %m", + NEW_KMGR_DIR, LIVE_KMGR_DIR); + bzero_keys_and_exit(REPAIR_EXIT); + } + fsync_dir_recurse(top_path); + + if (!rmtree(old_path, true)) + { + pg_log_error("unable to remove old directory \"%s\": %m", OLD_KMGR_DIR); + bzero_keys_and_exit(REPAIR_EXIT); + } + fsync_dir_recurse(top_path); +} + +/* Erase memory and exit */ +void +bzero_keys_and_exit(exit_action action) +{ + explicit_bzero(old_cluster_key, sizeof(old_cluster_key)); + explicit_bzero(new_cluster_key, sizeof(new_cluster_key)); + + explicit_bzero(in_key, sizeof(in_key)); + explicit_bzero(&data_key, sizeof(data_key)); + explicit_bzero(out_key, sizeof(out_key)); + + if (action == RMDIR_EXIT) + { + if (!rmtree(new_path, true)) + pg_log_error("unable to remove new directory \"%s\": %m", NEW_KMGR_DIR); + printf("Re-running pg_alterckey to repair might be needed before the next server start\n"); + exit(1); + } + else if (action == REPAIR_EXIT) + { + unlink(pid_path); + printf("Re-running pg_alterckey to repair might be needed before the next server start\n"); + } + + /* return 0 or 1 */ + exit(action != SUCCESS_EXIT); +} + +/* + * HEX + */ + +static const int8 hexlookup[128] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, + -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, +}; + +static inline char +get_hex(const char *cp) +{ + unsigned char c = (unsigned char) *cp; + int res = -1; + + if (c < 127) + res = hexlookup[c]; + + if (res < 0) + pg_log_fatal("invalid hexadecimal digit: \"%s\"",cp); + + return (char) res; +} + +static uint64 +hex_decode(const char *src, size_t len, char *dst) +{ + const char *s, + *srcend; + char v1, + v2, + *p; + + srcend = src + len; + s = src; + p = dst; + while (s < srcend) + { + if (*s == ' ' || *s == '\n' || *s == '\t' || *s == '\r') + { + s++; + continue; + } + v1 = get_hex(s) << 4; + s++; + if (s >= srcend) + pg_log_fatal("invalid hexadecimal data: odd number of digits"); + + v2 = get_hex(s); + s++; + *p++ = v1 | v2; + } + + return p - dst; +} diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index 1bc015d5dc5..27c5b48e762 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -25,6 +25,7 @@ #include "access/xlog_internal.h" #include "catalog/pg_control.h" #include "common/controldata_utils.h" +#include "common/kmgr_utils.h" #include "common/logging.h" #include "getopt_long.h" #include "pg_getopt.h" @@ -339,5 +340,7 @@ main(int argc, char *argv[]) ControlFile->data_checksum_version); printf(_("Mock authentication nonce: %s\n"), mock_auth_nonce_str); + 
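	/*
	 * Editor's note (not part of the patch): the integer stored in
	 * pg_control is an index into the encryption_methods[] array defined in
	 * src/common/kmgr_utils.c, which is why new methods may only be appended
	 * to the end of that array, as its own comment notes.
	 */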
printf(_("File encryption method: %s\n"), + encryption_methods[ControlFile->file_encryption_method].name); return 0; } diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c index 7d48510ac4f..ef3b5bfee20 100644 --- a/src/bin/pg_ctl/pg_ctl.c +++ b/src/bin/pg_ctl/pg_ctl.c @@ -83,6 +83,7 @@ typedef enum static bool do_wait = true; static int wait_seconds = DEFAULT_WAIT; static bool wait_seconds_arg = false; +static bool pass_terminal_fd = false; static bool silent_mode = false; static ShutdownMode shutdown_mode = FAST_MODE; static int sig = SIGINT; /* default */ @@ -456,7 +457,7 @@ static pgpid_t start_postmaster(void) { char launcher[MAXPGPATH] = ""; - char *cmd; + char *cmd, *term_fd_opt = NULL; #ifndef WIN32 pgpid_t pm_pid; @@ -481,6 +482,19 @@ start_postmaster(void) /* fork succeeded, in child */ + if (pass_terminal_fd) + { + int terminal_fd = open("/dev/tty", O_RDWR, 0); + + if (terminal_fd < 0) + { + write_stderr(_("%s: could not open terminal: %s\n"), + progname, strerror(errno)); + exit(1); + } + term_fd_opt = psprintf(" -R %d", terminal_fd); + } + /* * If possible, detach the postmaster process from the launching process * group and make it a group leader, so that it doesn't get signaled along @@ -509,12 +523,14 @@ start_postmaster(void) * has the same PID as the current child process. */ if (log_file != NULL) - cmd = psprintf("exec %s \"%s\" %s%s < \"%s\" >> \"%s\" 2>&1", + cmd = psprintf("exec %s \"%s\" %s%s%s < \"%s\" >> \"%s\" 2>&1", launcher, exec_path, pgdata_opt, post_opts, + term_fd_opt ? term_fd_opt : "", DEVNULL, log_file); else - cmd = psprintf("exec %s \"%s\" %s%s < \"%s\" 2>&1", - launcher, exec_path, pgdata_opt, post_opts, DEVNULL); + cmd = psprintf("exec %s \"%s\" %s%s%s < \"%s\" 2>&1", + launcher, exec_path, pgdata_opt, post_opts, + term_fd_opt ? term_fd_opt : "", DEVNULL); (void) execl("/bin/sh", "/bin/sh", "-c", cmd, (char *) NULL); @@ -535,6 +551,21 @@ start_postmaster(void) PROCESS_INFORMATION pi; const char *comspec; + if (pass_terminal_fd) + { + /* Hopefully we can read and write CONOUT, see simple_prompt() XXX */ + /* Do CreateRestrictedProcess() children even inherit open file descriptors? XXX */ + int terminal_fd = open("CONOUT$", O_RDWR, 0); + + if (terminal_fd < 0) + { + write_stderr(_("%s: could not open terminal: %s\n"), + progname, strerror(errno)); + exit(1); + } + term_fd_opt = psprintf(" -R %d", terminal_fd); + } + /* Find CMD.EXE location using COMSPEC, if it's set */ comspec = getenv("COMSPEC"); if (comspec == NULL) @@ -575,12 +606,14 @@ start_postmaster(void) else close(fd); - cmd = psprintf("\"%s\" /C \"\"%s\" %s%s < \"%s\" >> \"%s\" 2>&1\"", - comspec, exec_path, pgdata_opt, post_opts, DEVNULL, log_file); + cmd = psprintf("\"%s\" /C \"\"%s\" %s%s%s < \"%s\" >> \"%s\" 2>&1\"", + comspec, exec_path, pgdata_opt, post_opts, + term_fd_opt ? term_fd_opt : "", DEVNULL, log_file); } else - cmd = psprintf("\"%s\" /C \"\"%s\" %s%s < \"%s\" 2>&1\"", - comspec, exec_path, pgdata_opt, post_opts, DEVNULL); + cmd = psprintf("\"%s\" /C \"\"%s\" %s%s%s < \"%s\" 2>&1\"", + comspec, exec_path, pgdata_opt, post_opts, + term_fd_opt ? 
term_fd_opt : "", DEVNULL); if (!CreateRestrictedProcess(cmd, &pi, false)) { @@ -734,7 +767,8 @@ wait_for_postmaster_start(pgpid_t pm_pid, bool do_checkpoint) } else #endif - print_msg("."); + if (!pass_terminal_fd) + print_msg("."); } pg_usleep(USEC_PER_SEC / WAITS_PER_SEC); @@ -2297,6 +2331,7 @@ do_help(void) printf(_(" -o, --options=OPTIONS command line options to pass to postgres\n" " (PostgreSQL server executable) or initdb\n")); printf(_(" -p PATH-TO-POSTGRES normally not necessary\n")); + printf(_(" -R, --authprompt prompt for a passphrase or PIN\n")); printf(_("\nOptions for stop or restart:\n")); printf(_(" -m, --mode=MODE MODE can be \"smart\", \"fast\", or \"immediate\"\n")); @@ -2491,6 +2526,7 @@ main(int argc, char **argv) {"mode", required_argument, NULL, 'm'}, {"pgdata", required_argument, NULL, 'D'}, {"options", required_argument, NULL, 'o'}, + {"authprompt", no_argument, NULL, 'R'}, {"silent", no_argument, NULL, 's'}, {"timeout", required_argument, NULL, 't'}, {"core-files", no_argument, NULL, 'c'}, @@ -2569,7 +2605,7 @@ main(int argc, char **argv) /* process command-line options */ while (optind < argc) { - while ((c = getopt_long(argc, argv, "cD:e:l:m:N:o:p:P:sS:t:U:wW", + while ((c = getopt_long(argc, argv, "cD:e:l:m:N:o:p:P:RsS:t:U:wW", long_options, NULL)) != -1) { switch (c) @@ -2621,6 +2657,9 @@ main(int argc, char **argv) case 'P': register_password = pg_strdup(optarg); break; + case 'R': + pass_terminal_fd = true; + break; case 's': silent_mode = true; break; diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index 4b11eb9fed1..3c964e09892 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -53,6 +53,7 @@ #include "common/controldata_utils.h" #include "common/fe_memutils.h" #include "common/file_perm.h" +#include "common/kmgr_utils.h" #include "common/logging.h" #include "common/restricted_token.h" #include "common/string.h" @@ -982,6 +983,8 @@ PrintControlValues(bool guessed) (ControlFile.float8ByVal ? _("by value") : _("by reference"))); printf(_("Data page checksum version: %u\n"), ControlFile.data_checksum_version); + printf(_("File encryption method: %s\n"), + encryption_methods[ControlFile.file_encryption_method].name); } diff --git a/src/bin/pg_rewind/filemap.c b/src/bin/pg_rewind/filemap.c index 99897a841d4..8162b45cf50 100644 --- a/src/bin/pg_rewind/filemap.c +++ b/src/bin/pg_rewind/filemap.c @@ -29,6 +29,7 @@ #include "catalog/catalog.h" #include "catalog/pg_tablespace_d.h" #include "common/hashfn.h" +#include "common/kmgr_utils.h" #include "common/string.h" #include "datapagemap.h" #include "filemap.h" @@ -109,6 +110,13 @@ static const char *excludeDirContents[] = /* Contents removed on startup, see AsyncShmemInit(). */ "pg_notify", + /* + * Skip cryptographic keys. It's generally not a good idea to copy the + * cryptographic keys from source database because these might use + * different cluster key. + */ + //KMGR_DIR, + /* * Old contents are loaded for possible debugging but are not required for * normal operation, see SerialInit(). 
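Editor's note on the -R plumbing above: because the postmaster is started with stdin and stdout redirected, pg_ctl opens the user's terminal itself and passes the descriptor number on the server command line. A minimal sketch of the receiving side, under assumed names (handle_R_option and prompt_for_pin are hypothetical; the server's real option handling is not shown in this hunk):

#include <stdio.h>
#include <stdlib.h>

/* Illustrative only: consume a terminal descriptor passed as "-R <fd>". */
static int	terminal_fd = -1;

static void
handle_R_option(const char *fd_arg)
{
	terminal_fd = atoi(fd_arg);		/* the fd is inherited across exec() */
}

static void
prompt_for_pin(char *buf, size_t buflen)
{
	FILE	   *tty = fdopen(terminal_fd, "r+");

	if (tty == NULL)
		return;					/* no usable terminal */
	fputs("Enter PIN: ", tty);
	fflush(tty);
	if (fgets(buf, buflen, tty) == NULL)
		buf[0] = '\0';
}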
diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c index f223b49a576..dc78788ca71 100644 --- a/src/bin/pg_upgrade/check.c +++ b/src/bin/pg_upgrade/check.c @@ -10,6 +10,8 @@ #include "postgres_fe.h" #include "catalog/pg_authid_d.h" +#include "catalog/pg_collation.h" +#include "common/kmgr_utils.h" #include "fe_utils/string_utils.h" #include "mb/pg_wchar.h" #include "pg_upgrade.h" @@ -29,6 +31,7 @@ static void check_for_composite_data_type_usage(ClusterInfo *cluster); static void check_for_reg_data_type_usage(ClusterInfo *cluster); static void check_for_jsonb_9_4_usage(ClusterInfo *cluster); static void check_for_pg_role_prefix(ClusterInfo *cluster); +static void check_for_cluster_key_failure(ClusterInfo *cluster); static void check_for_new_tablespace_dir(ClusterInfo *new_cluster); static void check_for_user_defined_encoding_conversions(ClusterInfo *cluster); static char *get_canonical_locale_name(int category, const char *locale); @@ -162,6 +165,9 @@ check_and_dump_old_cluster(bool live_check, char **sequence_script_file_name) if (GET_MAJOR_VERSION(old_cluster.major_version) <= 905) check_for_pg_role_prefix(&old_cluster); + if (GET_MAJOR_VERSION(old_cluster.major_version) >= 1400) + check_for_cluster_key_failure(&old_cluster); + if (GET_MAJOR_VERSION(old_cluster.major_version) == 904 && old_cluster.controldata.cat_ver < JSONB_FORMAT_CHANGE_CAT_VER) check_for_jsonb_9_4_usage(&old_cluster); @@ -246,6 +252,9 @@ check_new_cluster(void) check_loadable_libraries(); + if (GET_MAJOR_VERSION(old_cluster.major_version) >= 1400) + check_for_cluster_key_failure(&new_cluster); + switch (user_opts.transfer_mode) { case TRANSFER_MODE_CLONE: @@ -1523,6 +1532,32 @@ check_for_user_defined_encoding_conversions(ClusterInfo *cluster) } +/* + * check_for_cluster_key_failure() + * + * Make sure there was no unrepaired pg_alterckey failure + */ +static void +check_for_cluster_key_failure(ClusterInfo *cluster) +{ + struct stat buffer; + + if (stat (KMGR_DIR_PID, &buffer) == 0) + { + if (cluster == &old_cluster) + pg_fatal("The source cluster had a pg_alterckey failure that needs repair or\n" + "pg_alterckey is running. Run pg_alterckey --repair or wait for it\n" + "to complete.\n"); + else + pg_fatal("The target cluster had a pg_alterckey failure that needs repair or\n" + "pg_alterckey is running. 
Run pg_alterckey --repair or wait for it\n" + "to complete.\n"); + } + + check_ok(); +} + + /* * get_canonical_locale_name * diff --git a/src/bin/pg_upgrade/controldata.c b/src/bin/pg_upgrade/controldata.c index 34cafd943d4..e80d0dd23fc 100644 --- a/src/bin/pg_upgrade/controldata.c +++ b/src/bin/pg_upgrade/controldata.c @@ -9,11 +9,17 @@ #include "postgres_fe.h" +#include #include #include "pg_upgrade.h" #include "greenplum/pg_upgrade_greenplum.h" +#include "access/xlog_internal.h" +#include "common/controldata_utils.h" +#include "common/file_utils.h" +#include "common/kmgr_utils.h" + /* * get_control_data() * @@ -62,6 +68,7 @@ get_control_data(ClusterInfo *cluster, bool live_check) bool got_date_is_int = false; bool got_data_checksum_version = false; bool got_cluster_state = false; + int got_file_encryption_method = false; char *lc_collate = NULL; char *lc_ctype = NULL; char *lc_monetary = NULL; @@ -217,6 +224,13 @@ get_control_data(ClusterInfo *cluster, bool live_check) got_data_checksum_version = true; } + /* Only in <= 14 */ + if (GET_MAJOR_VERSION(cluster->major_version) <= 1400) + { + cluster->controldata.file_encryption_method = DISABLED_ENCRYPTION_METHOD; + got_file_encryption_method = true; + } + /* we have the result of cmd in "output". so parse it line by line now */ while (fgets(bufin, sizeof(bufin), output)) { @@ -574,6 +588,18 @@ get_control_data(ClusterInfo *cluster, bool live_check) cluster->controldata.data_checksum_version = str2uint(p); got_data_checksum_version = true; } + else if ((p = strstr(bufin, "Cluster file encryption method:")) != NULL) + { + p = strchr(p, ':'); + + if (p == NULL || strlen(p) <= 1) + pg_fatal("%d: controldata retrieval problem\n", __LINE__); + + p++; /* remove ':' char */ + /* used later for contrib check */ + cluster->controldata.file_encryption_method = atoi(p); + got_file_encryption_method = true; + } } pclose(output); @@ -652,7 +678,8 @@ get_control_data(ClusterInfo *cluster, bool live_check) !got_index || /* !got_toast || */ (!got_large_object && cluster->controldata.ctrl_ver >= LARGE_OBJECT_SIZE_PG_CONTROL_VER) || - !got_date_is_int || !got_data_checksum_version) + !got_date_is_int || !got_data_checksum_version || + !got_file_encryption_method) { if (cluster == &old_cluster) pg_log(PG_REPORT, @@ -730,7 +757,11 @@ get_control_data(ClusterInfo *cluster, bool live_check) if (!got_data_checksum_version) pg_log(PG_REPORT, " data checksum version\n"); - pg_fatal("Cannot continue without required control information, terminating\n"); + /* value added in Postgres 14 */ + if (!got_file_encryption_method) + pg_log(PG_REPORT, " file encryption method\n"); + + pg_fatal("Cannot continue without required control information, terminating"); } } @@ -828,6 +859,13 @@ check_control_data(ControlData *oldctrl, else if (oldctrl->data_checksum_version != newctrl->data_checksum_version && is_checksum_mode(CHECKSUM_NONE)) pg_fatal("old and new cluster pg_controldata checksum versions do not match\n"); + /* + * We cannot upgrade if the old cluster file encryption method + * doesn't match the new one. 
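	 * (Editor's note: a plausible rationale, since pg_upgrade transfers data
	 * files by copy, clone, or hard link without rewriting their contents,
	 * pages encrypted under one method cannot be reinterpreted by a cluster
	 * configured for another.)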
+ */ + if (oldctrl->file_encryption_method != newctrl->file_encryption_method) + pg_fatal("old and new clusters use different file encryption methods or\n" + "one cluster uses encryption and the other does not"); } diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c index cac4a01e453..4b223bf681b 100644 --- a/src/bin/pg_upgrade/file.c +++ b/src/bin/pg_upgrade/file.c @@ -11,6 +11,7 @@ #include #include +#include #ifdef HAVE_COPYFILE_H #include #endif @@ -21,6 +22,7 @@ #include "access/visibilitymapdefs.h" #include "common/file_perm.h" +#include "common/file_utils.h" #include "pg_upgrade.h" #include "storage/bufpage.h" #include "storage/checksum.h" diff --git a/src/bin/pg_upgrade/option.c b/src/bin/pg_upgrade/option.c index 9d7b11c27a0..e91e2ef1715 100644 --- a/src/bin/pg_upgrade/option.c +++ b/src/bin/pg_upgrade/option.c @@ -53,6 +53,7 @@ parseCommandLine(int argc, char *argv[]) {"check", no_argument, NULL, 'c'}, {"link", no_argument, NULL, 'k'}, {"retain", no_argument, NULL, 'r'}, + {"authprompt", no_argument, NULL, 'R'}, {"jobs", required_argument, NULL, 'j'}, {"socketdir", required_argument, NULL, 's'}, {"verbose", no_argument, NULL, 'v'}, @@ -108,7 +109,7 @@ parseCommandLine(int argc, char *argv[]) if (os_user_effective_id == 0) pg_fatal("%s: cannot be run as root\n", os_info.progname); - while ((option = getopt_long(argc, argv, "d:D:b:B:cj:ko:O:p:P:rs:U:v", + while ((option = getopt_long(argc, argv, "d:D:b:B:cj:ko:O:p:P:rRs:U:v", long_options, &optindex)) != -1) { switch (option) @@ -186,6 +187,10 @@ parseCommandLine(int argc, char *argv[]) log_opts.retain = true; break; + case 'R': + user_opts.pass_terminal_fd = true; + break; + case 's': user_opts.socketdir = pg_strdup(optarg); break; diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index e9b078deb3c..a5fd2c81025 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -16,6 +16,10 @@ #include "postgres.h" #include "libpq-fe.h" #include "pqexpbuffer.h" +#include "common/kmgr_utils.h" + +/* For now, pg_upgrade does not use common/logging.c; use our own pg_fatal */ +#undef pg_fatal /* Use port in the private/dynamic port number range */ #define DEF_PGUPORT 50432 @@ -328,6 +332,7 @@ typedef struct bool date_is_int; bool float8_pass_by_value; bool data_checksum_version; + int file_encryption_method; } ControlData; /* @@ -400,6 +405,8 @@ typedef struct transferMode transfer_mode; /* copy files or link them? */ int jobs; /* number of processes/threads to use */ char *socketdir; /* directory to use for Unix sockets */ + bool ind_coll_unknown; /* mark unknown index collation versions */ + bool pass_terminal_fd; /* pass -R to pg_ctl? 
*/ } UserOpts; typedef struct diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c index ee9010e3857..5cae4f6a677 100644 --- a/src/bin/psql/tab-complete.c +++ b/src/bin/psql/tab-complete.c @@ -883,6 +883,10 @@ static const SchemaQuery Query_for_list_of_collations = { " (SELECT tgrelid FROM pg_catalog.pg_trigger "\ " WHERE pg_catalog.quote_ident(tgname)='%s')" +#define Query_for_list_of_tasks \ +"SELECT pg_catalog.quote_ident(jobname) FROM pg_catalog.pg_task "\ +" WHERE username = current_user" + #define Query_for_list_of_ts_configurations \ "SELECT pg_catalog.quote_ident(cfgname) FROM pg_catalog.pg_ts_config "\ " WHERE substring(pg_catalog.quote_ident(cfgname),1,%d)='%s'" @@ -1121,6 +1125,7 @@ static const pgsql_thing_t words_after_create[] = { {"SYSTEM", NULL, NULL, NULL, THING_NO_CREATE | THING_NO_DROP}, {"TABLE", NULL, NULL, &Query_for_list_of_tables}, {"TABLESPACE", Query_for_list_of_tablespaces}, + {"TASK", Query_for_list_of_tasks}, {"TEMP", NULL, NULL, NULL, THING_NO_DROP | THING_NO_ALTER}, /* for CREATE TEMP TABLE * ... */ {"TEMPLATE", Query_for_list_of_ts_templates, NULL, NULL, THING_NO_SHOW}, @@ -2290,6 +2295,11 @@ psql_completion(const char *text, int start, int end) /* complete ALTER GROUP ADD|DROP USER with a user name */ else if (Matches("ALTER", "GROUP", MatchAny, "ADD|DROP", "USER")) COMPLETE_WITH_QUERY(Query_for_list_of_roles); + /* ALTER TASK */ + else if (Matches("ALTER", "TASK")) + COMPLETE_WITH_QUERY(Query_for_list_of_tasks); + else if (Matches("ALTER", "TASK", MatchAny)) + COMPLETE_WITH("SCHEDULE", "DATABASE", "USER", "ACTIVE", "NOT ACTIVE", "AS"); /* * If we have ALTER TYPE RENAME VALUE, provide list of enum values @@ -2716,6 +2726,15 @@ psql_completion(const char *text, int start, int end) TailMatches("CREATE", "TEMP|TEMPORARY", "SEQUENCE", MatchAny, "NO")) COMPLETE_WITH("MINVALUE", "MAXVALUE", "CYCLE"); +/* CREATE TASK */ + else if (Matches("CREATE", "TASK", MatchAny)) + COMPLETE_WITH("SCHEDULE"); + else if (Matches("CREATE", "TASK", MatchAny, "SCHEDULE", MatchAny)) + COMPLETE_WITH("DATABASE", "USER", "AS"); + else if (Matches("CREATE", "TASK", MatchAny, "SCHEDULE", MatchAny, "DATABASE", MatchAny) || + Matches("CREATE", "TASK", MatchAny, "SCHEDULE", MatchAny, "USER", MatchAny)) + COMPLETE_WITH("AS"); + /* CREATE SERVER */ else if (Matches("CREATE", "SERVER", MatchAny)) COMPLETE_WITH("TYPE", "VERSION", "FOREIGN DATA WRAPPER"); @@ -3219,6 +3238,10 @@ psql_completion(const char *text, int start, int end) else if (Matches("DROP", "TEXT", "SEARCH")) COMPLETE_WITH("CONFIGURATION", "DICTIONARY", "PARSER", "TEMPLATE"); + /* DROP TASK */ + else if (Matches("DROP", "TASK")) + COMPLETE_WITH_QUERY(Query_for_list_of_tasks); + /* DROP TRIGGER */ else if (Matches("DROP", "TRIGGER", MatchAny)) COMPLETE_WITH("ON"); diff --git a/src/common/Makefile b/src/common/Makefile index 68280aca8df..4549e6a24fb 100644 --- a/src/common/Makefile +++ b/src/common/Makefile @@ -61,6 +61,7 @@ OBJS_COMMON = \ ip.o \ jsonapi.o \ keywords.o \ + kmgr_utils.o \ kwlookup.o \ link-canary.o \ md5.o \ @@ -83,11 +84,13 @@ OBJS_COMMON = \ ifeq ($(with_ssl),openssl) OBJS_COMMON += \ + cipher_openssl.o \ protocol_openssl.o \ cryptohash_openssl.o \ hmac_openssl.o else OBJS_COMMON += \ + cipher.o \ cryptohash.o \ hmac.o \ md5.o \ diff --git a/src/common/cipher.c b/src/common/cipher.c new file mode 100644 index 00000000000..7aa3ea6138c --- /dev/null +++ b/src/common/cipher.c @@ -0,0 +1,98 @@ +/*------------------------------------------------------------------------- + * + * cipher.c + * Shared 
frontend/backend for cryptographic functions + * + * This is the set of in-core functions used when there are no other + * alternative options like OpenSSL. + * + * Copyright (c) 2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/common/cipher.c + * + *------------------------------------------------------------------------- + */ + +#ifndef FRONTEND +#include "postgres.h" +#else +#include "postgres_fe.h" +#endif + +#include "common/cipher.h" + +static void cipher_failure(void) pg_attribute_noreturn(); + + +PgCipherCtx * +pg_cipher_ctx_create(int cipher, unsigned char *key, int klen, bool enc) +{ + cipher_failure(); + return NULL; /* keep compiler quiet */ +} + +void +pg_cipher_ctx_free(PgCipherCtx *ctx) +{ + cipher_failure(); +} + +int +pg_cipher_blocksize(PgCipherCtx *ctx) +{ + cipher_failure(); + return -1; /* keep compiler quiet */ +} + +bool +pg_cipher_encrypt(PgCipherCtx *ctx, const int cipher, + const unsigned char *plaintext, + const int inlen, unsigned char *ciphertext, int *outlen, + const unsigned char *iv, const int ivlen, + unsigned char *outtag, const int taglen) +{ + cipher_failure(); + return false; /* keep compiler quiet */ +} + +bool +pg_cipher_decrypt(PgCipherCtx *ctx, const int cipher, + const unsigned char *ciphertext, + const int inlen, unsigned char *plaintext, int *outlen, + const unsigned char *iv, const int ivlen, + unsigned char *intag, const int taglen) +{ + cipher_failure(); + return false; /* keep compiler quiet */ +} + +bool +pg_cipher_keywrap(PgCipherCtx *ctx, const unsigned char *plaintext, + const int inlen, unsigned char *ciphertext, int *outlen) +{ + cipher_failure(); + return false; /* keep compiler quiet */ +} + +bool +pg_cipher_keyunwrap(PgCipherCtx *ctx, const unsigned char *ciphertext, + const int inlen, unsigned char *plaintext, int *outlen) +{ + cipher_failure(); + return false; /* keep compiler quiet */ +} + +static void +cipher_failure(void) +{ +#ifndef FRONTEND + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + (errmsg("cluster file encryption is not supported because OpenSSL is not supported by this build"), + errhint("Compile with --with-openssl to use this feature.")))); +#else + fprintf(stderr, _("cluster file encryption is not supported because OpenSSL is not supported by this build")); + exit(1); +#endif +} diff --git a/src/common/cipher_openssl.c b/src/common/cipher_openssl.c new file mode 100644 index 00000000000..a03fbe435f2 --- /dev/null +++ b/src/common/cipher_openssl.c @@ -0,0 +1,419 @@ +/*------------------------------------------------------------------------- + * cipher_openssl.c + * Cryptographic function using OpenSSL + * + * This contains the common low-level functions needed in both frontend and + * backend, for implement the database encryption. + * + * Portions Copyright (c) 2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/common/cipher_openssl.c + * + *------------------------------------------------------------------------- + */ +#ifndef FRONTEND +#include "postgres.h" +#else +#include "postgres_fe.h" +#endif + +#include "common/cipher.h" +#include +#include +#include +#include + +/* + * prototype for the EVP functions that return an algorithm, e.g. + * EVP_aes_128_gcm(). + */ +typedef const EVP_CIPHER *(*ossl_EVP_cipher_func) (void); + +static ossl_EVP_cipher_func get_evp_aes_gcm(int klen); +static EVP_CIPHER_CTX *ossl_cipher_ctx_create(int cipher, unsigned char *key, + int klen, bool enc); + +/* + * Return a newly created cipher context. 
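 *
 * (Editor's note, an illustrative sketch rather than part of the patch;
 * the key, pt, ptlen, and iv values are assumed to come from the caller.
 * A GCM encryption call through this API might look like:
 *
 *     PgCipherCtx *ctx = pg_cipher_ctx_create(PG_CIPHER_AES_GCM, key,
 *                                             PG_AES256_KEY_LEN, true);
 *     unsigned char ct[1024], tag[16];
 *     int         ctlen;
 *
 *     if (ctx == NULL ||
 *         !pg_cipher_encrypt(ctx, PG_CIPHER_AES_GCM, pt, ptlen, ct, &ctlen,
 *                            iv, 12, tag, sizeof(tag)))
 *         pg_log_error("encryption failed");
 *     pg_cipher_ctx_free(ctx);
 *
 * where 12 is the conventional GCM IV length in bytes.)
 *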
'cipher' specifies cipher algorithm + * by identifier like PG_CIPHER_XXX. + */ +PgCipherCtx * +pg_cipher_ctx_create(int cipher, unsigned char *key, int klen, bool enc) +{ + PgCipherCtx *ctx = NULL; + + if (cipher > PG_MAX_CIPHER_ID) + return NULL; + + ctx = ossl_cipher_ctx_create(cipher, key, klen, enc); + + return ctx; +} + +void +pg_cipher_ctx_free(PgCipherCtx *ctx) +{ + EVP_CIPHER_CTX_free(ctx); +} + +int +pg_cipher_blocksize(PgCipherCtx *ctx) +{ + Assert(ctx); + + return EVP_CIPHER_CTX_block_size(ctx); +} + +/* + * Encryption routine to encrypt data provided. + * + * ctx is the encryption context which must have been created previously. + * + * plaintext is the data we are going to encrypt + * inlen is the length of the data to encrypt + * + * ciphertext is the encrypted result + * outlen is the encrypted length + * + * iv is the IV to use. + * ivlen is the IV length to use. + * + * outtag is the resulting tag. + * taglen is the length of the tag. + */ +bool +pg_cipher_encrypt(PgCipherCtx *ctx, int cipher, + const unsigned char *plaintext, const int inlen, + unsigned char *ciphertext, int *outlen, + const unsigned char *iv, const int ivlen, + unsigned char *outtag, const int taglen) +{ + int len; + int enclen; + + Assert(ctx != NULL); + + /* + * Here we are setting the IV for the context which was passed in. Note + * that we signal to OpenSSL that we are configuring a new value for the + * context by passing in 'NULL' for the 2nd ('type') parameter. + */ + + /* + * We don't use GCM mode, but it has a MAC, so we support it and test it + * in case we need it later. XXX is this correct for GCM and CTR? + */ + /* Set the GCM IV length first */ + if (cipher == PG_CIPHER_AES_GCM && + !EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, ivlen, NULL)) + return false; + + /* Set the IV for this encryption. */ + if (!EVP_EncryptInit_ex(ctx, NULL, NULL, NULL, iv)) + return false; + + /* + * This is the function which is actually performing the encryption for + * us. + */ + if (!EVP_EncryptUpdate(ctx, ciphertext, &len, plaintext, inlen)) + return false; + + enclen = len; + + /* Finalize the encryption, which could add more to output. */ + if (!EVP_EncryptFinal_ex(ctx, ciphertext + enclen, &len)) + return false; + + *outlen = enclen + len; + + /* + * Once all of the encryption has been completed we grab the tag. + */ + if (cipher == PG_CIPHER_AES_GCM && + !EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_GET_TAG, taglen, outtag)) + return false; + + return true; +} + +/* + * Decryption routine + * + * ctx is the encryption context which must have been created previously. + * + * ciphertext is the data we are going to decrypt + * inlen is the length of the data to decrypt + * + * plaintext is the decrypted result + * outlen is the decrypted length + * + * iv is the IV to use. + * ivlen is the length of the IV. + * + * intag is the tag to use to verify. + * taglen is the length of the tag. + */ +bool +pg_cipher_decrypt(PgCipherCtx *ctx, const int cipher, + const unsigned char *ciphertext, const int inlen, + unsigned char *plaintext, int *outlen, + const unsigned char *iv, const int ivlen, + unsigned char *intag, const int taglen) +{ + int declen; + int len; + + /* + * Here we are setting the IV for the context which was passed in. Note + * that we signal to OpenSSL that we are configuring a new value for the + * context by passing in 'NULL' for the 2nd ('type') parameter. + */ + + /* XXX is this correct for GCM and CTR? 
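	 * (Editor's note: CTR mode carries no authentication tag and needs no
	 * GCM-style IV-length setup, so the cipher == PG_CIPHER_AES_GCM guards
	 * here and below simply skip those ctrl calls; CTR relies only on the
	 * plain EVP_DecryptInit_ex IV assignment.)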
*/ + /* Set the GCM IV length first */ + if (cipher == PG_CIPHER_AES_GCM && + !EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, ivlen, NULL)) + return false; + + /* Set the IV for this decryption. */ + if (!EVP_DecryptInit_ex(ctx, NULL, NULL, NULL, iv)) + return false; + + /* + * This is the function which is actually performing the decryption for + * us. + */ + if (!EVP_DecryptUpdate(ctx, plaintext, &len, ciphertext, inlen)) + return false; + + declen = len; + + /* Set the expected tag value. */ + if (cipher == PG_CIPHER_AES_GCM && + !EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_TAG, taglen, intag)) + return false; + + /* + * Finalize the decryption, which could add more to output, this is also + * the step which checks the tag and we MUST fail if this indicates an + * invalid tag! + */ + if (!EVP_DecryptFinal_ex(ctx, plaintext + declen, &len)) + return false; + + *outlen = declen + len; + + return true; +} + +/* + * Routine to perform key wrapping for data provided. + * + * ctx is the encryption context which must have been created previously. + * + * plaintext is the data/key we are going to encrypt/wrap + * inlen is the length of the data + * + * ciphertext is the wrapped result + * outlen is the encrypted length (will be larger than input!) + */ +bool +pg_cipher_keywrap(PgCipherCtx *ctx, + const unsigned char *plaintext, const int inlen, + unsigned char *ciphertext, int *outlen) +{ + int len; + int enclen; + + Assert(ctx != NULL); + + /* + * This is the function which is actually performing the encryption for + * us. + */ + if (!EVP_EncryptUpdate(ctx, ciphertext, &len, plaintext, inlen)) + return false; + + enclen = len; + + /* Finalize the encryption, which could add more to output. */ + if (!EVP_EncryptFinal_ex(ctx, ciphertext + enclen, &len)) + return false; + + *outlen = enclen + len; + + return true; +} + +/* + * Routine to perform key unwrapping of the data provided. + * + * ctx is the encryption context which must have been created previously. + * + * ciphertext is the wrapped key we are going to unwrap + * inlen is the length of the data to decrypt/unwrap + * + * plaintext is the decrypted result + * outlen is the decrypted length (will be smaller than input!) + */ +bool +pg_cipher_keyunwrap(PgCipherCtx *ctx, + const unsigned char *ciphertext, const int inlen, + unsigned char *plaintext, int *outlen) +{ + int declen; + int len; + + /* + * This is the function which is actually performing the decryption for + * us. + */ + if (!EVP_DecryptUpdate(ctx, plaintext, &len, ciphertext, inlen)) + return false; + + declen = len; + + /* + * Finalize the decryption, which could add more to output, this is also + * the step which checks the tag and we MUST fail if this indicates an + * invalid result! + */ + if (!EVP_DecryptFinal_ex(ctx, plaintext + declen, &len)) + return false; + + *outlen = declen + len; + + return true; +} + +/* + * Returns the correct GCM cipher functions for OpenSSL based + * on the key length requested. + */ +static ossl_EVP_cipher_func +get_evp_aes_gcm(int klen) +{ + switch (klen) + { + case PG_AES128_KEY_LEN: + return EVP_aes_128_gcm; + case PG_AES192_KEY_LEN: + return EVP_aes_192_gcm; + case PG_AES256_KEY_LEN: + return EVP_aes_256_gcm; + default: + return NULL; + } +} + +/* + * Returns the correct KWP cipher functions for OpenSSL based + * on the key length requested. 
+ */ +static ossl_EVP_cipher_func +get_evp_aes_kwp(int klen) +{ + switch (klen) + { + case PG_AES128_KEY_LEN: + return EVP_aes_128_wrap_pad; + case PG_AES192_KEY_LEN: + return EVP_aes_192_wrap_pad; + case PG_AES256_KEY_LEN: + return EVP_aes_256_wrap_pad; + default: + return NULL; + } +} + +/* + * Returns the correct CTR cipher functions for OpenSSL based + * on the key length requested. + */ +static ossl_EVP_cipher_func +get_evp_aes_ctr(int klen) +{ + switch (klen) + { + case PG_AES128_KEY_LEN: + return EVP_aes_128_ctr; + case PG_AES192_KEY_LEN: + return EVP_aes_192_ctr; + case PG_AES256_KEY_LEN: + return EVP_aes_256_ctr; + default: + return NULL; + } +} + +/* + * Initialize and return an EVP_CIPHER_CTX. Returns NULL if the given + * cipher algorithm is not supported or on failure. + */ +static EVP_CIPHER_CTX * +ossl_cipher_ctx_create(int cipher, unsigned char *key, int klen, bool enc) +{ + EVP_CIPHER_CTX *ctx; + ossl_EVP_cipher_func func; + int ret; + + ctx = EVP_CIPHER_CTX_new(); + + /* + * We currently only support AES GCM but others could be added in the + * future. + */ + switch (cipher) + { + case PG_CIPHER_AES_GCM: + func = get_evp_aes_gcm(klen); + if (!func) + goto failed; + break; + case PG_CIPHER_AES_KWP: + func = get_evp_aes_kwp(klen); + + /* + * Since wrapping will produce more output then input, and we have + * to be ready for that, OpenSSL requires that we explicitly + * enable wrapping for the context. + */ + EVP_CIPHER_CTX_set_flags(ctx, EVP_CIPHER_CTX_FLAG_WRAP_ALLOW); + if (!func) + goto failed; + break; + case PG_CIPHER_AES_CTR: + func = get_evp_aes_ctr(klen); + if (!func) + goto failed; + break; + default: + goto failed; + } + + /* + * We create the context here based on the cipher requested and the + * provided key. Note that the IV will be provided in the actual + * encryption call through another EVP_EncryptInit_ex call- this is fine + * as long as 'type' is passed in as NULL! + */ + if (enc) + ret = EVP_EncryptInit_ex(ctx, (const EVP_CIPHER *) func(), NULL, key, NULL); + else + ret = EVP_DecryptInit_ex(ctx, (const EVP_CIPHER *) func(), NULL, key, NULL); + + if (!ret) + goto failed; + + /* Set the key length based on the key length requested. */ + if (!EVP_CIPHER_CTX_set_key_length(ctx, klen)) + goto failed; + + return ctx; + +failed: + EVP_CIPHER_CTX_free(ctx); + return NULL; +} diff --git a/src/common/kmgr_utils.c b/src/common/kmgr_utils.c new file mode 100644 index 00000000000..1a6f281deb6 --- /dev/null +++ b/src/common/kmgr_utils.c @@ -0,0 +1,469 @@ +/*------------------------------------------------------------------------- + * + * kmgr_utils.c + * Shared frontend/backend for cluster file encryption + * + * These functions handle reading the wrapped DEK files from the + * file system, and wrapping and unwrapping them. It also handles + * running the cluster_key_command. 
+ * + * Copyright (c) 2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/common/kmgr_utils.c + * + *------------------------------------------------------------------------- + */ + +#ifndef FRONTEND +#include "postgres.h" +#else +#include "postgres_fe.h" +#endif + +#include +#include +#include + +#ifdef FRONTEND +#include "common/logging.h" +#endif +#include "common/cryptohash.h" +#include "common/file_perm.h" +#include "common/string.h" +#include "crypto/kmgr.h" +#include "lib/stringinfo.h" +#include "storage/fd.h" +#include "postgres.h" +#include "utils/builtins.h" + +#ifndef FRONTEND +#include "pgstat.h" +#include "storage/fd.h" +#endif + +#define KMGR_PROMPT_MSG "Enter authentication needed to generate the cluster key: " + +#ifdef FRONTEND +static FILE *open_pipe_stream(const char *command); +static int close_pipe_stream(FILE *file); +#endif + +static void read_wrapped_data_key(const char *cryptoKeyDir, uint32 id, unsigned char **key_p, int *key_len); + +/* + * We need this array in frontend and backend code, so define it here. + * You can only add to the end of this array since the array index is + * stored in pg_control. + */ +encryption_method encryption_methods[NUM_ENCRYPTION_METHODS] = { + {"", 0}, + {"AES128", 128}, + {"AES192", 192}, + {"AES256", 256}, + {"SM4", 128} +}; + +/* This maps wrapped key filesnames to their array slots */ +char *wkey_filenames[KMGR_NUM_DATA_KEYS] = { + "relation", + "wal" +}; + + +/* + * Wrap the given CryptoKey. + * + * Returns true and writes encrypted/wrapped/padded data to 'out', and the length + * of the result to outlen, if successful. + * + * Otherwise returns false. The caller must allocate sufficient space + * for cipher data calculated by using KmgrSizeOfCipherText(). Please note that + * this function modifies 'out' data even on failure. + */ +bool +kmgr_wrap_data_key(PgCipherCtx *ctx, CryptoKey *in, unsigned char *out, int *outlen) +{ + Assert(ctx && in && out); + + if (!pg_cipher_keywrap(ctx, (unsigned char *) in, sizeof(CryptoKey), out, outlen)) + return false; + + return true; +} + +/* + * Decrypt the given data. Return true and set plain text data to `out` if + * successful. Otherwise return false. The caller must allocate sufficient + * space for cipher data calculated by using KmgrSizeOfPlainText(). Please + * note that this function modifies 'out' data even on failure. + */ +bool +kmgr_unwrap_data_key(PgCipherCtx *ctx, unsigned char *in, int inlen, CryptoKey *out) +{ + int outlen; + + Assert(ctx && in && out); + + if (!pg_cipher_keyunwrap(ctx, in, inlen, (unsigned char *) out, &outlen)) + return false; + + Assert(outlen == sizeof(CryptoKey)); + + return true; +} + +/* + * Verify the correctness of the given cluster key by unwrapping the given keys. + * If the given cluster key is correct we set unwrapped keys to out_keys and return + * true. Otherwise return false. Please note that this function changes the + * contents of out_keys even on failure. Both in_keys and out_keys must be the + * same length. + */ +bool +kmgr_verify_cluster_key(unsigned char *cluster_key, + unsigned char **in_keys, int *key_lens, CryptoKey *out_keys) +{ + PgCipherCtx *ctx; + + /* Create decryption context with the KEK. 
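	 * (Editor's note: the KEK is the user-supplied cluster key; if any of
	 * the wrapped data keys below fails to unwrap, that is what signals an
	 * incorrect cluster key.)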
*/ + ctx = pg_cipher_ctx_create(PG_CIPHER_AES_KWP, cluster_key, + KMGR_CLUSTER_KEY_LEN, false); + + /* unwrap each DEK */ + for (int i = 0; i < KMGR_NUM_DATA_KEYS; i++) + { + if (!kmgr_unwrap_data_key(ctx, in_keys[i], key_lens[i], &(out_keys[i]))) + { + /* The cluster key is not correct */ + pg_cipher_ctx_free(ctx); + return false; + } + explicit_bzero(in_keys[i], key_lens[i]); + } + + /* The cluster key is correct, free the cipher context */ + pg_cipher_ctx_free(ctx); + + return true; +} + +/* + * Run cluster key command. + * + * Substitute %d for directory, %p for prompt, %R for file descriptor. + * + * The result will be put in buffer buf, which is of size "size". + * The return value is the length of the actual result. + */ +int +kmgr_run_cluster_key_command(char *cluster_key_command, char *buf, + int size, char *dir, int terminal_fd) +{ + StringInfoData command; + const char *sp; + FILE *fh; + int pclose_rc; + size_t len = 0; + + buf[0] = '\0'; + + Assert(size > 0); + + /* + * Build the command to be executed. + */ + initStringInfo(&command); + + for (sp = cluster_key_command; *sp; sp++) + { + if (*sp == '%') + { + switch (sp[1]) + { + /* directory */ + case 'd': + { + char *nativePath; + + sp++; + + /* + * This needs to use a placeholder to not modify the + * input with the conversion done via + * make_native_path(). + */ + nativePath = pstrdup(dir); + make_native_path(nativePath); + appendStringInfoString(&command, nativePath); + pfree(nativePath); + break; + } + /* prompt string */ + case 'p': + appendStringInfoString(&command, " "); + sp++; + appendStringInfoString(&command, KMGR_PROMPT_MSG); + break; + /* file descriptor number */ + case 'R': + { + char fd_str[20]; + + if (terminal_fd == -1) + { +#ifdef FRONTEND + pg_log_fatal("cluster key command referenced %%R, but --authprompt not specified"); +#else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("cluster key command referenced %%R, but --authprompt not specified"))); +#endif + } + + appendStringInfoString(&command, " "); + sp++; + snprintf(fd_str, sizeof(fd_str), "%d", terminal_fd); + appendStringInfoString(&command, fd_str); + break; + } + /* literal "%" */ + case '%': + /* convert %% to a single % */ + sp++; + appendStringInfoChar(&command, *sp); + break; + default: + /* otherwise treat the % as not special */ + appendStringInfoChar(&command, *sp); + break; + } + } + else + { + appendStringInfoChar(&command, *sp); + } + } + +#ifdef FRONTEND + fh = open_pipe_stream(command.data); + if (fh == NULL) + { + pg_log_fatal("could not execute command \"%s\": %m", + command.data); + } +#else + fh = OpenPipeStream(command.data, "r"); + if (fh == NULL) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not execute command \"%s\": %m", + command.data))); +#endif + + if (!fgets(buf, size, fh)) + { + if (ferror(fh)) + { +#ifdef FRONTEND + pg_log_fatal("could not read from command \"%s\": %m", + command.data); +#else + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from command \"%s\": %m", + command.data))); +#endif + } + } + +#ifdef FRONTEND + pclose_rc = close_pipe_stream(fh); +#else + pclose_rc = ClosePipeStream(fh); +#endif + + if (pclose_rc == -1) + { +#ifdef FRONTEND + pg_log_fatal("could not close pipe to external command: %m"); +#else + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close pipe to external command: %m"))); +#endif + } + else if (pclose_rc != 0) + { +#ifdef FRONTEND + pg_log_fatal("command \"%s\" failed", command.data); +#else + ereport(ERROR, + 
(errcode_for_file_access(), + errmsg("command \"%s\" failed", + command.data), + errdetail_internal("%s", wait_result_to_str(pclose_rc)))); +#endif + } + + /* strip trailing newline and carriage returns */ + len = pg_strip_crlf(buf); + + pfree(command.data); + + return len; +} + +#ifdef FRONTEND +static FILE * +open_pipe_stream(const char *command) +{ + FILE *res; + +#ifdef WIN32 + size_t cmdlen = strlen(command); + char *buf; + int save_errno; + + buf = malloc(cmdlen + 2 + 1); + if (buf == NULL) + { + errno = ENOMEM; + return NULL; + } + buf[0] = '"'; + memcpy(&buf[1], command, cmdlen); + buf[cmdlen + 1] = '"'; + buf[cmdlen + 2] = '\0'; + + res = _popen(buf, "r"); + + save_errno = errno; + free(buf); + errno = save_errno; +#else + res = popen(command, "r"); +#endif /* WIN32 */ + return res; +} + +static int +close_pipe_stream(FILE *file) +{ +#ifdef WIN32 + return _pclose(file); +#else + return pclose(file); +#endif /* WIN32 */ +} +#endif /* FRONTEND */ + +/* + * Reads the keys at path. + * + * This routine simply reads in the raw encrypted/wrapped keys; + * it does not handle any decryption, see kmgr_key_unwrap(). + * + * For each key returned, the key and key length are returned + * in the keys and key_lens arrays respectfully. + * + * Note that keys and key_lens must be allocated before calling + * this function as arrays of at least KMGR_NUM_DATA_KEYS length. + */ +void +kmgr_read_wrapped_data_keys(const char *path, unsigned char **keys, int *key_lens) +{ +/* StaticAssertStmt(lengthof(wkey_filenames) == KMGR_NUM_DATA_KEYS, + "wkey_filenames[] must match KMGR_NUM_DATA_KEYS"); +*/ + StaticAssertStmt(1 == 1, + "wkey_filenames[] must match KMGR_NUM_DATA_KEYS"); + + for (int id = 0; id < KMGR_NUM_DATA_KEYS; id++) + read_wrapped_data_key(path, id, &(keys[id]), &(key_lens[id])); + + return; +} + +/* Read a wrapped DEK file */ +static void +read_wrapped_data_key(const char *cryptoKeyDir, uint32 id, unsigned char **key_p, int *key_len) +{ + char path[MAXPGPATH]; + int fd; + int r; + struct stat st; + + CryptoKeyFilePath(path, cryptoKeyDir, id); + +#ifndef FRONTEND + if ((fd = OpenTransientFile(path, O_RDONLY | PG_BINARY)) == -1) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\" for reading: %m", + path))); + else if (fstat(fd, &st)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + path))); +#else + if ((fd = open(path, O_RDONLY | PG_BINARY, 0)) == -1) + pg_log_fatal("could not open file \"%s\" for reading: %m", + path); + else if (fstat(fd, &st)) + pg_log_fatal("could not stat file \"%s\": %m", + path); +#endif + + *key_len = st.st_size; + +#ifndef FRONTEND + pgstat_report_wait_start(WAIT_EVENT_KEY_FILE_READ); +#endif + + *key_p = (unsigned char *) palloc0(*key_len); + + /* Get key bytes */ + r = read(fd, *key_p, *key_len); + if (r != *key_len) + { + if (r < 0) + { +#ifndef FRONTEND + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", path))); +#else + pg_log_fatal("could not read file \"%s\": %m", path); +#endif + } + else + { +#ifndef FRONTEND + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read file \"%s\": read %d of %u", + path, r, *key_len))); +#else + pg_log_fatal("could not read file \"%s\": read %d of %u", + path, r, *key_len); +#endif + } + } + +#ifndef FRONTEND + pgstat_report_wait_end(); +#endif + +#ifndef FRONTEND + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", + 
path))); +#else + if (close(fd) != 0) + pg_log_fatal("could not close file \"%s\": %m", path); +#endif +} diff --git a/src/include/Makefile b/src/include/Makefile index f8690340925..48573c7a76b 100644 --- a/src/include/Makefile +++ b/src/include/Makefile @@ -24,7 +24,7 @@ SUBDIRS = access bootstrap catalog commands common datatype \ statistics storage tcop snowball snowball/libstemmer tsearch \ tsearch/dicts utils port port/atomics port/win32 port/win32_msvc \ port/win32_msvc/sys port/win32/arpa port/win32/netinet \ - port/win32/sys portability + port/win32/sys portability task SUBDIRS += cdb diff --git a/src/include/access/appendonlywriter.h b/src/include/access/appendonlywriter.h index a8516611597..0388e3bcaae 100644 --- a/src/include/access/appendonlywriter.h +++ b/src/include/access/appendonlywriter.h @@ -39,13 +39,37 @@ */ #define RESERVED_SEGNO 0 +/* + * Modes of operation for the choose_segno_internal() function. + */ +typedef enum +{ + /* + * Normal mode; select a segment to insert to, for INSERT or COPY. + */ + CHOOSE_MODE_WRITE, + + /* + * Select a segment to insert surviving rows to, when compacting + * another segfile in VACUUM. + */ + CHOOSE_MODE_COMPACTION_WRITE, + + /* + * Select next segment to compact. + */ + CHOOSE_MODE_COMPACTION_TARGET +} choose_segno_mode; + /* * functions in appendonlywriter.c */ extern void LockSegnoForWrite(Relation rel, int segno); extern int ChooseSegnoForWrite(Relation rel); +extern int ChooseSegnoForWriteMultiFile(Relation rel, List *avoid_segnos); extern int ChooseSegnoForCompactionWrite(Relation rel, List *avoid_segnos); extern int ChooseSegnoForCompaction(Relation rel, List *avoidsegnos); extern void AORelIncrementModCount(Relation parentrel); +extern bool ShouldUseReservedSegno(Relation rel, choose_segno_mode mode); #endif /* APPENDONLYWRITER_H */ diff --git a/src/include/access/gist.h b/src/include/access/gist.h index 4b06575d987..04f11fb3693 100644 --- a/src/include/access/gist.h +++ b/src/include/access/gist.h @@ -41,7 +41,7 @@ #define GISTNProcs 11 /* - * Page opaque data in a GiST index page. + * Page opaque data flags in a GiST index page. */ #define F_LEAF (1 << 0) /* leaf page */ #define F_DELETED (1 << 1) /* the page has been deleted */ @@ -51,6 +51,9 @@ #define F_HAS_GARBAGE (1 << 4) /* some tuples on the page are dead, * but not deleted yet */ +/* Specifies the bits that can be set in the GiST flags field */ +#define GIST_FLAG_BITMASK 0x1F + /* * NSN (node sequence number) is a special-purpose LSN which is stored on each * index page in GISTPageOpaqueData and updated only during page splits. 
By diff --git a/src/include/access/parallel.h b/src/include/access/parallel.h index 93d88ac60e1..170952958af 100644 --- a/src/include/access/parallel.h +++ b/src/include/access/parallel.h @@ -16,7 +16,9 @@ #include "access/xlogdefs.h" #include "lib/ilist.h" +#include "nodes/execnodes.h" #include "postmaster/bgworker.h" +#include "storage/barrier.h" #include "storage/shm_mq.h" #include "storage/shm_toc.h" @@ -52,12 +54,41 @@ typedef struct ParallelWorkerContext { dsm_segment *seg; shm_toc *toc; + int nworkers; + int worker_id; } ParallelWorkerContext; extern volatile bool ParallelMessagePending; extern PGDLLIMPORT int ParallelWorkerNumber; extern PGDLLIMPORT bool InitializingParallelWorker; +typedef struct ParallelEntryTag +{ + int cid; + int sliceId; + int sessionId; +} ParallelEntryTag; + +#define INIT_PARALLELENTRYTAG(a,xx_cid,xx_sliceId,xx_sessionId) \ +do { \ + (a).cid = (xx_cid); \ + (a).sliceId = (xx_sliceId); \ + (a).sessionId = (xx_sessionId); \ +} while(0) + +typedef struct GpParallelDSMEntry +{ + ParallelEntryTag tag; + int pid; + dsm_handle handle; + shm_toc *toc; + int reference; + int tolaunch; + int parallel_workers; + int temp_worker_id; /* temproary usage */ + Barrier build_barrier; /* synchronization for the build dsm phases */ +} GpParallelDSMEntry; + #define IsParallelWorker() (ParallelWorkerNumber >= 0) extern ParallelContext *CreateParallelContext(const char *library_name, @@ -79,4 +110,22 @@ extern void ParallelWorkerReportLastRecEnd(XLogRecPtr last_xlog_end); extern void ParallelWorkerMain(Datum main_arg); +extern void InitGpParallelDSMHash(void); + +extern GpParallelDSMEntry* GpInsertParallelDSMHash(PlanState *planstate); + +extern Size GpParallelDSMHashSize(void); + +extern bool EstimateGpParallelDSMEntrySize(PlanState *planstate, ParallelContext *pctx); + +extern bool InitializeGpParallelDSMEntry(PlanState *node, ParallelContext *pctx); +extern bool InitializeGpParallelWorkers(PlanState *planstate, ParallelWorkerContext *pwcxt); +extern void* GpFetchParallelDSMEntry(ParallelEntryTag tag, int plan_node_id); + +extern void GpDestroyParallelDSMEntry(void); + +extern void AtEOXact_GP_Parallel(void); + +extern void AtProcExit_GP_Parallel(int code, Datum arg); + #endif /* PARALLEL_H */ diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 74a07ef152d..c4727581431 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -66,6 +66,8 @@ typedef struct ParallelTableScanDescData bool phs_syncscan; /* report location to syncscan logic? */ bool phs_snapshot_any; /* SnapshotAny, not phs_snapshot_data? */ Size phs_snapshot_off; /* data for snapshot */ + CommandId command_id; + int plan_node_id; } ParallelTableScanDescData; typedef struct ParallelTableScanDescData *ParallelTableScanDesc; @@ -81,6 +83,12 @@ typedef struct ParallelBlockTableScanDescData BlockNumber phs_startblock; /* starting block number */ pg_atomic_uint64 phs_nallocated; /* number of blocks allocated to * workers so far. 
*/ + /* + * This set of fields has a different meaning for AO/AOCO tables: + * phs_nblocks: total # of segment files + * phs_nallocated: # of processed segment files + * phs_startblock: not used + */ } ParallelBlockTableScanDescData; typedef struct ParallelBlockTableScanDescData *ParallelBlockTableScanDesc; @@ -162,6 +170,7 @@ typedef struct IndexScanDescData bool *xs_orderbynulls; bool xs_recheckorderby; + struct dsa_area *dsa; /* dsa area for parallel bitmap scan */ /* parallel index scan information, in shared memory */ struct ParallelIndexScanDescData *parallel_scan; } IndexScanDescData; @@ -188,4 +197,4 @@ typedef struct SysScanDescData struct TupleTableSlot *slot; } SysScanDescData; -#endif /* RELSCAN_H */ +#endif /* RELSCAN_H */ \ No newline at end of file diff --git a/src/include/access/session.h b/src/include/access/session.h index 82cee5aff57..0c256d189d4 100644 --- a/src/include/access/session.h +++ b/src/include/access/session.h @@ -40,5 +40,5 @@ extern void DetachSession(void); /* The current session, or NULL for none. */ extern Session *CurrentSession; - +extern Session *ParallelSession; #endif /* SESSION_H */ diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 3e12e382b8b..35970d2e15b 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -23,6 +23,7 @@ #include "utils/guc.h" #include "utils/rel.h" #include "utils/snapshot.h" +#include "utils/snapmgr.h" #define DEFAULT_TABLE_ACCESS_METHOD "heap" @@ -313,6 +314,7 @@ typedef struct TableAmRoutine */ TableScanDesc (*scan_begin_extractcolumns) (Relation rel, Snapshot snapshot, + ParallelTableScanDesc parallel_scan, List *targetlist, List *qual, uint32 flags); @@ -914,22 +916,42 @@ table_beginscan(Relation rel, Snapshot snapshot, * scan key array from the targetList and the quals if the corresponding method * is implemented. This is an optimization needed for AOCO relations. * Otherwise, it is equivalent as passing the last two arguments as, 0, NULL. + * Like table_beginscan_parallel, the scan runs in parallel mode if parallel_scan is not NULL. 
*/ static inline TableScanDesc -table_beginscan_es(Relation rel, Snapshot snapshot, +table_beginscan_es(Relation relation, Snapshot snapshot, ParallelTableScanDesc parallel_scan, List *targetList, List *qual) { + bool isParallel = parallel_scan != NULL; uint32 flags = SO_TYPE_SEQSCAN | SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; - if (rel->rd_tableam->scan_begin_extractcolumns) - return rel->rd_tableam->scan_begin_extractcolumns(rel, snapshot, + /* reset snapshot if in parallel mode */ + if (isParallel) + { + Assert(RelationGetRelid(relation) == parallel_scan->phs_relid); + if (!parallel_scan->phs_snapshot_any) { + /* Snapshot was serialized -- restore it */ + snapshot = RestoreSnapshot((char *) parallel_scan + + parallel_scan->phs_snapshot_off); + RegisterSnapshot(snapshot); + flags |= SO_TEMP_SNAPSHOT; + } + else + { + /* SnapshotAny passed by caller (not serialized) */ + snapshot = SnapshotAny; + } + } + + if (relation->rd_tableam->scan_begin_extractcolumns) + return relation->rd_tableam->scan_begin_extractcolumns(relation, snapshot, parallel_scan, targetList, qual, flags); - return rel->rd_tableam->scan_begin(rel, snapshot, + return relation->rd_tableam->scan_begin(relation, snapshot, 0, NULL, - NULL, flags); + parallel_scan, flags); } /* diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 21b512280f7..4702785d046 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -15,6 +15,7 @@ #include "access/xlogdefs.h" #include "access/xloginsert.h" #include "access/xlogreader.h" +#include "common/kmgr_utils.h" #include "datatype/timestamp.h" #include "access/xlog_internal.h" #include "catalog/pg_control.h" @@ -157,6 +158,8 @@ extern int CheckPointSegments; extern bool StandbyModeRequested; extern bool StandbyMode; +/* whether the TDE feature is enabled */ +extern int FileEncryptionEnabled; /* Archive modes */ typedef enum ArchiveMode { @@ -209,13 +212,15 @@ extern PGDLLIMPORT int wal_level; /* * Is a full-page image needed for hint bit updates? * - * Normally, we don't WAL-log hint bit updates, but if checksums are enabled, - * we have to protect them against torn page writes. When you only set - * individual bits on a page, it's still consistent no matter what combination - * of the bits make it to disk, but the checksum wouldn't match. Also WAL-log - * them if forced by wal_log_hints=on. + * Normally, we don't WAL-log hint bit updates, but if checksums or encryption + * is enabled, we have to protect them against torn page writes. When you + * only set individual bits on a page, it's still consistent no matter what + * combination of the bits make it to disk, but the checksum wouldn't match. + * Cluster file encryption requires a new LSN for hint bit changes, and can't + * tolerate torn pages. Also WAL-log them if forced by wal_log_hints=on. */ -#define XLogHintBitIsNeeded() (DataChecksumsEnabled() || wal_log_hints) +#define XLogHintBitIsNeeded() \ + (DataChecksumsEnabled() || FileEncryptionEnabled || wal_log_hints) /* Do we need to WAL-log information required only for Hot Standby and logical replication? 
*/ #define XLogStandbyInfoActive() (wal_level >= WAL_LEVEL_REPLICA) @@ -336,6 +341,9 @@ extern void UpdateControlFile(void); extern uint64 GetSystemIdentifier(void); extern char *GetMockAuthenticationNonce(void); extern bool DataChecksumsEnabled(void); + +extern int GetFileEncryptionMethod(void); + extern XLogRecPtr GetFakeLSNForUnloggedRel(void); extern Size XLOGShmemSize(void); extern void XLOGShmemInit(void); diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h index 0880fe31d28..2b951a409b5 100644 --- a/src/include/access/xloginsert.h +++ b/src/include/access/xloginsert.h @@ -62,6 +62,8 @@ extern void log_newpage_range(Relation rel, ForkNumber forkNum, BlockNumber startblk, BlockNumber endblk, bool page_std); extern XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std); +extern XLogRecPtr LSNForEncryption(bool use_wal_lsn); + extern void InitXLogInsert(void); #endif /* XLOGINSERT_H */ diff --git a/src/include/catalog/catalog.h b/src/include/catalog/catalog.h index 0b6eabd9516..11656cf1ebf 100644 --- a/src/include/catalog/catalog.h +++ b/src/include/catalog/catalog.h @@ -33,6 +33,7 @@ extern bool IsCatalogRelation(Relation relation); extern bool IsSystemClass(Oid relid, Form_pg_class reltuple); extern bool IsToastClass(Form_pg_class reltuple); +extern bool IsSystemClassByRelid(Oid relid); extern bool IsCatalogRelationOid(Oid relid); diff --git a/src/include/catalog/dependency.h b/src/include/catalog/dependency.h index 07da04b7ddf..0df4b38a5c2 100644 --- a/src/include/catalog/dependency.h +++ b/src/include/catalog/dependency.h @@ -135,10 +135,11 @@ typedef enum ObjectClass OCLASS_TRANSFORM, /* pg_transform */ /* GPDB additions */ - OCLASS_EXTPROTOCOL /* pg_extprotocol */ + OCLASS_EXTPROTOCOL, /* pg_extprotocol */ + OCLASS_TASK /* pg_task */ } ObjectClass; -#define LAST_OCLASS OCLASS_EXTPROTOCOL +#define LAST_OCLASS OCLASS_TASK /* flag bits for performDeletion/performMultipleDeletions: */ #define PERFORM_DELETION_INTERNAL 0x0001 /* internal action */ diff --git a/src/include/catalog/pg_appendonly.h b/src/include/catalog/pg_appendonly.h index 7aa3f22a333..1800c33e278 100644 --- a/src/include/catalog/pg_appendonly.h +++ b/src/include/catalog/pg_appendonly.h @@ -33,6 +33,7 @@ CATALOG(pg_appendonly,6105,AppendOnlyRelationId) NameData compresstype; /* the compressor used (e.g. zlib) */ bool columnstore; /* true if orientation is column */ Oid segrelid; /* OID of aoseg table; 0 if none */ + int16 segfilecount; /* the (per seg) average total number of segment files */ Oid blkdirrelid; /* OID of aoblkdir table; 0 if none */ Oid blkdiridxid; /* if aoblkdir table, OID of aoblkdir index */ Oid visimaprelid; /* OID of the aovisimap table */ @@ -167,4 +168,7 @@ RemoveAppendonlyEntry(Oid relid); extern void SwapAppendonlyEntries(Oid entryRelId1, Oid entryRelId2); +extern int16 +GetAppendOnlySegmentFilesCount(Relation rel); + #endif /* PG_APPENDONLY_H */ diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index 938872bef70..98772c27358 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -90,6 +90,7 @@ typedef struct CheckPoint /* GPDB_14_MERGE_FIXME: Compatible, Figure out whether 0xC0 already used? */ /* 0xC0 is used in Postgres 9.5-11 */ #define XLOG_OVERWRITE_CONTRECORD 0xE0 +#define XLOG_ENCRYPTION_LSN 0xF0 /* @@ -239,6 +240,9 @@ typedef struct ControlFileData */ char mock_authentication_nonce[MOCK_AUTH_NONCE_LEN]; + /* File encryption method; index into encryption_methods[]. 
*/ + int file_encryption_method; + /* CRC of all above ... MUST BE LAST! */ pg_crc32c crc; } ControlFileData; diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index f64317bfeb4..d0833282b22 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -7229,17 +7229,21 @@ proargtypes => 'name', prosrc => 'pg_database_size_name' }, { oid => '2325', descr => 'disk space usage for the main fork of the specified table or index', - proname => 'pg_relation_size', prolang => 'sql', provolatile => 'v', + proname => 'pg_relation_size', prolang => 'sql', provolatile => 'v', proparallel => 'u', prorettype => 'int8', proargtypes => 'regclass', prosrc => 'see system_functions.sql' }, { oid => '2332', descr => 'disk space usage for the specified fork of a table or index', - proname => 'pg_relation_size', provolatile => 'v', prorettype => 'int8', + proname => 'pg_relation_size', provolatile => 'v', proparallel => 'u', prorettype => 'int8', proargtypes => 'regclass text', prosrc => 'pg_relation_size' }, { oid => '2286', descr => 'total disk space usage for the specified table and associated indexes', - proname => 'pg_total_relation_size', provolatile => 'v', prorettype => 'int8', + proname => 'pg_total_relation_size', provolatile => 'v', proparallel => 'u', prorettype => 'int8', proargtypes => 'regclass', prosrc => 'pg_total_relation_size' }, +{ oid => '2137', + descr => 'ao tables segment file count', + proname => 'gp_ao_segment_file_count', provolatile => 'v', prorettype => 'int2', + proargtypes => 'regclass', prosrc => 'gp_ao_segment_file_count' }, { oid => '2288', descr => 'convert a long int to a human readable text using size units', proname => 'pg_size_pretty', prorettype => 'text', proargtypes => 'int8', @@ -7254,7 +7258,7 @@ prosrc => 'pg_size_bytes' }, { oid => '2997', descr => 'disk space usage for the specified table, including TOAST, free space and visibility map', - proname => 'pg_table_size', provolatile => 'v', prorettype => 'int8', + proname => 'pg_table_size', provolatile => 'v', proparallel => 'u', prorettype => 'int8', proargtypes => 'regclass', prosrc => 'pg_table_size' }, { oid => '2998', descr => 'disk space usage for all indexes attached to the specified table', @@ -11568,9 +11572,9 @@ descr => 'pg_controldata init state information as a function', proname => 'pg_control_init', provolatile => 'v', prorettype => 'record', proargtypes => '', - proallargtypes => '{int4,int4,int4,int4,int4,int4,int4,int4,int4,bool,int4}', - proargmodes => '{o,o,o,o,o,o,o,o,o,o,o}', - proargnames => '{max_data_alignment,database_block_size,blocks_per_segment,wal_block_size,bytes_per_wal_segment,max_identifier_length,max_index_columns,max_toast_chunk_size,large_object_chunk_size,float8_pass_by_value,data_page_checksum_version}', + proallargtypes => '{int4,int4,int4,int4,int4,int4,int4,int4,int4,bool,int4,int4}', + proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o}', + proargnames => '{max_data_alignment,database_block_size,blocks_per_segment,wal_block_size,bytes_per_wal_segment,max_identifier_length,max_index_columns,max_toast_chunk_size,large_object_chunk_size,float8_pass_by_value,data_page_checksum_version,file_encryption_method}', prosrc => 'pg_control_init' }, # subscripting support for built-in types diff --git a/src/include/catalog/pg_task.h b/src/include/catalog/pg_task.h new file mode 100644 index 00000000000..c42f2883ff8 --- /dev/null +++ b/src/include/catalog/pg_task.h @@ -0,0 +1,58 @@ 
+/*------------------------------------------------------------------------- + * + * pg_task.h + * stores all tasks of the cron task scheduler. + * + * Portions Copyright (c) 2023-Present Hashdata Inc. + * + * + * IDENTIFICATION + * src/include/catalog/pg_task.h + * + *------------------------------------------------------------------------- + */ +#ifndef PG_TASK_H +#define PG_TASK_H + +#include "catalog/genbki.h" +#include "catalog/pg_task_d.h" + +/* ---------------- + * pg_task definition. cpp turns this into + * typedef struct FormData_pg_task + * ---------------- + */ +CATALOG(pg_task,9637,TaskRelationId) BKI_SHARED_RELATION +{ + Oid jobid; + text schedule; + text command; + text nodename; + int32 nodeport; + text database; + text username; + bool active BKI_DEFAULT(t); + text jobname; +} FormData_pg_task; + +typedef FormData_pg_task *Form_pg_task; + +DECLARE_UNIQUE_INDEX(pg_task_jobname_username_index, 8915, on pg_task using btree(jobname text_ops, username text_ops)); +#define TaskJobNameUserNameIndexId 8915 +DECLARE_UNIQUE_INDEX_PKEY(pg_task_jobid_index, 8916, on pg_task using btree(jobid oid_ops)); +#define TaskJobIdIndexId 8916 + +extern Oid TaskCreate(const char *schedule, const char *command, + const char *nodename, int32 nodeport, + const char *database, const char *username, + bool active, const char *jobname); + +extern void TaskUpdate(Oid jobid, const char *schedule, + const char *command, const char *database, + const char *username, bool *active); + +extern Oid GetTaskJobId(const char *jobname, const char *username); + +extern char * GetTaskNameById(Oid jobid); + +#endif /* PG_TASK_H */ diff --git a/src/include/catalog/pg_task_run_history.h b/src/include/catalog/pg_task_run_history.h new file mode 100644 index 00000000000..e4617169b4b --- /dev/null +++ b/src/include/catalog/pg_task_run_history.h @@ -0,0 +1,62 @@ +/*------------------------------------------------------------------------- + * + * pg_task_run_history.h + * stores the run history of all tasks of the cron task scheduler. + * + * Portions Copyright (c) 2023-Present Hashdata Inc. + * + * + * IDENTIFICATION + * src/include/catalog/pg_task_run_history.h + * + *------------------------------------------------------------------------- + */ +#ifndef PG_TASK_RUN_HISTORY_H +#define PG_TASK_RUN_HISTORY_H + +#include "catalog/genbki.h" +#include "catalog/pg_task_run_history_d.h" +#include "datatype/timestamp.h" + +#define timestamptz Datum + +/* ---------------- + * pg_task_run_history definition. 
cpp turns this into + * typedef struct FormData_pg_task_run_history + * ---------------- + */ +CATALOG(pg_task_run_history,9993,TaskRunHistoryRelationId) BKI_SHARED_RELATION +{ + Oid runid; + Oid jobid; + int32 job_pid BKI_DEFAULT(0); + text database; + text username; + text command; + text status; + text return_message; + timestamptz start_time; + timestamptz end_time; +} FormData_pg_task_run_history; + +typedef FormData_pg_task_run_history *Form_pg_task_run_history; + +DECLARE_INDEX(pg_task_run_history_jobid_index, 8633, on pg_task_run_history using btree(jobid oid_ops)); +#define TaskRunHistoryJobIdIndexId 8633 +DECLARE_UNIQUE_INDEX_PKEY(pg_task_run_history_runid_index, 8110, on pg_task_run_history using btree(runid oid_ops)); +#define TaskRunHistoryRunIdIndexId 8110 + +extern void TaskRunHistoryCreate(Oid runid, int64 *jobid, const char *database, const char *username, + const char *command, const char *status); + +extern void TaskRunHistoryUpdate(Oid runid, int32 *job_pid, const char *status, + const char *return_message, + TimestampTz *start_time, TimestampTz *end_time); + +extern void MarkRunningTaskAsFailed(void); + +extern void RemoveTaskRunHistoryByJobId(Oid jobid); + +#undef timestamptz + +#endif /* PG_TASK_RUN_HISTORY_H */ diff --git a/src/include/cdb/cdbaocsam.h b/src/include/cdb/cdbaocsam.h index e0ab7d5f15e..087d0f1819d 100644 --- a/src/include/cdb/cdbaocsam.h +++ b/src/include/cdb/cdbaocsam.h @@ -69,6 +69,13 @@ typedef struct AOCSInsertDescData struct DatumStreamWrite **ds; AppendOnlyBlockDirectory blockDirectory; + + /* + * For multiple segment files insertion. + */ + bool insertMultiFiles; /* insert into multi files */ + dlist_node node; /* node of segfiles list */ + int range; /* inserted tuples of each range */ } AOCSInsertDescData; typedef AOCSInsertDescData *AOCSInsertDesc; @@ -307,7 +314,7 @@ typedef AOCSAddColumnDescData *AOCSAddColumnDesc; * ---------------- */ -extern AOCSScanDesc aocs_beginscan(Relation relation, Snapshot snapshot, +extern AOCSScanDesc aocs_beginscan(Relation relation, Snapshot snapshot, ParallelTableScanDesc parallel_scan, bool *proj, uint32 flags); extern AOCSScanDesc aocs_beginrangescan(Relation relation, Snapshot snapshot, Snapshot appendOnlyMetaDataSnapshot, @@ -324,7 +331,7 @@ static inline void aocs_insert(AOCSInsertDesc idesc, TupleTableSlot *slot) { slot_getallattrs(slot); aocs_insert_values(idesc, slot->tts_values, slot->tts_isnull, (AOTupleId *) &slot->tts_tid); } -extern void aocs_insert_finish(AOCSInsertDesc idesc); +extern void aocs_insert_finish(AOCSInsertDesc idesc, dlist_head *head); extern AOCSFetchDesc aocs_fetch_init(Relation relation, Snapshot snapshot, Snapshot appendOnlyMetaDataSnapshot, diff --git a/src/include/cdb/cdbappendonlyam.h b/src/include/cdb/cdbappendonlyam.h index 5643ae234c9..4295741fb1d 100644 --- a/src/include/cdb/cdbappendonlyam.h +++ b/src/include/cdb/cdbappendonlyam.h @@ -57,7 +57,7 @@ /* * AppendOnlyInsertDescData is used for inserting data into append-only * relations. It serves an equivalent purpose as AppendOnlyScanDescData - * (relscan.h) only that the later is used for scanning append-only + * (relscan.h) only that the latter is used for scanning append-only * relations. */ typedef struct AppendOnlyInsertDescData @@ -108,6 +108,13 @@ typedef struct AppendOnlyInsertDescData /* The block directory for the appendonly relation. */ AppendOnlyBlockDirectory blockDirectory; + + /* + * For multiple segment files insertion. 
+ */ + bool insertMultiFiles; /* insert into multi files */ + dlist_node node; /* node of segfiles list */ + int range; /* inserted tuples of each range */ } AppendOnlyInsertDescData; typedef AppendOnlyInsertDescData *AppendOnlyInsertDesc; @@ -408,7 +415,7 @@ extern void appendonly_insert( AppendOnlyInsertDesc aoInsertDesc, MemTuple instup, AOTupleId *aoTupleId); -extern void appendonly_insert_finish(AppendOnlyInsertDesc aoInsertDesc); +extern void appendonly_insert_finish(AppendOnlyInsertDesc aoInsertDesc, dlist_head *head); extern void appendonly_dml_finish(Relation relation, CmdType operation); extern AppendOnlyDeleteDesc appendonly_delete_init(Relation rel); diff --git a/src/include/cdb/cdbappendonlystorageread.h b/src/include/cdb/cdbappendonlystorageread.h index 230c6a5a013..9f3857ae814 100755 --- a/src/include/cdb/cdbappendonlystorageread.h +++ b/src/include/cdb/cdbappendonlystorageread.h @@ -172,6 +172,8 @@ typedef struct AppendOnlyStorageRead */ char *segmentFileName; + RelFileNode relFileNode; + /* * The number of blocks read since the beginning of the segment file. */ @@ -195,7 +197,8 @@ extern void AppendOnlyStorageRead_Init(AppendOnlyStorageRead *storageRead, MemoryContext memoryContext, int32 maxBufferLen, char *relationName, char *title, - AppendOnlyStorageAttributes *storageAttributes); + AppendOnlyStorageAttributes *storageAttributes, + RelFileNode *relFileNode); extern char *AppendOnlyStorageRead_RelationName(AppendOnlyStorageRead *storageRead); extern char *AppendOnlyStorageRead_SegmentFileName(AppendOnlyStorageRead *storageRead); diff --git a/src/include/cdb/cdbbufferedread.h b/src/include/cdb/cdbbufferedread.h index bc5c8d703d7..7e176698380 100644 --- a/src/include/cdb/cdbbufferedread.h +++ b/src/include/cdb/cdbbufferedread.h @@ -19,7 +19,7 @@ #define CDBBUFFEREDREAD_H #include "storage/fd.h" - +#include "storage/relfilenode.h" typedef struct BufferedRead { /* @@ -73,6 +73,7 @@ typedef struct BufferedRead /* current read position */ off_t fileOff; + RelFileNode relFileNode; /* * Temporary limit support for random reading. */ @@ -102,7 +103,8 @@ extern void BufferedReadInit( int32 memoryLen, int32 maxBufferLen, int32 maxLargeReadLen, - char *relationName); + char *relationName, + RelFileNode *file_node); /* * Takes an open file handle for the next file. 
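To see how the RelFileNode plumbing added to AppendOnlyStorageRead and BufferedRead above is meant to be used: the crypto/bufenc.h header introduced later in this patch declares DecryptAOBlock(), keyed by the relation's file node. A minimal sketch; the helper function and call site here are assumptions for illustration, the real call sites live inside the AO storage-read code:

/*
 * Hypothetical illustration: decrypt an AO block in place after a read,
 * using the RelFileNode that BufferedReadInit() now records.
 */
#include "postgres.h"
#include "access/xlog.h"            /* FileEncryptionEnabled, see above */
#include "crypto/bufenc.h"          /* DecryptAOBlock(), declared below */
#include "storage/relfilenode.h"

static void
demo_decrypt_ao_block(unsigned char *buf, int buf_len, RelFileNode *file_node)
{
    /* Only decrypt when the cluster has TDE enabled. */
    if (FileEncryptionEnabled)
        DecryptAOBlock(buf, buf_len, file_node);
}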
diff --git a/src/include/cdb/cdbgroupingpaths.h b/src/include/cdb/cdbgroupingpaths.h index 4f740447e7a..6e26d11ccb8 100644 --- a/src/include/cdb/cdbgroupingpaths.h +++ b/src/include/cdb/cdbgroupingpaths.h @@ -28,7 +28,8 @@ extern void cdb_create_multistage_grouping_paths(PlannerInfo *root, const AggClauseCosts *agg_final_costs, List *rollups, List *new_rollups, - AggStrategy strat); + AggStrategy strat, + List *partial_pathlist); extern void cdb_create_twostage_distinct_paths(PlannerInfo *root, diff --git a/src/include/cdb/cdbllize.h b/src/include/cdb/cdbllize.h index 9522decfd69..3de67683157 100644 --- a/src/include/cdb/cdbllize.h +++ b/src/include/cdb/cdbllize.h @@ -28,7 +28,6 @@ extern Plan *cdbllize_decorate_subplans_with_motions(PlannerInfo *root, Plan *pl extern void cdbllize_build_slice_table(PlannerInfo *root, Plan *top_plan, PlanSlice *top_slice); extern void motion_sanity_check(PlannerInfo *root, Plan *plan); - extern bool is_plan_node(Node *node); extern Flow *makeFlow(FlowType flotype, int numsegments); diff --git a/src/include/cdb/cdbmutate.h b/src/include/cdb/cdbmutate.h index 288fbbd9ecc..6338b50c821 100644 --- a/src/include/cdb/cdbmutate.h +++ b/src/include/cdb/cdbmutate.h @@ -29,6 +29,7 @@ extern Motion *make_hashed_motion(Plan *lefttree, int numHashSegments); extern Motion *make_broadcast_motion(Plan *lefttree); +extern Motion *make_parallel_broadcast_motion(Plan *lefttree); extern Plan *make_explicit_motion(PlannerInfo *root, Plan *lefttree, diff --git a/src/include/cdb/cdbpath.h b/src/include/cdb/cdbpath.h index df8b4ac8679..0c9469dfedd 100644 --- a/src/include/cdb/cdbpath.h +++ b/src/include/cdb/cdbpath.h @@ -46,19 +46,34 @@ extern Path *create_motion_path_for_upddel(PlannerInfo *root, Index rti, GpPolic extern Path *create_split_update_path(PlannerInfo *root, Index rti, GpPolicy *targetPolicy, Path *subpath); extern CdbPathLocus -cdbpath_motion_for_join(PlannerInfo *root, - JoinType jointype, /* JOIN_INNER/FULL/LEFT/RIGHT/IN */ - Path **p_outer_path, /* INOUT */ - Path **p_inner_path, /* INOUT */ - int *p_rowidexpr_id, - List *redistribution_clauses, /* equijoin RestrictInfo list */ - List *restrict_clauses, /* all RestrictInfos */ - List *outer_pathkeys, - List *inner_pathkeys, - bool outer_require_existing_order, - bool inner_require_existing_order); +cdbpath_motion_for_join(PlannerInfo *root, + JoinType jointype, /* JOIN_INNER/FULL/LEFT/RIGHT/IN */ + Path **p_outer_path, /* INOUT */ + Path **p_inner_path, /* INOUT */ + int *p_rowidexpr_id, + List *redistribution_clauses, /* equijoin RestrictInfo list */ + List *restrict_clauses, /* all RestrictInfos */ + List *outer_pathkeys, + List *inner_pathkeys, + bool outer_require_existing_order, + bool inner_require_existing_order); extern bool cdbpath_contains_wts(Path *path); extern Path * turn_volatile_seggen_to_singleqe(PlannerInfo *root, Path *path, Node *node); +extern CdbPathLocus +cdbpath_motion_for_parallel_join(PlannerInfo *root, + JoinType jointype, /* JOIN_INNER/FULL/LEFT/RIGHT/IN */ + Path **p_outer_path, /* INOUT */ + Path **p_inner_path, /* INOUT */ + int *p_rowidexpr_id, + List *redistribution_clauses, /* equijoin RestrictInfo list */ + List *restrict_clauses, /* all RestrictInfos */ + List *outer_pathkeys, + List *inner_pathkeys, + bool outer_require_existing_order, + bool inner_require_existing_order, + bool parallel_aware, + bool uninterested_broadcast); /* for parallel hash join, do not use Broadcast if possible */ + #endif /* CDBPATH_H */ diff --git a/src/include/cdb/cdbpathlocus.h 
b/src/include/cdb/cdbpathlocus.h index fc54d73980c..9f1753b6b45 100644 --- a/src/include/cdb/cdbpathlocus.h +++ b/src/include/cdb/cdbpathlocus.h @@ -52,13 +52,19 @@ typedef enum CdbLocusType * generally available in any qExec or qDisp) */ CdbLocusType_SegmentGeneral,/* generally available in any qExec, but not * available in qDisp */ + CdbLocusType_SegmentGeneralWorkers, /* generally available in any qExec, but not + * available in qDisp, strewn on workers of the same segments */ CdbLocusType_OuterQuery, /* generally available in any qExec or qDisp, but * contains correlated vars from outer query, so must * not be redistributed */ CdbLocusType_Replicated, /* replicated over all qExecs of an N-gang */ + CdbLocusType_ReplicatedWorkers, /* replicated over all qExecs of an N-gang in parallel execution mode */ CdbLocusType_Hashed, /* hash partitioned over all qExecs of N-gang */ CdbLocusType_HashedOJ, /* result of hash partitioned outer join, NULLs can be anywhere */ CdbLocusType_Strewn, /* partitioned on no known function */ + CdbLocusType_HashedWorkers, /* strewn on all qExecs but partitioned on M-parallel qExecs of total N-gang. + * parallel mode (M workers) for a partitioned locus. + */ CdbLocusType_End /* = last valid CdbLocusType + 1 */ } CdbLocusType; @@ -150,11 +156,21 @@ typedef struct CdbPathLocus CdbLocusType locustype; List *distkey; /* List of DistributionKeys */ int numsegments; + int parallel_workers; } CdbPathLocus; #define CdbPathLocus_NumSegments(locus) \ ((locus).numsegments) +#define CdbPathLocus_NumParallelWorkers(locus) \ + ((locus).parallel_workers) + +/* + * GPDB: total number of processes, taking parallel mode into account. + * Used when estimating a broadcast to parallel workers. + */ +#define CdbPathLocus_NumSegmentsPlusParallelWorkers(locus) \ + (((locus).parallel_workers) ? 
((locus).numsegments) * ((locus).parallel_workers) : ((locus).numsegments)) /* * CdbPathLocus_IsEqual * @@ -164,7 +180,8 @@ typedef struct CdbPathLocus #define CdbPathLocus_IsEqual(a, b) \ ((a).locustype == (b).locustype && \ (a).numsegments == (b).numsegments && \ - (a).distkey == (b).distkey) + (a).distkey == (b).distkey && \ + (a).parallel_workers == (b).parallel_workers) #define CdbPathLocus_CommonSegments(a, b) \ Min((a).numsegments, (b).numsegments) @@ -189,6 +206,7 @@ typedef struct CdbPathLocus */ #define CdbPathLocus_IsPartitioned(locus) \ (CdbPathLocus_IsHashed(locus) || \ + CdbPathLocus_IsHashedWorkers(locus) || \ CdbPathLocus_IsHashedOJ(locus) || \ CdbPathLocus_IsStrewn(locus)) @@ -202,14 +220,20 @@ typedef struct CdbPathLocus ((locus).locustype == CdbLocusType_General) #define CdbPathLocus_IsReplicated(locus) \ ((locus).locustype == CdbLocusType_Replicated) +#define CdbPathLocus_IsReplicatedWorkers(locus) \ + ((locus).locustype == CdbLocusType_ReplicatedWorkers) #define CdbPathLocus_IsHashed(locus) \ ((locus).locustype == CdbLocusType_Hashed) +#define CdbPathLocus_IsHashedWorkers(locus) \ + ((locus).locustype == CdbLocusType_HashedWorkers) #define CdbPathLocus_IsHashedOJ(locus) \ ((locus).locustype == CdbLocusType_HashedOJ) #define CdbPathLocus_IsStrewn(locus) \ ((locus).locustype == CdbLocusType_Strewn) #define CdbPathLocus_IsSegmentGeneral(locus) \ ((locus).locustype == CdbLocusType_SegmentGeneral) +#define CdbPathLocus_IsSegmentGeneralWorkers(locus) \ + ((locus).locustype == CdbLocusType_SegmentGeneralWorkers) #define CdbPathLocus_IsOuterQuery(locus) \ ((locus).locustype == CdbLocusType_OuterQuery) @@ -217,8 +241,9 @@ typedef struct CdbPathLocus do { \ CdbPathLocus *_locus = (plocus); \ _locus->locustype = (_locustype); \ - _locus->numsegments = (numsegments_); \ - _locus->distkey = NIL; \ + _locus->numsegments = (numsegments_); \ + _locus->parallel_workers = 0; \ + _locus->distkey = NIL; \ } while (0) #define CdbPathLocus_MakeNull(plocus) \ @@ -229,16 +254,42 @@ typedef struct CdbPathLocus CdbPathLocus_MakeSimple((plocus), CdbLocusType_SingleQE, (numsegments_)) #define CdbPathLocus_MakeGeneral(plocus) \ CdbPathLocus_MakeSimple((plocus), CdbLocusType_General, -1) -#define CdbPathLocus_MakeSegmentGeneral(plocus, numsegments_) \ +#define CdbPathLocus_MakeSegmentGeneral(plocus, numsegments_) \ CdbPathLocus_MakeSimple((plocus), CdbLocusType_SegmentGeneral, (numsegments_)) -#define CdbPathLocus_MakeReplicated(plocus, numsegments_) \ - CdbPathLocus_MakeSimple((plocus), CdbLocusType_Replicated, (numsegments_)) -#define CdbPathLocus_MakeHashed(plocus, distkey_, numsegments_) \ +#define CdbPathLocus_MakeSegmentGeneralWorkers(plocus, numsegments_, parallel_workers_) \ + do { \ + CdbPathLocus *_locus = (plocus); \ + _locus->locustype = CdbLocusType_SegmentGeneralWorkers; \ + _locus->numsegments = (numsegments_); \ + _locus->parallel_workers = (parallel_workers_); \ + _locus->distkey = NIL; \ + Assert(cdbpathlocus_is_valid(*_locus)); \ + } while (0) +#define CdbPathLocus_MakeReplicated(plocus, numsegments_, parallel_workers_) \ + do { \ + CdbPathLocus *_locus = (plocus); \ + _locus->locustype = CdbLocusType_Replicated; \ + _locus->numsegments = (numsegments_); \ + _locus->distkey = NIL; \ + _locus->parallel_workers = (parallel_workers_); \ + Assert(cdbpathlocus_is_valid(*_locus)); \ + } while (0) +#define CdbPathLocus_MakeReplicatedWorkers(plocus, numsegments_, parallel_workers_) \ + do { \ + CdbPathLocus *_locus = (plocus); \ + _locus->locustype = CdbLocusType_ReplicatedWorkers; \ + 
_locus->numsegments = (numsegments_); \ + _locus->distkey = NIL; \ + _locus->parallel_workers = (parallel_workers_); \ + Assert(cdbpathlocus_is_valid(*_locus)); \ + } while (0) +#define CdbPathLocus_MakeHashed(plocus, distkey_, numsegments_, parallel_workers_) \ do { \ CdbPathLocus *_locus = (plocus); \ _locus->locustype = CdbLocusType_Hashed; \ _locus->numsegments = (numsegments_); \ _locus->distkey = (distkey_); \ + _locus->parallel_workers = (parallel_workers_); \ Assert(cdbpathlocus_is_valid(*_locus)); \ } while (0) #define CdbPathLocus_MakeHashedOJ(plocus, distkey_, numsegments_) \ @@ -249,14 +300,34 @@ typedef struct CdbPathLocus _locus->distkey = (distkey_); \ Assert(cdbpathlocus_is_valid(*_locus)); \ } while (0) -#define CdbPathLocus_MakeStrewn(plocus, numsegments_) \ - CdbPathLocus_MakeSimple((plocus), CdbLocusType_Strewn, (numsegments_)) +#define CdbPathLocus_MakeHashedWorkers(plocus, distkey_, numsegments_, parallel_workers_) \ + do { \ + CdbPathLocus *_locus = (plocus); \ + _locus->locustype = CdbLocusType_HashedWorkers; \ + _locus->numsegments = (numsegments_); \ + _locus->distkey = (distkey_); \ + _locus->parallel_workers = (parallel_workers_); \ + Assert(cdbpathlocus_is_valid(*_locus)); \ + } while (0) +#define CdbPathLocus_MakeStrewn(plocus, numsegments_, parallel_workers_) \ + do { \ + CdbPathLocus *_locus = (plocus); \ + _locus->locustype = CdbLocusType_Strewn; \ + _locus->numsegments = (numsegments_); \ + _locus->parallel_workers = (parallel_workers_); \ + _locus->distkey = NIL; \ + Assert(cdbpathlocus_is_valid(*_locus)); \ + } while (0) #define CdbPathLocus_MakeOuterQuery(plocus) \ CdbPathLocus_MakeSimple((plocus), CdbLocusType_OuterQuery, -1) +#define CdbPathLocus_HasMultipleWorkers(plocus) \ + (((plocus).parallel_workers > 1) ? 
true : false) + /************************************************************************/ +extern bool cdbpath_distkey_equal(List *a_distkey, List *b_distkey); extern bool cdbpathlocus_equal(CdbPathLocus a, CdbPathLocus b); /************************************************************************/ @@ -266,17 +337,19 @@ extern CdbPathLocus cdbpathlocus_for_insert(struct PlannerInfo *root, struct PathTarget *pathtarget); CdbPathLocus -cdbpathlocus_from_policy(struct PlannerInfo *root, Index rti, struct GpPolicy *policy); +cdbpathlocus_from_policy(struct PlannerInfo *root, Index rti, struct GpPolicy *policy, int parallel_workers); CdbPathLocus cdbpathlocus_from_baserel(struct PlannerInfo *root, - struct RelOptInfo *rel); + struct RelOptInfo *rel, + int parallel_workers); CdbPathLocus cdbpathlocus_from_exprs(struct PlannerInfo *root, struct RelOptInfo *rel, List *hash_on_exprs, List *hash_opclasses, List *hash_sortrefs, - int numsegments); + int numsegments, + int parallel_workers); CdbPathLocus cdbpathlocus_from_subquery(struct PlannerInfo *root, struct RelOptInfo *rel, @@ -285,6 +358,9 @@ cdbpathlocus_from_subquery(struct PlannerInfo *root, CdbPathLocus cdbpathlocus_join(JoinType jointype, CdbPathLocus a, CdbPathLocus b); +CdbPathLocus +cdbpathlocus_parallel_join(JoinType jointype, CdbPathLocus a, CdbPathLocus b, bool parallel_aware); + /************************************************************************/ /* @@ -321,11 +397,12 @@ extern void cdbpathlocus_get_distkey_exprs(CdbPathLocus locus, */ CdbPathLocus cdbpathlocus_pull_above_projection(struct PlannerInfo *root, - CdbPathLocus locus, - Bitmapset *relids, - List *targetlist, - List *newvarlist, - Index newrelid); + CdbPathLocus locus, + Bitmapset *relids, + List *targetlist, + List *newvarlist, + Index newrelid, + bool parallel_aware); /************************************************************************/ diff --git a/src/include/cdb/cdbvarblock.h b/src/include/cdb/cdbvarblock.h index 7e57f99d49e..1dd2224a2e1 100644 --- a/src/include/cdb/cdbvarblock.h +++ b/src/include/cdb/cdbvarblock.h @@ -89,6 +89,7 @@ */ #ifndef CDBVARBLOCK_H #define CDBVARBLOCK_H +#include "cdb/cdbappendonlystoragewrite.h" typedef int32 VarBlockByteLen; typedef int32 VarBlockByteOffset; @@ -282,7 +283,8 @@ extern void VarBlockMakerInit( uint8 *buffer, VarBlockByteLen maxBufferLen, uint8 *tempScratchSpace, - int tempScratchSpaceLen); + int tempScratchSpaceLen, + AppendOnlyStorageWrite *storageWrite); /* * Get a pointer to the next variable-length item so it can @@ -306,7 +308,8 @@ extern int VarBlockMakerItemCount( * The item-offsets array will be added to the end. */ extern VarBlockByteLen VarBlockMakerFinish( - VarBlockMaker *varBlockMaker); + VarBlockMaker *varBlockMaker, + AppendOnlyStorageWrite *storageWrite); // ----------------------------------------------------------------------------- @@ -341,7 +344,9 @@ char *VarBlockGetCheckErrorStr(void); extern void VarBlockReaderInit( VarBlockReader *varBlockReader, uint8 *buffer, - VarBlockByteLen bufferLen); + VarBlockByteLen bufferLen, + bool needDecrypt, + RelFileNode *file_node); /* * Get a pointer to the next variable-length item. 
@@ -367,6 +372,7 @@ extern uint8* VarBlockReaderGetItemPtr( VarBlockByteLen *itemLen); extern VarBlockByteLen VarBlockCollapseToSingleItem( + AppendOnlyStorageWrite *storageWrite, uint8 *target, uint8 *source, int32 sourceLen); diff --git a/src/include/cdb/cdbvars.h b/src/include/cdb/cdbvars.h index 5fca25a008d..d619b381110 100644 --- a/src/include/cdb/cdbvars.h +++ b/src/include/cdb/cdbvars.h @@ -652,6 +652,9 @@ extern bool gp_statistics_use_fkeys; /* Allow user to force tow stage agg */ extern bool gp_eager_two_phase_agg; +/* Force redistribution of inserts into randomly-distributed tables */ +extern bool gp_force_random_redistribution; + /* Analyze tools */ extern int gp_motion_slice_noop; diff --git a/src/include/commands/explain.h b/src/include/commands/explain.h index e413e5cb6fb..77cb96f0cab 100644 --- a/src/include/commands/explain.h +++ b/src/include/commands/explain.h @@ -41,6 +41,7 @@ typedef struct ExplainState bool verbose; /* be verbose */ bool analyze; /* print actual times */ bool costs; /* print estimated costs */ + bool locus; /* print path locus */ bool buffers; /* print buffer usage */ bool dxl; /* CDB: print DXL */ bool slicetable; /* CDB: print slice table */ diff --git a/src/include/commands/taskcmds.h b/src/include/commands/taskcmds.h new file mode 100644 index 00000000000..e3c6a79fa83 --- /dev/null +++ b/src/include/commands/taskcmds.h @@ -0,0 +1,28 @@ +/*------------------------------------------------------------------------- + * + * taskcmds.h + * prototypes for taskcmds.c. + * + * Portions Copyright (c) 2023-Present Hashdata Inc. + * + * IDENTIFICATION + * src/include/commands/taskcmds.h + * + *------------------------------------------------------------------------- + */ + +#ifndef TASKCMDS_H +#define TASKCMDS_H + +#include "catalog/objectaddress.h" +#include "nodes/parsenodes.h" + +extern ObjectAddress DefineTask(ParseState *pstate, CreateTaskStmt *stmt); + +extern ObjectAddress AlterTask(ParseState *pstate, AlterTaskStmt *stmt); + +extern ObjectAddress DropTask(ParseState *pstate, DropTaskStmt *stmt); + +extern void RemoveTaskById(Oid jobid); + +#endif /* TASKCMDS_H */ diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index 67a136d479a..19640e449f9 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -238,6 +238,16 @@ typedef enum VacOptValue * VPgClassStats is used to hold the stats information that are stored in * pg_class. It is sent from QE to QD in a special libpq message , when a * QE runs VACUUM on a table. + * + * relallvisible + * AO/AOCO does not currently have an equivalent to Heap's 'all visible pages', + * relallvisible is always set to 0 in pg_class for AO/AOCO tables. But QEs use + * this field in libpq to report the total number of AO/AOCO segment files when + * vacuuming AO tables. + * Remember to reset relallvisible to 0 after the QD gets the segment file count + * and before updating pg_class. + * See vac_update_relstats_from_list in vacuum.c and + * vac_update_relstats in vacuum_ao.c. 
*/ typedef struct VPgClassStats { diff --git a/src/include/common/cipher.h b/src/include/common/cipher.h new file mode 100644 index 00000000000..da97ca37766 --- /dev/null +++ b/src/include/common/cipher.h @@ -0,0 +1,74 @@ +/*------------------------------------------------------------------------- + * + * cipher.h + * Declarations for cryptographic functions + * + * Portions Copyright (c) 2021, PostgreSQL Global Development Group + * + * src/include/common/cipher.h + * + *------------------------------------------------------------------------- + */ +#ifndef PG_CIPHER_H +#define PG_CIPHER_H + +#ifdef USE_OPENSSL +#include <openssl/conf.h> +#include <openssl/err.h> +#include <openssl/evp.h> +#endif + +/* + * Supported symmetric encryption algorithms. These identifiers are passed + * to the pg_cipher_ctx_create() function, and then actual encryption + * implementations need to initialize their context of the given encryption + * algorithm. + */ +#define PG_CIPHER_AES_GCM 0 +#define PG_CIPHER_AES_CTR 1 +#define PG_CIPHER_AES_KWP 2 +#define PG_MAX_CIPHER_ID 2 + +/* AES128/192/256 various length definitions */ +#define PG_AES128_KEY_LEN (128 / 8) +#define PG_AES192_KEY_LEN (192 / 8) +#define PG_AES256_KEY_LEN (256 / 8) + +/* + * The encrypted data is a series of blocks of the cipher block size. The + * initialization vector (IV) is the same size as the cipher block. + */ +#define PG_AES_BLOCK_SIZE 16 +#define PG_AES_IV_SIZE (PG_AES_BLOCK_SIZE) + +#ifdef USE_OPENSSL +typedef EVP_CIPHER_CTX PgCipherCtx; +#else +typedef void PgCipherCtx; +#endif + +extern PgCipherCtx *pg_cipher_ctx_create(int cipher, unsigned char *key, int klen, + bool enc); + +extern void pg_cipher_ctx_free(PgCipherCtx *ctx); +extern bool pg_cipher_encrypt(PgCipherCtx *ctx, int cipher, + const unsigned char *plaintext, const int inlen, + unsigned char *ciphertext, int *outlen, + const unsigned char *iv, const int ivlen, + unsigned char *tag, const int taglen); +extern bool pg_cipher_decrypt(PgCipherCtx *ctx, const int cipher, + const unsigned char *ciphertext, const int inlen, + unsigned char *plaintext, int *outlen, + const unsigned char *iv, const int ivlen, + unsigned char *intag, const int taglen); + +extern bool pg_cipher_keywrap(PgCipherCtx *ctx, + const unsigned char *plaintext, const int inlen, + unsigned char *ciphertext, int *outlen); +extern bool pg_cipher_keyunwrap(PgCipherCtx *ctx, + const unsigned char *ciphertext, const int inlen, + unsigned char *plaintext, int *outlen); + +extern int pg_cipher_blocksize(PgCipherCtx *ctx); + +#endif /* PG_CIPHER_H */ diff --git a/src/include/common/kmgr_utils.h b/src/include/common/kmgr_utils.h new file mode 100644 index 00000000000..a07b70c253a --- /dev/null +++ b/src/include/common/kmgr_utils.h @@ -0,0 +1,96 @@ +/*------------------------------------------------------------------------- + * + * kmgr_utils.h + * Declarations for utility functions for file encryption keys + * + * Portions Copyright (c) 2021, PostgreSQL Global Development Group + * + * src/include/common/kmgr_utils.h + * + *------------------------------------------------------------------------- + */ +#ifndef KMGR_UTILS_H +#define KMGR_UTILS_H + +#include "common/cipher.h" + +/* Current version number */ +#define KMGR_VERSION 1 + +/* + * Directories where cluster file encryption keys reside within PGDATA. 
+ */ +#define KMGR_DIR "pg_cryptokeys" +#define KMGR_DIR_PID KMGR_DIR"/pg_alterckey.pid" +#define LIVE_KMGR_DIR KMGR_DIR"/live" +/* used during cluster key rotation */ +#define NEW_KMGR_DIR KMGR_DIR"/new" +#define OLD_KMGR_DIR KMGR_DIR"/old" + +/* CryptoKey file name is the key's id */ +#define CryptoKeyFilePath(path, dir, id) \ + snprintf((path), MAXPGPATH, "%s/%s.wkey", (dir), (wkey_filenames[id])) + +/* + * Identifiers of internal keys. + */ +#define KMGR_KEY_ID_REL 0 +#define KMGR_KEY_ID_WAL 1 +#define KMGR_NUM_DATA_KEYS 2 + +/* We always, today, use a 256-bit AES key. */ +#define KMGR_CLUSTER_KEY_LEN PG_AES256_KEY_LEN + +/* double for hex format, plus some for spaces, \r,\n, and null byte */ +#define ALLOC_KMGR_CLUSTER_KEY_LEN (KMGR_CLUSTER_KEY_LEN * 2 + 10 + 2 + 1) + +/* Maximum length of key the key manager can store */ +#define KMGR_MAX_KEY_LEN 256 +#define KMGR_MAX_KEY_LEN_BYTES (KMGR_MAX_KEY_LEN / 8) + + +/* + * Cryptographic key data structure. + * + * This is the structure we use to write out the encrypted keys and + * which we use to store the keys in shared memory. + * + * Note that wrapping this structure results in an encrypted byte + * string which is what we actually write and then read back in. + * + * klen is the key length in bytes + * key is the encryption key of klen length + */ +typedef struct CryptoKey +{ + int klen; /* key length in bytes */ + unsigned char key[KMGR_MAX_KEY_LEN_BYTES]; +} CryptoKey; + +/* Encryption method array */ +typedef struct encryption_method +{ + const char *name; + const int bit_length; +} encryption_method; + +#define NUM_ENCRYPTION_METHODS 5 +#define DISABLED_ENCRYPTION_METHOD 0 +#define DEFAULT_ENABLED_ENCRYPTION_METHOD 1 +/* generated by: echo cloudberry | sha256sum | cut -d' ' -f1 */ +#define DEFAULT_CLUSTER_KEY_COMMAND "echo 6f1c78b6722ae3f3b65e038a30f087d22e2a47f84578d5913e06ef5c871ae4c6" + +extern encryption_method encryption_methods[NUM_ENCRYPTION_METHODS]; +extern char *wkey_filenames[KMGR_NUM_DATA_KEYS]; + +extern bool kmgr_wrap_data_key(PgCipherCtx *ctx, CryptoKey *in, unsigned char *out, int *outlen); +extern bool kmgr_unwrap_data_key(PgCipherCtx *ctx, unsigned char *in, int inlen, CryptoKey *out); +extern bool kmgr_verify_cluster_key(unsigned char *cluster_key, + unsigned char **in_keys, int *klens, CryptoKey *out_keys); +extern int kmgr_run_cluster_key_command(char *cluster_key_command, + char *buf, int size, char *dir, + int terminal_fd); +extern void kmgr_read_wrapped_data_keys(const char *path, unsigned char **keys, + int *key_lens); + +#endif /* KMGR_UTILS_H */ diff --git a/src/include/crypto/bufenc.h b/src/include/crypto/bufenc.h new file mode 100644 index 00000000000..da371b7334d --- /dev/null +++ b/src/include/crypto/bufenc.h @@ -0,0 +1,34 @@ +/*------------------------------------------------------------------------- + * + * bufenc.h + * + * Portions Copyright (c) 2021, PostgreSQL Global Development Group + * + * src/include/crypto/bufenc.h + * + *------------------------------------------------------------------------- + */ +#ifndef BUFENC_H +#define BUFENC_H + +#include "storage/bufmgr.h" +#include "crypto/kmgr.h" + +/* Cluster encryption encrypts only main forks */ +#define PageNeedsToBeEncrypted(forknum) \ + (FileEncryptionEnabled && (forknum) == MAIN_FORKNUM) + + +extern void InitializeBufferEncryption(void); +extern void EncryptPage(Page page, + BlockNumber blkno); +extern void DecryptPage(Page page, + BlockNumber blkno); +extern void +EncryptAOBLock(unsigned char *data_buf, const int buf_len, + RelFileNode 
*file_node); +extern void +DecryptAOBlock(unsigned char *data_buf, const int buf_len, + RelFileNode *file_node); + +#endif /* BUFENC_H */ diff --git a/src/include/crypto/kmgr.h b/src/include/crypto/kmgr.h new file mode 100644 index 00000000000..45103666093 --- /dev/null +++ b/src/include/crypto/kmgr.h @@ -0,0 +1,27 @@ +/*------------------------------------------------------------------------- + * + * kmgr.h + * + * Portions Copyright (c) 2021, PostgreSQL Global Development Group + * + * src/include/crypto/kmgr.h + * + *------------------------------------------------------------------------- + */ +#ifndef KMGR_H +#define KMGR_H + +#include "common/kmgr_utils.h" + +/* GUC parameters */ +extern char *cluster_key_command; +extern bool tde_force_switch; + +extern Size KmgrShmemSize(void); +extern void KmgrShmemInit(void); +extern void BootStrapKmgr(void); +extern void InitializeKmgr(void); +extern const CryptoKey *KmgrGetKey(int id); +extern bool CheckIsSM4Method(void); + +#endif /* KMGR_H */ diff --git a/src/include/crypto/sm4.h b/src/include/crypto/sm4.h new file mode 100644 index 00000000000..6bd0ed5f421 --- /dev/null +++ b/src/include/crypto/sm4.h @@ -0,0 +1,62 @@ +#ifndef _SM4_H_ +#define _SM4_H_ +#include "c.h" + +typedef signed char int8_t; +typedef unsigned char uint8_t; +typedef short int16_t; +typedef unsigned short uint16_t; +typedef int int32_t; +typedef unsigned int uint32_t; +typedef long int64_t; +typedef unsigned long int uint64_t; + +# define SM4_ENCRYPT 1 +# define SM4_DECRYPT 0 + +# define SM4_BLOCK_SIZE 16 +# define SM4_KEY_SCHEDULE 32 + +/* ossl_inline: portable inline definition usable in public headers */ +# if !defined(inline) && !defined(__cplusplus) +# if defined(__STDC_VERSION__) && __STDC_VERSION__>=199901L + /* just use inline */ +# define ossl_inline inline +# elif defined(__GNUC__) && __GNUC__>=2 +# define ossl_inline __inline__ +# elif defined(_MSC_VER) + /* + * Visual Studio: inline is available in C++ only, however + * __inline is available for C, see + * http://msdn.microsoft.com/en-us/library/z8y1yy88.aspx + */ +# define ossl_inline __inline +# else +# define ossl_inline +# endif +# else +# define ossl_inline inline +# endif + +typedef struct SM4_KEY_st { + uint32_t rk[SM4_KEY_SCHEDULE]; +} SM4_KEY; + +typedef struct _sm4_ctx +{ + uint32_t k_len; + int encrypt; + SM4_KEY rkey; +} sm4_ctx; + +int ossl_sm4_set_key(const uint8_t *key, SM4_KEY *ks); + +void ossl_sm4_encrypt(const uint8_t *in, uint8_t *out, const SM4_KEY *ks); + +void ossl_sm4_decrypt(const uint8_t *in, uint8_t *out, const SM4_KEY *ks); +void sm4_setkey_enc(sm4_ctx *ctx, uint8_t* key); +void sm4_setkey_dec(sm4_ctx *ctx, uint8_t* key); +int sm4_ofb_cipher(sm4_ctx *ctx, unsigned char *out, + const unsigned char *in, size_t input_len, + unsigned char ivec[16]); +#endif /* _SM4_H_ */ \ No newline at end of file diff --git a/src/include/executor/execdesc.h b/src/include/executor/execdesc.h index cbf3e95556c..36bd4bbde40 100644 --- a/src/include/executor/execdesc.h +++ b/src/include/executor/execdesc.h @@ -136,6 +136,12 @@ typedef struct ExecSlice List *primaryProcesses; /* A bitmap to identify which QE should execute this slice */ Bitmapset *processesMap; + + /* + * Flag for whether to execute local slice plan in mpp parallel mode. 
+ */ + bool useMppParallelMode; + int parallel_workers; } ExecSlice; /* diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h index 1e41be9e664..503cba7f11f 100644 --- a/src/include/executor/hashjoin.h +++ b/src/include/executor/hashjoin.h @@ -284,6 +284,8 @@ typedef struct ParallelHashJoinState Barrier build_barrier; /* synchronization for the build phases */ Barrier grow_batches_barrier; Barrier grow_buckets_barrier; + Barrier sync_barrier; + Barrier batch0_barrier; pg_atomic_uint32 distributor; /* counter for load balancing */ SharedFileSet fileset; /* space for shared temporary files */ diff --git a/src/include/executor/nodeAppend.h b/src/include/executor/nodeAppend.h index db6709b3176..6dbcebb1c3e 100644 --- a/src/include/executor/nodeAppend.h +++ b/src/include/executor/nodeAppend.h @@ -22,7 +22,9 @@ extern void ExecEndAppend(AppendState *node); extern void ExecReScanAppend(AppendState *node); extern void ExecSquelchAppend(AppendState *node); extern void ExecAppendEstimate(AppendState *node, ParallelContext *pcxt); +extern void GpAppendEstimate(AppendState *node, shm_toc_estimator *estimator); extern void ExecAppendInitializeDSM(AppendState *node, ParallelContext *pcxt); +extern void GpAppendInitializeLWLock(ParallelAppendState *pstate); extern void ExecAppendReInitializeDSM(AppendState *node, ParallelContext *pcxt); extern void ExecAppendInitializeWorker(AppendState *node, ParallelWorkerContext *pwcxt); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 74295e07092..e582e6e1647 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -27,6 +27,7 @@ #include "nodes/plannodes.h" #include "nodes/tidbitmap.h" #include "partitioning/partdefs.h" +#include "storage/barrier.h" #include "storage/condition_variable.h" #include "utils/hsearch.h" #include "utils/queryenvironment.h" @@ -702,6 +703,11 @@ typedef struct EState */ int currentSliceId; + /* + * Flag for whether to execute local slice plan in mpp parallel mode. + */ + bool useMppParallelMode; + /* Should the executor skip past the alien plan nodes */ bool eliminateAliens; @@ -2228,6 +2234,7 @@ typedef struct HashJoinState bool reuse_hashtable; /* Do we need to preserve hash table to support rescan */ bool delayEagerFree; /* is safe to free memory used by this node, * when this node has outputted its last row? */ + int worker_id; /* worker id for this process */ } HashJoinState; @@ -2887,6 +2894,8 @@ typedef struct HashState /* Parallel hash state. 
*/ struct ParallelHashJoinState *parallel_state; + + Barrier *sync_barrier; } HashState; /* ---------------- @@ -3035,6 +3044,7 @@ typedef struct MotionState bool sentEndOfStream; /* set when end-of-stream has successfully been sent */ List *hashExprs; /* state struct used for evaluating the hash expressions */ struct CdbHash *cdbhash; /* hash api object */ + struct CdbHash *cdbhashworkers; /* hash api object for parallel workers */ int numHashSegments; /* number of segments to use when calculating hash */ /* For Motion recv */ @@ -3069,6 +3079,8 @@ typedef struct MotionState Oid *outputFunArray; /* output functions for each column (debug only) */ int numInputSegs; /* the number of segments on the sending slice */ + + int parallel_workers; /* parallel workers of motion */ } MotionState; /* ---------------- diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 4154cbfd40a..cf73922db89 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -515,6 +515,9 @@ typedef enum NodeTag T_AlterCollationStmt, T_CallStmt, T_AlterStatsStmt, + T_CreateTaskStmt, + T_AlterTaskStmt, + T_DropTaskStmt, /* GPDB additions */ T_PartitionBy, diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 2068656ac8d..d9c3324e343 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -2736,6 +2736,36 @@ typedef struct AlterTableMoveAllStmt bool nowait; } AlterTableMoveAllStmt; +/* ---------------------- + * Create/Alter/Drop Task Statements + * ---------------------- + */ + +typedef struct CreateTaskStmt +{ + NodeTag type; + char *taskname; + char *schedule; + char *sql; /* the sql command */ + List *options; + bool if_not_exists; /* just do nothing if it already exists? */ +} CreateTaskStmt; + +typedef struct AlterTaskStmt +{ + NodeTag type; + char *taskname; + List *options; + bool missing_ok; /* skip error if missing? */ +} AlterTaskStmt; + +typedef struct DropTaskStmt +{ + NodeTag type; + char *taskname; + bool missing_ok; /* skip error if missing? */ +} DropTaskStmt; + /* ---------------------- * Create/Alter Extension Statements * ---------------------- @@ -3287,6 +3317,8 @@ typedef struct SecLabelStmt * of the query are always postponed until execution. * ---------------------- */ +#define CURSOR_OPT_PARALLEL_NOT_OK 0x0000 /* parallel mode is not OK */ + #define CURSOR_OPT_BINARY 0x0001 /* BINARY */ #define CURSOR_OPT_SCROLL 0x0002 /* SCROLL explicitly given */ #define CURSOR_OPT_NO_SCROLL 0x0004 /* NO SCROLL explicitly given */ diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index a615518441c..27153a3b48d 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -1498,6 +1498,8 @@ typedef struct Path bool motionHazard; /* true => path contains a CdbMotion operator without a slackening operator above it */ + bool barrierHazard; /* true => path contains sync barrier in Parallel Hash + which should be executed in none or all workers. 
*/ bool rescannable; /* CDB: true => path can accept ExecRescan call */ List *pathkeys; /* sort ordering of path's output */ @@ -2092,6 +2094,7 @@ typedef struct HashPath List *path_hashclauses; /* join clauses used for hashing */ int num_batches; /* number of batches expected */ double inner_rows_total; /* total inner rows expected */ + bool batch0_barrier; } HashPath; /* diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 3a118982a3f..b3e48ff0c7d 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -240,6 +240,7 @@ typedef struct PlanSlice /* # of segments in the gang, for PRIMARY_READER/WRITER slices */ int numsegments; + int parallel_workers; /* segment to execute on, for SINGLETON_READER slices */ int segindex; @@ -319,6 +320,16 @@ typedef struct Plan * Set during parallelization. */ + /* + * GPDB parallel + * Additional fields here are used to show the locus type and + * parallel workers of a plan node. + * The flow field carries the locus info only in top-level Plan nodes; + * it is not set for other nodes. + */ + CdbLocusType locustype; + int parallel; /* number of parallel workers for this plan, if any */ + /** * How much memory (in KB) should be used to execute this plan node? */ @@ -1048,6 +1059,7 @@ typedef struct HashJoin */ List *hashkeys; List *hashqualclauses; + bool batch0_barrier; } HashJoin; #define SHARE_ID_NOT_SHARED (-1) @@ -1363,6 +1375,7 @@ typedef struct Hash /* all other info is in the parent HashJoin node */ double rows_total; /* estimate total rows if parallel_aware */ bool rescannable; /* CDB: true => save rows for rescan */ + bool sync_barrier; } Hash; /* ---------------- @@ -1438,6 +1451,7 @@ typedef enum MotionType MOTIONTYPE_GATHER_SINGLE, /* Execute subplan on N nodes, but only send the tuples from one */ MOTIONTYPE_HASH, /* Use hashing to select a segindex destination */ MOTIONTYPE_BROADCAST, /* Send tuples from one sender to a fixed set of segindexes */ + MOTIONTYPE_PARALLEL_BROADCAST, /* */ MOTIONTYPE_EXPLICIT, /* Send tuples to the segment explicitly specified in their segid column */ MOTIONTYPE_OUTER_QUERY /* Gather or Broadcast to outer query's slice, don't know which one yet */ } MotionType; diff --git a/src/include/nodes/primnodes.h b/src/include/nodes/primnodes.h index 49a8bf97938..57fb269564e 100644 --- a/src/include/nodes/primnodes.h +++ b/src/include/nodes/primnodes.h @@ -142,6 +142,7 @@ typedef struct RefreshClause bool concurrent; /* allow concurrent access? 
*/ bool skipData; RangeVar *relation; /* relation to insert into */ + bool intoAO; /* is relation to insert into AO/AOCS */ } RefreshClause; diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 99a85f23dec..c62d362bb3f 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -215,7 +215,8 @@ extern Path *create_hashjoin_path(PlannerInfo *root, List *restrict_clauses, Relids required_outer, List *redistribution_clauses, /*CDB*/ - List *hashclauses); + List *hashclauses, + bool uninterested_broadcast); /* GPDB parallel */ extern ProjectionPath *create_projection_path(PlannerInfo *root, RelOptInfo *rel, diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index a92ef140391..417bcfb200e 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -60,13 +60,14 @@ extern void generate_grouping_paths(PlannerInfo *root, RelAggInfo *agg_info); extern void generate_useful_gather_paths(PlannerInfo *root, RelOptInfo *rel, bool override_rows); -extern int compute_parallel_worker(RelOptInfo *rel, double heap_pages, +extern int compute_parallel_worker(PlannerInfo *root, RelOptInfo *rel, double heap_pages, double index_pages, int max_workers); extern void create_partial_bitmap_paths(PlannerInfo *root, RelOptInfo *rel, Path *bitmapqual); extern void generate_partitionwise_join_paths(PlannerInfo *root, RelOptInfo *rel); - +extern void partial_bring_to_outer_query(PlannerInfo *root, RelOptInfo *rel, + PathTarget *target, List *outer_quals); #ifdef OPTIMIZER_DEBUG extern void debug_print_rel(PlannerInfo *root, RelOptInfo *rel); #endif diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h index ec090419fdc..fc275952c71 100644 --- a/src/include/parser/kwlist.h +++ b/src/include/parser/kwlist.h @@ -274,6 +274,7 @@ PG_KEYWORD("localtimestamp", LOCALTIMESTAMP, RESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("location", LOCATION, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("lock", LOCK_P, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("locked", LOCKED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("locus", LOCUS, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("log", LOG_P, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) PG_KEYWORD("logged", LOGGED, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("mapping", MAPPING, UNRESERVED_KEYWORD, BARE_LABEL) @@ -417,6 +418,7 @@ PG_KEYWORD("rows", ROWS, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("rule", RULE, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("savepoint", SAVEPOINT, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("scatter", SCATTER, RESERVED_KEYWORD, AS_LABEL) /* GPDB */ +PG_KEYWORD("schedule", SCHEDULE, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("schema", SCHEMA, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("schemas", SCHEMAS, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("scroll", SCROLL, UNRESERVED_KEYWORD, BARE_LABEL) @@ -467,6 +469,7 @@ PG_KEYWORD("table", TABLE, RESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("tables", TABLES, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("tablesample", TABLESAMPLE, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) PG_KEYWORD("tablespace", TABLESPACE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("task", TASK, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("temp", TEMP, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("template", TEMPLATE, UNRESERVED_KEYWORD, BARE_LABEL) PG_KEYWORD("temporary", TEMPORARY, UNRESERVED_KEYWORD, BARE_LABEL) diff --git a/src/include/postmaster/bgworker.h b/src/include/postmaster/bgworker.h index cc589921f1b..14995e73f5e 100644 --- 
a/src/include/postmaster/bgworker.h +++ b/src/include/postmaster/bgworker.h @@ -127,6 +127,7 @@ extern bool RegisterDynamicBackgroundWorker(BackgroundWorker *worker, /* Query the status of a bgworker */ extern BgwHandleStatus GetBackgroundWorkerPid(BackgroundWorkerHandle *handle, pid_t *pidp); + extern BgwHandleStatus WaitForBackgroundWorkerStartup(BackgroundWorkerHandle *handle, pid_t *pid); extern BgwHandleStatus WaitForBackgroundWorkerShutdown(BackgroundWorkerHandle *); diff --git a/src/include/postmaster/postmaster.h b/src/include/postmaster/postmaster.h index 3b7ecb61804..533093b4e88 100644 --- a/src/include/postmaster/postmaster.h +++ b/src/include/postmaster/postmaster.h @@ -32,6 +32,8 @@ extern char *bonjour_name; extern bool restart_after_crash; extern bool remove_temp_files_after_crash; +extern int terminal_fd; + #ifdef WIN32 extern HANDLE PostmasterHandle; #else @@ -99,6 +101,6 @@ extern bool amAuxiliaryBgWorker(void); */ #define MAX_BACKENDS 0x3FFFF -#define MaxPMAuxProc (3 + IC_PROXY_NUM_BGWORKER + FTS_NUM_BGWORKER) +#define MaxPMAuxProc (4 + IC_PROXY_NUM_BGWORKER + FTS_NUM_BGWORKER) #endif /* _POSTMASTER_H */ diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index d17eae33124..a8d26b2482b 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -15,6 +15,7 @@ #define BUFPAGE_H #include "access/xlogdefs.h" +#include "common/relpath.h" #include "storage/block.h" #include "storage/item.h" #include "storage/off.h" @@ -165,6 +166,8 @@ typedef struct PageHeaderData } PageHeaderData; typedef PageHeaderData *PageHeader; +#define PageEncryptOffset offsetof(PageHeaderData, pd_special) +#define SizeOfPageEncryption (BLCKSZ - PageEncryptOffset) /* * pd_flags contains the following flag bits. Undefined bits are initialized @@ -458,7 +461,7 @@ do { \ ((is_heap) ? 
PAI_IS_HEAP : 0)) #define PageIsVerified(page, blkno) \ - PageIsVerifiedExtended(page, blkno, \ + PageIsVerifiedExtended(page, MAIN_FORKNUM, blkno, \ PIV_LOG_WARNING | PIV_REPORT_STAT) /* @@ -472,7 +475,9 @@ StaticAssertDecl(BLCKSZ == ((BLCKSZ / sizeof(size_t)) * sizeof(size_t)), "BLCKSZ has to be a multiple of sizeof(size_t)"); extern void PageInit(Page page, Size pageSize, Size specialSize); -extern bool PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags); +extern bool PageIsVerifiedExtended(Page page, ForkNumber forknum, + BlockNumber blkno, + int flags); extern OffsetNumber PageAddItemExtended(Page page, Item item, Size size, OffsetNumber offsetNumber, int flags); extern Page PageGetTempPage(Page page); @@ -492,5 +497,11 @@ extern bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum, Item newtup, Size newsize); extern char *PageSetChecksumCopy(Page page, BlockNumber blkno); extern void PageSetChecksumInplace(Page page, BlockNumber blkno); +extern char *PageEncryptCopy(Page page, ForkNumber forknum, + BlockNumber blkno); +extern void PageEncryptInplace(Page page, ForkNumber forknum, + BlockNumber blkno); +extern void PageDecryptInplace(Page page, ForkNumber forknum, + BlockNumber blkno); #endif /* BUFPAGE_H */ diff --git a/src/include/storage/copydir.h b/src/include/storage/copydir.h index 2c3936b0dad..669c04d5812 100644 --- a/src/include/storage/copydir.h +++ b/src/include/storage/copydir.h @@ -14,6 +14,6 @@ #define COPYDIR_H extern void copydir(char *fromdir, char *todir, bool recurse); -extern void copy_file(char *fromfile, char *tofile); +extern void copy_file(char *fromfile, char *tofile, bool encrypt_init_file); #endif /* COPYDIR_H */ diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h index c9ba3c278f7..ec2fc93a67b 100644 --- a/src/include/storage/lock.h +++ b/src/include/storage/lock.h @@ -560,6 +560,13 @@ typedef enum * worker */ } DeadLockState; + +typedef bool (*ActivateLock_hook_type) (const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock, bool dontWait); +extern PGDLLIMPORT ActivateLock_hook_type ActivateLock_hook; + +typedef void (*DeactivateLock_hook_type) (const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock, bool releaseAll); +extern PGDLLIMPORT DeactivateLock_hook_type DeactivateLock_hook; + /* * The lockmgr's shared hash tables are partitioned to reduce contention. 
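
The ActivateLock_hook and DeactivateLock_hook types added to lock.h above follow the usual PostgreSQL hook convention: a loadable module saves the previous hook value and chains to it. A minimal sketch of installing the activate hook, assuming conventional hook semantics (the module and function names below, and the true fall-through return, are hypothetical and not part of this patch):

    /* Hypothetical module sketch: observing lock acquisition through the
     * new hook.  Assumes returning true lets acquisition proceed normally. */
    #include "postgres.h"
    #include "storage/lock.h"

    static ActivateLock_hook_type prev_activate_hook = NULL;

    static bool
    my_activate_lock(const LOCKTAG *locktag, LOCKMODE lockmode,
                     bool sessionLock, bool dontWait)
    {
        elog(DEBUG1, "acquiring lock, mode %d, dontWait %d",
             lockmode, (int) dontWait);
        if (prev_activate_hook)
            return prev_activate_hook(locktag, lockmode, sessionLock, dontWait);
        return true;            /* assumed: proceed with normal locking */
    }

    void
    _PG_init(void)
    {
        prev_activate_hook = ActivateLock_hook;
        ActivateLock_hook = my_activate_lock;
    }
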
* To determine which partition a given locktag belongs to, compute the tag's diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index b7eaed07351..06f6e9f81ad 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -82,6 +82,9 @@ extern PGDLLIMPORT int NamedLWLockTrancheRequests; /* Number of partitions of the shared buffer mapping hashtable */ #define NUM_BUFFER_PARTITIONS 128 +/* Number of partitions of the shared parallel dsm entry hashtable */ +#define NUM_PARALLEL_DSM_PARTITIONS 128 + /* Number of partitions the shared lock tables are divided into */ #define LOG2_NUM_LOCK_PARTITIONS 4 #define NUM_LOCK_PARTITIONS (1 << LOG2_NUM_LOCK_PARTITIONS) diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 4ec5191370c..b0ceda24d37 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -502,4 +502,20 @@ extern void ResLockWaitCancel(void); extern bool ProcCanSetMppSessionId(void); extern void ProcNewMppSessionId(int *newSessionId); +/* + * session related hook types + */ +typedef int (*AllocSessionId_hook_type) (bool reset); +extern PGDLLIMPORT AllocSessionId_hook_type AllocSessionId_hook; + +typedef void (*NoticeSessionDB_hook_type) (Oid databaseid); +extern PGDLLIMPORT NoticeSessionDB_hook_type NoticeSessionDB_hook; + +typedef bool (*CountDBSession_hook_type) (Oid databaseid); +extern PGDLLIMPORT CountDBSession_hook_type CountDBSession_hook; + + +typedef void (*AuxProcCallbackFunction) (volatile PGPROC *proc, void *args); +extern void LoopAuxProc(AuxProcCallbackFunction func, void *args); + #endif /* _PROC_H_ */ diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index a95434b3e83..48991d1dd00 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -124,4 +124,7 @@ extern DistributedTransactionId LocalXidGetDistributedXid(TransactionId xid); extern int GetSessionIdByPid(int pid); extern void ResGroupSignalMoveQuery(int sessionId, void *slot, Oid groupId); +typedef void (*BackendProcCallbackFunction) (volatile PGPROC *proc, void *args); +extern void LoopBackendProc(BackendProcCallbackFunction func, void *args); + #endif /* PROCARRAY_H */ diff --git a/src/include/storage/shmem.h b/src/include/storage/shmem.h index 31024e5a50f..10eb25b881c 100644 --- a/src/include/storage/shmem.h +++ b/src/include/storage/shmem.h @@ -52,7 +52,7 @@ extern void RequestAddinShmemSpace(Size size); /* max size of data structure string name */ #define SHMEM_INDEX_KEYSIZE (48) /* estimated size of the shmem index table (not a hard limit) */ -#define SHMEM_INDEX_SIZE (64) +#define SHMEM_INDEX_SIZE (128) /* this is a hash bucket in the shmem index table */ typedef struct diff --git a/src/include/task/bitstring.h b/src/include/task/bitstring.h new file mode 100644 index 00000000000..8a700662baa --- /dev/null +++ b/src/include/task/bitstring.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 1989 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Paul Vixie. + * + * Redistribution and use in source and binary forms are permitted + * provided that the above copyright notice and this paragraph are + * duplicated in all such forms and that any documentation, + * advertising materials, and other materials related to such + * distribution and use acknowledge that the software was developed + * by the University of California, Berkeley. 
The name of the + * University may not be used to endorse or promote products derived + * from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + * @(#)bitstring.h 5.2 (Berkeley) 4/4/90 + */ + +typedef unsigned char bitstr_t; + +/* internal macros */ +/* byte of the bitstring bit is in */ +#define _bit_byte(bit) \ + ((bit) >> 3) + +/* mask for the bit within its byte */ +#define _bit_mask(bit) \ + (1 << ((bit)&0x7)) + +/* external macros */ +/* bytes in a bitstring of nbits bits */ +#define bitstr_size(nbits) \ + ((((nbits) - 1) >> 3) + 1) + +/* allocate a bitstring */ +#define bit_alloc(nbits) \ + (bitstr_t *)calloc(1, \ + (unsigned int)bitstr_size(nbits) * sizeof(bitstr_t)) + +/* allocate a bitstring on the stack */ +#define bit_decl(name, nbits) \ + (name)[bitstr_size(nbits)] + +/* is bit N of bitstring name set? */ +#define bit_test(name, bit) \ + ((name)[_bit_byte(bit)] & _bit_mask(bit)) + +/* set bit N of bitstring name */ +#define bit_set(name, bit) \ + (name)[_bit_byte(bit)] |= _bit_mask(bit) + +/* clear bit N of bitstring name */ +#define bit_clear(name, bit) \ + (name)[_bit_byte(bit)] &= ~_bit_mask(bit) + +/* clear bits start ... stop in bitstring */ +#define bit_nclear(name, start, stop) { \ + register bitstr_t *_name = name; \ + register int _start = start, _stop = stop; \ + register int _startbyte = _bit_byte(_start); \ + register int _stopbyte = _bit_byte(_stop); \ + if (_startbyte == _stopbyte) { \ + _name[_startbyte] &= ((0xff >> (8 - (_start&0x7))) | \ + (0xff << ((_stop&0x7) + 1))); \ + } else { \ + _name[_startbyte] &= 0xff >> (8 - (_start&0x7)); \ + while (++_startbyte < _stopbyte) \ + _name[_startbyte] = 0; \ + _name[_stopbyte] &= 0xff << ((_stop&0x7) + 1); \ + } \ +} + +/* set bits start ...
stop in bitstring */ +#define bit_nset(name, start, stop) { \ + register bitstr_t *_name = name; \ + register int _start = start, _stop = stop; \ + register int _startbyte = _bit_byte(_start); \ + register int _stopbyte = _bit_byte(_stop); \ + if (_startbyte == _stopbyte) { \ + _name[_startbyte] |= ((0xff << (_start&0x7)) & \ + (0xff >> (7 - (_stop&0x7)))); \ + } else { \ + _name[_startbyte] |= 0xff << ((_start)&0x7); \ + while (++_startbyte < _stopbyte) \ + _name[_startbyte] = 0xff; \ + _name[_stopbyte] |= 0xff >> (7 - (_stop&0x7)); \ + } \ +} + +/* find first bit clear in name */ +#define bit_ffc(name, nbits, value) { \ + register bitstr_t *_name = name; \ + register int _byte, _nbits = nbits; \ + register int _stopbyte = _bit_byte(_nbits), _value = -1; \ + for (_byte = 0; _byte <= _stopbyte; ++_byte) \ + if (_name[_byte] != 0xff) { \ + _value = _byte << 3; \ + for (_stopbyte = _name[_byte]; (_stopbyte&0x1); \ + ++_value, _stopbyte >>= 1); \ + break; \ + } \ + *(value) = _value; \ +} + +/* find first bit set in name */ +#define bit_ffs(name, nbits, value) { \ + register bitstr_t *_name = name; \ + register int _byte, _nbits = nbits; \ + register int _stopbyte = _bit_byte(_nbits), _value = -1; \ + for (_byte = 0; _byte <= _stopbyte; ++_byte) \ + if (_name[_byte]) { \ + _value = _byte << 3; \ + for (_stopbyte = _name[_byte]; !(_stopbyte&0x1); \ + ++_value, _stopbyte >>= 1); \ + break; \ + } \ + *(value) = _value; \ +} diff --git a/src/include/task/cron.h b/src/include/task/cron.h new file mode 100644 index 00000000000..2ed1a1f7f64 --- /dev/null +++ b/src/include/task/cron.h @@ -0,0 +1,296 @@ +/* + * Copyright 1988,1990,1993,1994 by Paul Vixie + * All rights reserved + * + * Distribute freely, except: don't remove my name from the source or + * documentation (don't take credit for my work), mark your changes (don't + * get me blamed for your possible bugs), don't alter or remove this + * notice. May be sold if buildable source is provided to buyer. No + * warrantee of any kind, express or implied, is included with this + * software; use at your own risk, responsibility for damages (if any) to + * anyone resulting from the use of this software rests entirely with the + * user. + * + * Send bug reports, bug fixes, enhancements, requests, flames, etc., and + * I'll try to keep a version up to date. I can be reached as follows: + * Paul Vixie uunet!decwrl!vixie!paul + */ + +/* + * cron.h - header for vixie's cron + * + * $Id: cron.h,v 2.10 1994/01/15 20:43:43 vixie Exp $ + * + * marco 07nov16 [remove code not needed by pg_cron] + * marco 04sep16 [integrate into pg_cron] + * vix 14nov88 [rest of log is in RCS] + * vix 14jan87 [0 or 7 can be sunday; thanks, mwm@berkeley] + * vix 30dec86 [written] + */ + +#include +#include + +#include +#include +#if SYS_TIME_H +# include <sys/time.h> +#else +# include <time.h> +#endif + +/* these are really immutable, and are + * defined for symbolic convenience only + * TRUE, FALSE, and ERR must be distinct + * ERR must be < OK. + */ +#define TRUE 1 +#define FALSE 0 +/* system calls return this on success */ +#define OK 0 +/* or this on error */ +#define ERR (-1) + +/* turn this on to get '-x' code */ +#ifndef DEBUGGING +#define DEBUGGING FALSE +#endif + +#define READ_PIPE 0 /* which end of a pipe pair do you read? */ +#define WRITE_PIPE 1 /* or write to? */ +#define STDIN 0 /* what is stdin's file descriptor? */ +#define STDOUT 1 /* stdout's? */ +#define STDERR 2 /* stderr's?
*/ +#define ERROR_EXIT 1 /* exit() with this will scare the shell */ +#define OK_EXIT 0 /* exit() with this is considered 'normal' */ +#define MAX_FNAME 100 /* max length of internally generated fn */ +#define MAX_COMMAND 1000 /* max length of internally generated cmd */ +#define MAX_TEMPSTR 1000 /* max length of envvar=value\0 strings */ +#define MAX_ENVSTR MAX_TEMPSTR /* DO NOT change - buffer overruns otherwise */ +#define MAX_UNAME 20 /* max length of username, should be overkill */ +#define ROOT_UID 0 /* don't change this, it really must be root */ +#define ROOT_USER "root" /* ditto */ + +/* + * NOTE: these correspond to DebugFlagNames, defined below. + */ +#define DEXT 0x0001 /* extend flag for other debug masks */ +#define DSCH 0x0002 /* scheduling debug mask */ +#define DPROC 0x0004 /* process control debug mask */ +#define DPARS 0x0008 /* parsing debug mask */ +#define DLOAD 0x0010 /* database loading debug mask */ +#define DMISC 0x0020 /* misc debug mask */ +#define DTEST 0x0040 /* test mode: don't execute any commands */ +#define DBIT 0x0080 /* bit twiddling shown (long) */ + +#define CRON_TAB(u) "%s/%s", SPOOL_DIR, u +#define REG register +#define PPC_NULL ((char **)NULL) + +#ifndef MAXHOSTNAMELEN +#define MAXHOSTNAMELEN 64 +#endif + +#define Skip_Blanks(c, f) \ + while (c == '\t' || c == ' ') \ + c = get_char(f); + +#define Skip_Nonblanks(c, f) \ + while (c!='\t' && c!=' ' && c!='\n' && c != EOF && c != '\0') \ + c = get_char(f); + +#define Skip_Line(c, f) \ + do {c = get_char(f);} while (c != '\n' && c != EOF); + +#if DEBUGGING +# define Debug(mask, message) \ + if ( (DebugFlags & (mask) ) ) \ + printf message; +#else /* !DEBUGGING */ +# define Debug(mask, message) \ + ; +#endif /* DEBUGGING */ + +#define MkLower(ch) (isupper(ch) ? tolower(ch) : ch) +#define MkUpper(ch) (islower(ch) ? toupper(ch) : ch) +#define Set_LineNum(ln) {Debug(DPARS|DEXT,("linenum=%d\n",ln)); \ + LineNumber = ln; \ + } + +typedef int time_min; + +/* Log levels */ +#define CRON_LOG_JOBSTART 0x01 +#define CRON_LOG_JOBEND 0x02 +#define CRON_LOG_JOBFAILED 0x04 +#define CRON_LOG_JOBPID 0x08 + +#define SECONDS_PER_MINUTE 60 + +#define FIRST_MINUTE 0 +#define LAST_MINUTE 59 +#define MINUTE_COUNT (LAST_MINUTE - FIRST_MINUTE + 1) + +#define FIRST_HOUR 0 +#define LAST_HOUR 23 +#define HOUR_COUNT (LAST_HOUR - FIRST_HOUR + 1) + +#define FIRST_DOM 1 +#define LAST_DOM 31 +#define DOM_COUNT (LAST_DOM - FIRST_DOM + 1) + +#define FIRST_MONTH 1 +#define LAST_MONTH 12 +#define MONTH_COUNT (LAST_MONTH - FIRST_MONTH + 1) + +/* note on DOW: 0 and 7 are both Sunday, for compatibility reasons. */ +#define FIRST_DOW 0 +#define LAST_DOW 7 +#define DOW_COUNT (LAST_DOW - FIRST_DOW + 1) + +/* + * each user's crontab will be held as a list of + * the following structure. + * + * These are the cron commands. + */ +typedef struct _entry { + struct _entry *next; + uid_t uid; + gid_t gid; + char **envp; + int secondsInterval; + bitstr_t bit_decl(minute, MINUTE_COUNT); + bitstr_t bit_decl(hour, HOUR_COUNT); + bitstr_t bit_decl(dom, DOM_COUNT); + bitstr_t bit_decl(month, MONTH_COUNT); + bitstr_t bit_decl(dow, DOW_COUNT); + int flags; +#define DOM_STAR 0x01 +#define DOW_STAR 0x02 +#define WHEN_REBOOT 0x04 +#define MIN_STAR 0x08 +#define HR_STAR 0x10 +} entry; + +/* + * the crontab database will be a list of the + * following structure, one element per user + * plus one for the system. + * + * These are the crontabs. 
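
The entry struct above stores each schedule field as a fixed-size bitmap built with the bitstring.h macros introduced earlier in this patch. A small illustrative sketch of how such bitmaps are set and queried (the helper functions are hypothetical, not part of the patch):

    /* Hypothetical helpers showing the schedule-bitmap idiom. */
    static void
    schedule_every_quarter_hour(entry *e)
    {
        bit_nclear(e->minute, FIRST_MINUTE, LAST_MINUTE);
        bit_set(e->minute, 0);
        bit_set(e->minute, 15);
        bit_set(e->minute, 30);
        bit_set(e->minute, 45);
    }

    static int
    minute_is_scheduled(entry *e, int minute)
    {
        /* bit_test yields the masked byte: nonzero means the bit is set */
        return bit_test(e->minute, minute) != 0;
    }
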
+ */ +typedef struct _user { + struct _user *next, *prev; /* links */ + char *name; + time_t mtime; /* last modtime of crontab */ + entry *crontab; /* this person's crontab */ +#ifdef WITH_SELINUX + security_context_t scontext; /* SELinux security context */ +#endif +} user; + +typedef struct _cron_db { + user *head, *tail; /* links */ + time_t user_mtime; /* last modtime on spooldir */ + time_t sys_mtime; /* last modtime on system crontab */ +#ifdef DEBIAN + time_t sysd_mtime; /* last modtime on system crondir */ +#endif +} cron_db; + +typedef struct _orphan { + struct _orphan *next; /* link */ + char *uname; + char *fname; + char *tabname; +} orphan; + +/* + * Buffer used to mimic getc(FILE*) and ungetc(FILE*) + */ +#define MAX_FILE_BUFFER_LENGTH 1000 + +typedef struct _file_buffer { + char data[MAX_FILE_BUFFER_LENGTH]; + int length; + int pointer; + char unget_data[MAX_FILE_BUFFER_LENGTH]; + int unget_count; +} file_buffer; + +void unget_char(int, FILE *), + free_entry(entry *), + skip_comments(FILE *); + +int get_char(FILE *), + get_string(char *, int, FILE *, char *); + +entry * parse_cron_entry(char *); + +/* + * in the C tradition, we only create + * variables for the main program, just + * extern them elsewhere. + */ +#ifdef MAIN_PROGRAM +# if !defined(LINT) && !defined(lint) +char *copyright[] = { + "@(#) Copyright 1988,1989,1990,1993,1994 by Paul Vixie", + "@(#) All rights reserved" + }; +# endif + +char *MonthNames[] = { + "Jan", "Feb", "Mar", "Apr", "May", "Jun", + "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", + NULL + }; + +char *DowNames[] = { + "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun", + NULL + }; + +char *ecodes[] = { + "no error", + "bad minute", + "bad hour", + "bad day-of-month", + "bad month", + "bad day-of-week", + "bad command", + "bad time specifier", + "bad username", + "command too long", + NULL + }; + + +char *ProgramName; +int LineNumber; +time_t StartTime; +time_min virtualTime; +time_min clockTime; + +# if DEBUGGING +int DebugFlags; +char *DebugFlagNames[] = { /* sync with #defines */ + "ext", "sch", "proc", "pars", "load", "misc", "test", "bit", + NULL /* NULL must be last element */ + }; +# endif /* DEBUGGING */ +#else /*MAIN_PROGRAM*/ +extern char *copyright[], + *MonthNames[], + *DowNames[], + *ProgramName; +extern int LineNumber; +extern time_t StartTime; +extern time_min virtualTime; +extern time_min clockTime; +# if DEBUGGING +extern int DebugFlags; +extern char *DebugFlagNames[]; +# endif /* DEBUGGING */ +#endif /*MAIN_PROGRAM*/ diff --git a/src/include/task/job_metadata.h b/src/include/task/job_metadata.h new file mode 100644 index 00000000000..a96a116bee5 --- /dev/null +++ b/src/include/task/job_metadata.h @@ -0,0 +1,66 @@ +/*------------------------------------------------------------------------- + * + * job_metadata.h + * definition of job metadata functions + * + * Copyright (c) 2010-2015, Citus Data, Inc.
+ * + *------------------------------------------------------------------------- + */ + +#ifndef JOB_METADATA_H +#define JOB_METADATA_H + +#include "nodes/pg_list.h" +#include "task/cron.h" + +typedef enum +{ + CRON_STATUS_STARTING, + CRON_STATUS_RUNNING, + CRON_STATUS_SENDING, + CRON_STATUS_CONNECTING, + CRON_STATUS_SUCCEEDED, + CRON_STATUS_FAILED +} CronStatus; + +/* job metadata data structure */ +typedef struct CronJob +{ + int64 jobId; + char *scheduleText; + entry schedule; + char *command; + char *nodeName; + int nodePort; + char *database; + char *userName; + bool active; + char *jobName; +} CronJob; + +extern bool CronJobCacheValid; + +/* functions for retrieving job metadata */ +extern void InitializeJobMetadataCache(void); +extern void ResetJobMetadataCache(void); +extern List * LoadCronJobList(void); +extern CronJob * GetCronJob(int64 jobId); + +extern void InsertJobRunDetail(int64 runId, int64 *jobId, char *database, char *username, char *command, char *status); +extern void UpdateJobRunDetail(int64 runId, int32 *job_pid, char *status, char *return_message, TimestampTz *start_time, + TimestampTz *end_time); +extern int64 NextRunId(void); +extern void MarkPendingRunsAsFailed(void); +extern char *GetCronStatus(CronStatus cronstatus); + +extern int64 ScheduleCronJob(text *scheduleText, text *commandText, + text *databaseText, text *usernameText, + bool active, text *jobnameText); + +extern Oid UnscheduleCronJob(const char *jobname, const char *username, Oid jobid, bool missing_ok); + +extern void AlterCronJob(int64 jobId, char *schedule, char *command, + char *database_name, char *username, bool *active); + +#endif /* JOB_METADATA_H */ diff --git a/src/include/task/pg_cron.h b/src/include/task/pg_cron.h new file mode 100644 index 00000000000..24fd88842e9 --- /dev/null +++ b/src/include/task/pg_cron.h @@ -0,0 +1,36 @@ +/*------------------------------------------------------------------------- + * + * pg_cron.h + * definition of pg_cron data types + * + * Copyright (c) 2010-2015, Citus Data, Inc. + * + *------------------------------------------------------------------------- + */ + +#ifndef PG_CRON_H +#define PG_CRON_H + +/* GUC settings */ +extern char *task_host_addr; +extern int max_running_tasks; +extern bool task_enable_superuser_jobs; +extern bool task_log_run; +extern bool task_log_statement; +extern bool task_use_background_worker; +extern char *task_timezone; + +/* Shared memory area for pg cron process */ +typedef struct PgCronData +{ + pid_t cron_pid; /* pid of pg cron process */ +} PgCronData; + +extern void PgCronLauncherMain(Datum arg); +extern bool PgCronStartRule(Datum main_arg); +extern void CronBackgroundWorker(Datum arg); +extern pid_t PgCronLauncherPID(void); +extern Size PgCronLauncherShmemSize(void); +extern void PgCronLauncherShmemInit(void); + +#endif /* PG_CRON_H */ diff --git a/src/include/task/task_states.h b/src/include/task/task_states.h new file mode 100644 index 00000000000..29f8dfcfa19 --- /dev/null +++ b/src/include/task/task_states.h @@ -0,0 +1,67 @@ +/*------------------------------------------------------------------------- + * + * task_states.h + * definition of task state functions + * + * Copyright (c) 2010-2015, Citus Data, Inc. 
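
The job_metadata.h API above is the programmatic face of the new task scheduler (the CREATE/ALTER/DROP TASK commands and pg_task catalog elsewhere in this patch are its callers). A minimal sketch of scheduling a job through it, with hypothetical schedule, command, user, and name values, using the standard cstring_to_text() helper:

    #include "postgres.h"
    #include "utils/builtins.h"      /* cstring_to_text() */
    #include "task/job_metadata.h"

    /* Hypothetical: register an active nightly VACUUM task. */
    static int64
    schedule_nightly_vacuum(void)
    {
        return ScheduleCronJob(cstring_to_text("0 3 * * *"),   /* schedule */
                               cstring_to_text("VACUUM"),      /* command */
                               cstring_to_text("postgres"),    /* database */
                               cstring_to_text("gpadmin"),     /* username */
                               true,                           /* active */
                               cstring_to_text("nightly_vacuum"));  /* job name */
    }
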
+ * + *------------------------------------------------------------------------- + */ + +#ifndef TASK_STATES_H +#define TASK_STATES_H + +#include "libpq-fe.h" +#include "postmaster/bgworker.h" +#include "storage/dsm.h" +#include "storage/shm_mq.h" +#include "task/job_metadata.h" +#include "utils/timestamp.h" + +typedef enum +{ + CRON_TASK_WAITING = 0, + CRON_TASK_START = 1, + CRON_TASK_CONNECTING = 2, + CRON_TASK_SENDING = 3, + CRON_TASK_RUNNING = 4, + CRON_TASK_RECEIVING = 5, + CRON_TASK_DONE = 6, + CRON_TASK_ERROR = 7, + CRON_TASK_BGW_START = 8, + CRON_TASK_BGW_RUNNING = 9 +} CronTaskState; + +struct BackgroundWorkerHandle +{ + int slot; + uint64 generation; +}; + +typedef struct CronTask +{ + int64 jobId; + int64 runId; + CronTaskState state; + uint32 pendingRunCount; + PGconn *connection; + PostgresPollingStatusType pollingStatus; + TimestampTz startDeadline; + TimestampTz lastStartTime; + uint32 secondsInterval; + bool isSocketReady; + bool isActive; + char *errorMessage; + bool freeErrorMessage; + shm_mq_handle *sharedMemoryQueue; + dsm_segment *seg; + BackgroundWorkerHandle handle; +} CronTask; + +extern void InitializeTaskStateHash(void); +extern void RefreshTaskHash(void); +extern List * CurrentTaskList(void); +extern void InitializeCronTask(CronTask *task, int64 jobId); +extern void RemoveTask(int64 jobId); + +#endif /* TASK_STATES_H */ diff --git a/src/include/tcop/cmdtaglist.h b/src/include/tcop/cmdtaglist.h index 3fae75a1cc4..3cb929e3f2e 100644 --- a/src/include/tcop/cmdtaglist.h +++ b/src/include/tcop/cmdtaglist.h @@ -63,6 +63,7 @@ PG_CMDTAG(CMDTAG_ALTER_SUBSCRIPTION, "ALTER SUBSCRIPTION", true, false, false) PG_CMDTAG(CMDTAG_ALTER_SYSTEM, "ALTER SYSTEM", false, false, false) PG_CMDTAG(CMDTAG_ALTER_TABLE, "ALTER TABLE", true, true, false) PG_CMDTAG(CMDTAG_ALTER_TABLESPACE, "ALTER TABLESPACE", false, false, false) +PG_CMDTAG(CMDTAG_ALTER_TASK, "ALTER TASK", true, false, false) PG_CMDTAG(CMDTAG_ALTER_TEXT_SEARCH_CONFIGURATION, "ALTER TEXT SEARCH CONFIGURATION", true, false, false) PG_CMDTAG(CMDTAG_ALTER_TEXT_SEARCH_DICTIONARY, "ALTER TEXT SEARCH DICTIONARY", true, false, false) PG_CMDTAG(CMDTAG_ALTER_TEXT_SEARCH_PARSER, "ALTER TEXT SEARCH PARSER", true, false, false) @@ -130,6 +131,7 @@ PG_CMDTAG(CMDTAG_CREATE_TEXT_SEARCH_TEMPLATE, "CREATE TEXT SEARCH TEMPLATE", tru PG_CMDTAG(CMDTAG_CREATE_TRANSFORM, "CREATE TRANSFORM", true, false, false) PG_CMDTAG(CMDTAG_CREATE_TRIGGER, "CREATE TRIGGER", true, false, false) PG_CMDTAG(CMDTAG_CREATE_TYPE, "CREATE TYPE", true, false, false) +PG_CMDTAG(CMDTAG_CREATE_TASK, "CREATE TASK", true, false, false) PG_CMDTAG(CMDTAG_CREATE_USER_MAPPING, "CREATE USER MAPPING", true, false, false) PG_CMDTAG(CMDTAG_CREATE_VIEW, "CREATE VIEW", true, false, false) PG_CMDTAG(CMDTAG_DEALLOCATE, "DEALLOCATE", false, false, false) @@ -188,6 +190,7 @@ PG_CMDTAG(CMDTAG_DROP_STATISTICS, "DROP STATISTICS", true, false, false) PG_CMDTAG(CMDTAG_DROP_SUBSCRIPTION, "DROP SUBSCRIPTION", true, false, false) PG_CMDTAG(CMDTAG_DROP_TABLE, "DROP TABLE", true, false, false) PG_CMDTAG(CMDTAG_DROP_TABLESPACE, "DROP TABLESPACE", false, false, false) +PG_CMDTAG(CMDTAG_DROP_TASK, "DROP TASK", true, false, false) PG_CMDTAG(CMDTAG_DROP_TEXT_SEARCH_CONFIGURATION, "DROP TEXT SEARCH CONFIGURATION", true, false, false) PG_CMDTAG(CMDTAG_DROP_TEXT_SEARCH_DICTIONARY, "DROP TEXT SEARCH DICTIONARY", true, false, false) PG_CMDTAG(CMDTAG_DROP_TEXT_SEARCH_PARSER, "DROP TEXT SEARCH PARSER", true, false, false) diff --git a/src/include/utils/datumstream.h b/src/include/utils/datumstream.h index 
14ba145c8d4..541e2fff79e 100644 --- a/src/include/utils/datumstream.h +++ b/src/include/utils/datumstream.h @@ -261,7 +261,8 @@ extern DatumStreamWrite *create_datumstreamwrite( Form_pg_attribute attr, char *relname, char *title, - bool needsWAL); + bool needsWAL, + RelFileNodeBackend *rnode); extern DatumStreamRead *create_datumstreamread( char *compName, @@ -271,7 +272,8 @@ extern DatumStreamRead *create_datumstreamread( int32 maxsz, Form_pg_attribute attr, char *relname, - char *title); + char *title, + RelFileNode *relFileNode); extern void datumstreamwrite_open_file( DatumStreamWrite * ds, diff --git a/src/include/utils/datumstreamblock.h b/src/include/utils/datumstreamblock.h index 6fe5d811a0b..68bb043cc99 100755 --- a/src/include/utils/datumstreamblock.h +++ b/src/include/utils/datumstreamblock.h @@ -16,6 +16,7 @@ #define DATUMSTREAMBLOCK_H #include "catalog/pg_attribute.h" +#include "storage/relfilenode.h" #include "utils/guc.h" typedef enum DatumStreamVersion @@ -75,7 +76,7 @@ typedef struct DatumStreamBlock_Orig int16 version; /* version number */ int16 flags; /* some flags */ int16 ndatum; /* number of datum, including null */ - int16 unused; /* unused */ + int16 encrypted; /* has been encrypted */ int32 nullsz; /* size nullbitmaps */ int32 sz; /* logical data size, not including header, * nullbitmap, and padding */ @@ -203,6 +204,7 @@ enum DSB_HAS_NULLBITMAP = 0x1, DSB_HAS_RLE_COMPRESSION = 0x2, DSB_HAS_DELTA_COMPRESSION = 0x4, + DSB_HAS_ENCRYPTION = 0x8, }; typedef struct DatumStreamBitMapWrite @@ -2069,7 +2071,8 @@ extern void DatumStreamBlockRead_GetReadyOrig( int64 firstRowNum, int32 rowCount, bool *hadToAdjustRowCount, - int32 * adjustedRowCount); + int32 * adjustedRowCount, + RelFileNode *node); extern void DatumStreamBlockRead_GetReadyDense( DatumStreamBlockRead * dsr, uint8 * buffer, @@ -2077,7 +2080,8 @@ extern void DatumStreamBlockRead_GetReadyDense( int64 firstRowNum, int32 rowCount, bool *hadToAdjustRowCount, - int32 * adjustedRowCount); + int32 * adjustedRowCount, + RelFileNode *node); inline static void DatumStreamBlockRead_GetReady( @@ -2087,7 +2091,8 @@ DatumStreamBlockRead_GetReady( int64 firstRowNum, int32 rowCount, bool *hadToAdjustRowCount, - int32 * adjustedRowCount) + int32 * adjustedRowCount, + RelFileNode *node) { if (dsr->datumStreamVersion == DatumStreamVersion_Original) { @@ -2098,7 +2103,8 @@ DatumStreamBlockRead_GetReady( firstRowNum, rowCount, hadToAdjustRowCount, - adjustedRowCount); + adjustedRowCount, + node); } else { @@ -2111,7 +2117,8 @@ DatumStreamBlockRead_GetReady( firstRowNum, rowCount, hadToAdjustRowCount, - adjustedRowCount); + adjustedRowCount, + node); } } @@ -2158,7 +2165,8 @@ extern void DatumStreamBlockWrite_Init( int (*errdetailCallback) (void *errdetailArg), void *errdetailArg, int (*errcontextCallback) (void *errcontextArg), - void *errcontextArg); + void *errcontextArg, + RelFileNode *relFileNode); extern void DatumStreamBlockWrite_Finish( DatumStreamBlockWrite * dsw); @@ -2172,6 +2180,7 @@ extern void DatumStreamBlockWrite_GetReady( DatumStreamBlockWrite * dsw); extern int64 DatumStreamBlockWrite_Block( DatumStreamBlockWrite * dsw, - uint8 * buffer); + uint8 * buffer, + RelFileNode *node); #endif /* DATUMSTREAMBLOCK_H */ diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 8df618adfd7..080bdbc1184 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -295,6 +295,16 @@ extern bool gp_local_distributed_cache_stats; extern bool gp_appendonly_verify_block_checksums; extern bool 
gp_appendonly_verify_write_block; extern bool gp_appendonly_compaction; +extern bool enable_parallel; +extern int gp_appendonly_insert_files; +extern int gp_appendonly_insert_files_tuples_range; +/* + * gp_enable_multiphase_limit is not cost based. + * When set to false, the planner will not use multi-phase limit. + * Used to debug and create test cases. + */ +extern bool gp_enable_multiphase_limit; + /* * Threshold of the ratio of dirty data in a segment file @@ -485,6 +495,7 @@ extern bool optimizer_enable_indexjoin; extern bool optimizer_enable_motions_masteronly_queries; extern bool optimizer_enable_motions; extern bool optimizer_enable_motion_broadcast; +extern bool parallel_hash_enable_motion_broadcast; extern bool optimizer_enable_motion_gather; extern bool optimizer_enable_motion_redistribute; extern bool optimizer_enable_sort; diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h index 7df12d1f587..ca0085657da 100644 --- a/src/include/utils/guc_tables.h +++ b/src/include/utils/guc_tables.h @@ -96,6 +96,7 @@ enum config_group STATS_ANALYZE, /*CDB*/ STATS_MONITORING, STATS_COLLECTOR, + ENCRYPTION, AUTOVACUUM, CLIENT_CONN_STATEMENT, @@ -119,6 +120,8 @@ enum config_group CUSTOM_OPTIONS, DEVELOPER_OPTIONS, + TASK_SCHEDULE_OPTIONS, + /* * GPDB: deprecated GUCs. In this group, the GUCs are still functioning, * but we don't recommend customers to use them. They may be defunct in diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h index 195f11e6494..6b959fa04fa 100644 --- a/src/include/utils/snapmgr.h +++ b/src/include/utils/snapmgr.h @@ -187,4 +187,5 @@ extern void SerializeSnapshot(Snapshot snapshot, char *start_address); extern Snapshot RestoreSnapshot(char *start_address); extern void RestoreTransactionSnapshot(Snapshot snapshot, void *source_pgproc); +extern Size EstimateSnapshotDataSpace(void); #endif /* SNAPMGR_H */ diff --git a/src/include/utils/sync_guc_name.h b/src/include/utils/sync_guc_name.h index 26a4203570f..2f93f8d9579 100644 --- a/src/include/utils/sync_guc_name.h +++ b/src/include/utils/sync_guc_name.h @@ -10,6 +10,7 @@ "default_tablespace", "default_toast_compression", "dml_ignore_target_partition_check", + "enable_parallel", "execute_pruned_plan", "explain_memory_verbosity", "force_parallel_mode", @@ -138,3 +139,5 @@ "wal_debug", "work_mem", "gp_resgroup_debug_wait_queue", + "gp_appendonly_insert_files", + "gp_appendonly_insert_files_tuples_range", diff --git a/src/include/utils/syscache.h b/src/include/utils/syscache.h index 6abab3f9e2e..c2ba8a84b2f 100644 --- a/src/include/utils/syscache.h +++ b/src/include/utils/syscache.h @@ -66,6 +66,7 @@ enum SysCacheIdentifier FOREIGNSERVEROID, FOREIGNTABLEREL, GPPOLICYID, + AORELID, INDEXRELID, LANGNAME, LANGOID, diff --git a/src/include/utils/typcache.h b/src/include/utils/typcache.h index 5436ce464d6..4b23a69168a 100644 --- a/src/include/utils/typcache.h +++ b/src/include/utils/typcache.h @@ -203,6 +203,8 @@ extern int compare_values_of_enum(TypeCacheEntry *tcache, Oid arg1, Oid arg2); extern size_t SharedRecordTypmodRegistryEstimate(void); +extern uint32 GetSharedNextRecordTypmod(SharedRecordTypmodRegistry* registry); + extern void SharedRecordTypmodRegistryInit(SharedRecordTypmodRegistry *, dsm_segment *segment, dsa_area *area); diff --git a/src/include/utils/unsync_guc_name.h b/src/include/utils/unsync_guc_name.h index fdabb683fd4..e3c9cab4397 100644 --- a/src/include/utils/unsync_guc_name.h +++ b/src/include/utils/unsync_guc_name.h @@ -196,6 +196,7 @@ 
"gp_enable_minmax_optimization", "gp_enable_motion_deadlock_sanity", "gp_enable_multiphase_agg", + "gp_enable_multiphase_limit", "gp_enable_predicate_propagation", "gp_enable_predicate_pushdown", "gp_enable_preunique", @@ -221,6 +222,7 @@ "gp_etcd_endpoints", "gp_cbdb_deploy", #endif + "gp_force_random_redistribution", "gp_gang_creation_retry_count", "gp_gang_creation_retry_timer", "gp_global_deadlock_detector_period", @@ -358,6 +360,7 @@ "max_replication_slots", "max_resource_portals_per_transaction", "max_resource_queues", + "max_running_tasks", "max_slot_wal_keep_size", "max_stack_depth", "max_standby_archive_delay", @@ -411,6 +414,7 @@ "optimizer_enable_materialize", "optimizer_enable_mergejoin", "optimizer_enable_motion_broadcast", + "parallel_hash_enable_motion_broadcast", "optimizer_enable_motion_gather", "optimizer_enable_motion_redistribute", "optimizer_enable_motions", @@ -548,6 +552,12 @@ "syslog_ident", "syslog_sequence_numbers", "syslog_split_messages", + "task_enable_superuser_jobs", + "task_host_addr", + "task_log_run", + "task_log_statement", + "task_use_background_worker", + "task_timezone", "tcp_keepalives_count", "tcp_keepalives_idle", "tcp_keepalives_interval", @@ -616,3 +626,6 @@ "xmlbinary", "xmloption", "zero_damaged_pages", + "cluster_key_command", + "file_encryption_method", + "tde_force_switch", diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h index 661a2037144..49592bbbc41 100644 --- a/src/include/utils/wait_event.h +++ b/src/include/utils/wait_event.h @@ -192,6 +192,9 @@ typedef enum WAIT_EVENT_DATA_FILE_TRUNCATE, WAIT_EVENT_DATA_FILE_WRITE, WAIT_EVENT_DSM_FILL_ZERO_WRITE, + WAIT_EVENT_KEY_FILE_READ, + WAIT_EVENT_KEY_FILE_WRITE, + WAIT_EVENT_KEY_FILE_SYNC, WAIT_EVENT_LOCK_FILE_ADDTODATADIR_READ, WAIT_EVENT_LOCK_FILE_ADDTODATADIR_SYNC, WAIT_EVENT_LOCK_FILE_ADDTODATADIR_WRITE, diff --git a/src/test/Makefile b/src/test/Makefile index 114c9b7d85d..256c5fe23d9 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -21,6 +21,9 @@ SUBDIRS += fsync walrep heap_checksum isolation2 fdw # Test suites that are not safe by default but can be run if selected # by the user via the whitespace-separated list in variable # PG_TEST_EXTRA: +ifeq ($(with_openssl),yes) +SUBDIRS += crypto +endif ifeq ($(with_gssapi),yes) ifneq (,$(filter kerberos,$(PG_TEST_EXTRA))) SUBDIRS += kerberos @@ -53,4 +56,10 @@ $(call recurse,$(recurse_alldirs_targets)) $(call recurse,installcheck, $(installable_dirs)) $(call recurse,install, $(installable_dirs)) +# New target installcheck-cbdb-parallel +check_dirs := regress isolation2 + +installcheck-cbdb-parallel: install + $(call recurse,installcheck-cbdb-parallel,$(check_dirs)) + $(recurse_always) diff --git a/src/test/README b/src/test/README index afdc7676519..0955a764e3b 100644 --- a/src/test/README +++ b/src/test/README @@ -11,6 +11,9 @@ which tests get run automatically. 
authentication/ Tests for authentication (but see also below) +crypto/ + Tests for cluster file encryption + examples/ Demonstration programs for libpq that double as regression tests via "make check" diff --git a/src/test/crypto/.gitignore b/src/test/crypto/.gitignore new file mode 100644 index 00000000000..7b92218ad20 --- /dev/null +++ b/src/test/crypto/.gitignore @@ -0,0 +1,4 @@ +# Generated by test suite +/tmp_check/ +/log/ +/testcrypto diff --git a/src/test/crypto/KWP_AD_128.txt b/src/test/crypto/KWP_AD_128.txt new file mode 100644 index 00000000000..d77aeab719f --- /dev/null +++ b/src/test/crypto/KWP_AD_128.txt @@ -0,0 +1,35 @@ +# CAVS 21.4 +# 'NIST SP 800-38F KWP-AD with AES-128 cipher function' information for test-files +# Seed = 0f0c6b6602d959d22ff2305478ea477a85d196d9695abf5b445010b45ce046c501bc04e6ca4faf46137adf6eac452d97314cd91137e33fc30cc6df316117fed3 +# Generated on Fri Apr 6 14:46:45 2018 +[PLAINTEXT LENGTH = 4096] + +COUNT = 0 +K = 1dd51f0d3a0a784174ba81b2c9f89005 +C = e1bde6d2df3b8e48ca127f97b56b5dc2672b3736cc3157c7b80a0316ef1efbdbbce19fea23da831836ccd2e002b2c1dfad206b5cec358446b8434d7f4c39e65b0e0b50897642ffc34bfb3cb3e233aa9c1058ff0d4fd48e98bc8cc3d214c06d514dd97db2278093a308f91f4ae92626d85771fb1447b36a3467fff02ac7e81ddbd0fdbcd02d1acd4f053c989ef3dcc2c01e23bc2f6090f3e8c0ba5f0082341200b1c37b99daa9cb6fec78bce3429aec5badb9fd28fdbdbdc5d53570675a9e39535b4594095658ef950ecd79a162223b60d2eb91765e022dc6e1bbdd86f1bcc280ed9df350da08a801fa16a1bf2701947acfb08f19fdfcaa1d76f466a5de2458a78fb82f6af3e1be68f405a4289f25896f4c9830005c9e895c86e67eceab0ad544856071b8d9585835b5e85a07ab01515f7ab54f98dffb4ca49a15068eefc6a01f7f52fd1adbe3631c59f6f43f79d2b4f2a691e2b30bb1d43a848dc3ee39c7f2e50f0c9deb7ab51e33bf40903ac255bb1510fd61676a6c13c3c776b8aacc6cefb95e24973ebb11192e2692dd0c6a085b58f86e11cc28ee2194988c123e3666da7339c0a4ac6afbacc83f1f100fbb39efff7cc605c9213828224a17c476395aeb9bb0a3150fb8889a8c2a494c8c526203f261642bfa69a94b86de9e6d3d932fe20fffe4bd76d502c0d437a3e1d0d8727b7a8dc0e361967109e93566326b6c517663731c4c9bdd0295d8 +P = 1a4eed4bf5b8d2e2a58f1f1277f164cc32cdadaed848f76fe634034082ff9aa1711870bf3936d01a2aa48de30de5143b9148cf56f4490f9d480dda0b672e8e17a012cd26cec3c68837bd5b2f9beb13e0110f21c6c36343e09e027f39557d1596d4ca406e3e7aa113e9bb8623106bae25f0ea23d46bc29970ba2596f83fe4f73a6f978a4d949fa7c271570a2ae5d2b50792d5ab5c43d455f359fb83c35ca3da37cd73cd66b6adce94d78ecdeabf667daa47ea70799af299e1d898ccf3fca6c42c6fff8cf2ec992f596fed4a0cdb502a00f9b5689302931d15cba691e2f8079a0411332438b714ace5234b91e4aebee8f8dda0e1968c2016fed350430a65d8d206c9436f40b79ce03083b8dc207d6960be1ce97007ed22a388ebb7b3d8f7d2b7d9f8f49731fbcb21e21db0cdd15674c795d5af2b2cd727f83e634e8c47157ed0c6873a5c9419e683f16f4a7827b444967812f9d1adb9201b89a0e66bbcf0591465f5d7036a21cdda0e10099feb819dfc37fdd3105120044dab716882d3971f312e3f4459006fd5a1eab08ff63edf6718f47ddaa37f7f40c9c372995f3aec97bc45e287b64fc8cf5559ab04a4d4d3ed482f5d61d3abd99cc87ee406da3ab9c9cd22ba3b8d191b26754aa94a2412f39e332d77fe72210adb0cbb5c96adebdbde036f1f1aaafad74a7ac2594f81efa734054e2e16dc931d49b970b81756862705fcd4 + +COUNT = 1 +K = b3fa008b5947ce58dfbd354dd01f2d43 +C = 
55cd8e45138f477ce0a84f07bd28a93d7d628bb4860207a2f6dc4256bd79843e32c856a4fa831d1603699d49e6c36291b60aa80635900cc6c78cf0a2ddc457beb41782de0de03f08a064df90b41f2e98ce61185d735380403fe56b68f8343a801a14afb8a7ba79684dc2a585110da83e9a836cae1fd9e1a220dd6dc922b4f02b15ca88d43ab61e1da24a9b3cb99c4e5024ce5667f4841ca2a305b1f4c1ae9fb63d1d4dcb83870755a1a646b16c088e612d82ba2bf0e7e2fa0e8035c3baeb595f1ac9bb49b01f6f71392e217c049c0e9bd794b9aa2383cf59ee0a90f965610c65ecd629a17cba2bdf2458e3a8e1a9d219cb66eb9ec8e5226b34f95003064952523920a0b4e94ec8ecd1bdca8a65fe46ed25fd4d076e46fa62a8cde6eabc593045d17cef996ebbeca4b537f65c4f683a10baeb4c42b9867bbb49ca7ea1c5437bc114948c542cffced9bb1ebe3c946eb24ff55be89be004596ba648b264167217d267b881020b905f508e4f0e1a58eca051d56ff30d91891838c574c3de54e3feafcdf514740ddc94ba92cb85fe86033e67f14d90be7a0222e4bd1624cea8894df66a36a8e848dfe9168d8024b7ba5636afbcf6b945a53e6b2778f229af7dc2e59bebbf8bdbdfde1e21465f6b6344b13afa0e5ceac212b3b88932f21b1ae04268476597c92e64ff7c14b9ef678f10a35b56cd70ba03063f94aed97b0a6cf883d1f07facfa37b6e5b070 +P = a067ab39cede4ac6c6cb7630cba48c52a794ac8ebec037125bcd97d1a3c52a8ed64764899f9035a6944d0605a5d977172a55bbf86cd81aef5d6bafb1ac86bfa65da2b3c39bf5da94a98f7b6dbc5df16a7b38061e0665ad16b20fb6aedc9ce7f6d3497c3c55cea92e6343f21251092ef2ea307b35f999683298098bedaea847d1ccbf8bda18dc477e8d49fee4e357273396ad2245703485b97b5a7d97057bad875a3e76b67ad5adbc6ef3b8ba9a1786aa93149f0f8dd166535acbf93f1b9839754d537da3fae1ab02973427c3f353fe9aa6c5a100bf0e6ccb08dc1fdb0fc363a95c77c5758d440db0a70f0340a4c488de51e1ecb932ce2fcb2c95ea28c9f55695d97ba1765c8f11e523ae3e4e1efceb69000a192c047ab197f4840c664c035064ecc12926fd3bca0527a160b5b5a2bbaf5db11437f2c38a1c7535e87f552b9f04f2fdd309a826e4ec7708217022fb075cdfc6cc23e9301e33068caa69ef746f357b09ccc098443a3a2979a225e70be1e722e8d6fbb57d0dded2456c1d47eeb0af2241f769836026fec8fc51d97c4abbe9710a4aa5b95aaac83bee57e1333fa244ccc971b6260a9be16e31cc2fd283fec1b247a7340d149fe5309acb47c9cdb955b7bcc4df277eaf611e8af281ff0bcd64b4534309282d1b5cb14efa93141869d67ce7e418f06bb4c2feebcb7a1151aea2eb8bc2fc4dcee53de9b2fb1803490caf + +COUNT = 2 +K = 4b4c43c9de4fb4a2a7a7adafeabe2dbd +C = 6e4d08b8124f7d3e23303fac1a842014f95e3d71c438f8f1990307842796dc5e404ad81802e35c183fe000390a12c81ee684c5cf26c1d90e414cfffe6931b0f352936fcf0b31429eb5c7612cc359a15371390e518cf5c6a6bff1bb0348d14e2c39b98c9f30672ed2af1d96296df8b5567db25b9510a2083461810e119735490058ed1b46b7fdfa885041d8749f90a072b43ba49f2f51fbcda0dbf3cf99fca1d8f46330e5f6fe079d6679cfa26214c8831b782aaa023a2e0ea91050d277dab876aa6865f2bb3fc1a4a77db52f6179d5e5325993280948b6b7002b572829641d35ed3d735d8423e5b24673c4570ca25064fc2c2ad4840632536bcfaf2a7a814f3eaed92b4d501bc51c1719a0d8d8f420b66db845682bb41c88038cfedf13417143a3a701b521a9bf0bb639875a728c3b5ce6ca7e7a45bc75285c193902e6b5e7a4c6e720493d3937bf485e587bff894f70fd6165a1d0129cc673a992e0a4f5489d228a066b1df60002ec0521924f8d672cd1452fec927e58e75807b2a390256f920743fa4d0fc8f59f2469a595ef65095ca0c80adfc843e9e69b6d4a3f824af47b2bfbf2a7a6c1b650378f096f6f0bfabc752c8f279d4f45d56d09dce97962c119de3a64d83b93ea55066f24d4238a229ae86e6a7857af1d8aba823370a72fe358046049a84a70213ef31d9e77a722def8e21480e79b71299438070946bd459a7251707446c911e381 +FAIL + +COUNT = 3 +K = 96ab719a3d08df2393ebc330e151dab1 +C = 
d50ae797f6c3418f388a7513d693c6dd665e858767531fbccd3eb1aabe796690ec8fbb757d88b169adf5c136de50ff0f2cfdd8389f812382578aee0b0b61e13c6a2bc500640fe1585f068eee0d1fa3420220e23090e24e3248fe16f4e0c7c0e996a21b4947ddd08fd3ccc1f036651be4f48ee1ffb486cdc05911244480548221d8da1f2bc37dece080e51b2cdd1ddebf37213a4dfa1b252e567243d9cec8c89eb8db544e7c389a2e13f1b91d860df3cbcec3e85c93276c2a9a5fa080efc85e9bad3bfe2d9bb06498dd8b3720456bfabd3c69b345f6954872baa1d43b9f7ceb92ae9ad77b270d0b94c79275a48874dafb136105f5553529687b6aeeaa521790b9376c9f88ace94049235cd52c4387ad210442513dd5e07171519d58b1294fb8ac1f60ac68b8f07b418e1bb0598601ec38b9a9b137dd87d0c8a41089d17ca1c720fd0e7e3b81b85a373753bed0f5e29586f84cb29e1d88c379c965c50f6a803ddfac2e1555beb9c208a3821f53bead8f120f4ef4a1490b730a0b8a2f1869c6b985520d709bdc0e5fce44316b8aa2448a2743761bf77bdbbfdab6a721a8ec79f38f7e7321a80a2cd3a35a912eaac5eace85c4cad3c6685b88be4517cd1c20971b85bd9e8eb6e52869e014831dff7585a163f5a4dbf1d59160104da90a9cfcc8d6a0324942b40fde4319a32442d83ebbf5d7a36e9495be2ffd0e7faec1b66c96f71843750b8a051b7170 +P = 3a3b9e6de537458875e59204ef7565b6dde796e5ab11c83f7a361b8143f0f7a7eadb5b53c6efa6d199f759cad5c029004024eabbaff717bafb95646dc31a8f6063b9f8faaea650dfa8803bfa0c79091f299a55f78611c2e0d015021d6c6d3abf3d85cac306740acc144201516b787421a77c78a566c6eadc88ecdeea4ff861b6db73f7b00f0a8f62faedefa58866fb368424d7267afdf5ff1279916d2f177408d780697e1c45e58a524bb0365858d2b5a42ee2bd9e8904134d04cf071e84db8a31804aa8bebc0b28dd621360385117764178fe74b29da3ac390ac4812fdc7eedf91fce6eaae3d03163435001ce42f55982daeda5cec5deb960b35df231463cbc26267746be628c53b55f4f21ef003816eb7bfc6c710efa03d0994a1b3c8595fc9293a2c101483798034d4ee7e3d5e07bbd897c9de4b8315e53cbd1f81bdecbd59d093c844a0ed1e3e9d238707a7b893ca453745223c67756d9062152b239ceec44c436e0896a59ea9ea8cf79a93b8b759389bb5e73c5f5330e26580d9777817400166d826008be5e8c7184ae2ecf8fb9dba92af3c747c74e1534c05395f9204b5e8481fdcf4dab5ea6224a8e0ee52576d467d930c0899d31a4e288e3eecb8cb7a3be3a66c79ae93033de5d0d422a6d54ab002d1a82f3f60db97834d9fa3782dd64cbec8ddcac2216a393dc263cea2705fd072ec82dfa1ddef9c588c49f17c275 + +COUNT = 4 +K = d8c221e426109cb5911d7d6f0836f4cd +C = d853d57eaacac8096346564eccd33281ee864fb290ce91eb717fa153ca00064e033635178c59860a567215b7320fab4a72ccf716501dbc9a44d5b3d501729674987d2413cad79dece055a9b0d47ec980331f4a236b31984f5d62f9d7f58c0f3afb81fe60f266652da65d06874334be065f56096e98536bd1f2120313b0905ffe2f2c3b6de265ab7800c42be810bed18548c08f9193b02a3981a922b32b618fd9a978439ea382bf2890ad1f30d115b2319276289cf4f7a9917b0c064180e79c8644f9ac880a793b4a8ee424dff32cf2b6ca46f52ff8bd8359ed18ea8aac23e63ae337f5baea9e2ff845a5fdc0b79d5767d47c2a1a536d889f553c52696cbe91ccd2ec671a0644689bdb0f4db7e5d58c854eb539b6b4cd9214e361a216d315b1b124b43c76c703c01d3bc3142f760a399ba4887a6e326a58ecaf56fd49ae128a86cda485eedc3da80b75b171e77cade00c903c1f216eefa845dfaef660fc5ecb6791ed53765683f44da6c4ed8a9ad9e995f7d920cdee8463e79b18e8874b0a2f573b1825f8a480b1ed1245c81f4ec097bf0a0504aab9bdaef27b67d98805a7ec687c8cbcbc92ad3ce58651162a1f57f8af427ea0a111dbd6e3c7f240eb6b2360650a72b9c1c4417b1d541dfcc2a8d6ce3e8c160d8d417e4efbdce809bfe30802696bfd52a0f40be4db9be247dfd867179d82390b55180ebc6ceba0a990e3f6d32eef9dfdb946706371 +P = 
92814e18dbe6e83714c4a82ba3ac3baf682a8054eb36666c9546db040d40b8613fc560d97b05265fb19ade180efeb55dfce2cc5981ca222e66b547b78a42401710535b1181674fdd426cf2b0b55e5b7f0505f11307120d495cfb197a3de00569b3d39f93c27270df4725243d314a026549692b0e2b4079c60a8053f0f36e83aaa3494307b175fd40643c1bc264eed1c00f8b565f2a3aeeb78bd94970bd9267d21f5a9a1b07df09ed44a3bd4255a139a328235b921833f92904a74ec202b0eac65df1caed05dc84e52b06c1ecf0f7914324ac4d828b7de7189705308959be42401948e3bf4bdd50ce24101c67ae745a73a67d7e366b6b432ce67b05cefd149a17247010f42dae4de1b2ca42a8e71824cd32c5cb2e2055443ec3ff24339c774dc9207744b84e9203fda1f85595f961987d847ed33867f1ddfce0795e3f2d78c5d749a488a4997392b8c9022c810197c93186894faa55cb0b6775b57a7ba2729c617c9430a44098d5081e3c5c4908ddd1a475cf9211408a8ddbe19ee527ddb2596456e1b1481a09b04e091b1c14b7b2e41bb4434a906736e115cb25ada0950ac5d2845b4a9f1e95f4d80f64440e983324c3aa9f3ec8964f9da0d26aa47e86355aa80ad99d0e573fa9932da70bd65cb1a06d8cb77e455fe7cada4561e027ca1608132c2605b6d0489bba6b29f293951883c451f37bd545f6605364ddc75918df097e + +COUNT = 5 +K = 704eb91dd5ba3d85279cf47c01eca2a5 +C = 51c71fd7778aa3648f3e31e1db0c73cb1479372f2e35f65f00188f08f794993a2ee2bb7e91cd1a2b86e92b8ccca7277207fb525ab17600173fa28844ae27f093e0e5ae00585cc714dac90cbe9b6332cbe4cb689b2cd141c102c6881f5b71ec477c5f4a91f7bdcb5871aadd478f1a9ccc6e069b7283f4d70b26e8748eda6d443ab13804c543a44fe2fb366f90de35d83fbf6354a9a9ab4a93ff7d61cbc0bfe05d6102c9c393273e7d3a04d61eba771f05cee29e5dacb7abf34ec9159e121841e2c39848f604c8f743313cbdca828bfa4635a81136e7a37f230c0d3c814d35c2eaabdd94183312909ab3a09b87cce0c719408f837bf24bfb2dad87630aabc9eab35bdb9cc536198389aceec68e8779f9e1eee84392189823a68195b75bbb6d33addf580564e696a362928e2ac506b79480600bc2f9eaa3e96f323390d1d92cf3c6d4bd4147ada5634cff2bf2d97b259904a335eaf11ec3fc84dcd8e27f7538e0fac1dbe7cb4533f4fa58913535d957b90678fac58aa96694a8047ac774afe488ab429c6807e709351f8159dcfbf83b865aeeb26722ef64a537ce932b2cfa6d53ed6cc1ca8ab58748c06a753515fffc56e294f51ab257585b610d261c6fe12def38a1b5dceaa4681569124c679b20984ed2967740419b342e9010eabd291de026f6e829e4dba5300cb668191358ab58e178c29a0194a639233f9c28c50a609bc42f8fa6cd17bc58eccd +FAIL diff --git a/src/test/crypto/KWP_AD_256.txt b/src/test/crypto/KWP_AD_256.txt new file mode 100644 index 00000000000..6d28f397059 --- /dev/null +++ b/src/test/crypto/KWP_AD_256.txt @@ -0,0 +1,35 @@ +# CAVS 21.4 +# 'NIST SP 800-38F KWP-AD with AES-256 cipher function' information for test-files +# Seed = bf1ba8f321ce0abadb40026686c9d9e7f0c4a55388f2ea7cefc81aeb0e054d40c94f48093f2739580010d6ad6e6ce734f21e7338100b750ec9c7bb06bf46f7f4 +# Generated on Fri Apr 6 14:47:06 2018 +[PLAINTEXT LENGTH = 4096] + +COUNT = 0 +K = 08f5c088acec18e6cf1f03a8f85d772e327e7fb07f8c2939eb554e84c42ab93d +C = 
dff30fd43647d4be54cf2dfd6187e2ddffb55267313f980fb09c833a9c2bfa558a95861711f0acb2a5c7e731ba22f24a9c4dfdd9e9b0216e9088f817a175b9835b0e17615687a20f68c067205626494cd04fbabc0b3eea7c0a4cd6236bc8b3e52e721dfc357fb8a3722bfcc4c690d8f63dbb864bb6e3a15805aea7270f8eb748deebaa2d066fcda11c2e67221f9a91d2c29a6c79ffae76aa80a2590b4f9e35f623fbf2f8ceb2a205493077556a186e25e5bd52dcff7bcc6909b37a66c1d1431be1b363bb40da25386eaaf5fcabc7be6422a04434a21d1d3105328e7c56770b9f59b03395e4138f5f06fc7e6b80dab87b08caa7bfffc45a095c15263efd3f06c651ded6f58074efc20620d704997fc84721a0a8e9e5b9f5cd330bbb156b31d9d1b1c260e4a24535f30404dc5b2dd6b35d916a1391b25a7d8790be09d85483ed1522074a2785812005bda10dd55acb245b3bd3d9bb777dd23f9b02538ba1a114ba53386d7ca4d9524b2f8a18e0ffb21580b560540bb2146f08f04974b90eb324547d56222df95f44bc6e5f183bef283e4816fb1b2933f9c7c6726a245a495e304d8318d0008c51b0be8090f8f668fbc3f31e073be4b9e97468f4dd8c798e9d682868df493db8a85738b58cfd005190f365849072577772672c6f82555c65046eb34e86fe61103327a063bacbbe33cea7eaa3d1de45471b7269e1b6b38608626e323447a3d5fe0599a6 +P = 8b68f66a3d2f59d419851b94d9a6f2f0e667f8125e11d463a6bc2cea46b12dcc40ce8018b204972c735fdd6d2d05b628f4905c6690f5ac5b1b51e12f3af2dc3ae9b9dab616f0a2a66a1ac197592fd5b15900547f32f54110b58d51a0340aa80e9eeb7b2e0eb97e80aa22ba918f2fe1c678c730ed5c3d8d24774f17d8ab6e01a06243d36e764df1dbb8af1faadbc55281f0242abd7a162c984fd0b05ab8b0bcaedffb2962024f009a8d7c9e71281c09f52ec0707ee3bbeb1ecb918be6ae3e9c1fabbcd3512af928db3ba6c109ff9e9839a616b2a53f092160a48222b84d53cd52490515ef93e1ebb33897263492ab8ec6fad2e633276ae367f76d7f926309478c0205d4f22506a451795dc98f5410d8f5d3e049cbedf381620861e7b4ae08f2d8a71abc1f230248cb636a2d7b4e7717ab2b7b5f2dc6e5b5a18e8043254208b50fd6f8929eaf974c48551233661ad67321b64d69245d536d9a8ca2a6a10966dddb9d2ce36641c9281c460ae524b077867258f638e6ac872cb5f5c6fb216b1ae60a9d0c5ea0dbcd060f255da26111175af4e9935df59ddade6a2a70cddff8cae6a98e4f3843c2dd59d09053b07b648a46f5de0eb21ebb192828279a386ea3eedf2cdc355d73d51111e8c1d522e059752bc56226a4225bcab713bfaaaec78167d7cfd33e913b26fda93ca7524aa8a8b17977c88ff9bc23ea810b4de59eac18d1523b + +COUNT = 1 +K = 94c4d5d70f881e58e10e7246cf812d40e2be258adb2b6c13c6603fc7daf7e85a +C = 6c07b5ffd1b9be182413ef8eae4a6eac657108a46008a0d898727f2711e6fa0ca60fd1d51fad683b57d4202fa2b0eb88b856e08b07155439bdb03890cbb7e0f228172bf297a4e0917dadaa5e89a287bb9ba6441c852c5b0cff5084e6c425aaf866815b3fc45f5f7fb5d14b270343e6a30f402e11d62e433a0d84f65684b2df78d4e7758bc0bf81783316905cdc3c1150ec47f225c966f7f339b2538970eb3b8a2c13f95df1310d6e3b2a1f8aed19105846557d8f0018fc0f17146bf836b654dec98e9ad639c7e4b2f922b4396e82c690cdecb65f5e0ea282dd6262f34346ff9adbc8b2f361ddd4356f0feadf7c750fc0580c4e12c00ee049d06eed2242b14727ef4d58386dc8df279a7bf8131c3befaea2f059ab757826e5e381d49a2f11b8cbc2b0021af4da7a779e5df0083edeb54348cc36ce96a19a3d7ff5bd2f19d05fef6b200e76399a02a991111832173353bff4ce1859ff534ae13290dd176ba8e1384ed24d9702dbff127e15e5c66618f94680271732d19f64552ed03df76dc9d46c3cfdd53a1b253992fbcbea6db006f16e8dd92406f0090ad9100856c6b71f7767fcb895136416b374285efe1c6506941911a380e2bf74ffd0f67e853f9ac7b5df6666b177a2908fda9add0eb798f8ccc52801535b2bdf9507f3fb3b46915aa889d62ac5909040a1a28856105dfe2e10d5cfbb569c380551fc8bbe7d83dc87ef7a92faa3fff4b1e2c2 +P = 
85693a16ae69d751cfa6799b95a6396de2eabe7e4da74d734691d992cba353a39f3b9615c1325db5b0563ce1a846bb0f0534a86130ce6657736b9a9b35b0f8d89dd1b3a295131d2f3f57f94deef9606dad76a377d0b24e632b3680e4d3338f3e4484609e8063e9ec621297f55802d7c347e8085ba6e514884b8fc1ae109409c5c3a5bddf4daf034d300e31eccba07a9380f5325666c4a3aa12d60b30ca272fe03534aabe78ba0452a7e4648ebfd4645675629676be6f122a54b6b810cf9cc0c68b7c61470a537a5a664ec24dbb3eb4f9fa8355cc7ae8fef27a0146df5ccc585d8c106a1eeb64ad4c701fd5a54ef18295b07e9e47f7f7dd2f67d38ed776a5f0b28843cc4bb5d7fdbea9cb0088dee849ae232e4e016d8cf3681971e8a45d6b25451538212b91f30e17580a8107a7a95587a06d22d615f5475a5f616fdf2fab79152f2643054d96ba88f50888eb0f2f1f154c6fff53dd44c3613269751dca4fa86f45d6b1af9ad0159685223889529609e7003c8f3cab491fd6c1a020305da8f94ec833d721d9fac7e575c2a1bc26eb4fb5010c35ffbd39b98d857f12584f4ab7de92aa6d7e7148a0120cc6b3f7ae47a291ba1cf55a28d38d3a30dfc3917d663458cf840385ca81cf70acce45a5cd509f8387d450bbdd6fa51830cf9a7387887c620b86809c55a3eb322ca784a51693f1054759804314ae86048f0d9c99650a5a12 + +COUNT = 2 +K = d65338fd3771fd58c07b6b689577378939d439628529b92cd5625edd18afac76 +C = 1c429bf25c144a2cc649fbd60de5c26c31a0c352de99b34b86101c551994f082feffe1db8853de59b3e8593785eca100a71c5392f0c71eca9f411cbd87fc77ea1a96376dc13f6ad460a11e9cd5a829875a7b7dcd2ba4eaac08c5bb48ab5d4c338a6f8bc5e760739edcff2db116b5b1802e35f936d473db168edd12532a992bcc418a759cc9fd3f97f561623078af29d7ab489b7ec564ba981c188f11240dd9354c324f8d0cdf1c74252f0fc75e390e837b8be90a670f5803ee53eb75c3ce95b2853b2342e54f86dd9aeb308eb82ceb2bae7b3e0b364d17105eb61b3843f7206bdb6abb818efe0f0d3b1004e370191e8218cca14947aa8070f7c66fd0422b02ab4a1d94fa46197acd24e272c765667353e819588402feb85d7f00243521d0e7a9d9e70753d8b51d374ea9c8355536594bf05a6960ca7176a4b66086b055b099e315a23e042a7e0807316d7a11a657a6dc9043806e248a9af06570f710af65267d436a5fcb001104fe8a7c564afe075d85bc0a2ce3d33d8d93d5ab1e923f51d4ef26cbb6fd4a935a97cb115aed678e75d5d67fbfcd2362cb3d74ed6b9b9fb0cf82569a474a25e5aa39d22fe5cd301045203d9f93cf5c9e9e9451f1bf3566eec75fbd995cf8c640aa68fb04f5419344057fd1c0e655d750a68c523b0fab24cab03d7393ee3a5735039daed52895dfe7937f55d7ae9a8c0256e9d638a8598452f5329353a20c4bd9958c +P = 58b20979cba48a9dc95a8857f5bce433087ff93470fc62546e86e72dfaaf7b233ffe428802390c1db7cba00b1f23678aace4a16a237b41d26bcd83d471030929a34e8467f85eaef070b9b74a57f13e91b4e95a3c0b8dfea87d026196a10168c152c4ac42718989003b7e688ca43207034b674d3cbab6f57db6513f8883d27f2280c742896a62e7d0f3f20377e98a0688652d270887fdacb86daa086ffda17937e6d20e4a82667f80ac7749a889b0d748e906d653f569b86de2b42b5819ade9c92970d4caeeed8cd5759d56fd38205215bd8401b2a5a000990afe6c9bea8d091171e85ed83f45bb5b9a8d74cae897cc36f1eaf0122693990b1fb57d0025ad6d92c90885accb649368fe237c4cf017787609fb93c9ea5b413847a9fcf2d2ccb6283345a278619abf8dc351682928187bf92551a820939ec73928eb9930c48f7088ed0a367882f4a8b20d754c5f06bc82990da02227923eb8d1cb73c23793ea0d19bed4a9986f0d48d7835733d1ed3396ec3cf15e1854473b05535261251f4f0af8a0743b3298888bec2f7656493d05eb2d9b848e6802845fb9f7835b50d6a0f0e6cfdaf9b1afc6caa6573b3350256e6f23cc4681316705e33eb0a5f664b79be556cb1bbdd0208430cdc95a35f61facbe7ca2a9bd329e4a1fa42aab9bb02f6519a5672346a4cfac1b96a969317480dd995e339af888fc0e43692332d583fec6215d + +COUNT = 3 +K = 1726706350c11e6883955f24ea11ab247ce3b2ab54d05e67ad9770b5564483dd +C = 
b006f26a67d0e1e2cbeb5c23b6b300adc1526d1f17bbe964fe8237ae244878158e6b04cb488786b5258ac973c3a2eafd7fcf3a7ca6c825155659fbc53d112bc78b3a770cf059fdd5e68f2b4bfa36de3721231102e5041c947fba3d906bff39592ec3901a398da23035f1190e99b58659330cc2e856ee87ad4197dcc7d16e1f062275bced1ed5cd82163ae3e58da7368dc2aadac855385bd4fa0b8baadef608d0a5c27172d12b88c70b136eeccf37f36364361a990dc50815743cab1636e661bff04ca8345520c30b935a060b450526b1d6ac09170e5b0a327b88f42327b85c9a621d2ca745963c2815a2bfcf509d50b6058ed6e67f369b5608d2aa885238b67d1b8e0d83f9464aa473bf109350fcc02e360c2619236cbfbf895b607895530d8d3d2e41450750dad05b1c37ef15db7fb4707597ac252e8e58d4c1ab2713b427643d198164c908b5d8ff36e9700157284009c7b283633d8b27b378bb65eff8aa59b5fe5e6437a1d53a99c106c2c4d033d3d23950e313a10eb31d68524ae9f8e4f56437acf66db3e8f77407a15bbff4b393e5559908993146d93c673d2aeb7d4cb8fc8d0169de7ed6e2bbe6ce9958a0f5d201419e7acb17e47da827ba380d6b3ad3b5a8c2101c5fb501110c727169065f23297947f538ab3ec165d61edc1f6a9e1735e9b7fc06d4d3406cf8f9c6a68b196cf262324a986705fbc802cdd2e6b4ebcf68e6bb9e793ae644 +FAIL + +COUNT = 4 +K = 32e57ccfe7563dc0a20c14ee450837a33606c086ce1467fd7ec58467154338ab +C = 977d9c5f6861a69e13cd854299434e348cd0690b4d04e08e0598b47eea621bcd8a22838dc9c35a72c35fb1a6434718d02fd24cb4b3dd90b0430334a938a218467eeb4c373d446a539810bc3ce1e923b7c20d9f58ea931d4f964c79613bce67b268efc44bdb9bb00a68d60037949aec7a399493defb2a466e33d4831efd63ad1cb89e00b530626d2f0165975ddfc4cc5e0f968d3875de0f674b3a517df26480b02b6236ebb377118268cebb30ff1ddf0e280fe1bff61902a017e8decf60753c642f35faf0565303bfe651ec8f0193cf34d4af010c9925b8871f0f8c934a149d874a3b659f78ad148428aacaeab80b1b25dec8b0f7ce54406287bc802ac2c0cca3db4adcaa8400a8636ea339b62f5e94d5e32fd3d1183b374507a2af620ca1346dccc9f83a4fe855b1c0e91db9e7c532828d0944d9a81b553ebdf35e24119ed8164bd0260627ea011e93bc103f208c76498ddb8bca15fd05324da5473157feedd54592aacaee68852968eb54c69eb1ddf607917c57493ea380de0cc6ae304dc49cab80a31b8b456986dc367c70f144e52dd604c8d5edbce5de5efb30d9470bc883445b34fa4414f44bb94a64362a12b546665721fa6db82f0c947f015978412b2ce136c471c98b1f908315a16c83e9318e64508c7e179a4429195a9b1ccc211a1c1d4e4df15c5ebc7ab90926fcf7da03657159e440e93adea31ee35f72f2399f5fe2f8c560c8826e23 +P = 80de48ff805a88f3b359451bb6df61def9cb3551e64fdd3a3a70a3b6d238a69311a85bc5924e395ce92ef394b1e5dc301233e9a212f7fb86272c42ddf5f4857c38d0dd259dc1d663c0d729e033d9b0f7f01ab1f8f1b7192d40921ee0d4696a3e35663c5ffcff5ed167660bf6b4c00e619512a2e827be33c90eecc539e18acc8c76eb332b28b1cc502af571242342f63d155271da3211352128aa0af70c9ce78dfdf084a13049b7bb6f2bd10dd385b412d60bf1ccc9fae1208f39dc53db471a04d0dcc3703b4f7b95e72ea815b64a1499865a7ccc5b740999e76338e1b251c740d75274150a96def8760a08c5a8a6f58273b079c06ee09f79a976eaacc8a04c365bc61a786b496811121c386d274c413a2fbfae9464db6ea775233193395740fc9a5eca1a3820d33f6f7b38a83ccbacaac16479225e108acdf46ca35e573151963721b73b3e1c9a12effff0c3a622eb9f07bef7ae712c96ee3ba245597fb8d511698d6e819a967e0d1868c0c6055333b7c13a98cf63d6a5d87779a95345ca8b7e9e597ec588e96f8fc2a7f0a0b8f1543d9e362a911dfb1f03132a4e6af71b503c41814d6b684a26b8df00cdc657ae129a1f2a18cf4b78a3981de68296b1268609fe3ecb9928b90df4553be37319fc508096fa54b35e4822328569da60a6c660f30c61f02f4c5ab2527cf36cb7da8d7dade4c714ea3fc2da8f65b4199090e114dd + +COUNT = 5 +K = e3982db2032f2b4ce658fc44b76f5964c45cd31bf803708982ae599186fc3765 +C = 
975e49a4b9a770957a1bb2be920a4f39b9cfd69ba46983d2473d631c08132b9bf61c44510b8aa8bd48c70a86276aa1149d8fdefad511d15d2e2037d9e920e640cb71a97663d19eb90d0b74d9764d03e17cda87ebec6e35ac2003cb75bf9192920d910188d78e2e664255fdf6c9190319d34adb858162ff0830f37fe1dd44003d3d5a1f9451949e368f46ad1977ce622daadf8483a1f60359992b9b366e8a81ffbe96cee45d3aef2fd0ad8c17cc34927af77a0d6d0c5deef3b4a25c82ec388667a493bb0599ac492b351246cbad6d283bf1820883afa48bd909eb7304b9fc5b7d960344309133aab7a85c49f7de396926f50bc83c95900cd049eac1b387aae7fcba5345496425f9216e1fd15c20da75fbb26da176149b40a701e15a7bacfe899e3ecc534ab8bc5b7bd081fb825b5f40fa57e363d7bce40020e73f638acfa097b89c50cb9edb0bd6d71d429b8003aa5dcb7d61792eb3bcac795954c625a104209b373c28cf02038c3318916edd2b818e6719ec154cfa56afb2f337d333069f915d0d35edd6c278fae23c4440c40be462a1dcab23758e4a7fbe8436493f58e890092ea71cb8bcf1336e9ee16b852ccc488f21682dc9f02bdf6c56fe8ad04d84a3c69d8d06dee3d126c0a75f142d0c90c256139acd4b719573e588b80b4540024a05a35044cf58d89673923a534c3816492e62379797cd6e6a7464da5eaaf11ee7b9c27b9b03d7b53c03 +P = 43d38ca132545b154995cff07082611cc47a6467a980654d2d1f1ccfb3bcd387e9d7ffa281b0e0b00dc8669207e0d8033e9e36613c98978f8644bb7e505fbf491dcefbe19589254c8abf859dd65cb94dfc99e7b9d3d1f0a31f21285963e1f7b45c7490a522ff887786f7940fb6192f5081ce7181944bdca5c5bfcf2589f9173a682b78fcdf971dd9f4e8529033e15cde560984dcf796914206973dcc13f8c9a24b25dd00c11166ec6ecf33c6ad9b487847abd7bd29b4f3b9c8dc93a6a5a31723dc03245884bfadc12b2fddcc82409d7b14660af808d4e8216157bb6ba03a319193ad4dacbd37ac884550962a4de26ae923f8d74f2f694fcd0aa74f2e809da4689aad9f2820684b3b423ec4a7da0ce4a1b599fc21bd2779653283b0ee81d7b0d9fd3f6d1e75bd71af9620630aa87b73f7b12e68ddbdfa02ae86ae06b0b1aee4a997d34f61b466348b92e36f83652763084a215c47dcf689df17e36b64bae3ca1a2cc22c837b5907236833c2c1e5f3ddb74165fb6f0633990122cbe4af8b5920b1bb6961cdb144ea8d7b245d0128ab76f4fc0189ba97385717e89e0f99c962ee8c2b6e55546a18be0ba3dbebf7e4140eed6aa3558c43115b65b6f6e8e8fb4b9cfbe0b6eac006603667b28cefb4dec037f33568a3c94d9e36539e91b3199d728521a9a6b82b96ff1c29dd1d10366d0510f1b9a9494cd104db2390530be3fb6abdb7 diff --git a/src/test/crypto/KWP_AE_128.txt b/src/test/crypto/KWP_AE_128.txt new file mode 100644 index 00000000000..a682d4bb325 --- /dev/null +++ b/src/test/crypto/KWP_AE_128.txt @@ -0,0 +1,35 @@ +# CAVS 21.4 +# 'NIST SP 800-38F KWP-AE with AES-128 cipher function' information for test-files +# Seed = 7c0f161159cc9e338308ddc59a655450a030832b3d4576c568d63725d57b020073aee9c77d9fec34eab358b83bf5aae4fb52803343bc03d472283edbebbcf75c +# Generated on Fri Apr 6 14:46:12 2018 +[PLAINTEXT LENGTH = 4096] + +COUNT = 0 +K = 0e54956a24c7d4a343f90269fb18a17f +P = 
817ddabdc5d215eee233adff97e92193c6beec52a71340477f70243a794ce954af51e356c9940e4ab198f0e68c543355f65ad179cb2d60dd369eaeb9ed141fb18c9e4054ac7fdc83506896990a4d20833d2d6e9a34938796ee67c9d7d23058544a4a35f2954103ce443a95a7e785602075ca0a73da37899e4568106bb2dbf1f901377d4d3380c70fa5175ebc550481ac6f15986a4407fde5c23ff317e37544c0a25f87117506597db5bb79850c86247b73a5d0090417d63e4c257ea0220c2c04db07a34f0ab7954e1dfa2007a1466795c4d0c2aa09ca3986c028185b43a466526594afc9c891c263a7c608304bc1957c9873f544dc71e6f847c48d32026ed03b2333825452ee7e12a50e1cd7d678319264c65f78001996d37fae7f9861fbd21cb506c2f8a3b0ee53c7debe17111b6e3f78a5c5677857b082c2c4943dfd1edf6337fea98a44fc25928361156ef38d865948b979cf6f4b46bd2119f12f0891cef7fc9d0638fd105fc05f9968d16948d1cb820751e82e44cb68e99d4f072ffd1577da6c0631b5827bec7e1b9ec72d18b74cf5f233e85013c1668ceb5d7a1f5e0f016b0ff726a0a9d41e2cea8e14a2f56492b14606d3fafd8ac141335f39f90d56863735628e8f17be90e100ef0785f3cd57db8b9d89a6b2189dc2ea00c285d2657983f8bd7883c215477e67a55556401f1d8b27d4e0d541c7fb7ace370c2e428884 +C = 876f3e53ba9cf4f6a521ac198bc813d0ede0f862ab6082e3e0a06ad82b4f279582f7c43bb63574608446bc2a05f401a68f74086cf2776b4b3df6b3679c2edfb91c024db54c6831e0752ae6f86c7596462de905ee0be908c1b9d043ecafe2ad1cbddb904e18ebc9b7a107031be3a87059516a3d1257812d9c801b0b9f21539e70c47150c128d87c5e58fa6e4371aedde69c7b5cd16b73ac422676328131f3ac48c602bb6e0741805aad9d23b33b3523b86cf0588cdf9dc6c4d5f9fa43d88ca17976eaf48fb37a41a598266da04144373df5631cc5126341c200a0c8499b29ae96e6e6e6c2bdf8d8903da62bf8ddae970569b695240e77f8ac5b191da5034008b6ef21936858e69bac372bbafd8794f6b03711503c1875528a9348681844edb199a0664d740f0f0b1f866c4248c80fe8b5700a3c4134cdddb17676e0cd37d6d81831a0f4adfba071bb0935502480eccd48b28be5954ea6c7d873b51b8bd2b709c5b6132ed31296510915073c18f7012f0eff6a9aad5340a19fd5e372d35260b718d9e4807b1954c24e6a4fd48e4dbb8f395474e99ab577367d2ab5ccaa18c947331047dc3986e213a878b41089aa221019dad4191a4feefd095f8606c2700a46d71cbb13efb6957df925ec26071c04d04d5a94e138e5fc5d1f059236aad76208077dcc607b1dd2086f9c04e33f955822b457eecd68bd5f24836ecedbac675e6ed93d8a787cb57ad68e + +COUNT = 1 +K = 6b8ba9cc9b31068ba175abfcc60c1338 +P = 8af887c58dfbc38ee0423eefcc0e032dcc79dd116638ca65ad75dca2a2459f13934dbe61a62cb26d8bbddbabf9bf52bbe137ef1d3e30eacf0fe456ec808d6798dc29fe54fa1f784aa3c11cf39405009581d3f1d596843813a6685e503fac8535e0c06ecca8561b6a1f22c578eefb691912be2e1667946101ae8c3501e6c66eb17e14f2608c9ce6fbab4a1597ed49ccb3930b1060f98c97d8dc4ce81e35279c4d30d1bf86c9b919a3ce4f0109e77929e58c4c3aeb5de1ec5e0afa38ae896df9121c72c255141f2f5c9a51be5072547cf8a3b067404e62f9615a02479cf8c202e7feb2e258314e0ebe62878a5c4ecd4e9df7dab2e1fa9a7b532c2169acedb7998d5cd8a7118848ce7ee9fb2f68e28c2b279ddc064db70ad73c6dbe10c5e1c56a709c1407f93a727cce1075103a4009ae2f7731b7d71756eee119b828ef4ed61eff164935532a94fa8fe62dc2e22cf20f168ae65f4b6785286c253f365f29453a479dc2824b8bdabd962da3b76ae9c8a720155e158fe389c8cc7fa6ad522c951b5c236bf964b5b1bfb098a39835759b95404b72b17f7dbcda936177ae059269f41ecdac81a49f5bbfd2e801392a043ef06873550a67fcbc039f0b5d30ce490baa979dbbaf9e53d45d7e2dff26b2f7e6628ded694217a39f454b288e7906b79faf4a407a7d207646f93096a157f0d1dca05a7f92e318fc1ff62ce2de7f129b187053 +C = 
aea19443d7f8ad7d4501c1ecadc6b5e3f1c23c29eca608905f9cabdd46e34a55e1f7ac8308e75c903675982bda99173a2ba57d2ccf2e01a02589f89dfd4b3c7fd229ec91c9d0c46ea5dee3c048cd4611bfeadc9bf26daa1e02cb72e222cf3dab120dd1e8c2dd9bd58bbefa5d14526abd1e8d2170a6ba8283c243ec2fd5ef07030b1ef5f69f9620e4b17a3639341005887b9ffc793533594703e5dcae67bd0ce7a3c98ca65815a4d067f27e6e66d6636cebb789732566a52ac3970e14c37310dc2fcee0e739a16291029fd2b4d534e30445474b26711a8b3e1ee3cc88b09e8b1745b6cc0f067624ecb232db750b01fe5457fdea77b251b10fe95d3eeedb083bdf109c41dba26cc9654f787bf95735ff07070b175cea8b62302e6087b91a0415474605691099f1a9e2b626c4b3bb7aeb8ead9922bc3617cb427c669b88be5f98aea7edb8b0063bec80af4c081f89778d7c7242ddae88e8d3aff1f80e575e1aab4a5d115bc27636fd14d19bc59433f697635ecd870d17e7f5b004dee4001cddc34ab6e377eeb3fb08e9476970765105d93e4558fe3d4fc6fe053aab9c6cf032f1116e70c2d65f7c8cdeb6ad63ac4291f93d467ebbb29ead265c05ac684d20a6bef09b71830f717e08bcb4f9d3773bec928f66eeb64dc451e958e357ebbfef5a342df28707ac4b8e3e8c854e8d691cb92e87c0d57558e44cd754424865c229c9e1abb28e003b6819400b + +COUNT = 2 +K = 5f0f2428ad83ea04642d8f90e31f00ad +P = 05f65580a602fc839c5372778bc495219ab159935acf3caed10958ca0e135cbc44a896aa84b1f6b05d65ac45f5930029a223977ca46c7717809e80d8eb91e7ccbc525e69ca0a85d3e45b2f9bcf0c86980b7f9f9eb20e9b558479aa8c814e6113635a9b8b0e264055da68198303ae91683eb8ed3351dea3b8280a504d163e1cf284834e3641a26868e23fcf460d279d3577115d8c4cf0f54374df26ddfbf0c1df5bf71fe509231177ac531514ed878d3ef44a03685dee906ab394bfb69cb56ff973f19fb466ce550ae7b03008feca7046bae160ad47f28c9004da68e594bcd7f9ca8674a645c50768b5ee0ec2088f66602a429f95492e09dc567c11083eb18da97d85968a86f6ad05054f4a9c4bcd57ece2511bdb38afd78084b0a088112b81b993a4d37674d97e8081795ee055d9c37d2907d859be201e73580ebd439620918e7c4b42243f43a59b475095205c40a1d37c0f87a5df19450ccd5f274f186b0f45f7bc153d30ce3c873d7bba5dea5a8bd348b2f0b4f25461927642a2991c3c4bc3ad34d09c104322bf27e8eaa0ca99c0dd47a3c4c618af02599192e362e75de53db336a68b223567bcddcd7feecde0aba91a1eac0ce6327d3c9c5be36698494b2b67ec0efd36b744d794f521bee251b31dd29852a94e6b95f143a53f93f85ec8c2e86e1026ec949aaf97fc9b8d728c13e1deab628e9624ca966992efae61dcd6e5 +C = efa9d549d0e975789a6c84b99991c98cf20db4de6789be6f9f7ae109e322a6f04a1a1c1fe582e4242a5e2f2b390827d7e52f1bd97a4e745920dda69a25fe93fa5de82c09dce8a2593106667820dfa1f55c8a1152ee02ed0c607ab986e5209859cbb36baf90ca63285c3128fdceb18e5145b9d3efe6bc7b8bac7837637ce5553d634b13fa3888201112248bc31e0785f2dbae1a8eaa7f4ebc0b00d21242f03acd4e4b0f4ad5e9bb1e25301a331a4b5535c1253b9ccc7800cb2e80602ec9ef858865a39e0ba28006da93f9c26c1fb9ee8258dfcd7e79f02ab0d9c2df9ade309fa6c604fcbeb38641b2968ff96349bcc34927cc479c30153125f7ad07068e5a4ffaba903d27488c94bfb694d8277251061a7dcc5b5fbf47ea88065ea1df007cbd4c1bdbc2e3609d53bf99bf2c7de8ef457fd7b85882210fb52c5093415dd2b37084b75f83d2a05a669867143fd95705f241fa794526bc65c5f6422410da4783340497bc54852125e3da579bfabc9ba98ec398487444b3da454f40d8b59d384cdd3454ad77f38d4ce657d10b4708dd4ecb0fcc3238aca57258e082e8cb86d802b9722388284f232811ec35b49ebba151d5b0771feeb6120d8bee8d55fa9c4de45d8f950752b3b9c6526fc4fa8fdd02c645fb053cef8293437111c85b07e98b3b38ccd3de93cd6fd08fdfb8a3a94b619f9acd226be3bd3ff906f14b1dda93cccae783c04ed545db685bf0 + +COUNT = 3 +K = 9b5c2e4a9e53d3e5f5670af2440439db +P = 
c4c726199cc27b81c90ee59f1f44420abad2a4fe0f24647952116ec46f364a3dad1b48ba5bc6c48c056e34764ed4b4a2da04e7e45ff477df8364ee9f116c82dcff7e9ae2db6a7722f2261b6cee1ef7cc66820184988e002886a6b636264e7704383543817e119ba3582d42aa00aa78dcf84d0ac92ff69f09046f89e7b608143699f9fda37b07d3e2949e935bba230e0a42c2d41d285042feaa480bcc1ef8236d615f449107fad97b5be514ae39d586b98ed45cc39784c30a3a256d62336002b1f7f6cedc3e5418d6f444bacd74d7bc89b3ec65c52191b4a2f3b3c7b682f254e6e909c5dcbc11d5def56792f5d2183e5e7119f268b0395e7f4f25b8574bc921da8419af30113bd35b11c21debb979fb698752264595777be7665419052cf3265117bce96b61f27da4072f828bd9315ef25a9ab9eb4f5319907cdbce6104054c8563a997cf477523478e6d23527f91997dd43ebaf629003530de4b8406bdbfae49c0d97920b745c78a665c438ff064d24803382f79fcc2e24265adaf707394b832685def125b69b1787719aaf92e518645ac22dd94d0dcb03f052c87954854d177e2caa6ab5d9cf3dc4ed4ceca90d2fa0b235efd1b7d17ba9758987d127b11204047f22e75158720feec549f29b67a8158e577325e178d6b757770d56a30b85d12ab5fb164a07a5f613b53d075c44cd3c1e3a311515aeed2427f70b3bc018dd796 +C = 09540dbbe97e983a3420ec1ef7d1195f9cd32acc93e9f31d07e68dd9d963f32bb23565a37d97e857e5e40eae27946ef83a4da10ec29ea001fbe892133a4814b04aa3ebabfa295a19429c7e6b263293d07a48f5b2beea2930aabe6268aec00a8fbcf9f4865d4658905413a0f17bb08bbc6466f8d3e1744d9ffe6132f94ce79bec4ffea05785525eb5e071ecf30081281df58f1b5295842967a486ee262a7c366978140bcb8562abf1a417c7b1ffe07b1748978408cfae4beec8290b312f15d3223fa7447f5a89e52ddfdc4f79cf1d9fe4ecad3e674e98136c457e36748da001970cceed0c61d2ea5f9ee5f9660ad561b9fb808be3d4455fe76ae9778a49526c8461f99dc2c271ff9aa389d3e6a546a82d15967a82e8ec9b64cdf25fb73afccd7cc2fde8d07ef4a1345976970997facd5be16566c7761d23f4a31909a5a31bce35e33596a861e426fb56d9ea13e5f6219aceda2450434e04c454ec8569ef9076856cb48f36618e690a710dfe8717526001c3be7eccb671dbb4430289924ae975a30b6fc7079f8c9cbc8a268f5defc03d94d5bae0474987b7ebd4c9e29eec9080e8d85776c570089fcd235372518dc38e481f8a5edc2105bb4191cd81a19e51c60252a18b3058ac1ff9368af59a2caf82a06357764fb3721287bd28204d77dc2eadb0913cabff7cd63c73488e92c818341563476e69a50fbb781f151e02f2df4a9fe05c8419a3393d67 + +COUNT = 4 +K = f2713273ad5b7add10d2b8d8b61d2f68 +P = 1c189d6d34b57c699dd8c79e30457c6735f3697aa6825e14257906e18413580727d676b6e667cd58e0a47b738abfd91144952940de60b161b0dd261d68af12573820f4f172b32d7c032ed7db3715fc6ceb92ec2d614a2005ea1aed14cc5015de143aaf6dc2db406e0fd31976562250e80eb79140bfc8d14fc79bea4159d66e1013704b474b0b1377220741dc55cecde0eed4e0931e58025e7fb672117a8288587cc91b1a41fb4d1e9e4a7ff59fdb814dbd9521dc148abfe16ab62fa45a3dc361cffb422482f4fd93466320a11d9a5438959947ba51d7761b2a3bb9e0b843ea6a71cba54a96bd14de8360ac8f16ece773d7bfb25df63c1aa67a7bcb587f618f2425bf9aa018eba5aebc2911b9fb6276e48d72dc74836a290fad0f5c545934f1231ed57c6944a464905fb08685e815ecddc8dbe4eeec1355f414fd2ce975c0dbe29bf4369e05c1582cfd9a7ddb7659247a0f6db38ba165f1f46a472760bcd2609ffc8bff49c6f54698063fa81305696f7ce25c95b237c6a4fecb3e48167060f556359609b73e738c5c3021d0e5bc62ed31a2a27f43dc417c0041f8a2402fa7bf76c72e2b290c060efd93ef65d0a893314b03c098a6674d0137f12eba3ebb63b3e65fd0aac89847bf9573d9eff0b1283843bd4edd6d459de6d559e48633d964dccd8b9fb5142596430b2373f9448606f9286157e810dd0f6a72843b0cb28ab452d7 +C = 
dbeedfe9aede047a07402d23b1b349a3e75490d3c404240c872470aa4f6b5a1723b6a0e455a8961fd25f88a9329b4fdc7d8e6bb2b3d7403bd65418f94b933467d882c1aaca702a47e7bf44916ce5eed0647250d572686c8a8c9f9437f2ceb13b361aef929d428c35cd4cf1d8970d98e75c46b4825c871003eceac2412adb6681f0b8a1d6d6bd7eb41c6e58ebe6909ab061d5e832f7580dce89eccb4ea2546204e895ccb54429fe388b0d9173922e08625d43f0a10743ad76230fa458b0971028b18e0f04f3c1049012a8bbc2f93934e0b51b943efd2fa4a14fdb795a1f21302529653583edcfa3c613ada2bf69a93e5a29a20a6f9d9df7891e3b438be4f9f714806cd57afcabc0d5f5753af804d885ca1ae23bb5fd2593fa361e92e7d12eea0805453056022256410369dab0e2e8d2d4c9ffda1d7df8ba5ee19c5bf62133d1ca9a713fa45157d327bc4e1d8e7a105012036939a5d5ae15a5fe8ef989c8323129b8b4e43402bb4f60f35483685893e9ea0b0dc4a8238148d0220cd9bdd7dc42590cc303f706cd81cc5f6ce761ffdc570c62cf734cfffe0ec7be17c7ae3ab34c65f1a53267b03de4da3a5f921cf01b068c4098c7594a3051bb03940cb6c5bac2aa04f95bd30bd15ccc9edb8e29091047de011ee9591cc4c83244750dceb8c2eedda797965c1243521c19228eb2d799775dd83ceab5aae082cfa743d58544b61b7104f3d10f632994a4 + +COUNT = 5 +K = 8967c8058742169ab39d2d528130df80 +P = cbe7ac6c232b327f7d9154acf334891398fa0a1ea64a4f15243f488ecc784749fec41fab7a447e2a2cc75a44fb548a46a46e51f4c245f64901f3acbf18f468a28367c51060c34762c8a88780044761579846557f285c97bcb9c966aa08d4cf352c26d8a2f308803393155437b91776ee74941da9db39ef64d9ac9261f2c7fb6349e52111a7c146d5a4ef6d5d00717ac93338fea5ff0c90cdefe0d66b039045330877d8596de4c35987ce70f660aa8777276eed1b128002dffd90ce03224d505950eb2c73f5e958df92ee1cd8dee667bbb8cc71f87e4ce880585f004160c798ff2cc195b09921c73951332bd437840bbadcd10730272d46b3f23963a7c0396275407bb8041a6e5ceed4d1dc2a65a70d4843660cc039ce94a4ce96573811d2b8be00e91dec7486484ccb050dfd6e319001ba8b8fef48b03d17902cb91792dc420c79e823616cc79af4e738dc077519015c734c1be0150921657f3139e59ba4ad8e75e0f7b51806a96b7249709faac3cdcd53bfed8f0326d19000ebc1ae05a43e96c1fa02f2cfadefaf871250aff6eade8ce8d4b8d5a686d553e8a66d96412f2f189fb724e67c8acdb229501414e45c430a9533d211a217eae0030ff10db12a399567e94c900ce839157ebee1f4f876675794cb3642ace55fbdb261f1369807f284f4e6f8a2c8330eed88e18b8ff564fb754b96f8c85d28dc7a0b18805ecc3ddffc +C = d29eb0bc54c43081030a3e1169f243eac73f1d0fb2b2f983fb25755684df4331a1932b02e237502c1b73a8d744619f521457b72fcd7401973ec48ee915054e5e12577ef1212d0e642c9dc7341cdaffd344dc480eaf9f29019355b07de2f05897cb91b00b4c53714609dea69f18c7c93429b73373311e0c4a45e2f2b3a6bd393b69f90a4125eb635a794563a68e092cc66730d236909cce6585d34f47a10c2968f161ed79c16c18a6c093c1fc455eb502770ee436dd48c091a4cefeaf449c5b27f2fd2f9150a601bccf8a37cda65b25917728ce3f1f12f55a51d9110c16132bde100b6479eb8a1ae670a9ca396f18a2ff19fb5543dbd3b189e51ba5f12516129071c997044df747f4ed70616e784bfb948a63211b224dfdfc5b9116b869fc5313c761d55c759a6e275e9ac53e83a5386e2cfa50f6c1b541cf752da800ff991ff896b79b9e691d67e96fffd166a06f0c300b7d9bb552f5ec7df6eea384751632ed9357df5a5a9fda931568930fedb0536f5491d4067a3951dc72903d1ca2f5f1b713e24cce5347b056e2f8829d34143032f53edb81a765d2ab3e6cbe12536cd85ac3230fea546ee50d991e13c562bea3d694d3990b1783a373a59fe12cfdb74fa0275e367d76bdef698dd33c30afadbbf0fe0dd7ddf288aec7a397e263a5d537df9f606f5c126565c06ed5e0bd6753d3eaf5002b472a1d7f4d7df7d9a8fe8aa2c595ecef8cfb3da3e7 diff --git a/src/test/crypto/KWP_AE_256.txt b/src/test/crypto/KWP_AE_256.txt new file mode 100644 index 00000000000..66d8523987c --- /dev/null +++ b/src/test/crypto/KWP_AE_256.txt @@ -0,0 +1,35 @@ +# CAVS 21.4 +# 'NIST SP 800-38F KWP-AE with AES-256 cipher function' information for test-files +# Seed = 
6eb04695fd5dc2b0038aff221b225c302fc9cde3d6dbf4195d9d4d2756c8303906335e07679fee9a80e5d0d1580f7cdab8f0fdd78c0fa58f34afccdcba4820a5 +# Generated on Fri Apr 6 14:46:23 2018 +[PLAINTEXT LENGTH = 4096] + +COUNT = 0 +K = 20f31cded60b8ed8d9d3fd1e1fa6244e76c7cb7628bfd28a5d63ce8aa2c9494d +P = f07225202842c8dede42215301e44b9bb7e625d3812f74f9b6ddbcd024ebd1f33e2cbf280b9004941f3cbf86c880a2357f88f92a6dcf8dad9da7dddcd00f3635efdff0af4382024e93c2af66b991e565eacca6b886f07178c9b4adad6f0d6ada5ff6aa7cd0712519a947a8089cea5e1e3e40ffe1806010b0149f9ffc7c4dd3c31b3d08d5ae1997c52369393d58611dff9bec501c1ab35e6ed3e7f9445a34e211010a8236686f154e0a5ae3433d6a844eb3884961aa6592216d93952b46bb58a4195aa80966ad0ccd4a7e23823912556a90d5ee9c3bb952ecbb9d895dabd3b11ab4f2e3a6c2582de50403289230ef4dc46e7c0d870a3f0cba9d643a0349503c1b162ddb6350e699589eb47bd563999f55a1adb6b78b52f006901b0427ea7d3394bb0adae4637b4f1ad5d5425e2c8ff3083506d7ad7ba4c7405a778b0a3a11760c96900a5256956cc9710091d073a19f46a985d004651fe2b6448ed761bf9bc81619cf273a6783d868d090753bf01318be21afd88d9f3a961a69f93e9d9fb822c80acc7b48cf14a08b5b7ef15c66975721b7cde9761a145b679155472a44dea8fedc0f86ae7ebf6283ecfde5f2444b51569e6723a7a19e28cdf8dec6791ccc14af95abad018f741575b343cb1a20a2a9adf4248f99728069a1e2e78ad8966c41c9918fb7019ef56c153a183a6247d22d9956564bb03075cbfd1b43d96818b28484 +C = a5b63618fc0c4512960f00a1f226d9837a90480baea75265453b9553b12a58c72153080842d7f8710f317f88fbbbf97caf879ab4bf416ba767ee9aeb34357f4a2d0e8b9571054d98e28804a70bc4d74807f2bfd95ee955bfdbb6f4d6969a0c3c3b541a514647d5cd8c9740ac3496095c3f145c50c97ec98b935158fbdf89705d5330015e48ece89188b8c1bcb2ad6825d865b375a9b9056b743dac720feeac033c9f757f6fe73dd7c4a747661b64cf490a0dd43b547cd791a5d78dac97efcd355f7ebac248fa2a33e4fad640dc34e0d40b0d36588aa32f0864c9446739a6b44ff84666d723bd7d646c5172cda932fec34ddaaba342b02a9604087ef042a2be4774194b5d32cb3fb112438fbf2801050b5424635fa2d3d3fb10332965c73e6669e65195310a3a30602640e9809179cdfc50de585aa1c0072423c626815d281a06eac3b6ffa137716318e288e3f9970e415ef0451bdc557968febf9eb6772c1f77cb8e95701246d9c567048142bb25e340351b87d7391822d9ee7fe51378bc0d08135f9f39cf44b348b87937939dc61f430dfe308cada632722e23aed5a0699e039cf0563ab8025163744b136a13ce3c62c748c89f5e17540f105e7c6ec9ba13515b504342f9e6dc7d65b9a633d8c0b5c9fa858dbb9b3a594406d478a81bb9abfa289730408c1e303c663a61d5caca00f615065312580042862397b9aa8c80ca812887664c439c8c68 + +COUNT = 1 +K = 85c8aa6d9c83cbcb11550c95cb9fa991d49dc89fbe2531e10694269f38e9a309 +P = 58ffd9e73b5f64cb951b7df799a505d842ef0cca6d4b21027680793565a67d47db0bc872a7640691216c044fbfc0be1cd33ffe9fc73c5ad8cf38bcfe6d4b7b76639ef4200ced0eff2d361e9c948075f09398fbe5ef24d370a5a16b515c102d415813cbce56f6a8f7b5227db7aab33c4d8162ac3c178e2344f3c7e123f0e60142112b960137c0176c450248167e9714db8566e2d4af731a7ef3f88730a4be4cb99bf46eba39527ff279be67aee1f4af25b3396aef5883fe7086fed7285af5216c7945d9bd61d907a84095d6fad383a7a90ad4061f7241a84358cf9fbc7048c010c7652bf47f9ff58f1c0cfd62a5fef92b02074ecbf4762af71eeda9aeb8497d7b33beeb4fdbd80b72f2c6dc9a12409cf7ee3716fc07515970411ed7e113b7ad9865d822a502976bba1d2ef254dda4acdf6f55a04c4ac5f6a5e011375af4cfd405d1220a10465a4cd01d160d179fc7ce614ef4023d55dad363e3639f1eb8fec716f21d267ca21b0587186d162b4a3cd23f80e33ca330a8c3bd756e70c2d457306cd54be9e0433bd9416efd861ed42fcde10ccc98389652b5c14cff6c73341320c93ee457f9e3f320c5c5564fe50eeecf2d7a62a85639ddac4abdad7495e8926111b18f7866e6b3f95c788d9124ab9788210b1e25524427e86049726200052404d9b3585477bfc11f12c4fa761ed86b7449a5471ce18f964acee913946c0997980f +C = 
d4112af26f37a7e2e9cdc0fd460cf4cf4f3985fc820348e63a095751ce30b2fa5d6e89a8d37b8ddd3dc2af8fa64e0db729f526bab89562ad7f2bbb9ba02668033ebf6eb742904f37fbf1d87b4d23b1dfc815529a6e3f24bb31ba1d4354820a47e0d7b0acb4e495a50912824715d24cdeab0be3597dc535326ee073eb39b925c7699115dfdffd4162b84afc61048c7802a76ddafec243b91791a26138f202bf365a0b0863555d48c953c50cce5e09fede4a97571d3ebf0582c57c39680714750fd2227a632274adece8a4bd13413825b51969b3e965ad417c96415f4041d8f0d0a3df04cbb24fcfbeb95b401be7ea840da43606cd5bd56bfde26034fe94aadc78bedfaa4eed77e64cdc3d059e3765ff6346e9344f0e0a183085d78711b13d4ddc356a669693d2ad090bb67e8013403c51c81951f8a6241269d3423a739b44215bfa403f8927fc1d100587d602a7efc505f28edfe51ae8df407361fa06c09d06c7e960cb7b7041f3603faa6b4767a8a3c3d1c8ab05f75143a3061396c7289503ec29237eea7a364eedb77f66da308fe8e194c9b1784364072dbcc8aa1145db1a03f3862c38cc1a08b569f8576623915a644700ab43f7eff7e59684eea5595c31b13778824704b4aaa9c1b5d2acc4a6cd2ae21c5b0b32bc1433d536e8bd7a4e6c5e6d1ceb70dcd2ac215bd4619aa1ea8c184033b3279cc9fd094336c5e6f5112bd9f36fbebf304ab04f + +COUNT = 2 +K = 5309c5dff8fb03b007c28bda0a219d329080120b430f4b5af9df60ebbe20890c +P = 22b75fa402aa5b665aa71db0e24e76a6796329fea9128dffbbe03764e23d32443cadfbb6d557e833f841ecf5cc7c9200de81b2c58fc72a1a9451ce51c75abde6504b7868d1864a5824d6ef5dbd57c80e488a011102f7a321639fa57c40a5eb8479cbbc7428bb5497dd6295e270b53b00eaff17c2aac199a637e2baabce9c3f5af8d8c4698eea5390d7bdaa92ddf8190361a2452df2f44220b92519449605069661e0360cfef6841b734c8bdac87feeac243693a4e114995e1985d5c93e512d0f2fa27f12db8803e0aafd974d968cfb82cd9f0d4b09f34a440cff85e09a63462fe9820d5f1ba6bc6a489f30ed570e44c06668b6a4d3b977a6b7fdff3e781f0b2c7a1361108eb95f569c3ac5dfb1352cfe0965d9bea70f923d9f65cccfa83a9272947b3563f81c92f7f5caff203137b034af82847de0653bfee01aea476c9112267f97507b6949dd1d0d7510fa280432159a0b3bd67a8e628ef11599be4459238161914520b556fd5d91e92349169bd2ec175a9e728ca78b56eb2bacd56fd2af2696d1b116f5776e9df459c1023180a18f5d6e3718ac402bb0fdb806cd7b1cda6fc107e635f6a756005df79b720f3f274deffb31552cf238ec93182370d0ebe9f0b819e2e7225bc2b77e702d5df2825fe5d42005f1e39d67414ea539810db552049641ca22bd6539f1efefcb90dd479bf93080cb9df86a9aa99048f950297b8343 +C = c6a64b49464f5db516dd39f3d1dbc444528b5da57016b5f2a5649965a4781962264dee68d4c78e987f9446cdc815aabeff2590d3e2679a2583dbfd3aa6b8bd3bcd86d66d598daa6e766a6ab8fc6fe98a1bc89b5a137a342a59dfb52c7e7b2feb2a33d5f936503121476a9eee8d5c60b6c4c9aa7f8c56ddee5b4a9d7908890d3aa98f45cce20a074896aacefb6d7520e84b6675d02b819f37929746298d2c81407c20328a1ff4ee6da8298fd639fa9bba4ba27da5e19520434e27ae4bef59b73c3660aba0353701573d602d8135442a14cea4dbca3732a3fc8e5ac831929ad48ca30d4ee0f068f54faafb50bc7c6a784e81cf86b29225200f985af914ea6ed166e08c04e6324a372550603c98bb6c010ea7219699e76e4cdd9c4b8e6c9200d10c51e93a57f25b0019ae436dde220868596e96ebea3086089ece51b93a812c8120b8ba60435720ed690a2d1ff7c024cd514cf73bc7cb121a83222e4fba2dca65aaafd8cb51ac096511f83327458c53119f2462caf6cde5ae679fafd4f96756aa496ae6657ce417b2963c09463ab810ef51b4942dd1ecd120d69ae76f8928f3bcf20465f68e526beb7cef276533270f08136c4088182dcd626ffb2c3c4a357d09affb6b935e671bc827779ec6f260acf2c01f1faad57b162fb50579ed45909be1cb2a4a4dd07effcae6a2342022e0bad504261bb8d5d3b73eea932b0135fbb76fd6b8f8d3b6435db6d3 + +COUNT = 3 +K = 8027a28b92435ac111a67ba0f4af5cdd50e6bbf965cb9077189143440db0327d +P = 
0d8f70b229ad687ab5f55f0b0ade81a4bcff76dba57aef40abb8074b9745ad74efcc8d8f2202b92a9788f8049c864e5973403279dd02e165b772ed7444795ebfdafd8e13a9a22ac1103d0c15d8d279c617dafff98dbdb9bcbea923a00967287ea9accdfcc90b2b4047021115d5454d057cf3a27a44988a8fa71578ab7751a5d3bbabd1a335248564f5e334c3925119a29c219aff6e71b3963f603b0f6e8213219fa56a36e33066f49409c643dfa1ab7478630d623c7b77efe07b0ad4811a522ed81a1c905831689747efb4edab65062cea3724c3f9decad27290c4ca0ee9c68f1ad9ce3981a0b712acbe2de17eefb7f9bfb3efd66aae691e63e5d64233715eae8957bce7924fe2f259309ddfe050cfca3cf76c52f96f8b7608cc833bb3876f35e37f6a83ac0a21f0c0fe859133c17c5f7422c72741137b8c3b5c3ed3e7737d5d8a29c063f1490558982496b7c852ec95bc452113c48bba93394dc1df0eee1f8dadd38715e93a1d7065c528f08525cd84583e250327320405bcf18d8011e3d864978f948b2839bb6d1d7bcef4b5b6871243a5c93a2afc1f7fc7d06c8307a025ebb522cf0fa1e1e9f8163f20854514504f4f873d757faa7938bb1ef91000df4ea64bb985d903136fcb9c4e9467561cd564f61146ac519c08dfa41f22b773cc821cd1252a3ba237c1e8c1c128ba40e0b75e67adf12cc448e47fb4037850b014f9c3 +C = c1e82d374a1d7c07b1f061656fd58b5df0a04c099f3390e98d2cd460b2e2713eb7274b9f646fc453a5d26c3fc0b9203521de72dc78292cf4d61095febeb0748658dc6764fdee4fa498b93a6bc053b3d81e10564cc9681aff2296ae0864fe3db495298ac4174f6ce06ae6b5bf86985c4603700757ef44c82c0e03beb7f2841df2dc4dd8022d5dc7ea2f711daacc1c8d1805798f646f1041146891e88edc4db9904f122c74656a5ea1694f8e6073920481e6cd30b20fac4cad7e03f58183f6b0beff8dde3ab4f781a1dc36ce0905e41e77be65c83dc9b6ab69d5d4c15a1de3cae6b081ea9568578207d678df77e20196483c2e00d2dba604784495949285a623f33e07e20687aadb86d57adb9935865e6467801a10add2ac7dfb6f8ba98e7635d6766eed38ca8fa8101f93018261b872a2a899065918b672423288cdaa417560701600424da6db15e0ebb7a5d1d0a7e1cc8aca714ac6bde51f779033563b49dcaeb8e0330112902b5ff0ebbdd00aa371c902092566244bd9a1e5e978af39adbe13a7067d056d04b718133575e420058529a42ec032cc5fa62035e872cec7acb3c32eb8b02e0cdd81c59afdba65b673c828b980756e8ff6b0160a1e796e221ed40c342863848b76e4390ef730abd3edc73ba975c037e6ba5e58162d7a60cd55b61643e9a8d4f6e5efa2485da7c45991ec4e8d4c6d9432e630e703ad1fc2f644d7577248ef7c7c974ce7 + +COUNT = 4 +K = 1a5b5f43929215da78d1d8c8fd4ba4508a66273be95c85a892da606eed04d6da +P = bd762d13e2385d333e435ecd792b8b3f13936d3b8372466fa7f65aa537bbd0bca184ec570acdb99ce59633bace041fb1374961be99276ff1d4b82cd49d78b89585aaa6cb1bf18498f4d01f2fc5e41e2b4fa56cedf5fe69e39eb1bed303d0795a715bb7eece6b389b3ae81b23b4f053137d54c03c2bdc68c79184fcca1758652c61a5e3c19d49b3af1a8946403b254da0f9ef227d2be6d0c528e1a7b78a55a2e79d79d6e420e9a3e8b40bb3ee74935be7c8e935414209be01532c991a871b90de0b35e963989475fbcfc37d81cbac674b1b29da1d0f9e78a6c3ef55575a8d3d7637b93895bae528d85130d059415d7c456a5c8d94897e567d6d6748e7c2689c98331ee5a7e1e25909901ace55828737e7464cb670399af3f806b996f416550d256f354dcdd1c332b4ca9eaef70114a7bc171efa08b88f1afd0429b2cf15366105cb812596a05e22b06debe1c69b06d904fe6cce284531ebefbae44b9c9eb156dedd01a4a73a1e631fcabc349e07636741580a916b35560d02e1cbbbea0a335b395487df8f972cf8600b48237e7b63bad5f08c85b85043eac0ca93d34df56cc486857d6eb3de0c332ee1ded06927e8acdaa3fce78636eb06c0f805788189c56023c8a1f5a2427c6f0f8b993f517a9b4a3bc3fa9c3e6318256dc49941c1e92973e3ae4814b87135c9ee767348045ef7877027c8ec441c50b1cab98bf556352333aa +C = 
bed34ebd1c70d52665103651110f847d83e19a551f5156c9720e2d1b038ed5d5ac0be8cd97ad3d725a8fa3c127939cbe1a6946558015bda4c7fffff7e00662c4b0ae3d5477bf61e175f8e4ca14cf53ccc558b8fb5f371748329b6cfbfb277cd0828a2394ec99f943775b4a2c1c285127a2d4677ba3b918154908badde2a74b58f73ac8f92b440214a3cc990f1517372b2e53272db9bb191cc7711ea31ffba2de2b646e8e312f4eedbe9e51e182603cf075ff69bc4d845410bf66d20215c160df05ab007b49a5396bf69d702d193aa0e2a2f9d62b7f9fb46cdf11864314101c7b7072f5ba8262c32b12b14225edba48b440501163d027e7fcf2292c9b06ce19dd0eabc6b6639769544d71bf068b8e03c4834c4a785a6b2b927c077f839f29da61c6c52c39ef4354839eff993068b67438266e10e9a421a90a103835829320fb1ad9e7e0cf73bdd22aa268436f4df0d973c401f8bb20f55585954079b6de5a2707ab5ce3b9f42af09b7c1996a45a15af12d041f5fe0c055b969a9055f7656cc2806745e84836903804799ffcc15580f8a50362f4dab9694e8ebf6f32cacbc3ba3501959ffbf33db0d231722715705e685df9202f0466c3ac0f1361a71905e23b0aa9fcac76b01d6402598861084477338b5d0463a641e2576f76aab2de95016cb31d37d8f4617e3908a179b43e3d50c516bff1e388bfc0ca519430f85a0de5b0e274cd0bd021f20c34 + +COUNT = 5 +K = 2a919617d24baf991779de6c06e68bbc4894a4e9a702214ba3eabfbd0015a7c0 +P = 8d01e11206e3b9285cfc73c2ad6b238536fefbe824f19588685601bf94e1ee2515c4ac281aa04feef877694840d15f2a6722661159a7216548d768b4754d65f1f1c522daf2baebbf88f638fb324b182e61d315f5f12b04c486022749f6a222d87c7d635c52605c7efc1853cbba466d138d3cda22979e7742e4fc8168f5139a72818109d2124c20ddcecc74fb388f34151e96fd33e8b269a7d7e51b6fc6ebb427219163c3d9f7de4a6831a7f8b596b4b490789f1104fa160115b6e8d191d1db885e42812c01724170e765634fdb8e05555cc7a85c3d8cc7a8e299b4f3708a4a5189871037c0d04216dca8e6127b449ec6004e7f076430d1f870f0d2c0c6812c810b4665f7fc07fbb04b56bd956b81267d2408c31be8dabade28a3c40c609eee3458b7340715272ee26df8d875d1139511649b629df067a8994e1fbcf7ec60f565e8da257db1b7ef3b2b765c77124c8120bf26db8c47f493d2afbbf8b89efe2e33c21c2a2a1141c0aa5b9dc1eeae50714304ff35a7cd34da963f5249bdce8e2c2a3026672bcd2ebf311f446cbfb7171fd35b446f0270320b30b554f96573eac72ea0594306d43c5ed48a6c2d01a363e88646a8759117075596fe17f18c0b36d00b7976098fddeb7abc4e34bf6d6b35238f81b59d169db4578a96c2bcfe78bd583fb31fc2e7fc97c6f3460147a5730bdf31ca197482acc341b7a719c4e1716cde56 +C = eb62ee28d9929093e3c6dbc0592c35dd0ffbe8faa37173195da391b84b1ffb342b9bd9f0df0947c33c7c74a071e7d71312f91c8e1f11d680865563f2d9604dea06892f2adaf2028246b74ecc82686bd7c369426a09f23012b69a89e7e66e7cf008b41dbf04f72cf950fd8f6ed63b3392de53b6feb34e45df3153e4aa6fdd685fcd7e3239f0509f474d8f6a58abfbd36bf14f93fecf913b2f9a0f0763c39cb6828651434ef0520cee88f46daef7932e5ce1549774ec106c2aba54cff64067e0202167f5a3ba1cd6396bda08023d2cfa9100043b44b8d90e2862d7d2a7f6e3f5eb6976dac2d5dbd2adeb69858ca391fdd4a582d3cb79a2d9fb57ad5b2cbd157b1e36ed49fb848e9960b0dd1c715b701f1d379027999fe8e35769e27a9ef60a45aeee56c7a3ebb39f6c50796d1236e721cd0e5d931e6a85f28fd87d652c3e5b706e2e97c12c4b33fc7df4585d60d0326267d8d252ab8970a2528be086d7ad18ee4b1531cc13b9c88cb3a9188a47b8a72cd276eb4d7d726a290398a98be0083e9917f349d2c8137a6fcd2232baedb6edf075d8e938319a12bfc31726b13d0e1a6c18ae33e11ad1f451fb9693f1774aec30fa7b473c98cbaf4dad1b3743d118ce69620503a753b87e4ae25f36564c2538d6650291087ae04d181e4fc44b6804b5bb0c6c8a0aa04a49895ea64d13f7d253e62216736a716258bc98c63349b5b19f5ae9aec3a176f98cb298 diff --git a/src/test/crypto/Makefile b/src/test/crypto/Makefile new file mode 100644 index 00000000000..9b5fce5b306 --- /dev/null +++ b/src/test/crypto/Makefile @@ -0,0 +1,39 @@ +#------------------------------------------------------------------------ +# +# Makefile for test crypto programs +# +# Copyright (c) 1998-2021, PostgreSQL Global Development 
Group + +# +# src/test/crypto/Makefile +# +#------------------------------------------------------------------------ + +PGFILEDESC = "testcrypto - test PG crypto routines" +PGAPPICON=win32 + +subdir = src/test/crypto +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +export with_ssl + +OBJS = \ + $(WIN32RES) \ + testcrypto.o + +PROGS = testcrypto + +all: $(PROGS) + +testcrypto: $(OBJS) | submake-libpgport + $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X) + +check: temp-install $(PROGS) + $(prove_check) + +installcheck: $(PROGS) + $(prove_installcheck) + +clean distclean maintainer-clean: + rm -f $(PROGS) $(OBJS) + rm -rf tmp_check log diff --git a/src/test/crypto/README new file mode 100644 index 00000000000..3750346a830 --- /dev/null +++ b/src/test/crypto/README @@ -0,0 +1,33 @@ +src/test/crypto/README + +Regression tests for cluster file encryption +============================================ + +This directory contains scripts for testing cluster file encryption. +The first two tests exercise encryption and decryption using AES128 and +AES256 in GCM mode and the Key Wrap with Padding (KWP) method. + +The third test verifies that the data encryption keys can be encrypted and +decrypted, and exercises cluster key rotation via pg_alterckey. The fourth +test checks that the database files are encrypted. + +Running the tests +================= + +Run + make check +or + make installcheck +You can use "make installcheck" if you previously did "make install". +In that case, the code in the installation tree is tested. With +"make check", a temporary installation tree is built from the current +sources and then tested. + +Either way, these tests initialize, start, and stop a test Postgres +cluster. + +Requirements +============ + +OpenSSL must be compiled into the server for these tests to run; if not, +the tests are skipped.
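The CAVS vector files added by this patch all share one record layout: '#' comment lines, a bracketed parameter header such as [Keylen = 128], and then numbered records of NAME = hex-value lines (Key/IV/CT/AAD/Tag for GCM, or K/P/C for KWP), where a decrypt record ends either with the recovered PT or with the literal line FAIL when the tag must be rejected. As a rough illustration of that format, a file could be split into records in Perl along the following lines; this is only a sketch, not the actual parsing code in t/001_testcrypto.pl, and the subroutine name and hash layout are assumptions:

	use strict;
	use warnings;

	# Sketch: split a CAVS vector file (.rsp or .txt) into per-Count records.
	sub parse_vector_file
	{
		my ($path) = @_;
		my (@records, $cur);

		open my $fh, '<', $path or die "could not open $path: $!";
		while (my $line = <$fh>)
		{
			chomp $line;
			# skip comments, bracketed parameter headers, and blank lines
			next if $line eq '' or $line =~ /^#/ or $line =~ /^\[/;
			if ($line =~ /^Count\s*=\s*(\d+)\s*$/i)
			{
				# "Count = N" (or "COUNT = N" in the KWP files) starts a record
				push @records, $cur if defined $cur;
				$cur = { Count => $1 };
			}
			elsif ($line =~ /^FAIL\s*$/)
			{
				$cur->{FAIL} = 1;    # decryption/authentication must be rejected
			}
			elsif ($line =~ /^(\w+)\s*=\s*([0-9A-Fa-f]*)\s*$/)
			{
				$cur->{$1} = $2;     # Key, IV, PT, CT, AAD, Tag, K, P, C ...
			}
		}
		push @records, $cur if defined $cur;
		close $fh;
		return @records;
	}

Each parsed record would then be handed to the testcrypto program, comparing its output against PT (or C/P for the KWP files), or expecting a decryption failure when FAIL is set.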
diff --git a/src/test/crypto/gcmDecrypt128.rsp b/src/test/crypto/gcmDecrypt128.rsp new file mode 100644 index 00000000000..53f694d37a2 --- /dev/null +++ b/src/test/crypto/gcmDecrypt128.rsp @@ -0,0 +1,129 @@ +# CAVS 14.0 +# GCM Decrypt with keysize 128 test information +# Generated on Fri Aug 31 11:28:04 2012 + +[Keylen = 128] +[IVlen = 1024] +[PTlen = 408] +[AADlen = 0] +[Taglen = 32] + +Count = 0 +Key = 137f40bc82b01b34047a407f40d5c434 +IV = 1f3de3a272227db8a88ba0cf7b23f7e57f84ce370b289e2f774a0c6184f44f9df9b3e645ae506e3d18cdc819fdc3b31f77d2dae7ee295ba2c776e6f1fefecec740021c4aa1118a6ed3813720411f27ace5c11437a78b3d626b5f421e2f3ffcd0677930e7878110a0cad535c057261c63f0531b538d8bf04f491c16e871f071ac +CT = f6336f2001a5edc519aa64071fabec2fbe4654ff334871e9cb8b12c24ffb7ec9f0d316e43bbdb568b65608f937f24f64eab19b +AAD = +Tag = ea3d8cad +PT = 60881890126afdf881dba4c1a7bd0f9d8c240073517b21c622bfa70c1f05d59544b9dccdd908bd924ca60d23697fe64aab927c + +Count = 1 +Key = 5e49fac6608ee4157a9f0ffde503893c +IV = 53d854bba28eb9d9db71e68b5916f9e73964a0f95c6524411fb18f64977d012d611a15fe8d46ec49bcabf060cfc7989c4a8272567c660c0881f944066b88311992b5d52cd644bf07e9f01d2249e5f9b00ae1af9f2739da627b3a8838e28116b02e7d171d1938edc6dd5d7da3dac63ed148b367961b809bf423510f0a3f44a8d5 +CT = 23bff066410784e0999a3d0fe742ee28ce8095fe822fae0f50b0b17d70c8cdcd705c83e108fcef2d64c5b4db110e9a72d3c4e2 +AAD = +Tag = 01aaf4e6 +FAIL + +Count = 2 +Key = 174b47de82c64196af18214d9d6dce2f +IV = db9653e9ca4ff27f8180f45eaf4861541d30b3b147d2b9b64709983dd08312b5fa709a7bc6afc591fbdd08fa0f21f06a25758ee199a6c6415cffdd9777280e620d7bfaa6d551875c069f5672a730aa68137cf33f86d46cbd548248e68f589c2e699c554ae4680417e9440e5514c1cee03205324f789d953d63518b023f65abe6 +CT = f8b0c1317c5f3550363762dc7fcc847376a7917e1505cddafeb5451ecf4464ef52afd797697f1b0b1e8d3a2376600d6acc1c88 +AAD = +Tag = f6fa70c6 +PT = 523986d433fbcca153af7765d177e2bad72ba52743440250db451e2da643913cb25b33fa364011a8f8d5cb563b461d46952923 + +Count = 3 +Key = 5194e88020a501ddf21a7ba5525bff70 +IV = 444b0037fba6cb9d4f4e33df32289d44f262d8f535564a1921e6a4315611b98a4e1074b37114758b421ca44efda91a48ba8674c695c259649c0bd3c11bcdc8363d3d113a534bd790988a44e72559fa70a5e2deeb9da187e2fa89c7b636a000a38d7bfec2edf2831ddfc9326f0c54956c149e5ffba3f556c8f2a749dc6ed5407a +CT = 31398e58d6e27c50e85c87a4dbfd73134f456ff042c9f869b47a41e3c10db7b24cffc103a2185a696fdc1f2d389cbf41379f04 +AAD = +Tag = 2d28343a +FAIL + +Count = 4 +Key = de250ebe0ea1212480f7193934d1ae84 +IV = 255f658334f1c5258d613ef92da7242079795e9c04c5c4e800aa1c03b6abe77f4dc28cf614278900b52f1de23550fc9c0ace8e8dd91a25d0240281dcd684dcb786839f376496692f3e35a5946bbf7b39cabf7d782f4e77eec53a875d2a5e3106238dae0d048e8b9ec7b05961d9d68489bcbe2805ab0f9cd42b5f8f62b4b28b00 +CT = 6eace8c7c10c713870c676e557c6b49134af26011af157de1e2c6ce0769fab666b9a7844756d6b948ccdf1df77fa98119c83a0 +AAD = +Tag = 87b81d39 +FAIL + +Count = 5 +Key = d0d9c4aa70e5bcad8dd9d72d14c21c54 +IV = da20388ed0709e5af5a52a5293bc4060f02ed6d05d96f9ac3a05c14118a3613ff790b721ff2c888d63c04b9b28a7138ca00d0ebbfcb11b70868f5b90c0dce79c7220fb90d5f3b6e22c6ae3640e7dfaca72ec12813820debdb69f90c8393645d7e96cc36bdf62ecc4c0ec181c64593c65c8f707d6db61b16c6a5c2f05f8851692 +CT = 2f08e5a8027a88f0e223429e6749dbdf91ff9742c7c6130962ae837a434a7247600df8e2004fcf3e81f8ef64e6a23bda002f3e +AAD = +Tag = ef6c0d39 +FAIL + +Count = 6 +Key = 464609f485abe0bf4930102f9637fdee +IV = 
d0a788fa1494d47bd1cddd55f88515229de5c1a2ca7dca054e08bf35d3c0aba677671d5290c3375fabe40de14e15465138d298edea5efe1ac62452dfee86920fec8e46577bae1c0b43d77cb6b8d1de6ba4557d7410796bede796b8b87b5771e30e05a8bfa55ed92194d506892c33f8ad6038ee8896b98772d1a88a7a40ceb650 +CT = 5e81bd883fbedb3e074ba273751081752c3aeb7a62b08091b768f8f8e6c0913cf633358fc26885150adf65bd1d708d9e45eff2 +AAD = +Tag = 7e715546 +FAIL + +Count = 7 +Key = 018319ad18e5d877dbdd871316fa6610 +IV = 941f8715bbba7b1fdfa98c8c05546a50f86cd02b654862af6c1a70cf4d79618b37abd1f06b7a29bbed74e728a8c78a0ec8ef8312f2552fc4571f48024ec5c6d44c21d69311cb1ba390cda615f246cb175a9e8a522a20b671e6df973b8924fd003c5dab7e59af63f20edf56e9dbc9f98fcac25cc692cd87a3d8fbf8d4b5f150cf +CT = c5a681b2b2e5f68e0cb698150aab9d13e93e900e0e6fb20170a4104839a7e0a5775bb3f20ae814edd109530552ee2b41cf5986 +AAD = +Tag = 75a966e1 +FAIL + +Count = 8 +Key = 3556566a1cdec721469cc86412ade697 +IV = 731a29cc083422da193724a57c8126b587c3817fbc6b76bee106abb3ac48722bfd0cc15deb573ca73f2bc10ad7b60024f297bc52c31a7021bda91c5246c781a95d5d18978c685f096e9310d6f5ed151122262f7d3f5a2b265321683cd72e0693ff608e378199264eea876df44659817997b323e43457e70809c74737d015cf26 +CT = 753174df0aff575fcea79e54e5b8347e69175070a00d58720529b173e9dc64677fdf459ad72385b03244d90810670304129f2e +AAD = +Tag = 41147153 +PT = 7588b757f08b75fc799fc825034ac60585e24c56db94920b6982bbae86030e7a021ddc89a5131e5371e9295e414d3503ec11e3 + +Count = 9 +Key = 2ec96059d1313923fe522bed1e942491 +IV = aafb096a63195cfc959bf93785a48cb7a78b98c60f861c9646129222643d249e50d46f706d670662f60c12a7f0a6ad65b908ede9a56b27c46f07d427754c11549154632e9b5ffa25c9306add4c0b13c7191abd59430a65ccc42c5420ec98093ad6ec6ad1c4d3bb895ca3546236c2c78cd075b4b1f44a6a03d115b91df02b96b9 +CT = 61c6b8bd44c9c6eb075fa9bba7ac0156bc64a6326d32afedb51f35daf232dfd068dbfd0505933519a753bce6de4560b79b31f1 +AAD = +Tag = 26f53f5c +PT = e4b86f8b7684d6a300cdca848ee8185c41fcfe061e61af6f3a563ceccab58751f1884b322019983d99150db217a1e92a8f15bb + +Count = 10 +Key = 48cbb46539dca9faa97b69b32015886f +IV = 1d3152ccf3ff05a81837ac7c55f791c78aba8c95149c80de8724b1126dda847b983a1bfe54ac5206d5915dd0796944e4daf432b14a72c5fcf81822ce1e3e7ade345528641b94a2fdc90afac098b9e4b0b5ad159ba275dcbcc11fec04bba0f12db9b3d1473ea0ed7825d293fd0b5d2d07795b3de922bba5660778adbdb2cf8539 +CT = b2658e897b85d9acf437f9e895189954134c543aaa91fa82c586cb5bb1d96e04b189df64df5bfcb63a567e444f4653aee3fdbf +AAD = +Tag = 48d21620 +PT = ef58f7ad664e36dce523408e497acb871e9a579fe4688be6425adf41c4c41db85e905e9279b93a037648802dfc273b387e7216 + +Count = 11 +Key = c5c2e83c4e5b336e30257c95c8f7f75d +IV = 269c29118eb9cb4ae8fd8ff5b3a5d61f4a63a719416bf2cc5225544cabdae24741239ba7910d1a32364c341169a215264ae62964322b6ac64002d09455f11d9658a9454cc6bafa82dba5b5365b2243e854470dbe74a3043b5f5f82ebefadaca4d468317c83076691c188af6e800dc296cdd941fc1aaed8840874f8cbbf96827e +CT = 891da6e799429c772edfe93d9fc3ad20c25fd5a172d272d0ee6b7063b59b611e4a2805923f182abf99373fbe4a1759339bb13d +AAD = +Tag = 6759d819 +PT = 474a2e7aa4a5422bf17f257b584059fc7abe2d25e7243b434a6f936f5a0eaed692616a43a2aef8b427d83ddf1bece29d96c581 + +Count = 12 +Key = f4d7a0e84b7f024988f0507098415616 +IV = 9d58c03fe45a97d4c556bafa4292766c55bd84d0e467c6c26b0e42162015f3f506c9fa06caac63a3ef6a3f0ea62ca4e122bf453a767b2b00d6277a990329b76073c64ef260421d887eaf89b6deb488a0528608ee8c38faa5f5976b2b71d7e29a08ccbeb2342ec15c46232f3c8867ada7cb8cc8aed0728dc59d706913ba62124a +CT = 11358bd3ea694574c0bde99aafc2621732fcd978626d3a3ef124e98a2c3dcb8261580306b51ffa9557d379952ee31bffd9ade0 +AAD = +Tag = cc46943b +PT = 
4a289261f4ced6fbe544fa27d430f04c72ccfdbbe0f93e881fcc1b29a19e706a579a98304cb2b03cfb3aacbe187dc69f923ee5 + +Count = 13 +Key = 31237ffe81db56b9541133156fd0e7d8 +IV = feea389ed83bddb360f42351cfd56321bdd0a8fbbdcc8787725a236365eafe5dbc9c6e9ec8e9ea0f74e623c33a574ea4223a43b1550add0c75d8348315add72ee9561e1750b69defd46214bcc507d4db4c66c6da23612aabc2ebc9b0e6d5aeee270c013a553ddbc7d7d7ac62ecb33a5a6e3db859471ca3d3686203c0c96eed2e +CT = b965d7bb29a8811872c894d535e001d534a7d6e99eea5bd14ebf39c5e79c623a67d549bafbd0dcee1494245c04854f34de677e +AAD = +Tag = e6fac1d4 +FAIL + +Count = 14 +Key = b0e576fd7cc85f7c92d44e4b8c149b3c +IV = 3855af97c5b740baebda1532c196d187b8af0da761489ee9e267fcbeb720bf6f73cd743b69d942f3f44893d68a5c70174a1863dfb831ced0ffbee668a03e1066e8b3dc03dc1471c3a848b3787c0645b20add2fc0cc37e5e0aa57ef08fc69c53897030fcb5579a831ee53c76df2f75d2de5bea93e9ddd8e4e1383bfa1a7c7aa77 +CT = 624aad7fd047a61c09638b8dd5065d7c00960035440303a03ff1b19caac02baa3b6835581f2b66c2381006798c9d5d63d65684 +AAD = +Tag = e6142617 +PT = 0fb24ea93a14e3d76c4d7a991cb55e6bde4083504ac9f3a00d4b0ef53d3929953b53f5ce820c41b9aa75a985e09986d3f4baee diff --git a/src/test/crypto/gcmDecrypt256.rsp b/src/test/crypto/gcmDecrypt256.rsp new file mode 100644 index 00000000000..af72933a605 --- /dev/null +++ b/src/test/crypto/gcmDecrypt256.rsp @@ -0,0 +1,129 @@ +# CAVS 14.0 +# GCM Decrypt with keysize 256 test information +# Generated on Fri Aug 31 11:31:25 2012 + +[Keylen = 256] +[IVlen = 1024] +[PTlen = 408] +[AADlen = 0] +[Taglen = 32] + +Count = 0 +Key = 60c9f83fb0ddbdc727e70bf9eb1acc13b1b63e3056e64db7c2ac55c4f2068273 +IV = c33d34a3673b93bb78dd1e00f877c4e6e4cf628438b9effa61cfe81e159155cc9ca7c1418917527ed3f0a51daf2bedbdaca20fad687a7dd086ae086c8ff5094e9b31fd71bd6f8f1f1adbf96bb2690663386c37d7bce891137897aeef70be10a453cef7e31c1b8c0a24ac1baeaf08a46aac445ad5a8103804825fde86dd4720b4 +CT = df6586921250aacd9d25f432977e92b09ddf89a9403c83a80890ff15ce9c4559145ecd85d86f1573bbc1b48992859d22fc13b6 +AAD = +Tag = b0dc70f0 +FAIL + +Count = 1 +Key = 8d7db520bf88c96d46778991a4f0b6de9aa7fd5d35cb6188a6f355499072af5c +IV = 488a50706bfd8ec7fb4c508511bf4c897c8566ef289b5e58a4c59bcbf16b5ae85fbccaee4a1cc0d1ec74156ae911d36d497f5ee71f1fa51649819c9cb88cf65d62d2abb65d621c202bcb33d8d68018a858d04e79deb62b3486658730735a1c87829acb49e73301902c116c9b6ce110f23a6b1a4dd657e47a328e017c19f0ee52 +CT = 497e1a39ba1b38d263bcbf19cc2900ca4070ad37ec12bfdd30139a7068a889825eaac5012cb5c2dcc710a220cc658dcf069f60 +AAD = +Tag = f20b9885 +FAIL + +Count = 2 +Key = 1ae1e42ee656973f5628e3cd11f0494dc8a563cbf5fbc5880cc2dc6d787bc9b9 +IV = 23e3c948cac6eba2ed11d667783557917f066ff6b93ab9409df9c7c84b27d26817dbdebb9fa9d0a64bbc572bfb2c7ef7f0c836528c9bd692505c8c5e522f57aecf6b479723449398e5b1f45cadd81264c5aca8059562d69deba26395034b4b01325d072dce92e540c159dba92d3e41e2d0947d873ad48f9f0b00f4807d420aac +CT = 3839fae7008b88250b602cbdf295e932e3c4e3710d397a2b9a37289104efde75f73302b2820f14664c064e8dec45ae49a74036 +AAD = +Tag = bce334cd +FAIL + +Count = 3 +Key = 8af302bc8684cb91b4d7a6088cf8c94f9f6e027ba046c2b508956ba3c88f2d65 +IV = 3218736e931392e6510b91210a6a6a27680740ba8924062ea176048d6b42f44ed04a46ce31843b735ef4f63dd1d85643f28fb335d21fc2e3c673e97e6b845e363362d32844c9054a165f40658267bb177b74797a8828b1eea723d51b571d93748c758ea5c328103612b109e008f743f9505034ed3c42ab3dc310c20938f8627c +CT = 1f1b133a1a7b58625fc77021f8ad1751bfa2b8addc0a9837dd5c44632cffe5ecc2e9e54b90cafb6cf8b652a8d2da116ecda3f5 +AAD = +Tag = a673129d +FAIL + +Count = 4 +Key = 13a7942b5a5cecb2bdc0e8b0348d4db5a98572544ee31918ea625b0691c10779 +IV = 
d92b4d05a549b296e18c90a8da55ec5bff3547a679697c489a1d49dc02bfe2dd85c8f050b32c389c4f857eb4b663f53354bcfe9c3a7e30019f2e3994421bcf3a3d1cc093768eed71bad5139f3f3078514d80a4a41d1284b5dc43ce07efac9c475d6ba2acb66dee50cdc62c463a05ca396e72d189f50d44ffb70d2c6112c6ef0e +CT = afe058cf694d64706302b405243db77c7ae2fe4f33c6427416f8992ba92754c69d4e7c1a89e9b6987f2bc0a7b568dca9c9d273 +AAD = +Tag = e0bfbdb0 +PT = d544d114e3d9ab8aa2b9ec588a112f780a6df74d637be3cd34fdefe14506f26281cacd2b98c26fc4adff837a7bd72173b962a2 + +Count = 5 +Key = 7272e6ca6d6d76c483df9a55c6d07bd54fd8fad50b529ed52154959acf01b64a +IV = 5c55bb8f4fe797ce34c0e281c3b04ba0bce8689493451ea569ba8cbacc74ea36ccf319776f77cb4d7f901fd0ff23cd28ff0ca77ad9d4adb0329fb68a60ff004a1c5b12111d2dd705ab1f7734178f14dbb356cfc0c5c208b91c277235f35afe8c2d46ebf43bd5e0a653e67e0c086ebcfca32a56d56dd5f810f562f769cce2794c +CT = 7f378bc30cf2774f21078f42b5d6b66aa355c8c073d3a70f06775f3c7e5948539ec08a2cc50cae6f2ad9680ba47bac190c3068 +AAD = +Tag = e612f4b2 +FAIL + +Count = 6 +Key = 82b1d2ffb53fc79f5ef88742a28eabcf404074836fe28b5b202ce7d5c68f6ebe +IV = 33f834fe23b9639d30de763faf7c1a71568c5dea9d5d253f28723ccb3306a3cac3cac3beca638067a3485ff743b5133577633ec88dae0aec4fec08e894ab5d61c411f0939772df2fa66d5775f74b3ff36ee61695d7cd2726b9be4df80750011477705948b1276db0cafede5d7ac73ccdf01b73a5492a02c43b89632a501f6694 +CT = d9d92b33a10f4252fff828b57ca5f5f118885df0825be80ea5725a874b7e8721af40bd221e7f5c2c8b005d77af6266cd36ddd2 +AAD = +Tag = 86e48fa3 +FAIL + +Count = 7 +Key = 5264831eaebdde1eadf741dbfd585cb0ef6437d1365bd5848d9cb3a22f57d420 +IV = ede74a8f53eac5dac276bc72518255831b616c9fb50a617eacdcdfa50e197d2941004f785f00f8c600e239cda77c8c06088793a674efb8759c98604dc0143e06665dc7e21d5031fd4751a7cd1b947304645e0987ec7e765db80a743122fbcaef9ec83849e8eee8d011dab67fb54317caddcfc472f585e93df91b1edce9695908 +CT = 9ad126b39dc2066542dd30c8fe81cd750b72123d74aa162113c6b0cf10a9cdb217d921e8f03b400f1ff719fc704f44e26ad463 +AAD = +Tag = 5d7bed4e +FAIL + +Count = 8 +Key = 6c2a43ce5610eab9dc40f43f035f7eed6651789dfdd166d4f106c95cef2a67ec +IV = 60e3a8ddb899108c11550a461720bdbf9adef26c300f098c73c3767621b06eac4f5619b9855d96e4d972ddd38f4538f8e25b7524b46c6341e8780e22c3b42ccf43f41fddfc5680432b64fb4025b378204045bb2d7ea56f4340a4018a4c99eb8b91012b28024d1b2bdb603fa10a28130e84bce38384fbb7c43548c0072c5c657d +CT = e073e948ddfc414948b12b4540d43dfeb9cbfa525b3cacccd21da89ecfb254c840722b9179057cb3ee69358f05e4ad0e41a543 +AAD = +Tag = ac0497a5 +FAIL + +Count = 9 +Key = 7c36a9bb3033bc6f7395155eadf0e07c8e5b3441d0ad66b21625d4950760386c +IV = c8393fb1d80ce92801a4fd906a568f7f404a82b02096e859e70e46d1ca5e231a073c5acbaa4cb4c33581e6887c402753bd55f95c76e68bfcbb1cb21bd37ab7a226e03d03e9dca6589c3020f5f916c50676e8c387f9b1710579a728ba7e7b60955ee5e383bb75d2b9d0f2abc72c02edd925bb32dc5a994f032e9a856931eb1ca3 +CT = 559b2ff3f5fd147b9889146f9fdea6758e5e0c716395cf1caf577dc2707764833099bda0910626c62bbb1ca010b66c54114982 +AAD = +Tag = cd2ed4fc +PT = 178745b297a23a897ec5cfe3a9e373befbdb840d9eb657885ad0423628c4a18f934e6fb57974a52436c517f4463cc5f9370c54 + +Count = 10 +Key = 981afbf7e7b74f08d186616d1f71b682bccc3cff6c5560696d267ad455d111f6 +IV = a2d07ac3ef29978c44ebbd83e1ad330a8fcfad8213fe2e924390015bc966a944a0a76831189a011094ec4ef98535efeb56b871e7e1aa36748e639dd5f9d1bf3286a1b2965bfc029faa0f855622c30cad67331bd11dbcea51e397185cbc4f0f0341fd8e744d2f09b2e3c2bd03af15850dbe2a701855ed4247f97acf9754f5e4f9 +CT = cbb1f9a5bd84c4b1b8df2714f87db878f7d2658cc7c37f75d784e2157687398a391ecdfd1119e087bd12f6af79db50ae7711bc +AAD = +Tag = ad794c97 +PT = 
06ea96ad8e6044978ea676056df8c647b7bdfce3923750983cdca875089841612737e6fe078496d77906b9606532b309851cdd + +Count = 11 +Key = 03183678896e28b84e16ac41ebb14f4f436efe386ee6df4e8ad2a7aaf11f17d6 +IV = 718ec99fa1b9b1d29a06ca3973d9c0323b14a2cc34cfa2816481aa2da97b435b0a075a2ed6412bb482bea23df9deddd16944492b1756c65138c3d189b8d2d695150667f46edce88755e868a2d90bf13f170d9b6bb29d9210f3c9f507663756866ede0b362aa5c859e15cd96da4f8c7f7852b3924bdf35ff3a515ba5150e1b017 +CT = 6ecd41a492ae5d6295e9c18290c9a36999c79c87f8b69ff20cb42ccb7c6678baaf159c75ecfb15cb87db99a3236734001545d2 +AAD = +Tag = cad99689 +FAIL + +Count = 12 +Key = 2ca8b01d1cbb8d392bae40bd8a51205a9020be27a23533da51dfe1ad0c4c1d41 +IV = 5b5edcc2f17942afb9577c3d2ed7d5ecaf009ac3ebac985fcf1e0fac0dfdfe747fdfe3d05795337baf41cea3b26e4f35caee1c13fc52d1192da145f376b4ea810ce7dc94845a9ca9184203c3b8e803e7a9bfcbf4a310c85b28b04a007e8e9bd14ff0ae28a1966918a6e22ae8415334e7df0d530b0507a24f755f70f117581820 +CT = c420656fb66e89d5b10fbe3ec0929286683ddc4a34cbaca638493f5c09673609814127709b6b1bb765902f6857761a8d57d98f +AAD = +Tag = 23f1dc74 +FAIL + +Count = 13 +Key = 25cb38a4b7ff73bb632ecae5f75d46e45a108ffaf3ec2d6ad39d3af4b3c64ca5 +IV = 372510e18f877e0f74c1cc54b19d265b27a452cbe91339bb720aa1bdfaf9bbe5365c571ce8f01d2e96aef8bf089c3f4402f186213be72b46b200337c9ebf943bf3d2db1f68c8e655534d9198825737e623745c26f6b0a82585660a7cc3985a271dea9b20f93653701a8d383bbf3155864809decc03ffefc9ce018379d12d8bfc +CT = ea01ae67abee8f8552ad260ca9d08ea5b35b53667a3455718545e007e5ac0c62c1ff0c5b06f8c031079fce5f2367889a6a068e +AAD = +Tag = c6a365f1 +PT = 3bc70116886ed9b4ef795e45c6ec8ea65f6285b3449174f89ceb1294ea73dae9b2f037107f57355be7242abb7da818c98d2755 + +Count = 14 +Key = 19bb98022f5d140cdbb5b1c02aae8eeec1e96dc6eb489d70967588b6f414330a +IV = 6d7b41c7f949f8ff3e9e18ff7af3d67eff5ddaa62eefdbc0b0a49dfb6fb07582998250d1c8e609d57510c859333a268f7e89bca06adf1646cdeb2e592bc86769aba402410cbd71f572dbe065beb37d8766ac61c12e7ac322d213407e073d4bb3c28848c42959cab21f9e39d7f4ff8debd50f40bfff96cbf81af07fbffb6bb2b0 +CT = 60a77e3d27fcea5e505221382d82e9ee39c2bfaa01d7d6ce0d293e7fc7bd0d7f900afa9a7f080c33c04cedd76573a914409e39 +AAD = +Tag = b561ee30 +FAIL diff --git a/src/test/crypto/gcmEncryptExtIV128.rsp b/src/test/crypto/gcmEncryptExtIV128.rsp new file mode 100644 index 00000000000..69c74ca6c5c --- /dev/null +++ b/src/test/crypto/gcmEncryptExtIV128.rsp @@ -0,0 +1,129 @@ +# CAVS 14.0 +# GCM Encrypt with keysize 128 test information +# Generated on Fri Aug 31 11:23:06 2012 + +[Keylen = 128] +[IVlen = 1024] +[PTlen = 408] +[AADlen = 0] +[Taglen = 32] + +Count = 0 +Key = f8fe56171fa546a34b1b28e0b1d31cfb +IV = 960d57f1336271e069c12f11044dd5a5bea996fc0290d37b5b2f47c8df3ae3ee37214a6871d963b830aec266026364984cfe31eb88c2a6229f5594ca9d3b6d26c7fadb91a0282cdd0a321714b745dd5e161e7cd192420cf2eacd552c4df5cee8fb5f0e06b7c353017b4b9523ce56899db770c344da720327817ba823a8f71382 +PT = fd229158f18f5b8c2a96c86fa3d8084014660eb2314bbab4ca09fa72c3a98b6faa2ebb83a1809de9ccbc8973d23af34014fb27 +AAD = +CT = 4dc0fd07c86ec84c264f0544456bfee14f688af2109455d73aa1e58a3354727e05387c94568edf352f8342a6156c64d87c44d7 +Tag = ac350afd + +Count = 1 +Key = 803007b69146363999afad4433c0f3b5 +IV = 2fc7688faf0b3783094294526ac07c38a73ff961ab41f20deb66edcabfe68e094aa0f1fc52318706a0e2f9b3901a768346494e846cf17b662d05a3788d77c1468408a49ffe5c0cf68b3b8b26193dfd84c63c4631eebf0c7974283e05e39494d9aaad038018a6e999912b1f92681375214e634f5937cb32ccc4face42d013980c +PT = bb311134866deba57fe506445c5a312ea1ba1ba16469731c1647c6a482bc84fa8349d82bbd01d3edb6cfe8f25b37ff8ec9d621 +AAD = +CT = 
aa1d6ae4151aaec1030a6f0297c48e67b84d1c397e9eaf0c5c8c3d252bf8638bf8591342c5c20c7f88f41140b0334f55cb7b04 +Tag = 30076c5b + +Count = 2 +Key = bfb96f58535443205ce281e03bdc5c38 +IV = 3961f47e9e800f1c72526d73fe372cd6e69a7eb1d2692c58bf4297bb05503edf95e074a7cf2644981f72421f229d93866a6e1fcf5c13953b39bf36b56fcbd4c09b505e550b5ba0e9bc26efbe0b9621a47b81842caa8c945fa5bb606f0dab824a4bb5a2625668a916e47f0a0d8a995bcae6940e120724f6d53629545da5456008 +PT = 2c07d76ceac2a77809906bebd3452e2f898ea5467d47be1f17573b3f7fc7d11c9b868d1d1a24010b63dabe9c6c6b4e123df559 +AAD = +CT = ca443acf08d121e1ae4221013ec40dca2237d035e89ae67040602132972417e07a7c770d75d96fbb5b8a38e048abb15bb978d2 +Tag = f61f75ee + +Count = 3 +Key = f3a44f15d104f81b4bb263eecf806737 +IV = bbf1b0646991c2b9735066dad5860fcc08a6b92944a5e90dfb120acf2a75403d3175f5e61a1d84b89a0c1bdd3b3414450faf6ffb8820ab1ea01a2b3cc05f1cd1de9bd48ee1308ddc7d87e6db33d3a171e7f63fce6b8e0417359afa833f6b5f293195bedf444ac56103ee0c8706a69c08fe59a95c8474f28a4a12661905f8781c +PT = 62d05ea0b0ddff1c0418b01a21267230ebbf23b63a6c14caa769c9148150c2454c055cbe4a72a08e7cd8dcb456e1ec17bc3a63 +AAD = +CT = df08c4d223a168d4dea9445f5b88d40ef0a796de3e77a6a116bfed840ccbe7b988345d070640f62f5757878420b5c50dd9f567 +Tag = 1518c2d1 + +Count = 4 +Key = ccd58a6017ac344ce5f8ebebdbe03593 +IV = b2c53f013021f494f6637876f62ce5b5dec6d548ffe58d0952aa8fd8fd5c8d2b835165e1b0ca01e72c19f962e38cd4458229a3415d7b4f9afddf5bb63215999b750c07a080677ba4e40f6c5e42038882503c9923a6eb2cf0d3b82f9f94e624f9938830fc22430f16f6c93c362cb3c11cb05d63becdb4c572f03431e7108369c2 +PT = 0b89a455a2470b3b2b7e04afac15c45a0742061494b78f88c57f2505e1f5804f35c0a829f8b6443e427fb6ecd374642217b6f7 +AAD = +CT = a8b6203b914391c61dbc123efc6902b892107ea9724341a22ecdd0deb16c48d885d606f68724cb43a956c07ef9ef654c042906 +Tag = 02aeaa63 + +Count = 5 +Key = 50bcf114df40a431c0e1e88033154a25 +IV = 0ffcd9b14c6bf5f630c86b41cb3cf96004f3fa4fd48ee87b7235d34be0be1fdddeaa79abb3c0c9198a1eea0ccf04e8cfc8d24e4badbf438c59a70b435fbf07d44f55b75e5e48fa0f3d7714aaf9e34b430640614646d0008014ad432a464a252c66584c922b29a90cb2c2e4237c8545f913cd2ed3910a4f075062a55b71228411 +PT = 9f5206b2afe824ac4303d58a97255dea6f026b8531651105db9695f09acb16afd9488928060219307fd41edc16e49f01b7d646 +AAD = +CT = 0143c0a035ce29d1418acb2561dfe88c74f24a9808a8672427797cc0ba01693c2d66c0e365961cfd58fd039bf08fb4c2b1be29 +Tag = 552e27c7 + +Count = 6 +Key = 8dbf11f923374b8e8be93788de939806 +IV = 2fcb130962ba3e3eb8e31a7a26e2082a643f39d67cce11d8b2ba8a782f63d4df375b21b1fedaf67bd8d73b2208937ae941afc99420ebbe328214fe6a456bf00979d5ebfb22b79fd3cdea81056747bf4e4ded33f2f26f2d228965128a3d0a32696db44e4aff6ca5467d3c749830a5d2e9a41b4ffa3a422e5bb870cee84f4b64af +PT = 1c57186740c4901e022e63b2e7b085f9cb60c83763e357a591c1968277920ade1987334e88ed9c3c96665e37fe492a975153d0 +AAD = +CT = ce4c7e5fb87ed02424a13cd2cfcfaacf67f1072198dedc594db26d6453991821861ba6cc843c6d2750e846e162415123b7e01e +Tag = ab10b040 + +Count = 7 +Key = 6e88bed99ebe380c1f8297f46019d8a0 +IV = 80fb82c0016d054491f396a7722217b0a07bfd0fc954a7141bc1e2e7958cb24541a21492ec85d3c744489f93ae3abb9af101a78f2366226080389d29eef564d5205f377ab0902043bbf7ba64c30c9d2c945cd6f29654738106dd282194fa02344ec177b5547531061b31cebfac4a2b0f46b68e44c8c89f6942f9c9c13e50c58e +PT = 6c90ea3dad3e172ae6784cf1b7c08fc04d0f9b4372f1103d11393a8f02e2f495b53c57cf46f1df6f55b2fdb184fd2b7d590402 +AAD = +CT = f55d67b524b8e633019e9b1736f3db1a254e53cd71fecb48f2dceffa62256a0bfc3775f6506db52db6c6eb91971f5bb5688325 +Tag = b4585815 + +Count = 8 +Key = 99de57ce03d1db62a751c3b2c7d38f3c +IV = 
2e74c283e216ab5ac9a1b214dd9280431d7ff942a17715fadc3a22978fa1ccc0743f969358baea3d79abe93388a7db82d82ad6a917bf67795fb4360543d7f22f7ee49f41029ebb87573ac03ddb7e279f1846f4a88b85bea63e2c9b9ecf6f91777434b3def0d4a42d3e025eb43a666a28d5f6c834c7e7991897bf051915e646c6 +PT = 72dc7e5533e862efe0d23c62095506b11c9b256d8d18d11511aa1ed4eae67b0017ec74e322f3a7a18e7d199e7093cadf26680a +AAD = +CT = af31d880d1820a35c9248ee0b1aa0da31339f90182e60451493bec8d4dec3baf922268741c2717831b8365bebf072aa7931ab4 +Tag = b1700396 + +Count = 9 +Key = acdc98a8baa0f003c130ce196135334d +IV = b7c9885d302842c41a2880b9382584806e8cb55b49183b80bc50403c82cfaf0d28a00fef813ba5b7e35b80dc1f0b7c3b0669a3bc739f499d77ed9cc47a467ca62fc34c5bf4c374ff396c01a472c3dfdab394e6926545a1c20363960c72dfad3eebf9a970e6579e3eff7a38f6bcf0373a8494d450d12445f9ff62c233dc1d2379 +PT = 30f1c7fb5fc152dad6911623ca4af1eb495e108ed94b6e6cc19eabeaa7b85262ea3cc4dc5297aa6f7cf504ac6e07db5db550e6 +AAD = +CT = b2cbae4df8898ca223213824a5c08e16eda81f063916e813bf2d0d8c7e8a75b2d0a9f6de91e08d5422970534331cf1dd53fa8e +Tag = 8ef08260 + +Count = 10 +Key = 75d651a377ccdc0e743d73b8205fb38c +IV = c7296796ef031d372284f7b1481a13862aec243792ca73f40e11bfe39da28984f11d591d3294c833babf05a1f19b603f4f4a9ca1102f201c1405b6cb45facc8ef408541963abcafa907b2eb8e5c1c404b0e4884a48bbb2d43add4dc1c44c526295bcfbd8f2b7041ed49189e835cdf4ed00dcbb450eef4070482f5e8b52360966 +PT = 13ab743d9db511857f0f79a84742c225b3692a9aee8a63697d42fe50d74fe028ccf95e18a2dd2d9b778392ee7a5d2f23b399eb +AAD = +CT = 09c3f153ef712fcb3b5fca2b9bf7f25740fb748bec64bc35576ed01682030ae2728d4282140264819e8c4dedd48e29199a6236 +Tag = ae374dd4 + +Count = 11 +Key = 15414fda594aa87a3e7af69df769adbf +IV = 7f021a78293e7cb4dd0af221efda3149b0cab87241b597865267cbab5aad530ea4aa4b10815ba9318a45fcb22fd0e6d692d7beecc2042fa2791f6cca5f9916b0bbae79e9d91133aa54d15a1397f8b063695a3d36b8e573a866fc94964f39016e9490c37189cbb0638db09548a91688d73e2c0d4542f5bd08e03ac0d75e36f519 +PT = bc8263ccf50d0224d088546bc16e2577925567ca52d98ec45cb43b190159bbd5f0326d4498a8a88c0ea0b0a79420b906cd5115 +AAD = +CT = 34b07942793b066b74b7fb8a4ce71a04f9b29e40dca351f5b6e0939bc2819cd95e69bd58163a4df9729e3d6220a6df60a4ba9e +Tag = bcf4aed9 + +Count = 12 +Key = 4d73cae96ded98e1688104a63f462c76 +IV = 331b51fd88cd4731e0eca051717b642f86ea6d6941f9a7331ff361e0edc3f9ea4c013d585f3eff70004a696c4b51d7c589ca97e5fd30f4b2f99c0f3ac83769c2397e10669b7b83aef714a2388638b8941efde8e631098dd78742772f484edd568fefe26b9d981b437e4e4f3ad25445e1aa8c8608c655a5090d2cac158ee67e98 +PT = bf55d48ae5015c39bb167782cab391510d7b7d698e9d3faacd7d409fe4d86fea0a6f61128b6aa305ac0fe4cee4d582dd30e717 +AAD = +CT = 3cf29f4d50feac5cda4c7cef91d563b2573096b7f32c723355d8d59b8e1a0e229dc2f6114f1db6bdc26212043d153709597c1d +Tag = 76fcb529 + +Count = 13 +Key = 7045a9d7caae2c0a39f58998720974ec +IV = a9028008f708ae19dd28e75a02c6e84a05096779781ce0a908047152208468cc4b2a57d25608767f936cf70fbe82dd8a61497a180e1fc967caf4e6310ed850082e6919c922e021ec070dc64b040ed9edbbf5883676630d69e48953068b2bd006bd6d5417038604ee5aed04980c9ed2316c531ce6a3a73dfad90c04c58596d5e9 +PT = 635b946047c38533bb2cb4c9799b44f6eae0e63626901b0741f6dcf3c2bb02270343c7708a72dfe303b20f7805cb732386b341 +AAD = +CT = aebb8c1e914ea9bc1ada9b01f84cc8dbbc611f2cd386d5fc89497d37e5a469b28fe2fdf0ab0f1c882dcce50620b1b18a2d8343 +Tag = 27bcaf06 + +Count = 14 +Key = 79d2a35efbf03f57b66e875c232e10d5 +IV = 
7cc96b48afff401adf4bab2ebedd021377b18a819f3c3af39fda42e24c5d62e67ec30f8bcab00263dc5a9bb06cbfe1750c98555901d34d775fdcc86841bd08fbeb44ba68ee794dc351a29a1a9de576d83c17a730d50db79cab88d538a441bb9ff6aa073a2a976de820ab5cc61f834753220d4e472a275dcd13db3e51a23a84a5 +PT = d91e1b2811b3e3894b46c563e6ea0b4a33990ba4fce8a354c941e1effc5691671de5d97c4c1a35e3730b43584944695f00544a +AAD = +CT = 6a45beb05c0dba6c38c997f8c37ef07c7cf78eacff6ff4dd6fa000e745e8053d2d270a746994f29c8628f41fc7fecdac158655 +Tag = 36caee75 diff --git a/src/test/crypto/gcmEncryptExtIV256.rsp b/src/test/crypto/gcmEncryptExtIV256.rsp new file mode 100644 index 00000000000..3faa8956bbb --- /dev/null +++ b/src/test/crypto/gcmEncryptExtIV256.rsp @@ -0,0 +1,129 @@ +# CAVS 14.0 +# GCM Encrypt with keysize 256 test information +# Generated on Fri Aug 31 11:26:26 2012 + +[Keylen = 256] +[IVlen = 1024] +[PTlen = 408] +[AADlen = 0] +[Taglen = 32] + +Count = 0 +Key = 0df25c2bc9444b4a01e26d357a3ac0635fb6ff2e65ce1e759aae491a17772243 +IV = 54652573ba189cffed3bcfa60efbfb417eb4b0e8de80c7e53765d018cbddfae74617269eb35f29faf628d28c40737f9d9e1eb0b8757c984d94340ecc5ddd108f0a5a0e96335ea805950d378fea7569b98693e88bc1cfd9f6eb8d25de177122fc774a5b1957bcb80e92230c12fe401a8e00d0c04897e234644bda36ae761ea619 +PT = b29af460c6a5dbe56f1d67751346d7182c93413a6c328c6d85176cd8dea8ecce1cad3063c8708c0be9ae73d42bbb10421e73f1 +AAD = +CT = 3c786a3d0c8945bf320a21ca63f3b8bf5c6bf56a8412f836d7894e42c9e0695a8e41bf59b23fc52d17f8b341183f1cdce02e22 +Tag = 0b31bffe + +Count = 1 +Key = 1b6b7b8d00e543f0a17bd0bf595319a4a1f8a55ff41ce2380381d4e83c83243b +IV = 791d9546f180b838b50bb7b66b68f7f1aa09a2cef411a1dbf2e64c5ed026613200ac8f0e5b961925853621a1d4339322ea4b7bbc4adcbc008efeaaee0dc948a916b22ff8693a7d441620d0ff67680b23567f9582e22eec529408c6d0a00de1bd1ee5a11ed7fa29b7567f990e412ce90ee12d5b1d8ba8c2b528d0d9963fdc49a5 +PT = 0a76090de676d71d9f5ee8511864f0c9440a0fa12b5155a5bffc36127bc957b293d4fb624af3b956385783124f28f6f3c0f0e8 +AAD = +CT = fdaa9ef2c65d7666b61367f843863a3b273249192551b5633cfaf84a5ab9ecba42916395177f9a16c1ad385e77393cd93d71b3 +Tag = fd854d63 + +Count = 2 +Key = 261a0382c739093634502303d60dd0d2a568b5155147d661e7789f7bd70de82c +IV = e22330a2981b9b60354a740c49dcd17c9016cff50423977f7fc8500fc36a81610c979eb37a1f9c4d54d11b790906492205263178dca6d269a230a595ee95edeb60e45d92a2d6169877dfe5514b23db143dbcf3ca44e13cf5b7402e4f95a9e6760451be2d57d1d5fefb015ffe4b69456d87338865ee8775d1cce238f75c345dad +PT = 42c8c79988c52bcf38865a6f341a5165294af0947bc4fa597c6f648a1172070851c4ee154604d9a21c8d53f7eba77fbb72cf2b +AAD = +CT = 0fb368563c6ad0c097207239a791bc685274d434c4a54cb0ea97a98b4cee9eb100c4cd626ba82c338a3f51cb29391b1da3a44f +Tag = a9607e42 + +Count = 3 +Key = 5f3d6b5c76cb7ad0be3d771f1f7d1bafd88c8c7c7c84922f89f45f3b7453eb44 +IV = 7072d302e4acb642090c5d48e4549a8823725be1389316df0152e68c41f77937eadc4ee1d164101717cdcdd3bc9be3c2669c0b1394953090ac787fb117500b6275122f608e70a3f5063b21fae42ca4b00724b21c50e27c37a77d4befb118018ada7999888442e1271410f6b804c36a27a41e95a438792de4f06fd7122b177cc3 +PT = 00d82cc00c07d3cf0fd9b31ddc091d351aab9b58af07d2c59f3e28c7202ef6e3ec35726cbdd14f8f1985a770d092470f115be7 +AAD = +CT = 0aab0dbb31c113b33b116659294e0af5497c987af870ae8ce55288d48e8a4f1b3e8377cb34d06460daf3cc99fbefb90724a39f +Tag = 26e56b13 + +Count = 4 +Key = 1d4b6b6ba43269d46ca5c0fab38b61f1d2bf3d24dae46181fac73420a8e6194f +IV = 
01c283a90c588dafc585f437dc111e94dd8ea98a622a5d2e554a87086ec10e8dac9b205fae70be3c024b3f9fbdc26208a6e44d082ad92a51a7fa1f861d0e93e7e74d4e41426e70b2e7ef7fbcbc5a302aed4bc8a42963ca53334259f1924e74225c3bf9aa2ffdf97a2be6474c72f6dedba7c454e36fbc3537596a63b61a4cf3ce +PT = 4eb036d945c14de26c4e0c83d446dacb57e91b5bdec2ef612bbed8135e57ce62b28843fbac555580ef23c74959eb869a017872 +AAD = +CT = 24d67f2e94294255b9fd90c16b67c7a8b3a47a42782954aa15e80ffe732d64342b1ec65aa66fc4d6b8fe9dfcdbf4d4c1a979e7 +Tag = 37358e86 + +Count = 5 +Key = cbfbb15a4fae4af3de55ffcaa4b8cdbd515ecc72fd50060b22acc8fad57a5f4b +IV = 86e47ba98d8ed360262f94afc364df373d1e44c788b4ee5ed7542822858aaeca2b3c07b5464d6f7edefca9759107bbb64c086d526cc4c9a6b9a7cd6dbb50acec2297597612fd436d8c8ddbc83664305b214cf2c3b6fab3e545c499c7f1cbba1897041475f94952d8a4fc10110986b5ccc7a10c1d9e6427dabba8d942ada9290a +PT = 5470a60f2818ffef374be3cc591084096a4a8caa0dac0024c12d304382c301264526b02efb674dfbcc3be5c818a1d88a7c19f1 +AAD = +CT = a6a8ebadfe5c5ac754e4a2a9cdc25fdf3588c69a287b0f9aba2347bfcdc110a1134426a45c62d577e52a8d61fae39ea38bad51 +Tag = 97406b4b + +Count = 6 +Key = 3ced5c63078a36f6b02d3bec0debacd19021c8dbc501a9d86f556c4ad2cdef8a +IV = b9629801cba22f7493b4f6394620d49a76ec686e524f5ebcb3a76b5a189473484060cbb01bcbf10048427ef21527626085c8a75aa5264b6338abbc26171c2c3a44f6b3b5c3fd05a892c8290a8f99be962deea48d7ae4e626c45a45ffda5efaad6e54e98ba876b039a5dde3d6061f217e57da4774acfaf1f5da9495083aad4dad +PT = c29788107799077ac6dbccad29a346727f263676a8510fbcadb9b5bf53df978b3382fdfc1b5c3312eaa0f7621b6efdaeebe1c0 +AAD = +CT = 5811d4a96169b39cd8f8ff3f931efdda68550f17558b48de8bb5adf455af5179c8c5ff4a73f363f8819daf846427132daf6a17 +Tag = 06e8e7fa + +Count = 7 +Key = 39ae4fdfe56b74a71325a1a685a6593b44936892890cd05d2719a420c97a7c64 +IV = 8f110d0fd20ece7f35c35b2eed32316fd742ae9346fc6907bd749361a4436427f80185b376b16a36fb95429cfaf2e22e46c210442fe5efc14985a9d9d847c3ceb5db02e0d999acbc3ba0afdfecbfaf65024258cc7f6fc8e3f568cbedec1c7eabe3ff3ab3c7331722b6400429d46b54820bd1f88ac03cfba5cbd0812d91342c4b +PT = 8b086d11be7ff55312addad86b49585ea38ea1ee7c4200964cd269a4bb5cfe0f518e6f9b733efb4ba3ce35ce2e803b0ad47d24 +AAD = +CT = be46d6e61dc11b2fc8ac4c9c5f49ceca0fb6fe1fc7221c7cc5d8ba254a92282500b1b31528314035cd125578de960b3bafb69d +Tag = 7c00aa14 + +Count = 8 +Key = c244e9afd90ec810acb7e586cf7a06386e6892e01c7d111d5c1455fc95250d1b +IV = 6ff9103f9751ff4743d856c5cd54491e1a537384260fbfe076f772ad0d66ff6ddd0aaff57023885b0a4d60d2b25c80b1cdc54802607770a61a2503c23cf26f1fe529573c8d4745b19dedaff5769a6a796c01540776d4fa99be9057ad87cdc973e7938640f9497753e88c4cdc358c1cfa06f1ddd826c33f44c55e2d183927baaa +PT = cd1296133cefe6f5cf6f8ec68b37172bfb793a8582d92a539f24f3582569bfcfebb706fe9f276716b185558fbfd6fe8ea99a1b +AAD = +CT = dd1694b178f15612c454885b3100c576c7b68206c57898161d4fdc51e75a428840c5cfee104c3d85fcbd92edbd1d80f22d8e64 +Tag = 741953f5 + +Count = 9 +Key = 785ea9cd2403ecaeec3e4940dce7c41ec012203a2610c780bca5d15af64748c5 +IV = fbc177ec9d47f69e2fcaba9fef7de30735e46a5d20dc66bf66c8a76a382051d780f58dccae8e2054bbe437a5bc0814381bac2b0efa99202aa1f1bf7f51b842907dc9b60f83987c31eae086e26e2018243bdd47a291b523ac6905b40dfb442ba239c876cdfaa581b2ef0683456ac944829a97b663ecd48c116d06f1f054ddfcdb +PT = a3465ad9a4009e26c39acb1d424b7ba6556a74dcc2b78a5ab65b5d07c2a97f382aadb7415395fa6fc90bd137f6894a75f70907 +AAD = +CT = 2b3eb5ab251282da1795a25f9b43ccaa5a27643d042fc315ad662947b0c5cddeb5848e6c69869ae5e81f5c76729bbaceda9889 +Tag = 64173526 + +Count = 10 +Key = 0becf9c7a069db5b9a25f2871fb0594e452126262ec1c48bca3d3024d85d0c51 +IV = 
e05643a72ab0b1fc42a6f17302d6446fc507bbb4f0ad59178f5084530f02534df2e673e92e67802629f93221bba545a13fb9143eb2a2ee4bce047be621a9d96a450a19951d93c527eb698ac7e132e4f00985dff91e079ae791549d37da3105c77ef5d8cb7649f1aa761b5a5fbc9e0d7dfb5aade98f3df4a641cca02f33eed55c +PT = 70ba9822801760e5ec647a41e27cf6069978be6a28be0b1da0f3661e124847823dec7ec6af737d6dea597ce5bd5baa5d6f2651 +AAD = +CT = 0d45b5ad77226524374a3bca3680528787cefd9978b3f1407688886ce28fea86e7d9005d7d6acde2a7c2fb158479918c06bba4 +Tag = 43dfd859 + +Count = 11 +Key = d1f374702b4481b83e7c7783853c1850e887b0c80cd28c0686b4cab6adc744c2 +IV = 5e15c44cef36c1c89e02b2979f4e7d275fe5973a80580753f6fd51ea477ba84346a030b90b22a9adfff02096bc0b1691c37ee21cdc1b5f4862696d195859cd2bf0b423f5d19da5e1475bcc99f96b8c7c51fe85930aea0e97304d4c025b52bd386666d0537036360e939a6fbdfd6ce3b012e551d333fa74f3cbb9d33a59477364 +PT = 996428a232b1b9ac81ff5260eb77938f0d531a4a0ed3bd774c72c18128af72e964f8d05fb7ec4e0ac8e37056a48c85713a3a01 +AAD = +CT = aa9c103ae51bf556f7c57f9948d28c859f458a74e22f039fea183e9e262b023bdac9bb5a2f167b34cde4a694c555eb0c905f34 +Tag = d1ed9cf0 + +Count = 12 +Key = a6a69524d66f015b4afb746ad0942410baf06d1aba18ef1a8b40e35633f04ce1 +IV = 5e1df758c63728b1269a1d3f610b6e3724bc7ff797dcb4a7aca9dfe1c8e1717ce4281d1c5b4b33d0aafcee342a4f4eb30089eeab2b0470d3f9b709622bde4654d41b3bbc6bf59c11edea28f26b099d83d4fefdc63ab7218221238fe1c230b1290235465b445f60eaa9822eaa8da2c08f6f2fdaaef87dacf74c7e5d6cea191c74 +PT = 0ffc6c3090817fc1a0e6f3802269e263e40d17772fabf5cbee905962878d77c8bc4223e5671bd9f310d8db56ceaebae41fc79e +AAD = +CT = f24537db3fe27eb07f1d2b9b7059dfae97df86d4e609930491c4ec3154462df308ba1c85204dd754521d8de9619603f90f8c44 +Tag = 360adc36 + +Count = 13 +Key = ff0007317f9ecc43c1601708614eb8d443a407318cbf3c085c0a9a7c67faafa8 +IV = 8186f92bd394ecfbf4bcbb9e7e442bd6faa1e59f8785d9aec82551fee38aa10212a8477be3055927379a906902c153598d50c63e316c4e4f3956ef04ce40e5b6f1901fb5d08a23b913e5ad53512b02bb75ce96f8c50a2cebdfa058ded996fe1dc5370ee50c6d90e948129d544dc89c2f28dda8429bb338d6aed0d9557f96889e +PT = e7f4063319fd31e07192a5fc9e486fc0a2a3470671a46356ab0d32989803259c0dae103a4033c533fbde585866fc2af5eba151 +AAD = +CT = 8f86644614e4f6483f28eec02fab33e7cbbd56381a8f878522c2015b91d48652472e356f608c62361939dc983e1cd364a28665 +Tag = 83335b3a + +Count = 14 +Key = 43646d9ddebec23447febe71596e6f9b2387965db1faf3f06feec8bc5c808342 +IV = 0d3774607261b07427dc77f0dcf57b026226dbd2c1df1b9d74598582da44c677af36a6bd80b6bc1f000632c84c5dfa701f8b51b4da228d340d8b4ccc4d2f5d7b5fad00809133eae9250ecd18d7a8741bb57c394396feb81c20bec23de8520f883b8e22b362dde6e6bd9dad73e8919695384a04c09a28bdb6de2fa356b0d259b1 +PT = 2aab9a484b817b759bd9d876967c90160a18208cfa753e7bccd4f73a715aaa6acc6ce666e97bc22fbcf11f263dfed332418707 +AAD = +CT = dd80974e31ad15ba26ec8a8dacfa5f57f0bc69f7113cc6c39fb038ddce37ffeee3f789a02cd86b0418fe16ca104b5fbdb26432 +Tag = 75c35f4c diff --git a/src/test/crypto/t/001_testcrypto.pl b/src/test/crypto/t/001_testcrypto.pl new file mode 100644 index 00000000000..586a37c5bd7 --- /dev/null +++ b/src/test/crypto/t/001_testcrypto.pl @@ -0,0 +1,137 @@ +# Reads and parses .rsp files and runs them through testcrypto +# +# Test vectors downloaded from: +# +# https://csrc.nist.gov/projects/cryptographic-algorithm-validation-program/cavp-testing-block-cipher-modes +# +# Specifically GCM Test Vectors (SP 800-38D): +# +# https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Algorithm-Validation-Program/documents/mac/gcmtestvectors.zip +# +# Note that the AADlen > 0 cases were removed from our set, since we don't support that and 
+# the test code will just skip them currently, and we also don't bother testing 192-bit, +# but one could download the full set and run all of the files through if they wished and +# all should pass (they did when this test suite was written originally). + +use strict; +use warnings; +use TestLib; +use Test::More; + +if ($ENV{with_ssl} eq 'openssl') +{ + plan tests => 56; +} +else +{ + plan skip_all => 'SSL not supported by this build'; +} + + +# XXX add CTR tests here + +my $algo = "AES-GCM"; +my @rspfiles = ( + "gcmDecrypt128.rsp", "gcmDecrypt256.rsp", + "gcmEncryptExtIV128.rsp", "gcmEncryptExtIV256.rsp"); + +note "running tests"; + +foreach my $rspfile (@rspfiles) +{ + open(my $in_rspfile, '<', $rspfile) || die; + my %testrun; + my %lengths; + + while (my $line = <$in_rspfile>) + { + + chomp($line); + + # Remove CR, if it's there. + $line =~ s/\r$//; + + # Skip comments + if ($line =~ /^[[:space:]]*#/) { next; } + + # If we hit a blank, time to run a test + if ($line =~ /^[[:space:]]*$/) + { + if (%testrun) + { + my @testargs; + + # Set up the command to run + push(@testargs, ("$ENV{TESTDIR}/testcrypto", '-a', $algo)); + + if ($testrun{'Key'}) + { + push(@testargs, ('-k', $testrun{'Key'})); + } + + if ($testrun{'IV'}) + { + push(@testargs, ('-i', $testrun{'IV'})); + } + + if ($testrun{'CT'}) + { + push(@testargs, ('-c', $testrun{'CT'})); + } + + if ($testrun{'AAD'}) + { + # Don't currently support AAD + undef(%testrun); + next; + } + + if ($testrun{'Tag'}) + { + push(@testargs, ('-t', $testrun{'Tag'})); + } + + if ($testrun{'PT'}) + { + push(@testargs, ('-p', $testrun{'PT'})); + } + + if ($testrun{fail}) + { + command_exit_is(\@testargs, 1, + "Run $testrun{Count} of Keylen: $lengths{Keylen}, IVlen: $lengths{IVlen}, PTlen: $lengths{PTlen}, AADlen: $lengths{AADlen}, Taglen: $lengths{Taglen}" + ); + } + else + { + command_ok(\@testargs, + "Run $testrun{Count} of Keylen: $lengths{Keylen}, IVlen: $lengths{IVlen}, PTlen: $lengths{PTlen}, AADlen: $lengths{AADlen}, Taglen: $lengths{Taglen}" + ); + } + undef(%testrun); + undef(%lengths); + } + else + { + next; + } + } + + # Grab length information, just to have. + if ($line =~ /^\[([A-Za-z]*) = ([0-9]*)]$/) + { + $lengths{$1} = $2; + next; + } + + if ($line =~ /^([A-Za-z]*) = ([a-f0-9]*)$/) + { + $testrun{$1} = $2; + } + + if ($line =~ /^FAIL$/) + { + $testrun{fail} = 1; + } + } +} diff --git a/src/test/crypto/t/002_testkwp.pl b/src/test/crypto/t/002_testkwp.pl new file mode 100644 index 00000000000..25911cfd9f2 --- /dev/null +++ b/src/test/crypto/t/002_testkwp.pl @@ -0,0 +1,126 @@ +# Reads and parses .rsp files and runs them through testcrypto +# +# (Partial) Test vectors downloaded from: +# +# https://csrc.nist.gov/projects/cryptographic-algorithm-validation-program/cavp-testing-block-cipher-modes +# +# Specifically Key Wrap Test Vectors (SP 800-38F): +# +# https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Algorithm-Validation-Program/documents/mac/kwtestvectors.zip +# +# We don't include the 192-bit tests, though they all worked when this test suite was written. +# We also don't include the _inv tests as those aren't supported in OpenSSL yet. 
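+#
+# For reference, a stanza these files carry looks roughly like the following
+# (a sketch derived from the parsing regexes below; the values are
+# placeholders, not real vectors):
+#
+#   [PLAINTEXT LENGTH = 256]
+#
+#   COUNT = 0
+#   K = <key in hex>
+#   P = <plaintext in hex>
+#   C = <ciphertext in hex>
+#
+# A stanza marked FAIL is expected to make testcrypto exit with status 1.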
+ +use strict; +use warnings; +use TestLib; +use Test::More; + +if ($ENV{with_ssl} eq 'openssl') +{ + plan tests => 20; +} +else +{ + plan skip_all => 'SSL not supported by this build'; +} + + +my $algo; +# my @txtfiles = ("KWP_AD_128.txt", "KWP_AD_192.txt", "KWP_AD_256.txt", "KWP_AE_128.txt", "KWP_AE_192.txt", "KWP_AE_256.txt"); +my @txtfiles = + ("KWP_AD_128.txt", "KWP_AD_256.txt", "KWP_AE_128.txt", "KWP_AE_256.txt"); + +note "running tests"; + +foreach my $txtfile (@txtfiles) +{ + open(my $in_txtfile, '<', $txtfile) || die; + my %testrun; + my %lengths; + + if ($txtfile =~ /^KWP_/) + { + $algo = 'AES-KWP'; + } + + while (my $line = <$in_txtfile>) + { + + chomp($line); + + # Remove CR, if it's there. + $line =~ s/\r$//; + + # Skip comments + if ($line =~ /^[[:space:]]*#/) { next; } + + # If we hit a blank, time to run a test + if ($line =~ /^[[:space:]]*$/) + { + if (%testrun) + { + my @testargs; + + # Set up the command to run + push(@testargs, ("$ENV{TESTDIR}/testcrypto", '-a', $algo)); + + if ($testrun{'K'}) + { + push(@testargs, ('-k', $testrun{'K'})); + } + + if ($testrun{'C'}) + { + push(@testargs, ('-c', $testrun{'C'})); + } + + if ($testrun{'P'}) + { + push(@testargs, ('-p', $testrun{'P'})); + } + + if ($testrun{fail}) + { + command_exit_is(\@testargs, 1, + "Run $testrun{COUNT} of Plaintext Length: $lengths{'PLAINTEXT LENGTH'}" + ); + } + else + { + command_ok(\@testargs, + "Run $testrun{COUNT} of Plaintext Length: $lengths{'PLAINTEXT LENGTH'}" + ); + } + undef(%testrun); + undef(%lengths); + } + else + { + next; + } + } + + # Grab length information, just to have. + if ($line =~ /^\[([A-Za-z ]*) = ([0-9]*)]$/) + { + $lengths{$1} = $2; + next; + } + + if ($line =~ /^([A-Z]) = ([a-f0-9]*)$/) + { + $testrun{$1} = $2; + } + + if ($line =~ /^COUNT = ([0-9]*)$/) + { + $testrun{COUNT} = $1; + } + + if ($line =~ /^FAIL$/) + { + $testrun{fail} = 1; + } + } +} diff --git a/src/test/crypto/t/003_clusterkey.pl b/src/test/crypto/t/003_clusterkey.pl new file mode 100644 index 00000000000..5b8a255aceb --- /dev/null +++ b/src/test/crypto/t/003_clusterkey.pl @@ -0,0 +1,93 @@ +# Test cluster file encryption key management +# + +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More; + +if ($ENV{with_ssl} eq 'openssl') +{ + plan tests => 6; +} +else +{ + plan skip_all => "tests cannot run without OpenSSL"; +} + +# generate two cluster file encryption keys of random hex digits +my ($rand_hex, $rand_hex2); +$rand_hex .= sprintf("%x", rand 16) for 1 .. 64; +$rand_hex2 .= sprintf("%x", rand 16) for 1 .. 64; + +# initialize cluster using the first cluster key +my $node = get_new_node('node'); +$node->init( + extra => [ + '--file-encryption-method', 'AES256', + '--cluster-key-command', "echo $rand_hex" + ]); + +# Set wal_level to 'replica'; encryption can't use 'minimal' +$node->append_conf('postgresql.conf', 'wal_level=replica'); + +$node->start; + +# check encryption method +my $file_encryption_method = + $node->safe_psql('postgres', 'SHOW file_encryption_method;'); +ok($file_encryption_method eq 'AES256', 'file_encryption_method is valid'); + +# record pg_proc count +my $old_pg_proc_count = + $node->safe_psql('postgres', 'SELECT COUNT(*) FROM pg_proc;'); +ok($old_pg_proc_count > 0, 'pg_proc count is valid'); + +# create permanent table +$node->safe_psql('postgres', + 'CREATE TABLE perm_table (x) AS SELECT * FROM generate_series(1, 100);'); + +# create unlogged table +# Non-permanent tables like unlogged tables use a special nonce bit, so test those here.
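+# (004_buffers.pl exercises this same nonce-bit distinction again by scanning the unlogged table's on-disk file directly.)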
+$node->safe_psql('postgres', + 'CREATE UNLOGGED TABLE unlog_table (x) AS SELECT * FROM generate_series(1, 200);' +); + +# We can run pg_alterckey and change the cluster_key_command here +# without affecting the running server. +system_or_bail( + 'pg_alterckey', + "echo $rand_hex", + "echo $rand_hex2", + $node->data_dir); + +$node->safe_psql('postgres', + "ALTER SYSTEM SET cluster_key_command TO 'echo $rand_hex2'"); + +$node->stop; + +# start/stop with new cluster key +$node->start; + +# check encryption method +$file_encryption_method = + $node->safe_psql('postgres', 'SHOW file_encryption_method;'); +ok($file_encryption_method eq 'AES256', 'file_encryption_method is valid'); + +# check pg_proc count +my $new_pg_proc_count = + $node->safe_psql('postgres', 'SELECT COUNT(*) FROM pg_proc;'); +ok($new_pg_proc_count == $old_pg_proc_count, 'old/new pg_proc counts match'); + +# check permanent table count +my $perm_table_count = + $node->safe_psql('postgres', 'SELECT COUNT(*) FROM perm_table;'); +ok($perm_table_count == 100, 'perm_table count matches'); + +# check unlogged table count +my $unlog_table_count = + $node->safe_psql('postgres', 'SELECT COUNT(*) FROM unlog_table;'); +ok($unlog_table_count == 200, 'unlog_table count matches'); + +$node->stop; diff --git a/src/test/crypto/t/004_buffers.pl b/src/test/crypto/t/004_buffers.pl new file mode 100644 index 00000000000..a1379b4772b --- /dev/null +++ b/src/test/crypto/t/004_buffers.pl @@ -0,0 +1,157 @@ +# Test cluster file encryption buffer management + +# This tests that an encrypted server actually encrypts the database +# files. It does this by checking for strings in the database files in +# both non-encrypted and encrypted clusters. We test a system table, a +# permanent relation, and an unlogged/non-permanent table. +# (Non-permanent relations use a special nonce bit, which is why we test +# it here.) + +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More; + +if ($ENV{with_ssl} eq 'openssl') +{ + plan tests => 17; +} +else +{ + plan skip_all => "tests cannot run without OpenSSL"; +} + +my %file_match_count; + +sub get_cluster_file_contents +{ + my $node = shift(); + my %relnode; + + # get postgres database oid + my $postgres_db_oid = $node->safe_psql('postgres', + "SELECT oid FROM pg_database WHERE datname = 'postgres';"); + ok($postgres_db_oid != 0, 'retrieving postgres database oid'); + + # get pg_proc relfilenode + $relnode{pg_proc} = + $node->safe_psql('postgres', "SELECT pg_relation_filenode('pg_proc');"); + ok($relnode{pg_proc} != 0, 'retrieving pg_proc relfilenode'); + + # create permanent table + $node->safe_psql('postgres', + "CREATE TABLE perm_table (x) AS SELECT 'aaaaaaaa' FROM generate_series(1, 100);" + ); + + # get permanent table relfilenode + $relnode{perm} = $node->safe_psql('postgres', + "SELECT pg_relation_filenode('perm_table');"); + ok($relnode{perm} != 0, 'retrieving permanent table relfilenode'); + + # create unlogged table + $node->safe_psql('postgres', + "CREATE UNLOGGED TABLE unlog_table (x) AS SELECT 'bbbbbbbb' FROM generate_series(1, 200);" + ); + + # get unlogged table relfilenode + $relnode{unlog} = $node->safe_psql('postgres', + "SELECT pg_relation_filenode('unlog_table');"); + ok($relnode{unlog} != 0, 'retrieving unlogged table relfilenode'); + + my $file_contents = + slurp_file($node->basedir . + '/pgdata/base/' . $postgres_db_oid . '/' .
$relnode{pg_proc}); + # () converts to list context + $file_match_count{pg_proc} = () = $file_contents =~ m/pg_[a-z]{3,}/g; + + $file_contents = + slurp_file($node->basedir . + '/pgdata/base/' . $postgres_db_oid . '/' . $relnode{perm}); + $file_match_count{perm} = () = $file_contents =~ m/a{8,}/g; + + $file_contents = + slurp_file($node->basedir . + '/pgdata/base/' . $postgres_db_oid . '/' . $relnode{unlog}); + $file_match_count{unlog} = () = $file_contents =~ m/b{8,}/g; +} + +# +# Test with disabled encryption +# + +# initialize cluster +my $non_encrypted_node = get_new_node('non_encrypted_node'); +$non_encrypted_node->init(); + +$non_encrypted_node->start; + +# check encryption is disabled +my $file_encryption_method = + $non_encrypted_node->safe_psql('postgres', 'SHOW file_encryption_method;'); +ok($file_encryption_method eq '', 'file_encryption_method is valid'); + +get_cluster_file_contents($non_encrypted_node); + +# record pg_proc count +my $query_pg_proc_count = $non_encrypted_node->safe_psql('postgres', + "SELECT COUNT(*) FROM pg_proc WHERE proname ~ '^pg_[a-z]{3,}';"); +ok($query_pg_proc_count > 0, 'pg_proc count is valid'); + +# check pg_proc count; the file can contain more matches than the query +# reports, since pg_... strings also appear in other pg_proc columns +ok($file_match_count{pg_proc} >= $query_pg_proc_count, + 'SQL/file pg_proc counts match'); + +# check permanent table count +ok($file_match_count{perm} == 100, 'perm_table count matches'); + +# check unlogged table count +ok($file_match_count{unlog} == 200, 'unlog_table count matches'); + +$non_encrypted_node->stop; + + +#--------------------------------------------------------------------------- + +# +# Test with enabled encryption +# + +my $rand_hex; +$rand_hex .= sprintf("%x", rand 16) for 1 .. 64; + +# initialize cluster using a cluster key +my $encrypted_node = get_new_node('encrypted_node'); +$encrypted_node->init( + extra => [ + # We tested AES256 in 003, so use AES128 + '--file-encryption-method', 'AES128', + '--cluster-key-command', "echo $rand_hex" + ]); + +# Set wal_level to 'replica'; encryption can't use 'minimal' +$encrypted_node->append_conf('postgresql.conf', 'wal_level=replica'); + +$encrypted_node->start; + +# check encryption method +$file_encryption_method = + $encrypted_node->safe_psql('postgres', 'SHOW file_encryption_method;'); +ok($file_encryption_method eq 'AES128', 'file_encryption_method is valid'); + +get_cluster_file_contents($encrypted_node); + +# Because the files are encrypted, we should get zero matches for all +# comparisons below. However, technically the encrypted data might +# match the desired string, so we allow one such match. + +# check pg_proc +ok($file_match_count{pg_proc} <= 1, 'pg_proc is encrypted'); + +# check permanent table +ok($file_match_count{perm} <= 1, 'perm_table is encrypted'); + +# check unlogged table +ok($file_match_count{unlog} <= 1, 'unlog_table is encrypted'); + +$encrypted_node->stop; diff --git a/src/test/crypto/testcrypto.c b/src/test/crypto/testcrypto.c new file mode 100644 index 00000000000..1ad0ec0cddd --- /dev/null +++ b/src/test/crypto/testcrypto.c @@ -0,0 +1,545 @@ +/*------------------------------------------------------------------------- + * + * testcrypto.c + * A utility to test our encryption / decryption routines.
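+ *
+ * Example invocation (a sketch; the hex strings stand in for CAVS-style
+ * test-vector values, see the option handling below):
+ *
+ *     testcrypto -a AES-GCM -k <key hex> -i <IV hex> -p <plaintext hex>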
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/test/crypto/testcrypto.c + * + *------------------------------------------------------------------------- + */ + +#define FRONTEND 1 + +#define EXITSUCCESS 0 +#define EXITDECRYPTFAIL 1 +#define EXITFAILURE 2 + +#include "postgres_fe.h" + +#include +#include +#include +#include + +//#include "common/hex.h" +#include "common/cipher.h" +#include "common/logging.h" +#include "getopt_long.h" +#include "pg_getopt.h" + +static const char *progname; + +static void +usage(const char *progname) +{ + printf(_("%s tests encryption/decryption routines in PG common library.\n\n"), progname); + printf(_("Usage:\n")); + printf(_(" %s [OPTION]\n"), progname); + printf("\n"); + printf(_(" Performs one encryption and one decryption run.\n")); + printf("\n"); + printf(_(" Encrypts the provided plaintext (or the empty string if none given) and generates a tag, if using AES-GCM, using the key and IV given.\n")); + printf(_(" After encryption, compares provided ciphertext to resulting ciphertext.\n")); + printf(_(" Compares provided tag, if any, to resulting tag.\n")); + printf(_(" If no tag is provided, then the tag created during encryption is used during decryption.\n")); + printf("\n"); + printf(_(" Decrypts the provided ciphertext (or the empty string if none given) using the key, and IV + tag given if using AES-GCM.\n")); + printf(_(" After successful decryption (requires tag to match for AES-GCM), compares provided plaintext to resulting plaintext.\n")); + printf(_(" Exits with '1' if decryption fails.\n")); + printf("\n"); + printf(_(" Exits with '2' for any other failure.\n")); + printf("\n"); + printf(_(" Key is always required, IV is required for AES-GCM mode.\n")); + printf("\n"); + printf(_(" Algorithms supported are AES-GCM and AES-KWP.\n")); + printf("\n"); + printf(_("\nOptions:\n")); + printf(_(" -a, --algorithm=ALG Crypto algorithm to use\n")); + printf(_(" -i, --init-vector=IV Initialization vector to use\n")); + printf(_(" -k, --key=KEY Key to use, in hex\n")); + printf(_(" -p, --plain-text=PT Plain text to encrypt\n")); + printf(_(" -c, --cipher-text=CT Cipher text to decrypt\n")); + printf(_(" -t, --tag=TAG Tag to use for decryption\n")); + printf(_(" -T, --tag-length=LEN Length of tag to use for encryption\n")); + printf(_(" -v, --verbose verbose output\n")); + printf(_(" -V, --version output version information, then exit\n")); + printf(_(" -?, --help show this help, then exit\n")); + printf("\n"); + printf(_("Report bugs to <%s>.\n"), PACKAGE_BUGREPORT); +} + +/* + * HEX + */ +static const char hextbl[] = "0123456789abcdef"; + +static const int8 hexlookup[128] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, + -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, +}; + +static uint64 +hex_encode(const char *src, size_t len, char *dst) +{ + const char *end = src + len; + + while (src < end) + { + *dst++ = hextbl[(*src >> 4) & 0xF]; + *dst++ = hextbl[*src & 0xF]; + src++; + } + return (uint64) len * 2; +} + +static 
inline char +get_hex(const char *cp) +{ + unsigned char c = (unsigned char) *cp; + int res = -1; + + if (c < 127) + res = hexlookup[c]; + + if (res < 0) + pg_log_fatal("invalid hexadecimal digit: \"%s\"", cp); + + return (char) res; +} + +static uint64 +hex_decode(const char *src, size_t len, char *dst) +{ + const char *s, + *srcend; + char v1, + v2, + *p; + + srcend = src + len; + s = src; + p = dst; + while (s < srcend) + { + if (*s == ' ' || *s == '\n' || *s == '\t' || *s == '\r') + { + s++; + continue; + } + v1 = get_hex(s) << 4; + s++; + if (s >= srcend) + pg_log_fatal("invalid hexadecimal data: odd number of digits"); + + v2 = get_hex(s); + s++; + *p++ = v1 | v2; + } + + return p - dst; +} + +static uint64 +pg_hex_enc_len(size_t srclen) +{ + return (uint64) srclen << 1; +} + +static uint64 +pg_hex_dec_len(size_t srclen) +{ + return (uint64) srclen >> 1; +} + +int +main(int argc, char *argv[]) +{ + char *algorithm = NULL, + *iv_hex = NULL, + *key_hex = NULL, + *plaintext_hex = NULL, + *ciphertext_hex = NULL, + *tag_hex = NULL; + + unsigned char *plaintext = NULL, + *ciphertext = NULL, + *key = NULL, + *iv = NULL, + *tag = NULL, + *tag_result = NULL, + *result = NULL; + + int verbose = 0, + plaintext_len = 0, + ciphertext_len = 0, + key_len = 0, + iv_len = 0, + tag_len = 16, + result_len = 0, + blocksize = 0, + cipher = PG_CIPHER_AES_GCM; + + PgCipherCtx *ctx = NULL; + + static struct option long_options[] = { + {"algorithm", required_argument, NULL, 'a'}, + {"init-vector", required_argument, NULL, 'i'}, + {"key", required_argument, NULL, 'k'}, + {"plain-text", required_argument, NULL, 'p'}, + {"cipher-text", required_argument, NULL, 'c'}, + {"tag", required_argument, NULL, 't'}, + {"tag-length", required_argument, NULL, 'T'}, + {"verbose", no_argument, NULL, 'v'}, + {NULL, 0, NULL, 0} + }; + + int c; + + pg_logging_init(argv[0]); + set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("testcrypto")); + progname = get_progname(argv[0]); + + if (argc > 1) + { + if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) + { + usage(progname); + exit(EXITSUCCESS); + } + if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) + { + puts("testcrypto (PostgreSQL) " PG_VERSION); + exit(EXITSUCCESS); + } + } + + /* Process command-line arguments */ + + while ((c = getopt_long(argc, argv, "a:i:k:p:c:t:T:v", long_options, NULL)) != -1) + { + switch (c) + { + case 'a': + algorithm = pg_strdup(optarg); + break; + + case 'i': + iv_hex = pg_strdup(optarg); + break; + + case 'k': + key_hex = pg_strdup(optarg); + break; + + case 'p': + plaintext_hex = pg_strdup(optarg); + break; + + case 'c': + ciphertext_hex = pg_strdup(optarg); + break; + + case 't': + tag_hex = pg_strdup(optarg); + break; + + case 'T': + tag_len = atoi(optarg); + break; + + case 'v': + verbose = 1; + break; + + default: + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); + exit(EXITFAILURE); + } + } + + /* Complain if any arguments remain */ + if (optind < argc) + { + pg_log_error("too many command-line arguments (first is \"%s\")", + argv[optind]); + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), + progname); + exit(EXITFAILURE); + } + + /* Check options passed in */ + if (algorithm) + { + if (strcmp(algorithm, "AES-GCM") == 0) + cipher = PG_CIPHER_AES_GCM; + else if (strcmp(algorithm, "AES-KWP") == 0) + cipher = PG_CIPHER_AES_KWP; + else + { + pg_log_error("Unsupported algorithm selected."); + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), + progname); +
exit(EXITFAILURE); + } + } + + if (key_hex == NULL) + { + pg_log_error("Key must be provided"); + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), + progname); + exit(EXITFAILURE); + } + else + { + size_t key_hex_len = strlen(key_hex); + + key_len = pg_hex_dec_len(key_hex_len); + + key = pg_malloc0(key_len); + hex_decode(key_hex, key_hex_len, (char *) key); + } + + if (cipher == PG_CIPHER_AES_GCM && iv_hex == NULL) + { + pg_log_error("Initialization vector must be provided"); + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), + progname); + exit(EXITFAILURE); + } + else if (cipher == PG_CIPHER_AES_GCM) + { + size_t iv_hex_len = strlen(iv_hex); + + iv_len = pg_hex_dec_len(iv_hex_len); + + iv = pg_malloc0(iv_len); + hex_decode(iv_hex, iv_hex_len, (char *) iv); + } + + if (plaintext_hex) + { + size_t plaintext_hex_len = strlen(plaintext_hex); + + plaintext_len = pg_hex_dec_len(plaintext_hex_len); + + plaintext = pg_malloc0(plaintext_len); + hex_decode(plaintext_hex, plaintext_hex_len, (char *) plaintext); + } + + /* + * OpenSSL 1.1.1d and earlier crashes on some zero-length plaintext and + * ciphertext strings. It crashes on an encryption call to + * EVP_EncryptFinal_ex() in GCM mode with zero-length strings if plaintext + * is NULL, even though plaintext_len is zero. Setting plaintext to + * non-NULL allows it to work. In KWP mode, zero-length strings fail if + * plaintext_len = 0 and plaintext is non-NULL (the opposite). OpenSSL + * 1.1.1e+ is fine with all options. + */ + else if (cipher == PG_CIPHER_AES_GCM) + { + plaintext_len = 0; + plaintext = pg_malloc0(1); + } + + if (ciphertext_hex) + { + size_t ciphertext_hex_len = strlen(ciphertext_hex); + + ciphertext_len = pg_hex_dec_len(ciphertext_hex_len); + + ciphertext = pg_malloc0(ciphertext_len); + hex_decode(ciphertext_hex, ciphertext_hex_len, + (char *) ciphertext); + } + /* see OpenSSL 1.1.1d item above, though crash only happens in GCM mode */ + else if (cipher == PG_CIPHER_AES_GCM) + { + ciphertext_len = 0; + ciphertext = pg_malloc0(1); + } + + if (cipher == PG_CIPHER_AES_GCM) + tag_result = pg_malloc0(tag_len); + + if (tag_hex) + { + size_t tag_hex_len = strlen(tag_hex); + + tag_len = pg_hex_dec_len(tag_hex_len); + + tag = pg_malloc0(tag_len); + hex_decode(tag_hex, tag_hex_len, (char *) tag); + } + else + tag = tag_result; + + if (verbose) + { + printf("Algorithm: %d\n", cipher); + printf("Key length: %d (%d bits)\n", key_len, key_len * 8); + printf("IV length: %d (%d bits)\n", iv_len, iv_len * 8); + printf("Tag length: %d (%d bits)\n", tag_len, tag_len * 8); + printf("Plaintext length: %d\n", plaintext_len); + printf("Ciphertext length: %d\n", ciphertext_len); + } + + /* + * Encryption + * + * We run through the encryption even if there wasn't a plaintext + * provided; in that case we just encrypt the empty string.
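+ *
+ * The resulting ciphertext (and, for AES-GCM, the tag) is compared against
+ * any caller-supplied values below, and is then fed back through the
+ * decryption pass as a round-trip check.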
+ */ + ctx = pg_cipher_ctx_create(cipher, key, key_len, true); + if (!ctx) + { + pg_log_error("Error creating encryption context, be sure key is of supported length"); + exit(EXITFAILURE); + } + + blocksize = pg_cipher_blocksize(ctx); + + /* If we were provided with a plaintext input */ + if (plaintext_len != 0) + { + /* Encryption might result in as much as input length + blocksize */ + result_len = plaintext_len + blocksize; + result = palloc0(result_len); + + if (ciphertext_hex == NULL) + { + ciphertext = result; + ciphertext_len = plaintext_len; + } + } + + if (cipher == PG_CIPHER_AES_GCM) + { + if (!pg_cipher_encrypt(ctx, cipher, + plaintext, plaintext_len, + result, &result_len, + iv, iv_len, + tag_result, tag_len)) + { + pg_log_error("Error during encryption."); + exit(EXITFAILURE); + } + } + else if (cipher == PG_CIPHER_AES_KWP) + { + if (!pg_cipher_keywrap(ctx, + plaintext, plaintext_len, + result, &result_len)) + { + pg_log_error("Error during encryption."); + exit(EXITFAILURE); + } + } + + if (verbose || ciphertext == NULL) + { + uint64 result_hex_len = pg_hex_enc_len(result_len); + char *result_hex = palloc0(result_hex_len + 1); + + hex_encode((char *) result, result_len, result_hex); + result_hex[result_hex_len] = '\0'; + + printf("ciphertext: %s\n", result_hex); + + if (cipher == PG_CIPHER_AES_GCM) + { + result_hex_len = pg_hex_enc_len(tag_len); + result_hex = palloc0(result_hex_len + 1); + + hex_encode((char *) tag_result, tag_len, result_hex); + result_hex[result_hex_len] = '\0'; + + printf("tag: %s\n", result_hex); + } + } + + /* + * Report on non-matching results, but still go through the decryption + * routine to make sure that we get the correct result, and then error + * out. + */ + if (plaintext_len != 0 && ciphertext != NULL && memcmp(ciphertext, result, plaintext_len) != 0) + pg_log_error("Provided ciphertext does not match"); + + if (cipher == PG_CIPHER_AES_GCM && tag != tag_result && memcmp(tag, tag_result, tag_len) != 0) + pg_log_error("Provided tag does not match"); + + /* + * If a ciphertext was provided then use that as the max size of our + * plaintext result. We shouldn't ever get a result larger. + */ + if (ciphertext_len != 0) + { + result_len = ciphertext_len; + result = palloc0(result_len); + } + + /* + * Decryption + * + * We run through the decryption even if there wasn't a ciphertext + * provided; in that case we just decrypt the empty string.
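+ *
+ * For AES-GCM the supplied (or freshly computed) tag must verify during
+ * decryption; a verification failure exits with EXITDECRYPTFAIL (1).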
+ */ + ctx = pg_cipher_ctx_create(cipher, key, key_len, false); + if (!ctx) + { + pg_log_error("Error creating decryption context, be sure key is of supported length"); + exit(EXITFAILURE); + } + + if (cipher == PG_CIPHER_AES_GCM) + { + if (!pg_cipher_decrypt(ctx, cipher, + ciphertext, ciphertext_len, + result, &result_len, + iv, iv_len, + tag, tag_len)) + { + pg_log_error("Error during decryption."); + exit(EXITDECRYPTFAIL); + } + } + else if (cipher == PG_CIPHER_AES_KWP) + { + if (!pg_cipher_keyunwrap(ctx, + ciphertext, ciphertext_len, + result, &result_len)) + { + pg_log_error("Error during decryption."); + exit(EXITDECRYPTFAIL); + } + } + + if (verbose || plaintext == NULL) + { + uint64 result_hex_len = pg_hex_enc_len(result_len); + char *result_hex = palloc0(result_hex_len + 1); + + hex_encode((char *) result, result_len, result_hex); + result_hex[result_hex_len] = '\0'; + + printf("plaintext: %s\n", result_hex); + } + + if (ciphertext_len != 0 && plaintext != NULL && memcmp(plaintext, result, plaintext_len) != 0) + { + pg_log_error("Provided plaintext does not match"); + exit(EXITFAILURE); + } + + exit(EXITSUCCESS); +} diff --git a/src/test/isolation2/Makefile b/src/test/isolation2/Makefile index e8d0226af58..0fed5742cd6 100644 --- a/src/test/isolation2/Makefile +++ b/src/test/isolation2/Makefile @@ -70,3 +70,10 @@ installcheck-resgroup: install installcheck-parallel-retrieve-cursor: install $(pg_isolation2_regress_installcheck) $(EXTRA_REGRESS_OPTS) --init-file=$(top_builddir)/src/test/regress/init_file --init-file=./init_file_parallel_retrieve_cursor --bindir='$(bindir)' --inputdir=$(srcdir) --dbname=isolation2parallelretrcursor --load-extension=gp_inject_fault --schedule=$(srcdir)/parallel_retrieve_cursor_schedule + +# Add a new rule for running installcheck with parallel mode enabled. +installcheck-cbdb-parallel: install + (\ + export PGOPTIONS='-c optimizer=off -c enable_parallel=true -c gp_appendonly_insert_files=0'; \ + $(pg_isolation2_regress_installcheck) --init-file=$(top_builddir)/src/test/regress/init_file --init-file=./init_file_isolation2 --schedule=$(srcdir)/isolation2_schedule \ + ) \ No newline at end of file diff --git a/src/test/isolation2/expected/crash_recovery_dtm.out b/src/test/isolation2/expected/crash_recovery_dtm.out index ae1faf9690a..a75313ead3f 100644 --- a/src/test/isolation2/expected/crash_recovery_dtm.out +++ b/src/test/isolation2/expected/crash_recovery_dtm.out @@ -67,20 +67,17 @@ DETAIL: gid=1630210, state=Retry Commit Prepared server closed the connection unexpectedly This probably means the server terminated abnormally before or while processing the request. --- Reset the fault in utility mode because normal mode connection will --- not be accepted until DTX recovery is finished. --1U: SELECT gp_inject_fault('finish_prepared_start_of_function', 'reset', dbid) from gp_segment_configuration where content=0 and role='p'; - gp_inject_fault ------------------ - Success: -(1 row) --1Uq: ... -- Join back to know master has completed postmaster reset. 3<: <... completed> server closed the connection unexpectedly This probably means the server terminated abnormally before or while processing the request. 
-- Start a session on master which would complete the DTM recovery and hence COMMIT PREPARED +4: SELECT gp_inject_fault('finish_prepared_start_of_function', 'reset', dbid) from gp_segment_configuration where content=0 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) 4: SELECT * from commit_phase1_panic; a | b ---+--- diff --git a/src/test/isolation2/expected/gpdispatch.out b/src/test/isolation2/expected/gpdispatch.out index f3e54481655..67f569f65f4 100644 --- a/src/test/isolation2/expected/gpdispatch.out +++ b/src/test/isolation2/expected/gpdispatch.out @@ -49,6 +49,8 @@ INSERT 1000 SET 1: set gp_cte_sharing to on; SET +1: set max_parallel_workers_per_gather = 0; +SET 1: select gp_inject_fault_infinite('shareinput_writer_notifyready', 'suspend', 2); gp_inject_fault_infinite -------------------------- diff --git a/src/test/isolation2/expected/gpdispatch_1.out b/src/test/isolation2/expected/gpdispatch_1.out index 53eb9428415..5456f98d20f 100644 --- a/src/test/isolation2/expected/gpdispatch_1.out +++ b/src/test/isolation2/expected/gpdispatch_1.out @@ -49,6 +49,8 @@ INSERT 1000 SET 1: set gp_cte_sharing to on; SET +1: set max_parallel_workers_per_gather = 0; +SET 1: select gp_inject_fault_infinite('shareinput_writer_notifyready', 'suspend', 2); gp_inject_fault_infinite -------------------------- diff --git a/src/test/isolation2/input/parallel_retrieve_cursor/explain.source b/src/test/isolation2/input/parallel_retrieve_cursor/explain.source index 9863ba5a03a..1c5ef63607e 100644 --- a/src/test/isolation2/input/parallel_retrieve_cursor/explain.source +++ b/src/test/isolation2/input/parallel_retrieve_cursor/explain.source @@ -43,8 +43,8 @@ EXPLAIN (VERBOSE, COSTS false) DECLARE c1 PARALLEL RETRIEVE CURSOR FOR SELECT * -- Test: explain output: Endpoint info (on coordinator/on some segments/on all segments) -- Here because replicated table will execute on seg id: session_id % segment_number -- Just replace the random specific seg id to SEGIDX for the output -1: @post_run 'create_sub "Endpoint: on segments: contentid \[[0-9]+\]" " Endpoint: on segments: contentid [SEGIDX]" ':EXPLAIN (VERBOSE, COSTS false) DECLARE c1 PARALLEL RETRIEVE CURSOR FOR SELECT * FROM rt1; -1: @post_run 'create_sub "Endpoint: on segments: contentid \[[0-9]+\]" " Endpoint: on segments: contentid [SEGIDX]" ':EXPLAIN (VERBOSE, COSTS false) DECLARE c1 PARALLEL RETRIEVE CURSOR FOR SELECT * FROM rt1 ORDER BY a; -1: @post_run 'create_sub "Endpoint: on segments: contentid \[[0-9]+\]" " Endpoint: on segments: contentid [SEGIDX]" ':EXPLAIN (VERBOSE, COSTS false) DECLARE c1 PARALLEL RETRIEVE CURSOR FOR SELECT * FROM rt1 WHERE a=1; -1: @post_run 'create_sub "Endpoint: on segments: contentid \[[0-9]+\]" " Endpoint: on segments: contentid [SEGIDX]" ':EXPLAIN (VERBOSE, COSTS false) DECLARE c1 PARALLEL RETRIEVE CURSOR FOR SELECT * FROM rt1 WHERE a=1 OR a=2; +1: @post_run 'create_sub "on segment: contentid \[[0-9]+\]" "on segment: contentid [SEGIDX]" ':EXPLAIN (VERBOSE, COSTS false) DECLARE c1 PARALLEL RETRIEVE CURSOR FOR SELECT * FROM rt1; +1: @post_run 'create_sub "on segment: contentid \[[0-9]+\]" "on segment: contentid [SEGIDX]" ':EXPLAIN (VERBOSE, COSTS false) DECLARE c1 PARALLEL RETRIEVE CURSOR FOR SELECT * FROM rt1 ORDER BY a; +1: @post_run 'create_sub "on segment: contentid \[[0-9]+\]" "on segment: contentid [SEGIDX]" ':EXPLAIN (VERBOSE, COSTS false) DECLARE c1 PARALLEL RETRIEVE CURSOR FOR SELECT * FROM rt1 WHERE a=1; +1: @post_run 'create_sub "on segment: contentid \[[0-9]+\]" "on segment: contentid [SEGIDX]" 
':EXPLAIN (VERBOSE, COSTS false) DECLARE c1 PARALLEL RETRIEVE CURSOR FOR SELECT * FROM rt1 WHERE a=1 OR a=2; diff --git a/src/test/isolation2/output/parallel_retrieve_cursor/explain.source b/src/test/isolation2/output/parallel_retrieve_cursor/explain.source index fe23fe05a90..4349887c935 100644 --- a/src/test/isolation2/output/parallel_retrieve_cursor/explain.source +++ b/src/test/isolation2/output/parallel_retrieve_cursor/explain.source @@ -186,7 +186,7 @@ EXPLAIN (VERBOSE, COSTS false) DECLARE c1 PARALLEL RETRIEVE CURSOR FOR SELECT * -- Test: explain output: Endpoint info (on coordinator/on some segments/on all segments) -- Here because replicated table will execute on seg id: session_id % segment_number -- Just replace the random specific seg id to SEGIDX for the output -1: @post_run 'create_sub "Endpoint: on segments: contentid \[[0-9]+\]" " Endpoint: on segments: contentid [SEGIDX]" ':EXPLAIN (VERBOSE, COSTS false) DECLARE c1 PARALLEL RETRIEVE CURSOR FOR SELECT * FROM rt1; +1: @post_run 'create_sub "on segment: contentid \[[0-9]+\]" "on segment: contentid [SEGIDX]" ':EXPLAIN (VERBOSE, COSTS false) DECLARE c1 PARALLEL RETRIEVE CURSOR FOR SELECT * FROM rt1; QUERY PLAN --------------------------------------- Seq Scan on public.rt1 @@ -194,7 +194,7 @@ EXPLAIN (VERBOSE, COSTS false) DECLARE c1 PARALLEL RETRIEVE CURSOR FOR SELECT * Endpoint: "on segment: contentid [SEGIDX]" Optimizer: Postgres query optimizer (4 rows) -1: @post_run 'create_sub "Endpoint: on segments: contentid \[[0-9]+\]" " Endpoint: on segments: contentid [SEGIDX]" ':EXPLAIN (VERBOSE, COSTS false) DECLARE c1 PARALLEL RETRIEVE CURSOR FOR SELECT * FROM rt1 ORDER BY a; +1: @post_run 'create_sub "on segment: contentid \[[0-9]+\]" "on segment: contentid [SEGIDX]" ':EXPLAIN (VERBOSE, COSTS false) DECLARE c1 PARALLEL RETRIEVE CURSOR FOR SELECT * FROM rt1 ORDER BY a; QUERY PLAN --------------------------------------- Sort @@ -205,7 +205,7 @@ EXPLAIN (VERBOSE, COSTS false) DECLARE c1 PARALLEL RETRIEVE CURSOR FOR SELECT * Endpoint: "on segment: contentid [SEGIDX]" Optimizer: Postgres query optimizer (7 rows) -1: @post_run 'create_sub "Endpoint: on segments: contentid \[[0-9]+\]" " Endpoint: on segments: contentid [SEGIDX]" ':EXPLAIN (VERBOSE, COSTS false) DECLARE c1 PARALLEL RETRIEVE CURSOR FOR SELECT * FROM rt1 WHERE a=1; +1: @post_run 'create_sub "on segment: contentid \[[0-9]+\]" "on segment: contentid [SEGIDX]" ':EXPLAIN (VERBOSE, COSTS false) DECLARE c1 PARALLEL RETRIEVE CURSOR FOR SELECT * FROM rt1 WHERE a=1; QUERY PLAN --------------------------------------- Seq Scan on public.rt1 @@ -214,7 +214,7 @@ EXPLAIN (VERBOSE, COSTS false) DECLARE c1 PARALLEL RETRIEVE CURSOR FOR SELECT * Endpoint: "on segment: contentid [SEGIDX]" Optimizer: Postgres query optimizer (5 rows) -1: @post_run 'create_sub "Endpoint: on segments: contentid \[[0-9]+\]" " Endpoint: on segments: contentid [SEGIDX]" ':EXPLAIN (VERBOSE, COSTS false) DECLARE c1 PARALLEL RETRIEVE CURSOR FOR SELECT * FROM rt1 WHERE a=1 OR a=2; +1: @post_run 'create_sub "on segment: contentid \[[0-9]+\]" "on segment: contentid [SEGIDX]" ':EXPLAIN (VERBOSE, COSTS false) DECLARE c1 PARALLEL RETRIEVE CURSOR FOR SELECT * FROM rt1 WHERE a=1 OR a=2; QUERY PLAN ---------------------------------------- Seq Scan on public.rt1 diff --git a/src/test/isolation2/sql/crash_recovery_dtm.sql b/src/test/isolation2/sql/crash_recovery_dtm.sql index b946b961c14..517b3fe183f 100644 --- a/src/test/isolation2/sql/crash_recovery_dtm.sql +++ b/src/test/isolation2/sql/crash_recovery_dtm.sql @@ -42,14 +42,11 
@@ select pg_reload_conf(); 3&: SELECT wait_till_master_shutsdown(); -- Start transaction which should hit PANIC as COMMIT PREPARED will fail to one segment 1: CREATE TABLE commit_phase1_panic(a int, b int); --- Reset the fault in utility mode because normal mode connection will --- not be accepted until DTX recovery is finished. --1U: SELECT gp_inject_fault('finish_prepared_start_of_function', 'reset', dbid) - from gp_segment_configuration where content=0 and role='p'; --1Uq: -- Join back to know master has completed postmaster reset. 3<: -- Start a session on master which would complete the DTM recovery and hence COMMIT PREPARED +4: SELECT gp_inject_fault('finish_prepared_start_of_function', 'reset', dbid) + from gp_segment_configuration where content=0 and role='p'; 4: SELECT * from commit_phase1_panic; 4: INSERT INTO commit_phase1_panic select i,i from generate_series(1, 10)i; 4: SELECT count(*) from commit_phase1_panic; diff --git a/src/test/isolation2/sql/gpdispatch.sql b/src/test/isolation2/sql/gpdispatch.sql index a6b142fa310..6455ec5eef8 100644 --- a/src/test/isolation2/sql/gpdispatch.sql +++ b/src/test/isolation2/sql/gpdispatch.sql @@ -26,6 +26,7 @@ insert into test_waitevent select generate_series(1,1000); 1: set optimizer = off; 1: set gp_cte_sharing to on; +1: set max_parallel_workers_per_gather = 0; 1: select gp_inject_fault_infinite('shareinput_writer_notifyready', 'suspend', 2); 1&: WITH a1 as (select * from test_waitevent), a2 as (select * from test_waitevent) SELECT sum(a1.i) FROM a1 INNER JOIN a2 ON a2.i = a1.i UNION ALL SELECT count(a1.i) FROM a1 INNER JOIN a2 ON a2.i = a1.i; -- start_ignore diff --git a/src/test/regress/GNUmakefile b/src/test/regress/GNUmakefile index 016e35f8744..9cf399163c9 100644 --- a/src/test/regress/GNUmakefile +++ b/src/test/regress/GNUmakefile @@ -209,6 +209,13 @@ installcheck-good: all twophase_pqexecparams hooktest query_info_hook_test installcheck-parallel: all $(pg_regress_installcheck) $(REGRESS_OPTS) --schedule=$(srcdir)/parallel_schedule $(MAXCONNOPT) $(EXTRA_TESTS) +# cbdb parallel plan tests, ignoring the incompatible cases; should run with the six GUCs set below: +installcheck-cbdb-parallel: all twophase_pqexecparams + ( \ + export PGOPTIONS='-c optimizer=off -c enable_parallel=true -c min_parallel_table_scan_size=0 -c min_parallel_index_scan_size=0 -c force_parallel_mode=1 -c gp_appendonly_insert_files=0'; \ + $(pg_regress_installcheck) $(REGRESS_OPTS) --schedule=$(srcdir)/parallel_schedule --schedule=$(srcdir)/greenplum_schedule $(MAXCONNOPT) $(EXTRA_TESTS) --exclude-tests explain \ + ) + installcheck-tests: all $(pg_regress_installcheck) $(REGRESS_OPTS) $(TESTS) $(EXTRA_TESTS) diff --git a/src/test/regress/atmsort.pm b/src/test/regress/atmsort.pm index 22c44474085..04ac3ee41d9 100644 --- a/src/test/regress/atmsort.pm +++ b/src/test/regress/atmsort.pm @@ -1197,8 +1197,10 @@ sub atmsort_bigloop next; } - # EXPLAIN (COSTS OFF) ... - if ($ini =~ m/explain\s*\(.*costs\s+off.*\)/i) + # EXPLAIN (COSTS OFF/FALSE/0) ...
+ if (($ini =~ m/explain\s*\(.*costs\s+off.*\)/i) || + ($ini =~ m/explain\s*\(.*costs\s+false.*\)/i) || + ($ini =~ m/explain\s*\(.*costs\s+0.*\)/i)) { $directive->{explain} = "costs_off"; } diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index 59670547115..7bc915f1d56 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -2950,6 +2950,7 @@ FROM (SELECT * FROM tenk1 -> Seq Scan on public.tenk1 tenk1_3 Output: tenk1_3.unique1 Settings: enable_indexonlyscan = 'off', min_parallel_table_scan_size = '0', parallel_setup_cost = '0', parallel_tuple_cost = '0' + Optimizer: Postgres query optimizer (17 rows) SELECT variance(unique1::int4), sum(unique1::int8), regr_count(unique1::float8, unique1::float8) @@ -2970,8 +2971,8 @@ FROM (SELECT * FROM tenk1 UNION ALL SELECT * FROM tenk1 UNION ALL SELECT * FROM tenk1 UNION ALL SELECT * FROM tenk1) u; - QUERY PLAN --------------------------------------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------- Finalize Aggregate Output: variance((tenk1.unique1)::bigint), avg((tenk1.unique1)::numeric) -> Gather Motion 3:1 (slice1; segments: 3) @@ -2988,7 +2989,8 @@ FROM (SELECT * FROM tenk1 -> Seq Scan on public.tenk1 tenk1_3 Output: tenk1_3.unique1 Settings: enable_indexonlyscan = 'off', min_parallel_table_scan_size = '0', parallel_setup_cost = '0', parallel_tuple_cost = '0' -(16 rows) + Optimizer: Postgres query optimizer +(17 rows) SELECT variance(unique1::int8), avg(unique1::numeric) FROM (SELECT * FROM tenk1 diff --git a/src/test/regress/expected/alter_distribution_policy.out b/src/test/regress/expected/alter_distribution_policy.out index 60b2bfd4f8b..a5e3875ddad 100644 --- a/src/test/regress/expected/alter_distribution_policy.out +++ b/src/test/regress/expected/alter_distribution_policy.out @@ -1433,3 +1433,120 @@ select *, gp_segment_id from reorg_leaf_1_prt_p0; 1 | 1 | 1 | 1 (5 rows) +-- +-- Test case for GUC gp_force_random_redistribution. +-- Manually toggle the GUC should control the behavior of redistribution for randomly-distributed tables. +-- But REORGANIZE=true should redistribute no matter what. +-- +-- this only affects postgres planner; +set optimizer = false; +-- check the distribution difference between 't1' and 't2' after executing 'query_string' +-- return true if data distribution changed, otherwise false. +-- Note: in extremely rare cases, even after 't2' being randomly-distributed from 't1', they could still have the +-- exact same distribution. So let the tables have a reasonably large number of rows to reduce that possibility. 
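+-- A typical call looks like the following (mirroring the invocations that
+-- appear after the function definition below):
+--   select check_redistributed('insert into t_random select * from t_reorganize', 't_reorganize', 't_random');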
+CREATE OR REPLACE FUNCTION check_redistributed(query_string text, t1 text, t2 text) +RETURNS BOOLEAN AS +$$ +DECLARE + before_query TEXT; + after_query TEXT; + comparison_query TEXT; + comparison_count INT; +BEGIN + -- Prepare the query strings + before_query := format('SELECT gp_segment_id as segid, count(*) AS tupcount FROM %I GROUP BY gp_segment_id', t1); + after_query := format('SELECT gp_segment_id as segid, count(*) AS tupcount FROM %I GROUP BY gp_segment_id', t2); + comparison_query := format('SELECT COUNT(*) FROM ((TABLE %I EXCEPT TABLE %I) UNION ALL (TABLE %I EXCEPT TABLE %I))q', 'distribution1', 'distribution2', 'distribution2', 'distribution1'); + + -- Create temp tables to store the result + EXECUTE format('CREATE TEMP TABLE distribution1 AS %s DISTRIBUTED REPLICATED', before_query); + + -- Execute provided query string + EXECUTE query_string; + + EXECUTE format('CREATE TEMP TABLE distribution2 AS %s DISTRIBUTED REPLICATED', after_query); + + -- Compare the tables using EXCEPT clause + EXECUTE comparison_query INTO comparison_count; + + -- Drop temp tables + EXECUTE 'DROP TABLE distribution1'; + EXECUTE 'DROP TABLE distribution2'; + + -- If count is greater than zero, then there's a difference + RETURN comparison_count > 0; +END; +$$ +LANGUAGE plpgsql; +-- CO table builds temp table first instead of doing CTAS during REORGANIZE=true +create table t_reorganize(a int, b int) using ao_column distributed by (a); +insert into t_reorganize select 0,i from generate_series(1,1000)i; +select gp_segment_id, count(*) from t_reorganize group by gp_segment_id; + gp_segment_id | count +---------------+------- + 1 | 1000 +(1 row) + +-- firstly, no redistribute +set gp_force_random_redistribution = off; +select check_redistributed('alter table t_reorganize set with (reorganize=true) distributed randomly', 't_reorganize', 't_reorganize'); + check_redistributed +--------------------- + t +(1 row) + +-- reorganize from randomly to randomly should still redistribute +select check_redistributed('alter table t_reorganize set with (reorganize=true) distributed randomly', 't_reorganize', 't_reorganize'); + check_redistributed +--------------------- + t +(1 row) + +-- but insert into table won't redistribute +create table t_random (like t_reorganize) distributed randomly; +select check_redistributed('insert into t_random select * from t_reorganize', 't_reorganize', 't_random'); + check_redistributed +--------------------- + f +(1 row) + +-- but insert into a different distribution policy would still redistribute +create table t_distbya (like t_reorganize) distributed by (a); +select check_redistributed('insert into t_distbya select * from t_reorganize', 't_reorganize', 't_distbya'); + check_redistributed +--------------------- + t +(1 row) + +-- now force distribute should redistribute in all cases +set gp_force_random_redistribution = on; +select check_redistributed('alter table t_reorganize set with (reorganize=true) distributed randomly', 't_reorganize', 't_reorganize'); + check_redistributed +--------------------- + t +(1 row) + +select check_redistributed('alter table t_reorganize set with (reorganize=true) distributed randomly', 't_reorganize', 't_reorganize'); + check_redistributed +--------------------- + t +(1 row) + +create table t_random (like t_reorganize) distributed randomly; +ERROR: relation "t_random" already exists +select check_redistributed('insert into t_random select * from t_reorganize', 't_reorganize', 't_random'); + check_redistributed +--------------------- + t +(1 row) + 
+create table t_distbya (like t_reorganize) distributed by (a); +ERROR: relation "t_distbya" already exists +select check_redistributed('insert into t_distbya select * from t_reorganize', 't_reorganize', 't_distbya'); + check_redistributed +--------------------- + t +(1 row) + +reset optimizer; +reset gp_force_random_redistribution; diff --git a/src/test/regress/expected/bfv_dd.out b/src/test/regress/expected/bfv_dd.out index aee7c66b131..b834e5110df 100644 --- a/src/test/regress/expected/bfv_dd.out +++ b/src/test/regress/expected/bfv_dd.out @@ -269,6 +269,7 @@ INFO: (slice 1) Dispatch command to SINGLE content (1 row) -- group by and sort +-- disable parallel for regress tests select a, count(*) from dd_part_singlecol where a=1 group by a; INFO: (slice 1) Dispatch command to SINGLE content a | count diff --git a/src/test/regress/expected/bfv_dd_optimizer.out b/src/test/regress/expected/bfv_dd_optimizer.out index 8ceb0ca5823..021bd63236c 100644 --- a/src/test/regress/expected/bfv_dd_optimizer.out +++ b/src/test/regress/expected/bfv_dd_optimizer.out @@ -267,6 +267,8 @@ INFO: (slice 1) Dispatch command to SINGLE content (1 row) -- group by and sort +-- disable parallel for regress tests +set enable_parallel = off; select a, count(*) from dd_part_singlecol where a=1 group by a; INFO: (slice 1) Dispatch command to SINGLE content a | count @@ -274,6 +276,7 @@ INFO: (slice 1) Dispatch command to SINGLE content 1 | 1 (1 row) +reset enable_parallel; select a, count(*) from dd_part_singlecol where a=1 group by a order by a; INFO: (slice 1) Dispatch command to SINGLE content a | count diff --git a/src/test/regress/expected/bfv_partition_plans.out b/src/test/regress/expected/bfv_partition_plans.out index 2dd4dfe28e1..597f507a650 100644 --- a/src/test/regress/expected/bfv_partition_plans.out +++ b/src/test/regress/expected/bfv_partition_plans.out @@ -484,12 +484,16 @@ analyze p2; analyze p3; analyze p; -- TEST +-- If force parallel, we won't have partition selector since we will use parallel join. +-- We need to disable parallel before doing this query. +set enable_parallel to false; select count_operator('select * from (select * from p1 union all select * from p2) as p_all, t where p_all.b=t.b;','Partition Selector'); count_operator ---------------- 2 (1 row) +reset enable_parallel; select count_operator('select * from (select * from p1 union select * from p2) as p_all, t where p_all.b=t.b;','Partition Selector'); count_operator ---------------- diff --git a/src/test/regress/expected/bfv_partition_plans_optimizer.out b/src/test/regress/expected/bfv_partition_plans_optimizer.out index d8bf9555ffb..0e8e85ad187 100644 --- a/src/test/regress/expected/bfv_partition_plans_optimizer.out +++ b/src/test/regress/expected/bfv_partition_plans_optimizer.out @@ -484,12 +484,16 @@ analyze p2; analyze p3; analyze p; -- TEST +-- If force parallel, we won't have partition selector since we will use parallel join. +-- We need to disable parallel before doing this query. 
+set enable_parallel to false; select count_operator('select * from (select * from p1 union all select * from p2) as p_all, t where p_all.b=t.b;','Partition Selector'); count_operator ---------------- 4 (1 row) +reset enable_parallel; select count_operator('select * from (select * from p1 union select * from p2) as p_all, t where p_all.b=t.b;','Partition Selector'); count_operator ---------------- diff --git a/src/test/regress/expected/explain.out b/src/test/regress/expected/explain.out index 74f4eb64d9a..0a4e3e527b7 100644 --- a/src/test/regress/expected/explain.out +++ b/src/test/regress/expected/explain.out @@ -14,6 +14,7 @@ $$ declare ln text; begin + set local enable_parallel = off; for ln in execute $1 loop -- Replace any numeric word with just 'N' @@ -28,6 +29,7 @@ begin CONTINUE WHEN (ln = 'Planning:'); return next ln; end loop; + reset enable_parallel; end; $$; -- To produce valid JSON output, replace numbers with "0" or "0.0" not "N" @@ -38,6 +40,7 @@ declare data text := ''; ln text; begin + set local enable_parallel = off; for ln in execute $1 loop -- Replace any numeric word with just '0' @@ -45,6 +48,7 @@ begin data := data || ln; end loop; return data::jsonb; + reset enable_parallel; end; $$; -- Simple cases @@ -691,6 +695,7 @@ select jsonb_pretty( "Settings": { + "Optimizer": "Postgres query optimizer", + "optimizer": "off", + + "enable_parallel": "off", + "parallel_setup_cost": "0", + "parallel_tuple_cost": "0", + "min_parallel_table_scan_size": "0" + @@ -709,6 +714,7 @@ select jsonb_pretty( "Executor Memory": { + "Average": 0, + "Workers": 0, + + "Subworkers": 0, + "Maximum Memory Used": 0 + }, + "Work Maximum Memory": 0 + diff --git a/src/test/regress/expected/explain_format.out b/src/test/regress/expected/explain_format.out index f8d9d3d41cb..bd07fd1200f 100644 --- a/src/test/regress/expected/explain_format.out +++ b/src/test/regress/expected/explain_format.out @@ -3,8 +3,8 @@ -- s/\(actual time=\d+\.\d+..\d+\.\d+ rows=\d+ loops=\d+\)/(actual time=##.###..##.### rows=# loops=#)/ -- m/\(slice\d+\) Executor memory: (\d+)\w bytes\./ -- s/Executor memory: (\d+)\w bytes\./Executor memory: (#####)K bytes./ --- m/\(slice\d+\) Executor memory: (\d+)\w bytes avg x \d+ workers, \d+\w bytes max \(seg\d+\)\./ --- s/Executor memory: (\d+)\w bytes avg x \d+ workers, \d+\w bytes max \(seg\d+\)\./Executor memory: ####K bytes avg x #### workers, ####K bytes max (seg#)./ +-- m/\(slice\d+\) Executor memory: (\d+)\w bytes avg x \d+(x\(\d+\))* workers, \d+\w bytes max \(seg\d+\)\./ +-- s/Executor memory: (\d+)\w bytes avg x \d+(x\(\d+\))* workers, \d+\w bytes max \(seg\d+\)\./Executor memory: ####K bytes avg x #### workers, ####K bytes max (seg#)./ -- m/Work_mem: \d+\w bytes max\./ -- s/Work_mem: \d+\w bytes max\. */Work_mem: ###K bytes max./ -- m/Execution Time: \d+\.\d+ ms/ @@ -78,9 +78,9 @@ EXPLAIN (ANALYZE) SELECT * from boxes LEFT JOIN apples ON apples.id = boxes.appl -> Seq Scan on box_locations (cost=0.00..596.00 rows=16534 width=36) (never executed) Planning Time: 2.219 ms (slice0) Executor memory: 127K bytes. - (slice1) Executor memory: 1104K bytes avg x 3 workers, 1104K bytes max (seg0). Work_mem: 1024K bytes max. - (slice2) Executor memory: 2162K bytes avg x 3 workers, 2162K bytes max (seg0). Work_mem: 2070K bytes max. - (slice3) Executor memory: 60K bytes avg x 3 workers, 60K bytes max (seg0). + (slice1) Executor memory: 1104K bytes avg x 3x(0) workers, 1104K bytes max (seg0). Work_mem: 1024K bytes max. 
+ (slice2) Executor memory: 2162K bytes avg x 3x(0) workers, 2162K bytes max (seg0). Work_mem: 2070K bytes max. + (slice3) Executor memory: 60K bytes avg x 3x(0) workers, 60K bytes max (seg0). Memory used: 128000kB Optimizer: Postgres query optimizer Execution Time: 59.644 ms @@ -116,6 +116,8 @@ EXPLAIN (ANALYZE) SELECT * from boxes LEFT JOIN apples ON apples.id = boxes.appl -- s/Maximum Memory Used: \d+/Maximum Memory Used: ###/ -- m/Workers: \d+/ -- s/Workers: \d+/Workers: ##/ +-- m/Subworkers: \d+/ +-- s/Subworkers: \d+/Subworkers: ##/ -- m/Average: \d+/ -- s/Average: \d+/Average: ##/ -- m/Total memory used across slices: \d+/ @@ -468,18 +470,21 @@ QUERY PLAN Executor Memory: Average: 1129528 Workers: 3 + Subworkers: 0 Maximum Memory Used: 1129528 Work Maximum Memory: 1048576 - Slice: 2 Executor Memory: Average: 2213776 Workers: 3 + Subworkers: 0 Maximum Memory Used: 2213776 Work Maximum Memory: 2119360 - Slice: 3 Executor Memory: Average: 60624 Workers: 3 + Subworkers: 0 Maximum Memory Used: 60624 Statement statistics: Memory used: 128000 diff --git a/src/test/regress/expected/explain_format_optimizer.out b/src/test/regress/expected/explain_format_optimizer.out index bc304d030f0..224b1735d28 100644 --- a/src/test/regress/expected/explain_format_optimizer.out +++ b/src/test/regress/expected/explain_format_optimizer.out @@ -3,8 +3,8 @@ -- s/\(actual time=\d+\.\d+..\d+\.\d+ rows=\d+ loops=\d+\)/(actual time=##.###..##.### rows=# loops=#)/ -- m/\(slice\d+\) Executor memory: (\d+)\w bytes\./ -- s/Executor memory: (\d+)\w bytes\./Executor memory: (#####)K bytes./ --- m/\(slice\d+\) Executor memory: (\d+)\w bytes avg x \d+ workers, \d+\w bytes max \(seg\d+\)\./ --- s/Executor memory: (\d+)\w bytes avg x \d+ workers, \d+\w bytes max \(seg\d+\)\./Executor memory: ####K bytes avg x #### workers, ####K bytes max (seg#)./ +-- m/\(slice\d+\) Executor memory: (\d+)\w bytes avg x \d+(x\(\d+\))* workers, \d+\w bytes max \(seg\d+\)\./ +-- s/Executor memory: (\d+)\w bytes avg x \d+(x\(\d+\))* workers, \d+\w bytes max \(seg\d+\)\./Executor memory: ####K bytes avg x #### workers, ####K bytes max (seg#)./ -- m/Work_mem: \d+\w bytes max\./ -- s/Work_mem: \d+\w bytes max\. 
 */Work_mem: ###K bytes max./
 -- m/Execution Time: \d+\.\d+ ms/
@@ -114,6 +114,8 @@ EXPLAIN (ANALYZE) SELECT * from boxes LEFT JOIN apples ON apples.id = boxes.appl
 -- s/Maximum Memory Used: \d+/Maximum Memory Used: ###/
 -- m/Workers: \d+/
 -- s/Workers: \d+/Workers: ##/
+-- m/Subworkers: \d+/
+-- s/Subworkers: \d+/Subworkers: ##/
 -- m/Average: \d+/
 -- s/Average: \d+/Average: ##/
 -- m/Total memory used across slices: \d+/
@@ -416,16 +418,19 @@ QUERY PLAN
          Executor Memory: 
            Average: 97488
            Workers: 3
+           Subworkers: 0
            Maximum Memory Used: 97488
      - Slice: 2
          Executor Memory: 
            Average: 97488
            Workers: 3
+           Subworkers: 0
            Maximum Memory Used: 97488
      - Slice: 3
          Executor Memory: 
            Average: 60624
            Workers: 3
+           Subworkers: 0
            Maximum Memory Used: 60624
    Statement statistics: 
      Memory used: 128000
diff --git a/src/test/regress/expected/explain_optimizer.out b/src/test/regress/expected/explain_optimizer.out
index eec310b1151..7d76e7228a2 100644
--- a/src/test/regress/expected/explain_optimizer.out
+++ b/src/test/regress/expected/explain_optimizer.out
@@ -14,6 +14,7 @@ $$
 declare
     ln text;
 begin
+    set local enable_parallel = off;
     for ln in execute $1
     loop
         -- Replace any numeric word with just 'N'
@@ -28,6 +29,7 @@ begin
         CONTINUE WHEN (ln = 'Planning:');
         return next ln;
     end loop;
+    reset enable_parallel;
 end;
 $$;
 -- To produce valid JSON output, replace numbers with "0" or "0.0" not "N"
@@ -38,6 +40,7 @@ declare
     data text := '';
     ln text;
 begin
+    set local enable_parallel = off;
     for ln in execute $1
     loop
         -- Replace any numeric word with just '0'
@@ -45,6 +48,7 @@ begin
         data := data || ln;
     end loop;
     return data::jsonb;
+    reset enable_parallel;
 end;
 $$;
 -- Simple cases
@@ -698,6 +702,7 @@ select jsonb_pretty(
          "Settings": {                                 +
              "Optimizer": "Pivotal Optimizer (GPORCA)",+
              "optimizer": "on",                        +
+             "enable_parallel": "off",                 +
              "parallel_setup_cost": "0",               +
              "parallel_tuple_cost": "0",               +
              "min_parallel_table_scan_size": "0"       +
@@ -716,6 +721,7 @@ select jsonb_pretty(
                  "Executor Memory": {          +
                      "Average": 0,             +
                      "Workers": 0,             +
+                     "Subworkers": 0,          +
                      "Maximum Memory Used": 0  +
                  },                            +
                  "Work Maximum Memory": 0      +
diff --git a/src/test/regress/expected/gp_aggregates.out b/src/test/regress/expected/gp_aggregates.out
index bb43df1d88f..065af172d6e 100644
--- a/src/test/regress/expected/gp_aggregates.out
+++ b/src/test/regress/expected/gp_aggregates.out
@@ -320,6 +320,7 @@ create aggregate mysum_prefunc(int4) (
 -- tweak settings to force multistage agg to be used
 set gp_motion_cost_per_row = 1000;
 set optimizer_force_multistage_agg = on;
+set force_parallel_mode = off;
 select mysum_prefunc(a::int4) from aggtest;
 NOTICE:  combinefunc called
 NOTICE:  combinefunc called
@@ -330,6 +331,7 @@ NOTICE:  combinefunc called
 
 reset gp_motion_cost_per_row;
 reset optimizer_force_multistage_agg;
+reset force_parallel_mode;
 -- Test an aggregate with 'internal' transition type, and a combine function,
 -- but no serial/deserial functions. This is valid, but we have no use for
 -- the combine function in GPDB in that case.
diff --git a/src/test/regress/expected/gp_parallel.out b/src/test/regress/expected/gp_parallel.out
new file mode 100644
index 00000000000..f23b6f03eb2
--- /dev/null
+++ b/src/test/regress/expected/gp_parallel.out
@@ -0,0 +1,1549 @@
+--
+-- GP PARALLEL
+-- Test GP style parallel plan.
+-- GUCs should be set with local, so as not to disturb other parallel plans.
+-- Should not use force_parallel_mode, as it will ignore the plan and check results only.
+-- We want to check plans in this file!
+-- If there is a need to do that, set it locally inside a transaction.
+-- Set optimizer off in this file; ORCA parallel is not supported.
+--
+-- Locus check expression:
+-- This is used to quickly check the locus codes in cdbpath_motion_for_parallel_join/cdbpathlocus_parallel_join
+-- against the corresponding parallel join examples.
+-- Format:
+-- 1_2_3 means locus 1 join locus 2 generate locus 3.
+-- 1_P_2_3 means locus 1 Join (with shared hash table) locus 2 generate locus 3.
+-- All these formats describe parallel joins, while P implies it's a parallel_aware join.
+--
+-- The numbers are taken from the CdbLocusType enum.
+-- 0 CdbLocusType_Null
+-- 1 CdbLocusType_Entry
+-- 2 CdbLocusType_SingleQE
+-- 3 CdbLocusType_General
+-- 4 CdbLocusType_SegmentGeneral
+-- 5 CdbLocusType_SegmentGeneralWorkers
+-- 6 CdbLocusType_OuterQuery
+-- 7 CdbLocusType_Replicated
+-- 8 CdbLocusType_ReplicatedWorkers
+-- 9 CdbLocusType_Hashed
+-- 10 CdbLocusType_HashedOJ
+-- 11 CdbLocusType_Strewn
+-- 12 CdbLocusType_HashedWorkers
+--
+--
+set force_parallel_mode = 0;
+set optimizer = off;
+create schema test_parallel;
+set search_path to test_parallel;
+create table ao1(x int, y int) with(appendonly=true);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'x' as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table ao2(x int, y int) with(appendonly=true);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'x' as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table aocs1(x int, y int) with(appendonly=true, orientation=column);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'x' as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
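+--
+-- Worked readings of the locus notation above, for reference (all three cases
+-- are exercised later in this file):
+-- 5_4_5    : SegmentGeneralWorkers join SegmentGeneral generate SegmentGeneralWorkers.
+-- 5_P_5_5  : SegmentGeneralWorkers join SegmentGeneralWorkers with a shared hash table generate SegmentGeneralWorkers.
+-- 12_P_5_12: HashedWorkers join SegmentGeneralWorkers with a shared hash table generate HashedWorkers.
+--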
+begin; +-- encourage use of parallel plans +set local min_parallel_table_scan_size = 0; +set local max_parallel_workers_per_gather = 4; +set local enable_parallel = true; +-- insert multiple segfiles for parallel +set local gp_appendonly_insert_files = 4; +-- test appendonly table parallel +insert into ao1 select i, i from generate_series(1, 1200000) g(i); +analyze ao1; +insert into ao2 select i%10, i from generate_series(1, 1200000) g(i); +analyze ao2; +select segfilecount from pg_appendonly where relid = 'ao1'::regclass; + segfilecount +-------------- + 4 +(1 row) + +explain(costs off) select count(*) from ao1; + QUERY PLAN +-------------------------------------------------- + Finalize Aggregate + -> Gather Motion 12:1 (slice1; segments: 12) + -> Partial Aggregate + -> Parallel Seq Scan on ao1 + Optimizer: Postgres query optimizer +(5 rows) + +select count(*) from ao1; + count +--------- + 1200000 +(1 row) + +-- test aocs table parallel +insert into aocs1 select i, i from generate_series(1, 1200000) g(i); +analyze aocs1; +select segfilecount from pg_appendonly where relid = 'aocs1'::regclass; + segfilecount +-------------- + 4 +(1 row) + +explain(costs off) select count(*) from aocs1; + QUERY PLAN +-------------------------------------------------- + Finalize Aggregate + -> Gather Motion 12:1 (slice1; segments: 12) + -> Partial Aggregate + -> Parallel Seq Scan on aocs1 + Optimizer: Postgres query optimizer +(5 rows) + +select count(*) from aocs1; + count +--------- + 1200000 +(1 row) + +-- test locus of HashedWorkers can parallel join without motion +explain(locus, costs off) select count(*) from ao1, ao2 where ao1.x = ao2.x; + QUERY PLAN +-------------------------------------------------------- + Finalize Aggregate + Locus: Entry + -> Gather Motion 12:1 (slice1; segments: 12) + Locus: SingleQE + -> Partial Aggregate + Locus: HashedWorkers + Parallel Workers: 4 + -> Parallel Hash Join + Locus: HashedWorkers + Parallel Workers: 4 + Hash Cond: (ao1.x = ao2.x) + -> Parallel Seq Scan on ao1 + Locus: HashedWorkers + Parallel Workers: 4 + -> Parallel Hash + Locus: Hashed + -> Parallel Seq Scan on ao2 + Locus: HashedWorkers + Parallel Workers: 4 + Optimizer: Postgres query optimizer +(20 rows) + +select count(*) from ao1, ao2 where ao1.x = ao2.x; + count +--------- + 1080000 +(1 row) + +reset enable_parallel; +commit; +-- +-- test parallel with indices +-- +create index on ao1(y); +create index on aocs1(y); +analyze ao1; +analyze aocs1; +-- test AO/AOCS should not be IndexScan +begin; +set local enable_parallel = on; +set local enable_seqscan = off; +set local enable_indexscan = on; +set local enable_bitmapscan = on; +set local max_parallel_workers_per_gather=1; +explain(costs off) select y from ao1 where y > 1000000; + QUERY PLAN +-------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) + -> Bitmap Heap Scan on ao1 + Recheck Cond: (y > 1000000) + -> Bitmap Index Scan on ao1_y_idx + Index Cond: (y > 1000000) + Optimizer: Postgres query optimizer +(6 rows) + +explain(costs off) select y from aocs1 where y > 1000000; + QUERY PLAN +---------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) + -> Bitmap Heap Scan on aocs1 + Recheck Cond: (y > 1000000) + -> Bitmap Index Scan on aocs1_y_idx + Index Cond: (y > 1000000) + Optimizer: Postgres query optimizer +(6 rows) + +set local max_parallel_workers_per_gather=0; +explain(costs off) select y from ao1 where y > 1000000; + QUERY PLAN +-------------------------------------------- + Gather Motion 
3:1 (slice1; segments: 3) + -> Bitmap Heap Scan on ao1 + Recheck Cond: (y > 1000000) + -> Bitmap Index Scan on ao1_y_idx + Index Cond: (y > 1000000) + Optimizer: Postgres query optimizer +(6 rows) + +explain(costs off) select y from aocs1 where y > 1000000; + QUERY PLAN +---------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) + -> Bitmap Heap Scan on aocs1 + Recheck Cond: (y > 1000000) + -> Bitmap Index Scan on aocs1_y_idx + Index Cond: (y > 1000000) + Optimizer: Postgres query optimizer +(6 rows) + +commit; +drop table ao1; +drop table ao2; +drop table aocs1; +-- test gp_appendonly_insert_files doesn't take effect +begin; +create table t (x int); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'x' as the Greenplum Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +insert into t select i from generate_series(1, 1000) i; +set local gp_appendonly_insert_files=4; +set local gp_appendonly_insert_files_tuples_range = 10; +create table ao1 using ao_row as select * from t; +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'x' as the Greenplum Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +analyze ao1; +select segfilecount from pg_appendonly where relid='ao1'::regclass; + segfilecount +-------------- + 1 +(1 row) + +create table ao2 with(appendonly=true) as select * from t; +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'x' as the Greenplum Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +analyze ao2; +select segfilecount from pg_appendonly where relid='ao2'::regclass; + segfilecount +-------------- + 1 +(1 row) + +create table aocs1 using ao_column as select * from t; +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'x' as the Greenplum Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +analyze aocs1; +select segfilecount from pg_appendonly where relid='aocs1'::regclass; + segfilecount +-------------- + 1 +(1 row) + +create table aocs2 with(appendonly=true, orientation=column) as select * from t; +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'x' as the Greenplum Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +analyze aocs2; +select segfilecount from pg_appendonly where relid='aocs2'::regclass; + segfilecount +-------------- + 1 +(1 row) + +abort; +-- test replicated tables parallel +begin; +set local max_parallel_workers_per_gather = 2; +create table t1(a int, b int) with(parallel_workers=2); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. 
Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table rt1(a int, b int) with(parallel_workers=2) distributed replicated;
+create table rt2(a int, b int) distributed replicated;
+create table rt3(a int, b int) distributed replicated;
+insert into t1 select i, i from generate_series(1, 100000) i;
+insert into t1 select i, i+1 from generate_series(1, 10) i;
+insert into rt1 select i, i+1 from generate_series(1, 10) i;
+insert into rt2 select i, i+1 from generate_series(1, 10000) i;
+insert into rt3 select i, i+1 from generate_series(1, 10) i;
+analyze t1;
+analyze rt1;
+analyze rt2;
+analyze rt3;
+-- replica parallel select
+set local enable_parallel = off;
+explain(locus, costs off) select * from rt1;
+                QUERY PLAN                
+------------------------------------------
+ Gather Motion 1:1  (slice1; segments: 1)
+   Locus: Entry
+   ->  Seq Scan on rt1
+         Locus: SegmentGeneral
+ Optimizer: Postgres query optimizer
+(5 rows)
+
+select * from rt1;
+ a  | b  
+----+----
+  1 |  2
+  2 |  3
+  3 |  4
+  4 |  5
+  5 |  6
+  6 |  7
+  7 |  8
+  8 |  9
+  9 | 10
+ 10 | 11
+(10 rows)
+
+set local enable_parallel = on;
+explain(locus, costs off) select * from rt1;
+                QUERY PLAN                
+------------------------------------------
+ Gather Motion 2:1  (slice1; segments: 2)
+   Locus: Entry
+   ->  Parallel Seq Scan on rt1
+         Locus: SegmentGeneralWorkers
+         Parallel Workers: 2
+ Optimizer: Postgres query optimizer
+(6 rows)
+
+select * from rt1;
+ a  | b  
+----+----
+  1 |  2
+  2 |  3
+  3 |  4
+  4 |  5
+  5 |  6
+  6 |  7
+  7 |  8
+  8 |  9
+  9 | 10
+ 10 | 11
+(10 rows)
+
+-- replica join replica
+set local enable_parallel = off;
+select * from rt1 join rt2 on rt2.b = rt1.a;
+ a  | b  | a | b  
+----+----+---+----
+  2 |  3 | 1 |  2
+  3 |  4 | 2 |  3
+  4 |  5 | 3 |  4
+  5 |  6 | 4 |  5
+  6 |  7 | 5 |  6
+  7 |  8 | 6 |  7
+  8 |  9 | 7 |  8
+  9 | 10 | 8 |  9
+ 10 | 11 | 9 | 10
+(9 rows)
+
+set local enable_parallel = on;
+explain(locus, costs off) select * from rt1 join rt2 on rt2.b = rt1.a;
+                QUERY PLAN                 
+-------------------------------------------
+ Gather Motion 1:1  (slice1; segments: 1)
+   Locus: Entry
+   ->  Hash Join
+         Locus: SegmentGeneral
+         Hash Cond: (rt2.b = rt1.a)
+         ->  Seq Scan on rt2
+               Locus: SegmentGeneral
+         ->  Hash
+               Locus: SegmentGeneral
+               ->  Seq Scan on rt1
+                     Locus: SegmentGeneral
+ Optimizer: Postgres query optimizer
+(12 rows)
+
+select * from rt1 join rt2 on rt2.b = rt1.a;
+ a  | b  | a | b  
+----+----+---+----
+  2 |  3 | 1 |  2
+  3 |  4 | 2 |  3
+  4 |  5 | 3 |  4
+  5 |  6 | 4 |  5
+  6 |  7 | 5 |  6
+  7 |  8 | 6 |  7
+  8 |  9 | 7 |  8
+  9 | 10 | 8 |  9
+ 10 | 11 | 9 | 10
+(9 rows)
+
+--
+-- ex 5_P_5_5
+-- SegmentGeneralWorkers parallel join SegmentGeneralWorkers when parallel_aware generate SegmentGeneralWorkers locus.
+-- +set local min_parallel_table_scan_size = 0; +explain(locus, costs off) select * from rt1 join rt2 on rt2.b = rt1.a; + QUERY PLAN +-------------------------------------------------- + Gather Motion 2:1 (slice1; segments: 2) + Locus: Entry + -> Parallel Hash Join + Locus: SegmentGeneralWorkers + Parallel Workers: 2 + Hash Cond: (rt2.b = rt1.a) + -> Parallel Seq Scan on rt2 + Locus: SegmentGeneralWorkers + Parallel Workers: 2 + -> Parallel Hash + Locus: SegmentGeneral + -> Parallel Seq Scan on rt1 + Locus: SegmentGeneralWorkers + Parallel Workers: 2 + Optimizer: Postgres query optimizer +(15 rows) + +select * from rt1 join rt2 on rt2.b = rt1.a; + a | b | a | b +----+----+---+---- + 2 | 3 | 1 | 2 + 3 | 4 | 2 | 3 + 4 | 5 | 3 | 4 + 5 | 6 | 4 | 5 + 6 | 7 | 5 | 6 + 7 | 8 | 6 | 7 + 8 | 9 | 7 | 8 + 9 | 10 | 8 | 9 + 10 | 11 | 9 | 10 +(9 rows) + +-- +-- ex 5_4_5 +-- SegmentGeneralWorkers parallel join SegmentGeneral generate SegmentGeneralWorkers locus. +-- +set local enable_parallel_hash = off; +explain(locus, costs off) select * from rt1 join rt2 on rt2.b = rt1.a; + QUERY PLAN +-------------------------------------------- + Gather Motion 2:1 (slice1; segments: 2) + Locus: Entry + -> Hash Join + Locus: SegmentGeneralWorkers + Parallel Workers: 2 + Hash Cond: (rt2.b = rt1.a) + -> Parallel Seq Scan on rt2 + Locus: SegmentGeneralWorkers + Parallel Workers: 2 + -> Hash + Locus: SegmentGeneral + -> Seq Scan on rt1 + Locus: SegmentGeneral + Optimizer: Postgres query optimizer +(14 rows) + +select * from rt1 join rt2 on rt2.b = rt1.a; + a | b | a | b +----+----+---+---- + 2 | 3 | 1 | 2 + 3 | 4 | 2 | 3 + 4 | 5 | 3 | 4 + 5 | 6 | 4 | 5 + 6 | 7 | 5 | 6 + 7 | 8 | 6 | 7 + 8 | 9 | 7 | 8 + 9 | 10 | 8 | 9 + 10 | 11 | 9 | 10 +(9 rows) + +-- +-- t1 join rt1 join rt2 +-- +set local enable_parallel = off; +explain(locus, costs off) select * from rt1 join t1 on rt1.a = t1.b join rt2 on rt2.a = t1.b; + QUERY PLAN +------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) + Locus: Entry + -> Hash Join + Locus: Hashed + Hash Cond: (t1.b = rt1.a) + -> Seq Scan on t1 + Locus: Hashed + -> Hash + Locus: SegmentGeneral + -> Hash Join + Locus: SegmentGeneral + Hash Cond: (rt2.a = rt1.a) + -> Seq Scan on rt2 + Locus: SegmentGeneral + -> Hash + Locus: SegmentGeneral + -> Seq Scan on rt1 + Locus: SegmentGeneral + Optimizer: Postgres query optimizer +(19 rows) + +select * from rt1 join t1 on rt1.a = t1.b join rt2 on rt2.a = t1.b; + a | b | a | b | a | b +----+----+----+----+----+---- + 2 | 3 | 2 | 2 | 2 | 3 + 3 | 4 | 3 | 3 | 3 | 4 + 4 | 5 | 4 | 4 | 4 | 5 + 7 | 8 | 7 | 7 | 7 | 8 + 8 | 9 | 8 | 8 | 8 | 9 + 3 | 4 | 2 | 3 | 3 | 4 + 4 | 5 | 3 | 4 | 4 | 5 + 5 | 6 | 4 | 5 | 5 | 6 + 8 | 9 | 7 | 8 | 8 | 9 + 9 | 10 | 8 | 9 | 9 | 10 + 5 | 6 | 5 | 5 | 5 | 6 + 6 | 7 | 6 | 6 | 6 | 7 + 9 | 10 | 9 | 9 | 9 | 10 + 10 | 11 | 10 | 10 | 10 | 11 + 6 | 7 | 5 | 6 | 6 | 7 + 7 | 8 | 6 | 7 | 7 | 8 + 10 | 11 | 9 | 10 | 10 | 11 + 1 | 2 | 1 | 1 | 1 | 2 + 2 | 3 | 1 | 2 | 2 | 3 +(19 rows) + +-- parallel hash join +set local enable_parallel = on; +set local enable_parallel_hash = on; +-- +-- SegmentGeneralWorkers parallel join HashedWorkers when parallel_aware generate HashedWorkers. +-- ex 12_P_5_12 +-- HashedWorkers parallel join SegmentGeneralWorkers when parallel_aware generate HashedWorkers. 
+-- +explain(locus, costs off) select * from rt1 join t1 on rt1.a = t1.b join rt2 on rt2.a = t1.b; + QUERY PLAN +-------------------------------------------------------------- + Gather Motion 6:1 (slice1; segments: 6) + Locus: Entry + -> Parallel Hash Join + Locus: HashedWorkers + Parallel Workers: 2 + Hash Cond: (rt2.a = rt1.a) + -> Parallel Seq Scan on rt2 + Locus: SegmentGeneralWorkers + Parallel Workers: 2 + -> Parallel Hash + Locus: Hashed + -> Parallel Hash Join + Locus: HashedWorkers + Parallel Workers: 2 + Hash Cond: (t1.b = rt1.a) + -> Parallel Seq Scan on t1 + Locus: HashedWorkers + Parallel Workers: 2 + -> Parallel Hash + Locus: SegmentGeneral + -> Parallel Seq Scan on rt1 + Locus: SegmentGeneralWorkers + Parallel Workers: 2 + Optimizer: Postgres query optimizer +(24 rows) + +select * from rt1 join t1 on rt1.a = t1.b join rt2 on rt2.a = t1.b; + a | b | a | b | a | b +----+----+----+----+----+---- + 5 | 6 | 5 | 5 | 5 | 6 + 6 | 7 | 5 | 6 | 6 | 7 + 6 | 7 | 6 | 6 | 6 | 7 + 7 | 8 | 6 | 7 | 7 | 8 + 9 | 10 | 9 | 9 | 9 | 10 + 10 | 11 | 9 | 10 | 10 | 11 + 10 | 11 | 10 | 10 | 10 | 11 + 2 | 3 | 2 | 2 | 2 | 3 + 3 | 4 | 2 | 3 | 3 | 4 + 3 | 4 | 3 | 3 | 3 | 4 + 4 | 5 | 3 | 4 | 4 | 5 + 4 | 5 | 4 | 4 | 4 | 5 + 5 | 6 | 4 | 5 | 5 | 6 + 7 | 8 | 7 | 7 | 7 | 8 + 8 | 9 | 7 | 8 | 8 | 9 + 8 | 9 | 8 | 8 | 8 | 9 + 9 | 10 | 8 | 9 | 9 | 10 + 1 | 2 | 1 | 1 | 1 | 2 + 2 | 3 | 1 | 2 | 2 | 3 +(19 rows) + +-- +-- t1 join rt1 join rt3 +-- +set local enable_parallel = off; +explain(locus, costs off) select * from rt1 join t1 on rt1.a = t1.b join rt3 on rt3.a = t1.b; + QUERY PLAN +------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) + Locus: Entry + -> Hash Join + Locus: Hashed + Hash Cond: (t1.b = rt1.a) + -> Hash Join + Locus: Hashed + Hash Cond: (t1.b = rt3.a) + -> Seq Scan on t1 + Locus: Hashed + -> Hash + Locus: SegmentGeneral + -> Seq Scan on rt3 + Locus: SegmentGeneral + -> Hash + Locus: SegmentGeneral + -> Seq Scan on rt1 + Locus: SegmentGeneral + Optimizer: Postgres query optimizer +(19 rows) + +select * from rt1 join t1 on rt1.a = t1.b join rt3 on rt3.a = t1.b; + a | b | a | b | a | b +----+----+----+----+----+---- + 2 | 3 | 2 | 2 | 2 | 3 + 3 | 4 | 3 | 3 | 3 | 4 + 4 | 5 | 4 | 4 | 4 | 5 + 7 | 8 | 7 | 7 | 7 | 8 + 8 | 9 | 8 | 8 | 8 | 9 + 3 | 4 | 2 | 3 | 3 | 4 + 4 | 5 | 3 | 4 | 4 | 5 + 5 | 6 | 4 | 5 | 5 | 6 + 8 | 9 | 7 | 8 | 8 | 9 + 9 | 10 | 8 | 9 | 9 | 10 + 1 | 2 | 1 | 1 | 1 | 2 + 2 | 3 | 1 | 2 | 2 | 3 + 5 | 6 | 5 | 5 | 5 | 6 + 6 | 7 | 6 | 6 | 6 | 7 + 9 | 10 | 9 | 9 | 9 | 10 + 10 | 11 | 10 | 10 | 10 | 11 + 6 | 7 | 5 | 6 | 6 | 7 + 7 | 8 | 6 | 7 | 7 | 8 + 10 | 11 | 9 | 10 | 10 | 11 +(19 rows) + +-- parallel join without parallel hash +set local enable_parallel = on; +set local enable_parallel_hash = off; +-- HashedWorkers parallel join SegmentGeneral generate HashedWorkers. 
+explain(locus, costs off) select * from rt1 join t1 on rt1.a = t1.b join rt3 on rt3.a = t1.b; + QUERY PLAN +------------------------------------------------- + Gather Motion 6:1 (slice1; segments: 6) + Locus: Entry + -> Hash Join + Locus: HashedWorkers + Parallel Workers: 2 + Hash Cond: (t1.b = rt1.a) + -> Hash Join + Locus: HashedWorkers + Parallel Workers: 2 + Hash Cond: (t1.b = rt3.a) + -> Parallel Seq Scan on t1 + Locus: HashedWorkers + Parallel Workers: 2 + -> Hash + Locus: SegmentGeneral + -> Seq Scan on rt3 + Locus: SegmentGeneral + -> Hash + Locus: SegmentGeneral + -> Seq Scan on rt1 + Locus: SegmentGeneral + Optimizer: Postgres query optimizer +(22 rows) + +select * from rt1 join t1 on rt1.a = t1.b join rt3 on rt3.a = t1.b; + a | b | a | b | a | b +----+----+----+----+----+---- + 1 | 2 | 1 | 1 | 1 | 2 + 2 | 3 | 1 | 2 | 2 | 3 + 5 | 6 | 5 | 5 | 5 | 6 + 6 | 7 | 6 | 6 | 6 | 7 + 9 | 10 | 9 | 9 | 9 | 10 + 10 | 11 | 10 | 10 | 10 | 11 + 6 | 7 | 5 | 6 | 6 | 7 + 7 | 8 | 6 | 7 | 7 | 8 + 10 | 11 | 9 | 10 | 10 | 11 + 2 | 3 | 2 | 2 | 2 | 3 + 3 | 4 | 3 | 3 | 3 | 4 + 4 | 5 | 4 | 4 | 4 | 5 + 7 | 8 | 7 | 7 | 7 | 8 + 8 | 9 | 8 | 8 | 8 | 9 + 3 | 4 | 2 | 3 | 3 | 4 + 4 | 5 | 3 | 4 | 4 | 5 + 5 | 6 | 4 | 5 | 5 | 6 + 8 | 9 | 7 | 8 | 8 | 9 + 9 | 10 | 8 | 9 | 9 | 10 +(19 rows) + +create table t2(a int, b int) with(parallel_workers=0); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +create table rt4(a int, b int) with(parallel_workers=2) distributed replicated; +insert into t2 select i, i+1 from generate_series(1, 10) i; +insert into rt4 select i, i+1 from generate_series(1, 10000) i; +analyze t2; +analyze rt4; +set local enable_parallel = off; +select * from rt4 join t2 using(b); + b | a | a +----+----+---- + 2 | 1 | 1 + 6 | 5 | 5 + 7 | 6 | 6 + 10 | 9 | 9 + 11 | 10 | 10 + 3 | 2 | 2 + 4 | 3 | 3 + 5 | 4 | 4 + 8 | 7 | 7 + 9 | 8 | 8 +(10 rows) + +set local enable_parallel = on; +set local enable_parallel_hash = off; +-- +-- ex 5_9_12 +-- SegmentGeneralWorkers(w=N) parallel join Hashed(W=0) generate HashedWorkers(w=N). +-- +explain(locus, costs off) select * from rt4 join t2 using(b); + QUERY PLAN +-------------------------------------------- + Gather Motion 6:1 (slice1; segments: 6) + Locus: Entry + -> Hash Join + Locus: HashedWorkers + Parallel Workers: 2 + Hash Cond: (rt4.b = t2.b) + -> Parallel Seq Scan on rt4 + Locus: SegmentGeneralWorkers + Parallel Workers: 2 + -> Hash + Locus: Hashed + -> Seq Scan on t2 + Locus: Hashed + Optimizer: Postgres query optimizer +(14 rows) + +select * from rt4 join t2 using(b); + b | a | a +----+----+---- + 2 | 1 | 1 + 3 | 2 | 2 + 4 | 3 | 3 + 5 | 4 | 4 + 8 | 7 | 7 + 9 | 8 | 8 + 6 | 5 | 5 + 7 | 6 | 6 + 10 | 9 | 9 + 11 | 10 | 10 +(10 rows) + +create table t3(a int, b int) with(parallel_workers=2); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. 
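+-- Note on the reloption used throughout these tests: a table's parallel_workers
+-- setting caps the planned worker count for scans of it, and 0 keeps scans of
+-- that table non-parallel (see the t2/rt4 plans above). A minimal illustration,
+-- with a hypothetical table name:
+--   create table t_w0(a int) with(parallel_workers=0) distributed randomly;
+--   alter table t_w0 set (parallel_workers = 2);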
+insert into t3 select i, i+1 from generate_series(1, 9000) i; +analyze t3; +set local enable_parallel = off; +select count(*) from rt4 join t3 using(b); + count +------- + 9000 +(1 row) + +set local enable_parallel = on; +set local enable_parallel_hash = on; +-- +-- ex 5_P_12_12 +-- SegmentGeneralWorkers parallel join HashedWorkers when parallel_aware generate HashedWorkers. +-- +explain(locus, costs off) select * from rt4 join t3 using(b); + QUERY PLAN +-------------------------------------------- + Gather Motion 6:1 (slice1; segments: 6) + Locus: Entry + -> Parallel Hash Join + Locus: HashedWorkers + Parallel Workers: 2 + Hash Cond: (rt4.b = t3.b) + -> Parallel Seq Scan on rt4 + Locus: SegmentGeneralWorkers + Parallel Workers: 2 + -> Parallel Hash + Locus: Hashed + -> Parallel Seq Scan on t3 + Locus: HashedWorkers + Parallel Workers: 2 + Optimizer: Postgres query optimizer +(15 rows) + +select count(*) from rt4 join t3 using(b); + count +------- + 9000 +(1 row) + +abort; +-- +-- ex 5_11_11 +-- SegmentGeneralWorkers(workers=N) join Strewn(worker=0) without shared hash table. +-- Join locus: Strewn(worker=N). +-- +begin; +create table t_replica_workers_2(a int, b int) with(parallel_workers=2) distributed replicated; +insert into t_replica_workers_2 select i, i+1 from generate_series(1, 10) i; +analyze t_replica_workers_2; +create table t_random_workers_0(a int, b int) with(parallel_workers=0) distributed randomly; +insert into t_random_workers_0 select i, i+1 from generate_series(1, 5) i; +analyze t_random_workers_0; +set local enable_parallel= true; +set local enable_parallel_hash= false; +explain(locus, costs off) select * from t_replica_workers_2 join t_random_workers_0 using(a); + QUERY PLAN +------------------------------------------------------------------- + Gather Motion 6:1 (slice1; segments: 6) + Locus: Entry + -> Hash Join + Locus: Strewn + Parallel Workers: 2 + Hash Cond: (t_replica_workers_2.a = t_random_workers_0.a) + -> Parallel Seq Scan on t_replica_workers_2 + Locus: SegmentGeneralWorkers + Parallel Workers: 2 + -> Hash + Locus: Strewn + -> Seq Scan on t_random_workers_0 + Locus: Strewn + Optimizer: Postgres query optimizer +(14 rows) + +select * from t_replica_workers_2 join t_random_workers_0 using(a); + a | b | b +---+---+--- + 2 | 3 | 3 + 3 | 4 | 4 + 1 | 2 | 2 + 4 | 5 | 5 + 5 | 6 | 6 +(5 rows) + +-- non parallel results +set local enable_parallel=false; +select * from t_replica_workers_2 join t_random_workers_0 using(a); + a | b | b +---+---+--- + 2 | 3 | 3 + 3 | 4 | 4 + 1 | 2 | 2 + 4 | 5 | 5 + 5 | 6 | 6 +(5 rows) + +abort; +-- +-- Strewn(worker=N) join SegmentGeneralWorkers(workers=N) with shared hash table. +-- Join locus: Strewn(worker=N). 
+--
+begin;
+create table t_replica_workers_2(a int, b int) with(parallel_workers=2) distributed replicated;
+insert into t_replica_workers_2 select i, i+1 from generate_series(1, 10) i;
+analyze t_replica_workers_2;
+create table t_random_workers_2(a int, b int) with(parallel_workers=2) distributed randomly;
+insert into t_random_workers_2 select i, i+1 from generate_series(1, 5) i;
+analyze t_random_workers_2;
+set local enable_parallel= true;
+set local enable_parallel_hash= true;
+explain(locus, costs off) select * from t_replica_workers_2 right join t_random_workers_2 using(a);
+                            QUERY PLAN                             
+-------------------------------------------------------------------
+ Gather Motion 6:1  (slice1; segments: 6)
+   Locus: Entry
+   ->  Parallel Hash Left Join
+         Locus: Strewn
+         Parallel Workers: 2
+         Hash Cond: (t_random_workers_2.a = t_replica_workers_2.a)
+         ->  Parallel Seq Scan on t_random_workers_2
+               Locus: Strewn
+               Parallel Workers: 2
+         ->  Parallel Hash
+               Locus: SegmentGeneral
+               ->  Parallel Seq Scan on t_replica_workers_2
+                     Locus: SegmentGeneralWorkers
+                     Parallel Workers: 2
+ Optimizer: Postgres query optimizer
+(15 rows)
+
+select * from t_replica_workers_2 right join t_random_workers_2 using(a);
+ a | b | b 
+---+---+---
+ 5 | 6 | 6
+ 1 | 2 | 2
+ 2 | 3 | 3
+ 3 | 4 | 4
+ 4 | 5 | 5
+(5 rows)
+
+-- non parallel results
+set local enable_parallel=false;
+select * from t_replica_workers_2 right join t_random_workers_2 using(a);
+ a | b | b 
+---+---+---
+ 1 | 2 | 2
+ 2 | 3 | 3
+ 3 | 4 | 4
+ 4 | 5 | 5
+ 5 | 6 | 6
+(5 rows)
+
+abort;
+--
+-- ex 5_P_11_11
+-- SegmentGeneralWorkers(workers=N) join Strewn(workers=N) with shared hash table.
+-- Join locus: Strewn(workers=N).
+--
+begin;
+create table t_replica_workers_2(a int, b int) with(parallel_workers=2) distributed replicated;
+insert into t_replica_workers_2 select i, i+1 from generate_series(1, 10) i;
+analyze t_replica_workers_2;
+create table t_random_workers_2(a int, b int) with(parallel_workers=2) distributed randomly;
+insert into t_random_workers_2 select i, i+1 from generate_series(1, 5) i;
+analyze t_random_workers_2;
+set local enable_parallel= true;
+set local enable_parallel_hash= true;
+explain(locus, costs off) select * from t_replica_workers_2 join t_random_workers_2 using(a);
+                            QUERY PLAN                             
+-------------------------------------------------------------------
+ Gather Motion 6:1  (slice1; segments: 6)
+   Locus: Entry
+   ->  Parallel Hash Join
+         Locus: Strewn
+         Parallel Workers: 2
+         Hash Cond: (t_replica_workers_2.a = t_random_workers_2.a)
+         ->  Parallel Seq Scan on t_replica_workers_2
+               Locus: SegmentGeneralWorkers
+               Parallel Workers: 2
+         ->  Parallel Hash
+               Locus: Strewn
+               ->  Parallel Seq Scan on t_random_workers_2
+                     Locus: Strewn
+                     Parallel Workers: 2
+ Optimizer: Postgres query optimizer
+(16 rows)
+
+select * from t_replica_workers_2 join t_random_workers_2 using(a);
+ a | b | b 
+---+---+---
+ 2 | 3 | 3
+ 1 | 2 | 2
+ 3 | 4 | 4
+ 4 | 5 | 5
+ 5 | 6 | 6
+(5 rows)
+
+-- non parallel results
+set local enable_parallel=false;
+select * from t_replica_workers_2 join t_random_workers_2 using(a);
+ a | b | b 
+---+---+---
+ 2 | 3 | 3
+ 1 | 2 | 2
+ 3 | 4 | 4
+ 4 | 5 | 5
+ 5 | 6 | 6
+(5 rows)
+
+abort;
+--
+-- Test that the final join path's parallel_workers matches the join locus whose
+-- parallel_workers differs from the original outer path (without motion).
+--
+begin;
+create table t1(a int, b int) with(parallel_workers=3);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table t2(b int, a int) with(parallel_workers=2);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'b' as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into t1 select i, i+1 from generate_series(1, 10) i;
+insert into t2 select i, i+1 from generate_series(1, 5) i;
+analyze t1;
+analyze t2;
+set local optimizer=off;
+set local enable_parallel=on;
+set local max_parallel_workers_per_gather= 4;
+explain(costs off) select * from t1 right join t2 on t1.b = t2.a;
+                            QUERY PLAN                            
+------------------------------------------------------------------
+ Gather Motion 9:1  (slice1; segments: 9)
+   ->  Parallel Hash Left Join
+         Hash Cond: (t2.a = t1.b)
+         ->  Redistribute Motion 6:9  (slice2; segments: 6)
+               Hash Key: t2.a
+               Hash Module: 3
+               ->  Parallel Seq Scan on t2
+         ->  Parallel Hash
+               ->  Redistribute Motion 9:9  (slice3; segments: 9)
+                     Hash Key: t1.b
+                     Hash Module: 3
+                     ->  Parallel Seq Scan on t1
+ Optimizer: Postgres query optimizer
+(13 rows)
+
+abort;
+--
+-- Test SingleQE locus could participate in parallel plan.
+--
+begin;
+create table t1(a int, b int) with(parallel_workers=2);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table t2(a int, b int) with(parallel_workers=2);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into t1 select i%10, i from generate_series(1, 5) i;
+insert into t1 values (100000);
+insert into t2 select i%10, i from generate_series(1, 100000) i;
+analyze t1;
+analyze t2;
+set local enable_parallel = on;
+-- parallel hash join with shared table, SingleQE as outer partial path.
+explain(locus, costs off) select * from (select count(*) as a from t2) t2 left join t1 on t1.a = t2.a;
+                            QUERY PLAN                            
+------------------------------------------------------------------
+ Gather Motion 6:1  (slice1; segments: 6)
+   Locus: Entry
+   ->  Parallel Hash Left Join
+         Locus: Hashed
+         Parallel Workers: 2
+         Hash Cond: ((count(*)) = t1.a)
+         ->  Redistribute Motion 1:6  (slice2; segments: 1)
+               Locus: Hashed
+               Parallel Workers: 2
+               Hash Key: (count(*))
+               Hash Module: 3
+               ->  Finalize Aggregate
+                     Locus: SingleQE
+                     ->  Gather Motion 6:1  (slice3; segments: 6)
+                           Locus: SingleQE
+                           ->  Partial Aggregate
+                                 Locus: HashedWorkers
+                                 Parallel Workers: 2
+                                 ->  Parallel Seq Scan on t2
+                                       Locus: HashedWorkers
+                                       Parallel Workers: 2
+         ->  Parallel Hash
+               Locus: Hashed
+               ->  Parallel Seq Scan on t1
+                     Locus: HashedWorkers
+                     Parallel Workers: 2
+ Optimizer: Postgres query optimizer
+(27 rows)
+
+select * from (select count(*) as a from t2) t2 left join t1 on t1.a = t2.a;
+   a    |   a    | b 
+--------+--------+---
+ 100000 | 100000 |  
+(1 row)
+
+set local enable_parallel = off;
+select * from (select count(*) as a from t2) t2 left join t1 on t1.a = t2.a;
+   a    |   a    | b 
+--------+--------+---
+ 100000 | 100000 |  
+(1 row)
+
+set local enable_parallel = on;
+-- parallel hash join with shared table, SingleQE as inner partial path.
+explain(locus, costs off) select * from t1 join (select count(*) as a from t2) t2 on t1.a = t2.a;
+                               QUERY PLAN                               
+------------------------------------------------------------------------
+ Gather Motion 6:1  (slice1; segments: 6)
+   Locus: Entry
+   ->  Parallel Hash Join
+         Locus: HashedWorkers
+         Parallel Workers: 2
+         Hash Cond: (t1.a = (count(*)))
+         ->  Parallel Seq Scan on t1
+               Locus: HashedWorkers
+               Parallel Workers: 2
+         ->  Parallel Hash
+               Locus: Hashed
+               ->  Redistribute Motion 1:6  (slice2; segments: 1)
+                     Locus: Hashed
+                     Parallel Workers: 2
+                     Hash Key: (count(*))
+                     Hash Module: 3
+                     ->  Finalize Aggregate
+                           Locus: SingleQE
+                           ->  Gather Motion 6:1  (slice3; segments: 6)
+                                 Locus: SingleQE
+                                 ->  Partial Aggregate
+                                       Locus: HashedWorkers
+                                       Parallel Workers: 2
+                                       ->  Parallel Seq Scan on t2
+                                             Locus: HashedWorkers
+                                             Parallel Workers: 2
+ Optimizer: Postgres query optimizer
+(27 rows)
+
+select * from t1 join (select count(*) as a from t2) t2 on t1.a = t2.a;
+   a    | b |   a    
+--------+---+--------
+ 100000 |   | 100000
+(1 row)
+
+set local enable_parallel = off;
+select * from t1 join (select count(*) as a from t2) t2 on t1.a = t2.a;
+   a    | b |   a    
+--------+---+--------
+ 100000 |   | 100000
+(1 row)
+
+set local enable_parallel = on;
+-- parallel hash join without shared table.
+set local enable_parallel_hash = off;
+explain(locus, costs off) select * from t1 join (select count(*) as a from t2) t2 on t1.a = t2.a;
+                            QUERY PLAN                            
+------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)
+   Locus: Entry
+   ->  Hash Join
+         Locus: Hashed
+         Hash Cond: ((count(*)) = t1.a)
+         ->  Redistribute Motion 1:3  (slice2; segments: 1)
+               Locus: Hashed
+               Hash Key: (count(*))
+               ->  Finalize Aggregate
+                     Locus: SingleQE
+                     ->  Gather Motion 6:1  (slice3; segments: 6)
+                           Locus: SingleQE
+                           ->  Partial Aggregate
+                                 Locus: HashedWorkers
+                                 Parallel Workers: 2
+                                 ->  Parallel Seq Scan on t2
+                                       Locus: HashedWorkers
+                                       Parallel Workers: 2
+         ->  Hash
+               Locus: Hashed
+               ->  Seq Scan on t1
+                     Locus: Hashed
+ Optimizer: Postgres query optimizer
+(23 rows)
+
+select * from t1 join (select count(*) as a from t2) t2 on t1.a = t2.a;
+   a    | b |   a    
+--------+---+--------
+ 100000 |   | 100000
+(1 row)
+
+-- parallel merge join
+set local enable_hashjoin = off;
+explain(locus, costs off) select * from t1 join (select count(*) as a from t2) t2 on t1.a = t2.a;
+                               QUERY PLAN                               
+------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)
+   Locus: Entry
+   ->  Merge Join
+         Locus: Hashed
+         Merge Cond: ((count(*)) = t1.a)
+         ->  Sort
+               Locus: Hashed
+               Sort Key: (count(*))
+               ->  Redistribute Motion 1:3  (slice2; segments: 1)
+                     Locus: Hashed
+                     Hash Key: (count(*))
+                     ->  Finalize Aggregate
+                           Locus: SingleQE
+                           ->  Gather Motion 6:1  (slice3; segments: 6)
+                                 Locus: SingleQE
+                                 ->  Partial Aggregate
+                                       Locus: HashedWorkers
+                                       Parallel Workers: 2
+                                       ->  Parallel Seq Scan on t2
+                                             Locus: HashedWorkers
+                                             Parallel Workers: 2
+         ->  Sort
+               Locus: Hashed
+               Sort Key: t1.a
+               ->  Seq Scan on t1
+                     Locus: Hashed
+ Optimizer: Postgres query optimizer
+(27 rows)
+
+select * from t1 join (select count(*) as a from t2) t2 on t1.a = t2.a;
+   a    | b |   a    
+--------+---+--------
+ 100000 |   | 100000
+(1 row)
+
+-- parallel nestloop join
+set local enable_mergejoin = off;
+set local enable_nestloop = on;
+explain(locus, costs off) select * from t1 join (select count(*) as a from t2) t2 on t1.a = t2.a;
+                            QUERY PLAN                            
+------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)
+   Locus: Entry
+   ->  Nested Loop
+         Locus: Hashed
+         Join Filter: (t1.a = (count(*)))
+         ->  Redistribute Motion 1:3  (slice2; segments: 1)
+               Locus: Hashed
+               Hash Key: (count(*))
+               ->  Finalize Aggregate
+                     Locus: SingleQE
+                     ->  Gather Motion 6:1  (slice3; segments: 6)
+                           Locus: SingleQE
+                           ->  Partial Aggregate
+                                 Locus: HashedWorkers
+                                 Parallel Workers: 2
+                                 ->  Parallel Seq Scan on t2
+                                       Locus: HashedWorkers
+                                       Parallel Workers: 2
+         ->  Seq Scan on t1
+               Locus: Hashed
+ Optimizer: Postgres query optimizer
+(21 rows)
+
+select * from t1 join (select count(*) as a from t2) t2 on t1.a = t2.a;
+   a    | b |   a    
+--------+---+--------
+ 100000 |   | 100000
+(1 row)
+
+-- non-parallel results
+set local enable_parallel = off;
+select * from t1 join (select count(*) as a from t2) t2 on t1.a = t2.a;
+   a    | b |   a    
+--------+---+--------
+ 100000 |   | 100000
+(1 row)
+
+abort;
+begin;
+-- use rt1 to generate locus of SegmentGeneralWorkers
+-- use rt2 to generate locus of SegmentGeneral
+-- use t1 to generate locus of HashedWorkers
+-- use t2 to generate locus of Hashed
+-- use pg_class to generate locus of Entry
+-- use generate_series(1, 1000) to generate locus of General
+-- use select count(*) as a from sq1 to generate locus of SingleQE
+create table rt1(a int, b int) distributed replicated;
+create
table rt2(a int, b int) with (parallel_workers = 0) distributed replicated;
+create table t1(a int, b int);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+create table t2(a int, b int) with (parallel_workers = 0);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into t1 select i, i+1 from generate_series(1, 10000) i;
+insert into t2 select i, i+1 from generate_series(1, 10000) i;
+insert into rt1 select i, i+1 from generate_series(1, 10000) i;
+insert into rt2 select i, i+1 from generate_series(1, 10000) i;
+CREATE TABLE sq1 AS SELECT a, b FROM t1 WHERE gp_segment_id = 0;
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'a' as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+set local optimizer=off;
+set local enable_parallel=on;
+set local min_parallel_table_scan_size to 0;
+set local max_parallel_workers_per_gather= 4;
+analyze rt1;
+analyze rt2;
+analyze t1;
+analyze t2;
+analyze sq1;
+-- SegmentGeneralWorkers + SegmentGeneralWorkers = SegmentGeneralWorkers
+explain (locus, costs off) select * from rt1 union all select * from rt1;
+                 QUERY PLAN                 
+--------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)
+   Locus: Entry
+   ->  Parallel Append
+         Locus: SegmentGeneralWorkers
+         Parallel Workers: 3
+         ->  Parallel Seq Scan on rt1
+               Locus: SegmentGeneralWorkers
+               Parallel Workers: 3
+         ->  Parallel Seq Scan on rt1 rt1_1
+               Locus: SegmentGeneralWorkers
+               Parallel Workers: 3
+ Optimizer: Postgres query optimizer
+(12 rows)
+
+-- SegmentGeneralWorkers + SegmentGeneral = SegmentGeneralWorkers
+explain (locus, costs off) select * from rt1 union all select * from rt2;
+                 QUERY PLAN                 
+--------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)
+   Locus: Entry
+   ->  Parallel Append
+         Locus: SegmentGeneralWorkers
+         Parallel Workers: 3
+         ->  Seq Scan on rt2
+               Locus: SegmentGeneral
+         ->  Parallel Seq Scan on rt1
+               Locus: SegmentGeneralWorkers
+               Parallel Workers: 3
+ Optimizer: Postgres query optimizer
+(11 rows)
+
+-- SegmentGeneralWorkers (Converted to Strewn, Limited on One Segment) + HashedWorkers = Strewn
+explain (locus, costs off) select * from rt1 union all select * from t1;
+                         QUERY PLAN                          
+-------------------------------------------------------------
+ Gather Motion 9:1  (slice1; segments: 9)
+   Locus: Entry
+   ->  Parallel Append
+         Locus: Strewn
+         Parallel Workers: 3
+         ->  Result
+               Locus: Strewn
+               Parallel Workers: 3
+               One-Time Filter: (gp_execution_segment() = 0)
+               ->  Parallel Seq Scan on rt1
+                     Locus: SegmentGeneralWorkers
+                     Parallel Workers: 3
+         ->  Parallel Seq Scan on t1
+               Locus: HashedWorkers
+               Parallel Workers: 3
+ Optimizer: Postgres query optimizer
+(16 rows)
+
+-- SegmentGeneralWorkers (Converted to Strewn, Limited on One Segment) + Hashed = Strewn
+explain (locus, costs off) select * from rt1 union all select * from t2;
+                         QUERY PLAN                          
+-------------------------------------------------------------
+ Gather Motion 9:1  (slice1; segments: 9)
+   Locus: Entry
+   ->  Parallel Append
+         Locus: Strewn
+         Parallel Workers: 3
+         ->  Seq Scan on t2
+               Locus: Hashed
+         ->  Result
+               Locus: Strewn
+               Parallel Workers: 3
+               One-Time Filter: (gp_execution_segment() = 0)
+               ->  Parallel Seq Scan on rt1
+                     Locus: SegmentGeneralWorkers
+                     Parallel Workers: 3
+ Optimizer: Postgres query optimizer
+(15 rows)
+
+-- SingleQE as subquery seemingly cannot produce a partial_pathlist, so it has no chance to join a parallel append.
+explain (locus, costs off) select a from rt1 union all select count(*) as a from sq1;
+                      QUERY PLAN                      
+------------------------------------------------------
+ Append
+   Locus: Entry
+   ->  Gather Motion 1:1  (slice1; segments: 1)
+         Locus: SingleQE
+         ->  Subquery Scan on "*SELECT* 1"
+               Locus: SegmentGeneral
+               ->  Seq Scan on rt1
+                     Locus: SegmentGeneral
+   ->  Finalize Aggregate
+         Locus: SingleQE
+         ->  Gather Motion 3:1  (slice2; segments: 3)
+               Locus: SingleQE
+               ->  Partial Aggregate
+                     Locus: Hashed
+                     ->  Seq Scan on sq1
+                           Locus: Hashed
+ Optimizer: Postgres query optimizer
+(17 rows)
+
+-- SegmentGeneralWorkers + General = SegmentGeneralWorkers
+explain (locus, costs off) select a from rt1 union all select a from generate_series(1, 1000) a;
+                   QUERY PLAN                   
+------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)
+   Locus: Entry
+   ->  Parallel Append
+         Locus: SegmentGeneralWorkers
+         Parallel Workers: 3
+         ->  Function Scan on generate_series a
+               Locus: General
+         ->  Parallel Seq Scan on rt1
+               Locus: SegmentGeneralWorkers
+               Parallel Workers: 3
+ Optimizer: Postgres query optimizer
+(11 rows)
+
+-- Entry as subquery seemingly cannot produce a partial_pathlist, so it has no chance to join a parallel append.
+-- flaky case: the plan was expected to use a seqscan on pg_class but sometimes chose an indexscan.
+set local enable_indexscan = off; +set local enable_indexonlyscan = off; +explain (locus, costs off) select a from rt1 union all select oid as a from pg_class; + QUERY PLAN +------------------------------------------------ + Append + Locus: Entry + -> Gather Motion 1:1 (slice1; segments: 1) + Locus: Entry + -> Subquery Scan on "*SELECT* 1" + Locus: SegmentGeneral + -> Seq Scan on rt1 + Locus: SegmentGeneral + -> Seq Scan on pg_class + Locus: Entry + Optimizer: Postgres query optimizer +(11 rows) + +abort; +-- +-- Test two-phase parallel Limit +-- +begin; +create table t1(c1 int, c2 int) with(parallel_workers=2); +insert into t1 select i, i+1 from generate_series(1, 100000) i; +analyze t1; +set local optimizer = off; +set local enable_parallel = on; +explain(costs off) select * from t1 order by c2 asc limit 3 offset 5; + QUERY PLAN +------------------------------------------------- + Limit + -> Gather Motion 6:1 (slice1; segments: 6) + Merge Key: c2 + -> Limit + -> Sort + Sort Key: c2 + -> Parallel Seq Scan on t1 + Optimizer: Postgres query optimizer +(8 rows) + +select * from t1 order by c2 asc limit 3 offset 5; + c1 | c2 +----+---- + 6 | 7 + 7 | 8 + 8 | 9 +(3 rows) + +-- non-parallel results +set local enable_parallel = off; +explain(costs off) select * from t1 order by c2 asc limit 3 offset 5; + QUERY PLAN +------------------------------------------------ + Limit + -> Gather Motion 3:1 (slice1; segments: 3) + Merge Key: c2 + -> Limit + -> Sort + Sort Key: c2 + -> Seq Scan on t1 + Optimizer: Postgres query optimizer +(8 rows) + +select * from t1 order by c2 asc limit 3 offset 5; + c1 | c2 +----+---- + 6 | 7 + 7 | 8 + 8 | 9 +(3 rows) + +abort; +-- +-- Test one-phase Limit with parallel subpath +-- +begin; +create table t1(c1 int, c2 int) with(parallel_workers=2); +insert into t1 select i, i+1 from generate_series(1, 100000) i; +analyze t1; +set local optimizer = off; +set local gp_enable_multiphase_limit = off; +set local enable_parallel = on; +explain(costs off) select * from t1 order by c2 asc limit 3 offset 5; + QUERY PLAN +------------------------------------------------ + Limit + -> Gather Motion 6:1 (slice1; segments: 6) + Merge Key: c2 + -> Sort + Sort Key: c2 + -> Parallel Seq Scan on t1 + Optimizer: Postgres query optimizer +(7 rows) + +select * from t1 order by c2 asc limit 3 offset 5; + c1 | c2 +----+---- + 6 | 7 + 7 | 8 + 8 | 9 +(3 rows) + +-- non-parallel results +set local enable_parallel = off; +explain(costs off) select * from t1 order by c2 asc limit 3 offset 5; + QUERY PLAN +------------------------------------------------ + Limit + -> Gather Motion 3:1 (slice1; segments: 3) + Merge Key: c2 + -> Sort + Sort Key: c2 + -> Seq Scan on t1 + Optimizer: Postgres query optimizer +(7 rows) + +select * from t1 order by c2 asc limit 3 offset 5; + c1 | c2 +----+---- + 6 | 7 + 7 | 8 + 8 | 9 +(3 rows) + +abort; +-- +-- Test alter ao/aocs table parallel_workers options +-- +begin; +set local optimizer = off; +set local enable_parallel = on; +-- ao table +create table ao (a INT, b INT) using ao_row; +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Cloudberry Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. 
+insert into ao select i as a, i as b from generate_series(1, 100) AS i; +alter table ao set (parallel_workers = 2); +explain(costs off) select count(*) from ao; + QUERY PLAN +------------------------------------------------ + Finalize Aggregate + -> Gather Motion 6:1 (slice1; segments: 6) + -> Partial Aggregate + -> Parallel Seq Scan on ao + Optimizer: Postgres query optimizer +(5 rows) + +select count(*) from ao; + count +------- + 100 +(1 row) + +alter table ao reset (parallel_workers); +-- aocs table +create table aocs (a INT, b INT) using ao_column; +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Cloudberry Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +insert into aocs select i as a, i as b from generate_series(1, 100) AS i; +alter table aocs set (parallel_workers = 2); +explain(costs off) select count(*) from aocs; + QUERY PLAN +------------------------------------------------ + Finalize Aggregate + -> Gather Motion 6:1 (slice1; segments: 6) + -> Partial Aggregate + -> Parallel Seq Scan on aocs + Optimizer: Postgres query optimizer +(5 rows) + +select count(*) from aocs; + count +------- + 100 +(1 row) + +alter table aocs reset (parallel_workers); +abort; +-- start_ignore +drop schema test_parallel cascade; +-- end_ignore +reset force_parallel_mode; +reset optimizer; diff --git a/src/test/regress/expected/gporca.out b/src/test/regress/expected/gporca.out index 2e2b1cd96c8..002e2120ffc 100644 --- a/src/test/regress/expected/gporca.out +++ b/src/test/regress/expected/gporca.out @@ -11255,49 +11255,53 @@ INSERT INTO onetimefilter1 SELECT i, i FROM generate_series(1,10)i; INSERT INTO onetimefilter2 SELECT i, i FROM generate_series(1,10)i; ANALYZE onetimefilter1; ANALYZE onetimefilter2; -EXPLAIN WITH abc AS (SELECT onetimefilter1.a, onetimefilter1.b FROM onetimefilter1, onetimefilter2 WHERE onetimefilter1.a=onetimefilter2.a) SELECT (SELECT 1 FROM abc WHERE f1.b = f2.b LIMIT 1), COALESCE((SELECT 2 FROM abc WHERE f1.a=random() AND f1.a=2), 0), (SELECT b FROM abc WHERE b=f1.b) FROM onetimefilter1 f1, onetimefilter2 f2 WHERE f1.b = f2.b; - QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=3.43..18.61 rows=10 width=12) - -> Hash Join (cost=3.43..18.41 rows=4 width=12) +EXPLAIN (COSTS OFF) WITH abc AS (SELECT onetimefilter1.a, onetimefilter1.b FROM onetimefilter1, onetimefilter2 WHERE onetimefilter1.a=onetimefilter2.a) SELECT (SELECT 1 FROM abc WHERE f1.b = f2.b LIMIT 1), COALESCE((SELECT 2 FROM abc WHERE f1.a=random() AND f1.a=2), 0), (SELECT b FROM abc WHERE b=f1.b) FROM onetimefilter1 f1, onetimefilter2 f2 WHERE f1.b = f2.b; + QUERY PLAN +------------------------------------------------------------------------------------------ + Gather Motion 3:1 (slice1; segments: 3) + -> Hash Join Hash Cond: (f1.b = f2.b) - -> Redistribute Motion 3:3 (slice5; segments: 3) (cost=0.00..3.30 rows=4 width=8) + -> Redistribute Motion 3:3 (slice5; segments: 3) Hash Key: f1.b - -> Seq Scan on onetimefilter1 f1 (cost=0.00..3.10 rows=4 width=8) - -> Hash (cost=3.30..3.30 rows=4 width=4) - -> Redistribute Motion 3:3 (slice6; segments: 3) (cost=0.00..3.30 rows=4 width=4) + -> Seq Scan on onetimefilter1 f1 + -> Hash + -> Redistribute Motion 3:3 (slice6; segments: 3) Hash Key: f2.b - -> Seq Scan 
on onetimefilter2 f2 (cost=0.00..3.10 rows=4 width=4) + -> Seq Scan on onetimefilter2 f2 SubPlan 1 - -> Limit (cost=0.00..0.06 rows=1 width=4) - -> Result (cost=0.00..0.55 rows=4 width=4) + -> Limit + -> Result One-Time Filter: (f1.b = f2.b) - -> Materialize (cost=0.00..0.45 rows=4 width=0) - -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..0.40 rows=4 width=0) - -> Subquery Scan on abc (cost=0.00..0.20 rows=4 width=0) - -> Shared Scan (share slice:id 2:0) (cost=6.46..6.67 rows=4 width=8) - -> Hash Join (cost=3.23..6.46 rows=4 width=8) - Hash Cond: (onetimefilter1.a = onetimefilter2.a) - -> Seq Scan on onetimefilter1 (cost=0.00..3.10 rows=4 width=8) - -> Hash (cost=3.10..3.10 rows=4 width=4) - -> Seq Scan on onetimefilter2 (cost=0.00..3.10 rows=4 width=4) + -> Materialize + -> Broadcast Motion 3:3 (slice2; segments: 3) + -> Hash Join + Hash Cond: (onetimefilter1.a = onetimefilter2.a) + -> Seq Scan on onetimefilter1 + -> Hash + -> Seq Scan on onetimefilter2 SubPlan 2 - -> Result (cost=0.00..0.55 rows=4 width=4) + -> Result One-Time Filter: (f1.a = 2) Filter: ((f1.a)::double precision = random()) - -> Materialize (cost=0.00..0.45 rows=4 width=0) - -> Broadcast Motion 3:3 (slice3; segments: 3) (cost=0.00..0.40 rows=4 width=0) - -> Subquery Scan on abc_1 (cost=0.00..0.20 rows=4 width=0) - -> Shared Scan (share slice:id 3:0) (cost=6.46..6.67 rows=4 width=8) + -> Materialize + -> Broadcast Motion 3:3 (slice3; segments: 3) + -> Hash Join + Hash Cond: (onetimefilter1_1.a = onetimefilter2_1.a) + -> Seq Scan on onetimefilter1 onetimefilter1_1 + -> Hash + -> Seq Scan on onetimefilter2 onetimefilter2_1 SubPlan 3 - -> Result (cost=0.00..0.55 rows=4 width=4) - Filter: (abc_2.b = f1.b) - -> Materialize (cost=0.00..0.45 rows=4 width=4) - -> Broadcast Motion 3:3 (slice4; segments: 3) (cost=0.00..0.40 rows=4 width=4) - -> Subquery Scan on abc_2 (cost=0.00..0.20 rows=4 width=4) - -> Shared Scan (share slice:id 4:0) (cost=6.46..6.67 rows=4 width=8) + -> Result + Filter: (onetimefilter1_2.b = f1.b) + -> Materialize + -> Broadcast Motion 3:3 (slice4; segments: 3) + -> Hash Join + Hash Cond: (onetimefilter1_2.a = onetimefilter2_2.a) + -> Seq Scan on onetimefilter1 onetimefilter1_2 + -> Hash + -> Seq Scan on onetimefilter2 onetimefilter2_2 Optimizer: Postgres query optimizer -(39 rows) +(43 rows) WITH abc AS (SELECT onetimefilter1.a, onetimefilter1.b FROM onetimefilter1, onetimefilter2 WHERE onetimefilter1.a=onetimefilter2.a) SELECT (SELECT 1 FROM abc WHERE f1.b = f2.b LIMIT 1), COALESCE((SELECT 2 FROM abc WHERE f1.a=random() AND f1.a=2), 0), (SELECT b FROM abc WHERE b=f1.b) FROM onetimefilter1 f1, onetimefilter2 f2 WHERE f1.b = f2.b; ?column? 
| coalesce | b @@ -14195,19 +14199,19 @@ CREATE TABLE dist_tab_a (a varchar(15)) DISTRIBUTED BY(a); INSERT INTO dist_tab_a VALUES('1 '), ('2 '), ('3 '); CREATE TABLE dist_tab_b (a char(15), b bigint) DISTRIBUTED BY(a); INSERT INTO dist_tab_b VALUES('1 ', 1), ('2 ', 2), ('3 ', 3); -EXPLAIN CREATE TABLE result_tab AS +EXPLAIN(COSTS OFF) CREATE TABLE result_tab AS (SELECT a.a, b.b FROM dist_tab_a a LEFT JOIN dist_tab_b b ON a.a=b.a) DISTRIBUTED BY(a); - QUERY PLAN ------------------------------------------------------------------------------------------------------ - Redistribute Motion 3:3 (slice1; segments: 3) (cost=274.75..32892.98 rows=448330 width=56) + QUERY PLAN +------------------------------------------------------------ + Redistribute Motion 3:3 (slice1; segments: 3) Hash Key: a.a - -> Hash Left Join (cost=274.75..23926.38 rows=448330 width=56) + -> Hash Left Join Hash Cond: ((a.a)::bpchar = b.a) - -> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..453.00 rows=13967 width=48) + -> Redistribute Motion 3:3 (slice2; segments: 3) Hash Key: a.a - -> Seq Scan on dist_tab_a a (cost=0.00..173.67 rows=13967 width=48) - -> Hash (cost=141.00..141.00 rows=10700 width=72) - -> Seq Scan on dist_tab_b b (cost=0.00..141.00 rows=10700 width=72) + -> Seq Scan on dist_tab_a a + -> Hash + -> Seq Scan on dist_tab_b b Optimizer: Postgres query optimizer (10 rows) diff --git a/src/test/regress/expected/gporca_optimizer.out b/src/test/regress/expected/gporca_optimizer.out index 6928c834a5f..6c43ab45982 100644 --- a/src/test/regress/expected/gporca_optimizer.out +++ b/src/test/regress/expected/gporca_optimizer.out @@ -11419,51 +11419,51 @@ INSERT INTO onetimefilter1 SELECT i, i FROM generate_series(1,10)i; INSERT INTO onetimefilter2 SELECT i, i FROM generate_series(1,10)i; ANALYZE onetimefilter1; ANALYZE onetimefilter2; -EXPLAIN WITH abc AS (SELECT onetimefilter1.a, onetimefilter1.b FROM onetimefilter1, onetimefilter2 WHERE onetimefilter1.a=onetimefilter2.a) SELECT (SELECT 1 FROM abc WHERE f1.b = f2.b LIMIT 1), COALESCE((SELECT 2 FROM abc WHERE f1.a=random() AND f1.a=2), 0), (SELECT b FROM abc WHERE b=f1.b) FROM onetimefilter1 f1, onetimefilter2 f2 WHERE f1.b = f2.b; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1852062876647.47 rows=11 width=12) - -> Sequence (cost=0.00..1852062876647.47 rows=4 width=12) - -> Shared Scan (share slice:id 1:0) (cost=0.00..862.00 rows=4 width=1) - -> Hash Join (cost=0.00..862.00 rows=4 width=8) +EXPLAIN (COSTS OFF) WITH abc AS (SELECT onetimefilter1.a, onetimefilter1.b FROM onetimefilter1, onetimefilter2 WHERE onetimefilter1.a=onetimefilter2.a) SELECT (SELECT 1 FROM abc WHERE f1.b = f2.b LIMIT 1), COALESCE((SELECT 2 FROM abc WHERE f1.a=random() AND f1.a=2), 0), (SELECT b FROM abc WHERE b=f1.b) FROM onetimefilter1 f1, onetimefilter2 f2 WHERE f1.b = f2.b; + QUERY PLAN +---------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) + -> Sequence + -> Shared Scan (share slice:id 1:0) + -> Hash Join Hash Cond: (onetimefilter1.a = onetimefilter2.a) - -> Seq Scan on onetimefilter1 (cost=0.00..431.00 rows=4 width=8) - -> Hash (cost=431.00..431.00 rows=4 width=4) - -> Seq Scan on onetimefilter2 (cost=0.00..431.00 rows=4 width=4) - -> Hash Join (cost=0.00..862.00 rows=4 width=12) + -> Seq Scan on onetimefilter1 + -> Hash + -> Seq Scan on onetimefilter2 + -> 
Hash Join Hash Cond: (onetimefilter1_1.b = onetimefilter2_1.b) - -> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=4 width=8) + -> Redistribute Motion 3:3 (slice2; segments: 3) Hash Key: onetimefilter1_1.b - -> Seq Scan on onetimefilter1 onetimefilter1_1 (cost=0.00..431.00 rows=4 width=8) - -> Hash (cost=431.00..431.00 rows=4 width=4) - -> Redistribute Motion 3:3 (slice3; segments: 3) (cost=0.00..431.00 rows=4 width=4) + -> Seq Scan on onetimefilter1 onetimefilter1_1 + -> Hash + -> Redistribute Motion 3:3 (slice3; segments: 3) Hash Key: onetimefilter2_1.b - -> Seq Scan on onetimefilter2 onetimefilter2_1 (cost=0.00..431.00 rows=4 width=4) + -> Seq Scan on onetimefilter2 onetimefilter2_1 SubPlan 1 - -> Result (cost=0.00..431.01 rows=1 width=4) - -> Limit (cost=0.00..431.01 rows=1 width=1) - -> Result (cost=0.00..431.01 rows=5 width=1) + -> Result + -> Limit + -> Result One-Time Filter: (onetimefilter1_1.b = onetimefilter2_1.b) - -> Materialize (cost=0.00..431.00 rows=11 width=1) - -> Broadcast Motion 3:3 (slice4; segments: 3) (cost=0.00..431.00 rows=11 width=1) - -> Result (cost=0.00..431.00 rows=4 width=1) - -> Shared Scan (share slice:id 4:0) (cost=0.00..431.00 rows=4 width=1) + -> Materialize + -> Broadcast Motion 3:3 (slice4; segments: 3) + -> Result + -> Shared Scan (share slice:id 4:0) SubPlan 2 - -> Result (cost=0.00..431.33 rows=3 width=4) - -> Result (cost=0.00..431.33 rows=3 width=1) + -> Result + -> Result One-Time Filter: (onetimefilter1_1.a = 2) Filter: ((onetimefilter1_1.a)::double precision = random()) - -> Materialize (cost=0.00..431.00 rows=11 width=1) - -> Broadcast Motion 3:3 (slice5; segments: 3) (cost=0.00..431.00 rows=11 width=1) - -> Result (cost=0.00..431.00 rows=4 width=1) - -> Shared Scan (share slice:id 5:0) (cost=0.00..431.00 rows=4 width=1) + -> Materialize + -> Broadcast Motion 3:3 (slice5; segments: 3) + -> Result + -> Shared Scan (share slice:id 5:0) SubPlan 3 - -> Result (cost=0.00..431.66 rows=1 width=4) + -> Result Filter: (share0_ref3.b = onetimefilter1_1.b) - -> Materialize (cost=0.00..431.00 rows=11 width=4) - -> Broadcast Motion 3:3 (slice6; segments: 3) (cost=0.00..431.00 rows=11 width=4) - -> Result (cost=0.00..431.00 rows=4 width=4) - -> Shared Scan (share slice:id 6:0) (cost=0.00..431.00 rows=4 width=4) + -> Materialize + -> Broadcast Motion 3:3 (slice6; segments: 3) + -> Result + -> Shared Scan (share slice:id 6:0) Optimizer: Pivotal Optimizer (GPORCA) (43 rows) @@ -14551,20 +14551,20 @@ CREATE TABLE dist_tab_a (a varchar(15)) DISTRIBUTED BY(a); INSERT INTO dist_tab_a VALUES('1 '), ('2 '), ('3 '); CREATE TABLE dist_tab_b (a char(15), b bigint) DISTRIBUTED BY(a); INSERT INTO dist_tab_b VALUES('1 ', 1), ('2 ', 2), ('3 ', 3); -EXPLAIN CREATE TABLE result_tab AS +EXPLAIN(COSTS OFF) CREATE TABLE result_tab AS (SELECT a.a, b.b FROM dist_tab_a a LEFT JOIN dist_tab_b b ON a.a=b.a) DISTRIBUTED BY(a); - QUERY PLAN ------------------------------------------------------------------------------------------------------- - Result (cost=0.00..862.05 rows=2 width=16) - -> Redistribute Motion 3:3 (slice1; segments: 3) (cost=0.00..862.00 rows=2 width=16) + QUERY PLAN +------------------------------------------------------------------ + Result + -> Redistribute Motion 3:3 (slice1; segments: 3) Hash Key: dist_tab_a.a - -> Hash Left Join (cost=0.00..862.00 rows=1 width=16) + -> Hash Left Join Hash Cond: ((dist_tab_a.a)::bpchar = dist_tab_b.a) - -> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=1 width=8) + -> Redistribute 
Motion 3:3 (slice2; segments: 3) Hash Key: dist_tab_a.a - -> Seq Scan on dist_tab_a (cost=0.00..431.00 rows=1 width=8) - -> Hash (cost=431.00..431.00 rows=1 width=16) - -> Seq Scan on dist_tab_b (cost=0.00..431.00 rows=1 width=16) + -> Seq Scan on dist_tab_a + -> Hash + -> Seq Scan on dist_tab_b Optimizer: Pivotal Optimizer (GPORCA) (11 rows) diff --git a/src/test/regress/expected/guc_gp.out b/src/test/regress/expected/guc_gp.out index 83ba24eac4b..9f6c18e7cef 100644 --- a/src/test/regress/expected/guc_gp.out +++ b/src/test/regress/expected/guc_gp.out @@ -457,3 +457,49 @@ SELECT gp_inject_fault('restore_string_guc', 'reset', 1); Success: (1 row) +-- enabling gp_force_random_redistribution makes sure random redistribution happens +-- only relevant to postgres optimizer +set optimizer = false; +create table t1_dist_rand(a int) distributed randomly; +create table t2_dist_rand(a int) distributed randomly; +create table t_dist_hash(a int) distributed by (a); +-- with the GUC turned off, redistribution won't happen (no redistribution motion) +set gp_force_random_redistribution = false; +explain (costs off) insert into t2_dist_rand select * from t1_dist_rand; + QUERY PLAN +------------------------------------- + Insert on t2_dist_rand + -> Seq Scan on t1_dist_rand + Optimizer: Postgres query optimizer +(3 rows) + +explain (costs off) insert into t2_dist_rand select * from t_dist_hash; + QUERY PLAN +------------------------------------- + Insert on t2_dist_rand + -> Seq Scan on t_dist_hash + Optimizer: Postgres query optimizer +(3 rows) + +-- with the GUC turned on, redistribution would happen +set gp_force_random_redistribution = true; +explain (costs off) insert into t2_dist_rand select * from t1_dist_rand; + QUERY PLAN +------------------------------------------------------ + Insert on t2_dist_rand + -> Redistribute Motion 3:3 (slice1; segments: 3) + -> Seq Scan on t1_dist_rand + Optimizer: Postgres query optimizer +(4 rows) + +explain (costs off) insert into t2_dist_rand select * from t_dist_hash; + QUERY PLAN +------------------------------------------------------ + Insert on t2_dist_rand + -> Redistribute Motion 3:3 (slice1; segments: 3) + -> Seq Scan on t_dist_hash + Optimizer: Postgres query optimizer +(4 rows) + +reset gp_force_random_redistribution; +reset optimizer; diff --git a/src/test/regress/expected/incremental_sort.out b/src/test/regress/expected/incremental_sort.out index efa3c3d2517..2aa1a4a1091 100644 --- a/src/test/regress/expected/incremental_sort.out +++ b/src/test/regress/expected/incremental_sort.out @@ -566,6 +566,7 @@ select * from (select * from t order by a) s order by a, b limit 55; (55 rows) -- Test EXPLAIN ANALYZE with only a fullsort group. 
+set max_parallel_workers_per_gather = 0; select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 55'); explain_analyze_without_memory --------------------------------------------------------------------------------------------------------------------------- @@ -584,6 +585,7 @@ select explain_analyze_without_memory('select * from (select * from t order by a Optimizer: Postgres query optimizer (13 rows) +reset max_parallel_workers_per_gather; select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 55')); jsonb_pretty ------------------------------------------------- @@ -789,6 +791,7 @@ select * from t left join (select * from (select * from t order by a) v order by rollback; -- Test EXPLAIN ANALYZE with both fullsort and presorted groups. +set max_parallel_workers_per_gather = 0; select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 70'); explain_analyze_without_memory ---------------------------------------------------------------------------------------------------------------- @@ -808,6 +811,7 @@ select explain_analyze_without_memory('select * from (select * from t order by a Optimizer: Postgres query optimizer (14 rows) +reset max_parallel_workers_per_gather; select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 70')); jsonb_pretty ------------------------------------------------- diff --git a/src/test/regress/expected/incremental_sort_optimizer.out b/src/test/regress/expected/incremental_sort_optimizer.out index d8c41b85f40..5e549c3fb31 100644 --- a/src/test/regress/expected/incremental_sort_optimizer.out +++ b/src/test/regress/expected/incremental_sort_optimizer.out @@ -541,6 +541,7 @@ select * from (select * from t order by a) s order by a, b limit 55; (55 rows) -- Test EXPLAIN ANALYZE with only a fullsort group. +set max_parallel_workers_per_gather = 0; select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 55'); explain_analyze_without_memory ------------------------------------------------------------------------- @@ -555,6 +556,7 @@ select explain_analyze_without_memory('select * from (select * from t order by a Optimizer: Pivotal Optimizer (GPORCA) (9 rows) +reset max_parallel_workers_per_gather; select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 55')); jsonb_pretty -------------- @@ -721,6 +723,7 @@ select * from t left join (select * from (select * from t order by a) v order by rollback; -- Test EXPLAIN ANALYZE with both fullsort and presorted groups. 
+set max_parallel_workers_per_gather = 0; select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 70'); explain_analyze_without_memory ------------------------------------------------------------------------- @@ -736,6 +739,7 @@ select explain_analyze_without_memory('select * from (select * from t order by a Optimizer: Pivotal Optimizer (GPORCA) (10 rows) +reset max_parallel_workers_per_gather; select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 70')); jsonb_pretty -------------- @@ -1496,7 +1500,7 @@ from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub; explain (costs off) select sub.unique1, stringu1 from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub order by 1, 2; - QUERY PLAN + QUERY PLAN ----------------------------------------------------------- Gather Motion 3:1 (slice1; segments: 3) Merge Key: tenk1.unique1, tenk1.stringu1 @@ -1528,7 +1532,7 @@ from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub; explain (costs off) select sub.unique1, md5(stringu1) from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub order by 1, 2; - QUERY PLAN + QUERY PLAN ---------------------------------------------------------------------------- Gather Motion 3:1 (slice1; segments: 3) Merge Key: tenk1.unique1, (md5((tenk1.stringu1)::text)) diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index a4ecfa11f2a..a6428f6824e 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -814,7 +814,7 @@ insert into mcrparted4 values (30, 21, 20); -- error ERROR: new row for relation "mcrparted4" violates partition constraint DETAIL: Failing row contains (30, 21, 20). 
-- check rows -select tableoid::regclass::text, * from mcrparted order by 1; +select tableoid::regclass::text, * from mcrparted order by 1, 2, 3; tableoid | a | b | c ------------+----+------+------ mcrparted0 | 0 | 1 | 1 diff --git a/src/test/regress/expected/misc_sanity.out b/src/test/regress/expected/misc_sanity.out index 20c5ab5b9cd..13947c53237 100644 --- a/src/test/regress/expected/misc_sanity.out +++ b/src/test/regress/expected/misc_sanity.out @@ -122,7 +122,18 @@ ORDER BY 1, 2; pg_resqueuecapability | ressetting | text pg_stat_last_operation | stasubtype | text pg_stat_last_shoperation | stasubtype | text -(19 rows) + pg_task | command | text + pg_task | database | text + pg_task | jobname | text + pg_task | nodename | text + pg_task | schedule | text + pg_task | username | text + pg_task_run_history | command | text + pg_task_run_history | database | text + pg_task_run_history | return_message | text + pg_task_run_history | status | text + pg_task_run_history | username | text +(30 rows) -- system catalogs without primary keys -- diff --git a/src/test/regress/expected/misc_sanity_external_fts.out b/src/test/regress/expected/misc_sanity_external_fts.out index 0e65c6eafeb..06af9c3fed1 100644 --- a/src/test/regress/expected/misc_sanity_external_fts.out +++ b/src/test/regress/expected/misc_sanity_external_fts.out @@ -121,7 +121,18 @@ ORDER BY 1, 2; pg_resqueuecapability | ressetting | text pg_stat_last_operation | stasubtype | text pg_stat_last_shoperation | stasubtype | text -(19 rows) + pg_task | command | text + pg_task | database | text + pg_task | jobname | text + pg_task | nodename | text + pg_task | schedule | text + pg_task | username | text + pg_task_run_history | command | text + pg_task_run_history | database | text + pg_task_run_history | return_message | text + pg_task_run_history | status | text + pg_task_run_history | username | text +(30 rows) -- system catalogs without primary keys -- diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index a80a213930a..9a69e775955 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -2249,6 +2249,7 @@ $$ declare ln text; begin + set local enable_parallel = off; for ln in execute format('explain (analyze, costs off, summary off, timing off) %s', $1) @@ -2258,6 +2259,7 @@ begin ln := regexp_replace(ln, 'Rows Removed by Filter: \d+', 'Rows Removed by Filter: N'); return next ln; end loop; + reset enable_parallel; end; $$; prepare ab_q4 (int, int) as diff --git a/src/test/regress/expected/partition_prune_optimizer.out b/src/test/regress/expected/partition_prune_optimizer.out index f79915bd574..60fb53da0b5 100644 --- a/src/test/regress/expected/partition_prune_optimizer.out +++ b/src/test/regress/expected/partition_prune_optimizer.out @@ -2304,6 +2304,7 @@ $$ declare ln text; begin + set local enable_parallel = off; for ln in execute format('explain (analyze, costs off, summary off, timing off) %s', $1) @@ -2313,6 +2314,7 @@ begin ln := regexp_replace(ln, 'Rows Removed by Filter: \d+', 'Rows Removed by Filter: N'); return next ln; end loop; + reset enable_parallel; end; $$; prepare ab_q4 (int, int) as diff --git a/src/test/regress/expected/pg_stat.out b/src/test/regress/expected/pg_stat.out index ead3373cca5..80e4e4ff132 100644 --- a/src/test/regress/expected/pg_stat.out +++ b/src/test/regress/expected/pg_stat.out @@ -1,4 +1,5 @@ set optimizer to off; +set max_parallel_workers_per_gather=0; drop table if exists 
pg_stat_test; create table pg_stat_test(a int); select @@ -94,3 +95,4 @@ from pg_stat_user_indexes where relname = 'pg_stat_test'; (1 row) reset optimizer; +reset max_parallel_workers_per_gather; diff --git a/src/test/regress/expected/qp_misc.out b/src/test/regress/expected/qp_misc.out index 8655eff8d04..5dbf5419ad6 100644 --- a/src/test/regress/expected/qp_misc.out +++ b/src/test/regress/expected/qp_misc.out @@ -17066,6 +17066,7 @@ f1,f2 SelectThaiColumnLower_p1 | 1 (1 row) +set max_parallel_workers_per_gather=0; -- SelectThaiColumnOrderByLocal_p1 select 'SelectThaiColumnOrderByLocal_p1' test_name_part, case when c = 1 then 1 else 0 end pass_ind from ( select count(distinct c) c from ( @@ -17154,6 +17155,7 @@ f1,f2 SelectThaiColumnOrderByLocal_p1 | 1 (1 row) +reset max_parallel_workers_per_gather; -- SelectThaiColumnWhere_p1 select 'SelectThaiColumnWhere_p1' test_name_part, case when c = 1 then 1 else 0 end pass_ind from ( select count(distinct c) c from ( diff --git a/src/test/regress/expected/qp_query_execution.out b/src/test/regress/expected/qp_query_execution.out index b8925e51e6e..9b5b3a4c6cb 100644 --- a/src/test/regress/expected/qp_query_execution.out +++ b/src/test/regress/expected/qp_query_execution.out @@ -7,6 +7,7 @@ set search_path to qp_query_execution; create language plpython3u; create or replace function qx_count_operator(query text, planner_operator text, optimizer_operator text) returns int as $$ +plpy.execute('set max_parallel_workers_per_gather=0') rv = plpy.execute('EXPLAIN '+ query) plan = '\n'.join([row['QUERY PLAN'] for row in rv]) optimizer = plan.find('Pivotal Optimizer (GPORCA)') diff --git a/src/test/regress/expected/qp_targeted_dispatch.out b/src/test/regress/expected/qp_targeted_dispatch.out index 57cee210fe6..793ade8a9e8 100644 --- a/src/test/regress/expected/qp_targeted_dispatch.out +++ b/src/test/regress/expected/qp_targeted_dispatch.out @@ -717,6 +717,7 @@ set test_print_direct_dispatch_info=on; alter table table_a set distributed randomly; INFO: Distributed transaction command 'Distributed Prepare' to ALL contents: 0 1 2 INFO: Distributed transaction command 'Distributed Commit Prepared' to ALL contents: 0 1 2 +set enable_parallel = off; select max(a0) from table_a where a0=3; INFO: (slice 2) Dispatch command to ALL contents: 0 1 2 max @@ -777,6 +778,7 @@ INFO: (slice 2) Dispatch command to SINGLE content 1 (1 row) +reset enable_parallel; explain select a0 from table_a where a0 in (select max(a1) from table_a where a0=1); QUERY PLAN ---------------------------------------------------------------------------------------------------------- diff --git a/src/test/regress/expected/qp_targeted_dispatch_optimizer.out b/src/test/regress/expected/qp_targeted_dispatch_optimizer.out index e2d9b3461c4..8c9b384e4a6 100644 --- a/src/test/regress/expected/qp_targeted_dispatch_optimizer.out +++ b/src/test/regress/expected/qp_targeted_dispatch_optimizer.out @@ -727,6 +727,7 @@ set test_print_direct_dispatch_info=on; alter table table_a set distributed randomly; INFO: Distributed transaction command 'Distributed Prepare' to ALL contents: 0 1 2 INFO: Distributed transaction command 'Distributed Commit Prepared' to ALL contents: 0 1 2 +set enable_parallel = off; select max(a0) from table_a where a0=3; INFO: (slice 1) Dispatch command to ALL contents: 0 1 2 max @@ -787,6 +788,7 @@ INFO: (slice 1) Dispatch command to SINGLE content 1 (1 row) +reset enable_parallel; explain select a0 from table_a where a0 in (select max(a1) from table_a where a0=1); QUERY PLAN 
------------------------------------------------------------------------------------------------------ diff --git a/src/test/regress/expected/rangefuncs_cdb.out b/src/test/regress/expected/rangefuncs_cdb.out index 23b5b75c371..296c7d22e32 100644 --- a/src/test/regress/expected/rangefuncs_cdb.out +++ b/src/test/regress/expected/rangefuncs_cdb.out @@ -1,4 +1,8 @@ -SELECT name, setting FROM pg_settings WHERE name LIKE 'enable%'; +-- +-- Will run in parallel mode with enable_parallel=on and in non-parallel mode. +-- Filter out this GUC to pass regression. +-- +SELECT name, setting FROM pg_settings WHERE name LIKE 'enable%' and name != 'enable_parallel'; name | setting --------------------------------+--------- enable_async_append | on diff --git a/src/test/regress/expected/sanity_check.out b/src/test/regress/expected/sanity_check.out index 89c913e4d75..d79ae943abc 100644 --- a/src/test/regress/expected/sanity_check.out +++ b/src/test/regress/expected/sanity_check.out @@ -171,6 +171,8 @@ pg_statistic_ext_data|t pg_subscription|t pg_subscription_rel|t pg_tablespace|t +pg_task|t +pg_task_run_history|t pg_transform|t pg_trigger|t pg_ts_config|t diff --git a/src/test/regress/expected/segspace.out b/src/test/regress/expected/segspace.out index 5dd285b5776..7ccf7a813f5 100644 --- a/src/test/regress/expected/segspace.out +++ b/src/test/regress/expected/segspace.out @@ -1,6 +1,8 @@ -- -- Tests the spill files disk space accounting mechanism -- +-- GPDB_PARALLEL_FIXME: it's hard to make fault_injection work with parallel processes. +set enable_parallel = false; -- check segspace before test reset statement_mem; select max(bytes) as max, min(bytes) as min from gp_toolkit.gp_workfile_mgr_used_diskspace; @@ -45,6 +47,39 @@ set gp_autostats_mode = none; begin; SELECT t1.* FROM segspace_test_hj_skew AS t1, segspace_test_hj_skew AS t2 WHERE t1.i1=t2.i2; ERROR: canceling MPP operation (seg0 slice2 127.0.0.1:25432 pid=26876) +rollback; +-- +-- GPDB parallel once got errors like: +-- could not read from shared tuplestore temporary file: read only 0 of 8 bytes from file. +-- Enable parallel here to test it. +-- +begin; +set local enable_parallel = true; +set local optimizer=off; +set local min_parallel_table_scan_size=0; +set local min_parallel_index_scan_size = 0; +set local force_parallel_mode=1; +EXPLAIN(COSTS OFF) SELECT t1.* FROM segspace_test_hj_skew AS t1, segspace_test_hj_skew AS t2 WHERE t1.i1=t2.i2; + QUERY PLAN +----------------------------------------------------------------------- + Gather Motion 6:1 (slice1; segments: 6) + -> Parallel Hash Join + Hash Cond: (t1.i1 = t2.i2) + -> Parallel Seq Scan on segspace_test_hj_skew t1 + -> Parallel Hash + -> Redistribute Motion 6:6 (slice2; segments: 6) + Hash Key: t2.i2 + Hash Module: 3 + -> Parallel Seq Scan on segspace_test_hj_skew t2 + Optimizer: Postgres query optimizer +(10 rows) + +SELECT count(t1.*) FROM segspace_test_hj_skew AS t1, segspace_test_hj_skew AS t2 WHERE t1.i1=t2.i2; + count +-------- + 750000 +(1 row) + rollback; select gp_inject_fault('exec_hashjoin_new_batch', 'status', 2); gp_inject_fault @@ -469,3 +504,11 @@ NOTICE: caught exception: invalid input syntax for type integer: "bogus" (1 row) drop table segspace_test_hj_skew; +reset enable_parallel; +-- don't disturb other processes.
+select cleanupAllGangs(); + cleanupallgangs +----------------- + t +(1 row) + diff --git a/src/test/regress/expected/select_parallel.out b/src/test/regress/expected/select_parallel.out index d40653e3584..bf27948896c 100644 --- a/src/test/regress/expected/select_parallel.out +++ b/src/test/regress/expected/select_parallel.out @@ -1,10 +1,9 @@ -- -- PARALLEL +-- We have GP-style parallel now; run this file in parallel mode. -- --- GPDB_96_MERGE_FIXME: We don't support parallel query. These tests won't actually --- generate any parallel plans. Should we pay attention to the parallel restrictions --- when creating MPP plans? For example, should we force parallel restricted functions --- to run in the QD? +set enable_parallel = on; +set optimizer = off; create function sp_parallel_restricted(int) returns int as $$begin return $1; end$$ language plpgsql parallel restricted; begin; @@ -16,18 +15,18 @@ set max_parallel_workers_per_gather=4; -- Parallel Append with partial-subplans explain (costs off) select round(avg(aa)), sum(aa) from a_star; - QUERY PLAN ------------------------------------------------- + QUERY PLAN +----------------------------------------------------- Finalize Aggregate - -> Gather Motion 3:1 (slice1; segments: 3) + -> Gather Motion 9:1 (slice1; segments: 9) -> Partial Aggregate - -> Append - -> Seq Scan on a_star a_star_1 - -> Seq Scan on b_star a_star_2 - -> Seq Scan on c_star a_star_3 + -> Parallel Append -> Seq Scan on d_star a_star_4 - -> Seq Scan on e_star a_star_5 -> Seq Scan on f_star a_star_6 + -> Seq Scan on e_star a_star_5 + -> Seq Scan on b_star a_star_2 + -> Seq Scan on c_star a_star_3 + -> Seq Scan on a_star a_star_1 Optimizer: Postgres query optimizer (11 rows) @@ -42,18 +41,18 @@ alter table c_star set (parallel_workers = 0); alter table d_star set (parallel_workers = 0); explain (costs off) select round(avg(aa)), sum(aa) from a_star; - QUERY PLAN ------------------------------------------------- + QUERY PLAN +----------------------------------------------------- Finalize Aggregate - -> Gather Motion 3:1 (slice1; segments: 3) + -> Gather Motion 9:1 (slice1; segments: 9) -> Partial Aggregate - -> Append - -> Seq Scan on a_star a_star_1 - -> Seq Scan on b_star a_star_2 - -> Seq Scan on c_star a_star_3 + -> Parallel Append -> Seq Scan on d_star a_star_4 - -> Seq Scan on e_star a_star_5 -> Seq Scan on f_star a_star_6 + -> Seq Scan on e_star a_star_5 + -> Seq Scan on b_star a_star_2 + -> Seq Scan on c_star a_star_3 + -> Seq Scan on a_star a_star_1 Optimizer: Postgres query optimizer (11 rows) @@ -70,18 +69,18 @@ alter table e_star set (parallel_workers = 0); alter table f_star set (parallel_workers = 0); explain (costs off) select round(avg(aa)), sum(aa) from a_star; - QUERY PLAN ------------------------------------------------- + QUERY PLAN +----------------------------------------------------- Finalize Aggregate - -> Gather Motion 3:1 (slice1; segments: 3) + -> Gather Motion 9:1 (slice1; segments: 9) -> Partial Aggregate - -> Append - -> Seq Scan on a_star a_star_1 - -> Seq Scan on b_star a_star_2 - -> Seq Scan on c_star a_star_3 + -> Parallel Append -> Seq Scan on d_star a_star_4 - -> Seq Scan on e_star a_star_5 -> Seq Scan on f_star a_star_6 + -> Seq Scan on e_star a_star_5 + -> Seq Scan on b_star a_star_2 + -> Seq Scan on c_star a_star_3 + -> Seq Scan on a_star a_star_1 Optimizer: Postgres query optimizer (11 rows) @@ -101,8 +100,8 @@ alter table f_star reset (parallel_workers); set enable_parallel_append to off; explain (costs off) select round(avg(aa)),
sum(aa) from a_star; - QUERY PLAN --------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------- Finalize Aggregate -> Gather Motion 3:1 (slice1; segments: 3) -> Partial Aggregate @@ -136,8 +135,12 @@ select sp_test_func() order by 1; -- Parallel Append is not to be used when the subpath depends on the outer param create table part_pa_test(a int, b int) partition by range(a); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. create table part_pa_test_p1 partition of part_pa_test for values from (minvalue) to (0); +NOTICE: table has parent, setting distribution columns to match parent table create table part_pa_test_p2 partition of part_pa_test for values from (0) to (maxvalue); +NOTICE: table has parent, setting distribution columns to match parent table explain (costs off) select (select max((select pa1.b from part_pa_test pa1 where pa1.a = pa2.a))) from part_pa_test pa2; @@ -170,9 +173,9 @@ explain (costs off) QUERY PLAN --------------------------------------------------------- Finalize Aggregate - -> Gather Motion 3:1 (slice1; segments: 3) + -> Gather Motion 12:1 (slice1; segments: 12) -> Partial Aggregate - -> Seq Scan on tenk1 + -> Parallel Seq Scan on tenk1 Filter: (stringu1 = 'GRAAAA'::name) Optimizer: Postgres query optimizer (6 rows) @@ -191,9 +194,9 @@ explain (costs off) QUERY PLAN --------------------------------------------------------- Finalize Aggregate - -> Gather Motion 3:1 (slice1; segments: 3) + -> Gather Motion 12:1 (slice1; segments: 12) -> Partial Aggregate - -> Seq Scan on tenk1 + -> Parallel Seq Scan on tenk1 Filter: (stringu1 = 'GRAAAA'::name) Optimizer: Postgres query optimizer (6 rows) @@ -211,8 +214,8 @@ alter table tenk1 set (parallel_workers = 4); explain (verbose, costs off) select sp_parallel_restricted(unique1) from tenk1 where stringu1 = 'GRAAAA' order by 1; - QUERY PLAN -------------------------------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------------------------------- Gather Motion 3:1 (slice1; segments: 3) Output: (sp_parallel_restricted(unique1)) Merge Key: (sp_parallel_restricted(unique1)) @@ -222,25 +225,24 @@ select sp_parallel_restricted(unique1) from tenk1 -> Seq Scan on public.tenk1 Output: sp_parallel_restricted(unique1) Filter: (tenk1.stringu1 = 'GRAAAA'::name) + Settings: enable_parallel = 'on', min_parallel_table_scan_size = '0', optimizer = 'off', parallel_setup_cost = '0', parallel_tuple_cost = '0' Optimizer: Postgres query optimizer - Settings: min_parallel_table_scan_size=0, optimizer=off, parallel_setup_cost=0, parallel_tuple_cost=0 (11 rows) -- test parallel plan when group by expression is in target list. 
explain (costs off) select length(stringu1) from tenk1 group by length(stringu1); - QUERY PLAN ------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) + QUERY PLAN +--------------------------------------------------------------- + Gather Motion 12:1 (slice1; segments: 12) -> HashAggregate Group Key: (length((stringu1)::text)) - -> Redistribute Motion 3:3 (slice2; segments: 3) + -> Redistribute Motion 12:12 (slice2; segments: 12) Hash Key: (length((stringu1)::text)) - -> HashAggregate - Group Key: length((stringu1)::text) - -> Seq Scan on tenk1 + Hash Module: 3 + -> Parallel Seq Scan on tenk1 Optimizer: Postgres query optimizer -(9 rows) +(8 rows) select length(stringu1) from tenk1 group by length(stringu1); length @@ -250,21 +252,20 @@ select length(stringu1) from tenk1 group by length(stringu1); explain (costs off) select stringu1, count(*) from tenk1 group by stringu1 order by stringu1; - QUERY PLAN ------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) + QUERY PLAN +--------------------------------------------------------------------- + Gather Motion 12:1 (slice1; segments: 12) Merge Key: stringu1 -> Sort Sort Key: stringu1 - -> Finalize HashAggregate + -> HashAggregate Group Key: stringu1 - -> Redistribute Motion 3:3 (slice2; segments: 3) + -> Redistribute Motion 12:12 (slice2; segments: 12) Hash Key: stringu1 - -> Partial HashAggregate - Group Key: stringu1 - -> Seq Scan on tenk1 + Hash Module: 3 + -> Parallel Seq Scan on tenk1 Optimizer: Postgres query optimizer -(10 rows) +(11 rows) -- test that parallel plan for aggregates is not selected when -- target list contains parallel restricted clause. @@ -285,12 +286,12 @@ explain (costs off) -- test prepared statement prepare tenk1_count(integer) As select count((unique1)) from tenk1 where hundred > $1; explain (costs off) execute tenk1_count(1); - QUERY PLAN ------------------------------------------------- + QUERY PLAN +-------------------------------------------------- Finalize Aggregate - -> Gather Motion 3:1 (slice1; segments: 3) + -> Gather Motion 12:1 (slice1; segments: 12) -> Partial Aggregate - -> Seq Scan on tenk1 + -> Parallel Seq Scan on tenk1 Filter: (hundred > 1) Optimizer: Postgres query optimizer (6 rows) @@ -310,12 +311,12 @@ explain (costs off) QUERY PLAN -------------------------------------------------------------------------------------------- Aggregate - -> Gather Motion 3:1 (slice1; segments: 3) + -> Gather Motion 12:1 (slice1; segments: 12) -> Nested Loop Left Anti Semi (Not-In) Join Join Filter: ((tenk1.two = tenk2.hundred) AND (tenk1.four = tenk2.thousand)) - -> Seq Scan on tenk1 + -> Parallel Seq Scan on tenk1 -> Materialize - -> Broadcast Motion 3:3 (slice2; segments: 3) + -> Broadcast Motion 3:12 (slice2; segments: 3) -> Seq Scan on tenk2 Filter: (thousand > 100) Optimizer: Postgres query optimizer @@ -358,11 +359,11 @@ explain (costs off) Aggregate InitPlan 1 (returns $1) (slice2) -> Finalize Aggregate - -> Gather Motion 3:1 (slice3; segments: 3) + -> Gather Motion 6:1 (slice3; segments: 6) -> Partial Aggregate - -> Seq Scan on tenk2 - -> Gather Motion 3:1 (slice1; segments: 3) - -> Seq Scan on tenk1 + -> Parallel Seq Scan on tenk2 + -> Gather Motion 12:1 (slice1; segments: 12) + -> Parallel Seq Scan on tenk1 Filter: (unique1 = $1) Optimizer: Postgres query optimizer (10 rows) @@ -383,12 +384,12 @@ set enable_seqscan to off; set enable_bitmapscan to off; explain (costs off) select count((unique1)) from 
tenk1 where hundred > 1; - QUERY PLAN ------------------------------------------------------------ + QUERY PLAN +-------------------------------------------------------------------- Finalize Aggregate - -> Gather Motion 3:1 (slice1; segments: 3) + -> Gather Motion 12:1 (slice1; segments: 12) -> Partial Aggregate - -> Index Scan using tenk1_hundred on tenk1 + -> Parallel Index Scan using tenk1_hundred on tenk1 Index Cond: (hundred > 1) Optimizer: Postgres query optimizer (6 rows) @@ -402,12 +403,12 @@ select count((unique1)) from tenk1 where hundred > 1; -- test parallel index-only scans. explain (costs off) select count(*) from tenk1 where thousand > 95; - QUERY PLAN ------------------------------------------------------------------------ + QUERY PLAN +-------------------------------------------------------------------------------- Finalize Aggregate - -> Gather Motion 3:1 (slice1; segments: 3) + -> Gather Motion 12:1 (slice1; segments: 12) -> Partial Aggregate - -> Index Only Scan using tenk1_thous_tenthous on tenk1 + -> Parallel Index Only Scan using tenk1_thous_tenthous on tenk1 Index Cond: (thousand > 95) Optimizer: Postgres query optimizer (6 rows) @@ -539,13 +540,13 @@ explain (costs off) QUERY PLAN ------------------------------------------------------------------------------ Finalize Aggregate - -> Gather Motion 3:1 (slice1; segments: 3) + -> Gather Motion 12:1 (slice1; segments: 12) -> Partial Aggregate -> Nested Loop - -> Seq Scan on tenk2 + -> Parallel Seq Scan on tenk2 Filter: (thousand = 0) -> Materialize - -> Broadcast Motion 3:3 (slice2; segments: 3) + -> Broadcast Motion 3:12 (slice2; segments: 3) -> Bitmap Heap Scan on tenk1 Recheck Cond: (hundred > 1) -> Bitmap Index Scan on tenk1_hundred @@ -560,6 +561,8 @@ select count(*) from tenk1, tenk2 where tenk1.hundred > 1 and tenk2.thousand=0; (1 row) create table bmscantest (a int, t text); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. 
insert into bmscantest select r, 'fooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' FROM generate_series(1,100000) r; create index i_bmtest ON bmscantest(a); select count(*) from bmscantest where a>1; @@ -610,6 +613,8 @@ begin end loop; end; $$; +-- test sort stats plan, disable parallel +set max_parallel_workers_per_gather = 0; select * from explain_parallel_sort_stats(); explain_parallel_sort_stats ---------------------------------------------------------------------------------- @@ -626,6 +631,7 @@ select * from explain_parallel_sort_stats(); Optimizer: Postgres query optimizer (11 rows) +reset max_parallel_workers_per_gather; reset enable_indexscan; reset enable_hashjoin; reset enable_mergejoin; @@ -665,19 +671,23 @@ reset enable_nestloop; set enable_hashagg = false; explain (costs off) select count(*) from tenk1 group by twenty; - QUERY PLAN ------------------------------------------------- - Finalize GroupAggregate - Group Key: twenty - -> Gather Motion 3:1 (slice1; segments: 3) - Merge Key: twenty - -> Partial GroupAggregate - Group Key: twenty - -> Sort - Sort Key: twenty - -> Seq Scan on tenk1 + QUERY PLAN +------------------------------------------------------------------ + Gather Motion 6:1 (slice1; segments: 6) + -> Finalize GroupAggregate + Group Key: twenty + -> Sort + Sort Key: twenty + -> Redistribute Motion 6:6 (slice2; segments: 6) + Hash Key: twenty + Hash Module: 3 + -> Partial GroupAggregate + Group Key: twenty + -> Sort + Sort Key: twenty + -> Parallel Seq Scan on tenk1 Optimizer: Postgres query optimizer -(10 rows) +(14 rows) select count(*) from tenk1 group by twenty; count @@ -713,9 +723,9 @@ end; $$ language plpgsql PARALLEL SAFE; explain (costs off, verbose) select ten, sp_simple_func(ten) from tenk1 where ten < 100 order by ten; - QUERY PLAN ---------------------------------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Gather Motion 6:1 (slice1; segments: 6) Output: ten, (sp_simple_func(ten)) Merge Key: ten -> Result @@ -723,31 +733,35 @@ explain (costs off, verbose) -> Sort Output: ten Sort Key: tenk1.ten - -> Seq Scan on public.tenk1 + -> Parallel Seq Scan on public.tenk1 Output: ten Filter: (tenk1.ten < 100) + Settings: enable_hashagg = 'off', enable_parallel = 'on', min_parallel_table_scan_size = '0', optimizer = 'off', parallel_setup_cost = '0', parallel_tuple_cost = '0' Optimizer: Postgres query optimizer - Settings: enable_hashagg=off, min_parallel_table_scan_size=0, optimizer=off, parallel_setup_cost=0, parallel_tuple_cost=0 (13 rows) drop function sp_simple_func(integer); -- test handling of SRFs in targetlist (bug in 10.0) explain (costs off) select count(*), generate_series(1,2) from tenk1 group by twenty; - QUERY PLAN ------------------------------------------------------- - ProjectSet - -> Finalize GroupAggregate - Group Key: twenty - -> Gather Motion 3:1 (slice1; segments: 3) - Merge Key: twenty - -> Partial GroupAggregate - Group Key: twenty - -> Sort - Sort Key: twenty - -> Seq Scan on tenk1 + QUERY PLAN +------------------------------------------------------------------------ + Gather Motion 6:1 (slice1; segments: 6) + -> ProjectSet + -> Finalize GroupAggregate + Group Key: twenty + -> Sort + Sort Key: twenty + -> Redistribute Motion 6:6 (slice2; segments: 6) + 
Hash Key: twenty + Hash Module: 3 + -> Partial GroupAggregate + Group Key: twenty + -> Sort + Sort Key: twenty + -> Parallel Seq Scan on tenk1 Optimizer: Postgres query optimizer -(11 rows) +(15 rows) select count(*), generate_series(1,2) from tenk1 group by twenty; count | generate_series @@ -798,19 +812,23 @@ select count(*), generate_series(1,2) from tenk1 group by twenty; set parallel_leader_participation = off; explain (costs off) select count(*) from tenk1 group by twenty; - QUERY PLAN ------------------------------------------------- - Finalize GroupAggregate - Group Key: twenty - -> Gather Motion 3:1 (slice1; segments: 3) - Merge Key: twenty - -> Partial GroupAggregate - Group Key: twenty - -> Sort - Sort Key: twenty - -> Seq Scan on tenk1 + QUERY PLAN +------------------------------------------------------------------ + Gather Motion 6:1 (slice1; segments: 6) + -> Finalize GroupAggregate + Group Key: twenty + -> Sort + Sort Key: twenty + -> Redistribute Motion 6:6 (slice2; segments: 6) + Hash Key: twenty + Hash Module: 3 + -> Partial GroupAggregate + Group Key: twenty + -> Sort + Sort Key: twenty + -> Parallel Seq Scan on tenk1 Optimizer: Postgres query optimizer -(10 rows) +(14 rows) select count(*) from tenk1 group by twenty; count @@ -873,16 +891,16 @@ select * from ---------+-------+--- AAAAxx | 2500 | 1 HHHHxx | 2500 | 1 - OOOOxx | 2500 | 1 VVVVxx | 2500 | 1 + OOOOxx | 2500 | 1 AAAAxx | 2500 | 2 HHHHxx | 2500 | 2 - OOOOxx | 2500 | 2 VVVVxx | 2500 | 2 + OOOOxx | 2500 | 2 AAAAxx | 2500 | 3 HHHHxx | 2500 | 3 - OOOOxx | 2500 | 3 VVVVxx | 2500 | 3 + OOOOxx | 2500 | 3 (12 rows) reset enable_material; @@ -893,9 +911,9 @@ select avg(unique1::int8) from tenk1; QUERY PLAN ------------------------------------------------ Finalize Aggregate - -> Gather Motion 3:1 (slice1; segments: 3) + -> Gather Motion 6:1 (slice1; segments: 6) -> Partial Aggregate - -> Seq Scan on tenk1 + -> Parallel Seq Scan on tenk1 Optimizer: Postgres query optimizer (5 rows) @@ -908,15 +926,15 @@ select avg(unique1::int8) from tenk1; -- gather merge test with a LIMIT explain (costs off) select fivethous from tenk1 order by fivethous limit 4; - QUERY PLAN ------------------------------------------------- + QUERY PLAN +---------------------------------------------------- Limit - -> Gather Motion 3:1 (slice1; segments: 3) + -> Gather Motion 6:1 (slice1; segments: 6) Merge Key: fivethous -> Limit -> Sort Sort Key: fivethous - -> Seq Scan on tenk1 + -> Parallel Seq Scan on tenk1 Optimizer: Postgres query optimizer (8 rows) @@ -933,15 +951,15 @@ select fivethous from tenk1 order by fivethous limit 4; set max_parallel_workers = 0; explain (costs off) select string4 from tenk1 order by string4 limit 5; - QUERY PLAN ------------------------------------------------- + QUERY PLAN +---------------------------------------------------- Limit - -> Gather Motion 3:1 (slice1; segments: 3) + -> Gather Motion 6:1 (slice1; segments: 6) Merge Key: string4 -> Limit -> Sort Sort Key: string4 - -> Seq Scan on tenk1 + -> Parallel Seq Scan on tenk1 Optimizer: Postgres query optimizer (8 rows) @@ -961,15 +979,15 @@ select string4 from tenk1 order by string4 limit 5; set parallel_leader_participation = off; explain (costs off) select string4 from tenk1 order by string4 limit 5; - QUERY PLAN ------------------------------------------------- + QUERY PLAN +---------------------------------------------------- Limit - -> Gather Motion 3:1 (slice1; segments: 3) + -> Gather Motion 6:1 (slice1; segments: 6) Merge Key: string4 -> Limit -> Sort Sort 
Key: string4 - -> Seq Scan on tenk1 + -> Parallel Seq Scan on tenk1 Optimizer: Postgres query optimizer (8 rows) @@ -989,10 +1007,10 @@ SAVEPOINT settings; SET LOCAL force_parallel_mode = 1; explain (costs off) select stringu1::int2 from tenk1 where unique1 = 1; - QUERY PLAN ------------------------------------------------ - Gather Motion 1:1 (slice1; segments: 1) - -> Index Scan using tenk1_unique1 on tenk1 + QUERY PLAN +-------------------------------------------------------- + Gather Motion 2:1 (slice1; segments: 2) + -> Parallel Index Scan using tenk1_unique1 on tenk1 Index Cond: (unique1 = 1) Optimizer: Postgres query optimizer (4 rows) @@ -1068,8 +1086,8 @@ explain (costs off, verbose) Output: b.unique1 -> Seq Scan on public.tenk1 b Output: b.unique1 + Settings: enable_parallel = 'on', min_parallel_table_scan_size = '0', optimizer = 'off', parallel_setup_cost = '0', parallel_tuple_cost = '0' Optimizer: Postgres query optimizer - Settings: min_parallel_table_scan_size=0, parallel_setup_cost=0, parallel_tuple_cost=0 (23 rows) -- LIMIT/OFFSET within sub-selects can't be pushed to workers. @@ -1097,11 +1115,12 @@ explain (costs off) -- to increase the parallel query test coverage SAVEPOINT settings; SET LOCAL force_parallel_mode = 1; -EXPLAIN (analyze, timing off, summary off, costs off) SELECT * FROM tenk1; - QUERY PLAN ----------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (actual rows=10000 loops=1) - -> Seq Scan on tenk1 (actual rows=3386 loops=1) +-- GPDB_PARALLEL_FIXME: analyze actual rows may differ between runs. +EXPLAIN (timing off, summary off, costs off) SELECT * FROM tenk1; + QUERY PLAN +------------------------------------------ + Gather Motion 6:1 (slice1; segments: 6) + -> Parallel Seq Scan on tenk1 Optimizer: Postgres query optimizer (3 rows) @@ -1111,8 +1130,7 @@ ROLLBACK TO SAVEPOINT settings; SAVEPOINT settings; SET LOCAL force_parallel_mode = 1; select (stringu1 || repeat('abcd', 5000))::int2 from tenk1 where unique1 = 1; -ERROR: invalid input syntax for type smallint:
"BAAAAAabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcd
abcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabc
[payload elided: the string "abcd" repeated several thousand times, the tail of the old expected error message]"
-CONTEXT:  parallel worker
+ERROR:  invalid input syntax for type smallint: "BAAAAA[payload elided: the string "abcd" repeated several thousand times]" (seg1 slice1 127.0.1.1:9003 pid=1200934)
 ROLLBACK TO SAVEPOINT settings;
 -- test interaction with set-returning functions
 SAVEPOINT settings;
@@ -1125,11 +1143,11 @@
 UNION ALL SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1;
                      QUERY PLAN
 ----------------------------------------------------
- Gather Motion 3:1  (slice1; segments: 3)
-   ->  Append
-         ->  Seq Scan on tenk1
+ Gather Motion 6:1  (slice1; segments: 6)
+   ->  Parallel Append
+         ->  Parallel Seq Scan on tenk1
               Filter: (fivethous = (tenthous + 1))
-         ->  Seq Scan on tenk1 tenk1_1
+         ->  Parallel Seq Scan on tenk1 tenk1_1
               Filter: (fivethous = (tenthous + 1))
  Optimizer: Postgres query optimizer
 (7 rows)
@@ -1145,26 +1163,26 @@ SELECT unique1 FROM tenk1 WHERE fivethous =
  ORDER BY 1;
                                 QUERY PLAN
 --------------------------------------------------------------------------
- Gather Motion 3:1  (slice1; segments: 3)
+ Gather Motion 6:1  (slice1; segments: 6)
   Merge Key: tenk1.unique1
   ->  Sort
         Sort Key: tenk1.unique1
-        ->  Append
-              ->  Seq Scan on tenk1
-                    Filter: (fivethous = $0)
-              InitPlan 1 (returns $0)  (slice2)
+        ->  Parallel Append
+              ->  Parallel Seq Scan on tenk1
+                    Filter: (fivethous = $1)
+              InitPlan 2 (returns $1)  (slice2)
                 ->  Limit
-                      ->  Gather Motion 3:1  (slice3; segments: 3)
+                      ->  Gather Motion 6:1  (slice3; segments: 6)
                             ->  Limit
-                                  ->  Seq Scan on tenk1 tenk1_2
+                                  ->  Parallel Seq Scan on tenk1 tenk1_3
                                          Filter: (fivethous = 1)
-              ->  Seq Scan on tenk1 tenk1_1
-                    Filter: (fivethous = $1)
-              InitPlan 2 (returns $1)  (slice4)
+              ->  Parallel Seq Scan on tenk1 tenk1_1
+                    Filter: (fivethous = $0)
+              InitPlan 1 (returns $0)  (slice4)
                 ->  Limit
-                      ->  Gather Motion 3:1  (slice5; segments: 3)
+                      ->  Gather Motion 6:1  (slice5; segments: 6)
                             ->  Limit
-                                  ->  Seq Scan on tenk1 tenk1_3
+                                  ->  Parallel Seq Scan on tenk1 tenk1_2
                                          Filter: (fivethous = 1)
  Optimizer: Postgres query optimizer
 (22 rows)
@@ -1179,8 +1197,8 @@ ORDER BY 1, 2, 3;
 EXPLAIN (VERBOSE, COSTS OFF)
 SELECT generate_series(1, two), array(select generate_series(1, two))
 FROM tenk1 ORDER BY tenthous;
-                                             QUERY PLAN
-----------------------------------------------------------------------------------------------------
+                                                                   QUERY PLAN
+-----------------------------------------------------------------------------------------------------------------------------------------------
 Gather Motion 3:1  (slice1; segments: 3)
   Output: (generate_series(1, tenk1.two)), ((SubPlan 1)), tenk1.tenthous
   Merge Key: tenk1.tenthous
@@ -1197,7 +1215,7 @@ SELECT generate_series(1, two), array(select generate_series(1, two))
       ->  ProjectSet
             Output: generate_series(1, tenk1.two)
             ->  Result
- Settings: min_parallel_table_scan_size = '0', parallel_setup_cost = '0', parallel_tuple_cost = '0'
+ Settings: enable_parallel = 'on', min_parallel_table_scan_size = '0', optimizer = 'off', parallel_setup_cost = '0', parallel_tuple_cost = '0'
 Optimizer: Postgres query optimizer
(18 rows)
@@ -1210,13 +1228,15 @@ $$declare x int[];
   return x;
 end$$ language plpgsql parallel safe;
 CREATE TABLE fooarr(f1 text, f2 int[], f3 text);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'f1' as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
 INSERT INTO fooarr VALUES('1', ARRAY[1,2], 'one');
 PREPARE pstmt(text, int[]) AS SELECT * FROM fooarr WHERE f1 = $1 AND f2 = $2;
 EXPLAIN (COSTS OFF) EXECUTE pstmt('1', make_some_array(1,2));
                            QUERY PLAN
------------------------------------------------------------------
- Gather Motion 1:1  (slice1; segments: 1)
-   ->  Seq Scan on fooarr
+ Gather Motion 2:1  (slice1; segments: 2)
+   ->  Parallel Seq Scan on fooarr
         Filter: ((f1 = '1'::text) AND (f2 = '{1,2}'::integer[]))
 Optimizer: Postgres query optimizer
(4 rows)
@@ -1250,3 +1270,5 @@ SELECT 1 FROM tenk1_vw_sec
(12 rows)
 rollback;
+reset enable_parallel;
+reset optimizer;
diff --git a/src/test/regress/expected/select_parallel_optimizer.out b/src/test/regress/expected/select_parallel_optimizer.out
deleted file mode 100644
index 13be5d1f6e8..00000000000
--- a/src/test/regress/expected/select_parallel_optimizer.out
+++ /dev/null
@@ -1,1252 +0,0 @@
---
--- PARALLEL
---
--- GPDB_96_MERGE_FIXME: We don't support parallel query. These tests won't actually
--- generate any parallel plans. Should we pay attention to the parallel restrictions
--- when creating MPP plans? For example, should we force parallel restricted functions
--- to run in the QD?
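Note: the FIXME above turns on PostgreSQL's function parallel-safety labels, which this deleted test file exercises throughout. As a hedged sketch (the function names below are illustrative only, not part of the patch): a PARALLEL RESTRICTED function may appear inside a parallel plan but only in the leader (the QD, in MPP terms), while a PARALLEL UNSAFE one forces a fully serial plan.

    -- Sketch of the three labels, assuming stock PostgreSQL semantics.
    create function f_safe(x int) returns int
      as $$ begin return x; end $$ language plpgsql parallel safe;        -- may run in any worker
    create function f_restricted(x int) returns int
      as $$ begin return x; end $$ language plpgsql parallel restricted;  -- leader/QD only
    create function f_unsafe(x int) returns int
      as $$ begin return x; end $$ language plpgsql parallel unsafe;      -- disables parallelism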
-create function sp_parallel_restricted(int) returns int as
-  $$begin return $1; end$$ language plpgsql parallel restricted;
-begin;
--- encourage use of parallel plans
-set parallel_setup_cost=0;
-set parallel_tuple_cost=0;
-set min_parallel_table_scan_size=0;
-set max_parallel_workers_per_gather=4;
--- Parallel Append with partial-subplans
-explain (costs off)
-  select round(avg(aa)), sum(aa) from a_star;
-                   QUERY PLAN
-------------------------------------------------
- Finalize Aggregate
-   ->  Gather Motion 3:1  (slice1; segments: 3)
-         ->  Partial Aggregate
-               ->  Append
-                     ->  Seq Scan on a_star a_star_1
-                     ->  Seq Scan on b_star a_star_2
-                     ->  Seq Scan on c_star a_star_3
-                     ->  Seq Scan on d_star a_star_4
-                     ->  Seq Scan on e_star a_star_5
-                     ->  Seq Scan on f_star a_star_6
- Optimizer: Postgres query optimizer
-(11 rows)
-
-select round(avg(aa)), sum(aa) from a_star a1;
- round | sum
--------+-----
-    14 | 355
-(1 row)
-
--- Parallel Append with both partial and non-partial subplans
-alter table c_star set (parallel_workers = 0);
-alter table d_star set (parallel_workers = 0);
-explain (costs off)
-  select round(avg(aa)), sum(aa) from a_star;
-                   QUERY PLAN
-------------------------------------------------
- Finalize Aggregate
-   ->  Gather Motion 3:1  (slice1; segments: 3)
-         ->  Partial Aggregate
-               ->  Append
-                     ->  Seq Scan on a_star a_star_1
-                     ->  Seq Scan on b_star a_star_2
-                     ->  Seq Scan on c_star a_star_3
-                     ->  Seq Scan on d_star a_star_4
-                     ->  Seq Scan on e_star a_star_5
-                     ->  Seq Scan on f_star a_star_6
- Optimizer: Postgres query optimizer
-(11 rows)
-
-select round(avg(aa)), sum(aa) from a_star a2;
- round | sum
--------+-----
-    14 | 355
-(1 row)
-
--- Parallel Append with only non-partial subplans
-alter table a_star set (parallel_workers = 0);
-alter table b_star set (parallel_workers = 0);
-alter table e_star set (parallel_workers = 0);
-alter table f_star set (parallel_workers = 0);
-explain (costs off)
-  select round(avg(aa)), sum(aa) from a_star;
-                   QUERY PLAN
-------------------------------------------------
- Finalize Aggregate
-   ->  Gather Motion 3:1  (slice1; segments: 3)
-         ->  Partial Aggregate
-               ->  Append
-                     ->  Seq Scan on a_star a_star_1
-                     ->  Seq Scan on b_star a_star_2
-                     ->  Seq Scan on c_star a_star_3
-                     ->  Seq Scan on d_star a_star_4
-                     ->  Seq Scan on e_star a_star_5
-                     ->  Seq Scan on f_star a_star_6
- Optimizer: Postgres query optimizer
-(11 rows)
-
-select round(avg(aa)), sum(aa) from a_star a3;
- round | sum
--------+-----
-    14 | 355
-(1 row)
-
--- Disable Parallel Append
-alter table a_star reset (parallel_workers);
-alter table b_star reset (parallel_workers);
-alter table c_star reset (parallel_workers);
-alter table d_star reset (parallel_workers);
-alter table e_star reset (parallel_workers);
-alter table f_star reset (parallel_workers);
-set enable_parallel_append to off;
-explain (costs off)
-  select round(avg(aa)), sum(aa) from a_star;
-                      QUERY PLAN
------------------------------------------------------
- Finalize Aggregate
-   ->  Gather Motion 3:1  (slice1; segments: 3)
-         ->  Partial Aggregate
-               ->  Append
-                     ->  Seq Scan on a_star a_star_1
-                     ->  Seq Scan on b_star a_star_2
-                     ->  Seq Scan on c_star a_star_3
-                     ->  Seq Scan on d_star a_star_4
-                     ->  Seq Scan on e_star a_star_5
-                     ->  Seq Scan on f_star a_star_6
- Optimizer: Postgres query optimizer
-(11 rows)
-
-select round(avg(aa)), sum(aa) from a_star a4;
- round | sum
--------+-----
-    14 | 355
-(1 row)
-
-reset enable_parallel_append;
--- Parallel Append that runs serially
-create function sp_test_func() returns setof text as
-$$ select 'foo'::varchar union all select 'bar'::varchar $$
-language sql stable;
-select sp_test_func() order by 1;
- sp_test_func
---------------
- bar
- foo
-(2 rows)
-
--- Parallel Append is not to be used when the subpath depends on the outer param
-create table part_pa_test(a int, b int) partition by range(a);
-create table part_pa_test_p1 partition of part_pa_test for values from (minvalue) to (0);
-create table part_pa_test_p2 partition of part_pa_test for values from (0) to (maxvalue);
-explain (costs off)
-  select (select max((select pa1.b from part_pa_test pa1 where pa1.a = pa2.a)))
-  from part_pa_test pa2;
-                                   QUERY PLAN
--------------------------------------------------------------------------------
- Finalize Aggregate
-   ->  Gather Motion 3:1  (slice1; segments: 3)
-         ->  Partial Aggregate
-               ->  Append
-                     ->  Seq Scan on part_pa_test_p1 pa2_1
-                     ->  Seq Scan on part_pa_test_p2 pa2_2
-               SubPlan 1
-                 ->  Result
-                       Filter: (pa1.a = pa2.a)
-                       ->  Materialize
-                             ->  Broadcast Motion 3:3  (slice2; segments: 3)
-                                   ->  Append
-                                         ->  Seq Scan on part_pa_test_p1 pa1_1
-                                         ->  Seq Scan on part_pa_test_p2 pa1_2
-               SubPlan 2
-                 ->  Result
- Optimizer: Postgres query optimizer
-(17 rows)
-
-drop table part_pa_test;
--- test with leader participation disabled
-set parallel_leader_participation = off;
-explain (costs off)
-  select count(*) from tenk1 where stringu1 = 'GRAAAA';
-                        QUERY PLAN
----------------------------------------------------------
- Finalize Aggregate
-   ->  Gather Motion 3:1  (slice1; segments: 3)
-         ->  Partial Aggregate
-               ->  Seq Scan on tenk1
-                     Filter: (stringu1 = 'GRAAAA'::name)
- Optimizer: Postgres query optimizer
-(6 rows)
-
-select count(*) from tenk1 where stringu1 = 'GRAAAA';
- count
--------
-    15
-(1 row)
-
--- test with leader participation disabled, but no workers available (so
--- the leader will have to run the plan despite the setting)
-set max_parallel_workers = 0;
-explain (costs off)
-  select count(*) from tenk1 where stringu1 = 'GRAAAA';
-                        QUERY PLAN
----------------------------------------------------------
- Finalize Aggregate
-   ->  Gather Motion 3:1  (slice1; segments: 3)
-         ->  Partial Aggregate
-               ->  Seq Scan on tenk1
-                     Filter: (stringu1 = 'GRAAAA'::name)
- Optimizer: Postgres query optimizer
-(6 rows)
-
-select count(*) from tenk1 where stringu1 = 'GRAAAA';
- count
--------
-    15
-(1 row)
-
-reset max_parallel_workers;
-reset parallel_leader_participation;
--- test that parallel_restricted function doesn't run in worker
-alter table tenk1 set (parallel_workers = 4);
-explain (verbose, costs off)
-select sp_parallel_restricted(unique1) from tenk1
-  where stringu1 = 'GRAAAA' order by 1;
-                                              QUERY PLAN
-----------------------------------------------------------------------------------------------------
- Gather Motion 3:1  (slice1; segments: 3)
-   Output: (sp_parallel_restricted(unique1))
-   Merge Key: (sp_parallel_restricted(unique1))
-   ->  Sort
-         Output: (sp_parallel_restricted(unique1))
-         Sort Key: (sp_parallel_restricted(tenk1.unique1))
-         ->  Seq Scan on public.tenk1
-               Output: sp_parallel_restricted(unique1)
-               Filter: (tenk1.stringu1 = 'GRAAAA'::name)
- Settings: min_parallel_table_scan_size = '0', optimizer = 'on', parallel_setup_cost = '0', parallel_tuple_cost = '0'
- Optimizer: Postgres query optimizer
-(11 rows)
-
--- test parallel plan when group by expression is in target list.
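Note: when the GROUP BY key is an expression rather than a bare column, the expression must be evaluated below the Redistribute Motion so that rows hash to the correct segment; the two HashAggregate levels in the plan that follows reflect that. A minimal hedged sketch (the table name is hypothetical, not part of this suite):

    -- Sketch only; "events" is an illustrative table.
    create table events(payload text);
    insert into events values ('abc'), ('de'), ('fgh');
    -- length(payload) is computed first, then used both as the hash key for
    -- the redistribute step and as the group key of the two-stage aggregate.
    explain (costs off)
      select length(payload) from events group by length(payload);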
-explain (costs off)
-  select length(stringu1) from tenk1 group by length(stringu1);
-                         QUERY PLAN
-------------------------------------------------------------
- Gather Motion 3:1  (slice1; segments: 3)
-   ->  HashAggregate
-         Group Key: (length((stringu1)::text))
-         ->  Redistribute Motion 3:3  (slice2; segments: 3)
-               Hash Key: (length((stringu1)::text))
-               ->  HashAggregate
-                     Group Key: length((stringu1)::text)
-                     ->  Seq Scan on tenk1
- Optimizer: Postgres query optimizer
-(9 rows)
-
-select length(stringu1) from tenk1 group by length(stringu1);
- length
---------
-      6
-(1 row)
-
-explain (costs off)
-  select stringu1, count(*) from tenk1 group by stringu1 order by stringu1;
-                            QUERY PLAN
-------------------------------------------------------------------
- Gather Motion 3:1  (slice1; segments: 3)
-   Merge Key: stringu1
-   ->  Finalize GroupAggregate
-         Group Key: stringu1
-         ->  Sort
-               Sort Key: stringu1
-               ->  Redistribute Motion 3:3  (slice2; segments: 3)
-                     Hash Key: stringu1
-                     ->  Streaming Partial HashAggregate
-                           Group Key: stringu1
-                           ->  Seq Scan on tenk1
- Optimizer: Pivotal Optimizer (GPORCA)
-(12 rows)
-
--- test that parallel plan for aggregates is not selected when
--- target list contains parallel restricted clause.
-explain (costs off)
-  select sum(sp_parallel_restricted(unique1)) from tenk1
-  group by(sp_parallel_restricted(unique1));
-                         QUERY PLAN
-------------------------------------------------------------
- Gather Motion 3:1  (slice1; segments: 3)
-   ->  HashAggregate
-         Group Key: (sp_parallel_restricted(unique1))
-         ->  Redistribute Motion 3:3  (slice2; segments: 3)
-               Hash Key: (sp_parallel_restricted(unique1))
-               ->  Seq Scan on tenk1
- Optimizer: Pivotal Optimizer (GPORCA)
-(7 rows)
-
--- test prepared statement
-prepare tenk1_count(integer) As select count((unique1)) from tenk1 where hundred > $1;
-explain (costs off) execute tenk1_count(1);
-                        QUERY PLAN
------------------------------------------------------------
- Finalize Aggregate
-   ->  Gather Motion 3:1  (slice1; segments: 3)
-         ->  Partial Aggregate
-               ->  Index Scan using tenk1_hundred on tenk1
-                     Index Cond: (hundred > 1)
- Optimizer: Pivotal Optimizer (GPORCA)
-(6 rows)
-
-execute tenk1_count(1);
- count
--------
-  9800
-(1 row)
-
-deallocate tenk1_count;
--- test parallel plans for queries containing un-correlated subplans.
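Note: an un-correlated subquery references nothing from the outer query, so it can be evaluated once and its result shared with every consumer; in these MPP plans that typically appears as a Broadcast Motion feeding a Materialize or Hash node, as in the plans that follow. A hedged sketch (table names hypothetical):

    -- Sketch only; t1/t2 are illustrative. The NOT IN subquery is
    -- un-correlated: it can be computed once, broadcast to every segment,
    -- and probed there by the anti-semi join.
    create table t1(a int, b int);
    create table t2(x int, y int);
    explain (costs off)
      select count(*) from t1 where (a, b) not in (select x, y from t2);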
-alter table tenk2 set (parallel_workers = 0);
-explain (costs off)
-  select count(*) from tenk1 where (two, four) not in
-  (select hundred, thousand from tenk2 where thousand > 100);
-                                         QUERY PLAN
---------------------------------------------------------------------------------------------
- Aggregate
-   ->  Gather Motion 3:1  (slice1; segments: 3)
-         ->  Nested Loop Left Anti Semi (Not-In) Join
-               Join Filter: ((tenk1.two = tenk2.hundred) AND (tenk1.four = tenk2.thousand))
-               ->  Seq Scan on tenk1
-               ->  Materialize
-                     ->  Broadcast Motion 3:3  (slice2; segments: 3)
-                           ->  Seq Scan on tenk2
-                                 Filter: (thousand > 100)
- Optimizer: Postgres query optimizer
-(10 rows)
-
-select count(*) from tenk1 where (two, four) not in
-  (select hundred, thousand from tenk2 where thousand > 100);
- count
--------
- 10000
-(1 row)
-
--- this is not parallel-safe due to use of random() within SubLink's testexpr:
-explain (costs off)
-  select * from tenk1 where (unique1 + random())::integer not in
-  (select ten from tenk2);
-                                       QUERY PLAN
-----------------------------------------------------------------------------------------
- Gather Motion 3:1  (slice1; segments: 3)
-   ->  Hash Left Anti Semi (Not-In) Join
-         Hash Cond: ((((tenk1.unique1)::double precision + random()))::integer = tenk2.ten)
-         ->  Seq Scan on tenk1
-         ->  Hash
-               ->  Broadcast Motion 3:3  (slice2; segments: 3)
-                     ->  Seq Scan on tenk2
- Optimizer: Pivotal Optimizer (GPORCA)
-(8 rows)
-
-alter table tenk2 reset (parallel_workers);
--- test parallel plan for a query containing initplan.
-set enable_indexscan = off;
-set enable_indexonlyscan = off;
-set enable_bitmapscan = off;
-alter table tenk2 set (parallel_workers = 2);
-explain (costs off)
-  select count(*) from tenk1
-  where tenk1.unique1 = (Select max(tenk2.unique1) from tenk2);
-                                  QUERY PLAN
-------------------------------------------------------------------------------
- Finalize Aggregate
-   ->  Gather Motion 3:1  (slice1; segments: 3)
-         ->  Partial Aggregate
-               ->  Nested Loop
-                     Join Filter: true
-                     ->  Broadcast Motion 1:3  (slice2)
-                           ->  Finalize Aggregate
-                                 ->  Gather Motion 3:1  (slice3; segments: 3)
-                                       ->  Partial Aggregate
-                                             ->  Seq Scan on tenk2
-                     ->  Index Only Scan using tenk1_unique1 on tenk1
-                           Index Cond: (unique1 = (max(tenk2.unique1)))
- Optimizer: Pivotal Optimizer (GPORCA)
-(13 rows)
-
-select count(*) from tenk1
-  where tenk1.unique1 = (Select max(tenk2.unique1) from tenk2);
- count
--------
-     1
-(1 row)
-
-reset enable_indexscan;
-reset enable_indexonlyscan;
-reset enable_bitmapscan;
-alter table tenk2 reset (parallel_workers);
--- test parallel index scans.
-set enable_seqscan to off;
-set enable_bitmapscan to off;
-explain (costs off)
-  select count((unique1)) from tenk1 where hundred > 1;
-                        QUERY PLAN
------------------------------------------------------------
- Finalize Aggregate
-   ->  Gather Motion 3:1  (slice1; segments: 3)
-         ->  Partial Aggregate
-               ->  Index Scan using tenk1_hundred on tenk1
-                     Index Cond: (hundred > 1)
- Optimizer: Pivotal Optimizer (GPORCA)
-(6 rows)
-
-select count((unique1)) from tenk1 where hundred > 1;
- count
--------
-  9800
-(1 row)
-
--- test parallel index-only scans.
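Note: an index-only scan can answer a query entirely from the index, provided every referenced column is stored in the index and the visibility map shows the pages as all-visible; the "Heap Fetches" counter in EXPLAIN ANALYZE reports how often the heap still had to be consulted. A hedged sketch (table and index names hypothetical):

    -- Sketch only; "measurements" and its index are illustrative.
    create table measurements(thousand int, tenthous int);
    create index meas_thous_tenthous on measurements (thousand, tenthous);
    vacuum measurements;  -- populate the visibility map so index-only scans avoid the heap
    explain (costs off)
      select count(*) from measurements where thousand > 95;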
-explain (costs off)
-  select count(*) from tenk1 where thousand > 95;
-                            QUERY PLAN
-------------------------------------------------------------------
- Finalize Aggregate
-   ->  Gather Motion 3:1  (slice1; segments: 3)
-         ->  Partial Aggregate
-               ->  Index Only Scan using tenk1_thous_tenthous on tenk1
-                     Index Cond: (thousand > 95)
- Optimizer: Pivotal Optimizer (GPORCA)
-(6 rows)
-
-select count(*) from tenk1 where thousand > 95;
- count
--------
-  9040
-(1 row)
-
--- test rescan cases too
-set enable_material = false;
-explain (costs off)
-select * from
-  (select count(unique1) from tenk1 where hundred > 10) ss
-  right join (values (1),(2),(3)) v(x) on true;
-                               QUERY PLAN
-------------------------------------------------------------------------
- Nested Loop Left Join
-   Join Filter: true
-   ->  Values Scan on "Values"
-   ->  Finalize Aggregate
-         ->  Materialize
-               ->  Gather Motion 3:1  (slice1; segments: 3)
-                     ->  Partial Aggregate
-                           ->  Index Scan using tenk1_hundred on tenk1
-                                 Index Cond: (hundred > 10)
- Optimizer: Pivotal Optimizer (GPORCA)
-(10 rows)
-
-select * from
-  (select count(unique1) from tenk1 where hundred > 10) ss
-  right join (values (1),(2),(3)) v(x) on true;
- count | x
--------+---
-  8900 | 1
-  8900 | 2
-  8900 | 3
-(3 rows)
-
-explain (costs off)
-select * from
-  (select count(*) from tenk1 where thousand > 99) ss
-  right join (values (1),(2),(3)) v(x) on true;
-                                  QUERY PLAN
-------------------------------------------------------------------------------
- Nested Loop Left Join
-   Join Filter: true
-   ->  Values Scan on "Values"
-   ->  Finalize Aggregate
-         ->  Materialize
-               ->  Gather Motion 3:1  (slice1; segments: 3)
-                     ->  Partial Aggregate
-                           ->  Index Only Scan using tenk1_thous_tenthous on tenk1
-                                 Index Cond: (thousand > 99)
- Optimizer: Pivotal Optimizer (GPORCA)
-(10 rows)
-
-select * from
-  (select count(*) from tenk1 where thousand > 99) ss
-  right join (values (1),(2),(3)) v(x) on true;
- count | x
--------+---
-  9000 | 1
-  9000 | 2
-  9000 | 3
-(3 rows)
-
--- test rescans for a Limit node with a parallel node beneath it.
-reset enable_seqscan;
-set enable_indexonlyscan to off;
-set enable_indexscan to off;
-alter table tenk1 set (parallel_workers = 0);
-alter table tenk2 set (parallel_workers = 1);
-explain (costs off)
-select count(*) from tenk1
-  left join (select tenk2.unique1 from tenk2 order by 1 limit 1000) ss
-  on tenk1.unique1 < ss.unique1 + 1
-  where tenk1.unique1 < 2;
-                                     QUERY PLAN
-------------------------------------------------------------------------------------
- Finalize Aggregate
-   ->  Gather Motion 3:1  (slice1; segments: 3)
-         ->  Partial Aggregate
-               ->  Nested Loop Left Join
-                     Join Filter: (tenk1.unique1 < (tenk2.unique1 + 1))
-                     ->  Index Only Scan using tenk1_unique1 on tenk1
-                           Index Cond: (unique1 < 2)
-                     ->  Materialize
-                           ->  Broadcast Motion 1:3  (slice2)
-                                 ->  Limit
-                                       ->  Gather Motion 3:1  (slice3; segments: 3)
-                                             Merge Key: tenk2.unique1
-                                             ->  Limit
-                                                   ->  Sort
-                                                         Sort Key: tenk2.unique1
-                                                         ->  Seq Scan on tenk2
- Optimizer: Pivotal Optimizer (GPORCA)
-(17 rows)
-
-select count(*) from tenk1
-  left join (select tenk2.unique1 from tenk2 order by 1 limit 1000) ss
-  on tenk1.unique1 < ss.unique1 + 1
-  where tenk1.unique1 < 2;
- count
--------
-  1999
-(1 row)
-
---reset the value of workers for each table as it was before this test.
-alter table tenk1 set (parallel_workers = 4);
-alter table tenk2 reset (parallel_workers);
-reset enable_material;
-reset enable_bitmapscan;
-reset enable_indexonlyscan;
-reset enable_indexscan;
--- test parallel bitmap heap scan.
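Note: a bitmap scan has two halves: a Bitmap Index Scan builds a bitmap of candidate tuple locations, and a Bitmap Heap Scan then visits the matching pages. When work_mem is too small to hold exact per-tuple bitmaps they degrade to lossy per-page bitmaps, which the block below deliberately provokes with work_mem = '64kB'. A hedged sketch (table and index names hypothetical):

    -- Sketch only; "bmdemo" is illustrative. Disabling the competing scan
    -- types steers the planner toward a bitmap scan on the indexed predicate.
    create table bmdemo(a int, t text);
    create index bmdemo_a on bmdemo(a);
    set enable_seqscan = off;
    set enable_indexscan = off;
    explain (costs off) select count(*) from bmdemo where a > 1;
    reset enable_seqscan;
    reset enable_indexscan;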
-set enable_seqscan to off;
-set enable_indexscan to off;
-set enable_hashjoin to off;
-set enable_mergejoin to off;
-set enable_material to off;
--- test prefetching, if the platform allows it
-DO $$
-BEGIN
- SET effective_io_concurrency = 50;
-EXCEPTION WHEN invalid_parameter_value THEN
-END $$;
-set work_mem='64kB';  --set small work mem to force lossy pages
-WARNING:  "work_mem": setting is deprecated, and may be removed in a future release.
-explain (costs off)
-  select count(*) from tenk1, tenk2 where tenk1.hundred > 1 and tenk2.thousand=0;
-                             QUERY PLAN
----------------------------------------------------------------------
- Finalize Aggregate
-   ->  Gather Motion 3:1  (slice1; segments: 3)
-         ->  Partial Aggregate
-               ->  Nested Loop
-                     Join Filter: true
-                     ->  Broadcast Motion 3:3  (slice2; segments: 3)
-                           ->  Seq Scan on tenk2
-                                 Filter: (thousand = 0)
-                     ->  Index Only Scan using tenk1_hundred on tenk1
-                           Index Cond: (hundred > 1)
- Optimizer: Pivotal Optimizer (GPORCA)
-(11 rows)
-
-select count(*) from tenk1, tenk2 where tenk1.hundred > 1 and tenk2.thousand=0;
- count
--------
- 98000
-(1 row)
-
-create table bmscantest (a int, t text);
-NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Cloudberry Database data distribution key for this table.
-HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
-insert into bmscantest select r, 'fooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo' FROM generate_series(1,100000) r;
-create index i_bmtest ON bmscantest(a);
-select count(*) from bmscantest where a>1;
- count
--------
- 99999
-(1 row)
-
--- test accumulation of stats for parallel nodes
-reset enable_seqscan;
-alter table tenk2 set (parallel_workers = 0);
-explain (analyze, timing off, summary off, costs off)
-  select count(*) from tenk1, tenk2 where tenk1.hundred > 1
-  and tenk2.thousand=0;
-                                          QUERY PLAN
-----------------------------------------------------------------------------------------------
- Finalize Aggregate (actual rows=1 loops=1)
-   ->  Gather Motion 3:1  (slice1; segments: 3) (actual rows=3 loops=1)
-         ->  Partial Aggregate (actual rows=1 loops=1)
-               ->  Nested Loop (actual rows=33200 loops=1)
-                     Join Filter: true
-                     ->  Broadcast Motion 3:3  (slice2; segments: 3) (actual rows=10 loops=1)
-                           ->  Seq Scan on tenk2 (actual rows=5 loops=1)
-                                 Filter: (thousand = 0)
-                                 Rows Removed by Filter: 3363
-                     ->  Index Only Scan using tenk1_hundred on tenk1 (actual rows=3018 loops=11)
-                           Index Cond: (hundred > 1)
-                           Heap Fetches: 0
- Optimizer: Pivotal Optimizer (GPORCA)
-(13 rows)
-
-alter table tenk2 reset (parallel_workers);
-reset work_mem;
-WARNING:  "work_mem": setting is deprecated, and may be removed in a future release.
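Note: the DO block above is a portability guard. Setting effective_io_concurrency above 0 fails with invalid_parameter_value on platforms built without posix_fadvise, so the test traps that error and carries on with the default. The same pattern, written out with comments (a sketch, not an addition to the suite):

    DO $$
    BEGIN
      -- Ask for prefetching; only honored where posix_fadvise is available.
      SET effective_io_concurrency = 50;
    EXCEPTION WHEN invalid_parameter_value THEN
      -- Platform lacks prefetch support: swallow the error, keep the default.
    END $$;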
-create function explain_parallel_sort_stats() returns setof text
-language plpgsql as
-$$
-declare ln text;
-begin
-    for ln in
-        explain (analyze, timing off, summary off, costs off)
-          select * from
-            (select ten from tenk1 where ten < 100 order by ten) ss
-            right join (values (1),(2),(3)) v(x) on true
-    loop
-        ln := regexp_replace(ln, 'Memory: \S*',  'Memory: xxx');
-        return next ln;
-    end loop;
-end;
-$$;
-select * from explain_parallel_sort_stats();
-                           explain_parallel_sort_stats
-----------------------------------------------------------------------------------
- Nested Loop Left Join (actual rows=30000 loops=1)
-   Join Filter: true
-   ->  Values Scan on "Values" (actual rows=3 loops=1)
-   ->  Materialize (actual rows=7500 loops=4)
-         ->  Gather Motion 3:1  (slice1; segments: 3) (actual rows=10000 loops=1)
-               ->  Seq Scan on tenk1 (actual rows=3386 loops=1)
-                     Filter: (ten < 100)
- Optimizer: Pivotal Optimizer (GPORCA)
-(8 rows)
-
-reset enable_indexscan;
-reset enable_hashjoin;
-reset enable_mergejoin;
-reset enable_material;
-reset effective_io_concurrency;
-drop table bmscantest;
-drop function explain_parallel_sort_stats();
--- test parallel merge join path.
-analyze tenk2;
-set enable_hashjoin to off;
-set enable_nestloop to off;
-explain (costs off)
-  select count(*) from tenk1, tenk2 where tenk1.unique1 = tenk2.unique1;
-                              QUERY PLAN
-----------------------------------------------------------------------
- Finalize Aggregate
-   ->  Gather Motion 3:1  (slice1; segments: 3)
-         ->  Partial Aggregate
-               ->  Nested Loop
-                     Join Filter: true
-                     ->  Broadcast Motion 3:3  (slice2; segments: 3)
-                           ->  Seq Scan on tenk1
-                     ->  Index Only Scan using tenk2_unique1 on tenk2
-                           Index Cond: (unique1 = tenk1.unique1)
- Optimizer: Pivotal Optimizer (GPORCA)
-(10 rows)
-
-select count(*) from tenk1, tenk2 where tenk1.unique1 = tenk2.unique1;
- count
--------
- 10000
-(1 row)
-
-reset enable_hashjoin;
-reset enable_nestloop;
--- test gather merge
-set enable_hashagg = false;
-explain (costs off)
-  select count(*) from tenk1 group by twenty;
-                         QUERY PLAN
-------------------------------------------------------------
- Gather Motion 3:1  (slice1; segments: 3)
-   ->  Finalize HashAggregate
-         Group Key: twenty
-         ->  Redistribute Motion 3:3  (slice2; segments: 3)
-               Hash Key: twenty
-               ->  Streaming Partial HashAggregate
-                     Group Key: twenty
-                     ->  Seq Scan on tenk1
- Optimizer: Pivotal Optimizer (GPORCA)
-(9 rows)
-
-select count(*) from tenk1 group by twenty;
- count
--------
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-(20 rows)
-
---test expressions in targetlist are pushed down for gather merge
-create function sp_simple_func(var1 integer) returns integer
-as $$
-begin
-        return var1 + 10;
-end;
-$$ language plpgsql PARALLEL SAFE;
-explain (costs off, verbose)
-  select ten, sp_simple_func(ten) from tenk1 where ten < 100 order by ten;
-                                                         QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------
- Result
-   Output: ten, sp_simple_func(ten)
-   ->  Gather Motion 3:1  (slice1; segments: 3)
-         Output: ten
-         Merge Key: ten
-         ->  Sort
-               Output: ten
-               Sort Key: tenk1.ten
-               ->  Seq Scan on public.tenk1
-                     Output: ten
-                     Filter: (tenk1.ten < 100)
- Optimizer: Pivotal Optimizer (GPORCA)
- Settings: enable_hashagg=off, min_parallel_table_scan_size=0, optimizer=on, parallel_setup_cost=0, parallel_tuple_cost=0
-(13 rows)
-
-drop function sp_simple_func(integer);
--- test handling of SRFs in targetlist (bug in 10.0)
-explain (costs off)
-  select count(*), generate_series(1,2) from tenk1 group by twenty;
-                            QUERY PLAN
-------------------------------------------------------------------
- Gather Motion 3:1  (slice1; segments: 3)
-   ->  ProjectSet
-         ->  Finalize HashAggregate
-               Group Key: twenty
-               ->  Redistribute Motion 3:3  (slice2; segments: 3)
-                     Hash Key: twenty
-                     ->  Streaming Partial HashAggregate
-                           Group Key: twenty
-                           ->  Seq Scan on tenk1
- Optimizer: Pivotal Optimizer (GPORCA)
-(10 rows)
-
-select count(*), generate_series(1,2) from tenk1 group by twenty;
- count | generate_series
--------+-----------------
-   500 |               1
-   500 |               2
-   500 |               1
-   500 |               2
-   500 |               1
-   500 |               2
-   500 |               1
-   500 |               2
-   500 |               1
-   500 |               2
-   500 |               1
-   500 |               2
-   500 |               1
-   500 |               2
-   500 |               1
-   500 |               2
-   500 |               1
-   500 |               2
-   500 |               1
-   500 |               2
-   500 |               1
-   500 |               2
-   500 |               1
-   500 |               2
-   500 |               1
-   500 |               2
-   500 |               1
-   500 |               2
-   500 |               1
-   500 |               2
-   500 |               1
-   500 |               2
-   500 |               1
-   500 |               2
-   500 |               1
-   500 |               2
-   500 |               1
-   500 |               2
-   500 |               1
-   500 |               2
-(40 rows)
-
--- test gather merge with parallel leader participation disabled
-set parallel_leader_participation = off;
-explain (costs off)
-  select count(*) from tenk1 group by twenty;
-                         QUERY PLAN
-------------------------------------------------------------
- Gather Motion 3:1  (slice1; segments: 3)
-   ->  Finalize HashAggregate
-         Group Key: twenty
-         ->  Redistribute Motion 3:3  (slice2; segments: 3)
-               Hash Key: twenty
-               ->  Streaming Partial HashAggregate
-                     Group Key: twenty
-                     ->  Seq Scan on tenk1
- Optimizer: Pivotal Optimizer (GPORCA)
-(9 rows)
-
-select count(*) from tenk1 group by twenty;
- count
--------
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-   500
-(20 rows)
-
-reset parallel_leader_participation;
---test rescan behavior of gather merge
-set enable_material = false;
-explain (costs off)
-select * from
-  (select string4, count(unique2)
-   from tenk1 group by string4 order by string4) ss
-  right join (values (1),(2),(3)) v(x) on true;
-                               QUERY PLAN
-------------------------------------------------------------------------
- Nested Loop Left Join
-   Join Filter: true
-   ->  Values Scan on "Values"
-   ->  Materialize
-         ->  Gather Motion 3:1  (slice1; segments: 3)
-               ->  Finalize HashAggregate
-                     Group Key: tenk1.string4
-                     ->  Redistribute Motion 3:3  (slice2; segments: 3)
-                           Hash Key: tenk1.string4
-                           ->  Streaming Partial HashAggregate
-                                 Group Key: tenk1.string4
-                                 ->  Seq Scan on tenk1
- Optimizer: Pivotal Optimizer (GPORCA)
-(13 rows)
-
-select * from
-  (select string4, count(unique2)
-   from tenk1 group by string4 order by string4) ss
-  right join (values (1),(2),(3)) v(x) on true;
- string4 | count | x
----------+-------+---
- AAAAxx  |  2500 | 1
- HHHHxx  |  2500 | 1
- OOOOxx  |  2500 | 1
- VVVVxx  |  2500 | 1
- AAAAxx  |  2500 | 2
- HHHHxx  |  2500 | 2
- OOOOxx  |  2500 | 2
- VVVVxx  |  2500 | 2
- AAAAxx  |  2500 | 3
- HHHHxx  |  2500 | 3
- OOOOxx  |  2500 | 3
- VVVVxx  |  2500 | 3
-(12 rows)
-
-reset enable_material;
-reset enable_hashagg;
--- check parallelized int8 aggregate (bug #14897)
-explain (costs off)
-select avg(unique1::int8) from tenk1;
-                   QUERY PLAN
-------------------------------------------------
- Finalize Aggregate
-   ->  Gather Motion 3:1  (slice1; segments: 3)
-         ->  Partial Aggregate
-               ->  Seq Scan on tenk1
- Optimizer: Pivotal Optimizer (GPORCA)
-(5 rows)
-
-select avg(unique1::int8) from tenk1;
-          avg
------------------------
- 4999.5000000000000000
-(1 row)
-
--- gather merge test with a LIMIT
-explain (costs off)
by fivethous limit 4; - QUERY PLAN ------------------------------------------------- - Limit - -> Gather Motion 3:1 (slice1; segments: 3) - Merge Key: fivethous - -> Limit - -> Sort - Sort Key: fivethous - -> Seq Scan on tenk1 - Optimizer: Pivotal Optimizer (GPORCA) -(8 rows) - -select fivethous from tenk1 order by fivethous limit 4; - fivethous ------------ - 0 - 0 - 1 - 1 -(4 rows) - --- gather merge test with 0 worker -set max_parallel_workers = 0; -explain (costs off) - select string4 from tenk1 order by string4 limit 5; - QUERY PLAN ------------------------------------------------- - Limit - -> Gather Motion 3:1 (slice1; segments: 3) - Merge Key: string4 - -> Limit - -> Sort - Sort Key: string4 - -> Seq Scan on tenk1 - Optimizer: Pivotal Optimizer (GPORCA) -(8 rows) - -select string4 from tenk1 order by string4 limit 5; - string4 ---------- - AAAAxx - AAAAxx - AAAAxx - AAAAxx - AAAAxx -(5 rows) - --- gather merge test with 0 workers, with parallel leader --- participation disabled (the leader will have to run the plan --- despite the setting) -set parallel_leader_participation = off; -explain (costs off) - select string4 from tenk1 order by string4 limit 5; - QUERY PLAN ------------------------------------------------- - Limit - -> Gather Motion 3:1 (slice1; segments: 3) - Merge Key: string4 - -> Limit - -> Sort - Sort Key: string4 - -> Seq Scan on tenk1 - Optimizer: Pivotal Optimizer (GPORCA) -(8 rows) - -select string4 from tenk1 order by string4 limit 5; - string4 ---------- - AAAAxx - AAAAxx - AAAAxx - AAAAxx - AAAAxx -(5 rows) - -reset parallel_leader_participation; -reset max_parallel_workers; -SAVEPOINT settings; -SET LOCAL force_parallel_mode = 1; -explain (costs off) - select stringu1::int2 from tenk1 where unique1 = 1; - QUERY PLAN ------------------------------------------------ - Gather Motion 1:1 (slice1; segments: 1) - -> Index Scan using tenk1_unique1 on tenk1 - Index Cond: (unique1 = 1) - Optimizer: Pivotal Optimizer (GPORCA) -(4 rows) - -ROLLBACK TO SAVEPOINT settings; --- exercise record typmod remapping between backends -CREATE FUNCTION make_record(n int) - RETURNS RECORD LANGUAGE plpgsql PARALLEL SAFE AS -$$ -BEGIN - RETURN CASE n - WHEN 1 THEN ROW(1) - WHEN 2 THEN ROW(1, 2) - WHEN 3 THEN ROW(1, 2, 3) - WHEN 4 THEN ROW(1, 2, 3, 4) - ELSE ROW(1, 2, 3, 4, 5) - END; -END; -$$; -SAVEPOINT settings; -SET LOCAL force_parallel_mode = 1; -SELECT make_record(x) FROM (SELECT generate_series(1, 5) x) ss ORDER BY x; - make_record -------------- - (1) - (1,2) - (1,2,3) - (1,2,3,4) - (1,2,3,4,5) -(5 rows) - -ROLLBACK TO SAVEPOINT settings; -DROP function make_record(n int); --- test the sanity of parallel query after the active role is dropped. -drop role if exists regress_parallel_worker; -create role regress_parallel_worker; -set role regress_parallel_worker; -reset session authorization; -drop role regress_parallel_worker; -set force_parallel_mode = 1; -select count(*) from tenk1; - count -------- - 10000 -(1 row) - -reset force_parallel_mode; -reset role; --- Window function calculation can't be pushed to workers. 
-explain (costs off, verbose) - select count(*) from tenk1 a where (unique1, two) in - (select unique1, row_number() over() from tenk1 b); - QUERY PLAN ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - Finalize Aggregate - Output: count(*) - -> Gather Motion 3:1 (slice1; segments: 3) - Output: (PARTIAL count(*)) - -> Partial Aggregate - Output: PARTIAL count(*) - -> Hash Semi Join - Hash Cond: ((a.unique1 = b.unique1) AND (a.two = (row_number() OVER (?)))) - -> Seq Scan on public.tenk1 a - Output: a.unique1, a.unique2, a.two, a.four, a.ten, a.twenty, a.hundred, a.thousand, a.twothousand, a.fivethous, a.tenthous, a.odd, a.even, a.stringu1, a.stringu2, a.string4 - -> Hash - Output: b.unique1, (row_number() OVER (?)) - -> Redistribute Motion 1:3 (slice2; segments: 1) - Output: b.unique1, (row_number() OVER (?)) - Hash Key: b.unique1 - -> WindowAgg - Output: b.unique1, row_number() OVER (?) - -> Gather Motion 3:1 (slice3; segments: 3) - Output: b.unique1 - -> Seq Scan on public.tenk1 b - Output: b.unique1 - Optimizer: Postgres query optimizer - Settings: min_parallel_table_scan_size=0, parallel_setup_cost=0, parallel_tuple_cost=0 -(23 rows) - --- LIMIT/OFFSET within sub-selects can't be pushed to workers. -explain (costs off) - select * from tenk1 a where two in - (select two from tenk1 b where stringu1 like '%AAAA' limit 3); - QUERY PLAN ---------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) - -> Hash Join - Hash Cond: (a.two = b.two) - -> Seq Scan on tenk1 a - -> Hash - -> Broadcast Motion 1:3 (slice2; segments: 1) - -> HashAggregate - Group Key: b.two - -> Limit - -> Gather Motion 3:1 (slice3; segments: 3) - -> Limit - -> Seq Scan on tenk1 b - Filter: (stringu1 ~~ '%AAAA'::text) - Optimizer: Postgres query optimizer -(14 rows) - --- to increase the parallel query test coverage -SAVEPOINT settings; -SET LOCAL force_parallel_mode = 1; -EXPLAIN (analyze, timing off, summary off, costs off) SELECT * FROM tenk1; - QUERY PLAN ----------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (actual rows=10000 loops=1) - -> Seq Scan on tenk1 (actual rows=3386 loops=1) - Optimizer: Pivotal Optimizer (GPORCA) -(3 rows) - -ROLLBACK TO SAVEPOINT settings; --- provoke error in worker --- (make the error message long enough to require multiple bufferloads) -SAVEPOINT settings; -SET LOCAL force_parallel_mode = 1; -select (stringu1 || repeat('abcd', 5000))::int2 from tenk1 where unique1 = 1; -ERROR: invalid input syntax for type smallint: 
"BAAAAAabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcd
abcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabc
dabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdab
cdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcda
bcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcd
abcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcd" -ROLLBACK TO SAVEPOINT settings; --- test interaction with set-returning functions -SAVEPOINT settings; --- multiple subqueries under a single Gather node --- must set parallel_setup_cost > 0 to discourage multiple Gather nodes -SET LOCAL parallel_setup_cost = 10; -EXPLAIN (COSTS OFF) -SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1 -UNION ALL -SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1; - QUERY PLAN ----------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) - -> Append - -> Seq Scan on tenk1 - Filter: (fivethous = (tenthous + 1)) - -> Seq Scan on tenk1 tenk1_1 - Filter: (fivethous = (tenthous + 1)) - Optimizer: Pivotal Optimizer (GPORCA) -(7 rows) - -ROLLBACK TO SAVEPOINT settings; --- can't use multiple subqueries under a single Gather node due to initPlans -EXPLAIN (COSTS OFF) -SELECT unique1 FROM tenk1 WHERE fivethous = - (SELECT unique1 FROM tenk1 WHERE fivethous = 1 LIMIT 1) -UNION ALL -SELECT unique1 FROM tenk1 WHERE fivethous = - (SELECT unique2 FROM tenk1 WHERE fivethous = 1 LIMIT 1) -ORDER BY 1; - QUERY PLAN ------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) - Merge Key: tenk1.unique1 - -> Sort - Sort Key: tenk1.unique1 - -> Append - -> Hash Join - Hash 
Cond: (tenk1.fivethous = tenk1_1.unique1) - -> Seq Scan on tenk1 - -> Hash - -> Broadcast Motion 1:3 (slice2) - -> Limit - -> Gather Motion 3:1 (slice3; segments: 3) - -> Seq Scan on tenk1 tenk1_1 - Filter: (fivethous = 1) - -> Hash Join - Hash Cond: (tenk1_2.fivethous = tenk1_3.unique2) - -> Seq Scan on tenk1 tenk1_2 - -> Hash - -> Broadcast Motion 1:3 (slice4) - -> Limit - -> Gather Motion 3:1 (slice5; segments: 3) - -> Limit - -> Seq Scan on tenk1 tenk1_3 - Filter: (fivethous = 1) - Optimizer: Pivotal Optimizer (GPORCA) -(25 rows) - --- test interaction with SRFs -SELECT * FROM information_schema.foreign_data_wrapper_options -ORDER BY 1, 2, 3; - foreign_data_wrapper_catalog | foreign_data_wrapper_name | option_name | option_value -------------------------------+---------------------------+-------------+-------------- -(0 rows) - -EXPLAIN (VERBOSE, COSTS OFF) -SELECT generate_series(1, two), array(select generate_series(1, two)) - FROM tenk1 ORDER BY tenthous; - QUERY PLAN ----------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) - Output: (generate_series(1, tenk1.two)), ((SubPlan 1)), tenk1.tenthous - Merge Key: tenk1.tenthous - -> ProjectSet - Output: generate_series(1, tenk1.two), (SubPlan 1), tenk1.tenthous - -> Result - Output: tenk1.two, tenk1.tenthous - -> Sort - Output: tenk1.tenthous, tenk1.two - Sort Key: tenk1.tenthous - -> Seq Scan on public.tenk1 - Output: tenk1.tenthous, tenk1.two - SubPlan 1 - -> ProjectSet - Output: generate_series(1, tenk1.two) - -> Result - Settings: min_parallel_table_scan_size = '0', parallel_setup_cost = '0', parallel_tuple_cost = '0' - Optimizer: Postgres query optimizer -(18 rows) - --- test passing expanded-value representations to workers -CREATE FUNCTION make_some_array(int,int) returns int[] as -$$declare x int[]; - begin - x[1] := $1; - x[2] := $2; - return x; - end$$ language plpgsql parallel safe; -CREATE TABLE fooarr(f1 text, f2 int[], f3 text); -INSERT INTO fooarr VALUES('1', ARRAY[1,2], 'one'); -PREPARE pstmt(text, int[]) AS SELECT * FROM fooarr WHERE f1 = $1 AND f2 = $2; -EXPLAIN (COSTS OFF) EXECUTE pstmt('1', make_some_array(1,2)); - QUERY PLAN ------------------------------------------------------------------- - Gather Motion 1:1 (slice1; segments: 1) - -> Seq Scan on fooarr - Filter: ((f1 = '1'::text) AND (f2 = '{1,2}'::integer[])) - Optimizer: Pivotal Optimizer (GPORCA) -(4 rows) - -EXECUTE pstmt('1', make_some_array(1,2)); - f1 | f2 | f3 -----+-------+----- - 1 | {1,2} | one -(1 row) - -DEALLOCATE pstmt; --- test interaction between subquery and partial_paths -CREATE VIEW tenk1_vw_sec WITH (security_barrier) AS SELECT * FROM tenk1; -EXPLAIN (COSTS OFF) -SELECT 1 FROM tenk1_vw_sec - WHERE (SELECT sum(f1) FROM int4_tbl WHERE f1 < unique1) < 100; - QUERY PLAN ------------------------------------------------------------------------------ - Gather Motion 3:1 (slice1; segments: 3) - -> Subquery Scan on tenk1_vw_sec - Filter: ((SubPlan 1) < 100) - -> Seq Scan on tenk1 - SubPlan 1 - -> Aggregate - -> Result - Filter: (int4_tbl.f1 < tenk1_vw_sec.unique1) - -> Materialize - -> Broadcast Motion 3:3 (slice2; segments: 3) - -> Seq Scan on int4_tbl - Optimizer: Postgres query optimizer -(12 rows) - -rollback; diff --git a/src/test/regress/expected/shared_scan.out b/src/test/regress/expected/shared_scan.out index f8530362b0e..c915797023b 100644 --- a/src/test/regress/expected/shared_scan.out +++ b/src/test/regress/expected/shared_scan.out @@ -70,13 +70,18 @@ SET 
statement_timeout = '15s'; (2 rows) RESET statement_timeout; -SELECT *, +SELECT COUNT(*) +FROM (SELECT *, ( WITH cte AS (SELECT * FROM jazz WHERE jazz.e = bar.c) SELECT 1 FROM cte c1, cte c2 ) - FROM bar; -ERROR: shareinputscan with outer refs is not supported by GPDB + FROM bar) as s; + count +------- + 100 +(1 row) + CREATE TABLE t1 (a int, b int); NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Cloudberry Database data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. @@ -162,3 +167,70 @@ ERROR: structure of query does not match function result type DETAIL: Number of returned columns (3) does not match expected column count (2). CONTEXT: SQL statement "WITH cte AS (SELECT * FROM t1 WHERE random() < 0.1 ORDER BY b LIMIT 10) SELECT a, 1 , 1 FROM cte JOIN t2 USING (a)" PL/pgSQL function col_mismatch_func2() line 6 at RETURN QUERY +-- https://github.com/greenplum-db/gpdb/issues/12701 +-- Disable cte sharing in subquery +drop table if exists pk_list; +NOTICE: table "pk_list" does not exist, skipping +create table pk_list (id int, schema_name varchar, table_name varchar) distributed by (id); +drop table if exists calender; +NOTICE: table "calender" does not exist, skipping +create table calender (id int, data_hour timestamp) distributed by (id); +explain (costs off) +with + tbls as (select distinct schema_name, table_name as table_nm from pk_list), + tbls_daily_report_23 as (select unnest(string_to_array('mart_cm.card' ,',')) as table_nm_23), + tbls_w_onl_actl_data as (select unnest(string_to_array('mart_cm.cont_resp,mart_cm.card', ',')) as table_nm_onl_act) +select data_hour, stat.schema_name as schema_nm, dt.table_nm +from ( + select * from calender c + cross join tbls +) dt +inner join ( + select tbls.schema_name, tbls.table_nm as table_name + from tbls tbls +) stat on dt.table_nm = stat.table_name +where + (data_hour = date_trunc('day',data_hour) and stat.schema_name || '.' ||stat.table_name not in (select table_nm_23 from tbls_daily_report_23)) + and (stat.schema_name || '.' ||stat.table_name not in (select table_nm_onl_act from tbls_w_onl_actl_data)) + or (stat.schema_name || '.' 
||stat.table_name in (select table_nm_onl_act from tbls_w_onl_actl_data)); + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) + -> Hash Join + Hash Cond: ((tbls.table_nm)::text = (stat.table_name)::text) + Join Filter: (((c.data_hour = date_trunc('day'::text, c.data_hour)) AND (NOT (hashed SubPlan 1)) AND (NOT (hashed SubPlan 2))) OR (hashed SubPlan 3)) + -> Nested Loop + -> Broadcast Motion 3:3 (slice2; segments: 3) + -> Subquery Scan on tbls + -> HashAggregate + Group Key: pk_list.schema_name, pk_list.table_name + -> Redistribute Motion 3:3 (slice3; segments: 3) + Hash Key: pk_list.schema_name, pk_list.table_name + -> HashAggregate + Group Key: pk_list.schema_name, pk_list.table_name + -> Seq Scan on pk_list + -> Materialize + -> Seq Scan on calender c + -> Hash + -> Broadcast Motion 3:3 (slice4; segments: 3) + -> Subquery Scan on stat + Filter: (((NOT (hashed SubPlan 1)) AND (NOT (hashed SubPlan 2))) OR (hashed SubPlan 3)) + -> HashAggregate + Group Key: pk_list_1.schema_name, pk_list_1.table_name + -> Redistribute Motion 3:3 (slice5; segments: 3) + Hash Key: pk_list_1.schema_name, pk_list_1.table_name + -> HashAggregate + Group Key: pk_list_1.schema_name, pk_list_1.table_name + -> Seq Scan on pk_list pk_list_1 + SubPlan 1 + -> ProjectSet + -> Result + SubPlan 2 + -> ProjectSet + -> Result + SubPlan 3 + -> ProjectSet + -> Result + Optimizer: Postgres query optimizer +(37 rows) + diff --git a/src/test/regress/expected/shared_scan_optimizer.out b/src/test/regress/expected/shared_scan_optimizer.out index b1ccd7b8308..fd05fbff33d 100644 --- a/src/test/regress/expected/shared_scan_optimizer.out +++ b/src/test/regress/expected/shared_scan_optimizer.out @@ -72,13 +72,18 @@ SET statement_timeout = '15s'; (2 rows) RESET statement_timeout; -SELECT *, +SELECT COUNT(*) +FROM (SELECT *, ( WITH cte AS (SELECT * FROM jazz WHERE jazz.e = bar.c) SELECT 1 FROM cte c1, cte c2 ) - FROM bar; -ERROR: shareinputscan with outer refs is not supported by GPDB + FROM bar) as s; + count +------- + 100 +(1 row) + CREATE TABLE t1 (a int, b int); NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Cloudberry Database data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. @@ -170,3 +175,70 @@ ERROR: structure of query does not match function result type DETAIL: Number of returned columns (3) does not match expected column count (2). 
CONTEXT: SQL statement "WITH cte AS (SELECT * FROM t1 WHERE random() < 0.1 ORDER BY b LIMIT 10) SELECT a, 1 , 1 FROM cte JOIN t2 USING (a)" PL/pgSQL function col_mismatch_func2() line 6 at RETURN QUERY +-- https://github.com/greenplum-db/gpdb/issues/12701 +-- Disable cte sharing in subquery +drop table if exists pk_list; +NOTICE: table "pk_list" does not exist, skipping +create table pk_list (id int, schema_name varchar, table_name varchar) distributed by (id); +drop table if exists calender; +NOTICE: table "calender" does not exist, skipping +create table calender (id int, data_hour timestamp) distributed by (id); +explain (costs off) +with + tbls as (select distinct schema_name, table_name as table_nm from pk_list), + tbls_daily_report_23 as (select unnest(string_to_array('mart_cm.card' ,',')) as table_nm_23), + tbls_w_onl_actl_data as (select unnest(string_to_array('mart_cm.cont_resp,mart_cm.card', ',')) as table_nm_onl_act) +select data_hour, stat.schema_name as schema_nm, dt.table_nm +from ( + select * from calender c + cross join tbls +) dt +inner join ( + select tbls.schema_name, tbls.table_nm as table_name + from tbls tbls +) stat on dt.table_nm = stat.table_name +where + (data_hour = date_trunc('day',data_hour) and stat.schema_name || '.' ||stat.table_name not in (select table_nm_23 from tbls_daily_report_23)) + and (stat.schema_name || '.' ||stat.table_name not in (select table_nm_onl_act from tbls_w_onl_actl_data)) + or (stat.schema_name || '.' ||stat.table_name in (select table_nm_onl_act from tbls_w_onl_actl_data)); + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) + -> Hash Join + Hash Cond: ((tbls.table_nm)::text = (stat.table_name)::text) + Join Filter: (((c.data_hour = date_trunc('day'::text, c.data_hour)) AND (NOT (hashed SubPlan 1)) AND (NOT (hashed SubPlan 2))) OR (hashed SubPlan 3)) + -> Nested Loop + -> Broadcast Motion 3:3 (slice2; segments: 3) + -> Subquery Scan on tbls + -> HashAggregate + Group Key: pk_list.schema_name, pk_list.table_name + -> Redistribute Motion 3:3 (slice3; segments: 3) + Hash Key: pk_list.schema_name, pk_list.table_name + -> HashAggregate + Group Key: pk_list.schema_name, pk_list.table_name + -> Seq Scan on pk_list + -> Materialize + -> Seq Scan on calender c + -> Hash + -> Broadcast Motion 3:3 (slice4; segments: 3) + -> Subquery Scan on stat + Filter: (((NOT (hashed SubPlan 1)) AND (NOT (hashed SubPlan 2))) OR (hashed SubPlan 3)) + -> HashAggregate + Group Key: pk_list_1.schema_name, pk_list_1.table_name + -> Redistribute Motion 3:3 (slice5; segments: 3) + Hash Key: pk_list_1.schema_name, pk_list_1.table_name + -> HashAggregate + Group Key: pk_list_1.schema_name, pk_list_1.table_name + -> Seq Scan on pk_list pk_list_1 + SubPlan 1 + -> ProjectSet + -> Result + SubPlan 2 + -> ProjectSet + -> Result + SubPlan 3 + -> ProjectSet + -> Result + Optimizer: Postgres query optimizer +(37 rows) + diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out index 9faaf22c0d1..2c50ece0c49 100644 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -20,6 +20,7 @@ declare tmp text[]; first_row bool := true; begin + set local enable_parallel = off; for ln in execute format('explain analyze %s', $1) loop @@ -29,6 +30,7 @@ begin return query select tmp[1]::int, tmp[2]::int; end if; end loop; + reset enable_parallel; end; 
$$; -- Verify failures diff --git a/src/test/regress/expected/stats_ext_optimizer.out b/src/test/regress/expected/stats_ext_optimizer.out index 17c05332a45..e8dd488be71 100644 --- a/src/test/regress/expected/stats_ext_optimizer.out +++ b/src/test/regress/expected/stats_ext_optimizer.out @@ -20,6 +20,7 @@ declare tmp text[]; first_row bool := true; begin + set local enable_parallel = off; for ln in execute format('explain analyze %s', $1) loop @@ -29,6 +30,7 @@ begin return query select tmp[1]::int, tmp[2]::int; end if; end loop; + reset enable_parallel; end; $$; -- Verify failures diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 9832b7ea0bb..2fee5d171d6 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1611,6 +1611,7 @@ create function explain_sq_limit() returns setof text language plpgsql as $$ declare ln text; begin + set local enable_parallel=off; for ln in explain (analyze, summary off, timing off, costs off) select * from (select pk,c2 from sq_limit order by c1,pk) as x limit 3 @@ -1618,6 +1619,7 @@ begin ln := regexp_replace(ln, 'Memory: \S*', 'Memory: xxx'); return next ln; end loop; + reset enable_parallel; end; $$; select * from explain_sq_limit(); @@ -1788,8 +1790,8 @@ select * from x, x x2 where x.n = x2.n; Output: share0_ref1.f1, share0_ref1.n -> Seq Scan on public.subselect_tbl Output: subselect_tbl.f1, 'regression'::name - Optimizer: Postgres query optimizer Settings: gp_cte_sharing=on + Optimizer: Postgres query optimizer (21 rows) explain (verbose, costs off) diff --git a/src/test/regress/expected/subselect_optimizer.out b/src/test/regress/expected/subselect_optimizer.out index 095a31106f0..6b022a9830f 100644 --- a/src/test/regress/expected/subselect_optimizer.out +++ b/src/test/regress/expected/subselect_optimizer.out @@ -1676,6 +1676,7 @@ create function explain_sq_limit() returns setof text language plpgsql as $$ declare ln text; begin + set local enable_parallel=off; for ln in explain (analyze, summary off, timing off, costs off) select * from (select pk,c2 from sq_limit order by c1,pk) as x limit 3 @@ -1683,6 +1684,7 @@ begin ln := regexp_replace(ln, 'Memory: \S*', 'Memory: xxx'); return next ln; end loop; + reset enable_parallel; end; $$; select * from explain_sq_limit(); @@ -1853,8 +1855,8 @@ select * from x, x x2 where x.n = x2.n; Output: share0_ref1.f1, share0_ref1.n -> Seq Scan on public.subselect_tbl Output: subselect_tbl.f1, 'regression'::name - Optimizer: Postgres query optimizer Settings: gp_cte_sharing=on, optimizer=on + Optimizer: Postgres query optimizer (21 rows) explain (verbose, costs off) diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 60185eba66a..22a68a9e289 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -99,7 +99,8 @@ select count(*) = 0 as ok from pg_stat_wal_receiver; -- This is to record the prevailing planner enable_foo settings during -- a regression test run. -select name, setting from pg_settings where name like 'enable%'; +-- GP parallel tests will run another pass with enable_parallel=on; filter this setting out to pass regression.
+select name, setting from pg_settings where name like 'enable%' and name != 'enable_parallel'; name | setting --------------------------------+--------- enable_async_append | on diff --git a/src/test/regress/expected/task.out b/src/test/regress/expected/task.out new file mode 100644 index 00000000000..3d35f62321f --- /dev/null +++ b/src/test/regress/expected/task.out @@ -0,0 +1,83 @@ +-- Vacuum every day at 10:00am (GMT) +create task vacuum_db SCHEDULE '0 10 * * *' AS 'vacuum'; +-- Stop scheduling a task +drop task vacuum_db; +-- Invalid input: missing parts +create task missing_parts schedule '* * * *' as 'select 1'; +ERROR: invalid schedule: * * * * +HINT: Use cron format (e.g. 5 4 * * *), or interval format '[1-59] seconds' +-- Invalid input: trailing characters +create task trail_char schedule '5 secondc' as 'select 1'; +ERROR: invalid schedule: 5 secondc +HINT: Use cron format (e.g. 5 4 * * *), or interval format '[1-59] seconds' +create task trail_char schedule '50 seconds c' as 'select 1'; +ERROR: invalid schedule: 50 seconds c +HINT: Use cron format (e.g. 5 4 * * *), or interval format '[1-59] seconds' +-- Invalid input: seconds out of range +create task invalid_seconds schedule '-1 seconds' as 'select 1'; +ERROR: invalid schedule: -1 seconds +HINT: Use cron format (e.g. 5 4 * * *), or interval format '[1-59] seconds' +create task invalid_seconds schedule '0 seconds' as 'select 1'; +ERROR: invalid schedule: 0 seconds +HINT: Use cron format (e.g. 5 4 * * *), or interval format '[1-59] seconds' +create task invalid_seconds schedule '60 seconds' as 'select 1'; +ERROR: invalid schedule: 60 seconds +HINT: Use cron format (e.g. 5 4 * * *), or interval format '[1-59] seconds' +create task invalid_seconds schedule '1000000000000 seconds' as 'select 1'; +ERROR: invalid schedule: 1000000000000 seconds +HINT: Use cron format (e.g. 
5 4 * * *), or interval format '[1-59] seconds' +-- Vacuum every day at 10:00am (GMT) +create task vacuum_db SCHEDULE '0 10 * * *' AS 'vacuum'; +select schedule, command, active, jobname from pg_task order by jobid; + schedule | command | active | jobname +------------+---------+--------+----------- + 0 10 * * * | vacuum | t | vacuum_db +(1 row) + +-- Make that 11:00am (GMT) +alter task vacuum_db schedule '0 11 * * *'; +select schedule, command, active, jobname from pg_task order by jobid; + schedule | command | active | jobname +------------+---------+--------+----------- + 0 11 * * * | vacuum | t | vacuum_db +(1 row) + +-- Make that VACUUM FULL +alter task vacuum_db as 'vacuum full'; +select schedule, command, active, jobname from pg_task order by jobid; + schedule | command | active | jobname +------------+-------------+--------+----------- + 0 11 * * * | vacuum full | t | vacuum_db +(1 row) + +-- Update to a nonexistent database +alter task vacuum_db database hopedoesnotexist; +ERROR: database "hopedoesnotexist" does not exist +-- Create a database that does not allow connections +create database task_dbno; +revoke CONNECT on DATABASE task_dbno from PUBLIC; +-- create a test user +create user task_cron with password 'pwd'; +NOTICE: resource queue required -- using default resource queue "pg_default" +-- Create a task for another user +create task another_user_task schedule '* 10 * * *' database task_dbno user task_cron as 'vacuum'; +ERROR: User task_cron does not have CONNECT privilege on task_dbno (job_metadata.c:210) +-- Schedule a task for this user on the database that does not accept connections +alter task vacuum_db database task_dbno user task_cron; +ERROR: User task_cron does not have CONNECT privilege on task_dbno (job_metadata.c:683) +-- Schedule a task whose user does not exist +alter task vacuum_db user hopedoesnotexist; +ERROR: role "hopedoesnotexist" does not exist +-- valid interval tasks +create task valid_task_1 schedule '1 second' as 'select 1'; +create task valid_task_2 schedule ' 30 sEcOnDs ' as 'select 1'; +create task valid_task_3 schedule '59 seconds' as 'select 1'; +create task valid_task_4 schedule '17 seconds ' as 'select 1'; +-- clean up +drop database task_dbno; +drop user task_cron; +drop task vacuum_db; +drop task valid_task_1; +drop task valid_task_2; +drop task valid_task_3; +drop task valid_task_4; diff --git a/src/test/regress/expected/with_clause.out b/src/test/regress/expected/with_clause.out index d366dea9c2e..dbed613c813 100644 --- a/src/test/regress/expected/with_clause.out +++ b/src/test/regress/expected/with_clause.out @@ -2296,7 +2296,7 @@ UNION ALL -> Subquery Scan on y -> Shared Scan (share slice:id 1:0) -> Result - One-Time Filter: (gp_execution_segment() = 0) + One-Time Filter: (gp_execution_segment() = 2) -> Result One-Time Filter: (pg_sleep('1'::double precision) IS NOT NULL) -> Subquery Scan on cte_1 diff --git a/src/test/regress/expected/workfile/hashagg_spill.out b/src/test/regress/expected/workfile/hashagg_spill.out index 5bdd6af63b5..a66314fac6d 100644 --- a/src/test/regress/expected/workfile/hashagg_spill.out +++ b/src/test/regress/expected/workfile/hashagg_spill.out @@ -25,11 +25,12 @@ result = [] for i in range(len(rv)): cur_line = rv[i]['QUERY PLAN'] if search_text.lower() in cur_line.lower(): - p = re.compile('.+\((segment \d+).+ Workfile: \((\d+) spilling\)') + p = re.compile('.+Segments: (\d+).+\((segment \d+).+ Workfile: \((\d+) spilling\)') m = p.match(cur_line) - workfile_created = int(m.group(2)) - cur_row =
int(workfile_created == nsegments) + workfile_created = int(m.group(3)) + cur_row = int(workfile_created == int(m.group(1))) result.append(cur_row) + break return result $$ language plpython3u; diff --git a/src/test/regress/expected/workfile/materialize_spill.out b/src/test/regress/expected/workfile/materialize_spill.out index 1d04161f74c..e2687bbbb96 100644 --- a/src/test/regress/expected/workfile/materialize_spill.out +++ b/src/test/regress/expected/workfile/materialize_spill.out @@ -59,6 +59,9 @@ set enable_nestloop = true; -- ORCA doesn't honor enable_nestloop/enable_hashjoin, so this won't produce -- the kind of plan we're looking for. set optimizer=off; +-- GP_PARALLEL_FIXME: work_mem seems to be affected by parallel mode, thus more spilling +-- happens. Temporarily disable parallel in this case to pass the test. +set enable_parallel=off; -- This is the actual test query. select * FROM test_mat_small as t1 left outer join test_mat_large AS t2 on t1.i1=t2.i2; i1 | i1 | i2 | i3 | i4 | i5 | i6 | i7 | i8 @@ -106,6 +109,7 @@ $$) as n; 0 (1 row) +reset enable_parallel; drop schema materialize_spill cascade; NOTICE: drop cascades to 3 other objects DETAIL: drop cascades to function num_workfiles_created(text) diff --git a/src/test/regress/expected/workfile/sisc_mat_sort.out b/src/test/regress/expected/workfile/sisc_mat_sort.out index ae3aae681e1..d5f96a0a3f5 100644 --- a/src/test/regress/expected/workfile/sisc_mat_sort.out +++ b/src/test/regress/expected/workfile/sisc_mat_sort.out @@ -35,6 +35,7 @@ analyze testsiscm; set statement_mem="3MB"; set gp_resqueue_print_operator_memory_limits=on; set gp_cte_sharing=on; +set max_parallel_workers_per_gather = 0; -- The expected output is very sensitive to the kind of plan this produces. -- We're testing the executor, not the planner, so force ORCA off, to get -- the particular plan @@ -107,6 +108,7 @@ where t1.c1 = t2.c1 and t1.c3 = t2.c3 limit 50000;'); 1 (2 rows) +reset max_parallel_workers_per_gather; drop schema sisc_mat_sort cascade; NOTICE: drop cascades to 2 other objects DETAIL: drop cascades to function is_workfile_created(text) diff --git a/src/test/regress/expected/workfile/sisc_sort_spill.out b/src/test/regress/expected/workfile/sisc_sort_spill.out index ad1c32a1ee5..3aab3fde88e 100644 --- a/src/test/regress/expected/workfile/sisc_sort_spill.out +++ b/src/test/regress/expected/workfile/sisc_sort_spill.out @@ -34,6 +34,7 @@ insert into testsisc select i, i % 1000, i % 100000, i % 75 from set statement_mem="2MB"; set gp_resqueue_print_operator_memory_limits=on; set gp_cte_sharing=on; +set max_parallel_workers_per_gather = 0; -- ORCA optimizes away the ORDER BY in our test query, and therefore doesn't exercise -- a Sort that spills. set optimizer=off; @@ -107,6 +108,7 @@ limit 50000;'); 1 (2 rows) +reset max_parallel_workers_per_gather; drop schema sisc_sort_spill cascade; NOTICE: drop cascades to 2 other objects DETAIL: drop cascades to function is_workfile_created(text) diff --git a/src/test/regress/expected/write_parallel.out b/src/test/regress/expected/write_parallel.out index 66623b8d87e..63de88f0252 100644 --- a/src/test/regress/expected/write_parallel.out +++ b/src/test/regress/expected/write_parallel.out @@ -1,11 +1,7 @@ -- -- PARALLEL -- --- GPDB_96_MERGE_FIXME: We don't support parallel query. These tests won't actually --- generate any parallel plans. Same as in 'select_parallel' test. --- Serializable isolation would disable parallel query, so explicitly use an --- arbitrary other level.
-begin isolation level repeatable read; +begin; -- encourage use of parallel plans set parallel_setup_cost=0; set parallel_tuple_cost=0; @@ -33,6 +29,8 @@ HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sur create table parallel_write as select length(stringu1) from tenk1 group by length(stringu1); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'length' as the Greenplum Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. drop table parallel_write; explain (costs off) select length(stringu1) into parallel_write from tenk1 group by length(stringu1); @@ -52,6 +50,8 @@ HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sur select length(stringu1) into parallel_write from tenk1 group by length(stringu1); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'length' as the Greenplum Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. drop table parallel_write; explain (costs off) create materialized view parallel_mat_view as select length(stringu1) from tenk1 group by length(stringu1); @@ -71,9 +71,13 @@ HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sur create materialized view parallel_mat_view as select length(stringu1) from tenk1 group by length(stringu1); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'length' as the Greenplum Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. create unique index on parallel_mat_view(length); refresh materialized view parallel_mat_view; refresh materialized view concurrently parallel_mat_view; +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'tid' as the Greenplum Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. drop materialized view parallel_mat_view; prepare prep_stmt as select length(stringu1) from tenk1 group by length(stringu1); explain (costs off) create table parallel_write as execute prep_stmt; @@ -92,5 +96,7 @@ HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sur (8 rows) create table parallel_write as execute prep_stmt; +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'length' as the Greenplum Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. drop table parallel_write; rollback; diff --git a/src/test/regress/greenplum_schedule b/src/test/regress/greenplum_schedule index 08a556ab546..e66d9266f17 100755 --- a/src/test/regress/greenplum_schedule +++ b/src/test/regress/greenplum_schedule @@ -250,6 +250,9 @@ test: uao_dml/uao_dml_column test: ao_locks test: freeze_aux_tables +# gp parallel test +test: gp_parallel + # These cannot run in parallel, because they check that VACUUM FULL shrinks table size. 
# A concurrent session could hold back the xid horizon and prevent old tuples from being # removed. diff --git a/src/test/regress/init_file b/src/test/regress/init_file index 8d889b384fe..e2da53a1cea 100644 --- a/src/test/regress/init_file +++ b/src/test/regress/init_file @@ -59,6 +59,7 @@ m/^NOTICE:.*building index for child partition ".*"/ # In case of split partitions we end up creating temp tables to exchange the partitions # E.g exchanged partition "p1" of relation "parttest_t" with relation "pg_temp_4062621" m/^NOTICE:.*exchanged partition ".*" with relation ".*"/ +m/^NOTICE:.*plan parallel aware node count: .*/ m/^WARNING: could not close temporary file .*: No such file or directory/ @@ -118,6 +119,9 @@ s/ERROR: infinite recursion detected.*/ERROR: infinite recursion detected/ m/ERROR: could not find hash function for hash operator.*/ s/ERROR: could not find hash function for hash operator.*/ERROR: could not find hash function for hash operator/ +m/ERROR: could not devise a plan.*/ +s/ERROR: could not devise a plan.*/ERROR: could not devise a plan (cdbpath.c:XXX)/ + m/nodename nor servname provided, or not known/ s/nodename nor servname provided, or not known/Name or service not known/ @@ -141,4 +145,9 @@ s/.//gs m/ERROR: can't split update for inherit table: .*/ s/ERROR: can't split update for inherit table: .*/ERROR: can't split update for inherit table:/ +m/set enable_parallel = off;/ +s/.//gs +m/reset enable_parallel;/ +s/.//gs + -- end_matchsubs diff --git a/src/test/regress/input/dispatch.source b/src/test/regress/input/dispatch.source index 59d573ed8d0..1d37258a539 100644 --- a/src/test/regress/input/dispatch.source +++ b/src/test/regress/input/dispatch.source @@ -204,6 +204,7 @@ set gp_gang_creation_retry_timer to 1000; select cleanupAllGangs(); +set max_parallel_workers_per_gather=0; -- trigger fault and report segment 0 in recovery for 5 times select gp_inject_fault('process_startup_packet', 'skip', '', 'dispatch_test_db', '', 1, 5, 0, 2::smallint); select cleanupAllGangs(); @@ -350,6 +351,7 @@ update dispatch_test_t1 set c2 = 3 from dispatch_test_t2, dispatch_test_t3 where dispatch_test_t1.c2 = dispatch_test_t2.c2 and dispatch_test_t2.c3 = dispatch_test_t3.c3; select gp_inject_fault('before_one_slice_dispatched', 'reset', 1); +reset max_parallel_workers_per_gather; -- test logging of gang management SET gp_log_gang = 'debug'; diff --git a/src/test/regress/input/temp_tablespaces.source b/src/test/regress/input/temp_tablespaces.source index 7d5b1507ec0..3bbd8f2fb37 100644 --- a/src/test/regress/input/temp_tablespaces.source +++ b/src/test/regress/input/temp_tablespaces.source @@ -41,6 +41,9 @@ CREATE TABLE tts_foo (i int, j int) distributed by(i); insert into tts_foo select i, i from generate_series(1,80000)i; ANALYZE tts_foo; set gp_cte_sharing=on; +-- GP_PARALLEL_FIXME: since we disabled shared input scan in parallel mode, sisc_xslice_temp_files +-- will never be triggered. We need to set max_parallel_workers_per_gather to 0 in this case. +set max_parallel_workers_per_gather = 0; -- CASE 1: when temp_tablespaces is set, hashagg and share-input-scan -- should honor the GUC and creates temp files under the specified tablespaces. @@ -131,6 +134,8 @@ select gp_wait_until_triggered_fault('hashagg_spill_temp_files', 1, dbid) select gp_inject_fault('hashagg_spill_temp_files', 'reset', dbid) from gp_segment_configuration where role='p' and content>=0; +-- GP_PARALLEL_FIXME: we need to reset max_parallel_workers_per_gather since we changed it.
+reset max_parallel_workers_per_gather; drop table tts_foo, tts_bar, tts_hashagg; drop tablespace mytempsp0; diff --git a/src/test/regress/output/dispatch.source b/src/test/regress/output/dispatch.source index 96e5d771275..109f6a2e8fa 100644 --- a/src/test/regress/output/dispatch.source +++ b/src/test/regress/output/dispatch.source @@ -288,6 +288,7 @@ select cleanupAllGangs(); t (1 row) +set max_parallel_workers_per_gather=0; -- trigger fault and report segment 0 in recovery for 5 times select gp_inject_fault('process_startup_packet', 'skip', '', 'dispatch_test_db', '', 1, 5, 0, 2::smallint); gp_inject_fault @@ -595,6 +596,7 @@ select gp_inject_fault('before_one_slice_dispatched', 'reset', 1); Success: (1 row) +reset max_parallel_workers_per_gather; -- test logging of gang management SET gp_log_gang = 'debug'; -- test INFO raised from segments with various kinds of fields diff --git a/src/test/regress/output/temp_tablespaces.source b/src/test/regress/output/temp_tablespaces.source index 705464306ae..867cfcf15a6 100644 --- a/src/test/regress/output/temp_tablespaces.source +++ b/src/test/regress/output/temp_tablespaces.source @@ -52,6 +52,9 @@ CREATE TABLE tts_foo (i int, j int) distributed by(i); insert into tts_foo select i, i from generate_series(1,80000)i; ANALYZE tts_foo; set gp_cte_sharing=on; +-- GP_PARALLEL_FIXME: since we disabled shared input scan in parallel mode, sisc_xslice_temp_files +-- will never be triggered. We need to set max_parallel_workers_per_gather to 0 in this case. +set max_parallel_workers_per_gather = 0; -- CASE 1: when temp_tablespaces is set, hashagg and share-input-scan -- should honor the GUC and creates temp files under the specified tablespaces. -- temp_tablespaces will synchronized to all segments @@ -221,6 +224,8 @@ select gp_inject_fault('hashagg_spill_temp_files', 'reset', dbid) Success: (3 rows) +-- GP_PARALLEL_FIXME: we need to reset max_parallel_workers_per_gather since we changed it. +reset max_parallel_workers_per_gather; drop table tts_foo, tts_bar, tts_hashagg; drop tablespace mytempsp0; drop tablespace mytempsp1; diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 54ccd53cfe0..7760160f463 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -101,6 +101,11 @@ test: brin gin gist spgist privileges init_privs security_label collate matview # ---------- test: brin_bloom brin_multi +# ---------- +# Additional Task tests +# ---------- +test: task + # ---------- # Another group of parallel tests # ---------- diff --git a/src/test/regress/pg_regress.c b/src/test/regress/pg_regress.c index d94126044f1..dfed51d3203 100644 --- a/src/test/regress/pg_regress.c +++ b/src/test/regress/pg_regress.c @@ -94,6 +94,7 @@ bool print_failure_diffs_is_enabled = false; bool optimizer_enabled = false; bool resgroup_enabled = false; bool external_fts = false; +bool force_parallel_enabled = false; static _stringlist *loadextension = NULL; static int max_connections = 0; static int max_concurrent_tests = 0; @@ -1888,7 +1889,8 @@ results_differ(const char *testname, const char *resultsfile, const char *defaul { strlcpy(expectfile, default_expectfile, sizeof(expectfile)); } - if (ignore_plans) + + if (ignore_plans || force_parallel_enabled) ignore_plans_opts = " -gpd_ignore_plans"; else ignore_plans_opts = ""; @@ -3393,6 +3395,9 @@ regression_main(int argc, char *argv[], "Resource group enabled. Using resource group answer files whenever possible", "Resource group disabled. 
Using default answer files"); + force_parallel_enabled = check_feature_status("force_parallel_mode", "on", + "Force parallel mode enabled. Result diffs will ignore plans.", + "Force parallel mode disabled. Using default answer files"); /* * Ready to run the tests */ diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index 134d5851cb0..41f4d4a4e61 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -122,6 +122,7 @@ test: replica_identity test: rowsecurity test: object_address test: tablesample +test: task test: appendonly_sample test: aocs_sample test: groupingsets diff --git a/src/test/regress/sql/alter_distribution_policy.sql b/src/test/regress/sql/alter_distribution_policy.sql index 86e8a858483..f6479f3a974 100644 --- a/src/test/regress/sql/alter_distribution_policy.sql +++ b/src/test/regress/sql/alter_distribution_policy.sql @@ -454,3 +454,80 @@ alter table reorg_leaf_1_prt_p0_2_prt_1 set with (reorganize=true) distributed b select *, gp_segment_id from reorg_leaf_1_prt_p0; alter table reorg_leaf_1_prt_p0_2_prt_1 set with (reorganize=true); select *, gp_segment_id from reorg_leaf_1_prt_p0; + +-- +-- Test case for GUC gp_force_random_redistribution. +-- Manually toggling the GUC should control the behavior of redistribution for randomly-distributed tables. +-- But REORGANIZE=true should redistribute no matter what. +-- + +-- this only affects the postgres planner; +set optimizer = false; + +-- Check the distribution difference between 't1' and 't2' after executing 'query_string'. +-- Return true if the data distribution changed, otherwise false. +-- Note: in extremely rare cases, even after 't2' has been randomly redistributed from 't1', they could still have the +-- exact same distribution. So let the tables have a reasonably large number of rows to reduce that possibility.
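+-- As an illustrative sketch only (t_demo is a hypothetical throwaway table, not part of this test): the per-segment profile the function below snapshots is a plain gp_segment_id histogram, and a redistribution shows up as two snapshots that differ. +--   create table t_demo(a int) distributed randomly; +--   insert into t_demo select i from generate_series(1, 1000) i; +--   select gp_segment_id as segid, count(*) as tupcount from t_demo group by gp_segment_id; +--   drop table t_demo;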
+CREATE OR REPLACE FUNCTION check_redistributed(query_string text, t1 text, t2 text) +RETURNS BOOLEAN AS +$$ +DECLARE + before_query TEXT; + after_query TEXT; + comparison_query TEXT; + comparison_count INT; +BEGIN + -- Prepare the query strings + before_query := format('SELECT gp_segment_id as segid, count(*) AS tupcount FROM %I GROUP BY gp_segment_id', t1); + after_query := format('SELECT gp_segment_id as segid, count(*) AS tupcount FROM %I GROUP BY gp_segment_id', t2); + comparison_query := format('SELECT COUNT(*) FROM ((TABLE %I EXCEPT TABLE %I) UNION ALL (TABLE %I EXCEPT TABLE %I))q', 'distribution1', 'distribution2', 'distribution2', 'distribution1'); + + -- Create temp tables to store the result + EXECUTE format('CREATE TEMP TABLE distribution1 AS %s DISTRIBUTED REPLICATED', before_query); + + -- Execute provided query string + EXECUTE query_string; + + EXECUTE format('CREATE TEMP TABLE distribution2 AS %s DISTRIBUTED REPLICATED', after_query); + + -- Compare the tables using EXCEPT clause + EXECUTE comparison_query INTO comparison_count; + + -- Drop temp tables + EXECUTE 'DROP TABLE distribution1'; + EXECUTE 'DROP TABLE distribution2'; + + -- If count is greater than zero, then there's a difference + RETURN comparison_count > 0; +END; +$$ +LANGUAGE plpgsql; + +-- CO table builds temp table first instead of doing CTAS during REORGANIZE=true +create table t_reorganize(a int, b int) using ao_column distributed by (a); +insert into t_reorganize select 0,i from generate_series(1,1000)i; +select gp_segment_id, count(*) from t_reorganize group by gp_segment_id; + +-- first, no redistribution +set gp_force_random_redistribution = off; +select check_redistributed('alter table t_reorganize set with (reorganize=true) distributed randomly', 't_reorganize', 't_reorganize'); +-- reorganize from randomly to randomly should still redistribute +select check_redistributed('alter table t_reorganize set with (reorganize=true) distributed randomly', 't_reorganize', 't_reorganize'); +-- but insert into the table won't redistribute +create table t_random (like t_reorganize) distributed randomly; +select check_redistributed('insert into t_random select * from t_reorganize', 't_reorganize', 't_random'); +-- but insert into a different distribution policy would still redistribute +create table t_distbya (like t_reorganize) distributed by (a); +select check_redistributed('insert into t_distbya select * from t_reorganize', 't_reorganize', 't_distbya'); + +-- now forcing redistribution should redistribute in all cases +set gp_force_random_redistribution = on; +select check_redistributed('alter table t_reorganize set with (reorganize=true) distributed randomly', 't_reorganize', 't_reorganize'); +select check_redistributed('alter table t_reorganize set with (reorganize=true) distributed randomly', 't_reorganize', 't_reorganize'); +-- recreate the target tables from the previous section to avoid duplicate-table errors +drop table t_random; +create table t_random (like t_reorganize) distributed randomly; +select check_redistributed('insert into t_random select * from t_reorganize', 't_reorganize', 't_random'); +drop table t_distbya; +create table t_distbya (like t_reorganize) distributed by (a); +select check_redistributed('insert into t_distbya select * from t_reorganize', 't_reorganize', 't_distbya'); + +reset optimizer; +reset gp_force_random_redistribution; diff --git a/src/test/regress/sql/bfv_dd.sql b/src/test/regress/sql/bfv_dd.sql index 35de6d2a9fb..635c53bf4b0 100644 --- a/src/test/regress/sql/bfv_dd.sql +++ b/src/test/regress/sql/bfv_dd.sql @@ -18,8 +18,10 @@ insert into dd_singlecol_1 values(null, null); analyze dd_singlecol_1; -- ctas tests 
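+-- (Assumption, not stated by the patch: parallel plans would perturb the direct-dispatch plan shapes these tests capture, hence parallel is pinned off around them.)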
+set enable_parallel = off; create table dd_ctas_1 as select * from dd_singlecol_1 where a=1 distributed by (a); create table dd_ctas_2 as select * from dd_singlecol_1 where a is NULL distributed by (a); +reset enable_parallel; select * from dd_ctas_1; select * from dd_ctas_2; @@ -104,7 +106,10 @@ select 'one' from dd_part_singlecol where a=1; select a, 'one' from dd_part_singlecol where a=1; -- group by and sort +-- disable parallel for regress tests +set enable_parallel = off; select a, count(*) from dd_part_singlecol where a=1 group by a; +reset enable_parallel; select a, count(*) from dd_part_singlecol where a=1 group by a order by a; @@ -131,7 +136,9 @@ select * from dd_singlecol_idx where (a=1 or a=2) and b<2; select 'one' from dd_singlecol_idx where (a=1 or a=2) and b=1; +set enable_parallel = off; select a, count(*) from dd_singlecol_idx where (a=1 or a=2) and b=1 group by a; +reset enable_parallel; select count(*) from dd_singlecol_idx; @@ -215,8 +222,10 @@ select 'one' from dd_singlecol_idx where a=1 and b=1; select a+b from dd_singlecol_idx where a=1 and b=1; +set enable_parallel = off; -- group by select a, count(*) from dd_singlecol_idx where a=1 and b=1 group by a; +reset enable_parallel; -- multicol select * from dd_multicol_idx where a=1 and b=1 and c<5; @@ -292,21 +301,27 @@ select 'one' from dd_singlecol_1 where a=1; select a, 'one' from dd_singlecol_1 where a=1; -- group by and sort +set enable_parallel = off; select a, count(*) from dd_singlecol_1 where a=1 group by a; +reset enable_parallel; select a, count(*) from dd_singlecol_1 where a=1 group by a order by a; -- inner joins select * from dd_singlecol_1 t1, dd_singlecol_2 t2 where t1.a=t2.a and t1.a=1; +set enable_parallel = off; select * from dd_singlecol_1 t1, dd_singlecol_2 t2 where t1.a=t2.b and t1.a=1; +reset enable_parallel; select * from dd_singlecol_1 t1, dd_singlecol_2 t2 where t1.b>t2.a and t1.a=1; -- outer joins select * from dd_singlecol_1 t1 left outer join dd_singlecol_2 t2 on (t1.a=t2.a) where t1.a=1; +set enable_parallel = off; select * from dd_singlecol_1 t1 left outer join dd_singlecol_2 t2 on (t1.a=t2.b) where t1.a=1 and t2.b=1; +reset enable_parallel; select * from dd_singlecol_1 t1 left outer join dd_singlecol_2 t2 on (t1.b=t2.b) where t1.a=1; @@ -314,7 +329,9 @@ select * from dd_singlecol_2 t2 left outer join dd_singlecol_1 t1 on (t1.b=t2.b) -- subqueries +set enable_parallel = off; select * from dd_singlecol_1 t1 where a=1 and b < (select count(*) from dd_singlecol_2 t2 where t2.a=t1.a); +reset enable_parallel; select * from dd_singlecol_1 t1 where a=1 and b in (select count(*) from dd_singlecol_2 t2 where t2.a<=t1.a); @@ -331,10 +348,12 @@ select * from dd_singlecol_1 where a>1 and a<5; select * from dd_singlecol_1 where a=1 or b=5; +set enable_parallel = off; -- group by and sort select b, count(*) from dd_singlecol_1 where a=1 group by b; select b, count(*) from dd_singlecol_1 where a=1 group by b order by b; +reset enable_parallel; -- randomly distributed tables create table dd_random(a int, b int) distributed randomly; diff --git a/src/test/regress/sql/bfv_dd_multicolumn.sql b/src/test/regress/sql/bfv_dd_multicolumn.sql index 74974bcc919..18f9df2ce5f 100644 --- a/src/test/regress/sql/bfv_dd_multicolumn.sql +++ b/src/test/regress/sql/bfv_dd_multicolumn.sql @@ -79,8 +79,10 @@ select 'one' from dd_multicol_1 where a=1 and b=1; select a, 'one' from dd_multicol_1 where a=1 and b=1; +set enable_parallel = off; -- group by and sort select a, count(*) from dd_multicol_1 where a=1 and b=1 group by 
a,b; +reset enable_parallel; select a, count(*) from dd_multicol_1 where a=1 and b=1 group by a,b order by a,b; diff --git a/src/test/regress/sql/bfv_partition_plans.sql b/src/test/regress/sql/bfv_partition_plans.sql index 94e4ab9e553..c730d50d8cd 100644 --- a/src/test/regress/sql/bfv_partition_plans.sql +++ b/src/test/regress/sql/bfv_partition_plans.sql @@ -222,7 +222,11 @@ analyze p3; analyze p; -- TEST +-- If parallel is forced, we won't have a Partition Selector, since a parallel join is used instead. +-- We need to disable parallel before running this query. +set enable_parallel to false; select count_operator('select * from (select * from p1 union all select * from p2) as p_all, t where p_all.b=t.b;','Partition Selector'); +reset enable_parallel; select count_operator('select * from (select * from p1 union select * from p2) as p_all, t where p_all.b=t.b;','Partition Selector'); diff --git a/src/test/regress/sql/explain.sql b/src/test/regress/sql/explain.sql index 3f9ae9843a2..ab60391dd3b 100644 --- a/src/test/regress/sql/explain.sql +++ b/src/test/regress/sql/explain.sql @@ -16,6 +16,7 @@ $$ declare ln text; begin + set local enable_parallel = off; for ln in execute $1 loop -- Replace any numeric word with just 'N' @@ -30,6 +31,7 @@ begin CONTINUE WHEN (ln = 'Planning:'); return next ln; end loop; + reset enable_parallel; end; $$; @@ -41,6 +43,7 @@ declare data text := ''; ln text; begin + set local enable_parallel = off; for ln in execute $1 loop -- Replace any numeric word with just '0' @@ -48,6 +51,7 @@ begin data := data || ln; end loop; + reset enable_parallel; return data::jsonb; end; $$; diff --git a/src/test/regress/sql/explain_format.sql b/src/test/regress/sql/explain_format.sql index cf6b72d6883..602891698fd 100644 --- a/src/test/regress/sql/explain_format.sql +++ b/src/test/regress/sql/explain_format.sql @@ -3,8 +3,8 @@ -- s/\(actual time=\d+\.\d+..\d+\.\d+ rows=\d+ loops=\d+\)/(actual time=##.###..##.### rows=# loops=#)/ -- m/\(slice\d+\) Executor memory: (\d+)\w bytes\./ -- s/Executor memory: (\d+)\w bytes\./Executor memory: (#####)K bytes./ --- m/\(slice\d+\) Executor memory: (\d+)\w bytes avg x \d+ workers, \d+\w bytes max \(seg\d+\)\./ --- s/Executor memory: (\d+)\w bytes avg x \d+ workers, \d+\w bytes max \(seg\d+\)\./Executor memory: ####K bytes avg x #### workers, ####K bytes max (seg#)./ +-- m/\(slice\d+\) Executor memory: (\d+)\w bytes avg x \d+(x\(\d+\))* workers, \d+\w bytes max \(seg\d+\)\./ +-- s/Executor memory: (\d+)\w bytes avg x \d+(x\(\d+\))* workers, \d+\w bytes max \(seg\d+\)\./Executor memory: ####K bytes avg x #### workers, ####K bytes max (seg#)./ -- m/Work_mem: \d+\w bytes max\./ -- s/Work_mem: \d+\w bytes max\. 
*/Work_mem: ###K bytes max./ -- m/Execution Time: \d+\.\d+ ms/ @@ -71,6 +71,8 @@ EXPLAIN (ANALYZE) SELECT * from boxes LEFT JOIN apples ON apples.id = boxes.appl -- s/Maximum Memory Used: \d+/Maximum Memory Used: ###/ -- m/Workers: \d+/ -- s/Workers: \d+/Workers: ##/ +-- m/Subworkers: \d+/ +-- s/Subworkers: \d+/Subworkers: ##/ -- m/Average: \d+/ -- s/Average: \d+/Average: ##/ -- m/Total memory used across slices: \d+/ diff --git a/src/test/regress/sql/gp_aggregates.sql b/src/test/regress/sql/gp_aggregates.sql index 3c9605759fe..1704b26c7ce 100644 --- a/src/test/regress/sql/gp_aggregates.sql +++ b/src/test/regress/sql/gp_aggregates.sql @@ -123,9 +123,11 @@ create aggregate mysum_prefunc(int4) ( -- tweak settings to force multistage agg to be used set gp_motion_cost_per_row = 1000; set optimizer_force_multistage_agg = on; +set force_parallel_mode = off; select mysum_prefunc(a::int4) from aggtest; reset gp_motion_cost_per_row; reset optimizer_force_multistage_agg; +reset force_parallel_mode; -- Test an aggregate with 'internal' transition type, and a combine function, diff --git a/src/test/regress/sql/gp_parallel.sql b/src/test/regress/sql/gp_parallel.sql new file mode 100644 index 00000000000..cade5a7a8df --- /dev/null +++ b/src/test/regress/sql/gp_parallel.sql @@ -0,0 +1,471 @@ +-- +-- GP PARALLEL +-- Test GP style parallel plans. +-- GUCs should be set with SET LOCAL, so as not to disturb other parallel plans. +-- Do not use force_parallel_mode, as it ignores plans and checks results only. +-- We want to check plans in this file! +-- If there is a need to do that, set it locally inside a transaction. +-- Set optimizer to off in this file; ORCA does not support parallel. +-- +-- Locus check expression: +-- This is used to quickly cross-check the locus codes in cdbpath_motion_for_parallel_join/cdbpathlocus_parallel_join +-- against the corresponding parallel-join examples. +-- Format: +-- 1_2_3 means locus 1 join locus 2 generates locus 3. +-- 1_P_2_3 means locus 1 Join(with shared hash table) locus 2 generates locus 3. +-- All of these formats describe parallel joins; P implies the join is parallel-aware. +-- +-- The numbers are taken from the CdbLocusType enum. 
+-- 0 CdbLocusType_Null +-- 1 CdbLocusType_Entry +-- 2 CdbLocusType_SingleQE +-- 3 CdbLocusType_General +-- 4 CdbLocusType_SegmentGeneral +-- 5 CdbLocusType_SegmentGeneralWorkers +-- 6 CdbLocusType_OuterQuery +-- 7 CdbLocusType_Replicated +-- 8 CdbLocusType_ReplicatedWorkers +-- 9 CdbLocusType_Hashed +-- 10 CdbLocusType_HashedOJ +-- 11 CdbLocusType_Strewn +-- 12 CdbLocusType_HashedWorkers +-- +-- +set force_parallel_mode = 0; +set optimizer = off; + +create schema test_parallel; +set search_path to test_parallel; + +create table ao1(x int, y int) with(appendonly=true); +create table ao2(x int, y int) with(appendonly=true); +create table aocs1(x int, y int) with(appendonly=true, orientation=column); + +begin; + +-- encourage use of parallel plans +set local min_parallel_table_scan_size = 0; +set local max_parallel_workers_per_gather = 4; +set local enable_parallel = true; + +-- insert multiple segfiles for parallel +set local gp_appendonly_insert_files = 4; + +-- test appendonly table parallel +insert into ao1 select i, i from generate_series(1, 1200000) g(i); +analyze ao1; +insert into ao2 select i%10, i from generate_series(1, 1200000) g(i); +analyze ao2; +select segfilecount from pg_appendonly where relid = 'ao1'::regclass; +explain(costs off) select count(*) from ao1; +select count(*) from ao1; + +-- test aocs table parallel +insert into aocs1 select i, i from generate_series(1, 1200000) g(i); +analyze aocs1; +select segfilecount from pg_appendonly where relid = 'aocs1'::regclass; +explain(costs off) select count(*) from aocs1; +select count(*) from aocs1; + +-- test locus of HashedWorkers can parallel join without motion +explain(locus, costs off) select count(*) from ao1, ao2 where ao1.x = ao2.x; +select count(*) from ao1, ao2 where ao1.x = ao2.x; + +reset enable_parallel; +commit; + +-- +-- test parallel with indices +-- +create index on ao1(y); +create index on aocs1(y); +analyze ao1; +analyze aocs1; + +-- test AO/AOCS should not be IndexScan +begin; +set local enable_parallel = on; +set local enable_seqscan = off; +set local enable_indexscan = on; +set local enable_bitmapscan = on; + +set local max_parallel_workers_per_gather=1; +explain(costs off) select y from ao1 where y > 1000000; +explain(costs off) select y from aocs1 where y > 1000000; +set local max_parallel_workers_per_gather=0; +explain(costs off) select y from ao1 where y > 1000000; +explain(costs off) select y from aocs1 where y > 1000000; +commit; + +drop table ao1; +drop table ao2; +drop table aocs1; + + +-- test gp_appendonly_insert_files doesn't take effect +begin; + +create table t (x int); +insert into t select i from generate_series(1, 1000) i; +set local gp_appendonly_insert_files=4; +set local gp_appendonly_insert_files_tuples_range = 10; + +create table ao1 using ao_row as select * from t; +analyze ao1; +select segfilecount from pg_appendonly where relid='ao1'::regclass; + +create table ao2 with(appendonly=true) as select * from t; +analyze ao2; +select segfilecount from pg_appendonly where relid='ao2'::regclass; + +create table aocs1 using ao_column as select * from t; +analyze aocs1; +select segfilecount from pg_appendonly where relid='aocs1'::regclass; + +create table aocs2 with(appendonly=true, orientation=column) as select * from t; +analyze aocs2; +select segfilecount from pg_appendonly where relid='aocs2'::regclass; + +abort; + +-- test replicated tables parallel +begin; +set local max_parallel_workers_per_gather = 2; +create table t1(a int, b int) with(parallel_workers=2); +create table rt1(a int, b 
int) with(parallel_workers=2) distributed replicated; +create table rt2(a int, b int) distributed replicated; +create table rt3(a int, b int) distributed replicated; +insert into t1 select i, i from generate_series(1, 100000) i; +insert into t1 select i, i+1 from generate_series(1, 10) i; +insert into rt1 select i, i+1 from generate_series(1, 10) i; +insert into rt2 select i, i+1 from generate_series(1, 10000) i; +insert into rt3 select i, i+1 from generate_series(1, 10) i; +analyze t1; +analyze rt1; +analyze rt2; +analyze rt3; + +-- replica parallel select +set local enable_parallel = off; +explain(locus, costs off) select * from rt1; +select * from rt1; +set local enable_parallel = on; +explain(locus, costs off) select * from rt1; +select * from rt1; + +-- replica join replica +set local enable_parallel = off; +select * from rt1 join rt2 on rt2.b = rt1.a; +set local enable_parallel = on; +explain(locus, costs off) select * from rt1 join rt2 on rt2.b = rt1.a; +select * from rt1 join rt2 on rt2.b = rt1.a; +-- +-- ex 5_P_5_5 +-- SegmentGeneralWorkers parallel join SegmentGeneralWorkers when parallel_aware generates SegmentGeneralWorkers locus. +-- +set local min_parallel_table_scan_size = 0; +explain(locus, costs off) select * from rt1 join rt2 on rt2.b = rt1.a; +select * from rt1 join rt2 on rt2.b = rt1.a; + +-- +-- ex 5_4_5 +-- SegmentGeneralWorkers parallel join SegmentGeneral generates SegmentGeneralWorkers locus. +-- +set local enable_parallel_hash = off; +explain(locus, costs off) select * from rt1 join rt2 on rt2.b = rt1.a; +select * from rt1 join rt2 on rt2.b = rt1.a; + +-- +-- t1 join rt1 join rt2 +-- +set local enable_parallel = off; +explain(locus, costs off) select * from rt1 join t1 on rt1.a = t1.b join rt2 on rt2.a = t1.b; +select * from rt1 join t1 on rt1.a = t1.b join rt2 on rt2.a = t1.b; +-- parallel hash join +set local enable_parallel = on; +set local enable_parallel_hash = on; +-- +-- SegmentGeneralWorkers parallel join HashedWorkers when parallel_aware generates HashedWorkers. +-- ex 12_P_5_12 +-- HashedWorkers parallel join SegmentGeneralWorkers when parallel_aware generates HashedWorkers. +-- +explain(locus, costs off) select * from rt1 join t1 on rt1.a = t1.b join rt2 on rt2.a = t1.b; +select * from rt1 join t1 on rt1.a = t1.b join rt2 on rt2.a = t1.b; + +-- +-- t1 join rt1 join rt3 +-- +set local enable_parallel = off; +explain(locus, costs off) select * from rt1 join t1 on rt1.a = t1.b join rt3 on rt3.a = t1.b; +select * from rt1 join t1 on rt1.a = t1.b join rt3 on rt3.a = t1.b; +-- parallel join without parallel hash +set local enable_parallel = on; +set local enable_parallel_hash = off; +-- HashedWorkers parallel join SegmentGeneral generates HashedWorkers. +explain(locus, costs off) select * from rt1 join t1 on rt1.a = t1.b join rt3 on rt3.a = t1.b; +select * from rt1 join t1 on rt1.a = t1.b join rt3 on rt3.a = t1.b; + +create table t2(a int, b int) with(parallel_workers=0); +create table rt4(a int, b int) with(parallel_workers=2) distributed replicated; +insert into t2 select i, i+1 from generate_series(1, 10) i; +insert into rt4 select i, i+1 from generate_series(1, 10000) i; +analyze t2; +analyze rt4; +set local enable_parallel = off; +select * from rt4 join t2 using(b); +set local enable_parallel = on; +set local enable_parallel_hash = off; +-- +-- ex 5_9_12 +-- SegmentGeneralWorkers(w=N) parallel join Hashed(w=0) generates HashedWorkers(w=N).
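+-- Decoded against the enum key in the file header: 5 = SegmentGeneralWorkers, 9 = Hashed, 12 = HashedWorkers; no P, so each worker builds its own hash table.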
+-- +explain(locus, costs off) select * from rt4 join t2 using(b); +select * from rt4 join t2 using(b); + +create table t3(a int, b int) with(parallel_workers=2); +insert into t3 select i, i+1 from generate_series(1, 9000) i; +analyze t3; +set local enable_parallel = off; +select count(*) from rt4 join t3 using(b); +set local enable_parallel = on; +set local enable_parallel_hash = on; +-- +-- ex 5_P_12_12 +-- SegmentGeneralWorkers parallel join HashedWorkers when parallel_aware generates HashedWorkers. +-- +explain(locus, costs off) select * from rt4 join t3 using(b); +select count(*) from rt4 join t3 using(b); + +abort; + + +-- +-- ex 5_11_11 +-- SegmentGeneralWorkers(workers=N) join Strewn(workers=0) without shared hash table. +-- Join locus: Strewn(workers=N). +-- +begin; +create table t_replica_workers_2(a int, b int) with(parallel_workers=2) distributed replicated; +insert into t_replica_workers_2 select i, i+1 from generate_series(1, 10) i; +analyze t_replica_workers_2; +create table t_random_workers_0(a int, b int) with(parallel_workers=0) distributed randomly; +insert into t_random_workers_0 select i, i+1 from generate_series(1, 5) i; +analyze t_random_workers_0; +set local enable_parallel= true; +set local enable_parallel_hash= false; +explain(locus, costs off) select * from t_replica_workers_2 join t_random_workers_0 using(a); +select * from t_replica_workers_2 join t_random_workers_0 using(a); +-- non-parallel results +set local enable_parallel=false; +select * from t_replica_workers_2 join t_random_workers_0 using(a); +abort; + +-- +-- Strewn(workers=N) join SegmentGeneralWorkers(workers=N) with shared hash table. +-- Join locus: Strewn(workers=N). +-- +begin; +create table t_replica_workers_2(a int, b int) with(parallel_workers=2) distributed replicated; +insert into t_replica_workers_2 select i, i+1 from generate_series(1, 10) i; +analyze t_replica_workers_2; +create table t_random_workers_2(a int, b int) with(parallel_workers=2) distributed randomly; +insert into t_random_workers_2 select i, i+1 from generate_series(1, 5) i; +analyze t_random_workers_2; +set local enable_parallel= true; +set local enable_parallel_hash= true; +explain(locus, costs off) select * from t_replica_workers_2 right join t_random_workers_2 using(a); +select * from t_replica_workers_2 right join t_random_workers_2 using(a); +-- non-parallel results +set local enable_parallel=false; +select * from t_replica_workers_2 right join t_random_workers_2 using(a); +abort; + + +-- +-- ex 5_P_11_11 +-- SegmentGeneralWorkers(workers=N) join Strewn(workers=N) with shared hash table. +-- Join locus: Strewn(workers=N). 
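+-- Per the header key: 5 = SegmentGeneralWorkers, 11 = Strewn; the P marks a parallel-aware join, i.e. the build-side hash table is shared across workers.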
+-- +begin; +create table t_replica_workers_2(a int, b int) with(parallel_workers=2) distributed replicated; +insert into t_replica_workers_2 select i, i+1 from generate_series(1, 10) i; +analyze t_replica_workers_2; +create table t_random_workers_2(a int, b int) with(parallel_workers=2) distributed randomly; +insert into t_random_workers_2 select i, i+1 from generate_series(1, 5) i; +analyze t_random_workers_2; +set local enable_parallel= true; +set local enable_parallel_hash= true; +explain(locus, costs off) select * from t_replica_workers_2 join t_random_workers_2 using(a); +select * from t_replica_workers_2 join t_random_workers_2 using(a); +-- non-parallel results +set local enable_parallel=false; +select * from t_replica_workers_2 join t_random_workers_2 using(a); +abort; + +-- +-- Test that the final join path's parallel_workers matches the join locus whose +-- parallel_workers differs from the original outer path's (without motion). +-- +begin; +create table t1(a int, b int) with(parallel_workers=3); +create table t2(b int, a int) with(parallel_workers=2); +insert into t1 select i, i+1 from generate_series(1, 10) i; +insert into t2 select i, i+1 from generate_series(1, 5) i; +analyze t1; +analyze t2; +set local optimizer=off; +set local enable_parallel=on; +set local max_parallel_workers_per_gather= 4; +explain(costs off) select * from t1 right join t2 on t1.b = t2.a; +abort; + +-- +-- Test that the SingleQE locus can participate in a parallel plan. +-- +begin; +create table t1(a int, b int) with(parallel_workers=2); +create table t2(a int, b int) with(parallel_workers=2); +insert into t1 select i%10, i from generate_series(1, 5) i; +insert into t1 values (100000); +insert into t2 select i%10, i from generate_series(1, 100000) i; +analyze t1; +analyze t2; +set local enable_parallel = on; +-- parallel hash join with shared table, SingleQE as outer partial path. +explain(locus, costs off) select * from (select count(*) as a from t2) t2 left join t1 on t1.a = t2.a; +select * from (select count(*) as a from t2) t2 left join t1 on t1.a = t2.a; +set local enable_parallel = off; +select * from (select count(*) as a from t2) t2 left join t1 on t1.a = t2.a; +set local enable_parallel = on; +-- parallel hash join with shared table, SingleQE as inner partial path. +explain(locus, costs off) select * from t1 join (select count(*) as a from t2) t2 on t1.a = t2.a; +select * from t1 join (select count(*) as a from t2) t2 on t1.a = t2.a; +set local enable_parallel = off; +select * from t1 join (select count(*) as a from t2) t2 on t1.a = t2.a; +set local enable_parallel = on; +-- parallel hash join without a shared hash table. 
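+-- (With enable_parallel_hash off the join below can still run in parallel, but each worker builds a private copy of the hash table rather than sharing one.)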
+set local enable_parallel_hash = off; +explain(locus, costs off) select * from t1 join (select count(*) as a from t2) t2 on t1.a = t2.a; +select * from t1 join (select count(*) as a from t2) t2 on t1.a = t2.a; +-- parallel merge join +set local enable_hashjoin = off; +explain(locus, costs off) select * from t1 join (select count(*) as a from t2) t2 on t1.a = t2.a; +select * from t1 join (select count(*) as a from t2) t2 on t1.a = t2.a; +-- parallel nestloop join +set local enable_mergejoin = off; +set local enable_nestloop = on; +explain(locus, costs off) select * from t1 join (select count(*) as a from t2) t2 on t1.a = t2.a; +select * from t1 join (select count(*) as a from t2) t2 on t1.a = t2.a; +-- non-parallel results +set local enable_parallel = off; +select * from t1 join (select count(*) as a from t2) t2 on t1.a = t2.a; +abort; + +begin; +-- use rt1 to generate locus of SegmentGeneralWorkers +-- use rt2 to generate locus of SegmentGeneral +-- use t1 to generate locus of HashedWorkers +-- use t2 to generate locus of Hashed +-- use pg_class to generate locus of Entry +-- use generate_series(1, 1000) to generate locus of General +-- use select count(*) as a from sq1 to generate locus of SingleQE +create table rt1(a int, b int) distributed replicated; +create table rt2(a int, b int) with (parallel_workers = 0) distributed replicated; +create table t1(a int, b int); +create table t2(a int, b int) with (parallel_workers = 0); +insert into t1 select i, i+1 from generate_series(1, 10000) i; +insert into t2 select i, i+1 from generate_series(1, 10000) i; +insert into rt1 select i, i+1 from generate_series(1, 10000) i; +insert into rt2 select i, i+1 from generate_series(1, 10000) i; +CREATE TABLE sq1 AS SELECT a, b FROM t1 WHERE gp_segment_id = 0; +set local optimizer=off; +set local enable_parallel=on; +set local min_parallel_table_scan_size to 0; +set local max_parallel_workers_per_gather= 4; +analyze rt1; +analyze rt2; +analyze t1; +analyze t2; +analyze sq1; + +-- SegmentGeneralWorkers + SegmentGeneralWorkers = SegmentGeneralWorkers +explain (locus, costs off) select * from rt1 union all select * from rt1; +-- SegmentGeneralWorkers + SegmentGeneral = SegmentGeneralWorkers +explain (locus, costs off) select * from rt1 union all select * from rt2; +-- SegmentGeneralWorkers (Converted to Strewn, Limited on One Segment) + HashedWorkers = Strewn +explain (locus, costs off) select * from rt1 union all select * from t1; +-- SegmentGeneralWorkers (Converted to Strewn, Limited on One Segment) + Hashed = Strewn +explain (locus, costs off) select * from rt1 union all select * from t2; +-- SingleQE as a subquery cannot produce a partial_pathlist, so it has no chance to take part in a parallel append. +explain (locus, costs off) select a from rt1 union all select count(*) as a from sq1; +-- SegmentGeneralWorkers + General = SegmentGeneralWorkers +explain (locus, costs off) select a from rt1 union all select a from generate_series(1, 1000) a; +-- Entry as a subquery cannot produce a partial_pathlist, so it has no chance to take part in a parallel append. +-- flaky case: a seqscan on pg_class is expected, but an indexscan is sometimes chosen. 
+set local enable_indexscan = off; +set local enable_indexonlyscan = off; +explain (locus, costs off) select a from rt1 union all select oid as a from pg_class; +abort; + +-- +-- Test two-phase parallel Limit +-- +begin; +create table t1(c1 int, c2 int) with(parallel_workers=2); +insert into t1 select i, i+1 from generate_series(1, 100000) i; +analyze t1; +set local optimizer = off; +set local enable_parallel = on; +explain(costs off) select * from t1 order by c2 asc limit 3 offset 5; +select * from t1 order by c2 asc limit 3 offset 5; +-- non-parallel results +set local enable_parallel = off; +explain(costs off) select * from t1 order by c2 asc limit 3 offset 5; +select * from t1 order by c2 asc limit 3 offset 5; +abort; + +-- +-- Test one-phase Limit with parallel subpath +-- +begin; +create table t1(c1 int, c2 int) with(parallel_workers=2); +insert into t1 select i, i+1 from generate_series(1, 100000) i; +analyze t1; +set local optimizer = off; +set local gp_enable_multiphase_limit = off; +set local enable_parallel = on; +explain(costs off) select * from t1 order by c2 asc limit 3 offset 5; +select * from t1 order by c2 asc limit 3 offset 5; +-- non-parallel results +set local enable_parallel = off; +explain(costs off) select * from t1 order by c2 asc limit 3 offset 5; +select * from t1 order by c2 asc limit 3 offset 5; +abort; +-- +-- Test alter ao/aocs table parallel_workers options +-- +begin; +set local optimizer = off; +set local enable_parallel = on; +-- ao table +create table ao (a INT, b INT) using ao_row; +insert into ao select i as a, i as b from generate_series(1, 100) AS i; +alter table ao set (parallel_workers = 2); +explain(costs off) select count(*) from ao; +select count(*) from ao; +alter table ao reset (parallel_workers); +-- aocs table +create table aocs (a INT, b INT) using ao_column; +insert into aocs select i as a, i as b from generate_series(1, 100) AS i; +alter table aocs set (parallel_workers = 2); +explain(costs off) select count(*) from aocs; +select count(*) from aocs; +alter table aocs reset (parallel_workers); +abort; + +-- start_ignore +drop schema test_parallel cascade; +-- end_ignore + +reset force_parallel_mode; +reset optimizer; diff --git a/src/test/regress/sql/gporca.sql b/src/test/regress/sql/gporca.sql index 3f79bd133fa..b9beab300de 100644 --- a/src/test/regress/sql/gporca.sql +++ b/src/test/regress/sql/gporca.sql @@ -2305,7 +2305,7 @@ INSERT INTO onetimefilter1 SELECT i, i FROM generate_series(1,10)i; INSERT INTO onetimefilter2 SELECT i, i FROM generate_series(1,10)i; ANALYZE onetimefilter1; ANALYZE onetimefilter2; -EXPLAIN WITH abc AS (SELECT onetimefilter1.a, onetimefilter1.b FROM onetimefilter1, onetimefilter2 WHERE onetimefilter1.a=onetimefilter2.a) SELECT (SELECT 1 FROM abc WHERE f1.b = f2.b LIMIT 1), COALESCE((SELECT 2 FROM abc WHERE f1.a=random() AND f1.a=2), 0), (SELECT b FROM abc WHERE b=f1.b) FROM onetimefilter1 f1, onetimefilter2 f2 WHERE f1.b = f2.b; +EXPLAIN (COSTS OFF) WITH abc AS (SELECT onetimefilter1.a, onetimefilter1.b FROM onetimefilter1, onetimefilter2 WHERE onetimefilter1.a=onetimefilter2.a) SELECT (SELECT 1 FROM abc WHERE f1.b = f2.b LIMIT 1), COALESCE((SELECT 2 FROM abc WHERE f1.a=random() AND f1.a=2), 0), (SELECT b FROM abc WHERE b=f1.b) FROM onetimefilter1 f1, onetimefilter2 f2 WHERE f1.b = f2.b; WITH abc AS (SELECT onetimefilter1.a, onetimefilter1.b FROM onetimefilter1, onetimefilter2 WHERE onetimefilter1.a=onetimefilter2.a) SELECT (SELECT 1 FROM abc WHERE f1.b = f2.b LIMIT 1), COALESCE((SELECT 2 FROM abc WHERE 
f1.a=random() AND f1.a=2), 0), (SELECT b FROM abc WHERE b=f1.b) FROM onetimefilter1 f1, onetimefilter2 f2 WHERE f1.b = f2.b; @@ -3426,7 +3426,7 @@ CREATE TABLE dist_tab_a (a varchar(15)) DISTRIBUTED BY(a); INSERT INTO dist_tab_a VALUES('1 '), ('2 '), ('3 '); CREATE TABLE dist_tab_b (a char(15), b bigint) DISTRIBUTED BY(a); INSERT INTO dist_tab_b VALUES('1 ', 1), ('2 ', 2), ('3 ', 3); -EXPLAIN CREATE TABLE result_tab AS +EXPLAIN(COSTS OFF) CREATE TABLE result_tab AS (SELECT a.a, b.b FROM dist_tab_a a LEFT JOIN dist_tab_b b ON a.a=b.a) DISTRIBUTED BY(a); CREATE TABLE result_tab AS (SELECT a.a, b.b FROM dist_tab_a a LEFT JOIN dist_tab_b b ON a.a=b.a) DISTRIBUTED BY(a); diff --git a/src/test/regress/sql/guc_gp.sql b/src/test/regress/sql/guc_gp.sql index ee00e98b872..1f814252063 100644 --- a/src/test/regress/sql/guc_gp.sql +++ b/src/test/regress/sql/guc_gp.sql @@ -269,3 +269,24 @@ drop table public.restore_guc_test; reset search_path; SELECT gp_inject_fault('change_string_guc', 'reset', 1); SELECT gp_inject_fault('restore_string_guc', 'reset', 1); + +-- enabling gp_force_random_redistribution makes sure random redistribution happens +-- only relevant to postgres optimizer +set optimizer = false; + +create table t1_dist_rand(a int) distributed randomly; +create table t2_dist_rand(a int) distributed randomly; +create table t_dist_hash(a int) distributed by (a); + +-- with the GUC turned off, redistribution won't happen (no redistribution motion) +set gp_force_random_redistribution = false; +explain (costs off) insert into t2_dist_rand select * from t1_dist_rand; +explain (costs off) insert into t2_dist_rand select * from t_dist_hash; + +-- with the GUC turned on, redistribution would happen +set gp_force_random_redistribution = true; +explain (costs off) insert into t2_dist_rand select * from t1_dist_rand; +explain (costs off) insert into t2_dist_rand select * from t_dist_hash; + +reset gp_force_random_redistribution; +reset optimizer; diff --git a/src/test/regress/sql/incremental_sort.sql b/src/test/regress/sql/incremental_sort.sql index 3eb0952f9fd..5cdfbab2eba 100644 --- a/src/test/regress/sql/incremental_sort.sql +++ b/src/test/regress/sql/incremental_sort.sql @@ -140,7 +140,9 @@ analyze t; explain (costs off) select * from (select * from t order by a) s order by a, b limit 55; select * from (select * from t order by a) s order by a, b limit 55; -- Test EXPLAIN ANALYZE with only a fullsort group. +set max_parallel_workers_per_gather = 0; select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 55'); +reset max_parallel_workers_per_gather; select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 55')); select explain_analyze_inc_sort_nodes_verify_invariants('select * from (select * from t order by a) s order by a, b limit 55'); delete from t; @@ -172,7 +174,9 @@ explain (costs off) select * from t left join (select * from (select * from t or select * from t left join (select * from (select * from t order by a) v order by a, b) s on s.a = t.a where t.a in (1, 2); rollback; -- Test EXPLAIN ANALYZE with both fullsort and presorted groups. 
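+-- (Assumption: parallel workers would add per-worker detail to the EXPLAIN ANALYZE output that the helper does not normalize, so the GUC is pinned to zero here.)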
+set max_parallel_workers_per_gather = 0; select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 70'); +reset max_parallel_workers_per_gather; select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 70')); select explain_analyze_inc_sort_nodes_verify_invariants('select * from (select * from t order by a) s order by a, b limit 70'); delete from t; diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index ed93e6c10bc..3f2d7479b10 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -549,7 +549,7 @@ insert into mcrparted5 values (30, 21, 20); insert into mcrparted4 values (30, 21, 20); -- error -- check rows -select tableoid::regclass::text, * from mcrparted order by 1; +select tableoid::regclass::text, * from mcrparted order by 1, 2, 3; -- cleanup drop table mcrparted; diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql index b452d41b29f..f9e3b1f2013 100644 --- a/src/test/regress/sql/partition_prune.sql +++ b/src/test/regress/sql/partition_prune.sql @@ -484,6 +484,7 @@ $$ declare ln text; begin + set local enable_parallel = off; for ln in execute format('explain (analyze, costs off, summary off, timing off) %s', $1) @@ -493,6 +494,7 @@ begin ln := regexp_replace(ln, 'Rows Removed by Filter: \d+', 'Rows Removed by Filter: N'); return next ln; end loop; + reset enable_parallel; end; $$; diff --git a/src/test/regress/sql/pg_stat.sql b/src/test/regress/sql/pg_stat.sql index 105e29c27c1..d9fc37850b0 100644 --- a/src/test/regress/sql/pg_stat.sql +++ b/src/test/regress/sql/pg_stat.sql @@ -1,4 +1,5 @@ set optimizer to off; +set max_parallel_workers_per_gather=0; drop table if exists pg_stat_test; create table pg_stat_test(a int); @@ -54,3 +55,4 @@ select from pg_stat_user_indexes where relname = 'pg_stat_test'; reset optimizer; +reset max_parallel_workers_per_gather; diff --git a/src/test/regress/sql/qp_dml_joins.sql b/src/test/regress/sql/qp_dml_joins.sql index b7a557f9b78..6129c9d2398 100644 --- a/src/test/regress/sql/qp_dml_joins.sql +++ b/src/test/regress/sql/qp_dml_joins.sql @@ -225,10 +225,12 @@ INSERT INTO dml_heap_check_p SELECT i, 'p','p', i FROM generate_series(1,100)i; INSERT INTO dml_heap_check_p VALUES(1,'pn','pn',NULL),(2,'pn','pn',NULL),(3,'pn','pn',NULL),(4,'pn','pn',NULL),(5,'pn','pn',NULL); +set enable_parallel = off; CREATE TABLE dml_heap_r (a int , b int default -1, c text) DISTRIBUTED BY (a); CREATE TABLE dml_heap_p (a numeric, b decimal , c boolean , d text , e int) DISTRIBUTED BY (a,b); CREATE TABLE dml_heap_s as select dml_heap_r.b, dml_heap_r.a, dml_heap_r.c from dml_heap_r, dml_heap_p WHERE dml_heap_r.a = dml_heap_p.a; ALTER TABLE dml_heap_s SET DISTRIBUTED BY (b); +reset enable_parallel; INSERT INTO dml_heap_p SELECT id * 1.012, id * 1.1, true, 'new', id as d FROM (SELECT * FROM generate_series(1,100) as id) AS x; INSERT INTO dml_heap_p VALUES(generate_series(1,10),NULL,false,'pn',NULL); diff --git a/src/test/regress/sql/qp_misc.sql b/src/test/regress/sql/qp_misc.sql index 6970bcbfeed..7aacbfd5268 100644 --- a/src/test/regress/sql/qp_misc.sql +++ b/src/test/regress/sql/qp_misc.sql @@ -13135,6 +13135,7 @@ select rnum, lower(c1) from tlth where rnum=47 group by f1,f2 ) Q ) P; +set max_parallel_workers_per_gather=0; -- SelectThaiColumnOrderByLocal_p1 select 'SelectThaiColumnOrderByLocal_p1' test_name_part, case when c = 1 then 1 else 0 end pass_ind 
from ( select count(distinct c) c from ( @@ -13218,6 +13219,7 @@ select rnum, c1 from tlth where rnum <> 38 group by f1,f2 ) Q ) P; +reset max_parallel_workers_per_gather; -- SelectThaiColumnWhere_p1 select 'SelectThaiColumnWhere_p1' test_name_part, case when c = 1 then 1 else 0 end pass_ind from ( select count(distinct c) c from ( diff --git a/src/test/regress/sql/qp_misc_rio.sql b/src/test/regress/sql/qp_misc_rio.sql index 86acf6b5afc..414cbef2f17 100644 --- a/src/test/regress/sql/qp_misc_rio.sql +++ b/src/test/regress/sql/qp_misc_rio.sql @@ -54,6 +54,7 @@ group by 1,b.revenue; -- ---------------------------------------------------------------------- +set enable_parallel = off; CREATE TABLE testtable0000 AS SELECT spend, row_number() OVER (PARTITION BY 0) AS i, (spend % 2) AS r FROM (select generate_series(1,10) as spend) x DISTRIBUTED RANDOMLY; @@ -179,7 +180,6 @@ SELECT s4, COUNT(*) FROM testtable0005 GROUP BY s4 order by s4; - CREATE VIEW testtable0006 AS SELECT testtable0005.*, miro_foo.s5_xform @@ -192,6 +192,7 @@ JOIN (SELECT s5, ON testtable0005.s5 = miro_foo.s5; SELECT MIN(s5_xform), MIN(s5_xform) FROM testtable0006; +reset enable_parallel; diff --git a/src/test/regress/sql/qp_query_execution.sql b/src/test/regress/sql/qp_query_execution.sql index 0660c099ec4..d5c59a7a6db 100644 --- a/src/test/regress/sql/qp_query_execution.sql +++ b/src/test/regress/sql/qp_query_execution.sql @@ -7,6 +7,7 @@ set search_path to qp_query_execution; create language plpython3u; create or replace function qx_count_operator(query text, planner_operator text, optimizer_operator text) returns int as $$ +plpy.execute('set max_parallel_workers_per_gather=0') rv = plpy.execute('EXPLAIN '+ query) plan = '\n'.join([row['QUERY PLAN'] for row in rv]) optimizer = plan.find('Pivotal Optimizer (GPORCA)') diff --git a/src/test/regress/sql/qp_targeted_dispatch.sql b/src/test/regress/sql/qp_targeted_dispatch.sql index 7029e6f37d7..7a23088e42b 100644 --- a/src/test/regress/sql/qp_targeted_dispatch.sql +++ b/src/test/regress/sql/qp_targeted_dispatch.sql @@ -316,7 +316,9 @@ distributed by (key); insert into mpp7620 values (200, 'horse'); -- enable printing of printing info set test_print_direct_dispatch_info=on; +set enable_parallel = off; Create table zoompp7620 as select * from mpp7620 where key=200; +reset enable_parallel; insert into mpp7620 values (200, 200); insert into zoompp7620 select * from mpp7620 where key=200; insert into zoompp7620(key) select key from mpp7620 where mpp7620.key=200; @@ -341,6 +343,7 @@ set test_print_direct_dispatch_info=on; --Check to see distributed vs distributed randomly alter table table_a set distributed randomly; +set enable_parallel = off; select max(a0) from table_a where a0=3; alter table table_a set distributed by (a0); explain select * from table_a where a0=3; @@ -348,6 +351,7 @@ explain select a0 from table_a where a0 in (select max(a1) from table_a); select a0 from table_a where a0 in (select max(a1) from table_a); select max(a1) from table_a; select max(a0) from table_a where a0=1; +reset enable_parallel; explain select a0 from table_a where a0 in (select max(a1) from table_a where a0=1); reset test_print_direct_dispatch_info; diff --git a/src/test/regress/sql/query_finish_pending.sql b/src/test/regress/sql/query_finish_pending.sql index eff997d47fc..220fd4c35e5 100644 --- a/src/test/regress/sql/query_finish_pending.sql +++ b/src/test/regress/sql/query_finish_pending.sql @@ -31,8 +31,10 @@ select gp_inject_fault('execsort_sort_bounded_heap', 'reset', 2); -- set 
QueryFinishPending=true in sort_bounded_heap. This will stop sort. select gp_inject_fault('execsort_sort_bounded_heap', 'finish_pending', 2); +set enable_parallel = off; -- return results although sort will be interrupted in one of the segments select i1 from _tmp_table order by i2 limit 3; +reset enable_parallel; select gp_inject_fault('execsort_sort_bounded_heap', 'status', 2); @@ -55,6 +57,8 @@ select gp_inject_fault('execshare_input_next', 'reset', 2); -- This will eagerly free the memory context of shared input scan's child node. select gp_inject_fault('execshare_input_next', 'finish_pending', 2); +set enable_parallel = off; + with cte as (select i2 from testsisc order by i2) select * from cte c1, cte c2 limit 2; @@ -74,6 +78,8 @@ select * from cte c1, cte c2 limit 2; select gp_inject_fault('execshare_input_next', 'status', 2); +reset enable_parallel; + -- Disable faultinjectors select gp_inject_fault('execsort_sort_mergeruns', 'reset', 2); select gp_inject_fault('execsort_dumptuples', 'reset', 2); diff --git a/src/test/regress/sql/rangefuncs_cdb.sql b/src/test/regress/sql/rangefuncs_cdb.sql index c2ce59615b9..0b3dc239adb 100644 --- a/src/test/regress/sql/rangefuncs_cdb.sql +++ b/src/test/regress/sql/rangefuncs_cdb.sql @@ -1,4 +1,8 @@ -SELECT name, setting FROM pg_settings WHERE name LIKE 'enable%'; +-- +-- This file runs both in parallel mode (enable_parallel=on) and in non-parallel mode. +-- Filter this GUC out to pass regression. +-- +SELECT name, setting FROM pg_settings WHERE name LIKE 'enable%' and name != 'enable_parallel'; -- start_ignore create schema rangefuncs_cdb; set search_path to rangefuncs_cdb, public; diff --git a/src/test/regress/sql/segspace.sql b/src/test/regress/sql/segspace.sql index 38334ac122b..aa4d1df0d27 100644 --- a/src/test/regress/sql/segspace.sql +++ b/src/test/regress/sql/segspace.sql @@ -1,6 +1,8 @@ -- -- Tests the spill files disk space accounting mechanism -- +-- GPDB_PARALLEL_FIXME: it's hard to make fault injection work with parallel processes. +set enable_parallel = false; -- check segspace before test reset statement_mem; @@ -41,6 +43,21 @@ begin; SELECT t1.* FROM segspace_test_hj_skew AS t1, segspace_test_hj_skew AS t2 WHERE t1.i1=t2.i2; rollback; +-- +-- GPDB parallel once got errors like: +-- could not read from shared tuplestore temporary file: read only 0 of 8 bytes from file. +-- Enable parallel here to test it. +-- +begin; +set local enable_parallel = true; +set local optimizer=off; +set local min_parallel_table_scan_size=0; +set local min_parallel_index_scan_size = 0; +set local force_parallel_mode=1; +EXPLAIN(COSTS OFF) SELECT t1.* FROM segspace_test_hj_skew AS t1, segspace_test_hj_skew AS t2 WHERE t1.i1=t2.i2; +SELECT count(t1.*) FROM segspace_test_hj_skew AS t1, segspace_test_hj_skew AS t2 WHERE t1.i1=t2.i2; +rollback; + select gp_inject_fault('exec_hashjoin_new_batch', 'status', 2); -- check used segspace after test @@ -327,3 +344,8 @@ $func$ language plpgsql; select workset_cleanup_test(); drop table segspace_test_hj_skew; + +reset enable_parallel; + +-- don't disturb other processes. +select cleanupAllGangs(); diff --git a/src/test/regress/sql/select_parallel.sql b/src/test/regress/sql/select_parallel.sql index 94b81940962..fbd94c31a64 100644 --- a/src/test/regress/sql/select_parallel.sql +++ b/src/test/regress/sql/select_parallel.sql @@ -1,11 +1,10 @@ -- -- PARALLEL +-- We have GP-style parallel now; run this file in parallel mode. -- --- GPDB_96_MERGE_FIXME: We don't support parallel query. 
These tests won't actually --generate any parallel plans. Should we pay attention to the parallel restrictions --when creating MPP plans? For example, should we force parallel restricted functions --to run in the QD? +set enable_parallel = on; +set optimizer = off; create function sp_parallel_restricted(int) returns int as $$begin return $1; end$$ language plpgsql parallel restricted; @@ -242,7 +241,10 @@ begin end loop; end; $$; +-- test the sort stats plan; disable parallel +set max_parallel_workers_per_gather = 0; select * from explain_parallel_sort_stats(); +reset max_parallel_workers_per_gather; reset enable_indexscan; reset enable_hashjoin; @@ -400,7 +402,8 @@ explain (costs off) -- to increase the parallel query test coverage SAVEPOINT settings; SET LOCAL force_parallel_mode = 1; -EXPLAIN (analyze, timing off, summary off, costs off) SELECT * FROM tenk1; +-- GPDB_PARALLEL_FIXME: actual rows reported by ANALYZE may differ across repeated runs. +EXPLAIN (timing off, summary off, costs off) SELECT * FROM tenk1; ROLLBACK TO SAVEPOINT settings; -- provoke error in worker @@ -463,3 +466,5 @@ SELECT 1 FROM tenk1_vw_sec rollback; +reset enable_parallel; +reset optimizer; diff --git a/src/test/regress/sql/shared_scan.sql b/src/test/regress/sql/shared_scan.sql index d1922d7503f..7234cef6e4a 100644 --- a/src/test/regress/sql/shared_scan.sql +++ b/src/test/regress/sql/shared_scan.sql @@ -48,12 +48,13 @@ SET statement_timeout = '15s'; RESET statement_timeout; -SELECT *, +SELECT COUNT(*) +FROM (SELECT *, ( WITH cte AS (SELECT * FROM jazz WHERE jazz.e = bar.c) SELECT 1 FROM cte c1, cte c2 ) - FROM bar; + FROM bar) as s; CREATE TABLE t1 (a int, b int); CREATE TABLE t2 (a int); @@ -93,3 +94,29 @@ $$; -- This should only ERROR and should not SIGSEGV SELECT col_mismatch_func2(); + +-- https://github.com/greenplum-db/gpdb/issues/12701 +-- Disable cte sharing in subquery +drop table if exists pk_list; +create table pk_list (id int, schema_name varchar, table_name varchar) distributed by (id); +drop table if exists calender; +create table calender (id int, data_hour timestamp) distributed by (id); + +explain (costs off) +with + tbls as (select distinct schema_name, table_name as table_nm from pk_list), + tbls_daily_report_23 as (select unnest(string_to_array('mart_cm.card' ,',')) as table_nm_23), + tbls_w_onl_actl_data as (select unnest(string_to_array('mart_cm.cont_resp,mart_cm.card', ',')) as table_nm_onl_act) +select data_hour, stat.schema_name as schema_nm, dt.table_nm +from ( + select * from calender c + cross join tbls +) dt +inner join ( + select tbls.schema_name, tbls.table_nm as table_name + from tbls tbls +) stat on dt.table_nm = stat.table_name +where + (data_hour = date_trunc('day',data_hour) and stat.schema_name || '.' ||stat.table_name not in (select table_nm_23 from tbls_daily_report_23)) + and (stat.schema_name || '.' ||stat.table_name not in (select table_nm_onl_act from tbls_w_onl_actl_data)) + or (stat.schema_name || '.' 
||stat.table_name in (select table_nm_onl_act from tbls_w_onl_actl_data)); diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql index e5b8fac1066..f005f2c2957 100644 --- a/src/test/regress/sql/stats_ext.sql +++ b/src/test/regress/sql/stats_ext.sql @@ -25,6 +25,7 @@ declare tmp text[]; first_row bool := true; begin + set local enable_parallel = off; for ln in execute format('explain analyze %s', $1) loop @@ -34,6 +35,7 @@ begin return query select tmp[1]::int, tmp[2]::int; end if; end loop; + reset enable_parallel; end; $$; diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index ecf25a42e24..308f11baf5c 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -831,6 +831,7 @@ create function explain_sq_limit() returns setof text language plpgsql as $$ declare ln text; begin + set local enable_parallel=off; for ln in explain (analyze, summary off, timing off, costs off) select * from (select pk,c2 from sq_limit order by c1,pk) as x limit 3 @@ -838,6 +839,7 @@ begin ln := regexp_replace(ln, 'Memory: \S*', 'Memory: xxx'); return next ln; end loop; + reset enable_parallel; end; $$; diff --git a/src/test/regress/sql/sysviews.sql b/src/test/regress/sql/sysviews.sql index b24816e3d5a..3891a767df0 100644 --- a/src/test/regress/sql/sysviews.sql +++ b/src/test/regress/sql/sysviews.sql @@ -48,7 +48,8 @@ select count(*) = 0 as ok from pg_stat_wal_receiver; -- This is to record the prevailing planner enable_foo settings during -- a regression test run. -select name, setting from pg_settings where name like 'enable%'; +-- GP parallel tests run this again with enable_parallel=on; filter that GUC out to pass regression. +select name, setting from pg_settings where name like 'enable%' and name != 'enable_parallel'; -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. 
We can't test their contents in any great detail diff --git a/src/test/regress/sql/task.sql b/src/test/regress/sql/task.sql new file mode 100644 index 00000000000..217b265daf9 --- /dev/null +++ b/src/test/regress/sql/task.sql @@ -0,0 +1,65 @@ +-- Vacuum every day at 10:00am (GMT) +create task vacuum_db SCHEDULE '0 10 * * *' AS 'vacuum'; + +-- Stop scheduling a task +drop task vacuum_db; + +-- Invalid input: missing parts +create task missing_parts schedule '* * * *' as 'select 1'; + +-- Invalid input: trailing characters +create task trail_char schedule '5 secondc' as 'select 1'; +create task trail_char schedule '50 seconds c' as 'select 1'; + +-- Invalid input: seconds out of range +create task invalid_seconds schedule '-1 seconds' as 'select 1'; +create task invalid_seconds schedule '0 seconds' as 'select 1'; +create task invalid_seconds schedule '60 seconds' as 'select 1'; +create task invalid_seconds schedule '1000000000000 seconds' as 'select 1'; + +-- Vacuum every day at 10:00am (GMT) +create task vacuum_db SCHEDULE '0 10 * * *' AS 'vacuum'; +select schedule, command, active, jobname from pg_task order by jobid; + +-- Make that 11:00am (GMT) +alter task vacuum_db schedule '0 11 * * *'; +select schedule, command, active, jobname from pg_task order by jobid; + +-- Make that VACUUM FULL +alter task vacuum_db as 'vacuum full'; +select schedule, command, active, jobname from pg_task order by jobid; + +-- Update to a non-existent database +alter task vacuum_db database hopedoesnotexist; + +-- Create a database that does not allow connection +create database task_dbno; +revoke CONNECT on DATABASE task_dbno from PUBLIC; + +-- create a test user +create user task_cron with password 'pwd'; + +-- Create a task for another user +create task another_user_task schedule '* 10 * * *' database task_dbno user task_cron as 'vacuum'; + +-- Schedule a task for this user on the database that does not accept connections +alter task vacuum_db database task_dbno user task_cron; + +-- Schedule a task for a user that does not exist +alter task vacuum_db user hopedoesnotexist; + +-- valid interval tasks +create task valid_task_1 schedule '1 second' as 'select 1'; +create task valid_task_2 schedule ' 30 sEcOnDs ' as 'select 1'; +create task valid_task_3 schedule '59 seconds' as 'select 1'; +create task valid_task_4 schedule '17 seconds ' as 'select 1'; + +-- clean up +drop database task_dbno; +drop user task_cron; + +drop task vacuum_db; +drop task valid_task_1; +drop task valid_task_2; +drop task valid_task_3; +drop task valid_task_4; diff --git a/src/test/regress/sql/workfile/hashagg_spill.sql b/src/test/regress/sql/workfile/hashagg_spill.sql index 502274fb928..f9642fa9368 100644 --- a/src/test/regress/sql/workfile/hashagg_spill.sql +++ b/src/test/regress/sql/workfile/hashagg_spill.sql @@ -28,11 +28,12 @@ result = [] for i in range(len(rv)): cur_line = rv[i]['QUERY PLAN'] if search_text.lower() in cur_line.lower(): - p = re.compile('.+\((segment \d+).+ Workfile: \((\d+) spilling\)') + p = re.compile('.+Segments: (\d+).+\((segment \d+).+ Workfile: \((\d+) spilling\)') m = p.match(cur_line) - workfile_created = int(m.group(2)) - cur_row = int(workfile_created == nsegments) + workfile_created = int(m.group(3)) + cur_row = int(workfile_created == int(m.group(1))) result.append(cur_row) + break return result $$ language plpython3u; diff --git a/src/test/regress/sql/workfile/materialize_spill.sql b/src/test/regress/sql/workfile/materialize_spill.sql index 3484ec5b79d..bb02936a969 100--- 
diff --git a/src/test/regress/sql/workfile/materialize_spill.sql b/src/test/regress/sql/workfile/materialize_spill.sql
index 3484ec5b79d..bb02936a969 100644
--- a/src/test/regress/sql/workfile/materialize_spill.sql
+++ b/src/test/regress/sql/workfile/materialize_spill.sql
@@ -60,6 +60,9 @@ set enable_nestloop = true;
 -- ORCA doesn't honor enable_nestloop/enable_hashjoin, so this won't produce
 -- the kind of plan we're looking for.
 set optimizer=off;
+-- GP_PARALLEL_FIXME: work_mem seems to be affected by parallel mode, causing
+-- more spilling. Temporarily disable parallel here to pass the test.
+set enable_parallel=off;
 
 -- This is the actual test query.
 select * FROM test_mat_small as t1 left outer join test_mat_large AS t2 on t1.i1=t2.i2;
@@ -83,4 +86,5 @@ from num_workfiles_created($$
   select * FROM test_mat_small as t1 left outer join test_mat_large AS t2 on t1.i1=t2.i2 limit 10
 $$) as n;
 
+reset enable_parallel;
 drop schema materialize_spill cascade;
diff --git a/src/test/regress/sql/workfile/sisc_mat_sort.sql b/src/test/regress/sql/workfile/sisc_mat_sort.sql
index d3f32567514..f548bca7d06 100644
--- a/src/test/regress/sql/workfile/sisc_mat_sort.sql
+++ b/src/test/regress/sql/workfile/sisc_mat_sort.sql
@@ -38,6 +38,7 @@ analyze testsiscm;
 set statement_mem="3MB";
 set gp_resqueue_print_operator_memory_limits=on;
 set gp_cte_sharing=on;
+set max_parallel_workers_per_gather = 0;
 -- The expected output is very sensitive to the kind of plan this produces.
 -- We're testing the executor, not the planner, so force ORCA off, to get
 -- the particular plan
@@ -79,4 +80,5 @@ select * from ctesisc as t1, ctesisc as t2
 where t1.c1 = t2.c1 and t1.c3 = t2.c3
 limit 50000;');
 
+reset max_parallel_workers_per_gather;
 drop schema sisc_mat_sort cascade;
diff --git a/src/test/regress/sql/workfile/sisc_sort_spill.sql b/src/test/regress/sql/workfile/sisc_sort_spill.sql
index 632715fa3b4..ad1731bb691 100644
--- a/src/test/regress/sql/workfile/sisc_sort_spill.sql
+++ b/src/test/regress/sql/workfile/sisc_sort_spill.sql
@@ -36,6 +36,7 @@ insert into testsisc select i, i % 1000, i % 100000, i % 75 from
 set statement_mem="2MB";
 set gp_resqueue_print_operator_memory_limits=on;
 set gp_cte_sharing=on;
+set max_parallel_workers_per_gather = 0;
 -- ORCA optimizes away the ORDER BY in our test query, and therefore doesn't exercise
 -- a Sort that spills.
 set optimizer=off;
@@ -82,4 +83,5 @@ select * from sisc_sort_spill.is_workfile_created('explain (analyze, verbose)
 where t1.i1 = t2.i2
 limit 50000;');
 
+reset max_parallel_workers_per_gather;
 drop schema sisc_sort_spill cascade;
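The workfile tests above all follow the same recipe: cap memory, switch off both parallel mechanisms so the number of spilling segments is deterministic, run the spill-sensitive query, then restore the settings. A condensed sketch of that pattern, not taken verbatim from any one test:

    set statement_mem='2MB';
    set max_parallel_workers_per_gather = 0;  -- upstream PostgreSQL parallelism
    set enable_parallel = off;                -- Cloudberry parallel plans
    explain (analyze, verbose) select count(*) from pg_class;
    reset enable_parallel;
    reset max_parallel_workers_per_gather;
    reset statement_mem;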
diff --git a/src/test/regress/sql/write_parallel.sql b/src/test/regress/sql/write_parallel.sql
index e71716a8ce2..ae660dc2265 100644
--- a/src/test/regress/sql/write_parallel.sql
+++ b/src/test/regress/sql/write_parallel.sql
@@ -2,12 +2,7 @@
 -- PARALLEL
 --
 
--- GPDB_96_MERGE_FIXME: We don't support parallel query. These tests won't actually
--- generate any parallel plans. Same as in 'select_parallel' test.
-
--- Serializable isolation would disable parallel query, so explicitly use an
--- arbitrary other level.
-begin isolation level repeatable read;
+begin;
 
 -- encourage use of parallel plans
 set parallel_setup_cost=0;
diff --git a/src/tools/msvc/Mkvcbuild.pm b/src/tools/msvc/Mkvcbuild.pm
index 838bcda919f..fa02b959a73 100644
--- a/src/tools/msvc/Mkvcbuild.pm
+++ b/src/tools/msvc/Mkvcbuild.pm
@@ -133,19 +133,21 @@ sub mkvcbuild
 	  archive.c base64.c checksum_helper.c config_info.c controldata_utils.c
 	  d2s.c encnames.c exec.c f2s.c file_perm.c file_utils.c hashfn.c ip.c
 	  jsonapi.c
-	  keywords.c kwlookup.c link-canary.c md5_common.c
+	  keywords.c kmgr_utils.c kwlookup.c link-canary.c md5_common.c
 	  pg_get_line.c pg_lzcompress.c pgfnames.c psprintf.c relpath.c rmtree.c
 	  saslprep.c scram-common.c string.c stringinfo.c unicode_norm.c username.c
 	  wait_error.c wchar.c);
 
 	if ($solution->{options}->{openssl})
 	{
+		push(@pgcommonallfiles, 'cipher_openssl.c');
 		push(@pgcommonallfiles, 'cryptohash_openssl.c');
 		push(@pgcommonallfiles, 'hmac_openssl.c');
 		push(@pgcommonallfiles, 'protocol_openssl.c');
 	}
 	else
 	{
+		push(@pgcommonallfiles, 'cipher.c');
 		push(@pgcommonallfiles, 'cryptohash.c');
 		push(@pgcommonallfiles, 'hmac.c');
 		push(@pgcommonallfiles, 'md5.c');