Skip to content

Refactor WAL SMGER FE/BE split and let frontend code also write encrypted WAL #479

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jul 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
429 changes: 215 additions & 214 deletions contrib/pg_tde/src/access/pg_tde_tdemap.c

Large diffs are not rendered by default.

151 changes: 94 additions & 57 deletions contrib/pg_tde/src/access/pg_tde_xlog_smgr.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@
#include "encryption/enc_tde.h"
#include "pg_tde.h"
#include "pg_tde_defines.h"
#include "pg_tde_guc.h"

#ifdef FRONTEND
#include "pg_tde_fe.h"
#else
#include "pg_tde_guc.h"
#include "port/atomics.h"
#endif

Expand All @@ -32,15 +32,21 @@ static ssize_t tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offs
TimeLineID tli, XLogSegNo segno, int segSize);
static ssize_t tdeheap_xlog_seg_write(int fd, const void *buf, size_t count,
off_t offset, TimeLineID tli,
XLogSegNo segno);
XLogSegNo segno, int segSize);

static const XLogSmgr tde_xlog_smgr = {
.seg_read = tdeheap_xlog_seg_read,
.seg_write = tdeheap_xlog_seg_write,
};

#ifndef FRONTEND
static Size TDEXLogEncryptBuffSize(void);
static void *EncryptionCryptCtx = NULL;

/* TODO: can be swapped out to the disk */
static InternalKey EncryptionKey =
{
.type = MAP_ENTRY_EMPTY,
.start_lsn = InvalidXLogRecPtr,
};

/*
* Must be the same as in replication/walsender.c
Expand All @@ -49,26 +55,37 @@ static Size TDEXLogEncryptBuffSize(void);
*/
#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16)

static ssize_t TDEXLogWriteEncryptedPages(int fd, const void *buf, size_t count,
off_t offset, TimeLineID tli,
XLogSegNo segno);
/*
* Since the backend code needs to use atomics and shared memory while the
* frotnend code cannot do that we provide two separate implementations of some
* data structures and the functions which operate one them.
*/

#ifndef FRONTEND

typedef struct EncryptionStateData
{
char *segBuf;
char db_map_path[MAXPGPATH];
pg_atomic_uint64 enc_key_lsn; /* to sync with readers */
} EncryptionStateData;

static EncryptionStateData *EncryptionState = NULL;

/* TODO: can be swapped out to the disk */
static InternalKey EncryptionKey =
static char *EncryptionBuf;

static XLogRecPtr
TDEXLogGetEncKeyLsn()
{
.type = MAP_ENTRY_EMPTY,
.start_lsn = InvalidXLogRecPtr,
};
static void *EncryptionCryptCtx = NULL;
return (XLogRecPtr) pg_atomic_read_u64(&EncryptionState->enc_key_lsn);
}

static void
TDEXLogSetEncKeyLsn(XLogRecPtr start_lsn)
{
pg_atomic_write_u64(&EncryptionState->enc_key_lsn, start_lsn);
}

static Size TDEXLogEncryptBuffSize(void);

static int XLOGChooseNumBuffers(void);

Expand Down Expand Up @@ -126,7 +143,6 @@ void
TDEXLogShmemInit(void)
{
bool foundBuf;
char *allocptr;

EncryptionState = (EncryptionStateData *)
ShmemInitStruct("TDE XLog Encryption State",
Expand All @@ -137,59 +153,61 @@ TDEXLogShmemInit(void)

if (EncryptXLog)
{
allocptr = ((char *) EncryptionState) + sizeof(EncryptionStateData);
allocptr = (char *) TYPEALIGN(PG_IO_ALIGN_SIZE, allocptr);
EncryptionState->segBuf = allocptr;
EncryptionBuf = (char *) TYPEALIGN(PG_IO_ALIGN_SIZE, ((char *) EncryptionState) + sizeof(EncryptionStateData));

Assert((char *) EncryptionState + TDEXLogEncryptStateSize() >= (char *) EncryptionState->segBuf + TDEXLogEncryptBuffSize());
Assert((char *) EncryptionState + TDEXLogEncryptStateSize() >= (char *) EncryptionBuf + TDEXLogEncryptBuffSize());
}

pg_atomic_init_u64(&EncryptionState->enc_key_lsn, 0);

elog(DEBUG1, "pg_tde: initialized encryption buffer %lu bytes", TDEXLogEncryptStateSize());
}

/*
* Encrypt XLog page(s) from the buf and write to the segment file.
*/
static ssize_t
TDEXLogWriteEncryptedPages(int fd, const void *buf, size_t count, off_t offset,
TimeLineID tli, XLogSegNo segno)
#else /* !FRONTEND */

typedef struct EncryptionStateData
{
char iv_prefix[16];
InternalKey *key = &EncryptionKey;
char *enc_buff = EncryptionState->segBuf;
char db_map_path[MAXPGPATH];
XLogRecPtr enc_key_lsn; /* to sync with reader */
} EncryptionStateData;

Assert(count <= TDEXLogEncryptBuffSize());
static EncryptionStateData EncryptionStateD = {0};

#ifdef TDE_XLOG_DEBUG
elog(DEBUG1, "write encrypted WAL, size: %lu, offset: %ld [%lX], seg: %X/%X, key_start_lsn: %X/%X",
count, offset, offset, LSN_FORMAT_ARGS(segno), LSN_FORMAT_ARGS(key->start_lsn));
#endif
static EncryptionStateData *EncryptionState = &EncryptionStateD;

CalcXLogPageIVPrefix(tli, segno, key->base_iv, iv_prefix);
pg_tde_stream_crypt(iv_prefix, offset,
(char *) buf, count,
enc_buff, key, &EncryptionCryptCtx);
static char EncryptionBuf[MAX_SEND_SIZE];

return pg_pwrite(fd, enc_buff, count, offset);
static XLogRecPtr
TDEXLogGetEncKeyLsn()
{
return (XLogRecPtr) EncryptionState->enc_key_lsn;
}

static void
TDEXLogSetEncKeyLsn(XLogRecPtr start_lsn)
{
EncryptionState->enc_key_lsn = EncryptionKey.start_lsn;
}

#endif /* !FRONTEND */
#endif /* FRONTEND */

void
TDEXLogSmgrInit(void)
TDEXLogSmgrInit()
{
SetXLogSmgr(&tde_xlog_smgr);
}

void
TDEXLogSmgrInitWrite(bool encrypt_xlog)
Comment on lines +200 to +201
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In my pg_resetwal PR I used last WAL key state to check if I need to write encrypted block or not. But with this init function it looks like that better solution will be to read GUC parameter from server, because I guess we don't want to read keys explicitly in resetwal code.

What do you think?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question. I am not sure which is the best but I do not think either prevents this from being merged.

{
#ifndef FRONTEND
/* TODO: move to the separate func, it's not an SMGR init */
InternalKey *key = pg_tde_read_last_wal_key();

/*
* Always generate a new key on starting PostgreSQL to protect against
* attacks on CTR ciphers based on comparing the WAL generated by two
* divergent copies of the same cluster.
*/
if (EncryptXLog)
if (encrypt_xlog)
{
pg_tde_create_wal_key(&EncryptionKey, &GLOBAL_SPACE_RLOCATOR(XLOG_TDE_OID),
TDE_KEY_TYPE_WAL_ENCRYPTED);
Expand All @@ -202,45 +220,66 @@ TDEXLogSmgrInit(void)
else if (key)
{
EncryptionKey = *key;
pg_atomic_write_u64(&EncryptionState->enc_key_lsn, EncryptionKey.start_lsn);
TDEXLogSetEncKeyLsn(EncryptionKey.start_lsn);
}

if (key)
pfree(key);

pg_tde_set_db_file_path(GLOBAL_SPACE_RLOCATOR(XLOG_TDE_OID).dbOid, EncryptionState->db_map_path);
}

/*
* Encrypt XLog page(s) from the buf and write to the segment file.
*/
static ssize_t
TDEXLogWriteEncryptedPages(int fd, const void *buf, size_t count, off_t offset,
TimeLineID tli, XLogSegNo segno)
{
char iv_prefix[16];
InternalKey *key = &EncryptionKey;
char *enc_buff = EncryptionBuf;

#ifndef FRONTEND
Assert(count <= TDEXLogEncryptBuffSize());
#endif
SetXLogSmgr(&tde_xlog_smgr);

#ifdef TDE_XLOG_DEBUG
elog(DEBUG1, "write encrypted WAL, size: %lu, offset: %ld [%lX], seg: %X/%X, key_start_lsn: %X/%X",
count, offset, offset, LSN_FORMAT_ARGS(segno), LSN_FORMAT_ARGS(key->start_lsn));
#endif

CalcXLogPageIVPrefix(tli, segno, key->base_iv, iv_prefix);
pg_tde_stream_crypt(iv_prefix, offset,
(char *) buf, count,
enc_buff, key, &EncryptionCryptCtx);

return pg_pwrite(fd, enc_buff, count, offset);
}

static ssize_t
tdeheap_xlog_seg_write(int fd, const void *buf, size_t count, off_t offset,
TimeLineID tli, XLogSegNo segno)
TimeLineID tli, XLogSegNo segno, int segSize)
{
#ifndef FRONTEND

/*
* Set the last (most recent) key's start LSN if not set.
*
* This func called with WALWriteLock held, so no need in any extra sync.
*/
if (EncryptionKey.type != MAP_ENTRY_EMPTY &&
pg_atomic_read_u64(&EncryptionState->enc_key_lsn) == 0)
if (EncryptionKey.type != MAP_ENTRY_EMPTY && TDEXLogGetEncKeyLsn() == 0)
{
XLogRecPtr lsn;

XLogSegNoOffsetToRecPtr(segno, offset, wal_segment_size, lsn);
XLogSegNoOffsetToRecPtr(segno, offset, segSize, lsn);

pg_tde_wal_last_key_set_lsn(lsn, EncryptionState->db_map_path);
EncryptionKey.start_lsn = lsn;
pg_atomic_write_u64(&EncryptionState->enc_key_lsn, lsn);
TDEXLogSetEncKeyLsn(lsn);
}

if (EncryptXLog)
if (EncryptionKey.type == TDE_KEY_TYPE_WAL_ENCRYPTED)
return TDEXLogWriteEncryptedPages(fd, buf, count, offset, tli, segno);
else
#endif
return pg_pwrite(fd, buf, count, offset);
}

Expand Down Expand Up @@ -273,8 +312,7 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
keys = pg_tde_fetch_wal_keys(InvalidXLogRecPtr);
}

#ifndef FRONTEND
write_key_lsn = pg_atomic_read_u64(&EncryptionState->enc_key_lsn);
write_key_lsn = TDEXLogGetEncKeyLsn();

if (!XLogRecPtrIsInvalid(write_key_lsn))
{
Expand All @@ -291,7 +329,6 @@ tdeheap_xlog_seg_read(int fd, void *buf, size_t count, off_t offset,
keys = pg_tde_get_wal_cache_keys();
}
}
#endif

XLogSegNoOffsetToRecPtr(segno, offset, segSize, data_start);
XLogSegNoOffsetToRecPtr(segno, offset + readsz, segSize, data_end);
Expand Down
1 change: 1 addition & 0 deletions contrib/pg_tde/src/include/access/pg_tde_xlog_smgr.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,6 @@
extern Size TDEXLogEncryptStateSize(void);
extern void TDEXLogShmemInit(void);
extern void TDEXLogSmgrInit(void);
extern void TDEXLogSmgrInitWrite(bool encrypt_xlog);

#endif /* PG_TDE_XLOGSMGR_H */
2 changes: 2 additions & 0 deletions contrib/pg_tde/src/include/pg_tde_fe.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@

static int tde_fe_error_level = 0;

#define data_sync_elevel(elevel) (elevel)

/*
* -------------
*/
Expand Down
1 change: 1 addition & 0 deletions contrib/pg_tde/src/pg_tde.c
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ tde_shmem_startup(void)
PrincipalKeyShmemInit();
TDEXLogShmemInit();
TDEXLogSmgrInit();
TDEXLogSmgrInitWrite(EncryptXLog);
}

void
Expand Down
4 changes: 2 additions & 2 deletions src/backend/access/transam/xlog.c
Original file line number Diff line number Diff line change
Expand Up @@ -2446,7 +2446,7 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
INSTR_TIME_SET_ZERO(start);

pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
written = xlog_smgr->seg_write(openLogFile, from, nleft, startoffset, tli, openLogSegNo);
written = xlog_smgr->seg_write(openLogFile, from, nleft, startoffset, tli, openLogSegNo, wal_segment_size);
pgstat_report_wait_end();

/*
Expand Down Expand Up @@ -3491,7 +3491,7 @@ XLogFileCopy(TimeLineID destTLI, XLogSegNo destsegno,
}
errno = 0;
pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE);
if ((int) xlog_smgr->seg_write(fd, buffer.data, sizeof(buffer), offset, destTLI, destsegno) != (int) sizeof(buffer))
if ((int) xlog_smgr->seg_write(fd, buffer.data, sizeof(buffer), offset, destTLI, destsegno, wal_segment_size) != (int) sizeof(buffer))
{
int save_errno = errno;

Expand Down
2 changes: 1 addition & 1 deletion src/backend/replication/walreceiver.c
Original file line number Diff line number Diff line change
Expand Up @@ -944,7 +944,7 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr, TimeLineID tli)

byteswritten = xlog_smgr->seg_write(recvFile, buf, segbytes,
(off_t) startoff, recvFileTLI,
recvSegNo);
recvSegNo, wal_segment_size);
if (byteswritten <= 0)
{
char xlogfname[MAXFNAMELEN];
Expand Down
2 changes: 1 addition & 1 deletion src/bin/pg_rewind/pg_rewind.c
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ main(int argc, char **argv)
char tde_path[MAXPGPATH];
snprintf(tde_path, sizeof(tde_path), "%s/%s", datadir_target, PG_TDE_DATA_DIR);
pg_tde_fe_init(tde_path);
TDEXLogSmgrInit();
TDEXLogSmgrInit();
}
#endif
/*
Expand Down
4 changes: 2 additions & 2 deletions src/include/access/xlog_smgr.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ typedef struct XLogSmgr
TimeLineID tli, XLogSegNo segno, int segSize);

ssize_t (*seg_write) (int fd, const void *buf, size_t count, off_t offset,
TimeLineID tli, XLogSegNo segno);
TimeLineID tli, XLogSegNo segno, int segSize);
} XLogSmgr;

static inline ssize_t
default_seg_write(int fd, const void *buf, size_t count, off_t offset,
TimeLineID tli, XLogSegNo segno)
TimeLineID tli, XLogSegNo segno, int segSize)
{
return pg_pwrite(fd, buf, count, offset);
}
Expand Down
2 changes: 1 addition & 1 deletion src/include/pg_config_manual.h
Original file line number Diff line number Diff line change
Expand Up @@ -384,4 +384,4 @@
*/
/* #define TRACE_SYNCSCAN */

#define PERCONA_API_VERSION 1
#define PERCONA_API_VERSION 2
Loading