Skip to content

Commit

Permalink
Fix slow partial block writes caused by addition of the zero_cache (#185
Browse files Browse the repository at this point in the history
).

Before the zero_cache was added, the block_cache efficiently absorbed partial
block writes. When the zero_cache was added, it was given the standard dumb
read/patch/write algorithm for partial block writes. This resulted in heavy
write amplification when large s3backer block sizes were being used.

Restore the partial read/write hooks but make them optional. Reimplement them
in the zero_cache and block_cache layers only (reverting part of d1bce95),
and fix the zero_cache implementation so it forwards partial read and write
requests to the block_cache (after first checking them) instead of trying to
handle them itself, less efficiently.
  • Loading branch information
archiecobbs committed May 23, 2022
1 parent 3e83d48 commit d949846
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 0 deletions.
34 changes: 34 additions & 0 deletions block_cache.c
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,8 @@ static int block_cache_read_block(struct s3backer_store *s3b, s3b_block_t block_
u_char *actual_etag, const u_char *expect_etag, int strict);
static int block_cache_write_block(struct s3backer_store *s3b, s3b_block_t block_num, const void *src, u_char *etag,
check_cancel_t *check_cancel, void *check_cancel_arg);
static int block_cache_read_block_part(struct s3backer_store *s3b, s3b_block_t block_num, u_int off, u_int len, void *dest);
static int block_cache_write_block_part(struct s3backer_store *s3b, s3b_block_t block_num, u_int off, u_int len, const void *src);
static int block_cache_survey_non_zero(struct s3backer_store *s3b, block_list_func_t *callback, void *arg);
static int block_cache_shutdown(struct s3backer_store *s3b);
static void block_cache_destroy(struct s3backer_store *s3b);
Expand Down Expand Up @@ -253,6 +255,8 @@ block_cache_create(struct block_cache_conf *config, struct s3backer_store *inner
s3b->set_mount_token = block_cache_set_mount_token;
s3b->read_block = block_cache_read_block;
s3b->write_block = block_cache_write_block;
s3b->read_block_part = block_cache_read_block_part;
s3b->write_block_part = block_cache_write_block_part;
s3b->bulk_zero = generic_bulk_zero;
s3b->survey_non_zero = block_cache_survey_non_zero;
s3b->shutdown = block_cache_shutdown;
Expand Down Expand Up @@ -607,6 +611,21 @@ block_cache_read_block(struct s3backer_store *const s3b, s3b_block_t block_num,
return block_cache_read(priv, block_num, 0, config->block_size, dest);
}

static int
block_cache_read_block_part(struct s3backer_store *s3b, s3b_block_t block_num, u_int off, u_int len, void *dest)
{
struct block_cache_private *const priv = s3b->data;
struct block_cache_conf *const config = priv->config;

// Sanity check
assert(len > 0);
assert(len < config->block_size);
assert(off + len <= config->block_size);

// Read data
return block_cache_read(priv, block_num, off, len, dest);
}

/*
* Read a block, and trigger read-ahead if necessary.
*/
Expand Down Expand Up @@ -845,6 +864,21 @@ block_cache_write_block(struct s3backer_store *const s3b, s3b_block_t block_num,
return block_cache_write(priv, block_num, 0, config->block_size, src);
}

static int
block_cache_write_block_part(struct s3backer_store *s3b, s3b_block_t block_num, u_int off, u_int len, const void *src)
{
struct block_cache_private *const priv = s3b->data;
struct block_cache_conf *const config = priv->config;

// Sanity check
assert(len > 0);
assert(len < config->block_size);
assert(off + len <= config->block_size);

// Write data
return block_cache_write(priv, block_num, off, len, src);
}

/*
* Write a block or a portion thereof.
*/
Expand Down
8 changes: 8 additions & 0 deletions block_part.c
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,10 @@ block_part_read_block_part(struct s3backer_store *s3b, struct block_part *const
assert(edge->length < priv->block_size);
assert(edge->offset + edge->length <= priv->block_size);

// Does the next layer down support partial block reads natively?
if (s3b->read_block_part != NULL)
return (*s3b->read_block_part)(s3b, edge->block, edge->offset, edge->length, edge->data);

// Allocate buffer
if ((buf = malloc(priv->block_size)) == NULL)
return errno;
Expand Down Expand Up @@ -181,6 +185,10 @@ block_part_write_block_part(struct s3backer_store *s3b, struct block_part *const
assert(edge->length < priv->block_size);
assert(edge->offset + edge->length <= priv->block_size);

// Does the next layer down support partial block writes natively?
if (s3b->write_block_part != NULL)
return (*s3b->write_block_part)(s3b, edge->block, edge->offset, edge->length, edge->data);

// Allocate buffer
if ((buf = malloc(priv->block_size)) == NULL)
return errno;
Expand Down
20 changes: 20 additions & 0 deletions s3backer.h
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,16 @@ struct s3backer_store {
int (*read_block)(struct s3backer_store *s3b, s3b_block_t block_num, void *dest,
u_char *actual_etag, const u_char *expect_etag, int strict);

/*
* Read part of one block.
*
* This is an optional function; if not supported, this hook may be null.
*
* Returns zero on success or a (positive) errno value on error.
* May return ENOTCONN if create_threads() has not yet been invoked.
*/
int (*read_block_part)(struct s3backer_store *s3b, s3b_block_t block_num, u_int off, u_int len, void *dest);

/*
* Write one block.
*
Expand All @@ -225,6 +235,16 @@ struct s3backer_store {
int (*write_block)(struct s3backer_store *s3b, s3b_block_t block_num, const void *src, u_char *etag,
check_cancel_t *check_cancel, void *arg);

/*
* Write part of one block.
*
* This is an optional function; if not supported, this hook may be null.
*
* Returns zero on success or a (positive) errno value on error.
* May return ENOTCONN if create_threads() has not yet been invoked.
*/
int (*write_block_part)(struct s3backer_store *s3b, s3b_block_t block_num, u_int off, u_int len, const void *src);

/*
* Bulk block zeroing (i.e., deletion).
*
Expand Down
62 changes: 62 additions & 0 deletions zero_cache.c
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ static int zero_cache_read_block(struct s3backer_store *s3b, s3b_block_t block_n
u_char *actual_etag, const u_char *expect_etag, int strict);
static int zero_cache_write_block(struct s3backer_store *s3b, s3b_block_t block_num, const void *src, u_char *etag,
check_cancel_t *check_cancel, void *check_cancel_arg);
static int zero_cache_read_block_part(struct s3backer_store *s3b, s3b_block_t block_num, u_int off, u_int len, void *dest);
static int zero_cache_write_block_part(struct s3backer_store *s3b, s3b_block_t block_num, u_int off, u_int len, const void *src);
static int zero_cache_bulk_zero(struct s3backer_store *const s3b, const s3b_block_t *block_nums, u_int num_blocks);
static int zero_cache_survey_non_zero(struct s3backer_store *s3b, block_list_func_t *callback, void *arg);
static int zero_cache_shutdown(struct s3backer_store *s3b);
Expand Down Expand Up @@ -129,6 +131,10 @@ zero_cache_create(struct zero_cache_conf *config, struct s3backer_store *inner)
s3b->set_mount_token = zero_cache_set_mount_token;
s3b->read_block = zero_cache_read_block;
s3b->write_block = zero_cache_write_block;
if (inner->read_block_part != NULL)
s3b->read_block_part = zero_cache_read_block_part;
if (inner->write_block_part != NULL)
s3b->write_block_part = zero_cache_write_block_part;
s3b->bulk_zero = zero_cache_bulk_zero;
s3b->survey_non_zero = zero_cache_survey_non_zero;
s3b->shutdown = zero_cache_shutdown;
Expand Down Expand Up @@ -417,6 +423,62 @@ zero_cache_write_block(struct s3backer_store *const s3b, s3b_block_t block_num,
return r;
}

static int
zero_cache_read_block_part(struct s3backer_store *s3b, s3b_block_t block_num, u_int off, u_int len, void *dest)
{
struct zero_cache_private *const priv = s3b->data;
struct zero_cache_conf *const config = priv->config;

// Sanity check
assert(len > 0);
assert(len < config->block_size);
assert(off + len <= config->block_size);

// Check for the case where we are reading a block that is already known to be zero
pthread_mutex_lock(&priv->mutex);
if (bitmap_test(priv->zeros, block_num)) {
priv->stats.read_hits++;
CHECK_RETURN(pthread_mutex_unlock(&priv->mutex));
memset(dest, 0, len);
return 0;
}
CHECK_RETURN(pthread_mutex_unlock(&priv->mutex));

// Perform the partial read
return (*priv->inner->read_block_part)(priv->inner, block_num, off, len, dest);
}

static int
zero_cache_write_block_part(struct s3backer_store *s3b, s3b_block_t block_num, u_int off, u_int len, const void *src)
{
struct zero_cache_private *const priv = s3b->data;
struct zero_cache_conf *const config = priv->config;
int data_is_zeros;

// Sanity check
assert(len > 0);
assert(len < config->block_size);
assert(off + len <= config->block_size);

// Check whether data is all zeros
data_is_zeros = memcmp(src, zero_block, len) == 0;

// Handle the case where we know this block is zero
pthread_mutex_lock(&priv->mutex);
if (bitmap_test(priv->zeros, block_num)) {
if (data_is_zeros) { // ok, it's still zero -> return immediately
priv->stats.write_hits++;
CHECK_RETURN(pthread_mutex_unlock(&priv->mutex));
return 0;
}
zero_cache_update_block(priv, block_num, 0); // be conservative and say we are no longer sure
}
CHECK_RETURN(pthread_mutex_unlock(&priv->mutex));

// Perform the partial write
return (*priv->inner->write_block_part)(priv->inner, block_num, off, len, src);
}

static int
zero_cache_bulk_zero(struct s3backer_store *const s3b, const s3b_block_t *block_nums, u_int num_blocks)
{
Expand Down

0 comments on commit d949846

Please sign in to comment.