
Commit c62de54

Record largest seqno in table properties and verify in file ingestion (facebook#12951)

Summary:
This helps avoid scanning input files when ingesting DB-generated files: https://github.com/facebook/rocksdb/blob/ecb844babda9e669ff00a5d1ee51eda7430c9bc0/db/external_sst_file_ingestion_job.cc#L917-L935
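
A minimal sketch of the ingestion call this speeds up, assuming an open rocksdb::DB* db and the usual includes; the file path is illustrative and the option name and error text follow the diffs below:

    // Sketch: ingest SST files generated by another DB. With this change,
    // ingestion can trust the "rocksdb.key.largest.seqno" table property
    // instead of scanning every key to confirm its sequence number is zero.
    rocksdb::IngestExternalFileOptions ingest_opts;
    ingest_opts.allow_db_generated_files = true;  // experimental option used in the test plan
    rocksdb::Status s =
        db->IngestExternalFile({"/backup/000012.sst"}, ingest_opts);  // illustrative path
    if (!s.ok()) {
      // e.g. Corruption: "External file has non zero largest sequence number ..."
      std::cerr << s.ToString() << std::endl;
    }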

Pull Request resolved: facebook#12951

Test Plan:
* `IngestDBGeneratedFileTest.FailureCase` is updated to check that this table property is verified during ingestion.
* Existing unit tests cover other ingestion use cases.

Reviewed By: jowlyzhang

Differential Revision: D61608285

Pulled By: cbi42

fbshipit-source-id: b5b7aae9741531349ab247be6ffaa3f3628b76ca
cbi42 authored and facebook-github-bot committed Aug 21, 2024
1 parent 4df71db commit c62de54
Showing 9 changed files with 44 additions and 9 deletions.
1 change: 1 addition & 0 deletions db/event_helpers.cc
@@ -124,6 +124,7 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished(
             << "comparator" << table_properties.comparator_name
             << "user_defined_timestamps_persisted"
             << table_properties.user_defined_timestamps_persisted
+            << "key_largest_seqno" << table_properties.key_largest_seqno
             << "merge_operator" << table_properties.merge_operator_name
             << "prefix_extractor_name"
             << table_properties.prefix_extractor_name << "property_collectors"
15 changes: 12 additions & 3 deletions db/external_sst_file_ingestion_job.cc
@@ -914,9 +914,18 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
   } else if (!iter->status().ok()) {
     return iter->status();
   }
-  if (ingestion_options_.allow_db_generated_files) {
-    // Verify that all keys have seqno zero.
-    // TODO: store largest seqno in table property and validate it instead.
+  SequenceNumber largest_seqno =
+      table_reader.get()->GetTableProperties()->key_largest_seqno;
+  // UINT64_MAX means unknown and the file is generated before table property
+  // `key_largest_seqno` is introduced.
+  if (largest_seqno != UINT64_MAX && largest_seqno > 0) {
+    return Status::Corruption(
+        "External file has non zero largest sequence number " +
+        std::to_string(largest_seqno));
+  }
+  if (ingestion_options_.allow_db_generated_files &&
+      largest_seqno == UINT64_MAX) {
+    // Need to verify that all keys have seqno zero.
     for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
       Status pik_status =
           ParseInternalKey(iter->key(), &key, allow_data_in_errors);
8 changes: 4 additions & 4 deletions db/external_sst_file_test.cc
@@ -3776,6 +3776,7 @@ TEST_P(IngestDBGeneratedFileTest, FailureCase) {
                             live_meta[0].relative_filename);
   // Ingesting a file whose boundary key has non-zero seqno.
   Status s = db_->IngestExternalFile(to_ingest_files, ingest_opts);
+  // This error msg is from checking seqno of boundary keys.
   ASSERT_TRUE(
       s.ToString().find("External file has non zero sequence number") !=
       std::string::npos);
@@ -3822,10 +3823,9 @@ TEST_P(IngestDBGeneratedFileTest, FailureCase) {
       live_meta[0].directory + "/" + live_meta[0].relative_filename;
   s = db_->IngestExternalFile(to_ingest_files, ingest_opts);
   ASSERT_NOK(s);
-  ASSERT_TRUE(
-      s.ToString().find(
-          "External file has a key with non zero sequence number") !=
-      std::string::npos);
+  // This error msg is from checking largest seqno in table property.
+  ASSERT_TRUE(s.ToString().find("non zero largest sequence number") !=
+              std::string::npos);
   db_->ReleaseSnapshot(snapshot);
 }
 
8 changes: 8 additions & 0 deletions include/rocksdb/table_properties.h
@@ -74,6 +74,7 @@ struct TablePropertiesNames {
   static const std::string kSequenceNumberTimeMapping;
   static const std::string kTailStartOffset;
   static const std::string kUserDefinedTimestampsPersisted;
+  static const std::string kKeyLargestSeqno;
 };
 
 // `TablePropertiesCollector` provides the mechanism for users to collect
@@ -134,6 +135,7 @@ class TablePropertiesCollector {
 
   // Return the human-readable properties, where the key is property name and
   // the value is the human-readable form of value.
+  // Returned properties are used for logging.
   // It will only be called after Finish() has been called by RocksDB internal.
   virtual UserCollectedProperties GetReadableProperties() const = 0;
 
@@ -292,6 +294,12 @@ struct TableProperties {
   // it's explicitly written to meta properties block.
   uint64_t user_defined_timestamps_persisted = 1;
 
+  // The largest sequence number of keys in this file.
+  // UINT64_MAX means unknown.
+  // Only written to properties block if known (should be known unless the
+  // table is empty).
+  uint64_t key_largest_seqno = UINT64_MAX;
+
   // DB identity
   // db_id is an identifier generated the first time the DB is created
   // If DB identity is unset or unassigned, `db_id` will be an empty string.
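
For orientation (not part of this diff), a minimal sketch of reading the new field back through the existing table-properties API; it assumes an open rocksdb::DB* db and the usual includes:

    // Sketch: inspect the largest key seqno recorded for each live SST file.
    rocksdb::TablePropertiesCollection all_props;
    rocksdb::Status s = db->GetPropertiesOfAllTables(&all_props);
    if (s.ok()) {
      for (const auto& [file_name, props] : all_props) {
        if (props->key_largest_seqno != UINT64_MAX) {  // UINT64_MAX == unknown
          std::cout << file_name << " largest seqno: "
                    << props->key_largest_seqno << "\n";
        }
      }
    }

Files written before this change simply leave the field at its UINT64_MAX default.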
9 changes: 8 additions & 1 deletion table/block_based/block_based_table_builder.cc
@@ -627,6 +627,9 @@ struct BlockBasedTableBuilder::Rep {
     if (!ReifyDbHostIdProperty(ioptions.env, &props.db_host_id).ok()) {
       ROCKS_LOG_INFO(ioptions.logger, "db_host_id property will not be set");
     }
+    // Default is UINT64_MAX for unknown. Setting it to 0 here
+    // to allow updating it by taking max in BlockBasedTableBuilder::Add().
+    props.key_largest_seqno = 0;
 
     if (FormatVersionUsesContextChecksum(table_options.format_version)) {
       // Must be non-zero and semi- or quasi-random
@@ -1014,7 +1017,10 @@ void BlockBasedTableBuilder::Add(const Slice& ikey, const Slice& value) {
   if (!ok()) {
     return;
   }
-  ValueType value_type = ExtractValueType(ikey);
+  ValueType value_type;
+  SequenceNumber seq;
+  UnPackSequenceAndType(ExtractInternalKeyFooter(ikey), &seq, &value_type);
+  r->props.key_largest_seqno = std::max(r->props.key_largest_seqno, seq);
   if (IsValueType(value_type)) {
 #ifndef NDEBUG
     if (r->props.num_entries > r->props.num_range_deletions) {
@@ -1781,6 +1787,7 @@ void BlockBasedTableBuilder::WritePropertiesBlock(
     rep_->props.user_defined_timestamps_persisted =
         rep_->persist_user_defined_timestamps;
 
+    assert(IsEmpty() || rep_->props.key_largest_seqno != UINT64_MAX);
     // Add basic properties
     property_block_builder.AddTableProperty(rep_->props);
 
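
For readers unfamiliar with the internal key layout that Add() decodes above, a rough sketch of what UnPackSequenceAndType(ExtractInternalKeyFooter(ikey), ...) unpacks, following the convention in db/dbformat.h (not part of this diff):

    // Sketch: the last 8 bytes of an internal key pack the sequence number
    // (high 56 bits) and the value type tag (low 8 bits).
    uint64_t packed = ExtractInternalKeyFooter(ikey);  // fixed64 footer of ikey
    SequenceNumber seq = packed >> 8;                   // the value Add() takes the max of
    ValueType value_type =
        static_cast<ValueType>(packed & 0xff);          // kTypeValue, kTypeDeletion, ...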
5 changes: 5 additions & 0 deletions table/meta_blocks.cc
@@ -163,6 +163,9 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) {
     Add(TablePropertiesNames::kSequenceNumberTimeMapping,
         props.seqno_to_time_mapping);
   }
+  if (props.key_largest_seqno != UINT64_MAX) {
+    Add(TablePropertiesNames::kKeyLargestSeqno, props.key_largest_seqno);
+  }
 }
 
 Slice PropertyBlockBuilder::Finish() {
@@ -336,6 +339,8 @@ Status ReadTablePropertiesHelper(
        &new_table_properties->tail_start_offset},
       {TablePropertiesNames::kUserDefinedTimestampsPersisted,
        &new_table_properties->user_defined_timestamps_persisted},
+      {TablePropertiesNames::kKeyLargestSeqno,
+       &new_table_properties->key_largest_seqno},
   };
 
   std::string last_key;
4 changes: 4 additions & 0 deletions table/table_properties.cc
@@ -113,6 +113,8 @@ std::string TableProperties::ToString(const std::string& prop_delim,
                  user_defined_timestamps_persisted ? std::string("true")
                                                    : std::string("false"),
                  prop_delim, kv_delim);
+  AppendProperty(result, "largest sequence number in file", key_largest_seqno,
+                 prop_delim, kv_delim);
 
   AppendProperty(
       result, "merge operator name",
@@ -311,6 +313,8 @@ const std::string TablePropertiesNames::kTailStartOffset =
     "rocksdb.tail.start.offset";
 const std::string TablePropertiesNames::kUserDefinedTimestampsPersisted =
     "rocksdb.user.defined.timestamps.persisted";
+const std::string TablePropertiesNames::kKeyLargestSeqno =
+    "rocksdb.key.largest.seqno";
 
 #ifndef NDEBUG
 // WARNING: TEST_SetRandomTableProperties assumes the following layout of
2 changes: 1 addition & 1 deletion table/table_test.cc
@@ -4726,7 +4726,7 @@ static void DoCompressionTest(CompressionType comp) {
   ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0));
   ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3550));
   ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3550));
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 7075));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 7100));
   c.ResetTableReader();
 }
 
1 change: 1 addition & 0 deletions unreleased_history/new_features/tp_largest_seqno.md
@@ -0,0 +1 @@
+* Add a new table property "rocksdb.key.largest.seqno" which records the largest sequence number of all keys in the file. It is verified to be zero during SST file ingestion.
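
As a usage note (not part of this diff), a caller could pre-check a candidate file with SstFileReader before ingestion; the path, options, and error handling are illustrative:

    // Sketch: confirm a candidate external file records largest seqno == 0.
    rocksdb::Options options;
    rocksdb::SstFileReader reader(options);
    rocksdb::Status s = reader.Open("/backup/000012.sst");  // illustrative path
    if (s.ok()) {
      auto props = reader.GetTableProperties();
      if (props->key_largest_seqno == 0) {
        // Eligible: ingestion can skip the full key scan.
      } else if (props->key_largest_seqno == UINT64_MAX) {
        // Property missing (older file): ingestion falls back to scanning keys.
      }
    }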
