Skip to content

Commit e98f4ad

Browse files
committed
Plumb IOActivity down through ReadOptions
1 parent 646a708 commit e98f4ad

31 files changed

+150
-33
lines changed

CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -895,6 +895,7 @@ set(SOURCES
895895
util/stderr_logger.cc
896896
util/string_util.cc
897897
util/thread_local.cc
898+
util/thread_io_activity.cc
898899
util/threadpool_imp.cc
899900
util/xxhash.cc
900901
utilities/agg_merge/agg_merge.cc

HISTORY.md

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
### New Features
1111
* Add experimental `PerfContext` counters `iter_{next|prev|seek}_count` for db iterator, each counting the times of corresponding API being called.
1212
* Allow runtime changes to whether `WriteBufferManager` allows stall or not by calling `SetAllowStall()`
13+
* New statistics `rocksdb.sst.read.{flush|compaction}.micros` that measure the read time of block-based SST tables during flush or compaction, in addition to the existing aggregate statistic `rocksdb.sst.read.micros`.
1314

1415
## 8.1.0 (03/18/2023)
1516
### Behavior changes

TARGETS

+2
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
262262
"util/status.cc",
263263
"util/stderr_logger.cc",
264264
"util/string_util.cc",
265+
"util/thread_io_activity.cc",
265266
"util/thread_local.cc",
266267
"util/threadpool_imp.cc",
267268
"util/xxhash.cc",
@@ -608,6 +609,7 @@ cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[
608609
"util/status.cc",
609610
"util/stderr_logger.cc",
610611
"util/string_util.cc",
612+
"util/thread_io_activity.cc",
611613
"util/thread_local.cc",
612614
"util/threadpool_imp.cc",
613615
"util/xxhash.cc",

db/builder.cc

+1
Original file line numberDiff line numberDiff line change
@@ -370,6 +370,7 @@ Status BuildTable(
370370
// No matter whether use_direct_io_for_flush_and_compaction is true,
371371
// the goal is to cache it here for further user reads.
372372
ReadOptions read_options;
373+
read_options.io_activity = GetThreadIOActivity();
373374
std::unique_ptr<InternalIterator> it(table_cache->NewIterator(
374375
read_options, file_options, tboptions.internal_comparator, *meta,
375376
nullptr /* range_del_agg */, mutable_cf_options.prefix_extractor,

db/compaction/compaction_job.cc

+12-3
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
#include "table/unique_id_impl.h"
5757
#include "test_util/sync_point.h"
5858
#include "util/stop_watch.h"
59+
#include "util/thread_io_activity.h"
5960

6061
namespace ROCKSDB_NAMESPACE {
6162

@@ -505,8 +506,10 @@ void CompactionJob::GenSubcompactionBoundaries() {
505506
for (size_t i = 0; i < num_files; i++) {
506507
FileMetaData* f = flevel->files[i].file_metadata;
507508
std::vector<TableReader::Anchor> my_anchors;
508-
Status s = cfd->table_cache()->ApproximateKeyAnchors(
509-
ReadOptions(), icomp, *f, my_anchors);
509+
ReadOptions ro;
510+
ro.io_activity = Env::IOActivity::kCompaction;
511+
Status s = cfd->table_cache()->ApproximateKeyAnchors(ro, icomp, *f,
512+
my_anchors);
510513
if (!s.ok() || my_anchors.empty()) {
511514
my_anchors.emplace_back(f->largest.user_key(), f->fd.GetFileSize());
512515
}
@@ -617,6 +620,7 @@ Status CompactionJob::Run() {
617620
AutoThreadOperationStageUpdater stage_updater(
618621
ThreadStatus::STAGE_COMPACTION_RUN);
619622
TEST_SYNC_POINT("CompactionJob::Run():Start");
623+
ThreadIOActivityGuard thread_io_activity_guard(Env::IOActivity::kCompaction);
620624
log_buffer_->FlushBufferToLog();
621625
LogCompaction();
622626

@@ -710,6 +714,8 @@ Status CompactionJob::Run() {
710714
compact_->compaction->mutable_cf_options()->prefix_extractor;
711715
std::atomic<size_t> next_file_idx(0);
712716
auto verify_table = [&](Status& output_status) {
717+
ThreadIOActivityGuard verify_table_thread_io_activity_guard(
718+
Env::IOActivity::kCompaction);
713719
while (true) {
714720
size_t file_idx = next_file_idx.fetch_add(1);
715721
if (file_idx >= files_output.size()) {
@@ -723,6 +729,8 @@ Status CompactionJob::Run() {
723729
// verification as user reads since the goal is to cache it here for
724730
// further user reads
725731
ReadOptions read_options;
732+
read_options.io_activity = Env::IOActivity::kCompaction;
733+
726734
InternalIterator* iter = cfd->table_cache()->NewIterator(
727735
read_options, file_options_, cfd->internal_comparator(),
728736
files_output[file_idx]->meta, /*range_del_agg=*/nullptr,
@@ -1032,7 +1040,7 @@ void CompactionJob::NotifyOnSubcompactionCompleted(
10321040
void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
10331041
assert(sub_compact);
10341042
assert(sub_compact->compaction);
1035-
1043+
ThreadIOActivityGuard thread_io_activity_guard(Env::IOActivity::kCompaction);
10361044
if (db_options_.compaction_service) {
10371045
CompactionServiceJobStatus comp_status =
10381046
ProcessKeyValueCompactionWithCompactionService(sub_compact);
@@ -1083,6 +1091,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
10831091
read_options.verify_checksums = true;
10841092
read_options.fill_cache = false;
10851093
read_options.rate_limiter_priority = GetRateLimiterPriority();
1094+
read_options.io_activity = Env::IOActivity::kCompaction;
10861095
// Compaction iterators shouldn't be confined to a single prefix.
10871096
// Compactions use Seek() for
10881097
// (a) concurrent compactions,

db/convenience.cc

+2-3
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,8 @@ Status VerifySstFileChecksum(const Options& options,
5656
std::unique_ptr<RandomAccessFileReader> file_reader(
5757
new RandomAccessFileReader(
5858
std::move(file), file_path, ioptions.clock, nullptr /* io_tracer */,
59-
nullptr /* stats */, 0 /* hist_type */, nullptr /* file_read_hist */,
60-
ioptions.rate_limiter.get()));
59+
nullptr /* stats */, Histograms::SST_READ_MICROS /* hist_type */,
60+
nullptr /* file_read_hist */, ioptions.rate_limiter.get()));
6161
const bool kImmortal = true;
6262
auto reader_options = TableReaderOptions(
6363
ioptions, options.prefix_extractor, env_options, internal_comparator,
@@ -76,4 +76,3 @@ Status VerifySstFileChecksum(const Options& options,
7676
}
7777

7878
} // namespace ROCKSDB_NAMESPACE
79-

db/db_impl/db_impl_compaction_flush.cc

+1
Original file line numberDiff line numberDiff line change
@@ -1080,6 +1080,7 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
10801080

10811081
ReadOptions ro;
10821082
ro.total_order_seek = true;
1083+
ro.io_activity = Env::IOActivity::kCompaction;
10831084
bool overlap;
10841085
for (int level = 0;
10851086
level < current_version->storage_info()->num_non_empty_levels();

db/db_impl/db_impl_open.cc

+3
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "rocksdb/wal_filter.h"
2525
#include "test_util/sync_point.h"
2626
#include "util/rate_limiter.h"
27+
#include "util/thread_io_activity.h"
2728

2829
namespace ROCKSDB_NAMESPACE {
2930
Options SanitizeOptions(const std::string& dbname, const Options& src,
@@ -1557,6 +1558,7 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
15571558
Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
15581559
MemTable* mem, VersionEdit* edit) {
15591560
mutex_.AssertHeld();
1561+
ThreadIOActivityGuard thread_io_activity_guard(Env::IOActivity::kFlush);
15601562
assert(cfd);
15611563
assert(cfd->imm());
15621564
// The immutable memtable list must be empty.
@@ -1574,6 +1576,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
15741576
meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
15751577
ReadOptions ro;
15761578
ro.total_order_seek = true;
1579+
ro.io_activity = Env::IOActivity::kFlush;
15771580
Arena arena;
15781581
Status s;
15791582
TableProperties table_properties;

db/db_impl/db_impl_write.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -1805,7 +1805,7 @@ Status DBImpl::DelayWrite(uint64_t num_bytes, WriteThread& write_thread,
18051805
bool delayed = false;
18061806
{
18071807
StopWatch sw(immutable_db_options_.clock, stats_, WRITE_STALL,
1808-
&time_delayed);
1808+
Histograms::HISTOGRAM_ENUM_MAX, &time_delayed);
18091809
// To avoid parallel timed delays (bad throttling), only support them
18101810
// on the primary write queue.
18111811
uint64_t delay;

db/flush_job.cc

+3
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
#include "util/coding.h"
4747
#include "util/mutexlock.h"
4848
#include "util/stop_watch.h"
49+
#include "util/thread_io_activity.h"
4950

5051
namespace ROCKSDB_NAMESPACE {
5152

@@ -211,6 +212,7 @@ Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta,
211212
bool* switched_to_mempurge) {
212213
TEST_SYNC_POINT("FlushJob::Start");
213214
db_mutex_->AssertHeld();
215+
ThreadIOActivityGuard thread_io_activity_guard(Env::IOActivity::kFlush);
214216
assert(pick_memtable_called);
215217
// Mempurge threshold can be dynamically changed.
216218
// For sake of consistency, mempurge_threshold is
@@ -841,6 +843,7 @@ Status FlushJob::WriteLevel0Table() {
841843
range_del_iters;
842844
ReadOptions ro;
843845
ro.total_order_seek = true;
846+
ro.io_activity = Env::IOActivity::kFlush;
844847
Arena arena;
845848
uint64_t total_num_entries = 0, total_num_deletes = 0;
846849
uint64_t total_data_size = 0;

db/perf_context_test.cc

+2-1
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,8 @@ TEST_F(PerfContextTest, StopWatchOverhead) {
187187
uint64_t elapsed = 0;
188188
std::vector<uint64_t> timings(kTotalIterations);
189189

190-
StopWatch timer(SystemClock::Default().get(), nullptr, 0, &elapsed);
190+
StopWatch timer(SystemClock::Default().get(), nullptr, 0,
191+
Histograms::HISTOGRAM_ENUM_MAX, &elapsed);
191192
for (auto& timing : timings) {
192193
timing = elapsed;
193194
}

db/version_builder.cc

+4-1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
#include "port/port.h"
3434
#include "table/table_reader.h"
3535
#include "util/string_util.h"
36+
#include "util/thread_io_activity.h"
3637

3738
namespace ROCKSDB_NAMESPACE {
3839

@@ -1323,8 +1324,10 @@ class VersionBuilder::Rep {
13231324
auto* file_meta = files_meta[file_idx].first;
13241325
int level = files_meta[file_idx].second;
13251326
TableCache::TypedHandle* handle = nullptr;
1327+
ReadOptions read_options;
1328+
read_options.io_activity = GetThreadIOActivity();
13261329
statuses[file_idx] = table_cache_->FindTable(
1327-
ReadOptions(), file_options_,
1330+
read_options, file_options_,
13281331
*(base_vstorage_->InternalComparator()), *file_meta, &handle,
13291332
prefix_extractor, false /*no_io */, true /* record_read_stats */,
13301333
internal_stats->GetFileReadHist(level), false, level,

db/version_set.cc

+3-2
Original file line numberDiff line numberDiff line change
@@ -1566,8 +1566,9 @@ Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
15661566
std::unique_ptr<RandomAccessFileReader> file_reader(
15671567
new RandomAccessFileReader(
15681568
std::move(file), file_name, nullptr /* env */, io_tracer_,
1569-
nullptr /* stats */, 0 /* hist_type */, nullptr /* file_read_hist */,
1570-
nullptr /* rate_limiter */, ioptions->listeners));
1569+
nullptr /* stats */, Histograms::SST_READ_MICROS /* hist_type */,
1570+
nullptr /* file_read_hist */, nullptr /* rate_limiter */,
1571+
ioptions->listeners));
15711572
std::unique_ptr<TableProperties> props;
15721573
s = ReadTableProperties(
15731574
file_reader.get(), file_meta->fd.GetFileSize(),

file/file_util.cc

+3-3
Original file line numberDiff line numberDiff line change
@@ -185,9 +185,9 @@ IOStatus GenerateOneFileChecksum(
185185
if (!io_s.ok()) {
186186
return io_s;
187187
}
188-
reader.reset(new RandomAccessFileReader(std::move(r_file), file_path,
189-
nullptr /*Env*/, io_tracer, nullptr,
190-
0, nullptr, rate_limiter));
188+
reader.reset(new RandomAccessFileReader(
189+
std::move(r_file), file_path, nullptr /*Env*/, io_tracer, nullptr,
190+
Histograms::HISTOGRAM_ENUM_MAX, nullptr, rate_limiter));
191191
}
192192

193193
// Found that 256 KB readahead size provides the best performance, based on

file/file_util.h

+2
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ inline IOStatus PrepareIOFromReadOptions(const ReadOptions& ro,
8080
}
8181

8282
opts.rate_limiter_priority = ro.rate_limiter_priority;
83+
opts.io_activity = ro.io_activity;
84+
8385
return IOStatus::OK();
8486
}
8587

file/random_access_file_reader.cc

+18-5
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,14 @@
2020
#include "test_util/sync_point.h"
2121
#include "util/random.h"
2222
#include "util/rate_limiter.h"
23+
#include "util/thread_io_activity.h"
2324

2425
namespace ROCKSDB_NAMESPACE {
25-
26+
const std::array<Histograms, std::size_t(Env::IOActivity::kUnknown)>
27+
kReadHistograms{{
28+
SST_READ_FLUSH_MICROS,
29+
SST_READ_COMPACTION_MICROS,
30+
}};
2631
inline void RecordIOStats(Statistics* stats, Temperature file_temperature,
2732
bool is_last_level, size_t size) {
2833
IOSTATS_ADD(bytes_read, size);
@@ -94,6 +99,9 @@ IOStatus RandomAccessFileReader::Read(
9499
uint64_t elapsed = 0;
95100
{
96101
StopWatch sw(clock_, stats_, hist_type_,
102+
(opts.io_activity != Env::IOActivity::kUnknown)
103+
? kReadHistograms[(std::size_t)(opts.io_activity)]
104+
: Histograms::HISTOGRAM_ENUM_MAX,
97105
(stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/,
98106
true /*delay_enabled*/);
99107
auto prev_perf_level = GetPerfLevel();
@@ -288,6 +296,9 @@ IOStatus RandomAccessFileReader::MultiRead(
288296
uint64_t elapsed = 0;
289297
{
290298
StopWatch sw(clock_, stats_, hist_type_,
299+
(opts.io_activity != Env::IOActivity::kUnknown)
300+
? kReadHistograms[(std::size_t)(opts.io_activity)]
301+
: Histograms::HISTOGRAM_ENUM_MAX,
291302
(stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/,
292303
true /*delay_enabled*/);
293304
auto prev_perf_level = GetPerfLevel();
@@ -476,13 +487,15 @@ IOStatus RandomAccessFileReader::ReadAsync(
476487

477488
assert(read_async_info->buf_.CurrentSize() == 0);
478489

479-
StopWatch sw(clock_, nullptr /*stats*/, 0 /*hist_type*/, &elapsed,
480-
true /*overwrite*/, true /*delay_enabled*/);
490+
StopWatch sw(clock_, nullptr /*stats*/, 0 /*hist_type*/,
491+
Histograms::HISTOGRAM_ENUM_MAX, &elapsed, true /*overwrite*/,
492+
true /*delay_enabled*/);
481493
s = file_->ReadAsync(aligned_req, opts, read_async_callback,
482494
read_async_info, io_handle, del_fn, nullptr /*dbg*/);
483495
} else {
484-
StopWatch sw(clock_, nullptr /*stats*/, 0 /*hist_type*/, &elapsed,
485-
true /*overwrite*/, true /*delay_enabled*/);
496+
StopWatch sw(clock_, nullptr /*stats*/, 0 /*hist_type*/,
497+
Histograms::HISTOGRAM_ENUM_MAX, &elapsed, true /*overwrite*/,
498+
true /*delay_enabled*/);
486499
s = file_->ReadAsync(req, opts, read_async_callback, read_async_info,
487500
io_handle, del_fn, nullptr /*dbg*/);
488501
}

file/random_access_file_reader.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,8 @@ class RandomAccessFileReader {
122122
std::unique_ptr<FSRandomAccessFile>&& raf, const std::string& _file_name,
123123
SystemClock* clock = nullptr,
124124
const std::shared_ptr<IOTracer>& io_tracer = nullptr,
125-
Statistics* stats = nullptr, uint32_t hist_type = 0,
125+
Statistics* stats = nullptr,
126+
uint32_t hist_type = Histograms::HISTOGRAM_ENUM_MAX,
126127
HistogramImpl* file_read_hist = nullptr,
127128
RateLimiter* rate_limiter = nullptr,
128129
const std::vector<std::shared_ptr<EventListener>>& listeners = {},

include/rocksdb/env.h

+7
Original file line numberDiff line numberDiff line change
@@ -436,6 +436,13 @@ class Env : public Customizable {
436436
IO_TOTAL = 4
437437
};
438438

439+
// EXPERIMENTAL
440+
enum class IOActivity : uint8_t {
441+
kFlush = 0,
442+
kCompaction = 1,
443+
kUnknown = 2,
444+
};
445+
439446
// Arrange to run "(*function)(arg)" once in a background thread, in
440447
// the thread pool specified by pri. By default, jobs go to the 'LOW'
441448
// priority thread pool.

include/rocksdb/file_system.h

+2
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,8 @@ struct IOOptions {
116116
// directories and list only files in GetChildren API.
117117
bool do_not_recurse;
118118

119+
Env::IOActivity io_activity = Env::IOActivity::kUnknown;
120+
119121
IOOptions() : IOOptions(false) {}
120122

121123
explicit IOOptions(bool force_dir_fsync_)

include/rocksdb/options.h

+2
Original file line numberDiff line numberDiff line change
@@ -1696,6 +1696,8 @@ struct ReadOptions {
16961696
// Default: true
16971697
bool optimize_multiget_for_io;
16981698

1699+
Env::IOActivity io_activity = Env::IOActivity::kUnknown;
1700+
16991701
ReadOptions();
17001702
ReadOptions(bool cksum, bool cache);
17011703
};

include/rocksdb/statistics.h

+5
Original file line numberDiff line numberDiff line change
@@ -466,7 +466,12 @@ enum Histograms : uint32_t {
466466
NUM_FILES_IN_SINGLE_COMPACTION,
467467
DB_SEEK,
468468
WRITE_STALL,
469+
// Time spent reading block-based or plain SST tables
469470
SST_READ_MICROS,
471+
// Time spent reading block-based SST tables during flush and compaction, respectively
472+
SST_READ_FLUSH_MICROS,
473+
SST_READ_COMPACTION_MICROS,
474+
470475
// The number of subcompactions actually scheduled during a compaction
471476
NUM_SUBCOMPACTIONS_SCHEDULED,
472477
// Value size distribution in each operation

java/rocksjni/portal.h

+8
Original file line numberDiff line numberDiff line change
@@ -5619,6 +5619,10 @@ class HistogramTypeJni {
56195619
return 0x38;
56205620
case ROCKSDB_NAMESPACE::Histograms::TABLE_OPEN_PREFETCH_TAIL_READ_BYTES:
56215621
return 0x39;
5622+
case ROCKSDB_NAMESPACE::Histograms::SST_READ_FLUSH_MICROS:
5623+
return 0x3A;
5624+
case ROCKSDB_NAMESPACE::Histograms::SST_READ_COMPACTION_MICROS:
5625+
return 0x3B;
56225626
case ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX:
56235627
// 0x1F for backwards compatibility on current minor version.
56245628
return 0x1F;
@@ -5738,6 +5742,10 @@ class HistogramTypeJni {
57385742
case 0x39:
57395743
return ROCKSDB_NAMESPACE::Histograms::
57405744
TABLE_OPEN_PREFETCH_TAIL_READ_BYTES;
5745+
case 0x3A:
5746+
return ROCKSDB_NAMESPACE::Histograms::SST_READ_FLUSH_MICROS;
5747+
case 0x3B:
5748+
return ROCKSDB_NAMESPACE::Histograms::SST_READ_COMPACTION_MICROS;
57415749
case 0x1F:
57425750
// 0x1F for backwards compatibility on current minor version.
57435751
return ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX;

java/src/main/java/org/rocksdb/HistogramType.java

+4
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,10 @@ public enum HistogramType {
169169
*/
170170
TABLE_OPEN_PREFETCH_TAIL_READ_BYTES((byte) 0x39),
171171

172+
SST_READ_FLUSH_MICROS((byte) 0x3A),
173+
174+
SST_READ_COMPACTION_MICROS((byte) 0x3B),
175+
172176
// 0x1F for backwards compatibility on current minor version.
173177
HISTOGRAM_ENUM_MAX((byte) 0x1F);
174178

0 commit comments

Comments
 (0)