Skip to content

Commit 6e5a8e9

Browse files
committed
draft
1 parent 646a708 commit 6e5a8e9

23 files changed

+125
-20
lines changed

CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -895,6 +895,7 @@ set(SOURCES
895895
util/stderr_logger.cc
896896
util/string_util.cc
897897
util/thread_local.cc
898+
util/thread_io_activity.cc
898899
util/threadpool_imp.cc
899900
util/xxhash.cc
900901
utilities/agg_merge/agg_merge.cc

TARGETS

+2
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
262262
"util/status.cc",
263263
"util/stderr_logger.cc",
264264
"util/string_util.cc",
265+
"util/thread_io_activity.cc",
265266
"util/thread_local.cc",
266267
"util/threadpool_imp.cc",
267268
"util/xxhash.cc",
@@ -608,6 +609,7 @@ cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[
608609
"util/status.cc",
609610
"util/stderr_logger.cc",
610611
"util/string_util.cc",
612+
"util/thread_io_activity.cc",
611613
"util/thread_local.cc",
612614
"util/threadpool_imp.cc",
613615
"util/xxhash.cc",

db/builder.cc

+1
Original file line numberDiff line numberDiff line change
@@ -370,6 +370,7 @@ Status BuildTable(
370370
// No matter whether use_direct_io_for_flush_and_compaction is true,
371371
// the goal is to cache it here for further user reads.
372372
ReadOptions read_options;
373+
read_options.io_activity = Env::IOActivity::kFlush;
373374
std::unique_ptr<InternalIterator> it(table_cache->NewIterator(
374375
read_options, file_options, tboptions.internal_comparator, *meta,
375376
nullptr /* range_del_agg */, mutable_cf_options.prefix_extractor,

db/compaction/compaction_job.cc

+8-1
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
#include "table/unique_id_impl.h"
5757
#include "test_util/sync_point.h"
5858
#include "util/stop_watch.h"
59+
#include "util/thread_io_activity.h"
5960

6061
namespace ROCKSDB_NAMESPACE {
6162

@@ -617,6 +618,7 @@ Status CompactionJob::Run() {
617618
AutoThreadOperationStageUpdater stage_updater(
618619
ThreadStatus::STAGE_COMPACTION_RUN);
619620
TEST_SYNC_POINT("CompactionJob::Run():Start");
621+
ThreadIOActivityGuard thread_io_activity_guard(Env::IOActivity::kCompaction);
620622
log_buffer_->FlushBufferToLog();
621623
LogCompaction();
622624

@@ -710,6 +712,8 @@ Status CompactionJob::Run() {
710712
compact_->compaction->mutable_cf_options()->prefix_extractor;
711713
std::atomic<size_t> next_file_idx(0);
712714
auto verify_table = [&](Status& output_status) {
715+
ThreadIOActivityGuard verify_table_thread_io_activity_guard(
716+
Env::IOActivity::kCompaction);
713717
while (true) {
714718
size_t file_idx = next_file_idx.fetch_add(1);
715719
if (file_idx >= files_output.size()) {
@@ -723,6 +727,8 @@ Status CompactionJob::Run() {
723727
// verification as user reads since the goal is to cache it here for
724728
// further user reads
725729
ReadOptions read_options;
730+
read_options.io_activity = Env::IOActivity::kCompaction;
731+
726732
InternalIterator* iter = cfd->table_cache()->NewIterator(
727733
read_options, file_options_, cfd->internal_comparator(),
728734
files_output[file_idx]->meta, /*range_del_agg=*/nullptr,
@@ -1032,7 +1038,7 @@ void CompactionJob::NotifyOnSubcompactionCompleted(
10321038
void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
10331039
assert(sub_compact);
10341040
assert(sub_compact->compaction);
1035-
1041+
ThreadIOActivityGuard thread_io_activity_guard(Env::IOActivity::kCompaction);
10361042
if (db_options_.compaction_service) {
10371043
CompactionServiceJobStatus comp_status =
10381044
ProcessKeyValueCompactionWithCompactionService(sub_compact);
@@ -1083,6 +1089,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
10831089
read_options.verify_checksums = true;
10841090
read_options.fill_cache = false;
10851091
read_options.rate_limiter_priority = GetRateLimiterPriority();
1092+
read_options.io_activity = Env::IOActivity::kCompaction;
10861093
// Compaction iterators shouldn't be confined to a single prefix.
10871094
// Compactions use Seek() for
10881095
// (a) concurrent compactions,

db/db_impl/db_impl_open.cc

+3
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "rocksdb/wal_filter.h"
2525
#include "test_util/sync_point.h"
2626
#include "util/rate_limiter.h"
27+
#include "util/thread_io_activity.h"
2728

2829
namespace ROCKSDB_NAMESPACE {
2930
Options SanitizeOptions(const std::string& dbname, const Options& src,
@@ -1557,6 +1558,7 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
15571558
Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
15581559
MemTable* mem, VersionEdit* edit) {
15591560
mutex_.AssertHeld();
1561+
ThreadIOActivityGuard thread_io_activity_guard(Env::IOActivity::kFlush);
15601562
assert(cfd);
15611563
assert(cfd->imm());
15621564
// The immutable memtable list must be empty.
@@ -1700,6 +1702,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
17001702
InternalStats::BYTES_FLUSHED,
17011703
stats.bytes_written + stats.bytes_written_blob);
17021704
RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize());
1705+
thread_io_activity = Env::IOActivity::kUnknown;
17031706
return s;
17041707
}
17051708

db/db_impl/db_impl_write.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -1805,7 +1805,7 @@ Status DBImpl::DelayWrite(uint64_t num_bytes, WriteThread& write_thread,
18051805
bool delayed = false;
18061806
{
18071807
StopWatch sw(immutable_db_options_.clock, stats_, WRITE_STALL,
1808-
&time_delayed);
1808+
Histograms::HISTOGRAM_ENUM_MAX, &time_delayed);
18091809
// To avoid parallel timed delays (bad throttling), only support them
18101810
// on the primary write queue.
18111811
uint64_t delay;

db/flush_job.cc

+2-1
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
#include "util/coding.h"
4747
#include "util/mutexlock.h"
4848
#include "util/stop_watch.h"
49+
#include "util/thread_io_activity.h"
4950

5051
namespace ROCKSDB_NAMESPACE {
5152

@@ -211,6 +212,7 @@ Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta,
211212
bool* switched_to_mempurge) {
212213
TEST_SYNC_POINT("FlushJob::Start");
213214
db_mutex_->AssertHeld();
215+
ThreadIOActivityGuard thread_io_activity_guard(Env::IOActivity::kFlush);
214216
assert(pick_memtable_called);
215217
// Mempurge threshold can be dynamically changed.
216218
// For sake of consistency, mempurge_threshold is
@@ -350,7 +352,6 @@ Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta,
350352
stream << "file_cpu_read_nanos"
351353
<< (IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos);
352354
}
353-
354355
return s;
355356
}
356357

db/perf_context_test.cc

+2-1
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,8 @@ TEST_F(PerfContextTest, StopWatchOverhead) {
187187
uint64_t elapsed = 0;
188188
std::vector<uint64_t> timings(kTotalIterations);
189189

190-
StopWatch timer(SystemClock::Default().get(), nullptr, 0, &elapsed);
190+
StopWatch timer(SystemClock::Default().get(), nullptr, 0,
191+
Histograms::HISTOGRAM_ENUM_MAX, &elapsed);
191192
for (auto& timing : timings) {
192193
timing = elapsed;
193194
}

db/version_builder.cc

+4-1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
#include "port/port.h"
3434
#include "table/table_reader.h"
3535
#include "util/string_util.h"
36+
#include "util/thread_io_activity.h"
3637

3738
namespace ROCKSDB_NAMESPACE {
3839

@@ -1323,8 +1324,10 @@ class VersionBuilder::Rep {
13231324
auto* file_meta = files_meta[file_idx].first;
13241325
int level = files_meta[file_idx].second;
13251326
TableCache::TypedHandle* handle = nullptr;
1327+
ReadOptions read_options;
1328+
read_options.io_activity = thread_io_activity;
13261329
statuses[file_idx] = table_cache_->FindTable(
1327-
ReadOptions(), file_options_,
1330+
read_options, file_options_,
13281331
*(base_vstorage_->InternalComparator()), *file_meta, &handle,
13291332
prefix_extractor, false /*no_io */, true /* record_read_stats */,
13301333
internal_stats->GetFileReadHist(level), false, level,

file/file_util.h

+2
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ inline IOStatus PrepareIOFromReadOptions(const ReadOptions& ro,
8080
}
8181

8282
opts.rate_limiter_priority = ro.rate_limiter_priority;
83+
opts.io_activity = ro.io_activity;
84+
8385
return IOStatus::OK();
8486
}
8587

file/random_access_file_reader.cc

+18-5
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,14 @@
2020
#include "test_util/sync_point.h"
2121
#include "util/random.h"
2222
#include "util/rate_limiter.h"
23+
#include "util/thread_io_activity.h"
2324

2425
namespace ROCKSDB_NAMESPACE {
25-
26+
const std::array<Histograms, std::size_t(Env::IOActivity::kUnknown)>
27+
kReadHistograms{{
28+
SST_READ_FLUSH_MICROS,
29+
SST_READ_COMPACTION_MICROS,
30+
}};
2631
inline void RecordIOStats(Statistics* stats, Temperature file_temperature,
2732
bool is_last_level, size_t size) {
2833
IOSTATS_ADD(bytes_read, size);
@@ -94,6 +99,9 @@ IOStatus RandomAccessFileReader::Read(
9499
uint64_t elapsed = 0;
95100
{
96101
StopWatch sw(clock_, stats_, hist_type_,
102+
(opts.io_activity != Env::IOActivity::kUnknown)
103+
? kReadHistograms[(std::size_t)(opts.io_activity)]
104+
: Histograms::HISTOGRAM_ENUM_MAX,
97105
(stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/,
98106
true /*delay_enabled*/);
99107
auto prev_perf_level = GetPerfLevel();
@@ -288,6 +296,9 @@ IOStatus RandomAccessFileReader::MultiRead(
288296
uint64_t elapsed = 0;
289297
{
290298
StopWatch sw(clock_, stats_, hist_type_,
299+
(opts.io_activity != Env::IOActivity::kUnknown)
300+
? kReadHistograms[(std::size_t)(opts.io_activity)]
301+
: Histograms::HISTOGRAM_ENUM_MAX,
291302
(stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/,
292303
true /*delay_enabled*/);
293304
auto prev_perf_level = GetPerfLevel();
@@ -476,13 +487,15 @@ IOStatus RandomAccessFileReader::ReadAsync(
476487

477488
assert(read_async_info->buf_.CurrentSize() == 0);
478489

479-
StopWatch sw(clock_, nullptr /*stats*/, 0 /*hist_type*/, &elapsed,
480-
true /*overwrite*/, true /*delay_enabled*/);
490+
StopWatch sw(clock_, nullptr /*stats*/, 0 /*hist_type*/,
491+
Histograms::HISTOGRAM_ENUM_MAX, &elapsed, true /*overwrite*/,
492+
true /*delay_enabled*/);
481493
s = file_->ReadAsync(aligned_req, opts, read_async_callback,
482494
read_async_info, io_handle, del_fn, nullptr /*dbg*/);
483495
} else {
484-
StopWatch sw(clock_, nullptr /*stats*/, 0 /*hist_type*/, &elapsed,
485-
true /*overwrite*/, true /*delay_enabled*/);
496+
StopWatch sw(clock_, nullptr /*stats*/, 0 /*hist_type*/,
497+
Histograms::HISTOGRAM_ENUM_MAX, &elapsed, true /*overwrite*/,
498+
true /*delay_enabled*/);
486499
s = file_->ReadAsync(req, opts, read_async_callback, read_async_info,
487500
io_handle, del_fn, nullptr /*dbg*/);
488501
}

include/rocksdb/env.h

+7
Original file line numberDiff line numberDiff line change
@@ -436,6 +436,13 @@ class Env : public Customizable {
436436
IO_TOTAL = 4
437437
};
438438

439+
// EXPERIMENTAL
440+
enum class IOActivity : uint8_t {
441+
kFlush = 0,
442+
kCompaction = 1,
443+
kUnknown = 2,
444+
};
445+
439446
// Arrange to run "(*function)(arg)" once in a background thread, in
440447
// the thread pool specified by pri. By default, jobs go to the 'LOW'
441448
// priority thread pool.

include/rocksdb/file_system.h

+2
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,8 @@ struct IOOptions {
116116
// directories and list only files in GetChildren API.
117117
bool do_not_recurse;
118118

119+
Env::IOActivity io_activity = Env::IOActivity::kUnknown;
120+
119121
IOOptions() : IOOptions(false) {}
120122

121123
explicit IOOptions(bool force_dir_fsync_)

include/rocksdb/options.h

+2
Original file line numberDiff line numberDiff line change
@@ -1696,6 +1696,8 @@ struct ReadOptions {
16961696
// Default: true
16971697
bool optimize_multiget_for_io;
16981698

1699+
Env::IOActivity io_activity = Env::IOActivity::kUnknown;
1700+
16991701
ReadOptions();
17001702
ReadOptions(bool cksum, bool cache);
17011703
};

include/rocksdb/statistics.h

+3
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,9 @@ enum Histograms : uint32_t {
467467
DB_SEEK,
468468
WRITE_STALL,
469469
SST_READ_MICROS,
470+
SST_READ_FLUSH_MICROS,
471+
SST_READ_COMPACTION_MICROS,
472+
470473
// The number of subcompactions actually scheduled during a compaction
471474
NUM_SUBCOMPACTIONS_SCHEDULED,
472475
// Value size distribution in each operation

java/rocksjni/portal.h

+8
Original file line numberDiff line numberDiff line change
@@ -5619,6 +5619,10 @@ class HistogramTypeJni {
56195619
return 0x38;
56205620
case ROCKSDB_NAMESPACE::Histograms::TABLE_OPEN_PREFETCH_TAIL_READ_BYTES:
56215621
return 0x39;
5622+
case ROCKSDB_NAMESPACE::Histograms::SST_READ_FLUSH_MICROS:
5623+
return 0x3A;
5624+
case ROCKSDB_NAMESPACE::Histograms::SST_READ_COMPACTION_MICROS:
5625+
return 0x3B;
56225626
case ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX:
56235627
// 0x1F for backwards compatibility on current minor version.
56245628
return 0x1F;
@@ -5738,6 +5742,10 @@ class HistogramTypeJni {
57385742
case 0x39:
57395743
return ROCKSDB_NAMESPACE::Histograms::
57405744
TABLE_OPEN_PREFETCH_TAIL_READ_BYTES;
5745+
case 0x3A:
5746+
return ROCKSDB_NAMESPACE::Histograms::SST_READ_FLUSH_MICROS;
5747+
case 0x3B:
5748+
return ROCKSDB_NAMESPACE::Histograms::SST_READ_COMPACTION_MICROS;
57415749
case 0x1F:
57425750
// 0x1F for backwards compatibility on current minor version.
57435751
return ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX;

java/src/main/java/org/rocksdb/HistogramType.java

+4
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,10 @@ public enum HistogramType {
169169
*/
170170
TABLE_OPEN_PREFETCH_TAIL_READ_BYTES((byte) 0x39),
171171

172+
SST_READ_FLUSH_MICROS((byte) 0x3A),
173+
174+
SST_READ_COMPACTION_MICROS((byte) 0x3B),
175+
172176
// 0x1F for backwards compatibility on current minor version.
173177
HISTOGRAM_ENUM_MAX((byte) 0x1F);
174178

monitoring/statistics.cc

+2
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,8 @@ const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
240240
{DB_SEEK, "rocksdb.db.seek.micros"},
241241
{WRITE_STALL, "rocksdb.db.write.stall"},
242242
{SST_READ_MICROS, "rocksdb.sst.read.micros"},
243+
{SST_READ_FLUSH_MICROS, "rocksdb.sst.read.flush.micros"},
244+
{SST_READ_COMPACTION_MICROS, "rocksdb.sst.read.compaction.micros"},
243245
{NUM_SUBCOMPACTIONS_SCHEDULED, "rocksdb.num.subcompactions.scheduled"},
244246
{BYTES_PER_READ, "rocksdb.bytes.per.read"},
245247
{BYTES_PER_WRITE, "rocksdb.bytes.per.write"},

src.mk

+1
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,7 @@ LIB_SOURCES = \
250250
util/stderr_logger.cc \
251251
util/string_util.cc \
252252
util/thread_local.cc \
253+
util/thread_io_activity.cc \
253254
util/threadpool_imp.cc \
254255
util/xxhash.cc \
255256
utilities/agg_merge/agg_merge.cc \

table/block_based/block_based_table_reader.cc

+1
Original file line numberDiff line numberDiff line change
@@ -583,6 +583,7 @@ Status BlockBasedTable::Open(
583583
ro.io_timeout = read_options.io_timeout;
584584
ro.rate_limiter_priority = read_options.rate_limiter_priority;
585585
ro.verify_checksums = read_options.verify_checksums;
586+
ro.io_activity = read_options.io_activity;
586587

587588
// prefetch both index and filters, down to all partitions
588589
const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0;

util/stop_watch.h

+19-10
Original file line numberDiff line numberDiff line change
@@ -9,23 +9,28 @@
99

1010
namespace ROCKSDB_NAMESPACE {
1111
// Auto-scoped.
12-
// Records the measure time into the corresponding histogram if statistics
12+
// Records the measure time into the corresponding histogram(s) if statistics
1313
// is not nullptr. It is also saved into *elapsed if the pointer is not nullptr
1414
// and overwrite is true, it will be added to *elapsed if overwrite is false.
1515
class StopWatch {
1616
public:
1717
StopWatch(SystemClock* clock, Statistics* statistics,
18-
const uint32_t hist_type, uint64_t* elapsed = nullptr,
19-
bool overwrite = true, bool delay_enabled = false)
18+
const uint32_t hist_type_1,
19+
const uint32_t hist_type_2 = Histograms::HISTOGRAM_ENUM_MAX,
20+
uint64_t* elapsed = nullptr, bool overwrite = true,
21+
bool delay_enabled = false)
2022
: clock_(clock),
2123
statistics_(statistics),
22-
hist_type_(hist_type),
24+
hist_type_1_(hist_type_1),
25+
hist_type_2_(hist_type_2),
2326
elapsed_(elapsed),
2427
overwrite_(overwrite),
2528
stats_enabled_(statistics &&
2629
statistics->get_stats_level() >=
2730
StatsLevel::kExceptTimers &&
28-
statistics->HistEnabledForType(hist_type)),
31+
statistics->HistEnabledForType(hist_type_1) &&
32+
(hist_type_2 == Histograms::HISTOGRAM_ENUM_MAX ||
33+
statistics->HistEnabledForType(hist_type_2))),
2934
delay_enabled_(delay_enabled),
3035
total_delay_(0),
3136
delay_start_time_(0),
@@ -44,10 +49,13 @@ class StopWatch {
4449
*elapsed_ -= total_delay_;
4550
}
4651
if (stats_enabled_) {
47-
statistics_->reportTimeToHistogram(
48-
hist_type_, (elapsed_ != nullptr)
49-
? *elapsed_
50-
: (clock_->NowMicros() - start_time_));
52+
const auto time = (elapsed_ != nullptr)
53+
? *elapsed_
54+
: (clock_->NowMicros() - start_time_);
55+
statistics_->reportTimeToHistogram(hist_type_1_, time);
56+
if (hist_type_2_ != Histograms::HISTOGRAM_ENUM_MAX) {
57+
statistics_->reportTimeToHistogram(hist_type_2_, time);
58+
}
5159
}
5260
}
5361

@@ -75,7 +83,8 @@ class StopWatch {
7583
private:
7684
SystemClock* clock_;
7785
Statistics* statistics_;
78-
const uint32_t hist_type_;
86+
const uint32_t hist_type_1_;
87+
const uint32_t hist_type_2_;
7988
uint64_t* elapsed_;
8089
bool overwrite_;
8190
bool stats_enabled_;

0 commit comments

Comments
 (0)