Skip to content

Commit 57d1c71

Browse files
xiaguan and Copilot authored
feat(client): Add client-side metrics for transfer and RPC operations (#733)
* feat(client): Add RPC operation and transfer metrics tracking * feat(client): Add client-side metrics for transfer and RPC operations This commit introduces a comprehensive metrics system for the client component, tracking transfer byte counts and operation latencies with both human-readable summaries and Prometheus-style serialization. Key features include: - New TransferMetric for tracking read/write bytes and latency histograms - MasterClientMetric for RPC call counting and latency tracking - Environment-controlled metrics reporting (MC_STORE_METRIC_REPORT) - Automatic periodic metrics collection thread - Enhanced test coverage for metrics validation - Unified metrics interface across all client operations The implementation provides detailed latency percentiles (P50/P95) and total byte tracking with automatic unit conversion (B/KB/MB/GB). * fix test * Update mooncake-store/src/client.cpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update mooncake-store/src/client.cpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update mooncake-store/src/client.cpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * fix format issue * refactor(client): make client metrics optional using ClientMetric class * fix lint --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 335d1a1 commit 57d1c71

File tree

11 files changed

+869
-28
lines changed

11 files changed

+869
-28
lines changed

mooncake-store/include/client.h

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,11 @@
55
#include <mutex>
66
#include <optional>
77
#include <string>
8+
#include <thread>
89
#include <vector>
910
#include <ylt/util/tl/expected.hpp>
1011

12+
#include "client_metric.h"
1113
#include "ha_helper.h"
1214
#include "master_client.h"
1315
#include "storage_backend.h"
@@ -196,6 +198,24 @@ class Client {
196198
std::vector<tl::expected<bool, ErrorCode>> BatchIsExist(
197199
const std::vector<std::string>& keys);
198200

201+
// For human-readable metrics
202+
tl::expected<std::string, ErrorCode> GetSummaryMetrics() {
203+
if (metrics_ == nullptr) {
204+
return tl::make_unexpected(ErrorCode::INVALID_PARAMS);
205+
}
206+
return metrics_->summary_metrics();
207+
}
208+
209+
// For Prometheus-style metrics
210+
tl::expected<std::string, ErrorCode> SerializeMetrics() {
211+
if (metrics_ == nullptr) {
212+
return tl::make_unexpected(ErrorCode::INVALID_PARAMS);
213+
}
214+
std::string str;
215+
metrics_->serialize(str);
216+
return str;
217+
}
218+
199219
private:
200220
/**
201221
* @brief Private constructor to enforce creation through Create() method
@@ -255,6 +275,9 @@ class Client {
255275
std::vector<tl::expected<void, ErrorCode>> CollectResults(
256276
const std::vector<PutOperation>& ops);
257277

278+
// Client-side metrics
279+
std::unique_ptr<ClientMetric> metrics_;
280+
258281
// Core components
259282
TransferEngine transfer_engine_;
260283
MasterClient master_client_;
Lines changed: 276 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,276 @@
1+
#pragma once
2+
3+
#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <sstream>
#include <string>
#include <thread>
#include <vector>
#include <ylt/metric/counter.hpp>
#include <ylt/metric/histogram.hpp>
#include <ylt/metric/summary.hpp>

#include "utils.h"
11+
12+
namespace mooncake {
13+
14+
// Upper bounds of the latency histogram buckets, in microseconds.
// Tuned for RDMA: fine-grained below 1ms, with a ms-scale tail up to 1s.
// Must stay strictly increasing.
//
// Declared `inline` so that every translation unit including this header
// shares one vector; a plain namespace-scope `const` object has internal
// linkage and would be constructed once per translation unit.
inline const std::vector<double> kLatencyBucket = {
    // sub-ms to 1ms region
    125, 150, 200, 250, 300, 400, 500, 750, 1000,
    // ms-level tail for batch/occasional spikes
    1500, 2000, 3000, 5000, 7000, 15000, 20000,
    // safeguards for long tails
    50000, 100000, 200000, 500000, 1000000};
23+
24+
struct TransferMetric {
25+
ylt::metric::counter_t total_read_bytes{"mooncake_transfer_read_bytes",
26+
"Total bytes read"};
27+
ylt::metric::counter_t total_write_bytes{"mooncake_transfer_write_bytes",
28+
"Total bytes written"};
29+
ylt::metric::histogram_t batch_put_latency_us{
30+
"mooncake_transfer_batch_put_latency",
31+
"Batch Put transfer latency (us)", kLatencyBucket};
32+
ylt::metric::histogram_t batch_get_latency_us{
33+
"mooncake_transfer_batch_get_latency",
34+
"Batch Get transfer latency (us)", kLatencyBucket};
35+
ylt::metric::histogram_t get_latency_us{"mooncake_transfer_get_latency",
36+
"Get transfer latency (us)",
37+
kLatencyBucket};
38+
ylt::metric::histogram_t put_latency_us{"mooncake_transfer_put_latency",
39+
"Put transfer latency (us)",
40+
kLatencyBucket};
41+
42+
void serialize(std::string& str) {
43+
total_read_bytes.serialize(str);
44+
total_write_bytes.serialize(str);
45+
batch_put_latency_us.serialize(str);
46+
batch_get_latency_us.serialize(str);
47+
get_latency_us.serialize(str);
48+
put_latency_us.serialize(str);
49+
}
50+
51+
std::string summary_metrics() {
52+
std::stringstream ss;
53+
ss << "=== Transfer Metrics Summary ===\n";
54+
55+
// Bytes transferred
56+
auto read_bytes = total_read_bytes.value();
57+
auto write_bytes = total_write_bytes.value();
58+
ss << "Total Read: " << byte_size_to_string(read_bytes) << "\n";
59+
ss << "Total Write: " << byte_size_to_string(write_bytes) << "\n";
60+
61+
// Latency summaries
62+
ss << "\n=== Latency Summary (microseconds) ===\n";
63+
ss << "Get: " << format_latency_summary(get_latency_us) << "\n";
64+
ss << "Put: " << format_latency_summary(put_latency_us) << "\n";
65+
ss << "Batch Get: " << format_latency_summary(batch_get_latency_us)
66+
<< "\n";
67+
ss << "Batch Put: " << format_latency_summary(batch_put_latency_us)
68+
<< "\n";
69+
70+
return ss.str();
71+
}
72+
73+
private:
74+
std::string format_latency_summary(ylt::metric::histogram_t& hist) {
75+
// Access the internal sum and bucket counts
76+
auto sum_ptr =
77+
const_cast<ylt::metric::histogram_t&>(hist).get_bucket_counts();
78+
if (sum_ptr.empty()) {
79+
return "No data";
80+
}
81+
82+
// Calculate total count from all buckets
83+
int64_t total_count = 0;
84+
for (auto& bucket : sum_ptr) {
85+
total_count += bucket->value();
86+
}
87+
88+
if (total_count == 0) {
89+
return "No data";
90+
}
91+
92+
// Get sum from the histogram's internal sum gauge
93+
// Note: We need to access the private sum_ member, which requires
94+
// friendship or reflection For now, let's use a simpler approach
95+
// showing just count
96+
std::stringstream ss;
97+
ss << "count=" << total_count;
98+
99+
// Find P95
100+
int64_t p95_target = (total_count * 95) / 100;
101+
int64_t cumulative = 0;
102+
double p95_bucket = 0;
103+
104+
for (size_t i = 0; i < sum_ptr.size() && i < kLatencyBucket.size();
105+
i++) {
106+
cumulative += sum_ptr[i]->value();
107+
if (cumulative >= p95_target && p95_bucket == 0) {
108+
p95_bucket = kLatencyBucket[i];
109+
break;
110+
}
111+
}
112+
113+
if (p95_bucket > 0) {
114+
ss << ", p95<" << p95_bucket << "μs";
115+
}
116+
117+
// Find max bucket (highest bucket with data)
118+
double max_bucket = 0;
119+
for (size_t i = sum_ptr.size(); i > 0; i--) {
120+
size_t idx = i - 1;
121+
if (idx < kLatencyBucket.size() && sum_ptr[idx]->value() > 0) {
122+
max_bucket = kLatencyBucket[idx];
123+
break;
124+
}
125+
}
126+
127+
if (max_bucket > 0) {
128+
ss << ", max<" << max_bucket << "μs";
129+
}
130+
131+
return ss.str();
132+
}
133+
};
134+
135+
struct MasterClientMetric {
136+
std::array<std::string, 1> rpc_names = {"rpc_name"};
137+
138+
MasterClientMetric()
139+
: rpc_count("mooncake_client_rpc_count",
140+
"Total number of RPC calls made by the client", rpc_names),
141+
rpc_latency("mooncake_client_rpc_latency",
142+
"Latency of RPC calls made by the client (in us)",
143+
kLatencyBucket, rpc_names) {}
144+
145+
ylt::metric::dynamic_counter_1t rpc_count;
146+
ylt::metric::dynamic_histogram_1t rpc_latency;
147+
void serialize(std::string& str) {
148+
rpc_count.serialize(str);
149+
rpc_latency.serialize(str);
150+
}
151+
152+
std::string summary_metrics() {
153+
std::stringstream ss;
154+
ss << "=== RPC Metrics Summary ===\n";
155+
156+
// For dynamic metrics, we need to check if there are any labels with
157+
// data
158+
if (rpc_count.label_value_count() == 0) {
159+
ss << "No RPC calls recorded\n";
160+
return ss.str();
161+
}
162+
163+
// Get all available RPC names from the dynamic metrics
164+
// We'll iterate through all possible RPC names instead of using a fixed
165+
// list
166+
std::vector<std::string> all_rpc_names = {"GetReplicaList",
167+
"PutStart",
168+
"PutEnd",
169+
"PutRevoke",
170+
"ExistKey",
171+
"Remove",
172+
"RemoveAll",
173+
"MountSegment",
174+
"UnmountSegment",
175+
"GetFsdir",
176+
"BatchGetReplicaList",
177+
"BatchPutStart",
178+
"BatchPutEnd",
179+
"BatchPutRevoke"};
180+
181+
bool found_any = false;
182+
for (const auto& rpc_name : all_rpc_names) {
183+
std::array<std::string, 1> label_array = {rpc_name};
184+
185+
// Check if this RPC has any data by trying to access bucket counts
186+
auto bucket_counts = rpc_latency.get_bucket_counts();
187+
int64_t total_count = 0;
188+
for (auto& bucket : bucket_counts) {
189+
total_count += bucket->value(label_array);
190+
}
191+
192+
// Skip RPCs with zero count
193+
if (total_count == 0) continue;
194+
195+
found_any = true;
196+
ss << rpc_name << ": count=" << total_count;
197+
198+
// Find P95
199+
int64_t p95_target = (total_count * 95) / 100;
200+
int64_t cumulative = 0;
201+
double p95_bucket = 0;
202+
203+
for (size_t i = 0;
204+
i < bucket_counts.size() && i < kLatencyBucket.size(); i++) {
205+
cumulative += bucket_counts[i]->value(label_array);
206+
if (cumulative >= p95_target && p95_bucket == 0) {
207+
p95_bucket = kLatencyBucket[i];
208+
break;
209+
}
210+
}
211+
212+
if (p95_bucket > 0) {
213+
ss << ", p95<" << p95_bucket << "μs";
214+
}
215+
216+
// Find max bucket (highest bucket with data)
217+
double max_bucket = 0;
218+
for (size_t i = bucket_counts.size(); i > 0; i--) {
219+
size_t idx = i - 1;
220+
if (idx < kLatencyBucket.size() &&
221+
bucket_counts[idx]->value(label_array) > 0) {
222+
max_bucket = kLatencyBucket[idx];
223+
break;
224+
}
225+
}
226+
227+
if (max_bucket > 0) {
228+
ss << ", max<" << max_bucket << "μs";
229+
}
230+
231+
ss << "\n";
232+
}
233+
234+
if (!found_any) {
235+
ss << "No RPC calls recorded\n";
236+
}
237+
238+
return ss.str();
239+
}
240+
};
241+
242+
struct ClientMetric {
243+
TransferMetric transfer_metric;
244+
MasterClientMetric master_client_metric;
245+
246+
/**
247+
* @brief Creates a ClientMetric instance based on environment variables
248+
* @return std::unique_ptr<ClientMetric> containing the instance if enabled,
249+
* nullptr if disabled
250+
*
251+
* Environment variables:
252+
* - MC_STORE_CLIENT_METRIC: Enable/disable metrics (enabled by default,
253+
* set to 0/false to disable)
254+
* - MC_STORE_CLIENT_METRIC_INTERVAL: Reporting interval in seconds
255+
* (default: 0, 0 = collect but don't report)
256+
*/
257+
static std::unique_ptr<ClientMetric> Create();
258+
259+
void serialize(std::string& str);
260+
std::string summary_metrics();
261+
262+
uint64_t GetReportingInterval() const { return metrics_interval_seconds_; }
263+
264+
explicit ClientMetric(uint64_t interval_seconds = 0);
265+
~ClientMetric();
266+
267+
private:
268+
// Metrics reporting thread management
269+
std::jthread metrics_reporting_thread_;
270+
std::atomic<bool> should_stop_metrics_thread_{false};
271+
uint64_t metrics_interval_seconds_{0};
272+
273+
void StartMetricsReportingThread();
274+
void StopMetricsReportingThread();
275+
};
276+
}; // namespace mooncake

mooncake-store/include/master_client.h

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,10 @@
44
#include <string>
55
#include <vector>
66
#include <ylt/coro_rpc/coro_rpc_client.hpp>
7+
#include "client_metric.h"
78

8-
#include "rpc_service.h"
99
#include "types.h"
1010

11-
using namespace async_simple;
12-
using namespace coro_rpc;
13-
1411
namespace mooncake {
1512

1613
static const std::string kDefaultMasterAddress = "localhost:50051";
@@ -20,7 +17,7 @@ static const std::string kDefaultMasterAddress = "localhost:50051";
2017
*/
2118
class MasterClient {
2219
public:
23-
MasterClient();
20+
MasterClient(MasterClientMetric* metrics = nullptr) : metrics_(metrics) {}
2421
~MasterClient();
2522

2623
MasterClient(const MasterClient&) = delete;
@@ -219,22 +216,25 @@ class MasterClient {
219216
*/
220217
class RpcClientAccessor {
221218
public:
222-
void SetClient(std::shared_ptr<coro_rpc_client> client) {
219+
void SetClient(std::shared_ptr<coro_rpc::coro_rpc_client> client) {
223220
std::lock_guard<std::shared_mutex> lock(client_mutex_);
224221
client_ = client;
225222
}
226223

227-
std::shared_ptr<coro_rpc_client> GetClient() {
224+
std::shared_ptr<coro_rpc::coro_rpc_client> GetClient() {
228225
std::shared_lock<std::shared_mutex> lock(client_mutex_);
229226
return client_;
230227
}
231228

232229
private:
233230
mutable std::shared_mutex client_mutex_;
234-
std::shared_ptr<coro_rpc_client> client_;
231+
std::shared_ptr<coro_rpc::coro_rpc_client> client_;
235232
};
236233
RpcClientAccessor client_accessor_;
237234

235+
// Metrics for tracking RPC operations
236+
MasterClientMetric* metrics_;
237+
238238
// Mutex to ensure the Connect function is atomic.
239239
mutable Mutex connect_mutex_;
240240
// The address which is passed to the coro_rpc_client

0 commit comments

Comments
 (0)