Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

*: add HTTP API to generate TiDB metric profile #18272

Merged
merged 34 commits into from
Jul 14, 2020
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
655490b
init
crazycs520 Jun 19, 2020
a5c5bc9
init
crazycs520 Jun 26, 2020
7d40979
Merge branch 'add-more-metric' into profile
crazycs520 Jun 26, 2020
b0c038e
add more metric
crazycs520 Jun 27, 2020
2548269
generate dot profile
crazycs520 Jun 29, 2020
98d76be
add time parameter
crazycs520 Jun 29, 2020
4221e91
Merge branch 'master' into profile
crazycs520 Jun 29, 2020
bd32b4a
Update server/http_status.go
crazycs520 Jun 30, 2020
bde95cf
make ci happy
crazycs520 Jun 30, 2020
7a00f41
Merge branch 'master' into profile
crazycs520 Jul 6, 2020
588afb2
add more metric
crazycs520 Jul 6, 2020
0ec571f
Merge branch 'master' into profile
crazycs520 Jul 6, 2020
e08855f
add more metric
crazycs520 Jul 6, 2020
d423a8f
remove small node display and refine parent node cost calculation
crazycs520 Jul 6, 2020
927b63f
Merge branch 'profile' of https://github.com/crazycs520/tidb into pro…
crazycs520 Jul 6, 2020
6f05691
Merge branch 'master' into profile
crazycs520 Jul 6, 2020
bef2762
refine some tikv raftstore metric name
crazycs520 Jul 6, 2020
07014ba
Merge branch 'profile' of https://github.com/crazycs520/tidb into pro…
crazycs520 Jul 6, 2020
0100a7d
fix the duplicate node bug
crazycs520 Jul 6, 2020
652a7a1
Merge branch 'master' into profile
crazycs520 Jul 7, 2020
7639665
Merge branch 'master' into profile
crazycs520 Jul 7, 2020
eb37b1e
refine code
crazycs520 Jul 7, 2020
8ddd646
remove redundant code
crazycs520 Jul 7, 2020
3481671
Merge branch 'master' into profile
crazycs520 Jul 7, 2020
fbd26e0
Merge branch 'master' into profile
crazycs520 Jul 8, 2020
81fae5d
Merge branch 'master' into profile
crazycs520 Jul 8, 2020
47806bd
address comment
crazycs520 Jul 8, 2020
70884f7
refine code
crazycs520 Jul 8, 2020
8a52b54
Merge branch 'master' into profile
crazycs520 Jul 8, 2020
068105a
Merge branch 'master' into profile
crazycs520 Jul 9, 2020
c6e1e1b
Merge branch 'master' into profile
crazycs520 Jul 13, 2020
6f496de
Merge branch 'master' into profile
crazycs520 Jul 13, 2020
a5557ee
Merge branch 'master' into profile
crazycs520 Jul 13, 2020
358c493
Merge branch 'master' into profile
crazycs520 Jul 14, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
439 changes: 439 additions & 0 deletions executor/inspection_profile.go

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions executor/inspection_summary.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ var inspectionSummaryRules = map[string][]string{
"tidb_kv_backoff_duration",
"tidb_kv_request_duration",
"pd_client_cmd_duration",
"tikv_grpc_messge_duration",
"tikv_grpc_message_duration",
"tikv_average_grpc_messge_duration",
"tikv_channel_full",
"tikv_scheduler_is_busy",
Expand Down Expand Up @@ -155,7 +155,7 @@ var inspectionSummaryRules = map[string][]string{
"tikv_grpc_avg_req_batch_size",
"tikv_grpc_avg_resp_batch_size",
"tikv_grpc_errors",
"tikv_grpc_messge_duration",
"tikv_grpc_message_duration",
"tikv_grpc_qps",
"tikv_grpc_req_batch_size",
"tikv_grpc_resp_batch_size",
Expand Down Expand Up @@ -219,7 +219,7 @@ var inspectionSummaryRules = map[string][]string{
"tikv_grpc_avg_req_batch_size",
"tikv_grpc_avg_resp_batch_size",
"tikv_grpc_errors",
"tikv_grpc_messge_duration",
"tikv_grpc_message_duration",
"tikv_grpc_qps",
"tikv_grpc_req_batch_size",
"tikv_grpc_resp_batch_size",
Expand Down
50 changes: 41 additions & 9 deletions infoschema/metric_table_def.go
Original file line number Diff line number Diff line change
Expand Up @@ -1069,7 +1069,7 @@ var MetricTableMap = map[string]MetricTableDef{
Comment: "The quantile size of requests into request batch per TiKV instance",
},

"tikv_grpc_messge_duration": {
"tikv_grpc_message_duration": {
PromQL: `histogram_quantile($QUANTILE, sum(rate(tikv_grpc_msg_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,type,instance))`,
Labels: []string{"instance", "type"},
Quantile: 0.99,
Expand Down Expand Up @@ -1354,6 +1354,22 @@ var MetricTableMap = map[string]MetricTableDef{
Labels: []string{"instance", "type"},
Comment: "The average time which is caused by latch wait in command",
},
"tikv_scheduler_processing_read_duration": {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why add these metrics?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PromQL: `histogram_quantile($QUANTILE, sum(rate(tikv_scheduler_processing_read_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance,type))`,
Labels: []string{"instance", "type"},
Quantile: 0.99,
Comment: "The quantile time of scheduler processing read in command",
},
"tikv_scheduler_processing_read_total_count": {
PromQL: "sum(increase(tikv_scheduler_processing_read_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)",
Labels: []string{"instance", "type"},
Comment: "The total count of scheduler processing read in command",
},
"tikv_scheduler_processing_read_total_time": {
PromQL: "sum(increase(tikv_scheduler_processing_read_duration_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)",
Labels: []string{"instance", "type"},
Comment: "The total time of scheduler processing read in command",
},

"tikv_scheduler_keys_read": {
PromQL: `histogram_quantile($QUANTILE, sum(rate(tikv_scheduler_kv_command_key_read_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance,type))`,
Expand Down Expand Up @@ -1573,8 +1589,8 @@ var MetricTableMap = map[string]MetricTableDef{
Comment: "The quantile of time consumed when handling coprocessor requests",
},
"tikv_cop_wait_duration": {
PromQL: `histogram_quantile($QUANTILE, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,req,instance))`,
Labels: []string{"instance", "req"},
PromQL: `histogram_quantile($QUANTILE, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,req,type,instance))`,
Labels: []string{"instance", "req", "type"},
Quantile: 1,
Comment: "The quantile of time consumed when coprocessor requests are wait for being handled",
},
Expand Down Expand Up @@ -2583,6 +2599,22 @@ var MetricTableMap = map[string]MetricTableDef{
Labels: []string{"instance", "sql_type"},
Comment: "The total time of TiDB query durations(second)",
},
"tidb_txn_cmd_duration": {
PromQL: `histogram_quantile($QUANTILE, sum(rate(tidb_tikvclient_txn_cmd_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,type,instance))`,
Labels: []string{"instance", "type"},
Quantile: 0.90,
Comment: "The quantile of TiDB transaction command durations(second)",
},
"tidb_txn_cmd_total_count": {
PromQL: "sum(increase(tidb_tikvclient_txn_cmd_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)",
Labels: []string{"instance", "type"},
Comment: "The total count of TiDB transaction command",
},
"tidb_txn_cmd_total_time": {
PromQL: "sum(increase(tidb_tikvclient_txn_cmd_duration_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)",
Labels: []string{"instance", "type"},
Comment: "The total time of TiDB transaction command",
},
"tidb_slow_query_cop_process_total_count": {
PromQL: "sum(increase(tidb_server_slow_query_cop_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
Expand Down Expand Up @@ -2728,13 +2760,13 @@ var MetricTableMap = map[string]MetricTableDef{
Comment: "The total time of time consumed to handle coprocessor read requests",
},
"tikv_cop_wait_total_count": {
PromQL: "sum(increase(tikv_coprocessor_request_wait_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,req)",
Labels: []string{"instance", "req"},
PromQL: "sum(increase(tikv_coprocessor_request_wait_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,req,type)",
Labels: []string{"instance", "req", "type"},
Comment: "The total count of coprocessor requests that wait for being handled",
},
"tikv_cop_wait_total_time": {
PromQL: "sum(increase(tikv_coprocessor_request_wait_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,req)",
Labels: []string{"instance", "req"},
PromQL: "sum(increase(tikv_coprocessor_request_wait_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,req,type)",
Labels: []string{"instance", "req", "type"},
Comment: "The total time of time consumed when coprocessor requests are wait for being handled",
},
"tikv_raft_store_events_total_count": {
Expand All @@ -2757,12 +2789,12 @@ var MetricTableMap = map[string]MetricTableDef{
Labels: []string{"instance", "task"},
Comment: "The total time of time consumed when executing GC tasks",
},
"tikv_grpc_messge_total_count": {
"tikv_grpc_message_total_count": {
PromQL: "sum(increase(tikv_grpc_msg_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)",
Labels: []string{"instance", "type"},
Comment: "The total count of tikv execution gRPC message",
},
"tikv_grpc_messge_total_time": {
"tikv_grpc_message_total_time": {
PromQL: "sum(increase(tikv_grpc_msg_duration_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)",
Labels: []string{"instance", "type"},
Comment: "The total time of execution time of gRPC message",
Expand Down
2 changes: 2 additions & 0 deletions metrics/session.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,4 +140,6 @@ const (
LblOptimistic = "optimistic"
LblStore = "store"
LblAddress = "address"
LblBatchGet = "batch_get"
LblGet = "get"
)
39 changes: 39 additions & 0 deletions server/http_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,10 @@ type allServerInfoHandler struct {
*tikvHandlerTool
}

// profileHandler is the HTTP handler that serves the TiDB metric profile.
type profileHandler struct {
*tikvHandlerTool
}

// valueHandler is the handler for get value.
type valueHandler struct {
}
Expand Down Expand Up @@ -1698,6 +1702,41 @@ func (h dbTableHandler) ServeHTTP(w http.ResponseWriter, req *http.Request) {
writeData(w, dbTblInfo)
}

// ServeHTTP handles a metric profile request.
//
// The profiled time range is taken from the optional "start" and "end" form
// values, both expected in RFC3339 format. When "end" is omitted it defaults
// to the current time; when "start" is omitted it defaults to 10 minutes
// before end. The bytes produced by the profile builder are written to the
// response as-is. Session creation, time parsing and metric collection errors
// are reported to the client through writeError.
func (h profileHandler) ServeHTTP(w http.ResponseWriter, req *http.Request) {
	sctx, err := session.CreateSession(h.Store)
	if err != nil {
		writeError(w, err)
		return
	}
	// The session is only needed for the lifetime of this request. Close it
	// on every return path so that repeated requests do not leak sessions.
	defer sctx.Close()

	// parseTime interprets a form value as an RFC3339 timestamp, using the
	// session's location for values that carry no explicit zone information.
	parseTime := func(value string) (time.Time, error) {
		return time.ParseInLocation(time.RFC3339, value, sctx.GetSessionVars().Location())
	}

	// end must be resolved first because the default start is derived from it.
	end := time.Now()
	if value := req.FormValue("end"); value != "" {
		if end, err = parseTime(value); err != nil {
			writeError(w, err)
			return
		}
	}

	start := end.Add(-time.Minute * 10)
	if value := req.FormValue("start"); value != "" {
		if start, err = parseTime(value); err != nil {
			writeError(w, err)
			return
		}
	}

	pb := executor.NewProfileBuilder(sctx, start, end)
	if err = pb.Collect(); err != nil {
		writeError(w, err)
		return
	}
	// A failed write cannot be reported to the client any more; just log it.
	_, err = w.Write(pb.Build())
	terror.Log(errors.Trace(err))
}

// testHandler is the handler for tests. It's convenient to provide some APIs for integration tests.
type testHandler struct {
*tikvHandlerTool
Expand Down
2 changes: 2 additions & 0 deletions server/http_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,8 @@ func (s *Server) startHTTPServer() {
router.Handle("/mvcc/hex/{hexKey}", mvccTxnHandler{tikvHandlerTool, opMvccGetByHex})
router.Handle("/mvcc/index/{db}/{table}/{index}/{handle}", mvccTxnHandler{tikvHandlerTool, opMvccGetByIdx})

// HTTP path for generating the metric profile.
router.Handle("/metric/profile", profileHandler{tikvHandlerTool})
crazycs520 marked this conversation as resolved.
Show resolved Hide resolved
crazycs520 marked this conversation as resolved.
Show resolved Hide resolved
// HTTP path for web UI.
if host, port, err := net.SplitHostPort(s.statusAddr); err == nil {
if host == "" {
Expand Down
8 changes: 8 additions & 0 deletions store/tikv/snapshot.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"strings"
"sync"
"sync/atomic"
"time"
"unsafe"

"github.com/opentracing/opentracing-go"
Expand Down Expand Up @@ -187,6 +188,9 @@ func appendBatchKeysBySize(b []batchKeys, region RegionVerID, keys [][]byte, siz
}

func (s *tikvSnapshot) batchGetKeysByRegions(bo *Backoffer, keys [][]byte, collectF func(k, v []byte)) error {
defer func(start time.Time) {
tikvTxnCmdHistogramWithBatchGet.Observe(time.Since(start).Seconds())
AilinKid marked this conversation as resolved.
Show resolved Hide resolved
}(time.Now())
groups, _, err := s.store.regionCache.GroupKeysByRegion(bo, keys, nil)
if err != nil {
return errors.Trace(err)
Expand Down Expand Up @@ -308,6 +312,10 @@ func (s *tikvSnapshot) Get(ctx context.Context, k kv.Key) ([]byte, error) {
ctx = opentracing.ContextWithSpan(ctx, span1)
}

defer func(start time.Time) {
tikvTxnCmdHistogramWithGet.Observe(time.Since(start).Seconds())
}(time.Now())

ctx = context.WithValue(ctx, txnStartKey, s.version.Ver)
val, err := s.get(NewBackoffer(ctx, getMaxBackoff), k)
if err != nil {
Expand Down
2 changes: 2 additions & 0 deletions store/tikv/txn.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ var (
// Histograms obtained from metrics.TiKVTxnCmdHistogram, one per transaction
// command label constant (commit, rollback, batch get and get). They are
// created once at package level and reused by callers that record the
// duration of the corresponding command.
var (
tikvTxnCmdHistogramWithCommit = metrics.TiKVTxnCmdHistogram.WithLabelValues(metrics.LblCommit)
tikvTxnCmdHistogramWithRollback = metrics.TiKVTxnCmdHistogram.WithLabelValues(metrics.LblRollback)
tikvTxnCmdHistogramWithBatchGet = metrics.TiKVTxnCmdHistogram.WithLabelValues(metrics.LblBatchGet)
tikvTxnCmdHistogramWithGet = metrics.TiKVTxnCmdHistogram.WithLabelValues(metrics.LblGet)
)

// tikvTxn implements kv.Transaction.
Expand Down