diff --git a/metrics/gprc.go b/metrics/gprc.go
new file mode 100644
index 0000000000000..33875054b64a1
--- /dev/null
+++ b/metrics/gprc.go
@@ -0,0 +1,27 @@
+// Copyright 2019 PingCAP, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package metrics
+
+import "github.com/prometheus/client_golang/prometheus"
+
+// Metrics to monitor the gRPC service. GRPCConnTransientFailureCounter is labeled by address and store; SendRequest increments it when the connection picked for a request reports connectivity.TransientFailure.
+var (
+	GRPCConnTransientFailureCounter = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: "tidb",
+			Subsystem: "grpc",
+			Name:      "connection_transient_failure_count",
+			Help:      "Counter of gRPC connection transient failure",
+		}, []string{LblAddress, LblStore}) // label order must match the WithLabelValues(addr, storeID) call in store/tikv/client.go
+)
diff --git a/metrics/metrics.go b/metrics/metrics.go
index bc1291fb30805..5a43c4168f03c 100644
--- a/metrics/metrics.go
+++ b/metrics/metrics.go
@@ -150,4 +150,5 @@ func RegisterMetrics() {
 	prometheus.MustRegister(TiKVBatchClientUnavailable)
 	prometheus.MustRegister(TiKVRangeTaskStats)
 	prometheus.MustRegister(TiKVRangeTaskPushDuration)
+	prometheus.MustRegister(GRPCConnTransientFailureCounter)
 }
diff --git a/metrics/session.go b/metrics/session.go
index 0ea3548547986..4e7559400bc60 100644
--- a/metrics/session.go
+++ b/metrics/session.go
@@ -112,4 +112,6 @@ const (
 	LblSQLType  = "sql_type"
 	LblGeneral  = "general"
 	LblInternal = "internal"
+	LblStore    = "store" // store label name; replaces the bare "store" literal previously used in metrics/tikvclient.go
+	LblAddress  = "address" // address label name; used by GRPCConnTransientFailureCounter
 )
diff --git a/metrics/tikvclient.go b/metrics/tikvclient.go
index f4cb704f7d38a..ce6be01ce596d 100644
--- a/metrics/tikvclient.go
+++ b/metrics/tikvclient.go
@@ -74,7 +74,7 @@ var (
 			Name:      "request_seconds",
 			Help:      "Bucketed histogram of sending request duration.",
 			Buckets:   prometheus.ExponentialBuckets(0.0005, 2, 20), // 0.5ms ~ 524s
-		}, []string{LblType, "store"})
+		}, []string{LblType, LblStore}) // LblStore is "store", so the exported label name is unchanged
 
 	TiKVCoprocessorHistogram = prometheus.NewHistogram(
 		prometheus.HistogramOpts{
diff --git a/store/tikv/client.go b/store/tikv/client.go
index 6a483ba3ae8f8..c263648903a64 100644
--- a/store/tikv/client.go
+++ b/store/tikv/client.go
@@ -34,6 +34,7 @@ import (
 	"github.com/pingcap/tidb/store/tikv/tikvrpc"
 	"github.com/pingcap/tidb/util/logutil"
 	"google.golang.org/grpc"
+	"google.golang.org/grpc/connectivity"
 	"google.golang.org/grpc/credentials"
 	"google.golang.org/grpc/keepalive"
 )
@@ -288,14 +289,19 @@ func (c *rpcClient) SendRequest(ctx context.Context, addr string, req *tikvrpc.R
 		}
 	}
 
+	clientConn := connArray.Get() // fetched once so the state check and the RPC client below share the same connection
+	if state := clientConn.GetState(); state == connectivity.TransientFailure { // observation only: the request is still sent on this connection
+		metrics.GRPCConnTransientFailureCounter.WithLabelValues(addr, storeID).Inc() // NOTE(review): storeID is declared outside this hunk — confirm it is in scope here
+	}
+
 	if req.IsDebugReq() {
-		client := debugpb.NewDebugClient(connArray.Get())
+		client := debugpb.NewDebugClient(clientConn)
 		ctx1, cancel := context.WithTimeout(ctx, timeout)
 		defer cancel()
 		return tikvrpc.CallDebugRPC(ctx1, client, req)
 	}
 
-	client := tikvpb.NewTikvClient(connArray.Get())
+	client := tikvpb.NewTikvClient(clientConn)
 
 	if req.Type != tikvrpc.CmdCopStream {
 		ctx1, cancel := context.WithTimeout(ctx, timeout)