Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

*: add maxprocs metrics for mcs #7604

Merged
merged 8 commits into from
Dec 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 16 additions & 11 deletions pkg/mcs/scheduling/server/metrics.go → pkg/basicserver/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,22 +16,27 @@ package server

import "github.com/prometheus/client_golang/prometheus"

const (
namespace = "scheduling"
serverSubsystem = "server"
)

var (
// Meta & Server info.
serverInfo = prometheus.NewGaugeVec(
// ServerMaxProcsGauge record the maxprocs.
ServerMaxProcsGauge = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: "pd",
Subsystem: "service",
Name: "maxprocs",
Help: "The value of GOMAXPROCS.",
})

// ServerInfoGauge indicates the pd server info including version and git hash.
ServerInfoGauge = prometheus.NewGaugeVec(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another option: we could define separate metrics for the different services here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One metric is probably enough. We can get the instance roles from the service-role plane and find the version info by instance.

prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: serverSubsystem,
Namespace: "pd",
Subsystem: "server",
Name: "info",
Help: "Indicate the scheduling server info, and the value is the start timestamp (s).",
Help: "Indicate the pd server info, and the value is the start timestamp (s).",
}, []string{"version", "hash"})
)

func init() {
prometheus.MustRegister(serverInfo)
prometheus.MustRegister(ServerMaxProcsGauge)
prometheus.MustRegister(ServerInfoGauge)
}
9 changes: 0 additions & 9 deletions pkg/mcs/resourcemanager/server/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,6 @@ const (
)

var (
// Meta & Server info.
serverInfo = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: serverSubsystem,
Name: "info",
Help: "Indicate the resource manager server info, and the value is the start timestamp (s).",
}, []string{"version", "hash"})
// RU cost metrics.
// `sum` is added to the name to maintain compatibility with the previous use of histogram.
readRequestUnitCost = prometheus.NewCounterVec(
Expand Down Expand Up @@ -111,7 +103,6 @@ var (
)

func init() {
prometheus.MustRegister(serverInfo)
prometheus.MustRegister(readRequestUnitCost)
prometheus.MustRegister(writeRequestUnitCost)
prometheus.MustRegister(sqlLayerRequestUnitCost)
Expand Down
4 changes: 3 additions & 1 deletion pkg/mcs/resourcemanager/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"net/http"
"os"
"os/signal"
"runtime"
"strconv"
"sync"
"sync/atomic"
Expand Down Expand Up @@ -294,7 +295,8 @@ func (s *Server) startServer() (err error) {
log.Info("init cluster id", zap.Uint64("cluster-id", s.clusterID))
// The independent Resource Manager service still reuses PD version info since PD and Resource Manager are just
// different service modes provided by the same pd-server binary
serverInfo.WithLabelValues(versioninfo.PDReleaseVersion, versioninfo.PDGitHash).Set(float64(time.Now().Unix()))
bs.ServerInfoGauge.WithLabelValues(versioninfo.PDReleaseVersion, versioninfo.PDGitHash).Set(float64(time.Now().Unix()))
bs.ServerMaxProcsGauge.Set(float64(runtime.GOMAXPROCS(0)))

uniqueName := s.cfg.GetAdvertiseListenAddr()
uniqueID := memberutil.GenerateUniqueID(uniqueName)
Expand Down
5 changes: 3 additions & 2 deletions pkg/mcs/scheduling/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"net/http"
"os"
"os/signal"
"runtime"
"strconv"
"sync"
"sync/atomic"
Expand Down Expand Up @@ -409,8 +410,8 @@ func (s *Server) startServer() (err error) {
log.Info("init cluster id", zap.Uint64("cluster-id", s.clusterID))
// The independent Scheduling service still reuses PD version info since PD and Scheduling are just
// different service modes provided by the same pd-server binary
serverInfo.WithLabelValues(versioninfo.PDReleaseVersion, versioninfo.PDGitHash).Set(float64(time.Now().Unix()))

bs.ServerInfoGauge.WithLabelValues(versioninfo.PDReleaseVersion, versioninfo.PDGitHash).Set(float64(time.Now().Unix()))
bs.ServerMaxProcsGauge.Set(float64(runtime.GOMAXPROCS(0)))
s.serviceID = &discovery.ServiceRegistryEntry{ServiceAddr: s.cfg.AdvertiseListenAddr}
uniqueName := s.cfg.GetAdvertiseListenAddr()
uniqueID := memberutil.GenerateUniqueID(uniqueName)
Expand Down
9 changes: 0 additions & 9 deletions pkg/mcs/tso/server/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,6 @@ var (
Help: "Record critical metadata.",
}, []string{"type"})

serverInfo = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: "server",
Name: "info",
Help: "Indicate the tso server info, and the value is the start timestamp (s).",
}, []string{"version", "hash"})

tsoHandleDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: namespace,
Expand All @@ -56,6 +48,5 @@ var (
func init() {
prometheus.MustRegister(timeJumpBackCounter)
prometheus.MustRegister(metaDataGauge)
prometheus.MustRegister(serverInfo)
prometheus.MustRegister(tsoHandleDuration)
}
4 changes: 3 additions & 1 deletion pkg/mcs/tso/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"net/http"
"os"
"os/signal"
"runtime"
"strconv"
"sync"
"sync/atomic"
Expand Down Expand Up @@ -359,7 +360,8 @@ func (s *Server) startServer() (err error) {
metaDataGauge.WithLabelValues(fmt.Sprintf("cluster%d", s.clusterID)).Set(0)
// The independent TSO service still reuses PD version info since PD and TSO are just
// different service modes provided by the same pd-server binary
serverInfo.WithLabelValues(versioninfo.PDReleaseVersion, versioninfo.PDGitHash).Set(float64(time.Now().Unix()))
bs.ServerInfoGauge.WithLabelValues(versioninfo.PDReleaseVersion, versioninfo.PDGitHash).Set(float64(time.Now().Unix()))
bs.ServerMaxProcsGauge.Set(float64(runtime.GOMAXPROCS(0)))

// Initialize the TSO service.
s.serverLoopCtx, s.serverLoopCancel = context.WithCancel(s.Context())
Expand Down
17 changes: 0 additions & 17 deletions server/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,14 +136,6 @@ var (
Buckets: prometheus.ExponentialBuckets(0.0001, 2, 29), // 0.1ms ~ 7hours
}, []string{"address", "store"})

serverInfo = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "pd",
Subsystem: "server",
Name: "info",
Help: "Indicate the pd server info, and the value is the start timestamp (s).",
}, []string{"version", "hash"})

serviceAuditHistogram = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: "pd",
Expand All @@ -152,13 +144,6 @@ var (
Help: "PD server service handling audit",
Buckets: prometheus.DefBuckets,
}, []string{"service", "method", "caller_id", "ip"})
serverMaxProcs = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: "pd",
Subsystem: "service",
Name: "maxprocs",
Help: "The value of GOMAXPROCS.",
})

forwardFailCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Expand All @@ -181,11 +166,9 @@ func init() {
prometheus.MustRegister(tsoHandleDuration)
prometheus.MustRegister(regionHeartbeatHandleDuration)
prometheus.MustRegister(storeHeartbeatHandleDuration)
prometheus.MustRegister(serverInfo)
prometheus.MustRegister(bucketReportCounter)
prometheus.MustRegister(bucketReportLatency)
prometheus.MustRegister(serviceAuditHistogram)
prometheus.MustRegister(bucketReportInterval)
prometheus.MustRegister(serverMaxProcs)
prometheus.MustRegister(forwardFailCounter)
}
5 changes: 3 additions & 2 deletions server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ import (
"github.com/pingcap/log"
"github.com/pingcap/sysutil"
"github.com/tikv/pd/pkg/audit"
bs "github.com/tikv/pd/pkg/basicserver"
"github.com/tikv/pd/pkg/core"
"github.com/tikv/pd/pkg/encryption"
"github.com/tikv/pd/pkg/errs"
Expand Down Expand Up @@ -428,7 +429,7 @@ func (s *Server) startServer(ctx context.Context) error {
log.Info("init cluster id", zap.Uint64("cluster-id", s.clusterID))
// It may lose accuracy if use float64 to store uint64. So we store the cluster id in label.
metadataGauge.WithLabelValues(fmt.Sprintf("cluster%d", s.clusterID)).Set(0)
serverInfo.WithLabelValues(versioninfo.PDReleaseVersion, versioninfo.PDGitHash).Set(float64(time.Now().Unix()))
bs.ServerInfoGauge.WithLabelValues(versioninfo.PDReleaseVersion, versioninfo.PDGitHash).Set(float64(time.Now().Unix()))

s.rootPath = endpoint.PDRootPath(s.clusterID)
s.member.InitMemberInfo(s.cfg.AdvertiseClientUrls, s.cfg.AdvertisePeerUrls, s.Name(), s.rootPath)
Expand Down Expand Up @@ -504,7 +505,7 @@ func (s *Server) startServer(ctx context.Context) error {

// Server has started.
atomic.StoreInt64(&s.isRunning, 1)
serverMaxProcs.Set(float64(runtime.GOMAXPROCS(0)))
bs.ServerMaxProcsGauge.Set(float64(runtime.GOMAXPROCS(0)))
return nil
}

Expand Down
2 changes: 1 addition & 1 deletion tests/integrations/mcs/resourcemanager/server_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ func TestResourceManagerServer(t *testing.T) {
re.Equal(http.StatusOK, resp.StatusCode)
respBytes, err := io.ReadAll(resp.Body)
re.NoError(err)
re.Contains(string(respBytes), "resource_manager_server_info")
re.Contains(string(respBytes), "pd_server_info")
}

// Test status handler
Expand Down
2 changes: 1 addition & 1 deletion tests/integrations/mcs/scheduling/api_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -566,7 +566,7 @@ func (suite *apiTestSuite) checkMetrics(cluster *tests.TestCluster) {
re.Equal(http.StatusOK, resp.StatusCode)
respBytes, err := io.ReadAll(resp.Body)
re.NoError(err)
re.Contains(string(respBytes), "scheduling_server_info")
re.Contains(string(respBytes), "pd_server_info")
}

func (suite *apiTestSuite) TestStatus() {
Expand Down
2 changes: 1 addition & 1 deletion tests/integrations/mcs/tso/api_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ func (suite *tsoAPITestSuite) TestMetrics() {
re.Equal(http.StatusOK, resp.StatusCode)
respBytes, err := io.ReadAll(resp.Body)
re.NoError(err)
re.Contains(string(respBytes), "tso_server_info")
re.Contains(string(respBytes), "pd_server_info")
}

func (suite *tsoAPITestSuite) TestStatus() {
Expand Down
Loading