From 67ce1efb277416377796b936fd6bf834c070f2d6 Mon Sep 17 00:00:00 2001
From: Bala FA
Date: Tue, 15 Oct 2024 11:54:28 +0530
Subject: [PATCH] Add drive I/O metrics for Prometheus (#955)

Below metrics are exported:
* directpv_stats_drive_ready
* directpv_stats_drive_total_read_bytes
* directpv_stats_drive_total_write_bytes
* directpv_stats_drive_read_latency_seconds
* directpv_stats_drive_write_latency_seconds
* directpv_stats_drive_wait_time_seconds

Fixes https://github.com/minio/directpv/issues/839

Signed-off-by: Bala.FA
---
 docs/monitoring.md        |   8 ++-
 pkg/device/sysfs_linux.go |  18 +++++
 pkg/metrics/collector.go  | 147 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 169 insertions(+), 4 deletions(-)

diff --git a/docs/monitoring.md b/docs/monitoring.md
index 113927b9..c7fef856 100644
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -3,7 +3,13 @@ DirectPV nodes export Prometheus compatible metrics data via port `10443`. The
 metrics data includes
 * directpv_stats_bytes_used
 * directpv_stats_bytes_total
-and categorized by labels `tenant`, `volumeID` and `node`.
+* directpv_stats_drive_ready
+* directpv_stats_drive_total_read_bytes
+* directpv_stats_drive_total_write_bytes
+* directpv_stats_drive_read_latency_seconds
+* directpv_stats_drive_write_latency_seconds
+* directpv_stats_drive_wait_time_seconds
+and categorized by labels `drive`, `tenant`, `volumeID` and `node`.
 
 To scrape data in Prometheus, each node must be accessible by port `10443`. A
 simple example is below
diff --git a/pkg/device/sysfs_linux.go b/pkg/device/sysfs_linux.go
index 1c3f90e1..b122d58f 100644
--- a/pkg/device/sysfs_linux.go
+++ b/pkg/device/sysfs_linux.go
@@ -106,3 +106,21 @@ func getHolders(name string) ([]string, error) {
 func getDMName(name string) (string, error) {
 	return readFirstLine("/sys/class/block/" + name + "/dm/name")
 }
+
+// GetStat returns statistics for a given device name.
+func GetStat(name string) (stats []uint64, err error) {
+	line, err := readFirstLine("/sys/class/block/" + name + "/stat")
+	if err != nil {
+		return nil, err
+	}
+
+	for _, token := range strings.Fields(line) {
+		ui64, err := strconv.ParseUint(token, 10, 64)
+		if err != nil {
+			return nil, err
+		}
+		stats = append(stats, ui64)
+	}
+
+	return stats, nil
+}
diff --git a/pkg/metrics/collector.go b/pkg/metrics/collector.go
index 2567eabd..aa040b22 100644
--- a/pkg/metrics/collector.go
+++ b/pkg/metrics/collector.go
@@ -18,17 +18,54 @@ package metrics
 
 import (
 	"context"
+	"fmt"
 
 	directpvtypes "github.com/minio/directpv/pkg/apis/directpv.min.io/types"
 	"github.com/minio/directpv/pkg/client"
 	"github.com/minio/directpv/pkg/consts"
+	"github.com/minio/directpv/pkg/device"
 	"github.com/minio/directpv/pkg/sys"
 	"github.com/minio/directpv/pkg/types"
+	"github.com/minio/directpv/pkg/utils"
 	"github.com/minio/directpv/pkg/xfs"
 	"github.com/prometheus/client_golang/prometheus"
 	"k8s.io/klog/v2"
 )
 
+// defaultSectorSize is the number of bytes per sector used to convert sector counts to bytes.
+const defaultSectorSize = 512
+
+// driveStats holds the I/O counters of a drive as float64 values ready for metric publishing.
+type driveStats struct {
+	readBytes   float64
+	readTicks   float64
+	writeBytes  float64
+	writeTicks  float64
+	timeInQueue float64
+}
+
+// getDriveStats reads the block statistics of driveName and extracts read/write bytes, read/write ticks and time in queue.
+func getDriveStats(driveName string) (*driveStats, error) {
+	stat, err := device.GetStat(driveName)
+	switch {
+	case err != nil:
+		return nil, err
+	case len(stat) == 0:
+		return nil, fmt.Errorf("unable to read stat from drive %v", driveName)
+	case len(stat) < 11: // stat[10] is read below, so at least 11 fields are required
+		return nil, fmt.Errorf("invalid stat format from drive %v", driveName)
+	}
+
+	// Refer https://www.kernel.org/doc/Documentation/block/stat.txt for meaning of each field.
+	return &driveStats{
+		readBytes:   float64(stat[2] * defaultSectorSize),
+		readTicks:   float64(stat[3]),
+		writeBytes:  float64(stat[6] * defaultSectorSize),
+		writeTicks:  float64(stat[7]),
+		timeInQueue: float64(stat[10]),
+	}, nil
+}
+
 type metricsCollector struct {
 	nodeID directpvtypes.NodeID
 	desc   *prometheus.Desc
@@ -95,21 +132,125 @@ func (c *metricsCollector) publishVolumeStats(ctx context.Context, volume *types
 	)
 }
 
+// publishDriveStats sends the readiness and, when statistics are readable, the I/O metrics of drive to ch.
+func (c *metricsCollector) publishDriveStats(drive *types.Drive, ch chan<- prometheus.Metric) {
+	deviceID, err := c.getDeviceByFSUUID(drive.Status.FSUUID)
+	if err != nil {
+		klog.ErrorS(
+			err,
+			"unable to find device by FSUUID; "+
+				"either device is removed or run command "+
+				"`sudo udevadm control --reload-rules && sudo udevadm trigger`"+
+				" on the host to reload",
+			"FSUUID", drive.Status.FSUUID)
+		client.Eventf(
+			drive, client.EventTypeWarning, client.EventReasonMetrics,
+			"unable to find device by FSUUID %v; "+
+				"either device is removed or run command "+
+				"`sudo udevadm control --reload-rules && sudo udevadm trigger`"+
+				" on the host to reload", drive.Status.FSUUID)
+
+		return
+	}
+	deviceName := utils.TrimDevPrefix(deviceID)
+
+	status := float64(1) // Online
+	driveStat, err := getDriveStats(deviceName)
+	if err != nil {
+		klog.ErrorS(err, "unable to read drive statistics")
+		status = float64(0) // Offline
+	}
+
+	// Metrics
+	ch <- prometheus.MustNewConstMetric(
+		prometheus.NewDesc(
+			prometheus.BuildFQName(consts.AppName, "stats", "drive_ready"),
+			"Drive Online/Offline Status",
+			[]string{"drive"}, nil),
+		prometheus.GaugeValue,
+		status, drive.Name,
+	)
+
+	if driveStat == nil {
+		return
+	}
+
+	ch <- prometheus.MustNewConstMetric(
+		prometheus.NewDesc(
+			prometheus.BuildFQName(consts.AppName, "stats", "drive_total_read_bytes"),
+			"Total number of bytes read from the drive",
+			[]string{"drive"}, nil),
+		prometheus.CounterValue,
+		driveStat.readBytes, drive.Name,
+	)
+
+	ch <- prometheus.MustNewConstMetric(
+		prometheus.NewDesc(
+			prometheus.BuildFQName(consts.AppName, "stats", "drive_total_write_bytes"),
+			"Total number of bytes written to the drive",
+			[]string{"drive"}, nil),
+		prometheus.CounterValue,
+		driveStat.writeBytes, drive.Name,
+	)
+
+	// Drive Read/Write Latency
+	ch <- prometheus.MustNewConstMetric(
+		prometheus.NewDesc(
+			prometheus.BuildFQName(consts.AppName, "stats", "drive_read_latency_seconds"),
+			"Drive Read Latency",
+			[]string{"drive"}, nil),
+		prometheus.GaugeValue,
+		driveStat.readTicks/1000, drive.Name,
+	)
+
+	ch <- prometheus.MustNewConstMetric(
+		prometheus.NewDesc(
+			prometheus.BuildFQName(consts.AppName, "stats", "drive_write_latency_seconds"),
+			"Drive Write Latency",
+			[]string{"drive"}, nil),
+		prometheus.GaugeValue,
+		driveStat.writeTicks/1000, drive.Name,
+	)
+
+	// Wait Time
+	ch <- prometheus.MustNewConstMetric(
+		prometheus.NewDesc(
+			prometheus.BuildFQName(consts.AppName, "stats", "drive_wait_time_seconds"),
+			"Drive Wait Time",
+			[]string{"drive"}, nil),
+		prometheus.GaugeValue,
+		driveStat.timeInQueue/1000, drive.Name,
+	)
+}
+
 // Collect is called by Prometheus registry when collecting metrics.
 func (c *metricsCollector) Collect(ch chan<- prometheus.Metric) {
 	ctx, cancelFunc := context.WithCancel(context.Background())
 	defer cancelFunc()
 
-	resultCh := client.NewVolumeLister().
+	// Collecting volume statistics
+	volumeResultCh := client.NewVolumeLister().
 		NodeSelector([]directpvtypes.LabelValue{directpvtypes.ToLabelValue(string(c.nodeID))}).
 		List(ctx)
-	for result := range resultCh {
+	for result := range volumeResultCh {
 		if result.Err != nil {
-			return
+			break
 		}
 
 		if result.Volume.Status.TargetPath != "" {
 			c.publishVolumeStats(ctx, &result.Volume, ch)
 		}
 	}
+
+	// Collecting drive statistics
+	driveResultCh := client.NewDriveLister().
+		NodeSelector([]directpvtypes.LabelValue{directpvtypes.ToLabelValue(string(c.nodeID))}).
+		List(ctx)
+	for result := range driveResultCh {
+		if result.Err != nil {
+			break
+		}
+
+		c.publishDriveStats(&result.Drive, ch)
+	}
 }