Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add drive I/O metrics for Prometheus #955

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion docs/monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,13 @@
DirectPV nodes export Prometheus-compatible metrics data via port `10443`. The metrics data includes:
* directpv_stats_bytes_used
* directpv_stats_bytes_total
and categorized by labels `tenant`, `volumeID` and `node`.
* directpv_stats_drive_ready
* directpv_stats_drive_total_read_bytes
* directpv_stats_drive_total_write_bytes
* directpv_stats_drive_read_latency_seconds
* directpv_stats_drive_write_latency_seconds
* directpv_stats_drive_wait_time_seconds
and is categorized by the labels `drive`, `tenant`, `volumeID` and `node`.

To scrape data in Prometheus, each node must be accessible by port `10443`. A simple example is shown below.

Expand Down
18 changes: 18 additions & 0 deletions pkg/device/sysfs_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,3 +106,21 @@ func getHolders(name string) ([]string, error) {
// getDMName returns whatever readFirstLine yields for the sysfs attribute
// /sys/class/block/<name>/dm/name of the block device called name, together
// with any error readFirstLine reports.
func getDMName(name string) (string, error) {
	dmNamePath := "/sys/class/block/" + name + "/dm/name"
	return readFirstLine(dmNamePath)
}

// GetStat returns statistics for a given device name.
func GetStat(name string) (stats []uint64, err error) {
Praveenrajmani marked this conversation as resolved.
Show resolved Hide resolved
line, err := readFirstLine("/sys/class/block/" + name + "/stat")
if err != nil {
return nil, err
}

for _, token := range strings.Fields(line) {
ui64, err := strconv.ParseUint(token, 10, 64)
if err != nil {
return nil, err
}
stats = append(stats, ui64)
}

return stats, nil
}
143 changes: 140 additions & 3 deletions pkg/metrics/collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,51 @@ package metrics

import (
"context"
"fmt"

directpvtypes "github.com/minio/directpv/pkg/apis/directpv.min.io/types"
"github.com/minio/directpv/pkg/client"
"github.com/minio/directpv/pkg/consts"
"github.com/minio/directpv/pkg/device"
"github.com/minio/directpv/pkg/sys"
"github.com/minio/directpv/pkg/types"
"github.com/minio/directpv/pkg/utils"
"github.com/minio/directpv/pkg/xfs"
"github.com/prometheus/client_golang/prometheus"
"k8s.io/klog/v2"
)

const defaultSectorSize = 512

type driveStats struct {
readBytes float64
readTicks float64
writeBytes float64
writeTicks float64
timeInQueue float64
}

func getDriveStats(driveName string) (*driveStats, error) {
Praveenrajmani marked this conversation as resolved.
Show resolved Hide resolved
stat, err := device.GetStat(driveName)
switch {
case err != nil:
return nil, err
case len(stat) == 0:
return nil, fmt.Errorf("unable to read stat from drive %v", driveName)
case len(stat) < 10:
return nil, fmt.Errorf("invalid stat format from drive %v", driveName)
}

// Refer https://www.kernel.org/doc/Documentation/block/stat.txt for meaning of each field.
return &driveStats{
readBytes: float64(stat[2] * defaultSectorSize),
readTicks: float64(stat[3]),
writeBytes: float64(stat[6] * defaultSectorSize),
writeTicks: float64(stat[7]),
timeInQueue: float64(stat[10]),
}, nil
}

type metricsCollector struct {
nodeID directpvtypes.NodeID
desc *prometheus.Desc
Expand Down Expand Up @@ -95,21 +129,124 @@ func (c *metricsCollector) publishVolumeStats(ctx context.Context, volume *types
)
}

func (c *metricsCollector) publishDriveStats(drive *types.Drive, ch chan<- prometheus.Metric) {
deviceID, err := c.getDeviceByFSUUID(drive.Status.FSUUID)
if err != nil {
klog.ErrorS(
err,
"unable to find device by FSUUID; "+
"either device is removed or run command "+
"`sudo udevadm control --reload-rules && sudo udevadm trigger`"+
" on the host to reload",
"FSUUID", drive.Status.FSUUID)
client.Eventf(
drive, client.EventTypeWarning, client.EventReasonMetrics,
"unable to find device by FSUUID %v; "+
"either device is removed or run command "+
"`sudo udevadm control --reload-rules && sudo udevadm trigger`"+
" on the host to reload", drive.Status.FSUUID)

return
}
deviceName := utils.TrimDevPrefix(deviceID)

status := float64(1) // Online
driveStat, err := getDriveStats(deviceName)
if err != nil {
klog.ErrorS(err, "unable to read drive statistics")
status = float64(0) // Offline
}

// Metrics
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName(consts.AppName, "stats", "drive_ready"),
"Drive Online/Offline Status",
[]string{"drive"}, nil),
prometheus.GaugeValue,
status, drive.Name,
)

if driveStat == nil {
return
}

ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName(consts.AppName, "stats", "drive_total_read_bytes"),
"Total number of bytes read from the drive",
[]string{"drive"}, nil),
prometheus.CounterValue,
driveStat.readBytes, drive.Name,
)

ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName(consts.AppName, "stats", "drive_total_write_bytes"),
"Total number of bytes written to the drive",
[]string{"drive"}, nil),
prometheus.CounterValue,
driveStat.writeBytes, drive.Name,
)

// Drive Read/Write Latency
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName(consts.AppName, "stats", "drive_read_latency_seconds"),
"Drive Read Latency",
Praveenrajmani marked this conversation as resolved.
Show resolved Hide resolved
[]string{"drive"}, nil),
prometheus.GaugeValue,
Praveenrajmani marked this conversation as resolved.
Show resolved Hide resolved
driveStat.readTicks/1000, drive.Name,
)

ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName(consts.AppName, "stats", "drive_write_latency_seconds"),
"Drive Write Latency",
[]string{"drive"}, nil),
prometheus.GaugeValue,
driveStat.writeTicks/1000, drive.Name,
)

// Wait Time
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName(consts.AppName, "stats", "drive_wait_time_seconds"),
"Drive Wait Time",
[]string{"drive"}, nil),
prometheus.GaugeValue,
driveStat.timeInQueue/1000, drive.Name,
)
}

// Collect is called by Prometheus registry when collecting metrics.
func (c *metricsCollector) Collect(ch chan<- prometheus.Metric) {
ctx, cancelFunc := context.WithCancel(context.Background())
defer cancelFunc()

resultCh := client.NewVolumeLister().
// Collecting volume statistics
volumeResultCh := client.NewVolumeLister().
NodeSelector([]directpvtypes.LabelValue{directpvtypes.ToLabelValue(string(c.nodeID))}).
List(ctx)
for result := range resultCh {
for result := range volumeResultCh {
if result.Err != nil {
return
break
}

if result.Volume.Status.TargetPath != "" {
c.publishVolumeStats(ctx, &result.Volume, ch)
}
}

// Collecting drive statistics
driveResultCh := client.NewDriveLister().
NodeSelector([]directpvtypes.LabelValue{directpvtypes.ToLabelValue(string(c.nodeID))}).
List(ctx)
for result := range driveResultCh {
if result.Err != nil {
break
}

c.publishDriveStats(&result.Drive, ch)
}
}