diff --git a/webapp/backend/pkg/database/interface.go b/webapp/backend/pkg/database/interface.go index f5dae30e..94a93d38 100644 --- a/webapp/backend/pkg/database/interface.go +++ b/webapp/backend/pkg/database/interface.go @@ -2,6 +2,7 @@ package database import ( "context" + "github.com/analogj/scrutiny/webapp/backend/pkg" "github.com/analogj/scrutiny/webapp/backend/pkg/models" "github.com/analogj/scrutiny/webapp/backend/pkg/models/collector" @@ -21,6 +22,7 @@ type DeviceRepo interface { SaveSmartAttributes(ctx context.Context, wwn string, collectorSmartData collector.SmartInfo) (measurements.Smart, error) GetSmartAttributeHistory(ctx context.Context, wwn string, durationKey string, attributes []string) ([]measurements.Smart, error) + GetSmartAttributeHistoryTail(ctx context.Context, wwn string, n int, offset int, attributes []string) ([]measurements.Smart, error) SaveSmartTemperature(ctx context.Context, wwn string, deviceProtocol string, collectorSmartData collector.SmartInfo) error diff --git a/webapp/backend/pkg/database/scrutiny_repository_device_smart_attributes.go b/webapp/backend/pkg/database/scrutiny_repository_device_smart_attributes.go index 96015bb6..8722163e 100644 --- a/webapp/backend/pkg/database/scrutiny_repository_device_smart_attributes.go +++ b/webapp/backend/pkg/database/scrutiny_repository_device_smart_attributes.go @@ -3,13 +3,14 @@ package database import ( "context" "fmt" + "strings" + "time" + "github.com/analogj/scrutiny/webapp/backend/pkg/models/collector" "github.com/analogj/scrutiny/webapp/backend/pkg/models/measurements" influxdb2 "github.com/influxdata/influxdb-client-go/v2" "github.com/influxdata/influxdb-client-go/v2/api" log "github.com/sirupsen/logrus" - "strings" - "time" ) //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -84,6 +85,73 @@ func (sr *scrutinyRepository) GetSmartAttributeHistory(ctx context.Context, wwn } +// GetSmartAttributeHistory MUST return in sorted order, where newest entries are at the beginning of the list, and oldest are at the end. +func (sr *scrutinyRepository) GetSmartAttributeHistoryTail(ctx context.Context, wwn string, n int, offset int, attributes []string) ([]measurements.Smart, error) { + partialQueryStr := []string{ + `import "influxdata/influxdb/schema"`, + } + + nestedDurationKeys := sr.lookupNestedDurationKeys(DURATION_KEY_FOREVER) + subQueryNames := []string{} + for _, nestedDurationKey := range nestedDurationKeys { + bucketName := sr.lookupBucketName(nestedDurationKey) + durationRange := sr.lookupDuration(nestedDurationKey) + + subQueryNames = append(subQueryNames, fmt.Sprintf(`%sData`, nestedDurationKey)) + partialQueryStr = append(partialQueryStr, []string{ + fmt.Sprintf(`%sData = from(bucket: "%s")`, nestedDurationKey, bucketName), + fmt.Sprintf(`|> range(start: %s, stop: %s)`, durationRange[0], durationRange[1]), + `|> filter(fn: (r) => r["_measurement"] == "smart" )`, + fmt.Sprintf(`|> filter(fn: (r) => r["device_wwn"] == "%s" )`, wwn), + // We only need the last `offset` # of entries from each table to guarantee we can + // get the last `n` # of entries starting from `offset` of the union + fmt.Sprintf(`|> tail(n: %d)`, offset), + "|> schema.fieldsAsCols()", + }...) + } + + partialQueryStr = append(partialQueryStr, []string{ + fmt.Sprintf("union(tables: [%s])", strings.Join(subQueryNames, ", ")), + "|> group()", + `|> sort(columns: ["_time"], desc: false)`, + fmt.Sprintf(`|> tail(n: %d, offset: %d)`, n, offset), + `|> yield(name: "last")`, + }...) + + queryStr := strings.Join(partialQueryStr, "\n") + log.Infoln(queryStr) + + smartResults := []measurements.Smart{} + + result, err := sr.influxQueryApi.Query(ctx, queryStr) + if err == nil { + // Use Next() to iterate over query result lines + for result.Next() { + // Observe when there is new grouping key producing new table + if result.TableChanged() { + //fmt.Printf("table: %s\n", result.TableMetadata().String()) + } + + smartData, err := measurements.NewSmartFromInfluxDB(result.Record().Values()) + if err != nil { + return nil, err + } + smartResults = append(smartResults, *smartData) + + } + if result.Err() != nil { + fmt.Printf("Query error: %s\n", result.Err().Error()) + } + } else { + return nil, err + } + + //we have to sort the smartResults again, because the `union` command will return multiple 'tables' and only sort the records in each table. + sortSmartMeasurementsDesc(smartResults) + + return smartResults, nil +} + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Helper Methods //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/webapp/backend/pkg/notify/notify.go b/webapp/backend/pkg/notify/notify.go index 4a1862e1..d04ddd5b 100644 --- a/webapp/backend/pkg/notify/notify.go +++ b/webapp/backend/pkg/notify/notify.go @@ -15,11 +15,13 @@ import ( "github.com/analogj/go-util/utils" "github.com/analogj/scrutiny/webapp/backend/pkg" "github.com/analogj/scrutiny/webapp/backend/pkg/config" + "github.com/analogj/scrutiny/webapp/backend/pkg/database" "github.com/analogj/scrutiny/webapp/backend/pkg/models" "github.com/analogj/scrutiny/webapp/backend/pkg/models/measurements" "github.com/analogj/scrutiny/webapp/backend/pkg/thresholds" "github.com/containrrr/shoutrrr" shoutrrrTypes "github.com/containrrr/shoutrrr/pkg/types" + "github.com/gin-gonic/gin" "github.com/sirupsen/logrus" "golang.org/x/sync/errgroup" ) @@ -30,7 +32,7 @@ const NotifyFailureTypeSmartFailure = "SmartFailure" const NotifyFailureTypeScrutinyFailure = "ScrutinyFailure" // ShouldNotify check if the error Message should be filtered (level mismatch or filtered_attributes) -func ShouldNotify(device models.Device, smartAttrs measurements.Smart, statusThreshold pkg.MetricsStatusThreshold, statusFilterAttributes pkg.MetricsStatusFilterAttributes) bool { +func ShouldNotify(device models.Device, smartAttrs measurements.Smart, statusThreshold pkg.MetricsStatusThreshold, statusFilterAttributes pkg.MetricsStatusFilterAttributes, repeatNotifications bool, c *gin.Context, deviceRepo database.DeviceRepo) bool { // 1. check if the device is healthy if device.DeviceStatus == pkg.DeviceStatusPassed { return false @@ -54,52 +56,79 @@ func ShouldNotify(device models.Device, smartAttrs measurements.Smart, statusThr requiredAttrStatus = pkg.AttributeStatusFailedScrutiny } - // 2. check if the attributes that are failing should be filtered (non-critical) - // 3. for any unfiltered attribute, store the failure reason (Smart or Scrutiny) - if statusFilterAttributes == pkg.MetricsStatusFilterAttributesCritical { - hasFailingCriticalAttr := false - var statusFailingCriticalAttr pkg.AttributeStatus - - for attrId, attrData := range smartAttrs.Attributes { - //find failing attribute - if attrData.GetStatus() == pkg.AttributeStatusPassed { - continue //skip all passing attributes - } + // This is the only case where individual attributes need not be considered + if statusFilterAttributes == pkg.MetricsStatusFilterAttributesAll && repeatNotifications { + return pkg.DeviceStatusHas(device.DeviceStatus, requiredDeviceStatus) + } - // merge the status's of all critical attributes - statusFailingCriticalAttr = pkg.AttributeStatusSet(statusFailingCriticalAttr, attrData.GetStatus()) + var failingAttributes []string + for attrId, attrData := range smartAttrs.Attributes { + var status pkg.AttributeStatus = attrData.GetStatus() + if status == pkg.AttributeStatusPassed { + continue + } - //found a failing attribute, see if its critical - if device.IsScsi() && thresholds.ScsiMetadata[attrId].Critical { - hasFailingCriticalAttr = true - } else if device.IsNvme() && thresholds.NmveMetadata[attrId].Critical { - hasFailingCriticalAttr = true + if statusFilterAttributes == pkg.MetricsStatusFilterAttributesCritical { + critical := false + if device.IsScsi() { + critical = thresholds.ScsiMetadata[attrId].Critical + } else if device.IsNvme() { + critical = thresholds.NmveMetadata[attrId].Critical } else { //this is ATA attrIdInt, err := strconv.Atoi(attrId) if err != nil { continue } - if thresholds.AtaMetadata[attrIdInt].Critical { - hasFailingCriticalAttr = true - } + critical = thresholds.AtaMetadata[attrIdInt].Critical + } + if !critical { + continue } - } - if !hasFailingCriticalAttr { - //no critical attributes are failing, and notifyFilterAttributes == "critical" + failingAttributes = append(failingAttributes, attrId) + } + + if repeatNotifications { + lastPoints, err := deviceRepo.GetSmartAttributeHistoryTail(c, c.Param("wwn"), 1, 1, failingAttributes) + if err == nil && len(lastPoints) > 1 { + for _, attrId := range failingAttributes { + if old, ok := lastPoints[0].Attributes[attrId].(*measurements.SmartAtaAttribute); ok { + if current, ok := smartAttrs.Attributes[attrId].(*measurements.SmartAtaAttribute); ok { + if old.TransformedValue != current.TransformedValue { + return true + } + } + } + if old, ok := lastPoints[0].Attributes[attrId].(*measurements.SmartNvmeAttribute); ok { + if current, ok := smartAttrs.Attributes[attrId].(*measurements.SmartNvmeAttribute); ok { + if old.TransformedValue != current.TransformedValue { + return true + } + } + } + if old, ok := lastPoints[0].Attributes[attrId].(*measurements.SmartScsiAttribute); ok { + if current, ok := smartAttrs.Attributes[attrId].(*measurements.SmartScsiAttribute); ok { + if old.TransformedValue != current.TransformedValue { + return true + } + } + } + } return false - } else { - // check if any of the critical attributes have a status that we're looking for - return pkg.AttributeStatusHas(statusFailingCriticalAttr, requiredAttrStatus) } - + return true } else { - // 2. SKIP - we are processing every attribute. - // 3. check if the device failure level matches the wanted failure level. - return pkg.DeviceStatusHas(device.DeviceStatus, requiredDeviceStatus) + for _, attr := range failingAttributes { + attrStatus := smartAttrs.Attributes[attr].GetStatus() + if pkg.AttributeStatusHas(attrStatus, requiredAttrStatus) { + return true + } + } } + + return false } // TODO: include user label for device. diff --git a/webapp/backend/pkg/web/handler/upload_device_metrics.go b/webapp/backend/pkg/web/handler/upload_device_metrics.go index f58d6ed6..f83107f7 100644 --- a/webapp/backend/pkg/web/handler/upload_device_metrics.go +++ b/webapp/backend/pkg/web/handler/upload_device_metrics.go @@ -2,6 +2,8 @@ package handler import ( "fmt" + "net/http" + "github.com/analogj/scrutiny/webapp/backend/pkg" "github.com/analogj/scrutiny/webapp/backend/pkg/config" "github.com/analogj/scrutiny/webapp/backend/pkg/database" @@ -9,7 +11,6 @@ import ( "github.com/analogj/scrutiny/webapp/backend/pkg/notify" "github.com/gin-gonic/gin" "github.com/sirupsen/logrus" - "net/http" ) func UploadDeviceMetrics(c *gin.Context) { @@ -73,6 +74,9 @@ func UploadDeviceMetrics(c *gin.Context) { smartData, pkg.MetricsStatusThreshold(appConfig.GetInt(fmt.Sprintf("%s.metrics.status_threshold", config.DB_USER_SETTINGS_SUBKEY))), pkg.MetricsStatusFilterAttributes(appConfig.GetInt(fmt.Sprintf("%s.metrics.status_filter_attributes", config.DB_USER_SETTINGS_SUBKEY))), + appConfig.GetBool(fmt.Sprintf("%s.metrics.repeat_notifications", config.DB_USER_SETTINGS_SUBKEY)), + c, + deviceRepo, ) { //send notifications