From 3962799980cd98fb0a323e8ea7d47ef39464e09b Mon Sep 17 00:00:00 2001 From: wood chen Date: Tue, 3 Dec 2024 17:54:45 +0800 Subject: [PATCH] feat(config): update alert configurations and error rate thresholds - Raised the ErrorRate threshold from 0.5 to 0.8, so error-rate alerts fire only at higher error rates (less sensitive alerting). - Added an AlertInterval setting to config for customizable notification intervals (default 24h). - Relaxed the default latency thresholds for small, medium, and large files (3s to 5s, 8s to 10s, 30s to 50s). - The metrics handler now tolerates a missing avg_latency value, and the monitor suppresses repeat notifications of the same alert level within AlertInterval. --- data/config.json | 3 ++- internal/config/types.go | 1 + internal/constants/constants.go | 12 ++++++++---- internal/handler/metrics.go | 7 ++++++- internal/monitor/monitor.go | 14 ++++++++++++-- 5 files changed, 29 insertions(+), 8 deletions(-) diff --git a/data/config.json b/data/config.json index b10e9d8..7ca302e 100644 --- a/data/config.json +++ b/data/config.json @@ -45,7 +45,8 @@ "WindowInterval": "5m", "DedupeWindow": "15m", "MinRequests": 10, - "ErrorRate": 0.5 + "ErrorRate": 0.8, + "AlertInterval": "24h" }, "Latency": { "SmallFileSize": 1048576, diff --git a/internal/config/types.go b/internal/config/types.go index d60073d..579c491 100644 --- a/internal/config/types.go +++ b/internal/config/types.go @@ -47,6 +47,7 @@ type MetricsConfig struct { DedupeWindow time.Duration `json:"DedupeWindow"` // 告警去重时间窗口 MinRequests int64 `json:"MinRequests"` // 触发告警的最小请求数 ErrorRate float64 `json:"ErrorRate"` // 错误率告警阈值 + AlertInterval time.Duration `json:"AlertInterval"` // 告警间隔时间 } `json:"Alert"` // 延迟告警配置 Latency struct { diff --git a/internal/constants/constants.go b/internal/constants/constants.go index e5607ea..2814e9b 100644 --- a/internal/constants/constants.go +++ b/internal/constants/constants.go @@ -24,17 +24,18 @@ var ( AlertWindowSize = 12 // 监控窗口数量 AlertWindowInterval = 5 * time.Minute // 每个窗口时间长度 AlertDedupeWindow = 15 * time.Minute // 告警去重时间窗口 + AlertNotifyInterval = 24 * time.Hour // 告警通知间隔 MinRequestsForAlert int64 = 10 
// 触发告警的最小请求数 - ErrorRateThreshold = 0.5 // 错误率告警阈值 (50%) + ErrorRateThreshold = 0.8 // 错误率告警阈值 // 延迟告警阈值 SmallFileSize int64 = 1 * MB // 小文件阈值 MediumFileSize int64 = 10 * MB // 中等文件阈值 LargeFileSize int64 = 100 * MB // 大文件阈值 - SmallFileLatency = 3 * time.Second // 小文件最大延迟 - MediumFileLatency = 8 * time.Second // 中等文件最大延迟 - LargeFileLatency = 30 * time.Second // 大文件最大延迟 + SmallFileLatency = 5 * time.Second // 小文件最大延迟 + MediumFileLatency = 10 * time.Second // 中等文件最大延迟 + LargeFileLatency = 50 * time.Second // 大文件最大延迟 HugeFileLatency = 300 * time.Second // 超大文件最大延迟 (5分钟) // 单位常量 @@ -60,6 +61,9 @@ func UpdateFromConfig(cfg *config.Config) { if cfg.Metrics.Alert.ErrorRate > 0 { ErrorRateThreshold = cfg.Metrics.Alert.ErrorRate } + if cfg.Metrics.Alert.AlertInterval > 0 { + AlertNotifyInterval = cfg.Metrics.Alert.AlertInterval + } // 延迟告警配置 if cfg.Metrics.Latency.SmallFileSize > 0 { diff --git a/internal/handler/metrics.go b/internal/handler/metrics.go index 243e333..74c4844 100644 --- a/internal/handler/metrics.go +++ b/internal/handler/metrics.go @@ -47,6 +47,11 @@ func (h *ProxyHandler) MetricsHandler(w http.ResponseWriter, r *http.Request) { return } + var avgLatency int64 + if latency, ok := stats["avg_latency"]; ok && latency != nil { + avgLatency = latency.(int64) + } + metrics := Metrics{ Uptime: uptime.String(), ActiveRequests: stats["active_requests"].(int64), @@ -55,7 +60,7 @@ func (h *ProxyHandler) MetricsHandler(w http.ResponseWriter, r *http.Request) { ErrorRate: float64(stats["total_errors"].(int64)) / float64(stats["total_requests"].(int64)), NumGoroutine: stats["num_goroutine"].(int), MemoryUsage: stats["memory_usage"].(string), - AverageResponseTime: metrics.FormatDuration(time.Duration(stats["avg_latency"].(int64))), + AverageResponseTime: metrics.FormatDuration(time.Duration(avgLatency)), TotalBytes: stats["total_bytes"].(int64), BytesPerSecond: float64(stats["total_bytes"].(int64)) / metrics.Max(uptime.Seconds(), 1), RequestsPerSecond: 
float64(stats["total_requests"].(int64)) / metrics.Max(uptime.Seconds(), 1), diff --git a/internal/monitor/monitor.go b/internal/monitor/monitor.go index b207606..c33a97e 100644 --- a/internal/monitor/monitor.go +++ b/internal/monitor/monitor.go @@ -48,9 +48,10 @@ type Monitor struct { alerts chan Alert handlers []AlertHandler dedup sync.Map - errorWindow [12]ErrorStats // 5分钟一个窗口,保存最近1小时 + lastNotify sync.Map + errorWindow [12]ErrorStats currentWindow atomic.Int32 - transferWindow [12]TransferStats // 5分钟一个窗口,保存最近1小时 + transferWindow [12]TransferStats currentTWindow atomic.Int32 } @@ -90,6 +91,15 @@ func (m *Monitor) processAlerts() { continue } + // 检查是否在通知间隔内 + notifyKey := fmt.Sprintf("notify:%s", alert.Level) + if lastTime, ok := m.lastNotify.Load(notifyKey); ok { + if time.Since(lastTime.(time.Time)) < constants.AlertNotifyInterval { + continue + } + } + m.lastNotify.Store(notifyKey, time.Now()) + for _, handler := range m.handlers { handler.HandleAlert(alert) }