Skip to content

Commit

Permalink
feat(config): update alert configurations and error rate thresholds
Browse files Browse the repository at this point in the history
- Raised the ErrorRate alert threshold from 0.5 to 0.8, so error-rate alerts fire only at higher error rates.
- Added AlertInterval setting to config for customizable notification intervals.
- Raised the default latency thresholds for small, medium, and large files (3s → 5s, 8s → 10s, 30s → 50s) to improve performance monitoring.
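- Enhanced metrics handling: the metrics handler now tolerates a missing or nil avg_latency value, and the monitor suppresses repeat notifications of the same alert level within the configured alert interval.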
  • Loading branch information
woodchen-ink committed Dec 3, 2024
1 parent 68c27b5 commit 3962799
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 8 deletions.
3 changes: 2 additions & 1 deletion data/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@
"WindowInterval": "5m",
"DedupeWindow": "15m",
"MinRequests": 10,
"ErrorRate": 0.5
"ErrorRate": 0.8,
"AlertInterval": "24h"
},
"Latency": {
"SmallFileSize": 1048576,
Expand Down
1 change: 1 addition & 0 deletions internal/config/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ type MetricsConfig struct {
DedupeWindow time.Duration `json:"DedupeWindow"` // 告警去重时间窗口
MinRequests int64 `json:"MinRequests"` // 触发告警的最小请求数
ErrorRate float64 `json:"ErrorRate"` // 错误率告警阈值
AlertInterval time.Duration `json:"AlertInterval"` // 告警间隔时间
} `json:"Alert"`
// 延迟告警配置
Latency struct {
Expand Down
12 changes: 8 additions & 4 deletions internal/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,18 @@ var (
AlertWindowSize = 12 // 监控窗口数量
AlertWindowInterval = 5 * time.Minute // 每个窗口时间长度
AlertDedupeWindow = 15 * time.Minute // 告警去重时间窗口
AlertNotifyInterval = 24 * time.Hour // 告警通知间隔
MinRequestsForAlert int64 = 10 // 触发告警的最小请求数
ErrorRateThreshold = 0.5 // 错误率告警阈值 (50%)
ErrorRateThreshold = 0.8 // 错误率告警阈值

// 延迟告警阈值
SmallFileSize int64 = 1 * MB // 小文件阈值
MediumFileSize int64 = 10 * MB // 中等文件阈值
LargeFileSize int64 = 100 * MB // 大文件阈值

SmallFileLatency = 3 * time.Second // 小文件最大延迟
MediumFileLatency = 8 * time.Second // 中等文件最大延迟
LargeFileLatency = 30 * time.Second // 大文件最大延迟
SmallFileLatency = 5 * time.Second // 小文件最大延迟
MediumFileLatency = 10 * time.Second // 中等文件最大延迟
LargeFileLatency = 50 * time.Second // 大文件最大延迟
HugeFileLatency = 300 * time.Second // 超大文件最大延迟 (5分钟)

// 单位常量
Expand All @@ -60,6 +61,9 @@ func UpdateFromConfig(cfg *config.Config) {
if cfg.Metrics.Alert.ErrorRate > 0 {
ErrorRateThreshold = cfg.Metrics.Alert.ErrorRate
}
if cfg.Metrics.Alert.AlertInterval > 0 {
AlertNotifyInterval = cfg.Metrics.Alert.AlertInterval
}

// 延迟告警配置
if cfg.Metrics.Latency.SmallFileSize > 0 {
Expand Down
7 changes: 6 additions & 1 deletion internal/handler/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ func (h *ProxyHandler) MetricsHandler(w http.ResponseWriter, r *http.Request) {
return
}

var avgLatency int64
if latency, ok := stats["avg_latency"]; ok && latency != nil {
avgLatency = latency.(int64)
}

metrics := Metrics{
Uptime: uptime.String(),
ActiveRequests: stats["active_requests"].(int64),
Expand All @@ -55,7 +60,7 @@ func (h *ProxyHandler) MetricsHandler(w http.ResponseWriter, r *http.Request) {
ErrorRate: float64(stats["total_errors"].(int64)) / float64(stats["total_requests"].(int64)),
NumGoroutine: stats["num_goroutine"].(int),
MemoryUsage: stats["memory_usage"].(string),
AverageResponseTime: metrics.FormatDuration(time.Duration(stats["avg_latency"].(int64))),
AverageResponseTime: metrics.FormatDuration(time.Duration(avgLatency)),
TotalBytes: stats["total_bytes"].(int64),
BytesPerSecond: float64(stats["total_bytes"].(int64)) / metrics.Max(uptime.Seconds(), 1),
RequestsPerSecond: float64(stats["total_requests"].(int64)) / metrics.Max(uptime.Seconds(), 1),
Expand Down
14 changes: 12 additions & 2 deletions internal/monitor/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,10 @@ type Monitor struct {
alerts chan Alert
handlers []AlertHandler
dedup sync.Map
errorWindow [12]ErrorStats // 5分钟一个窗口,保存最近1小时
lastNotify sync.Map
errorWindow [12]ErrorStats
currentWindow atomic.Int32
transferWindow [12]TransferStats // 5分钟一个窗口,保存最近1小时
transferWindow [12]TransferStats
currentTWindow atomic.Int32
}

Expand Down Expand Up @@ -90,6 +91,15 @@ func (m *Monitor) processAlerts() {
continue
}

// 检查是否在通知间隔内
notifyKey := fmt.Sprintf("notify:%s", alert.Level)
if lastTime, ok := m.lastNotify.Load(notifyKey); ok {
if time.Since(lastTime.(time.Time)) < constants.AlertNotifyInterval {
continue
}
}
m.lastNotify.Store(notifyKey, time.Now())

for _, handler := range m.handlers {
handler.HandleAlert(alert)
}
Expand Down

0 comments on commit 3962799

Please sign in to comment.