executor/inspect: refactor current-load diagnosis rule to node-check (#…
sre-bot authored Jun 4, 2020
1 parent a7c0ab6 commit 4566942
Showing 3 changed files with 190 additions and 118 deletions.
200 changes: 108 additions & 92 deletions executor/inspection_result.go
@@ -82,9 +82,9 @@ type (
// version in the cluster
versionInspection struct{ inspectionName }

// currentLoadInspection is used to check the current load of memory/disk/cpu
// nodeLoadInspection is used to check the node load of memory/disk/cpu
// have reached a high-level threshold
currentLoadInspection struct{ inspectionName }
nodeLoadInspection struct{ inspectionName }

// criticalErrorInspection is used to check are there some critical errors
// occurred in the past
@@ -97,7 +97,7 @@ type (
var inspectionRules = []inspectionRule{
&configInspection{inspectionName: "config"},
&versionInspection{inspectionName: "version"},
&currentLoadInspection{inspectionName: "current-load"},
&nodeLoadInspection{inspectionName: "node-load"},
&criticalErrorInspection{inspectionName: "critical-error"},
&thresholdCheckInspection{inspectionName: "threshold-check"},
}
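
With this rename, the rule is addressed by its new name from SQL; selecting rule='current-load' no longer matches anything. A minimal usage sketch inside a testkit-based test (the helper call is illustrative, mirroring the query style of the new test further down):

// Illustrative: the diagnosis rule is now selected as "node-load".
tk.MustQuery(`select item, type, instance, value, reference
	from information_schema.inspection_result where rule='node-load'`)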
@@ -316,101 +316,114 @@ func (versionInspection) inspect(_ context.Context, sctx sessionctx.Context, fil
return results
}

func (c currentLoadInspection) inspect(_ context.Context, sctx sessionctx.Context, filter inspectionFilter) []inspectionResult {
var commonResult = func(item, expected string, row chunk.Row) inspectionResult {
return inspectionResult{
tp: row.GetString(0),
instance: row.GetString(1),
item: item,
actual: row.GetString(2),
expected: expected,
severity: "warning",
}
func (c nodeLoadInspection) inspect(ctx context.Context, sctx sessionctx.Context, filter inspectionFilter) []inspectionResult {
var rules = []ruleChecker{
inspectCPULoad{item: "load1", tbl: "node_load1"},
inspectCPULoad{item: "load5", tbl: "node_load5"},
inspectCPULoad{item: "load15", tbl: "node_load15"},
inspectVirtualMemUsage{},
inspectSwapMemoryUsed{},
inspectDiskUsage{},
}
var diskResult = func(item, expected string, row chunk.Row) inspectionResult {
return inspectionResult{
tp: row.GetString(0),
instance: row.GetString(1),
item: item,
actual: row.GetString(3),
expected: expected,
severity: "warning",
detail: fmt.Sprintf("current disk-usage is too high, execute the sql to see more detail: select * from information_schema.cluster_hardware where type='%s' and instance='%s' and device_type='disk' and device_name='%s'",
row.GetString(0), row.GetString(1), row.GetString(2)),
}
return checkRules(ctx, sctx, filter, rules)
}

type inspectVirtualMemUsage struct{}

func (inspectVirtualMemUsage) genSQL(timeRange plannercore.QueryTimeRange) string {
sql := fmt.Sprintf("select instance, max(value) as max_usage from metrics_schema.node_memory_usage %s group by instance having max_usage >= 70", timeRange.Condition())
return sql
}
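
Assuming QueryTimeRange.Condition() expands to a where time >= … and time <= … clause (consistent with the time_range hint used by the test below), the statement genSQL produces looks roughly like this — a sketch, not output captured from the code:

// Hypothetical rendering for one inspection window:
cond := "where time >= '2020-02-14 04:20:00' and time <= '2020-02-14 05:23:00'"
sql := fmt.Sprintf("select instance, max(value) as max_usage from metrics_schema.node_memory_usage %s group by instance having max_usage >= 70", cond)
// => select instance, max(value) as max_usage from metrics_schema.node_memory_usage
//    where time >= '2020-02-14 04:20:00' and time <= '2020-02-14 05:23:00'
//    group by instance having max_usage >= 70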

func (i inspectVirtualMemUsage) genResult(sql string, row chunk.Row) inspectionResult {
return inspectionResult{
tp: "node",
instance: row.GetString(0),
item: i.getItem(),
actual: fmt.Sprintf("%.1f%%", row.GetFloat64(1)),
expected: "< 70%",
severity: "warning",
detail: "the memory-usage is too high",
}
var rules = []struct {
item string
sql string
expected string
result func(string, string, chunk.Row) inspectionResult
}{
{
"virtual-memory-usage",
"select type, instance, value from information_schema.cluster_load where device_type='memory' and device_name='virtual' and name='used-percent' and value > 0.7",
"< 0.7",
commonResult,
},
{
"swap-memory-usage",
"select type, instance, value from information_schema.cluster_load where device_type='memory' and device_name='swap' and name='used-percent' and value > 0",
"0",
commonResult,
},
{
"disk-usage",
"select type, instance, device_name, value from information_schema.cluster_hardware where device_type='disk' and name='used-percent' and value > 70",
"< 70",
diskResult,
},
}

func (inspectVirtualMemUsage) getItem() string {
return "virtual-memory-usage"
}

type inspectSwapMemoryUsed struct{}

func (inspectSwapMemoryUsed) genSQL(timeRange plannercore.QueryTimeRange) string {
sql := fmt.Sprintf("select instance, max(value) as max_used from metrics_schema.node_memory_swap_used %s group by instance having max_used > 0", timeRange.Condition())
return sql
}

func (i inspectSwapMemoryUsed) genResult(sql string, row chunk.Row) inspectionResult {
return inspectionResult{
tp: "node",
instance: row.GetString(0),
item: i.getItem(),
actual: fmt.Sprintf("%.1f", row.GetFloat64(1)),
expected: "0",
severity: "warning",
}
}

var results []inspectionResult
for _, rule := range rules {
if filter.enable(rule.item) {
rows, _, err := sctx.(sqlexec.RestrictedSQLExecutor).ExecRestrictedSQL(rule.sql)
if err != nil {
sctx.GetSessionVars().StmtCtx.AppendWarning(fmt.Errorf("check load %s failed: %v", rule.item, err))
continue
}
for _, row := range rows {
results = append(results, rule.result(rule.item, rule.expected, row))
}
}
func (inspectSwapMemoryUsed) getItem() string {
return "swap-memory-used"
}

type inspectDiskUsage struct{}

func (inspectDiskUsage) genSQL(timeRange plannercore.QueryTimeRange) string {
sql := fmt.Sprintf("select instance, device, max(value) as max_usage from metrics_schema.node_disk_usage %v and device like '/%%' group by instance, device having max_usage >= 70", timeRange.Condition())
return sql
}
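
Two details in this format string are easy to miss: timeRange.Condition() is assumed to render a where clause that the extra predicate can be appended to with and, and the doubled percent sign is Sprintf escaping, not SQL. A standalone illustration of the escaping:

package main

import "fmt"

func main() {
	// "%%" in a Printf/Sprintf format emits a literal "%", so the rendered
	// predicate is: and device like '/%'. Only device names beginning with
	// "/" match, which is why the tmpfs rows in the test below are ignored.
	fmt.Printf("and device like '/%%'\n") // and device like '/%'
}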

func (i inspectDiskUsage) genResult(sql string, row chunk.Row) inspectionResult {
return inspectionResult{
tp: "node",
instance: row.GetString(0),
item: i.getItem(),
actual: fmt.Sprintf("%.1f%%", row.GetFloat64(2)),
expected: "< 70%",
severity: "warning",
detail: "the disk-usage of " + row.GetString(1) + " is too high",
}
results = append(results, c.inspectCPULoad(sctx, filter)...)
return results
}

func (currentLoadInspection) inspectCPULoad(sctx sessionctx.Context, filter inspectionFilter) []inspectionResult {
var results []inspectionResult
for _, item := range []string{"load1", "load5", "load15"} {
if !filter.enable(item) {
continue
}
sql := fmt.Sprintf(`select t1.*, 0.7 * t2.cpu_core from
(select type, instance, value from information_schema.cluster_load where device_type='cpu' and device_name='cpu' and name='%s') as t1 join
(select type,instance, max(value) as cpu_core from information_schema.CLUSTER_HARDWARE where DEVICE_TYPE='cpu' and name='cpu-logical-cores' group by type,instance) as t2
where t2.instance = t1.instance and t1.type=t2.type and t1.value > 0.7 * t2.cpu_core;`, item)
rows, _, err := sctx.(sqlexec.RestrictedSQLExecutor).ExecRestrictedSQL(sql)
if err != nil {
sctx.GetSessionVars().StmtCtx.AppendWarning(fmt.Errorf("check load %s failed: %v", item, err))
continue
}
for _, row := range rows {
result := inspectionResult{
tp: row.GetString(0),
instance: row.GetString(1),
item: "cpu-" + item,
actual: row.GetString(2),
expected: fmt.Sprintf("< %.1f", row.GetFloat64(3)),
severity: "warning",
detail: "cpu-" + item + " should less than (cpu_logical_cores * 0.7)",
}
results = append(results, result)
}
func (inspectDiskUsage) getItem() string {
return "disk-usage"
}

type inspectCPULoad struct {
item string
tbl string
}

func (i inspectCPULoad) genSQL(timeRange plannercore.QueryTimeRange) string {
sql := fmt.Sprintf(`select t1.instance, t1.max_load , 0.7*t2.cpu_count from
(select instance,max(value) as max_load from metrics_schema.%[1]s %[2]s group by instance) as t1 join
(select instance,max(value) as cpu_count from metrics_schema.node_virtual_cpus %[2]s group by instance) as t2
on t1.instance=t2.instance where t1.max_load>(0.7*t2.cpu_count);`, i.tbl, timeRange.Condition())
return sql
}
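
The %[1]s and %[2]s verbs are indexed format arguments: the table name is substituted once and the time-range condition twice without being passed twice. A standalone illustration:

package main

import "fmt"

func main() {
	// An indexed verb can reference the same argument at several points.
	tbl, cond := "node_load1", "where time >= '2020-02-14 04:20:00'"
	fmt.Printf("from metrics_schema.%[1]s %[2]s join ... %[2]s\n", tbl, cond)
	// from metrics_schema.node_load1 where time >= '2020-02-14 04:20:00' join ... where time >= '2020-02-14 04:20:00'
}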

func (i inspectCPULoad) genResult(sql string, row chunk.Row) inspectionResult {
return inspectionResult{
tp: "node",
instance: row.GetString(0),
item: "cpu-" + i.item,
actual: fmt.Sprintf("%.1f", row.GetFloat64(1)),
expected: fmt.Sprintf("< %.1f", row.GetFloat64(2)),
severity: "warning",
detail: i.getItem() + " should less than (cpu_logical_cores * 0.7)",
}
return results
}

func (i inspectCPULoad) getItem() string {
return "cpu-" + i.item
}
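
The payoff of the refactor is that adding a node check now means writing one more ruleChecker (the interface, renamed from thresholdCheckRule, appears below) and appending it to the rules slice in nodeLoadInspection.inspect. A hedged sketch of a hypothetical extra checker — the metrics_schema.node_inodes_usage table is assumed for illustration, not added by this commit:

// Hypothetical checker following the same shape as the ones above.
type inspectInodeUsage struct{}

func (inspectInodeUsage) genSQL(timeRange plannercore.QueryTimeRange) string {
	// node_inodes_usage is an assumed metric table, used only for illustration.
	return fmt.Sprintf("select instance, max(value) as max_usage from metrics_schema.node_inodes_usage %s group by instance having max_usage >= 80", timeRange.Condition())
}

func (i inspectInodeUsage) genResult(sql string, row chunk.Row) inspectionResult {
	return inspectionResult{
		tp:       "node",
		instance: row.GetString(0),
		item:     i.getItem(),
		actual:   fmt.Sprintf("%.1f%%", row.GetFloat64(1)),
		expected: "< 80%",
		severity: "warning",
		detail:   "the inode-usage is too high",
	}
}

func (inspectInodeUsage) getItem() string { return "inode-usage" }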

func (c criticalErrorInspection) inspect(ctx context.Context, sctx sessionctx.Context, filter inspectionFilter) []inspectionResult {
@@ -855,7 +868,7 @@ func (thresholdCheckInspection) inspectThreshold2(ctx context.Context, sctx sess
return results
}

type thresholdCheckRule interface {
type ruleChecker interface {
genSQL(timeRange plannercore.QueryTimeRange) string
genResult(sql string, row chunk.Row) inspectionResult
getItem() string
@@ -967,7 +980,7 @@ func (c checkStoreRegionTooMuch) getItem() string {
}

func (thresholdCheckInspection) inspectThreshold3(ctx context.Context, sctx sessionctx.Context, filter inspectionFilter) []inspectionResult {
var rules = []thresholdCheckRule{
var rules = []ruleChecker{
compareStoreStatus{
item: "leader-score-balance",
tp: "leader_score",
@@ -986,7 +999,10 @@ func (thresholdCheckInspection) inspectThreshold3(ctx context.Context, sctx sess
checkRegionHealth{},
checkStoreRegionTooMuch{},
}
return checkRules(ctx, sctx, filter, rules)
}

func checkRules(ctx context.Context, sctx sessionctx.Context, filter inspectionFilter, rules []ruleChecker) []inspectionResult {
var results []inspectionResult
for _, rule := range rules {
if !filter.enable(rule.getItem()) {
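
The remainder of checkRules is collapsed in this view. Judging from the per-rule loops that this commit removes above, the body presumably continues in the same exec-and-collect shape — a sketch of the likely continuation, not the verbatim code:

// Presumed continuation (sketch; timeRange assumed to come from the filter/session):
//		continue
//	}
//	sql := rule.genSQL(timeRange)
//	rows, _, err := sctx.(sqlexec.RestrictedSQLExecutor).ExecRestrictedSQL(sql)
//	if err != nil {
//		sctx.GetSessionVars().StmtCtx.AppendWarning(fmt.Errorf("check %s failed: %v", rule.getItem(), err))
//		continue
//	}
//	for _, row := range rows {
//		results = append(results, rule.genResult(sql, row))
//	}
// }
// return results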
96 changes: 74 additions & 22 deletions executor/inspection_result_test.go
@@ -86,18 +86,6 @@ func (s *inspectionResultSuite) TestInspectionResult(c *C) {
types.MakeDatums("pd", "192.168.1.33:1234", "192.168.1.33:1234", "4.0", "m234e"),
},
}
// mock load
mockData[infoschema.TableClusterLoad] = variable.TableSnapshot{
Rows: [][]types.Datum{
types.MakeDatums("tidb", "192.168.1.11:1234", "memory", "virtual", "used-percent", "0.8"),
types.MakeDatums("tidb", "192.168.1.12:1234", "memory", "virtual", "used-percent", "0.6"),
types.MakeDatums("tidb", "192.168.1.13:1234", "memory", "swap", "used-percent", "0"),
types.MakeDatums("tikv", "192.168.1.21:1234", "memory", "swap", "used-percent", "0.6"),
types.MakeDatums("pd", "192.168.1.31:1234", "cpu", "cpu", "load1", "1.0"),
types.MakeDatums("pd", "192.168.1.32:1234", "cpu", "cpu", "load5", "2.0"),
types.MakeDatums("pd", "192.168.1.33:1234", "cpu", "cpu", "load15", "8.0"),
},
}
mockData[infoschema.TableClusterHardware] = variable.TableSnapshot{
Rows: [][]types.Datum{
types.MakeDatums("tikv", "192.168.1.22:1234", "disk", "sda", "used-percent", "80"),
@@ -158,16 +146,6 @@ func (s *inspectionResultSuite) TestInspectionResult(c *C) {
"version git_hash tidb inconsistent consistent critical the cluster has 3 different tidb versions, execute the sql to see more detail: select * from information_schema.cluster_info where type='tidb'",
},
},
{
sql: "select rule, item, type, instance, value, reference, severity, details from information_schema.inspection_result where rule='current-load'",
rows: []string{
"current-load cpu-load1 pd 192.168.1.31:1234 1.0 < 0.7 warning cpu-load1 should less than (cpu_logical_cores * 0.7)",
"current-load cpu-load15 pd 192.168.1.33:1234 8.0 < 7.0 warning cpu-load15 should less than (cpu_logical_cores * 0.7)",
"current-load disk-usage tikv 192.168.1.22:1234 80 < 70 warning current disk-usage is too high, execute the sql to see more detail: select * from information_schema.cluster_hardware where type='tikv' and instance='192.168.1.22:1234' and device_type='disk' and device_name='sda'",
"current-load swap-memory-usage tikv 192.168.1.21:1234 0.6 0 warning ",
"current-load virtual-memory-usage tidb 192.168.1.11:1234 0.8 < 0.7 warning ",
},
},
}

for _, cs := range cases {
@@ -583,3 +561,77 @@ func (s *inspectionResultSuite) TestCriticalErrorInspection(c *C) {
"tikv_engine_write_stall tikv-0 1.00(kv) the total number of errors about 'tikv_engine_write_stall' is too many",
))
}

func (s *inspectionResultSuite) TestNodeLoadInspection(c *C) {
tk := testkit.NewTestKitWithInit(c, s.store)
datetime := func(s string) types.Time {
t, err := types.ParseTime(tk.Se.GetSessionVars().StmtCtx, s, mysql.TypeDatetime, types.MaxFsp)
c.Assert(err, IsNil)
return t
}

// construct some mock abnormal data
mockData := map[string][][]types.Datum{
// columns: time, instance, value
"node_load1": {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "node-0", 28.1),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "node-1", 13.0),
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-0", 10.0),
},
// columns: time, instance, value
"node_load5": {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "node-0", 27.9),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "node-1", 14.1),
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-0", 0.0),
},
// columns: time, instance, value
"node_load15": {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "node-0", 30.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "node-1", 14.1),
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-0", 20.0),
},
// columns: time, instance, value
"node_virtual_cpus": {
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-0", 40.0),
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-1", 20.0),
},
// columns: time, instance, value
"node_memory_usage": {
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-0", 80.0),
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-1", 60.0),
types.MakeDatums(datetime("2020-02-14 05:22:00"), "node-0", 60.0),
},
// columns: time, instance, value
"node_memory_swap_used": {
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-0", 0.0),
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-1", 1.0),
types.MakeDatums(datetime("2020-02-14 05:22:00"), "node-1", 0.0),
},
// columns: time, instance, device, value
"node_disk_usage": {
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-0", "/dev/nvme0", 80.0),
types.MakeDatums(datetime("2020-02-14 05:22:00"), "node-0", "/dev/nvme0", 50.0),
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-0", "tmpfs", 80.0),
types.MakeDatums(datetime("2020-02-14 05:22:00"), "node-0", "tmpfs", 50.0),
},
}
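
The rows asserted below follow directly from this data: for the cpu checks the reference value is 0.7 × node_virtual_cpus per instance, and a result is emitted only when the window's maximum load exceeds it. A standalone recomputation of that arithmetic:

package main

import "fmt"

func main() {
	// node-0: 40 virtual cpus → threshold 28.0; node-1: 20 → threshold 14.0.
	cpus := map[string]float64{"node-0": 40, "node-1": 20}
	maxLoad := map[string]map[string]float64{
		"load1":  {"node-0": 28.1, "node-1": 13.0},
		"load5":  {"node-0": 27.9, "node-1": 14.1},
		"load15": {"node-0": 30.0, "node-1": 14.1},
	}
	for item, loads := range maxLoad {
		for node, v := range loads {
			if threshold := 0.7 * cpus[node]; v > threshold {
				fmt.Printf("cpu-%s %s %.1f < %.1f\n", item, node, v, threshold)
			}
		}
	}
	// Fires: load1/node-0 (28.1 > 28.0), load5/node-1 (14.1 > 14.0),
	// load15/node-0 (30.0 > 28.0), load15/node-1 (14.1 > 14.0) — the four
	// cpu-* rows asserted below; load5/node-0 peaks at 27.9 and just misses.
}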

ctx := s.setupForThresholdCheck(c, mockData)
defer s.tearDownForThresholdCheck(c)

rs, err := tk.Se.Execute(ctx, `select /*+ time_range('2020-02-14 04:20:00','2020-02-14 05:23:00') */
item, type, instance, value, reference, details from information_schema.inspection_result
where rule='node-load' order by item, value`)
c.Assert(err, IsNil)
result := tk.ResultSetToResultWithCtx(ctx, rs[0], Commentf("execute inspect SQL failed"))
c.Assert(tk.Se.GetSessionVars().StmtCtx.WarningCount(), Equals, uint16(0), Commentf("unexpected warnings: %+v", tk.Se.GetSessionVars().StmtCtx.GetWarnings()))
result.Check(testkit.Rows(
"cpu-load1 node node-0 28.1 < 28.0 cpu-load1 should less than (cpu_logical_cores * 0.7)",
"cpu-load15 node node-1 14.1 < 14.0 cpu-load15 should less than (cpu_logical_cores * 0.7)",
"cpu-load15 node node-0 30.0 < 28.0 cpu-load15 should less than (cpu_logical_cores * 0.7)",
"cpu-load5 node node-1 14.1 < 14.0 cpu-load5 should less than (cpu_logical_cores * 0.7)",
"disk-usage node node-0 80.0% < 70% the disk-usage of /dev/nvme0 is too high",
"swap-memory-used node node-1 1.0 0 ",
"virtual-memory-usage node node-0 80.0% < 70% the memory-usage is too high",
))
}
12 changes: 8 additions & 4 deletions infoschema/metric_table_def.go
@@ -2030,10 +2030,14 @@ var MetricTableMap = map[string]MetricTableDef{
PromQL: `node_memory_MemAvailable_bytes{$LABEL_CONDITIONS}`,
Labels: []string{"instance"},
},
"node_total_memory_swap": {
PromQL: `node_memory_SwapTotal_bytes{$LABEL_CONDITIONS}`,
"node_memory_usage": {
PromQL: `100* (1-(node_memory_MemAvailable_bytes{$LABEL_CONDITIONS}/node_memory_MemTotal_bytes{$LABEL_CONDITIONS}))`,
Labels: []string{"instance"},
},
"node_memory_swap_used": {
PromQL: `node_memory_SwapTotal_bytes{$LABEL_CONDITIONS} - node_memory_SwapFree_bytes{$LABEL_CONDITIONS}`,
Labels: []string{"instance"},
Comment: "node total memory swap",
Comment: "bytes used of node swap memory",
},
"node_uptime": {
PromQL: `node_time_seconds{$LABEL_CONDITIONS} - node_boot_time_seconds{$LABEL_CONDITIONS}`,
@@ -2146,7 +2150,7 @@ var MetricTableMap = map[string]MetricTableDef{
Labels: []string{"instance", "device"},
Comment: "Units is byte",
},
"node_filesystem_space_used": {
"node_disk_usage": {
PromQL: `((node_filesystem_size_bytes{$LABEL_CONDITIONS} - node_filesystem_avail_bytes{$LABEL_CONDITIONS}) / node_filesystem_size_bytes{$LABEL_CONDITIONS}) * 100`,
Labels: []string{"instance", "device"},
Comment: "Filesystem used space. If is > 80% then is Critical.",
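
For the node_memory_usage definition above, the expression 100 * (1 - MemAvailable / MemTotal) can be sanity-checked with illustrative figures (not measurements):

package main

import "fmt"

func main() {
	// 16 GiB total with 3.2 GiB still available → 80% used.
	memTotalGiB, memAvailGiB := 16.0, 3.2
	usage := 100 * (1 - memAvailGiB/memTotalGiB)
	fmt.Printf("node_memory_usage = %.1f%%\n", usage) // node_memory_usage = 80.0%
	// The virtual-memory-usage rule reports instances at >= 70, so this
	// node would appear in inspection_result.
}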
