executor/inspect: refactor current-load diagnosis rule to node-check (#15860) #17660

Merged
1 commit merged on Jun 4, 2020
200 changes: 108 additions & 92 deletions executor/inspection_result.go
@@ -82,9 +82,9 @@ type (
// version in the cluster
versionInspection struct{ inspectionName }

// currentLoadInspection is used to check the current load of memory/disk/cpu
// nodeLoadInspection is used to check the node load of memory/disk/cpu
// have reached a high-level threshold
currentLoadInspection struct{ inspectionName }
nodeLoadInspection struct{ inspectionName }

// criticalErrorInspection is used to check are there some critical errors
// occurred in the past
@@ -97,7 +97,7 @@ type (
var inspectionRules = []inspectionRule{
&configInspection{inspectionName: "config"},
&versionInspection{inspectionName: "version"},
&currentLoadInspection{inspectionName: "current-load"},
&nodeLoadInspection{inspectionName: "node-load"},
&criticalErrorInspection{inspectionName: "critical-error"},
&thresholdCheckInspection{inspectionName: "threshold-check"},
}
@@ -316,101 +316,114 @@ func (versionInspection) inspect(_ context.Context, sctx sessionctx.Context, fil
return results
}

func (c currentLoadInspection) inspect(_ context.Context, sctx sessionctx.Context, filter inspectionFilter) []inspectionResult {
var commonResult = func(item, expected string, row chunk.Row) inspectionResult {
return inspectionResult{
tp: row.GetString(0),
instance: row.GetString(1),
item: item,
actual: row.GetString(2),
expected: expected,
severity: "warning",
}
func (c nodeLoadInspection) inspect(ctx context.Context, sctx sessionctx.Context, filter inspectionFilter) []inspectionResult {
var rules = []ruleChecker{
inspectCPULoad{item: "load1", tbl: "node_load1"},
inspectCPULoad{item: "load5", tbl: "node_load5"},
inspectCPULoad{item: "load15", tbl: "node_load15"},
inspectVirtualMemUsage{},
inspectSwapMemoryUsed{},
inspectDiskUsage{},
}
var diskResult = func(item, expected string, row chunk.Row) inspectionResult {
return inspectionResult{
tp: row.GetString(0),
instance: row.GetString(1),
item: item,
actual: row.GetString(3),
expected: expected,
severity: "warning",
detail: fmt.Sprintf("current disk-usage is too high, execute the sql to see more detail: select * from information_schema.cluster_hardware where type='%s' and instance='%s' and device_type='disk' and device_name='%s'",
row.GetString(0), row.GetString(1), row.GetString(2)),
}
return checkRules(ctx, sctx, filter, rules)
}

type inspectVirtualMemUsage struct{}

func (inspectVirtualMemUsage) genSQL(timeRange plannercore.QueryTimeRange) string {
sql := fmt.Sprintf("select instance, max(value) as max_usage from metrics_schema.node_memory_usage %s group by instance having max_usage >= 70", timeRange.Condition())
return sql
}

func (i inspectVirtualMemUsage) genResult(sql string, row chunk.Row) inspectionResult {
return inspectionResult{
tp: "node",
instance: row.GetString(0),
item: i.getItem(),
actual: fmt.Sprintf("%.1f%%", row.GetFloat64(1)),
expected: "< 70%",
severity: "warning",
detail: "the memory-usage is too high",
}
var rules = []struct {
item string
sql string
expected string
result func(string, string, chunk.Row) inspectionResult
}{
{
"virtual-memory-usage",
"select type, instance, value from information_schema.cluster_load where device_type='memory' and device_name='virtual' and name='used-percent' and value > 0.7",
"< 0.7",
commonResult,
},
{
"swap-memory-usage",
"select type, instance, value from information_schema.cluster_load where device_type='memory' and device_name='swap' and name='used-percent' and value > 0",
"0",
commonResult,
},
{
"disk-usage",
"select type, instance, device_name, value from information_schema.cluster_hardware where device_type='disk' and name='used-percent' and value > 70",
"< 70",
diskResult,
},
}

func (inspectVirtualMemUsage) getItem() string {
return "virtual-memory-usage"
}

type inspectSwapMemoryUsed struct{}

func (inspectSwapMemoryUsed) genSQL(timeRange plannercore.QueryTimeRange) string {
sql := fmt.Sprintf("select instance, max(value) as max_used from metrics_schema.node_memory_swap_used %s group by instance having max_used > 0", timeRange.Condition())
fmt.Println(sql)
return sql
}

func (i inspectSwapMemoryUsed) genResult(sql string, row chunk.Row) inspectionResult {
return inspectionResult{
tp: "node",
instance: row.GetString(0),
item: i.getItem(),
actual: fmt.Sprintf("%.1f", row.GetFloat64(1)),
expected: "0",
severity: "warning",
}
}

var results []inspectionResult
for _, rule := range rules {
if filter.enable(rule.item) {
rows, _, err := sctx.(sqlexec.RestrictedSQLExecutor).ExecRestrictedSQL(rule.sql)
if err != nil {
sctx.GetSessionVars().StmtCtx.AppendWarning(fmt.Errorf("check load %s failed: %v", rule.item, err))
continue
}
for _, row := range rows {
results = append(results, rule.result(rule.item, rule.expected, row))
}
}
func (inspectSwapMemoryUsed) getItem() string {
return "swap-memory-used"
}

type inspectDiskUsage struct{}

func (inspectDiskUsage) genSQL(timeRange plannercore.QueryTimeRange) string {
sql := fmt.Sprintf("select instance, device, max(value) as max_usage from metrics_schema.node_disk_usage %v and device like '/%%' group by instance, device having max_usage >= 70", timeRange.Condition())
return sql
}

func (i inspectDiskUsage) genResult(sql string, row chunk.Row) inspectionResult {
return inspectionResult{
tp: "node",
instance: row.GetString(0),
item: i.getItem(),
actual: fmt.Sprintf("%.1f%%", row.GetFloat64(2)),
expected: "< 70%",
severity: "warning",
detail: "the disk-usage of " + row.GetString(1) + " is too high",
}
results = append(results, c.inspectCPULoad(sctx, filter)...)
return results
}

func (currentLoadInspection) inspectCPULoad(sctx sessionctx.Context, filter inspectionFilter) []inspectionResult {
var results []inspectionResult
for _, item := range []string{"load1", "load5", "load15"} {
if !filter.enable(item) {
continue
}
sql := fmt.Sprintf(`select t1.*, 0.7 * t2.cpu_core from
(select type, instance, value from information_schema.cluster_load where device_type='cpu' and device_name='cpu' and name='%s') as t1 join
(select type,instance, max(value) as cpu_core from information_schema.CLUSTER_HARDWARE where DEVICE_TYPE='cpu' and name='cpu-logical-cores' group by type,instance) as t2
where t2.instance = t1.instance and t1.type=t2.type and t1.value > 0.7 * t2.cpu_core;`, item)
rows, _, err := sctx.(sqlexec.RestrictedSQLExecutor).ExecRestrictedSQL(sql)
if err != nil {
sctx.GetSessionVars().StmtCtx.AppendWarning(fmt.Errorf("check load %s failed: %v", item, err))
continue
}
for _, row := range rows {
result := inspectionResult{
tp: row.GetString(0),
instance: row.GetString(1),
item: "cpu-" + item,
actual: row.GetString(2),
expected: fmt.Sprintf("< %.1f", row.GetFloat64(3)),
severity: "warning",
detail: "cpu-" + item + " should less than (cpu_logical_cores * 0.7)",
}
results = append(results, result)
}
func (inspectDiskUsage) getItem() string {
return "disk-usage"
}

type inspectCPULoad struct {
item string
tbl string
}

func (i inspectCPULoad) genSQL(timeRange plannercore.QueryTimeRange) string {
sql := fmt.Sprintf(`select t1.instance, t1.max_load , 0.7*t2.cpu_count from
(select instance,max(value) as max_load from metrics_schema.%[1]s %[2]s group by instance) as t1 join
(select instance,max(value) as cpu_count from metrics_schema.node_virtual_cpus %[2]s group by instance) as t2
on t1.instance=t2.instance where t1.max_load>(0.7*t2.cpu_count);`, i.tbl, timeRange.Condition())
return sql
}

func (i inspectCPULoad) genResult(sql string, row chunk.Row) inspectionResult {
return inspectionResult{
tp: "node",
instance: row.GetString(0),
item: "cpu-" + i.item,
actual: fmt.Sprintf("%.1f", row.GetFloat64(1)),
expected: fmt.Sprintf("< %.1f", row.GetFloat64(2)),
severity: "warning",
detail: i.getItem() + " should less than (cpu_logical_cores * 0.7)",
}
return results
}

func (i inspectCPULoad) getItem() string {
return "cpu-" + i.item
}

func (c criticalErrorInspection) inspect(ctx context.Context, sctx sessionctx.Context, filter inspectionFilter) []inspectionResult {
@@ -855,7 +868,7 @@ func (thresholdCheckInspection) inspectThreshold2(ctx context.Context, sctx sess
return results
}

type thresholdCheckRule interface {
type ruleChecker interface {
genSQL(timeRange plannercore.QueryTimeRange) string
genResult(sql string, row chunk.Row) inspectionResult
getItem() string
Expand Down Expand Up @@ -967,7 +980,7 @@ func (c checkStoreRegionTooMuch) getItem() string {
}

func (thresholdCheckInspection) inspectThreshold3(ctx context.Context, sctx sessionctx.Context, filter inspectionFilter) []inspectionResult {
var rules = []thresholdCheckRule{
var rules = []ruleChecker{
compareStoreStatus{
item: "leader-score-balance",
tp: "leader_score",
@@ -986,7 +999,10 @@ func (thresholdCheckInspection) inspectThreshold3(ctx context.Context, sctx sess
checkRegionHealth{},
checkStoreRegionTooMuch{},
}
return checkRules(ctx, sctx, filter, rules)
}

func checkRules(ctx context.Context, sctx sessionctx.Context, filter inspectionFilter, rules []ruleChecker) []inspectionResult {
var results []inspectionResult
for _, rule := range rules {
if !filter.enable(rule.getItem()) {
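Note (not part of the patch): the hunks above converge the node-load checks on the same three-method interface already used by the threshold checks (genSQL / genResult / getItem), so every check can be driven by the shared checkRules helper. The standalone sketch below illustrates that pattern with simplified stand-in types; row, finding, runChecks, and diskUsageCheck are illustrative names only, and the real checkRules, which this hunk only partially shows, runs the generated SQL through the session's restricted SQL executor rather than a callback.

```go
package main

import "fmt"

// row is a stand-in for chunk.Row in this sketch.
type row []string

// finding is a stand-in for inspectionResult.
type finding struct {
	item, instance, actual, expected, severity, detail string
}

// ruleChecker mirrors the three-method interface in the diff: genSQL builds
// the check query for a time range, genResult maps a result row to a finding,
// and getItem names the check for filtering.
type ruleChecker interface {
	genSQL(timeRangeCond string) string
	genResult(sql string, r row) finding
	getItem() string
}

// diskUsageCheck is a toy checker patterned on inspectDiskUsage.
type diskUsageCheck struct{}

func (diskUsageCheck) genSQL(cond string) string {
	return fmt.Sprintf("select instance, device, max(value) as max_usage from metrics_schema.node_disk_usage %s and device like '/%%' group by instance, device having max_usage >= 70", cond)
}

func (c diskUsageCheck) genResult(_ string, r row) finding {
	return finding{
		item:     c.getItem(),
		instance: r[0],
		actual:   r[2] + "%",
		expected: "< 70%",
		severity: "warning",
		detail:   "the disk-usage of " + r[1] + " is too high",
	}
}

func (diskUsageCheck) getItem() string { return "disk-usage" }

// runChecks plays the role of checkRules: run every enabled checker and
// collect its findings. Query execution is faked via the exec callback here.
func runChecks(checks []ruleChecker, enabled func(item string) bool, exec func(sql string) []row) []finding {
	var results []finding
	for _, c := range checks {
		if !enabled(c.getItem()) {
			continue
		}
		sql := c.genSQL("where time >= '2020-02-14 04:20:00' and time <= '2020-02-14 05:23:00'")
		for _, r := range exec(sql) {
			results = append(results, c.genResult(sql, r))
		}
	}
	return results
}

func main() {
	exec := func(string) []row { return []row{{"node-0", "/dev/nvme0", "80.0"}} }
	enabled := func(string) bool { return true }
	for _, f := range runChecks([]ruleChecker{diskUsageCheck{}}, enabled, exec) {
		fmt.Println(f.item, f.instance, f.actual, f.expected, f.severity, f.detail)
	}
}
```

The payoff of the refactor is visible in this shape: adding another node check means writing one small type that satisfies the interface, rather than extending a hand-rolled loop of SQL strings.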
96 changes: 74 additions & 22 deletions executor/inspection_result_test.go
@@ -86,18 +86,6 @@ func (s *inspectionResultSuite) TestInspectionResult(c *C) {
types.MakeDatums("pd", "192.168.1.33:1234", "192.168.1.33:1234", "4.0", "m234e"),
},
}
// mock load
mockData[infoschema.TableClusterLoad] = variable.TableSnapshot{
Rows: [][]types.Datum{
types.MakeDatums("tidb", "192.168.1.11:1234", "memory", "virtual", "used-percent", "0.8"),
types.MakeDatums("tidb", "192.168.1.12:1234", "memory", "virtual", "used-percent", "0.6"),
types.MakeDatums("tidb", "192.168.1.13:1234", "memory", "swap", "used-percent", "0"),
types.MakeDatums("tikv", "192.168.1.21:1234", "memory", "swap", "used-percent", "0.6"),
types.MakeDatums("pd", "192.168.1.31:1234", "cpu", "cpu", "load1", "1.0"),
types.MakeDatums("pd", "192.168.1.32:1234", "cpu", "cpu", "load5", "2.0"),
types.MakeDatums("pd", "192.168.1.33:1234", "cpu", "cpu", "load15", "8.0"),
},
}
mockData[infoschema.TableClusterHardware] = variable.TableSnapshot{
Rows: [][]types.Datum{
types.MakeDatums("tikv", "192.168.1.22:1234", "disk", "sda", "used-percent", "80"),
@@ -158,16 +146,6 @@ func (s *inspectionResultSuite) TestInspectionResult(c *C) {
"version git_hash tidb inconsistent consistent critical the cluster has 3 different tidb versions, execute the sql to see more detail: select * from information_schema.cluster_info where type='tidb'",
},
},
{
sql: "select rule, item, type, instance, value, reference, severity, details from information_schema.inspection_result where rule='current-load'",
rows: []string{
"current-load cpu-load1 pd 192.168.1.31:1234 1.0 < 0.7 warning cpu-load1 should less than (cpu_logical_cores * 0.7)",
"current-load cpu-load15 pd 192.168.1.33:1234 8.0 < 7.0 warning cpu-load15 should less than (cpu_logical_cores * 0.7)",
"current-load disk-usage tikv 192.168.1.22:1234 80 < 70 warning current disk-usage is too high, execute the sql to see more detail: select * from information_schema.cluster_hardware where type='tikv' and instance='192.168.1.22:1234' and device_type='disk' and device_name='sda'",
"current-load swap-memory-usage tikv 192.168.1.21:1234 0.6 0 warning ",
"current-load virtual-memory-usage tidb 192.168.1.11:1234 0.8 < 0.7 warning ",
},
},
}

for _, cs := range cases {
@@ -583,3 +561,77 @@ func (s *inspectionResultSuite) TestCriticalErrorInspection(c *C) {
"tikv_engine_write_stall tikv-0 1.00(kv) the total number of errors about 'tikv_engine_write_stall' is too many",
))
}

func (s *inspectionResultSuite) TestNodeLoadInspection(c *C) {
tk := testkit.NewTestKitWithInit(c, s.store)
datetime := func(s string) types.Time {
t, err := types.ParseTime(tk.Se.GetSessionVars().StmtCtx, s, mysql.TypeDatetime, types.MaxFsp)
c.Assert(err, IsNil)
return t
}

// construct some mock abnormal data
mockData := map[string][][]types.Datum{
// columns: time, instance, value
"node_load1": {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "node-0", 28.1),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "node-1", 13.0),
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-0", 10.0),
},
// columns: time, instance, value
"node_load5": {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "node-0", 27.9),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "node-1", 14.1),
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-0", 0.0),
},
// columns: time, instance, value
"node_load15": {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "node-0", 30.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "node-1", 14.1),
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-0", 20.0),
},
// columns: time, instance, value
"node_virtual_cpus": {
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-0", 40.0),
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-1", 20.0),
},
// columns: time, instance, value
"node_memory_usage": {
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-0", 80.0),
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-1", 60.0),
types.MakeDatums(datetime("2020-02-14 05:22:00"), "node-0", 60.0),
},
// columns: time, instance, value
"node_memory_swap_used": {
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-0", 0.0),
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-1", 1.0),
types.MakeDatums(datetime("2020-02-14 05:22:00"), "node-1", 0.0),
},
// columns: time, instance, device, value
"node_disk_usage": {
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-0", "/dev/nvme0", 80.0),
types.MakeDatums(datetime("2020-02-14 05:22:00"), "node-0", "/dev/nvme0", 50.0),
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-0", "tmpfs", 80.0),
types.MakeDatums(datetime("2020-02-14 05:22:00"), "node-0", "tmpfs", 50.0),
},
}

ctx := s.setupForThresholdCheck(c, mockData)
defer s.tearDownForThresholdCheck(c)

rs, err := tk.Se.Execute(ctx, `select /*+ time_range('2020-02-14 04:20:00','2020-02-14 05:23:00') */
item, type, instance, value, reference, details from information_schema.inspection_result
where rule='node-load' order by item, value`)
c.Assert(err, IsNil)
result := tk.ResultSetToResultWithCtx(ctx, rs[0], Commentf("execute inspect SQL failed"))
c.Assert(tk.Se.GetSessionVars().StmtCtx.WarningCount(), Equals, uint16(0), Commentf("unexpected warnings: %+v", tk.Se.GetSessionVars().StmtCtx.GetWarnings()))
result.Check(testkit.Rows(
"cpu-load1 node node-0 28.1 < 28.0 cpu-load1 should less than (cpu_logical_cores * 0.7)",
"cpu-load15 node node-1 14.1 < 14.0 cpu-load15 should less than (cpu_logical_cores * 0.7)",
"cpu-load15 node node-0 30.0 < 28.0 cpu-load15 should less than (cpu_logical_cores * 0.7)",
"cpu-load5 node node-1 14.1 < 14.0 cpu-load5 should less than (cpu_logical_cores * 0.7)",
"disk-usage node node-0 80.0% < 70% the disk-usage of /dev/nvme0 is too high",
"swap-memory-used node node-1 1.0 0 ",
"virtual-memory-usage node node-0 80.0% < 70% the memory-usage is too high",
))
}
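Note (not part of the patch): the expected rows above come from querying information_schema.inspection_result under the new rule name. A minimal client-side sketch of the same query is shown below; the DSN, port, and use of github.com/go-sql-driver/mysql are assumptions for a local TiDB instance, while the statement itself mirrors the one in TestNodeLoadInspection.

```go
package main

import (
	"database/sql"
	"fmt"
	"log"

	_ "github.com/go-sql-driver/mysql" // assumed driver; any MySQL-protocol driver works
)

func main() {
	// Connection details are placeholders for a local TiDB instance.
	db, err := sql.Open("mysql", "root@tcp(127.0.0.1:4000)/information_schema")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// Same shape as the test query: the rule is now 'node-load' (was 'current-load'),
	// and the time_range hint bounds the metrics_schema lookup.
	rows, err := db.Query(`select /*+ time_range('2020-02-14 04:20:00','2020-02-14 05:23:00') */
		item, type, instance, value, reference, details
		from information_schema.inspection_result
		where rule='node-load' order by item, value`)
	if err != nil {
		log.Fatal(err)
	}
	defer rows.Close()

	for rows.Next() {
		var item, tp, instance, value, reference, details string
		if err := rows.Scan(&item, &tp, &instance, &value, &reference, &details); err != nil {
			log.Fatal(err)
		}
		fmt.Println(item, tp, instance, value, reference, details)
	}
}
```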
12 changes: 8 additions & 4 deletions infoschema/metric_table_def.go
@@ -2030,10 +2030,14 @@ var MetricTableMap = map[string]MetricTableDef{
PromQL: `node_memory_MemAvailable_bytes{$LABEL_CONDITIONS}`,
Labels: []string{"instance"},
},
"node_total_memory_swap": {
PromQL: `node_memory_SwapTotal_bytes{$LABEL_CONDITIONS}`,
"node_memory_usage": {
PromQL: `100* (1-(node_memory_MemAvailable_bytes{$LABEL_CONDITIONS}/node_memory_MemTotal_bytes{$LABEL_CONDITIONS}))`,
Labels: []string{"instance"},
},
"node_memory_swap_used": {
PromQL: `node_memory_SwapTotal_bytes{$LABEL_CONDITIONS} - node_memory_SwapFree_bytes{$LABEL_CONDITIONS}`,
Labels: []string{"instance"},
Comment: "node total memory swap",
Comment: "bytes used of node swap memory",
},
"node_uptime": {
PromQL: `node_time_seconds{$LABEL_CONDITIONS} - node_boot_time_seconds{$LABEL_CONDITIONS}`,
@@ -2146,7 +2150,7 @@ var MetricTableMap = map[string]MetricTableDef{
Labels: []string{"instance", "device"},
Comment: "Units is byte",
},
"node_filesystem_space_used": {
"node_disk_usage": {
PromQL: `((node_filesystem_size_bytes{$LABEL_CONDITIONS} - node_filesystem_avail_bytes{$LABEL_CONDITIONS}) / node_filesystem_size_bytes{$LABEL_CONDITIONS}) * 100`,
Labels: []string{"instance", "device"},
Comment: "Filesystem used space. If is > 80% then is Critical.",
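Note (not part of the patch): the new node_memory_usage table already yields a percentage, 100 * (1 - MemAvailable / MemTotal), which is why inspectVirtualMemUsage can compare it directly against 70 in its having clause. A quick arithmetic sketch of that relationship, with made-up byte counts:

```go
package main

import "fmt"

// memoryUsagePercent mirrors the node_memory_usage PromQL above:
// 100 * (1 - MemAvailable / MemTotal).
func memoryUsagePercent(memAvailableBytes, memTotalBytes float64) float64 {
	return 100 * (1 - memAvailableBytes/memTotalBytes)
}

func main() {
	const threshold = 70.0 // matches the ">= 70" condition in inspectVirtualMemUsage.genSQL

	usage := memoryUsagePercent(3.2e9, 16e9) // sample values: 3.2 GB available of 16 GB total
	fmt.Printf("memory usage: %.1f%%\n", usage) // 80.0%
	if usage >= threshold {
		fmt.Println("would surface under rule 'node-load' as item 'virtual-memory-usage'")
	}
}
```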