executor/inspect: refactor current-load diagnosis rule to node-check (#…
sre-bot authored Jun 4, 2020
1 parent a7c0ab6 commit 4566942
Showing 3 changed files with 190 additions and 118 deletions.
200 changes: 108 additions & 92 deletions executor/inspection_result.go
@@ -82,9 +82,9 @@ type (
// version in the cluster
versionInspection struct{ inspectionName }

// currentLoadInspection is used to check the current load of memory/disk/cpu
// nodeLoadInspection is used to check the node load of memory/disk/cpu
// have reached a high-level threshold
currentLoadInspection struct{ inspectionName }
nodeLoadInspection struct{ inspectionName }

// criticalErrorInspection is used to check are there some critical errors
// occurred in the past
@@ -97,7 +97,7 @@ type (
var inspectionRules = []inspectionRule{
&configInspection{inspectionName: "config"},
&versionInspection{inspectionName: "version"},
&currentLoadInspection{inspectionName: "current-load"},
&nodeLoadInspection{inspectionName: "node-load"},
&criticalErrorInspection{inspectionName: "critical-error"},
&thresholdCheckInspection{inspectionName: "threshold-check"},
}
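
With this rename, the rule is addressed by its new name from SQL; selecting rule='current-load' no longer matches anything. A minimal usage sketch inside a testkit-based test (the helper call is illustrative, mirroring the query style of the new test further down):

// Illustrative: the diagnosis rule is now selected as "node-load".
tk.MustQuery(`select item, type, instance, value, reference
	from information_schema.inspection_result where rule='node-load'`)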
@@ -316,101 +316,114 @@ func (versionInspection) inspect(_ context.Context, sctx sessionctx.Context, fil
return results
}

func (c currentLoadInspection) inspect(_ context.Context, sctx sessionctx.Context, filter inspectionFilter) []inspectionResult {
var commonResult = func(item, expected string, row chunk.Row) inspectionResult {
return inspectionResult{
tp: row.GetString(0),
instance: row.GetString(1),
item: item,
actual: row.GetString(2),
expected: expected,
severity: "warning",
}
func (c nodeLoadInspection) inspect(ctx context.Context, sctx sessionctx.Context, filter inspectionFilter) []inspectionResult {
var rules = []ruleChecker{
inspectCPULoad{item: "load1", tbl: "node_load1"},
inspectCPULoad{item: "load5", tbl: "node_load5"},
inspectCPULoad{item: "load15", tbl: "node_load15"},
inspectVirtualMemUsage{},
inspectSwapMemoryUsed{},
inspectDiskUsage{},
}
var diskResult = func(item, expected string, row chunk.Row) inspectionResult {
return inspectionResult{
tp: row.GetString(0),
instance: row.GetString(1),
item: item,
actual: row.GetString(3),
expected: expected,
severity: "warning",
detail: fmt.Sprintf("current disk-usage is too high, execute the sql to see more detail: select * from information_schema.cluster_hardware where type='%s' and instance='%s' and device_type='disk' and device_name='%s'",
row.GetString(0), row.GetString(1), row.GetString(2)),
}
return checkRules(ctx, sctx, filter, rules)
}

type inspectVirtualMemUsage struct{}

func (inspectVirtualMemUsage) genSQL(timeRange plannercore.QueryTimeRange) string {
sql := fmt.Sprintf("select instance, max(value) as max_usage from metrics_schema.node_memory_usage %s group by instance having max_usage >= 70", timeRange.Condition())
return sql
}
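
Assuming QueryTimeRange.Condition() expands to a where time >= … and time <= … clause (consistent with the time_range hint used by the test below), the statement genSQL produces looks roughly like this — a sketch, not output captured from the code:

// Hypothetical rendering for one inspection window:
cond := "where time >= '2020-02-14 04:20:00' and time <= '2020-02-14 05:23:00'"
sql := fmt.Sprintf("select instance, max(value) as max_usage from metrics_schema.node_memory_usage %s group by instance having max_usage >= 70", cond)
// => select instance, max(value) as max_usage from metrics_schema.node_memory_usage
//    where time >= '2020-02-14 04:20:00' and time <= '2020-02-14 05:23:00'
//    group by instance having max_usage >= 70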

func (i inspectVirtualMemUsage) genResult(sql string, row chunk.Row) inspectionResult {
return inspectionResult{
tp: "node",
instance: row.GetString(0),
item: i.getItem(),
actual: fmt.Sprintf("%.1f%%", row.GetFloat64(1)),
expected: "< 70%",
severity: "warning",
detail: "the memory-usage is too high",
}
var rules = []struct {
item string
sql string
expected string
result func(string, string, chunk.Row) inspectionResult
}{
{
"virtual-memory-usage",
"select type, instance, value from information_schema.cluster_load where device_type='memory' and device_name='virtual' and name='used-percent' and value > 0.7",
"< 0.7",
commonResult,
},
{
"swap-memory-usage",
"select type, instance, value from information_schema.cluster_load where device_type='memory' and device_name='swap' and name='used-percent' and value > 0",
"0",
commonResult,
},
{
"disk-usage",
"select type, instance, device_name, value from information_schema.cluster_hardware where device_type='disk' and name='used-percent' and value > 70",
"< 70",
diskResult,
},
}

func (inspectVirtualMemUsage) getItem() string {
return "virtual-memory-usage"
}

type inspectSwapMemoryUsed struct{}

func (inspectSwapMemoryUsed) genSQL(timeRange plannercore.QueryTimeRange) string {
sql := fmt.Sprintf("select instance, max(value) as max_used from metrics_schema.node_memory_swap_used %s group by instance having max_used > 0", timeRange.Condition())
return sql
}

func (i inspectSwapMemoryUsed) genResult(sql string, row chunk.Row) inspectionResult {
return inspectionResult{
tp: "node",
instance: row.GetString(0),
item: i.getItem(),
actual: fmt.Sprintf("%.1f", row.GetFloat64(1)),
expected: "0",
severity: "warning",
}
}

var results []inspectionResult
for _, rule := range rules {
if filter.enable(rule.item) {
rows, _, err := sctx.(sqlexec.RestrictedSQLExecutor).ExecRestrictedSQL(rule.sql)
if err != nil {
sctx.GetSessionVars().StmtCtx.AppendWarning(fmt.Errorf("check load %s failed: %v", rule.item, err))
continue
}
for _, row := range rows {
results = append(results, rule.result(rule.item, rule.expected, row))
}
}
func (inspectSwapMemoryUsed) getItem() string {
return "swap-memory-used"
}

type inspectDiskUsage struct{}

func (inspectDiskUsage) genSQL(timeRange plannercore.QueryTimeRange) string {
sql := fmt.Sprintf("select instance, device, max(value) as max_usage from metrics_schema.node_disk_usage %v and device like '/%%' group by instance, device having max_usage >= 70", timeRange.Condition())
return sql
}
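
Two details in this format string are easy to miss: timeRange.Condition() is assumed to render a where clause that the extra predicate can be appended to with and, and the doubled percent sign is Sprintf escaping, not SQL. A standalone illustration of the escaping:

package main

import "fmt"

func main() {
	// "%%" in a Printf/Sprintf format emits a literal "%", so the rendered
	// predicate is: and device like '/%'. Only device names beginning with
	// "/" match, which is why the tmpfs rows in the test below are ignored.
	fmt.Printf("and device like '/%%'\n") // and device like '/%'
}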

func (i inspectDiskUsage) genResult(sql string, row chunk.Row) inspectionResult {
return inspectionResult{
tp: "node",
instance: row.GetString(0),
item: i.getItem(),
actual: fmt.Sprintf("%.1f%%", row.GetFloat64(2)),
expected: "< 70%",
severity: "warning",
detail: "the disk-usage of " + row.GetString(1) + " is too high",
}
results = append(results, c.inspectCPULoad(sctx, filter)...)
return results
}

func (currentLoadInspection) inspectCPULoad(sctx sessionctx.Context, filter inspectionFilter) []inspectionResult {
var results []inspectionResult
for _, item := range []string{"load1", "load5", "load15"} {
if !filter.enable(item) {
continue
}
sql := fmt.Sprintf(`select t1.*, 0.7 * t2.cpu_core from
(select type, instance, value from information_schema.cluster_load where device_type='cpu' and device_name='cpu' and name='%s') as t1 join
(select type,instance, max(value) as cpu_core from information_schema.CLUSTER_HARDWARE where DEVICE_TYPE='cpu' and name='cpu-logical-cores' group by type,instance) as t2
where t2.instance = t1.instance and t1.type=t2.type and t1.value > 0.7 * t2.cpu_core;`, item)
rows, _, err := sctx.(sqlexec.RestrictedSQLExecutor).ExecRestrictedSQL(sql)
if err != nil {
sctx.GetSessionVars().StmtCtx.AppendWarning(fmt.Errorf("check load %s failed: %v", item, err))
continue
}
for _, row := range rows {
result := inspectionResult{
tp: row.GetString(0),
instance: row.GetString(1),
item: "cpu-" + item,
actual: row.GetString(2),
expected: fmt.Sprintf("< %.1f", row.GetFloat64(3)),
severity: "warning",
detail: "cpu-" + item + " should less than (cpu_logical_cores * 0.7)",
}
results = append(results, result)
}
func (inspectDiskUsage) getItem() string {
return "disk-usage"
}

type inspectCPULoad struct {
item string
tbl string
}

func (i inspectCPULoad) genSQL(timeRange plannercore.QueryTimeRange) string {
sql := fmt.Sprintf(`select t1.instance, t1.max_load , 0.7*t2.cpu_count from
(select instance,max(value) as max_load from metrics_schema.%[1]s %[2]s group by instance) as t1 join
(select instance,max(value) as cpu_count from metrics_schema.node_virtual_cpus %[2]s group by instance) as t2
on t1.instance=t2.instance where t1.max_load>(0.7*t2.cpu_count);`, i.tbl, timeRange.Condition())
return sql
}
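
The %[1]s and %[2]s verbs are indexed format arguments: the table name is substituted once and the time-range condition twice without being passed twice. A standalone illustration:

package main

import "fmt"

func main() {
	// An indexed verb can reference the same argument at several points.
	tbl, cond := "node_load1", "where time >= '2020-02-14 04:20:00'"
	fmt.Printf("from metrics_schema.%[1]s %[2]s join ... %[2]s\n", tbl, cond)
	// from metrics_schema.node_load1 where time >= '2020-02-14 04:20:00' join ... where time >= '2020-02-14 04:20:00'
}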

func (i inspectCPULoad) genResult(sql string, row chunk.Row) inspectionResult {
return inspectionResult{
tp: "node",
instance: row.GetString(0),
item: "cpu-" + i.item,
actual: fmt.Sprintf("%.1f", row.GetFloat64(1)),
expected: fmt.Sprintf("< %.1f", row.GetFloat64(2)),
severity: "warning",
detail: i.getItem() + " should less than (cpu_logical_cores * 0.7)",
}
return results
}

func (i inspectCPULoad) getItem() string {
return "cpu-" + i.item
}
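
The payoff of the refactor is that adding a node check now means writing one more ruleChecker (the interface, renamed from thresholdCheckRule, appears below) and appending it to the rules slice in nodeLoadInspection.inspect. A hedged sketch of a hypothetical extra checker — the metrics_schema.node_inodes_usage table is assumed for illustration, not added by this commit:

// Hypothetical checker following the same shape as the ones above.
type inspectInodeUsage struct{}

func (inspectInodeUsage) genSQL(timeRange plannercore.QueryTimeRange) string {
	// node_inodes_usage is an assumed metric table, used only for illustration.
	return fmt.Sprintf("select instance, max(value) as max_usage from metrics_schema.node_inodes_usage %s group by instance having max_usage >= 80", timeRange.Condition())
}

func (i inspectInodeUsage) genResult(sql string, row chunk.Row) inspectionResult {
	return inspectionResult{
		tp:       "node",
		instance: row.GetString(0),
		item:     i.getItem(),
		actual:   fmt.Sprintf("%.1f%%", row.GetFloat64(1)),
		expected: "< 80%",
		severity: "warning",
		detail:   "the inode-usage is too high",
	}
}

func (inspectInodeUsage) getItem() string { return "inode-usage" }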

func (c criticalErrorInspection) inspect(ctx context.Context, sctx sessionctx.Context, filter inspectionFilter) []inspectionResult {
@@ -855,7 +868,7 @@ func (thresholdCheckInspection) inspectThreshold2(ctx context.Context, sctx sess
return results
}

type thresholdCheckRule interface {
type ruleChecker interface {
genSQL(timeRange plannercore.QueryTimeRange) string
genResult(sql string, row chunk.Row) inspectionResult
getItem() string
@@ -967,7 +980,7 @@ func (c checkStoreRegionTooMuch) getItem() string {
}

func (thresholdCheckInspection) inspectThreshold3(ctx context.Context, sctx sessionctx.Context, filter inspectionFilter) []inspectionResult {
var rules = []thresholdCheckRule{
var rules = []ruleChecker{
compareStoreStatus{
item: "leader-score-balance",
tp: "leader_score",
@@ -986,7 +999,10 @@ func (thresholdCheckInspection) inspectThreshold3(ctx context.Context, sctx sess
checkRegionHealth{},
checkStoreRegionTooMuch{},
}
return checkRules(ctx, sctx, filter, rules)
}

func checkRules(ctx context.Context, sctx sessionctx.Context, filter inspectionFilter, rules []ruleChecker) []inspectionResult {
var results []inspectionResult
for _, rule := range rules {
if !filter.enable(rule.getItem()) {
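
The remainder of checkRules is collapsed in this view. Judging from the per-rule loops that this commit removes above, the body presumably continues in the same exec-and-collect shape — a sketch of the likely continuation, not the verbatim code:

// Presumed continuation (sketch; timeRange assumed to come from the filter/session):
//		continue
//	}
//	sql := rule.genSQL(timeRange)
//	rows, _, err := sctx.(sqlexec.RestrictedSQLExecutor).ExecRestrictedSQL(sql)
//	if err != nil {
//		sctx.GetSessionVars().StmtCtx.AppendWarning(fmt.Errorf("check %s failed: %v", rule.getItem(), err))
//		continue
//	}
//	for _, row := range rows {
//		results = append(results, rule.genResult(sql, row))
//	}
// }
// return results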
96 changes: 74 additions & 22 deletions executor/inspection_result_test.go
@@ -86,18 +86,6 @@ func (s *inspectionResultSuite) TestInspectionResult(c *C) {
types.MakeDatums("pd", "192.168.1.33:1234", "192.168.1.33:1234", "4.0", "m234e"),
},
}
// mock load
mockData[infoschema.TableClusterLoad] = variable.TableSnapshot{
Rows: [][]types.Datum{
types.MakeDatums("tidb", "192.168.1.11:1234", "memory", "virtual", "used-percent", "0.8"),
types.MakeDatums("tidb", "192.168.1.12:1234", "memory", "virtual", "used-percent", "0.6"),
types.MakeDatums("tidb", "192.168.1.13:1234", "memory", "swap", "used-percent", "0"),
types.MakeDatums("tikv", "192.168.1.21:1234", "memory", "swap", "used-percent", "0.6"),
types.MakeDatums("pd", "192.168.1.31:1234", "cpu", "cpu", "load1", "1.0"),
types.MakeDatums("pd", "192.168.1.32:1234", "cpu", "cpu", "load5", "2.0"),
types.MakeDatums("pd", "192.168.1.33:1234", "cpu", "cpu", "load15", "8.0"),
},
}
mockData[infoschema.TableClusterHardware] = variable.TableSnapshot{
Rows: [][]types.Datum{
types.MakeDatums("tikv", "192.168.1.22:1234", "disk", "sda", "used-percent", "80"),
@@ -158,16 +146,6 @@ func (s *inspectionResultSuite) TestInspectionResult(c *C) {
"version git_hash tidb inconsistent consistent critical the cluster has 3 different tidb versions, execute the sql to see more detail: select * from information_schema.cluster_info where type='tidb'",
},
},
{
sql: "select rule, item, type, instance, value, reference, severity, details from information_schema.inspection_result where rule='current-load'",
rows: []string{
"current-load cpu-load1 pd 192.168.1.31:1234 1.0 < 0.7 warning cpu-load1 should less than (cpu_logical_cores * 0.7)",
"current-load cpu-load15 pd 192.168.1.33:1234 8.0 < 7.0 warning cpu-load15 should less than (cpu_logical_cores * 0.7)",
"current-load disk-usage tikv 192.168.1.22:1234 80 < 70 warning current disk-usage is too high, execute the sql to see more detail: select * from information_schema.cluster_hardware where type='tikv' and instance='192.168.1.22:1234' and device_type='disk' and device_name='sda'",
"current-load swap-memory-usage tikv 192.168.1.21:1234 0.6 0 warning ",
"current-load virtual-memory-usage tidb 192.168.1.11:1234 0.8 < 0.7 warning ",
},
},
}

for _, cs := range cases {
@@ -583,3 +561,77 @@ func (s *inspectionResultSuite) TestCriticalErrorInspection(c *C) {
"tikv_engine_write_stall tikv-0 1.00(kv) the total number of errors about 'tikv_engine_write_stall' is too many",
))
}

func (s *inspectionResultSuite) TestNodeLoadInspection(c *C) {
tk := testkit.NewTestKitWithInit(c, s.store)
datetime := func(s string) types.Time {
t, err := types.ParseTime(tk.Se.GetSessionVars().StmtCtx, s, mysql.TypeDatetime, types.MaxFsp)
c.Assert(err, IsNil)
return t
}

// construct some mock abnormal data
mockData := map[string][][]types.Datum{
// columns: time, instance, value
"node_load1": {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "node-0", 28.1),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "node-1", 13.0),
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-0", 10.0),
},
// columns: time, instance, value
"node_load5": {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "node-0", 27.9),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "node-1", 14.1),
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-0", 0.0),
},
// columns: time, instance, value
"node_load15": {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "node-0", 30.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "node-1", 14.1),
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-0", 20.0),
},
// columns: time, instance, value
"node_virtual_cpus": {
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-0", 40.0),
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-1", 20.0),
},
// columns: time, instance, value
"node_memory_usage": {
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-0", 80.0),
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-1", 60.0),
types.MakeDatums(datetime("2020-02-14 05:22:00"), "node-0", 60.0),
},
// columns: time, instance, value
"node_memory_swap_used": {
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-0", 0.0),
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-1", 1.0),
types.MakeDatums(datetime("2020-02-14 05:22:00"), "node-1", 0.0),
},
// columns: time, instance, device, value
"node_disk_usage": {
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-0", "/dev/nvme0", 80.0),
types.MakeDatums(datetime("2020-02-14 05:22:00"), "node-0", "/dev/nvme0", 50.0),
types.MakeDatums(datetime("2020-02-14 05:21:00"), "node-0", "tmpfs", 80.0),
types.MakeDatums(datetime("2020-02-14 05:22:00"), "node-0", "tmpfs", 50.0),
},
}
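
The rows asserted below follow directly from this data: for the cpu checks the reference value is 0.7 × node_virtual_cpus per instance, and a result is emitted only when the window's maximum load exceeds it. A standalone recomputation of that arithmetic:

package main

import "fmt"

func main() {
	// node-0: 40 virtual cpus → threshold 28.0; node-1: 20 → threshold 14.0.
	cpus := map[string]float64{"node-0": 40, "node-1": 20}
	maxLoad := map[string]map[string]float64{
		"load1":  {"node-0": 28.1, "node-1": 13.0},
		"load5":  {"node-0": 27.9, "node-1": 14.1},
		"load15": {"node-0": 30.0, "node-1": 14.1},
	}
	for item, loads := range maxLoad {
		for node, v := range loads {
			if threshold := 0.7 * cpus[node]; v > threshold {
				fmt.Printf("cpu-%s %s %.1f < %.1f\n", item, node, v, threshold)
			}
		}
	}
	// Fires: load1/node-0 (28.1 > 28.0), load5/node-1 (14.1 > 14.0),
	// load15/node-0 (30.0 > 28.0), load15/node-1 (14.1 > 14.0) — the four
	// cpu-* rows asserted below; load5/node-0 peaks at 27.9 and just misses.
}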

ctx := s.setupForThresholdCheck(c, mockData)
defer s.tearDownForThresholdCheck(c)

rs, err := tk.Se.Execute(ctx, `select /*+ time_range('2020-02-14 04:20:00','2020-02-14 05:23:00') */
item, type, instance, value, reference, details from information_schema.inspection_result
where rule='node-load' order by item, value`)
c.Assert(err, IsNil)
result := tk.ResultSetToResultWithCtx(ctx, rs[0], Commentf("execute inspect SQL failed"))
c.Assert(tk.Se.GetSessionVars().StmtCtx.WarningCount(), Equals, uint16(0), Commentf("unexpected warnings: %+v", tk.Se.GetSessionVars().StmtCtx.GetWarnings()))
result.Check(testkit.Rows(
"cpu-load1 node node-0 28.1 < 28.0 cpu-load1 should less than (cpu_logical_cores * 0.7)",
"cpu-load15 node node-1 14.1 < 14.0 cpu-load15 should less than (cpu_logical_cores * 0.7)",
"cpu-load15 node node-0 30.0 < 28.0 cpu-load15 should less than (cpu_logical_cores * 0.7)",
"cpu-load5 node node-1 14.1 < 14.0 cpu-load5 should less than (cpu_logical_cores * 0.7)",
"disk-usage node node-0 80.0% < 70% the disk-usage of /dev/nvme0 is too high",
"swap-memory-used node node-1 1.0 0 ",
"virtual-memory-usage node node-0 80.0% < 70% the memory-usage is too high",
))
}
12 changes: 8 additions & 4 deletions infoschema/metric_table_def.go
@@ -2030,10 +2030,14 @@ var MetricTableMap = map[string]MetricTableDef{
PromQL: `node_memory_MemAvailable_bytes{$LABEL_CONDITIONS}`,
Labels: []string{"instance"},
},
"node_total_memory_swap": {
PromQL: `node_memory_SwapTotal_bytes{$LABEL_CONDITIONS}`,
"node_memory_usage": {
PromQL: `100* (1-(node_memory_MemAvailable_bytes{$LABEL_CONDITIONS}/node_memory_MemTotal_bytes{$LABEL_CONDITIONS}))`,
Labels: []string{"instance"},
},
"node_memory_swap_used": {
PromQL: `node_memory_SwapTotal_bytes{$LABEL_CONDITIONS} - node_memory_SwapFree_bytes{$LABEL_CONDITIONS}`,
Labels: []string{"instance"},
Comment: "node total memory swap",
Comment: "bytes used of node swap memory",
},
"node_uptime": {
PromQL: `node_time_seconds{$LABEL_CONDITIONS} - node_boot_time_seconds{$LABEL_CONDITIONS}`,
@@ -2146,7 +2150,7 @@ var MetricTableMap = map[string]MetricTableDef{
Labels: []string{"instance", "device"},
Comment: "Units is byte",
},
"node_filesystem_space_used": {
"node_disk_usage": {
PromQL: `((node_filesystem_size_bytes{$LABEL_CONDITIONS} - node_filesystem_avail_bytes{$LABEL_CONDITIONS}) / node_filesystem_size_bytes{$LABEL_CONDITIONS}) * 100`,
Labels: []string{"instance", "device"},
Comment: "Filesystem used space. If is > 80% then is Critical.",
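
For the node_memory_usage definition above, the expression 100 * (1 - MemAvailable / MemTotal) can be sanity-checked with illustrative figures (not measurements):

package main

import "fmt"

func main() {
	// 16 GiB total with 3.2 GiB still available → 80% used.
	memTotalGiB, memAvailGiB := 16.0, 3.2
	usage := 100 * (1 - memAvailGiB/memTotalGiB)
	fmt.Printf("node_memory_usage = %.1f%%\n", usage) // node_memory_usage = 80.0%
	// The virtual-memory-usage rule reports instances at >= 70, so this
	// node would appear in inspection_result.
}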
