Skip to content

Commit

Permalink
feat(sqlserver): sqlserver巡检任务 #8033
Browse files Browse the repository at this point in the history
  • Loading branch information
yksitu authored and iSecloud committed Dec 6, 2024
1 parent 3e99a54 commit ddef38d
Show file tree
Hide file tree
Showing 10 changed files with 986 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@
"no_data_config": {
"level": 2,
"continuous": 10,
"is_enabled": false,
"agg_dimension": []
"is_enabled": true,
"agg_dimension": [
"bk_target_service_instance_id"
]
},
"target": [],
"expression": "a",
Expand Down Expand Up @@ -88,14 +90,81 @@
"connector": "and"
}
],
"actions": [],
"actions": [
{
"id": 168888,
"config_id": 137317,
"user_groups": [],
"user_type": "main",
"signal": [
"abnormal"
],
"options": {
"end_time": "23:59:59",
"start_time": "00:00:00",
"converge_config": {
"count": 1,
"condition": [
{
"value": [
"self"
],
"dimension": "action_info"
}
],
"timedelta": 60,
"is_enabled": false,
"converge_func": "skip_when_success",
"need_biz_converge": true
}
},
"relate_type": "ACTION",
"config": {
"id": 137317,
"name": "dbm_autofix_http_callback",
"desc": "",
"bk_biz_id": "5005578",
"plugin_id": "2",
"execute_config": {
"template_detail": {
"need_poll": false,
"notify_interval": 60,
"interval_notify_mode": "standard",
"method": "POST",
"url": "",
"headers": [],
"authorize": {
"auth_type": "bearer_token",
"auth_config": {
"token": ""
}
},
"body": {
"data_type": "raw",
"params": [],
"content": "{\"callback_message\": {{alarm.callback_message}},\"appointees\": \"{{alarm.appointees}}\"}",
"content_type": "json"
},
"query_params": [],
"failed_retry": {
"is_enabled": true,
"timeout": 10,
"max_retry_times": 2,
"retry_interval": 2
}
},
"timeout": 600
}
}
}
],
"notice": {
"config_id": 47942,
"config_id": 118363,
"user_groups": [],
"user_type": "main",
"signal": [
"no_data",
"abnormal"
"abnormal",
"no_data"
],
"options": {
"end_time": "23:59:59",
Expand Down Expand Up @@ -218,17 +287,17 @@
"template": [
{
"signal": "abnormal",
"message_tmpl": "{{content.level}}\n{{content.begin_time}}\n{{content.time}}\n{{content.duration}}\n{{content.target_type}}\n{{content.data_source}}\n{{content.content}}\n{{content.current_value}}\n{{content.biz}}\n{{content.target}}\n{{content.dimension}}\n{{content.detail}}\n{{content.assign_detail}}\n{{content.related_info}}",
"message_tmpl": "{{content.level}}\n{{content.begin_time}}\n{{content.time}}\n{{content.duration}}\n{{content.target_type}}\n{{content.data_source}}\n{{content.content}}\n{{content.current_value}}\n{{content.biz}}\n{{content.target}}\n{{content.dimension}}\n{{content.detail}}\n{{content.assign_detail}}\n通知人:{{alarm.receivers}}\n{{content.related_info}}",
"title_tmpl": "{{business.bk_biz_name}} - {{alarm.name}}{{alarm.display_type}}"
},
{
"signal": "recovered",
"message_tmpl": "{{content.level}}\n{{content.begin_time}}\n{{content.time}}\n{{content.duration}}\n{{content.target_type}}\n{{content.data_source}}\n{{content.content}}\n{{content.current_value}}\n{{content.biz}}\n{{content.target}}\n{{content.dimension}}\n{{content.detail}}\n{{content.assign_detail}}\n{{content.related_info}}",
"message_tmpl": "{{content.level}}\n{{content.begin_time}}\n{{content.time}}\n{{content.duration}}\n{{content.target_type}}\n{{content.data_source}}\n{{content.content}}\n{{content.current_value}}\n{{content.biz}}\n{{content.target}}\n{{content.dimension}}\n{{content.detail}}\n{{content.assign_detail}}\n通知人:{{alarm.receivers}}\n{{content.related_info}}",
"title_tmpl": "{{business.bk_biz_name}} - {{alarm.name}}{{alarm.display_type}}"
},
{
"signal": "closed",
"message_tmpl": "{{content.level}}\n{{content.begin_time}}\n{{content.time}}\n{{content.duration}}\n{{content.target_type}}\n{{content.data_source}}\n{{content.content}}\n{{content.current_value}}\n{{content.biz}}\n{{content.target}}\n{{content.dimension}}\n{{content.detail}}\n{{content.assign_detail}}\n{{content.related_info}}",
"message_tmpl": "{{content.level}}\n{{content.begin_time}}\n{{content.time}}\n{{content.duration}}\n{{content.target_type}}\n{{content.data_source}}\n{{content.content}}\n{{content.current_value}}\n{{content.biz}}\n{{content.target}}\n{{content.dimension}}\n{{content.detail}}\n{{content.assign_detail}}\n通知人:{{alarm.receivers}}\n{{content.related_info}}",
"title_tmpl": "{{business.bk_biz_name}} - {{alarm.name}}{{alarm.display_type}}"
}
]
Expand All @@ -250,8 +319,8 @@
},
"is_enabled": true,
"monitor_indicator": "MAX(mssql_serveice_available)",
"version": 2,
"version": 3,
"alert_source": "time_series",
"custom_conditions": [],
"export_at": "2024-04-02T16:01:58+08:00"
"export_at": "2024-11-29T16:28:30+08:00"
}
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,267 @@
"""
TencentBlueKing is pleased to support the open source community by making 蓝鲸智云-DB管理系统(BlueKing-BK-DBM) available.
Copyright (C) 2017-2023 THL A29 Limited, a Tencent company. All rights reserved.
Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at https://opensource.org/licenses/MIT
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
"""

from backend.db_meta.enums import ClusterPhase, ClusterType, InstanceInnerRole, InstanceRole, InstanceStatus
from backend.db_meta.models import Cluster, StorageInstance
from backend.db_meta.models.storage_set_dtl import SqlserverClusterSyncMode
from backend.db_report.models.sqlserver_check_report import (
SqlserverCheckAppSettingReport,
SqlserverCheckJobSyncReport,
SqlserverCheckLinkServerReport,
SqlserverCheckSysJobStatuReport,
SqlserverCheckUserSyncReport,
)
from backend.flow.utils.sqlserver.sqlserver_bk_config import (
get_module_infos,
get_sqlserver_alarm_config,
get_sqlserver_backup_config,
)
from backend.flow.utils.sqlserver.sqlserver_db_function import (
check_ha_config,
check_sys_job_status,
fix_app_setting_data,
get_app_setting_data,
insert_sqlserver_config,
)


class CheckAppSettingData(object):
    """
    Periodic inspection of SQL Server clusters, treating the DBM metadata as the source of truth.

    For every ONLINE SqlserverHA / SqlserverSingle cluster the inspection:
      * compares each running instance's app_setting data with the DBM metadata and tries to
        insert or repair it when it is missing or different;
      * checks the system job status of each running instance;
      * for HA clusters, compares users, jobs and link servers between the master and each
        running slave.

    Findings are persisted as rows in the corresponding ``SqlserverCheck*Report`` models so that
    the responsible DBA can be informed.
    """

    def __init__(self):
        # Collect every cluster in ONLINE phase, prefetching its storage instances and their machines.
        self.clusters = Cluster.objects.prefetch_related(
            "storageinstance_set",
            "storageinstance_set__machine",
        ).filter(phase=ClusterPhase.ONLINE, cluster_type__in=[ClusterType.SqlserverHA, ClusterType.SqlserverSingle])

    def check_task(self):
        """
        Run the whole inspection over every collected cluster.

        The app_setting check and the system-job check run for all clusters; the master/slave
        consistency checks (user, job, link server) run only for HA clusters, once per running slave.
        """
        for cluster in self.clusters:
            self.check_app_setting_data(cluster)
            self.check_job_is_disabled(cluster)
            if cluster.cluster_type == ClusterType.SqlserverHA:
                master = cluster.storageinstance_set.get(instance_inner_role=InstanceInnerRole.MASTER)
                for s in cluster.storageinstance_set.filter(
                    status=InstanceStatus.RUNNING, instance_inner_role=InstanceInnerRole.SLAVE
                ):
                    self.check_user(master_instance=master, slave_instance=s, cluster=cluster)
                    self.check_job(master_instance=master, slave_instance=s, cluster=cluster)
                    self.check_link_server(master_instance=master, slave_instance=s, cluster=cluster)

    @staticmethod
    def fix_app_setting_data(cluster: Cluster, instance: StorageInstance, sync_mode: str, master: StorageInstance):
        """
        Repair app_setting data that differs from the DBM metadata and record the outcome.

        :param cluster: cluster the instance belongs to
        :param instance: instance whose app_setting data is inconsistent
        :param sync_mode: expected synchronous mode ("" for single-node clusters)
        :param master: master (or orphan) instance of the cluster
        :return: always True; the repair result itself is stored in the report row
        """
        status, msg = fix_app_setting_data(cluster=cluster, instance=instance, sync_mode=sync_mode, master=master)
        SqlserverCheckAppSettingReport.objects.create(
            cluster=cluster.name,
            cluster_type=cluster.cluster_type,
            instance_host=instance.machine.ip,
            instance_port=instance.port,
            is_inconsistent=1,
            is_fix=1 if status else 0,
            status=status,
            msg=msg,
        )
        return True

    @staticmethod
    def add_app_setting_data(cluster: Cluster, instance: StorageInstance):
        """
        Insert app_setting data for an instance that has none and record the outcome.

        :param cluster: cluster the instance belongs to
        :param instance: instance that is missing app_setting data
        :return: always True; the insert result itself is stored in the report row
        """
        is_fix = 0
        fix_status = False
        msg = "fix failed"

        # Character set configured for the cluster's module.
        charset = get_module_infos(
            bk_biz_id=cluster.bk_biz_id,
            db_module_id=cluster.db_module_id,
            cluster_type=ClusterType(cluster.cluster_type),
        )["charset"]

        # Backup configuration of the cluster.
        backup_config = get_sqlserver_backup_config(
            bk_biz_id=cluster.bk_biz_id,
            db_module_id=cluster.db_module_id,
            cluster_domain=cluster.immute_domain,
        )

        # Cluster-specific (alarm) configuration.
        alarm_config = get_sqlserver_alarm_config(
            bk_biz_id=cluster.bk_biz_id,
            db_module_id=cluster.db_module_id,
            cluster_domain=cluster.immute_domain,
        )

        # Write the configuration. This stays best-effort (a failure must not abort the
        # inspection), but the failure reason is kept in the report instead of being discarded.
        try:
            fix_status = insert_sqlserver_config(
                cluster=cluster,
                storages=[instance],
                charset=charset,
                backup_config=backup_config,
                alarm_config=alarm_config,
            )
        except Exception as e:
            fix_status = False
            msg = "fix failed: {}".format(e)

        if fix_status:
            is_fix = 1
            msg = "fix successfully"

        SqlserverCheckAppSettingReport.objects.create(
            cluster=cluster.name,
            cluster_type=cluster.cluster_type,
            instance_host=instance.machine.ip,
            instance_port=instance.port,
            is_inconsistent=1,
            is_fix=is_fix,
            status=fix_status,
            msg=msg,
        )
        return True

    def check_app_setting_data(self, cluster: Cluster):
        """
        Compare the app_setting data of every running instance in the cluster with the DBM metadata.

        Missing data is inserted, differing data is repaired, and unreachable instances are
        recorded as failed checks.

        :param cluster: cluster to inspect
        """
        master = cluster.storageinstance_set.get(instance_role__in=[InstanceRole.ORPHAN, InstanceRole.BACKEND_MASTER])
        if cluster.cluster_type == ClusterType.SqlserverHA:
            sync_mode = SqlserverClusterSyncMode.objects.get(cluster_id=cluster.id).sync_mode
        else:
            # Non-HA clusters are compared against an empty sync mode.
            sync_mode = ""

        # Iterate over the cluster's instances that are in RUNNING status.
        for instance in cluster.storageinstance_set.filter(status=InstanceStatus.RUNNING):
            data, err = get_app_setting_data(instance=instance, bk_cloud_id=cluster.bk_cloud_id)
            if data is None:
                # None most likely means the instance could not be queried: record the error
                # and skip the comparison for this round.
                SqlserverCheckAppSettingReport.objects.create(
                    cluster=cluster.name,
                    cluster_type=cluster.cluster_type,
                    instance_host=instance.machine.ip,
                    instance_port=instance.port,
                    is_inconsistent=1,
                    is_fix=0,
                    status=False,
                    msg=err,
                )
                continue

            if len(data) == 0:
                # No app_setting configured on the instance: insert it again.
                self.add_app_setting_data(cluster=cluster, instance=instance)

            elif (
                int(data["APP"]) != cluster.bk_biz_id
                or int(data["BK_BIZ_ID"]) != cluster.bk_biz_id
                or int(data["BK_CLOUD_ID"]) != cluster.bk_cloud_id
                or int(data["CLUSTER_ID"]) != cluster.id
                or data["CLUSTER_DOMAIN"] != cluster.immute_domain
                or int(data["PORT"]) != instance.port
                or data["ROLE"] != instance.instance_inner_role
                or data["SYNCHRONOUS_MODE"] != sync_mode
                or data["MASTER_IP"] != master.machine.ip
                or int(data["MASTER_PORT"]) != master.port
            ):
                # At least one field differs from the metadata: try to repair it.
                self.fix_app_setting_data(cluster=cluster, instance=instance, sync_mode=sync_mode, master=master)

    @staticmethod
    def check_user(master_instance: StorageInstance, slave_instance: StorageInstance, cluster: Cluster):
        """
        Check whether the users on the master and the slave are consistent.

        A report row is written only when the check does not pass.
        """
        status, msg = check_ha_config(
            master_instance=master_instance,
            slave_instance=slave_instance,
            bk_cloud_id=cluster.bk_cloud_id,
            check_tag="user",
        )
        if not status:
            SqlserverCheckUserSyncReport.objects.create(
                cluster=cluster.name,
                cluster_type=cluster.cluster_type,
                instance_host=slave_instance.machine.ip,
                instance_port=slave_instance.port,
                is_user_inconsistent=1,
                status=status,
                msg=msg,
            )

    @staticmethod
    def check_job(master_instance: StorageInstance, slave_instance: StorageInstance, cluster: Cluster):
        """
        Check whether the business jobs on the master and the slave are consistent.

        A report row is written only when the check does not pass.
        """
        status, msg = check_ha_config(
            master_instance=master_instance,
            slave_instance=slave_instance,
            bk_cloud_id=cluster.bk_cloud_id,
            check_tag="job",
        )
        if not status:
            SqlserverCheckJobSyncReport.objects.create(
                cluster=cluster.name,
                cluster_type=cluster.cluster_type,
                instance_host=slave_instance.machine.ip,
                instance_port=slave_instance.port,
                is_job_inconsistent=1,
                status=status,
                msg=msg,
            )

    @staticmethod
    def check_link_server(master_instance: StorageInstance, slave_instance: StorageInstance, cluster: Cluster):
        """
        Check whether the link servers on the master and the slave are consistent.

        A report row is written only when the check does not pass.
        """
        status, msg = check_ha_config(
            master_instance=master_instance,
            slave_instance=slave_instance,
            bk_cloud_id=cluster.bk_cloud_id,
            # Previously "job" (copied from check_job), which repeated the job comparison and
            # reported its result as a link-server inconsistency.
            # NOTE(review): confirm "link_server" is the tag check_ha_config expects.
            check_tag="link_server",
        )
        if not status:
            SqlserverCheckLinkServerReport.objects.create(
                cluster=cluster.name,
                cluster_type=cluster.cluster_type,
                instance_host=slave_instance.machine.ip,
                instance_port=slave_instance.port,
                is_link_server_inconsistent=1,
                status=status,
                msg=msg,
            )

    @staticmethod
    def check_job_is_disabled(cluster: Cluster):
        """
        Check the system job status of every running instance in the cluster.

        A report row is written only for instances whose check does not pass.
        """
        # Iterate over the cluster's instances that are in RUNNING status.
        for instance in cluster.storageinstance_set.filter(status=InstanceStatus.RUNNING):
            status, msg = check_sys_job_status(cluster=cluster, instance=instance)
            if not status:
                # Only abnormal results are recorded.
                SqlserverCheckSysJobStatuReport.objects.create(
                    cluster=cluster.name,
                    cluster_type=cluster.cluster_type,
                    instance_host=instance.machine.ip,
                    instance_port=instance.port,
                    is_job_disable=1,
                    status=status,
                    msg=msg,
                )
Loading

0 comments on commit ddef38d

Please sign in to comment.