fix(sqlserver): add SQLServer backup-file inspection and metadata inspection periodic tasks #8825
yksitu authored and iSecloud committed Jan 6, 2025
1 parent 4fccdaf commit 002da96
Showing 13 changed files with 493 additions and 6 deletions.
6 changes: 6 additions & 0 deletions dbm-ui/backend/components/mysql_backup/client.py
@@ -42,7 +42,13 @@ def __init__(self):
            default_timeout=300,
            max_retry_times=1,
        )
        self.query_for_task_ids = self.generate_data_api(
            method="POST",
            url="backupapi/queryTasks",
            description=_("Fetch backup file information for a list of backup task ids"),
        )


MysqlBackupApi = _BackupApi()
RedisBackupApi = _BackupApi()
SQLServerBackupApi = _BackupApi()
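A minimal usage sketch of the new endpoint (the task ids are made up; the payload shape and the status semantics mirror the calls made in the backup inspection task further down):

# Query upload status for a couple of hypothetical backup task ids.
task_infos = SQLServerBackupApi.query_for_task_ids({"task_ids": ["20250105-0001", "20250105-0002"]})
for info in task_infos:
    # The inspection task below treats status == 4 as "upload finished".
    print(info["task_id"], info["status"])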
@@ -8,6 +8,6 @@
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
"""
from .db_meta_check import db_meta_check_task
from .db_meta_check import db_meta_check_task, sqlserver_topo_daily_check
from .sync_cluster_stat import sync_cluster_stat_from_monitor
from .update_app_cache import update_app_cache
@@ -8,4 +8,4 @@
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
"""
from .task import db_meta_check_task
from .task import db_meta_check_task, sqlserver_topo_daily_check
@@ -0,0 +1,10 @@
"""
TencentBlueKing is pleased to support the open source community by making 蓝鲸智云-DB管理系统(BlueKing-BK-DBM) available.
Copyright (C) 2017-2023 THL A29 Limited, a Tencent company. All rights reserved.
Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at https://opensource.org/licenses/MIT
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
"""
from .check import sqlserver_dbmeta_check
@@ -0,0 +1,77 @@
"""
TencentBlueKing is pleased to support the open source community by making 蓝鲸智云-DB管理系统(BlueKing-BK-DBM) available.
Copyright (C) 2017-2023 THL A29 Limited, a Tencent company. All rights reserved.
Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at https://opensource.org/licenses/MIT
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
"""
from typing import List

from backend.db_meta.enums import ClusterType
from backend.db_meta.models import Cluster
from backend.db_periodic_task.local_tasks.db_meta.db_meta_check.mysql_cluster_topo.check_response import CheckResponse
from backend.db_periodic_task.local_tasks.db_meta.db_meta_check.mysql_cluster_topo.tendbha.replicate import (
    _cluster_master_as_ejector,
    _cluster_slave_as_receiver,
)
from backend.db_periodic_task.local_tasks.db_meta.db_meta_check.mysql_cluster_topo.tendbha.status import (
    _cluster_master_entry_count,
    _cluster_master_status,
    _cluster_one_master,
    _cluster_one_standby_slave,
    _cluster_standby_slave_status,
    _cluster_status,
)
from backend.db_periodic_task.local_tasks.db_meta.db_meta_check.mysql_cluster_topo.tendbha.unique_cluster import (
    _cluster_instance_unique_cluster,
)


def sqlserver_dbmeta_check(cluster_id: int) -> List[CheckResponse]:
    """
    SQLServer cluster check items (items 4-9 apply to HA clusters only):
    1: cluster status is normal
    2: every instance belongs to exactly one cluster
    3: number of master entries >= 1
    4: exactly one master
    5: master status is normal
    6: exactly one standby slave
    7: standby slave status is normal
    8: master only acts as ejector
    9: slave only acts as receiver
    """
    # Fetch all metadata for the cluster in one query
    clusters = Cluster.objects.filter(id=cluster_id).prefetch_related(
        "clusterentry_set__storageinstance_set",
        "storageinstance_set__as_receiver__ejector__cluster",
        "storageinstance_set__as_ejector__receiver__cluster",
        "storageinstance_set__cluster",
    )

    res = []
    for cluster_obj in clusters:
        # Check the cluster status
        res.extend(_cluster_status(cluster_obj))
        # Every instance belongs to exactly one cluster
        res.extend(_cluster_instance_unique_cluster(cluster_obj))
        # Number of master entries >= 1
        res.extend(_cluster_master_entry_count(cluster_obj))

        # The following sub-checks only apply to the HA architecture
        if cluster_obj.cluster_type == ClusterType.SqlserverHA:
            # Exactly one master
            res.extend(_cluster_one_master(cluster_obj))
            # Master status
            res.extend(_cluster_master_status(cluster_obj))
            # Exactly one standby slave
            res.extend(_cluster_one_standby_slave(cluster_obj))
            # Standby slave status is normal
            res.extend(_cluster_standby_slave_status(cluster_obj))
            # Master only acts as ejector
            res.extend(_cluster_master_as_ejector(cluster_obj))
            # Slave only acts as receiver
            res.extend(_cluster_slave_as_receiver(cluster_obj))

    return res
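A brief usage sketch (the cluster id is hypothetical); this mirrors what the daily periodic task below does for every online SQLServer cluster:

# Run the topology check for one cluster and persist any findings as MetaCheckReport rows.
for report in sqlserver_dbmeta_check(cluster_id=1):
    report.save()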
@@ -12,13 +12,14 @@

from celery.schedules import crontab

from backend.db_meta.enums import ClusterType
from backend.db_meta.enums import ClusterPhase, ClusterType
from backend.db_meta.models import Cluster
from backend.db_periodic_task.local_tasks.register import register_periodic_task
from backend.db_report.models import MetaCheckReport

from .check_redis_instance import check_redis_instance
from .mysql_cluster_topo.tendbha import health_check
from .sqlserver_cluster_topo.check import sqlserver_dbmeta_check

logger = logging.getLogger("celery")

@@ -37,3 +38,14 @@ def tendbha_topo_daily_check():
        r: MetaCheckReport
        for r in health_check(c.id):
            r.save()


@register_periodic_task(run_every=crontab(hour=5, minute=30))
def sqlserver_topo_daily_check():
    # Only check clusters in the ONLINE phase
    for c in Cluster.objects.filter(
        phase=ClusterPhase.ONLINE, cluster_type__in=[ClusterType.SqlserverHA, ClusterType.SqlserverSingle]
    ):
        r: MetaCheckReport
        for r in sqlserver_dbmeta_check(c.id):
            r.save()
@@ -7,4 +7,4 @@
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
"""
from .task import check_instance_app_setting
from .task import check_backup_info, check_instance_app_setting
@@ -0,0 +1,176 @@
"""
TencentBlueKing is pleased to support the open source community by making 蓝鲸智云-DB管理系统(BlueKing-BK-DBM) available.
Copyright (C) 2017-2023 THL A29 Limited, a Tencent company. All rights reserved.
Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at https://opensource.org/licenses/MIT
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
"""

import logging
from collections import defaultdict
from datetime import datetime, time, timedelta
from typing import Dict, List, Set

from django.utils import timezone

from backend.components.bklog.handler import BKLogHandler
from backend.components.mysql_backup.client import SQLServerBackupApi
from backend.db_meta.enums import ClusterPhase, ClusterType
from backend.db_meta.models import Cluster
from backend.db_report.models.sqlserver_check_report import SqlserverFullBackupInfoReport, SqlserverLogBackupInfoReport

logger = logging.getLogger("root")


class CheckBackupInfo(object):
"""
已dbm元数据为准
检查实例的app_setting表的信息是否符合预期,如果存在信息不一致,则需要已某种方式输出告知相关DBA
"""

def __init__(self):
# 获取所有的online状态的cluster
self.clusters = Cluster.objects.prefetch_related(
"storageinstance_set",
"storageinstance_set__machine",
).filter(phase=ClusterPhase.ONLINE, cluster_type__in=[ClusterType.SqlserverHA, ClusterType.SqlserverSingle])
# 拼装查询的时间区间, 查找当前00点到前一天的00点
today = datetime.now(timezone.utc).date()
midnight_utc = datetime.combine(today, time(), tzinfo=timezone.utc)
self.start_time = midnight_utc - timedelta(days=1)
self.end_time = midnight_utc
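        # Example (illustrative dates): when the task runs on 2025-01-06, the inspected
        # window is [2025-01-05 00:00, 2025-01-06 00:00) UTC.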

    def __query_log_bk_log(self, cluster: Cluster, collector: str):
        return BKLogHandler.query_logs(
            collector=collector,
            start_time=self.start_time,
            end_time=self.end_time,
            query_string=f"cluster_id: {cluster.id}",
            size=10000,
            sorting_rule="asc",
        )

    def check_task(self):
        for cluster in self.clusters:
            self.check_full_backup_info_cluster(cluster)
            self.check_log_backup_info_cluster(cluster)

    def check_full_backup_info_cluster(self, cluster: Cluster):
        """
        Check the full-backup information of the cluster:
        1: every backup file has a backup record
        2: every backup file has been uploaded to the backup system
        """
        # Fetch the full-backup records to inspect
        backup_infos = self.__query_log_bk_log(cluster=cluster, collector="mssql_dbbackup_result")
        # Check whether any backup task is missing records
        check_result, is_normal = self.check_backup_info_in_bk_log(backup_infos)
        # Write the result to the inspection report
        SqlserverFullBackupInfoReport.objects.create(
            bk_cloud_id=cluster.bk_cloud_id,
            bk_biz_id=cluster.bk_biz_id,
            cluster=cluster.name,
            cluster_type=cluster.cluster_type,
            status=is_normal,
            msg=check_result,
        )
        return

    def check_log_backup_info_cluster(self, cluster: Cluster):
        """
        Check the log (incremental) backup information of the cluster:
        1: every backup file has a backup record
        2: every backup file has been uploaded to the backup system
        """
        # Fetch the log-backup records to inspect
        backup_infos = self.__query_log_bk_log(cluster=cluster, collector="mssql_binlog_result")
        # Check whether any backup task is missing records
        check_result, is_normal = self.check_backup_info_in_bk_log(backup_infos)
        # Write the result to the inspection report
        SqlserverLogBackupInfoReport.objects.create(
            bk_cloud_id=cluster.bk_cloud_id,
            bk_biz_id=cluster.bk_biz_id,
            cluster=cluster.name,
            cluster_type=cluster.cluster_type,
            status=is_normal,
            msg=check_result,
        )
        return

    def check_backup_info_in_bk_log(self, backup_infos: list):
        """
        Aggregate the backup records pulled from bk_log by backup_id and validate them.
        """
        check_result = ""
        is_normal = True
        if not backup_infos:
            # No backup records were found in the window, so return early with a failure
            return f"backup-info is null , check [{self.start_time}-{self.end_time}]", False

        # Group the backup records by backup id
        backup_id__logs: Dict[str, List] = defaultdict(list)
        for log in backup_infos:
            backup_id__logs[log["backup_id"]].append(log)

        # De-duplicate each group: the same backup id must not contain the same dbname twice
        backup_id__valid_logs: Dict[str, List] = defaultdict(list)
        for backup_id, logs in backup_id__logs.items():
            dbname_set: Set[str] = set()
            for log in logs:
                if log["dbname"] not in dbname_set:
                    backup_id__valid_logs[backup_id].append(log)
                    dbname_set.add(log["dbname"])

        # Validate the backup task behind each backup_id
        for backup_id, logs in backup_id__valid_logs.items():
            # Query the backup system for the upload status of this backup task
            task_ids = [i["task_id"] for i in logs]
            result = self.check_backup_file_in_backup_system(task_ids=task_ids)
            if result:
                check_result += f"[{backup_id}] {result}\n"
                is_normal = False

            # The number of records must match the file count reported to bk_log
            if len(logs) != logs[0]["file_cnt"]:
                check_result += f"Backup tasks[{backup_id}] are missing backup records, check\n"
                is_normal = False

        if not check_result:
            # Everything checked out
            return f"backup info check ok [{self.start_time}-{self.end_time}]", is_normal

        return check_result, is_normal

    @staticmethod
    def check_backup_file_in_backup_system(task_ids: list):
        """
        Query the backup system with the given task_id list and check whether every
        backup file was uploaded successfully.
        """
        max_length = 100
        check_result = []
        if len(task_ids) > max_length:
            # Split the list into batches if it exceeds the maximum request size
            split_lists = [task_ids[i : i + max_length] for i in range(0, len(task_ids), max_length)]
        else:
            # Otherwise query with the original list
            split_lists = [task_ids]
        for task_list in split_lists:
            # Query batch by batch
            check_result.extend(SQLServerBackupApi.query_for_task_ids({"task_ids": task_list}))

        # Compare lengths
        if len(task_ids) != len(check_result):
            # Fewer results than requested task ids means some files are missing from the backup system
            return "some backup files are not in the backup system, check"

        # Check the upload status code of every backup file; anything other than 4 (upload finished) is abnormal
        not_success_task_id_list = []
        for info in check_result:
            if info["status"] != 4:
                not_success_task_id_list.append(info["task_id"])
        if not_success_task_id_list:
            return f"some backup files failed to upload, check:{not_success_task_id_list}"

        return ""
12 changes: 11 additions & 1 deletion dbm-ui/backend/db_periodic_task/local_tasks/sqlserver/task.py
@@ -13,6 +13,7 @@
from celery.schedules import crontab

from backend.db_periodic_task.local_tasks.register import register_periodic_task
from backend.db_periodic_task.local_tasks.sqlserver.backup_file_check import CheckBackupInfo
from backend.db_periodic_task.local_tasks.sqlserver.check_app_setting_data import CheckAppSettingData

logger = logging.getLogger("celery")
@@ -22,6 +23,15 @@
def check_instance_app_setting():
    """
    Check that the instance metadata table (app_setting) is as expected
    Runs daily at 07:00
    Runs daily at 06:30
    """
    CheckAppSettingData().check_task()


@register_periodic_task(run_every=crontab(minute=00, hour=8))
def check_backup_info():
    """
    Generate the inspection report for cluster backup information
    Runs daily at 08:00
    """
    CheckBackupInfo().check_task()