Skip to content

Commit

Permalink
feat(backend): 独立托管业务自动更新监控策略目标范围 #7128
Browse files Browse the repository at this point in the history
  • Loading branch information
zhangzhw8 committed Oct 8, 2024
1 parent a4acc85 commit e022766
Show file tree
Hide file tree
Showing 7 changed files with 247 additions and 218 deletions.
File renamed without changes.
119 changes: 117 additions & 2 deletions dbm-ui/backend/db_monitor/models/alarm.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import datetime
import json
import logging
import os
from collections import defaultdict
from typing import Any, Dict, List

Expand All @@ -35,14 +36,25 @@
DEFAULT_ALERT_NOTICE,
PLAT_PRIORITY,
TARGET_LEVEL_TO_PRIORITY,
TPLS_ALARM_DIR,
AlertSourceEnum,
DutyRuleCategory,
PolicyStatus,
TargetLevel,
TargetPriority,
)
from backend.db_monitor.exceptions import (
BkMonitorDeleteAlarmException,
BkMonitorSaveAlarmException,
BuiltInNotAllowDeleteException,
)
from backend.db_monitor.exceptions import BkMonitorDeleteAlarmException, BuiltInNotAllowDeleteException
from backend.db_monitor.tasks import update_app_policy
from backend.db_monitor.utils import bkm_delete_alarm_strategy, bkm_save_alarm_strategy, render_promql_sql
from backend.db_monitor.utils import (
bkm_delete_alarm_strategy,
bkm_save_alarm_strategy,
get_dbm_autofix_action_id,
render_promql_sql,
)
from backend.exceptions import ApiError

__all__ = ["NoticeGroup", "AlertRule", "RuleTemplate", "DispatchGroup", "MonitorPolicy", "DutyRule"]
Expand Down Expand Up @@ -996,6 +1008,109 @@ def get_dbha_policies(cls):
cls.objects.filter(details__labels__contains=["/DBM_DBHA/"]).values_list("monitor_policy_id", flat=True)
)

@classmethod
def sync_plat_monitor_policy(cls, action_id=None, db_type=None, force=False):
    """Sync the platform alarm-policy templates found under ``TPLS_ALARM_DIR``.

    Every file below ``TPLS_ALARM_DIR`` (directories named ``v1`` are not
    descended into) is parsed as a JSON template and turned into a
    ``MonitorPolicy`` that is created, updated or deleted.

    Args:
        action_id: Id of the autofix action config attached to templates
            carrying a ``NEED_AUTOFIX*`` label. When ``None`` it is looked up
            with ``get_dbm_autofix_action_id()``; if the lookup also yields
            ``None`` no action is attached.
        db_type: When given, only templates whose ``db_type`` equals this
            value are synced.
        force: When true, a template is saved even if the stored policy's
            version is not older than the template's version.

    Returns:
        None. Progress and the number of saved policies are only logged.
    """
    if action_id is None:
        action_id = get_dbm_autofix_action_id()
    skip_dir = "v1"
    now = datetime.datetime.now(timezone.utc)
    logger.warning("[sync_plat_monitor_policy] sync bkm alarm policy start: %s", now)

    # Import the JSON templates one by one (local record + remote policy).
    updated_policies = 0
    for root, dirs, files in os.walk(TPLS_ALARM_DIR):
        # Removing the entry from ``dirs`` in place keeps os.walk out of it.
        if skip_dir in dirs:
            dirs.remove(skip_dir)

        for alarm_tpl in files:
            try:
                # Only keep the file open while reading it, not during the
                # database / remote calls further down.
                with open(os.path.join(root, alarm_tpl), "r", encoding="utf-8") as f:
                    logger.info("[sync_plat_monitor_policy] start sync bkm alarm tpl: %s ", alarm_tpl)
                    template_dict = json.loads(f.read())
                # The monitor API does not accept extra fields.
                template_dict.pop("export_at", "")
                policy_name = template_dict["name"]
            except json.decoder.JSONDecodeError:
                logger.error("[sync_plat_monitor_policy] load template failed: %s", alarm_tpl)
                continue
            except KeyError:
                # A template without a name is skipped like any other broken
                # template instead of aborting the whole sync.
                logger.error("[sync_plat_monitor_policy] template %s has no name", alarm_tpl)
                continue

            # When db_type is given, skip templates of any other db_type.
            if db_type is not None and template_dict["db_type"] != db_type:
                continue

            # "deleted" is a template-only flag; it must not reach MonitorPolicy(**...).
            deleted = template_dict.pop("deleted", False)

            details = template_dict.get("details")
            if not details:
                logger.error("[sync_plat_monitor_policy] template %s has no details", alarm_tpl)
                continue

            # Patch the template before building the policy.
            labels = list(set(details["labels"]))
            details["labels"] = labels
            details["name"] = policy_name
            details["priority"] = TargetPriority.PLATFORM.value
            # Platform policies only notify through dispatch rules.
            details["notice"]["options"]["assign_mode"] = ["by_rule"]
            needs_autofix = any(label.startswith("NEED_AUTOFIX") for label in labels)
            if needs_autofix and action_id is not None:
                details["actions"] = [
                    {
                        "config_id": action_id,
                        "signal": ["abnormal"],
                        "user_groups": [],
                        "options": {
                            "converge_config": {
                                "is_enabled": False,
                                "converge_func": "skip_when_success",
                                "timedelta": 60,
                                "count": 1,
                            }
                        },
                    }
                ]

            policy = MonitorPolicy(**template_dict)

            policy_name = policy.name
            logger.info("[sync_plat_monitor_policy] start sync bkm alarm policy: %s ", policy_name)
            try:
                synced_policy = MonitorPolicy.objects.get(bk_biz_id=policy.bk_biz_id, name=policy_name)
            except MonitorPolicy.DoesNotExist:
                if deleted:
                    # Nothing stored for a template flagged as deleted: creating it
                    # here would only have it removed again on the next sync.
                    logger.info("[sync_plat_monitor_policy] skip deleted alarm: %s ", policy_name)
                    continue
                logger.info("[sync_plat_monitor_policy] create bkm alarm policy: %s ", policy_name)
            else:
                if deleted:
                    logger.info("[sync_plat_monitor_policy] delete old alarm: %s ", policy_name)
                    synced_policy.delete()
                    continue

                if synced_policy.version >= policy.version and not force:
                    logger.info("[sync_plat_monitor_policy] skip same version alarm: %s ", policy_name)
                    continue

                # Carry over the fields that must survive a template upgrade.
                for keeped_field in MonitorPolicy.KEEPED_FIELDS:
                    setattr(policy, keeped_field, getattr(synced_policy, keeped_field))

                # Reuse the stored monitor policy id for the updated details.
                policy.details["id"] = synced_policy.monitor_policy_id
                logger.info("[sync_plat_monitor_policy] update bkm alarm policy: %s ", policy_name)

            try:
                # fetch targets/test_rules/notify_rules/notify_groups from parent details
                for attr, value in policy.parse_details().items():
                    setattr(policy, attr, value)

                policy.save()
                updated_policies += 1
                logger.info("[sync_plat_monitor_policy] save bkm alarm policy success: %s", policy_name)
            except BkMonitorSaveAlarmException as e:
                # One failing policy must not stop the remaining templates.
                logger.error("[sync_plat_monitor_policy] save bkm alarm policy failed: %s, %s ", policy_name, e)

    logger.warning(
        "[sync_plat_monitor_policy] finish sync bkm alarm policy end: %s, update_cnt: %s",
        datetime.datetime.now(timezone.utc) - now,
        updated_policies,
    )

@staticmethod
def bkm_search_event(
bk_biz_ids: list,
Expand Down
112 changes: 110 additions & 2 deletions dbm-ui/backend/db_monitor/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,20 @@
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
"""
import copy
import json
import logging
import os
import re

from django.utils.translation import gettext as _

from backend import env
from backend.components import BKMonitorV3Api
from backend.db_monitor.constants import AUTOFIX_ACTION_NAME
from backend.components import BKLogApi, BKMonitorV3Api
from backend.db_monitor.constants import AUTOFIX_ACTION_NAME, AUTOFIX_ACTION_TEMPLATE
from backend.db_monitor.exceptions import BkMonitorDeleteAlarmException, BkMonitorSaveAlarmException
from backend.db_monitor.format import JsonConfigFormat
from backend.exceptions import ApiError

logger = logging.getLogger("root")

Expand Down Expand Up @@ -109,3 +116,104 @@ def get_dbm_autofix_action_id() -> int:
if action["name"] == AUTOFIX_ACTION_NAME:
action_id = action["id"]
return action_id


def create_bkmonitor_action() -> int:
    """Create or update the DBM autofix action config in BK-Monitor.

    A deep copy of ``AUTOFIX_ACTION_TEMPLATE`` is sent to the monitor API:
    saved as a new action config when ``get_dbm_autofix_action_id()`` finds
    none, otherwise sent as an edit carrying the existing id.

    Returns:
        The id of the autofix action config. It may still be ``None`` when the
        lookup after creation does not find the action.
    """
    action_id = get_dbm_autofix_action_id()
    action_config = copy.deepcopy(AUTOFIX_ACTION_TEMPLATE)

    if action_id is None:
        BKMonitorV3Api.save_action_config(action_config)
        # The id was None on this path, so look the new action up again
        # instead of returning None.
        # NOTE(review): assumes the template is saved under AUTOFIX_ACTION_NAME,
        # which is what the lookup matches on - confirm.
        action_id = get_dbm_autofix_action_id()
    else:
        action_config["id"] = action_id
        BKMonitorV3Api.edit_action_config(action_config)
    return action_id


def create_bklog_collector(startswith: str = ""):
    """Create or update BK-Log collectors from the bundled JSON definitions.

    Each ``*.json`` file in ``backend/dbm_init/json_files/bklog`` whose name
    begins with ``startswith`` is loaded, passed through the matching
    ``JsonConfigFormat`` hooks, completed with retention / storage settings
    read from ``env`` and then sent to BK-Log: ``fast_update`` when the
    pre-check reports the collector name as not allowed, ``fast_create``
    otherwise.

    Args:
        startswith: File-name prefix filter; the empty default matches all files.

    Returns:
        ``True`` once every matching file has been processed.

    Raises:
        json.decoder.JSONDecodeError: When a definition file is not valid JSON.
    """
    bklog_json_files_path = "backend/dbm_init/json_files/bklog"
    for filename in os.listdir(bklog_json_files_path):
        if not (filename.endswith(".json") and filename.startswith(startswith)):
            continue

        # Read the collector definition; the config is rendered below.
        with open(os.path.join(bklog_json_files_path, filename), "r", encoding="utf-8") as file:
            try:
                bklog_params = json.load(file)
            except json.decoder.JSONDecodeError as err:
                logger.error(f"[create_bklog_collector] Failed to load json: {filename}, {err}")
                raise err

        log_name = filename.split(".")[0]
        dedicated_formatter = f"format_{log_name}"
        if hasattr(JsonConfigFormat, dedicated_formatter):
            # A formatter written for this exact log name takes precedence.
            bklog_params = JsonConfigFormat.format(bklog_params, dedicated_formatter)
        else:
            # Otherwise choose the formatter by db type (mainly to tell the
            # collection targets apart); first keyword found in the name wins.
            for db_keyword in ("mysql", "redis", "mssql"):
                if db_keyword in filename:
                    formatter_name = getattr(JsonConfigFormat, f"format_{db_keyword}").__name__
                    bklog_params = JsonConfigFormat.format(bklog_params, formatter_name)
                    break
            else:
                logger.warning(_("格式化函数{log_name}不存在(如果无需格式化json可忽略)").format(log_name=log_name))

        # Adjust the request parameters for collectors with special needs.
        custom_hook = f"custom_modify_{log_name}"
        if hasattr(JsonConfigFormat, custom_hook):
            bklog_params = JsonConfigFormat.custom_modify(bklog_params, custom_hook)

        # A per-log retention setting from env overrides the default retention.
        retention = getattr(env, f"BKLOG_{log_name.upper()}_RETENTION", "") or env.BKLOG_DEFAULT_RETENTION
        bklog_params["retention"] = retention
        # A custom ES storage cluster is configured: pass storage_cluster_id.
        if env.BKLOG_STORAGE_CLUSTER_ID:
            bklog_params["storage_cluster_id"] = env.BKLOG_STORAGE_CLUSTER_ID
            # If the cluster supports hot/cold data, add allocation_min_days
            # as half of the retention.
            # NOTE(review): nesting under the storage-cluster check is
            # reconstructed from a flattened listing - confirm.
            if env.BKLOG_CLUSTER_SUPPORT_HOT_COLD:
                bklog_params["allocation_min_days"] = retention // 2

        # Fetch the current collectors, keyed by their English config name.
        # NOTE(review): only page 1 with pagesize 500 is requested - confirm
        # that this covers every collector of the business.
        listing = BKLogApi.list_collectors(
            {"bk_biz_id": env.DBA_APP_BK_BIZ_ID, "pagesize": 500, "page": 1}, use_admin=True
        )
        collectors_name__info_map = {}
        for collector in listing["list"]:
            collectors_name__info_map[collector["collector_config_name_en"]] = collector

        # Ask BK-Log whether a collector with this name may still be created.
        collector_name = bklog_params["collector_config_name_en"]
        precheck = BKLogApi.pre_check(
            {
                "bk_biz_id": env.DBA_APP_BK_BIZ_ID,
                "collector_config_name_en": collector_name,
            },
            use_admin=True,
        )

        if precheck["allowed"]:
            # Create the collector.
            try:
                data = BKLogApi.fast_create(params=bklog_params, use_admin=True)
                logger.info(_("采集项创建成功,相关信息: {data}").format(data=data))
            except ApiError as err:
                # A failed creation does not block the following collectors.
                logger.error(_("采集项创建失败,请联系管理员。错误信息:{err}").format(err=err))
            continue

        # The collector already exists: update it instead.
        try:
            collector_config_id = collectors_name__info_map[collector_name]["collector_config_id"]
        except KeyError:
            logger.error(_("采集项{collector_name}被创建后删除,暂无法自动重建,请联系管理员处理。").format(collector_name=collector_name))
            continue
        bklog_params["collector_config_id"] = collector_config_id
        logger.info(_("采集项{collector_name}已创建, 对采集项进行更新...").format(collector_name=collector_name))
        try:
            BKLogApi.fast_update(params=bklog_params, use_admin=True)
        except ApiError as err:
            logger.error(
                _("采集项{collector_name}更新失败,请联系管理员。错误信息:{err}").format(collector_name=collector_name, err=err)
            )

    return True
Loading

0 comments on commit e022766

Please sign in to comment.