From e6356b0b55a0caaf3dccde00ea6adea624206887 Mon Sep 17 00:00:00 2001 From: neko12583 <1258375097@qq.com> Date: Mon, 16 Oct 2023 17:58:23 +0800 Subject: [PATCH] =?UTF-8?q?feature:=20=E5=90=8E=E5=8F=B0=E7=BB=84=E4=BB=B6?= =?UTF-8?q?=E6=9C=8D=E5=8A=A1=E6=94=AF=E6=8C=81=E9=85=8D=E7=BD=AE=E8=B6=85?= =?UTF-8?q?=E6=97=B6=E6=97=B6=E9=97=B4(closed=20#1696)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../collections/agent_new/configure_policy.py | 4 ++-- .../collections/agent_new/get_agent_status.py | 4 ++-- .../components/collections/agent_new/install.py | 7 ++++--- .../agent_new/push_host_identifier.py | 4 ++-- apps/backend/components/collections/base.py | 17 ++++++++++++++++- apps/backend/components/collections/job.py | 4 ++-- apps/backend/components/collections/plugin.py | 3 +-- .../agent_new/test_get_agent_status.py | 8 ++++---- apps/node_man/models.py | 2 ++ 9 files changed, 35 insertions(+), 18 deletions(-) diff --git a/apps/backend/components/collections/agent_new/configure_policy.py b/apps/backend/components/collections/agent_new/configure_policy.py index 3ff55f475..49a14bae8 100644 --- a/apps/backend/components/collections/agent_new/configure_policy.py +++ b/apps/backend/components/collections/agent_new/configure_policy.py @@ -11,7 +11,7 @@ from django.utils.translation import ugettext_lazy as _ -from apps.backend.api.constants import POLLING_INTERVAL, POLLING_TIMEOUT +from apps.backend.api.constants import POLLING_INTERVAL from apps.backend.components.collections.agent_new.base import AgentBaseService from apps.node_man import models from apps.node_man.handlers.security_group import get_security_group_factory @@ -56,7 +56,7 @@ def _schedule(self, data, parent_data, callback_data=None): self.finish_schedule() return True - elif polling_time + POLLING_INTERVAL > POLLING_TIMEOUT / 2: + elif polling_time + POLLING_INTERVAL > self.get_component_polling_timeout / 2: self.move_insts_to_failed(subscription_instance_ids, _("配置到Gse和Nginx的策略失败请联系节点管理维护人员")) self.finish_schedule() return False diff --git a/apps/backend/components/collections/agent_new/get_agent_status.py b/apps/backend/components/collections/agent_new/get_agent_status.py index b4722c55f..7bdf2578a 100644 --- a/apps/backend/components/collections/agent_new/get_agent_status.py +++ b/apps/backend/components/collections/agent_new/get_agent_status.py @@ -13,7 +13,7 @@ from django.utils.translation import ugettext_lazy as _ -from apps.backend.api.constants import POLLING_INTERVAL, POLLING_TIMEOUT +from apps.backend.api.constants import POLLING_INTERVAL from apps.node_man import constants, models from pipeline.core.flow import Service, StaticIntervalGenerator @@ -141,7 +141,7 @@ def _schedule(self, data, parent_data, callback_data=None): return polling_time = data.get_one_of_outputs("polling_time") - if polling_time + POLLING_INTERVAL > POLLING_TIMEOUT: + if polling_time + POLLING_INTERVAL > self.get_component_polling_timeout: sub_inst_ids = [host_id__sub_inst_id_map[host_id] for host_id in host_ids_need_to_query] self.move_insts_to_failed(sub_inst_ids=sub_inst_ids, log_content=_("查询 GSE 超时")) self.finish_schedule() diff --git a/apps/backend/components/collections/agent_new/install.py b/apps/backend/components/collections/agent_new/install.py index 9229d7246..059ac59ec 100644 --- a/apps/backend/components/collections/agent_new/install.py +++ b/apps/backend/components/collections/agent_new/install.py @@ -25,7 +25,7 @@ from apps.backend.agent import solution_maker from apps.backend.agent.tools import InstallationTools -from apps.backend.api.constants import POLLING_INTERVAL, POLLING_TIMEOUT +from apps.backend.api.constants import POLLING_INTERVAL from apps.backend.constants import ( REDIS_AGENT_CONF_KEY_TPL, REDIS_INSTALL_CALLBACK_KEY_TPL, @@ -263,10 +263,11 @@ def _execute(self, data, parent_data, common_data: base.AgentCommonData): # 缓存 Agent 配置 pipeline.mset(cache_key__config_content_map) # 设置过期时间 + polling_timeout = self.get_component_polling_timeout for cache_key in cache_key__config_content_map: # 根据调度超时时间预估一个过期时间 # 由于此时还未执行「命令下发」动作,随机增量过期时长,避免缓存雪崩 - pipeline.expire(cache_key, POLLING_TIMEOUT + random.randint(POLLING_TIMEOUT, 2 * POLLING_TIMEOUT)) + pipeline.expire(cache_key, polling_timeout + random.randint(polling_timeout, 2 * polling_timeout)) pipeline.execute() remote_conn_helpers_gby_result_type = self.bulk_check_ssh(remote_conn_helpers=lan_windows_sub_inst) @@ -691,7 +692,7 @@ def _schedule(self, data, parent_data, callback_data=None): return True polling_time = data.get_one_of_outputs("polling_time") - if polling_time + POLLING_INTERVAL > POLLING_TIMEOUT: + if polling_time + POLLING_INTERVAL > self.get_component_polling_timeout: self.move_insts_to_failed(left_scheduling_sub_inst_ids, _("安装超时")) self.finish_schedule() data.outputs.polling_time = polling_time + POLLING_INTERVAL diff --git a/apps/backend/components/collections/agent_new/push_host_identifier.py b/apps/backend/components/collections/agent_new/push_host_identifier.py index eec1925d7..58d55b3fe 100644 --- a/apps/backend/components/collections/agent_new/push_host_identifier.py +++ b/apps/backend/components/collections/agent_new/push_host_identifier.py @@ -12,7 +12,7 @@ from django.utils.translation import ugettext_lazy as _ -from apps.backend.api.constants import POLLING_INTERVAL, POLLING_TIMEOUT +from apps.backend.api.constants import POLLING_INTERVAL from apps.core.concurrent import controller from apps.utils import concurrent from common.api import CCApi @@ -104,7 +104,7 @@ def _schedule(self, data, parent_data, callback_data=None): self.finish_schedule() return - if polling_time + POLLING_INTERVAL > POLLING_TIMEOUT: + if polling_time + POLLING_INTERVAL > self.get_component_polling_timeout: self.move_insts_to_failed(sub_inst_ids=pending_push_instance_ids, log_content=_("推送主机身份超时")) self.finish_schedule() return diff --git a/apps/backend/components/collections/base.py b/apps/backend/components/collections/base.py index ed8d10c39..0ecb2c47c 100644 --- a/apps/backend/components/collections/base.py +++ b/apps/backend/components/collections/base.py @@ -11,6 +11,7 @@ import os import traceback import typing + from dataclasses import dataclass from typing import ( Any, @@ -32,6 +33,7 @@ from django.utils.translation import ugettext as _ from apps.adapters.api.gse import GseApiBaseHelper, get_gse_api_helper +from apps.backend.api.constants import POLLING_TIMEOUT from apps.backend.subscription import errors from apps.core.files.storage import get_storage from apps.node_man import constants, models @@ -203,6 +205,19 @@ def setup_pagent_file_content(self) -> bytes: return fh.read().encode() +class PollingTimeoutMixin: + @property + def get_component_polling_timeout(self) -> int: + service_code = self.__class__.__name__ + all_component_polling_timeout: dict = models.GlobalSettings.get_config( + key=models.GlobalSettings.KeyEnum.BACKEND_SERVICE_POLLING_TIMEOUT.value, + default={}, + ) + + component_polling_timeout = all_component_polling_timeout.get(service_code, POLLING_TIMEOUT) + return component_polling_timeout + + @dataclass class CommonData: """ @@ -222,7 +237,7 @@ class CommonData: subscription_instance_ids: Set[int] -class BaseService(Service, LogMixin, DBHelperMixin): +class BaseService(Service, LogMixin, DBHelperMixin, PollingTimeoutMixin): # 失败订阅实例ID - 失败原因 映射关系 failed_subscription_instance_id_reason_map: Optional[Dict[int, Any]] = None diff --git a/apps/backend/components/collections/job.py b/apps/backend/components/collections/job.py index ffbd19b52..c303aed94 100644 --- a/apps/backend/components/collections/job.py +++ b/apps/backend/components/collections/job.py @@ -21,7 +21,7 @@ from django.conf import settings from django.utils.translation import ugettext_lazy as _ -from apps.backend.api.constants import POLLING_INTERVAL, POLLING_TIMEOUT +from apps.backend.api.constants import POLLING_INTERVAL from apps.backend.api.job import process_parms from apps.backend.components.collections.base import BaseService, CommonData from apps.core.files.storage import get_storage @@ -340,7 +340,7 @@ def _schedule(self, data, parent_data, callback_data=None): ).exists() if is_finished: self.finish_schedule() - elif polling_time + POLLING_INTERVAL > POLLING_TIMEOUT: + elif polling_time + POLLING_INTERVAL > self.get_component_polling_timeout: # 由于JOB的超时机制可能会失效,因此这里节点管理自己需要有超时机制进行兜底 pending_job_sub_maps = models.JobSubscriptionInstanceMap.objects.filter( node_id=self.id, status=constants.BkJobStatus.PENDING diff --git a/apps/backend/components/collections/plugin.py b/apps/backend/components/collections/plugin.py index e01a7109e..463c4e2be 100644 --- a/apps/backend/components/collections/plugin.py +++ b/apps/backend/components/collections/plugin.py @@ -27,7 +27,6 @@ from apps.backend.api.constants import ( GSE_RUNNING_TASK_CODE, POLLING_INTERVAL, - POLLING_TIMEOUT, SUFFIX_MAP, GseDataErrCode, ) @@ -1245,7 +1244,7 @@ def handle_error_code( if error_code == GseDataErrCode.RUNNING: # 只要有运行中的任务,则认为未完成,标记 is_finished is_finished = False - if polling_time + POLLING_INTERVAL > POLLING_TIMEOUT: + if polling_time + POLLING_INTERVAL > self.get_component_polling_timeout: self.move_insts_to_failed([subscription_instance.id], _("GSE任务轮询超时")) elif success_conditions: # 状态码非 SUCCESS 的,但满足成功的特殊条件,认为是成功的,无需做任何处理 diff --git a/apps/backend/tests/components/collections/agent_new/test_get_agent_status.py b/apps/backend/tests/components/collections/agent_new/test_get_agent_status.py index d28acd750..c21e53947 100644 --- a/apps/backend/tests/components/collections/agent_new/test_get_agent_status.py +++ b/apps/backend/tests/components/collections/agent_new/test_get_agent_status.py @@ -229,10 +229,10 @@ def adjust_test_data_in_db(cls): def setUp(self) -> None: super().setUp() - mock.patch( - "apps.backend.components.collections.agent_new.get_agent_status.POLLING_TIMEOUT", - 2 * POLLING_INTERVAL - 1, - ).start() + # mock.patch( + # "apps.backend.components.collections.agent_new.get_agent_status.POLLING_TIMEOUT", + # 2 * POLLING_INTERVAL - 1, + # ).start() def cases(self): return [ diff --git a/apps/node_man/models.py b/apps/node_man/models.py index 8b1747289..09eecd505 100644 --- a/apps/node_man/models.py +++ b/apps/node_man/models.py @@ -144,6 +144,8 @@ class KeyEnum(Enum): ENABLE_AP_VERSION_MUTEX = "ENABLE_AP_VERSION_MUTEX" # 记录所有业务ID,用于同步新业务到灰度列表对比使用 ALL_BIZ_IDS = "ALL_BIZ_IDS" + # 后台组件服务超时时间 + BACKEND_SERVICE_POLLING_TIMEOUT = "BACKEND_SERVICE_POLLING_TIMEOUT" key = models.CharField(_("键"), max_length=255, db_index=True, primary_key=True) v_json = JSONField(_("值"))