Skip to content

Commit

Permalink
feature: 后台组件服务支持配置超时时间(closed TencentBlueKing#1696)
Browse files Browse the repository at this point in the history
  • Loading branch information
neko12583 committed Oct 20, 2023
1 parent 1c02d93 commit e6356b0
Show file tree
Hide file tree
Showing 9 changed files with 35 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from django.utils.translation import ugettext_lazy as _

from apps.backend.api.constants import POLLING_INTERVAL, POLLING_TIMEOUT
from apps.backend.api.constants import POLLING_INTERVAL
from apps.backend.components.collections.agent_new.base import AgentBaseService
from apps.node_man import models
from apps.node_man.handlers.security_group import get_security_group_factory
Expand Down Expand Up @@ -56,7 +56,7 @@ def _schedule(self, data, parent_data, callback_data=None):
self.finish_schedule()
return True

elif polling_time + POLLING_INTERVAL > POLLING_TIMEOUT / 2:
elif polling_time + POLLING_INTERVAL > self.get_component_polling_timeout / 2:
self.move_insts_to_failed(subscription_instance_ids, _("配置到Gse和Nginx的策略失败请联系节点管理维护人员"))
self.finish_schedule()
return False
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from django.utils.translation import ugettext_lazy as _

from apps.backend.api.constants import POLLING_INTERVAL, POLLING_TIMEOUT
from apps.backend.api.constants import POLLING_INTERVAL
from apps.node_man import constants, models
from pipeline.core.flow import Service, StaticIntervalGenerator

Expand Down Expand Up @@ -141,7 +141,7 @@ def _schedule(self, data, parent_data, callback_data=None):
return

polling_time = data.get_one_of_outputs("polling_time")
if polling_time + POLLING_INTERVAL > POLLING_TIMEOUT:
if polling_time + POLLING_INTERVAL > self.get_component_polling_timeout:
sub_inst_ids = [host_id__sub_inst_id_map[host_id] for host_id in host_ids_need_to_query]
self.move_insts_to_failed(sub_inst_ids=sub_inst_ids, log_content=_("查询 GSE 超时"))
self.finish_schedule()
Expand Down
7 changes: 4 additions & 3 deletions apps/backend/components/collections/agent_new/install.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

from apps.backend.agent import solution_maker
from apps.backend.agent.tools import InstallationTools
from apps.backend.api.constants import POLLING_INTERVAL, POLLING_TIMEOUT
from apps.backend.api.constants import POLLING_INTERVAL
from apps.backend.constants import (
REDIS_AGENT_CONF_KEY_TPL,
REDIS_INSTALL_CALLBACK_KEY_TPL,
Expand Down Expand Up @@ -263,10 +263,11 @@ def _execute(self, data, parent_data, common_data: base.AgentCommonData):
# 缓存 Agent 配置
pipeline.mset(cache_key__config_content_map)
# 设置过期时间
polling_timeout = self.get_component_polling_timeout
for cache_key in cache_key__config_content_map:
# 根据调度超时时间预估一个过期时间
# 由于此时还未执行「命令下发」动作,随机增量过期时长,避免缓存雪崩
pipeline.expire(cache_key, POLLING_TIMEOUT + random.randint(POLLING_TIMEOUT, 2 * POLLING_TIMEOUT))
pipeline.expire(cache_key, polling_timeout + random.randint(polling_timeout, 2 * polling_timeout))
pipeline.execute()

remote_conn_helpers_gby_result_type = self.bulk_check_ssh(remote_conn_helpers=lan_windows_sub_inst)
Expand Down Expand Up @@ -691,7 +692,7 @@ def _schedule(self, data, parent_data, callback_data=None):
return True

polling_time = data.get_one_of_outputs("polling_time")
if polling_time + POLLING_INTERVAL > POLLING_TIMEOUT:
if polling_time + POLLING_INTERVAL > self.get_component_polling_timeout:
self.move_insts_to_failed(left_scheduling_sub_inst_ids, _("安装超时"))
self.finish_schedule()
data.outputs.polling_time = polling_time + POLLING_INTERVAL
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from django.utils.translation import ugettext_lazy as _

from apps.backend.api.constants import POLLING_INTERVAL, POLLING_TIMEOUT
from apps.backend.api.constants import POLLING_INTERVAL
from apps.core.concurrent import controller
from apps.utils import concurrent
from common.api import CCApi
Expand Down Expand Up @@ -104,7 +104,7 @@ def _schedule(self, data, parent_data, callback_data=None):
self.finish_schedule()
return

if polling_time + POLLING_INTERVAL > POLLING_TIMEOUT:
if polling_time + POLLING_INTERVAL > self.get_component_polling_timeout:
self.move_insts_to_failed(sub_inst_ids=pending_push_instance_ids, log_content=_("推送主机身份超时"))
self.finish_schedule()
return
Expand Down
17 changes: 16 additions & 1 deletion apps/backend/components/collections/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import os
import traceback
import typing

from dataclasses import dataclass
from typing import (
Any,
Expand All @@ -32,6 +33,7 @@
from django.utils.translation import ugettext as _

from apps.adapters.api.gse import GseApiBaseHelper, get_gse_api_helper
from apps.backend.api.constants import POLLING_TIMEOUT
from apps.backend.subscription import errors
from apps.core.files.storage import get_storage
from apps.node_man import constants, models
Expand Down Expand Up @@ -203,6 +205,19 @@ def setup_pagent_file_content(self) -> bytes:
return fh.read().encode()


class PollingTimeoutMixin:
    """Mixin that resolves the polling timeout for a component service.

    The timeout is looked up per service class name in the
    ``BACKEND_SERVICE_POLLING_TIMEOUT`` global setting; ``POLLING_TIMEOUT``
    is used whenever no usable override is configured.
    """

    @property
    def get_component_polling_timeout(self) -> int:
        """Return the polling timeout that applies to this service class.

        NOTE: despite the ``get_`` prefix this is a property, so callers read
        it as an attribute (``self.get_component_polling_timeout``) without
        calling it. Every access queries ``GlobalSettings`` again, so callers
        that need the value several times should keep it in a local variable.

        Returns:
            The value configured under this class's name, or
            ``POLLING_TIMEOUT`` when the setting is missing, is not a
            mapping, or holds a non-numeric value for this class.
        """
        service_code = self.__class__.__name__
        configured_timeouts = models.GlobalSettings.get_config(
            key=models.GlobalSettings.KeyEnum.BACKEND_SERVICE_POLLING_TIMEOUT.value,
            default={},
        )

        # The setting is free-form JSON; anything other than a mapping would
        # make ``.get`` raise, so fall back to the default instead.
        if not isinstance(configured_timeouts, dict):
            return POLLING_TIMEOUT

        component_polling_timeout = configured_timeouts.get(service_code, POLLING_TIMEOUT)

        # Callers compare and add this value to numbers; a string, null or
        # boolean override would break that arithmetic, so ignore it.
        if isinstance(component_polling_timeout, bool) or not isinstance(
            component_polling_timeout, (int, float)
        ):
            return POLLING_TIMEOUT

        return component_polling_timeout


@dataclass
class CommonData:
"""
Expand All @@ -222,7 +237,7 @@ class CommonData:
subscription_instance_ids: Set[int]


class BaseService(Service, LogMixin, DBHelperMixin):
class BaseService(Service, LogMixin, DBHelperMixin, PollingTimeoutMixin):

# 失败订阅实例ID - 失败原因 映射关系
failed_subscription_instance_id_reason_map: Optional[Dict[int, Any]] = None
Expand Down
4 changes: 2 additions & 2 deletions apps/backend/components/collections/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from django.conf import settings
from django.utils.translation import ugettext_lazy as _

from apps.backend.api.constants import POLLING_INTERVAL, POLLING_TIMEOUT
from apps.backend.api.constants import POLLING_INTERVAL
from apps.backend.api.job import process_parms
from apps.backend.components.collections.base import BaseService, CommonData
from apps.core.files.storage import get_storage
Expand Down Expand Up @@ -340,7 +340,7 @@ def _schedule(self, data, parent_data, callback_data=None):
).exists()
if is_finished:
self.finish_schedule()
elif polling_time + POLLING_INTERVAL > POLLING_TIMEOUT:
elif polling_time + POLLING_INTERVAL > self.get_component_polling_timeout:
# 由于JOB的超时机制可能会失效,因此这里节点管理自己需要有超时机制进行兜底
pending_job_sub_maps = models.JobSubscriptionInstanceMap.objects.filter(
node_id=self.id, status=constants.BkJobStatus.PENDING
Expand Down
3 changes: 1 addition & 2 deletions apps/backend/components/collections/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
from apps.backend.api.constants import (
GSE_RUNNING_TASK_CODE,
POLLING_INTERVAL,
POLLING_TIMEOUT,
SUFFIX_MAP,
GseDataErrCode,
)
Expand Down Expand Up @@ -1245,7 +1244,7 @@ def handle_error_code(
if error_code == GseDataErrCode.RUNNING:
# 只要有运行中的任务,则认为未完成,标记 is_finished
is_finished = False
if polling_time + POLLING_INTERVAL > POLLING_TIMEOUT:
if polling_time + POLLING_INTERVAL > self.get_component_polling_timeout:
self.move_insts_to_failed([subscription_instance.id], _("GSE任务轮询超时"))
elif success_conditions:
# 状态码非 SUCCESS 的,但满足成功的特殊条件,认为是成功的,无需做任何处理
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -229,10 +229,10 @@ def adjust_test_data_in_db(cls):

def setUp(self) -> None:
super().setUp()
mock.patch(
"apps.backend.components.collections.agent_new.get_agent_status.POLLING_TIMEOUT",
2 * POLLING_INTERVAL - 1,
).start()
# mock.patch(
# "apps.backend.components.collections.agent_new.get_agent_status.POLLING_TIMEOUT",
# 2 * POLLING_INTERVAL - 1,
# ).start()

def cases(self):
return [
Expand Down
2 changes: 2 additions & 0 deletions apps/node_man/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,8 @@ class KeyEnum(Enum):
ENABLE_AP_VERSION_MUTEX = "ENABLE_AP_VERSION_MUTEX"
# 记录所有业务ID,用于同步新业务到灰度列表对比使用
ALL_BIZ_IDS = "ALL_BIZ_IDS"
# 后台组件服务超时时间
BACKEND_SERVICE_POLLING_TIMEOUT = "BACKEND_SERVICE_POLLING_TIMEOUT"

key = models.CharField(_("键"), max_length=255, db_index=True, primary_key=True)
v_json = JSONField(_("值"))
Expand Down

0 comments on commit e6356b0

Please sign in to comment.