Skip to content

Commit

Permalink
feature: 后台组件服务支持配置超时时间(closed TencentBlueKing#1696)
Browse files Browse the repository at this point in the history
  • Loading branch information
neko12583 committed Oct 25, 2023
1 parent 79d60af commit bd0c4a0
Show file tree
Hide file tree
Showing 11 changed files with 55 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from django.utils.translation import ugettext_lazy as _

from apps.backend.api.constants import POLLING_INTERVAL, POLLING_TIMEOUT
from apps.backend.api.constants import POLLING_INTERVAL
from apps.backend.components.collections.agent_new.base import AgentBaseService
from apps.node_man import models
from apps.node_man.handlers.security_group import get_security_group_factory
Expand Down Expand Up @@ -56,7 +56,7 @@ def _schedule(self, data, parent_data, callback_data=None):
self.finish_schedule()
return True

elif polling_time + POLLING_INTERVAL > POLLING_TIMEOUT / 2:
elif polling_time + POLLING_INTERVAL > self.service_polling_timeout / 2:
self.move_insts_to_failed(subscription_instance_ids, _("配置到Gse和Nginx的策略失败请联系节点管理维护人员"))
self.finish_schedule()
return False
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from django.utils.translation import ugettext_lazy as _

from apps.backend.api.constants import POLLING_INTERVAL, POLLING_TIMEOUT
from apps.backend.api.constants import POLLING_INTERVAL
from apps.node_man import constants, models
from pipeline.core.flow import Service, StaticIntervalGenerator

Expand Down Expand Up @@ -141,7 +141,7 @@ def _schedule(self, data, parent_data, callback_data=None):
return

polling_time = data.get_one_of_outputs("polling_time")
if polling_time + POLLING_INTERVAL > POLLING_TIMEOUT:
if polling_time + POLLING_INTERVAL > self.service_polling_timeout:
sub_inst_ids = [host_id__sub_inst_id_map[host_id] for host_id in host_ids_need_to_query]
self.move_insts_to_failed(sub_inst_ids=sub_inst_ids, log_content=_("查询 GSE 超时"))
self.finish_schedule()
Expand Down
7 changes: 4 additions & 3 deletions apps/backend/components/collections/agent_new/install.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

from apps.backend.agent import solution_maker
from apps.backend.agent.tools import InstallationTools
from apps.backend.api.constants import POLLING_INTERVAL, POLLING_TIMEOUT
from apps.backend.api.constants import POLLING_INTERVAL
from apps.backend.constants import (
REDIS_AGENT_CONF_KEY_TPL,
REDIS_INSTALL_CALLBACK_KEY_TPL,
Expand Down Expand Up @@ -365,10 +365,11 @@ def _execute(self, data, parent_data, common_data: base.AgentCommonData):
# 缓存 Agent 配置
pipeline.mset(cache_key__config_content_map)
# 设置过期时间
polling_timeout = self.service_polling_timeout
for cache_key in cache_key__config_content_map:
# 根据调度超时时间预估一个过期时间
# 由于此时还未执行「命令下发」动作,随机增量过期时长,避免缓存雪崩
pipeline.expire(cache_key, POLLING_TIMEOUT + random.randint(POLLING_TIMEOUT, 2 * POLLING_TIMEOUT))
pipeline.expire(cache_key, polling_timeout + random.randint(polling_timeout, 2 * polling_timeout))
pipeline.execute()

remote_conn_helpers_gby_result_type = self.bulk_check_ssh(remote_conn_helpers=lan_windows_sub_inst)
Expand Down Expand Up @@ -866,7 +867,7 @@ def _schedule(self, data, parent_data, callback_data=None):
return True

polling_time = data.get_one_of_outputs("polling_time")
if polling_time + POLLING_INTERVAL > POLLING_TIMEOUT:
if polling_time + POLLING_INTERVAL > self.service_polling_timeout:
self.move_insts_to_failed(left_scheduling_sub_inst_ids, _("安装超时"))
self.finish_schedule()
data.outputs.polling_time = polling_time + POLLING_INTERVAL
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from django.utils.translation import ugettext_lazy as _

from apps.backend.api.constants import POLLING_INTERVAL, POLLING_TIMEOUT
from apps.backend.api.constants import POLLING_INTERVAL
from apps.core.concurrent import controller
from apps.utils import concurrent
from common.api import CCApi
Expand Down Expand Up @@ -104,7 +104,7 @@ def _schedule(self, data, parent_data, callback_data=None):
self.finish_schedule()
return

if polling_time + POLLING_INTERVAL > POLLING_TIMEOUT:
if polling_time + POLLING_INTERVAL > self.service_polling_timeout:
self.move_insts_to_failed(sub_inst_ids=pending_push_instance_ids, log_content=_("推送主机身份超时"))
self.finish_schedule()
return
Expand Down
17 changes: 16 additions & 1 deletion apps/backend/components/collections/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import os
import traceback
import typing

from dataclasses import dataclass
from typing import (
Any,
Expand All @@ -33,6 +34,7 @@
from django.utils.translation import ugettext as _

from apps.adapters.api.gse import GseApiBaseHelper, get_gse_api_helper
from apps.backend.api.constants import POLLING_TIMEOUT
from apps.backend.subscription import errors
from apps.core.files.storage import get_storage
from apps.exceptions import parse_exception
Expand Down Expand Up @@ -218,6 +220,19 @@ def setup_pagent_file_content(self) -> bytes:
return fh.read().encode()


class PollingTimeoutMixin:
@property
def service_polling_timeout(self) -> int:
service_name = self.__class__.__name__
all_service_polling_timeout: dict = models.GlobalSettings.get_config(
key=models.GlobalSettings.KeyEnum.BACKEND_SERVICE_POLLING_TIMEOUT.value,
default={},
)

service_polling_timeout = all_service_polling_timeout.get(service_name, POLLING_TIMEOUT)
return service_polling_timeout


@dataclass
class CommonData:
"""
Expand All @@ -237,7 +252,7 @@ class CommonData:
subscription_instance_ids: Set[int]


class BaseService(Service, LogMixin, DBHelperMixin):
class BaseService(Service, LogMixin, DBHelperMixin, PollingTimeoutMixin):

# 失败订阅实例ID - 失败原因 映射关系
failed_subscription_instance_id_reason_map: Optional[Dict[int, Any]] = None
Expand Down
4 changes: 2 additions & 2 deletions apps/backend/components/collections/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from django.conf import settings
from django.utils.translation import ugettext_lazy as _

from apps.backend.api.constants import POLLING_INTERVAL, POLLING_TIMEOUT
from apps.backend.api.constants import POLLING_INTERVAL
from apps.backend.api.job import process_parms
from apps.backend.components.collections.base import BaseService, CommonData
from apps.core.files.storage import get_storage
Expand Down Expand Up @@ -340,7 +340,7 @@ def _schedule(self, data, parent_data, callback_data=None):
).exists()
if is_finished:
self.finish_schedule()
elif polling_time + POLLING_INTERVAL > POLLING_TIMEOUT:
elif polling_time + POLLING_INTERVAL > self.service_polling_timeout:
# 由于JOB的超时机制可能会失效,因此这里节点管理自己需要有超时机制进行兜底
pending_job_sub_maps = models.JobSubscriptionInstanceMap.objects.filter(
node_id=self.id, status=constants.BkJobStatus.PENDING
Expand Down
3 changes: 1 addition & 2 deletions apps/backend/components/collections/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
from apps.backend.api.constants import (
GSE_RUNNING_TASK_CODE,
POLLING_INTERVAL,
POLLING_TIMEOUT,
SUFFIX_MAP,
GseDataErrCode,
)
Expand Down Expand Up @@ -1249,7 +1248,7 @@ def handle_error_code(
if error_code == GseDataErrCode.RUNNING:
# 只要有运行中的任务,则认为未完成,标记 is_finished
is_finished = False
if polling_time + POLLING_INTERVAL > POLLING_TIMEOUT:
if polling_time + POLLING_INTERVAL > self.service_polling_timeout:
self.move_insts_to_failed([subscription_instance.id], _("GSE任务轮询超时"))
elif success_conditions:
# 状态码非 SUCCESS 的,但满足成功的特殊条件,认为是成功的,无需做任何处理
Expand Down
24 changes: 22 additions & 2 deletions apps/backend/tests/components/collections/agent_new/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@
from typing import Any, Dict, List, Optional

import mock
from django.test import TestCase

from apps.backend.api.constants import POLLING_INTERVAL
from apps.backend.api.constants import POLLING_INTERVAL, POLLING_TIMEOUT
from apps.backend.components.collections.base import PollingTimeoutMixin
from apps.mock_data import api_mkd
from apps.mock_data import utils as mock_data_utils
from apps.node_man import constants, models
Expand All @@ -31,6 +33,23 @@
from . import utils


class PollingTimeoutMixinTestCase(TestCase):
def setUp(self):
self.mixin = PollingTimeoutMixin()

def test_default_polling_time(self):
expected_timeout = POLLING_TIMEOUT
timeout = self.mixin.service_polling_timeout
self.assertEqual(timeout, expected_timeout)

def test_component_polling_time(self):
expected_timeout = 20
component_polling_timeout = {"PollingTimeoutMixin": expected_timeout}
with mock.patch('apps.node_man.models.GlobalSettings.get_config', return_value=component_polling_timeout):
timeout = self.mixin.service_polling_timeout
self.assertEqual(timeout, expected_timeout)


class JobBaseTestCase(utils.AgentServiceBaseTestCase, ABC):
"""用于JobV3BaseService子类的测试基类"""

Expand Down Expand Up @@ -241,7 +260,8 @@ def structure_mock_data(cls):
def setUp(self) -> None:
super().setUp()
mock.patch(
"apps.backend.components.collections.job.POLLING_TIMEOUT", (self.POLLING_COUNT - 1) * POLLING_INTERVAL
"apps.backend.components.collections.base.PollingTimeoutMixin.service_polling_timeout",
(self.POLLING_COUNT - 1) * POLLING_INTERVAL
).start()

def fetch_schedule_assertion(self) -> List[ScheduleAssertion]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ def adjust_test_data_in_db(cls):
def setUp(self) -> None:
super().setUp()
mock.patch(
"apps.backend.components.collections.agent_new.get_agent_status.POLLING_TIMEOUT",
"apps.backend.components.collections.base.PollingTimeoutMixin.service_polling_timeout",
2 * POLLING_INTERVAL - 1,
).start()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def get_default_case_name(cls) -> str:
def setUp(self) -> None:
super().setUp()
mock.patch(
"apps.backend.api.constants.POLLING_TIMEOUT",
"apps.backend.components.collections.base.PollingTimeoutMixin.service_polling_timeout",
2 * POLLING_INTERVAL - 1,
).start()

Expand Down
2 changes: 2 additions & 0 deletions apps/node_man/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,8 @@ class KeyEnum(Enum):
ENABLE_AP_VERSION_MUTEX = "ENABLE_AP_VERSION_MUTEX"
# 记录所有业务ID,用于同步新业务到灰度列表对比使用
ALL_BIZ_IDS = "ALL_BIZ_IDS"
# 后台组件服务超时时间
BACKEND_SERVICE_POLLING_TIMEOUT = "BACKEND_SERVICE_POLLING_TIMEOUT"

key = models.CharField(_("键"), max_length=255, db_index=True, primary_key=True)
v_json = JSONField(_("值"))
Expand Down

0 comments on commit bd0c4a0

Please sign in to comment.