Skip to content

Commit

Permalink
feature: 后台组件服务支持配置超时时间(closed #1696)
Browse files Browse the repository at this point in the history
  • Loading branch information
neko12583 authored and wyyalt committed Nov 2, 2023
1 parent 61f7b6c commit 3d8433d
Show file tree
Hide file tree
Showing 12 changed files with 59 additions and 19 deletions.
2 changes: 1 addition & 1 deletion apps/backend/agent/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ def check_agent_ability(cls):

@classmethod
def push_environ_files(cls):
"""检测 Agent 功能"""
"""推送环境变量文件"""
act = AgentServiceActivity(
component_code=components.PushEnvironFilesComponent.code,
name=components.PushEnvironFilesComponent.name,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from django.utils.translation import ugettext_lazy as _

from apps.backend.api.constants import POLLING_INTERVAL, POLLING_TIMEOUT
from apps.backend.api.constants import POLLING_INTERVAL
from apps.backend.components.collections.agent_new.base import AgentBaseService
from apps.node_man import models
from apps.node_man.handlers.security_group import get_security_group_factory
Expand Down Expand Up @@ -56,7 +56,7 @@ def _schedule(self, data, parent_data, callback_data=None):
self.finish_schedule()
return True

elif polling_time + POLLING_INTERVAL > POLLING_TIMEOUT / 2:
elif polling_time + POLLING_INTERVAL > self.service_polling_timeout / 2:
self.move_insts_to_failed(subscription_instance_ids, _("配置到Gse和Nginx的策略失败请联系节点管理维护人员"))
self.finish_schedule()
return False
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from django.utils.translation import ugettext_lazy as _

from apps.backend.api.constants import POLLING_INTERVAL, POLLING_TIMEOUT
from apps.backend.api.constants import POLLING_INTERVAL
from apps.node_man import constants, models
from pipeline.core.flow import Service, StaticIntervalGenerator

Expand Down Expand Up @@ -147,7 +147,7 @@ def _schedule(self, data, parent_data, callback_data=None):
return

polling_time = data.get_one_of_outputs("polling_time")
if polling_time + POLLING_INTERVAL > POLLING_TIMEOUT:
if polling_time + POLLING_INTERVAL > self.service_polling_timeout:
sub_inst_ids = [host_id__sub_inst_id_map[host_id] for host_id in host_ids_need_to_query]
self.move_insts_to_failed(sub_inst_ids=sub_inst_ids, log_content=_("查询 GSE 超时"))
self.finish_schedule()
Expand Down
7 changes: 4 additions & 3 deletions apps/backend/components/collections/agent_new/install.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

from apps.backend.agent import solution_maker
from apps.backend.agent.tools import InstallationTools
from apps.backend.api.constants import POLLING_INTERVAL, POLLING_TIMEOUT
from apps.backend.api.constants import POLLING_INTERVAL
from apps.backend.constants import (
REDIS_AGENT_CONF_KEY_TPL,
REDIS_INSTALL_CALLBACK_KEY_TPL,
Expand Down Expand Up @@ -365,10 +365,11 @@ def _execute(self, data, parent_data, common_data: base.AgentCommonData):
# 缓存 Agent 配置
pipeline.mset(cache_key__config_content_map)
# 设置过期时间
polling_timeout = self.service_polling_timeout
for cache_key in cache_key__config_content_map:
# 根据调度超时时间预估一个过期时间
# 由于此时还未执行「命令下发」动作,随机增量过期时长,避免缓存雪崩
pipeline.expire(cache_key, POLLING_TIMEOUT + random.randint(POLLING_TIMEOUT, 2 * POLLING_TIMEOUT))
pipeline.expire(cache_key, polling_timeout + random.randint(polling_timeout, 2 * polling_timeout))
pipeline.execute()

remote_conn_helpers_gby_result_type = self.bulk_check_ssh(remote_conn_helpers=lan_windows_sub_inst)
Expand Down Expand Up @@ -866,7 +867,7 @@ def _schedule(self, data, parent_data, callback_data=None):
return True

polling_time = data.get_one_of_outputs("polling_time")
if polling_time + POLLING_INTERVAL > POLLING_TIMEOUT:
if polling_time + POLLING_INTERVAL > self.service_polling_timeout:
self.move_insts_to_failed(left_scheduling_sub_inst_ids, _("安装超时"))
self.finish_schedule()
data.outputs.polling_time = polling_time + POLLING_INTERVAL
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from django.utils.translation import ugettext_lazy as _

from apps.backend.api.constants import POLLING_INTERVAL, POLLING_TIMEOUT
from apps.backend.api.constants import POLLING_INTERVAL
from apps.core.concurrent import controller
from apps.utils import concurrent
from common.api import CCApi
Expand Down Expand Up @@ -104,7 +104,7 @@ def _schedule(self, data, parent_data, callback_data=None):
self.finish_schedule()
return

if polling_time + POLLING_INTERVAL > POLLING_TIMEOUT:
if polling_time + POLLING_INTERVAL > self.service_polling_timeout:
self.move_insts_to_failed(sub_inst_ids=pending_push_instance_ids, log_content=_("推送主机身份超时"))
self.finish_schedule()
return
Expand Down
18 changes: 17 additions & 1 deletion apps/backend/components/collections/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import os
import traceback
import typing

from dataclasses import dataclass
from typing import (
Any,
Expand All @@ -33,6 +34,7 @@
from django.utils.translation import ugettext as _

from apps.adapters.api.gse import GseApiBaseHelper, get_gse_api_helper
from apps.backend.api.constants import POLLING_TIMEOUT
from apps.backend.subscription import errors
from apps.core.files.storage import get_storage
from apps.exceptions import parse_exception
Expand Down Expand Up @@ -218,6 +220,20 @@ def setup_pagent_file_content(self) -> bytes:
return fh.read().encode()


class PollingTimeoutMixin:
@property
def service_polling_timeout(self) -> int:
service_name = self.__class__.__name__
print("service_name: ", service_name)
all_service_polling_timeout: dict = models.GlobalSettings.get_config(
key=models.GlobalSettings.KeyEnum.BACKEND_SERVICE_POLLING_TIMEOUT.value,
default={},
)

service_polling_timeout = all_service_polling_timeout.get(service_name, POLLING_TIMEOUT)
return service_polling_timeout


@dataclass
class CommonData:
"""
Expand All @@ -238,7 +254,7 @@ class CommonData:
subscription_instance_ids: Set[int]


class BaseService(Service, LogMixin, DBHelperMixin):
class BaseService(Service, LogMixin, DBHelperMixin, PollingTimeoutMixin):

# 失败订阅实例ID - 失败原因 映射关系
failed_subscription_instance_id_reason_map: Optional[Dict[int, Any]] = None
Expand Down
4 changes: 2 additions & 2 deletions apps/backend/components/collections/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from django.conf import settings
from django.utils.translation import ugettext_lazy as _

from apps.backend.api.constants import POLLING_INTERVAL, POLLING_TIMEOUT
from apps.backend.api.constants import POLLING_INTERVAL
from apps.backend.api.job import process_parms
from apps.backend.components.collections.base import BaseService, CommonData
from apps.core.files.storage import get_storage
Expand Down Expand Up @@ -340,7 +340,7 @@ def _schedule(self, data, parent_data, callback_data=None):
).exists()
if is_finished:
self.finish_schedule()
elif polling_time + POLLING_INTERVAL > POLLING_TIMEOUT:
elif polling_time + POLLING_INTERVAL > self.service_polling_timeout:
# 由于JOB的超时机制可能会失效,因此这里节点管理自己需要有超时机制进行兜底
pending_job_sub_maps = models.JobSubscriptionInstanceMap.objects.filter(
node_id=self.id, status=constants.BkJobStatus.PENDING
Expand Down
3 changes: 1 addition & 2 deletions apps/backend/components/collections/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
from apps.backend.api.constants import (
GSE_RUNNING_TASK_CODE,
POLLING_INTERVAL,
POLLING_TIMEOUT,
SUFFIX_MAP,
GseDataErrCode,
)
Expand Down Expand Up @@ -1250,7 +1249,7 @@ def handle_error_code(
if error_code == GseDataErrCode.RUNNING:
# 只要有运行中的任务,则认为未完成,标记 is_finished
is_finished = False
if polling_time + POLLING_INTERVAL > POLLING_TIMEOUT:
if polling_time + POLLING_INTERVAL > self.service_polling_timeout:
self.move_insts_to_failed([subscription_instance.id], _("GSE任务轮询超时"))
elif success_conditions:
# 状态码非 SUCCESS 的,但满足成功的特殊条件,认为是成功的,无需做任何处理
Expand Down
26 changes: 24 additions & 2 deletions apps/backend/tests/components/collections/agent_new/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@
from typing import Any, Dict, List, Optional

import mock
from django.test import TestCase

from apps.backend.api.constants import POLLING_INTERVAL
from apps.backend.api.constants import POLLING_INTERVAL, POLLING_TIMEOUT
from apps.backend.components.collections.base import PollingTimeoutMixin
from apps.mock_data import api_mkd
from apps.mock_data import utils as mock_data_utils
from apps.node_man import constants, models
Expand All @@ -31,6 +33,25 @@
from . import utils


class PollingTimeoutMixinTestCase(TestCase):
def setUp(self):
self.mixin = PollingTimeoutMixin()

def test_default_polling_time(self):
expected_timeout = POLLING_TIMEOUT
timeout = self.mixin.service_polling_timeout
self.assertEqual(timeout, expected_timeout)

def test_component_polling_time(self):
expected_timeout = 20
models.GlobalSettings.set_config(
models.GlobalSettings.KeyEnum.BACKEND_SERVICE_POLLING_TIMEOUT.value,
{"PollingTimeoutMixin": expected_timeout}
)
timeout = self.mixin.service_polling_timeout
self.assertEqual(timeout, expected_timeout)


class JobBaseTestCase(utils.AgentServiceBaseTestCase, ABC):
"""用于JobV3BaseService子类的测试基类"""

Expand Down Expand Up @@ -241,7 +262,8 @@ def structure_mock_data(cls):
def setUp(self) -> None:
super().setUp()
mock.patch(
"apps.backend.components.collections.job.POLLING_TIMEOUT", (self.POLLING_COUNT - 1) * POLLING_INTERVAL
"apps.backend.components.collections.base.PollingTimeoutMixin.service_polling_timeout",
(self.POLLING_COUNT - 1) * POLLING_INTERVAL
).start()

def fetch_schedule_assertion(self) -> List[ScheduleAssertion]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ def adjust_test_data_in_db(cls):
def setUp(self) -> None:
super().setUp()
mock.patch(
"apps.backend.components.collections.agent_new.get_agent_status.POLLING_TIMEOUT",
"apps.backend.components.collections.base.PollingTimeoutMixin.service_polling_timeout",
2 * POLLING_INTERVAL - 1,
).start()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def get_default_case_name(cls) -> str:
def setUp(self) -> None:
super().setUp()
mock.patch(
"apps.backend.api.constants.POLLING_TIMEOUT",
"apps.backend.components.collections.base.PollingTimeoutMixin.service_polling_timeout",
2 * POLLING_INTERVAL - 1,
).start()

Expand Down
2 changes: 2 additions & 0 deletions apps/node_man/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,8 @@ class KeyEnum(Enum):
ENABLE_AP_VERSION_MUTEX = "ENABLE_AP_VERSION_MUTEX"
# 记录所有业务ID,用于同步新业务到灰度列表对比使用
ALL_BIZ_IDS = "ALL_BIZ_IDS"
# 后台组件服务超时时间
BACKEND_SERVICE_POLLING_TIMEOUT = "BACKEND_SERVICE_POLLING_TIMEOUT"

key = models.CharField(_("键"), max_length=255, db_index=True, primary_key=True)
v_json = JSONField(_("值"))
Expand Down

0 comments on commit 3d8433d

Please sign in to comment.