From ff8957889a05aed24417710c09b549c15b317986 Mon Sep 17 00:00:00 2001 From: jiayuan929 <252461528@qq.com> Date: Mon, 9 Sep 2024 19:59:16 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E6=97=A5=E5=BF=97?= =?UTF-8?q?=E7=9B=B8=E5=85=B3=20Sentry=20=E5=BC=82=E5=B8=B8=20(#1579)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../paasng/paasng/accessories/log/client.py | 12 +- .../paasng/accessories/log/constants.py | 4 + .../paasng/accessories/log/exceptions.py | 16 ++ .../paasng/accessories/log/serializers.py | 14 +- .../paasng/accessories/log/views/logs.py | 28 +++- .../bk_plugins/pluginscenter/exceptions.py | 2 + .../bk_plugins/pluginscenter/log/__init__.py | 6 +- .../bk_plugins/pluginscenter/log/client.py | 11 +- .../pluginscenter/log/exceptions.py | 32 ++++ .../paasng/bk_plugins/pluginscenter/views.py | 138 ++++++++++++------ 10 files changed, 200 insertions(+), 63 deletions(-) create mode 100644 apiserver/paasng/paasng/bk_plugins/pluginscenter/log/exceptions.py diff --git a/apiserver/paasng/paasng/accessories/log/client.py b/apiserver/paasng/paasng/accessories/log/client.py index eb5cbaa4f9..db91355391 100644 --- a/apiserver/paasng/paasng/accessories/log/client.py +++ b/apiserver/paasng/paasng/accessories/log/client.py @@ -14,7 +14,7 @@ # # We undertake not to change the open source license (MIT license) applicable # to the current version of the project delivered to anyone in the future. - +import logging from operator import attrgetter from typing import Dict, List, Optional, Protocol, Tuple, Union @@ -27,7 +27,7 @@ from elasticsearch_dsl.response.aggs import FieldBucketData from paasng.accessories.log.constants import DEFAULT_LOG_BATCH_SIZE -from paasng.accessories.log.exceptions import LogQueryError, NoIndexError +from paasng.accessories.log.exceptions import BkLogApiError, LogQueryError, NoIndexError from paasng.accessories.log.filters import ( FieldFilter, agg_builtin_filters, @@ -40,6 +40,8 @@ from paasng.utils.es_log.misc import filter_indexes_by_time_range from paasng.utils.es_log.search import SmartSearch, SmartTimeRange +logger = logging.getLogger(__name__) + class LogClientProtocol(Protocol): """LogClient protocol, all log search backend should abide this protocol""" @@ -178,7 +180,11 @@ def _call_api(self, method: _APIGWOperationStub, data: Dict, timeout: int): data["bkdata_authentication_method"] = self.config.bkdataAuthenticationMethod if self.config.bkdataDataToken: data["bkdata_data_token"] = self.config.bkdataDataToken - return method(data=data, timeout=timeout) + resp = method(data=data, timeout=timeout) + if not resp.get("result"): + logger.error(f"query bk log error: {resp['message']}") + raise BkLogApiError(resp["message"]) + return resp class ESLogClient: diff --git a/apiserver/paasng/paasng/accessories/log/constants.py b/apiserver/paasng/paasng/accessories/log/constants.py index 0a79400e57..f84d2c92e9 100644 --- a/apiserver/paasng/paasng/accessories/log/constants.py +++ b/apiserver/paasng/paasng/accessories/log/constants.py @@ -22,6 +22,10 @@ # 默认查询日志的分片大小 DEFAULT_LOG_BATCH_SIZE = 200 +# ES 查询的最大窗口,可在 ES 中配置,但不建议调大,容易导致 ES oom +# 日志平台最多也只返回 10,000 条数据,且不可修改 +MAX_RESULT_WINDOW = 10000 + class LogTimeChoices(str, StructuredEnum): """日志搜索-日期范围可选值""" diff --git a/apiserver/paasng/paasng/accessories/log/exceptions.py b/apiserver/paasng/paasng/accessories/log/exceptions.py index 69e9c6a6a1..0146693a4c 100644 --- a/apiserver/paasng/paasng/accessories/log/exceptions.py +++ b/apiserver/paasng/paasng/accessories/log/exceptions.py @@ -38,3 +38,19 @@ def __init__(self, lacking_key: str): class NoIndexError(Exception): """无可用 index""" + + +class BkLogGatewayServiceError(Exception): + """This error indicates that there's something wrong when operating bk_log's + API Gateway resource. It's a wrapper class of API SDK's original exceptions + """ + + def __init__(self, message: str): + super().__init__(message) + self.message = message + + +class BkLogApiError(BkLogGatewayServiceError): + """When calling the bk_log api, bk_log returns an error message, + which needs to be captured and displayed to the user on the page + """ diff --git a/apiserver/paasng/paasng/accessories/log/serializers.py b/apiserver/paasng/paasng/accessories/log/serializers.py index 95a62a2f77..3d38b06a78 100644 --- a/apiserver/paasng/paasng/accessories/log/serializers.py +++ b/apiserver/paasng/paasng/accessories/log/serializers.py @@ -24,7 +24,7 @@ from rest_framework.exceptions import ValidationError from rest_framework.fields import get_attribute -from paasng.accessories.log.constants import LogTimeChoices +from paasng.accessories.log.constants import MAX_RESULT_WINDOW, LogTimeChoices from paasng.infras.bk_log.constatns import BkLogType from paasng.utils.es_log.time_range import SmartTimeRange @@ -115,7 +115,13 @@ class LogFieldFilterSLZ(serializers.Serializer): class LogQueryParamsSLZ(serializers.Serializer): - """查询日志的 query 参数""" + """查询日志的 query 参数,包含: + - 结构化日志 + - 访问日志 + - 标准输出日志 + - 日志事件直方图 + - 日志字段统计 + """ time_range = serializers.ChoiceField(choices=LogTimeChoices.get_choices(), required=True) start_time = serializers.DateTimeField(help_text="format %Y-%m-%d %H:%M:%S", allow_null=True, required=False) @@ -143,6 +149,10 @@ def validate(self, attrs): attrs["limit"] = attrs["page_size"] if "page" in attrs: attrs["offset"] = (attrs["page"] - 1) * attrs["limit"] + + # 限制最大分页条数 + if attrs["offset"] + attrs["limit"] > MAX_RESULT_WINDOW: + raise ValidationError(_(f"最多仅能查看前 {MAX_RESULT_WINDOW} 条日志")) return attrs diff --git a/apiserver/paasng/paasng/accessories/log/views/logs.py b/apiserver/paasng/paasng/accessories/log/views/logs.py index 51f149f7da..88dbbb7765 100644 --- a/apiserver/paasng/paasng/accessories/log/views/logs.py +++ b/apiserver/paasng/paasng/accessories/log/views/logs.py @@ -35,9 +35,9 @@ from paasng.accessories.log import serializers from paasng.accessories.log.client import instantiate_log_client -from paasng.accessories.log.constants import DEFAULT_LOG_BATCH_SIZE, LogType +from paasng.accessories.log.constants import DEFAULT_LOG_BATCH_SIZE, MAX_RESULT_WINDOW, LogType from paasng.accessories.log.dsl import SearchRequestSchema -from paasng.accessories.log.exceptions import NoIndexError +from paasng.accessories.log.exceptions import BkLogApiError, NoIndexError from paasng.accessories.log.filters import EnvFilter, ModuleFilter from paasng.accessories.log.models import ElasticSearchParams, ProcessLogQueryConfig from paasng.accessories.log.responses import IngressLogLine, StandardOutputLogLine, StructureLogLine @@ -218,7 +218,9 @@ def query_logs(self, request, code, module_name, environment): search=search, timeout=settings.DEFAULT_ES_SEARCH_TIMEOUT, ) - except RequestError: + except (RequestError, BkLogApiError) as e: + # 用户输入数据不符合 ES 语法等报错,不需要记录到 Sentry,仅打 error 日志即可 + logger.error("Request error when querying logs: %s", e) # noqa: TRY400 raise error_codes.QUERY_REQUEST_ERROR except Exception: logger.exception("failed to get logs") @@ -229,6 +231,8 @@ def query_logs(self, request, code, module_name, environment): "logs": clean_logs(list(response), log_config.search_params), "total": total, "dsl": json.dumps(search.to_dict()), + # 前端使用该配置控制页面上展示的日志分页的最大页数 + "max_result_window": MAX_RESULT_WINDOW, }, Logs[self.line_model], # type: ignore ) @@ -264,7 +268,9 @@ def query_logs_scroll(self, request, code, module_name, environment): # scan 失败大概率是 scroll_id 失效 logger.exception("scroll_id 失效, 日志查询失败") raise error_codes.QUERY_LOG_FAILED.f(_("日志查询快照失效, 请刷新后重试。")) - except RequestError: + except (RequestError, BkLogApiError) as e: + # # 用户输入数据不符合 ES 语法等报错,不需要记录到 Sentry,仅打 error 日志即可 + logger.error("request error when querying logs: %s", e) # noqa: TRY400 raise error_codes.QUERY_REQUEST_ERROR except Exception: logger.exception("failed to get logs") @@ -298,10 +304,18 @@ def aggregate_date_histogram(self, request, code, module_name, environment): ), time_field=log_config.search_params.timeField, ) + try: + response = log_client.aggregate_date_histogram( + index=log_config.search_params.indexPattern, search=search, timeout=settings.DEFAULT_ES_SEARCH_TIMEOUT + ) + except (RequestError, BkLogApiError) as e: + # # 用户输入数据不符合 ES 语法等报错,不需要记录到 Sentry,仅打 error 日志即可 + logger.error("request error when aggregate time-based histogram: %s", e) # noqa: TRY400 + raise error_codes.QUERY_REQUEST_ERROR + except Exception: + logger.exception("failed to aggregate time-based histogram") + raise error_codes.QUERY_LOG_FAILED.f(_("聚合时间直方图失败,请稍后再试。")) - response = log_client.aggregate_date_histogram( - index=log_config.search_params.indexPattern, search=search, timeout=settings.DEFAULT_ES_SEARCH_TIMEOUT - ) date_histogram = cattr.structure( { **clean_histogram_buckets(response), diff --git a/apiserver/paasng/paasng/bk_plugins/pluginscenter/exceptions.py b/apiserver/paasng/paasng/bk_plugins/pluginscenter/exceptions.py index f8afd4ab3b..ddc29d7f0d 100644 --- a/apiserver/paasng/paasng/bk_plugins/pluginscenter/exceptions.py +++ b/apiserver/paasng/paasng/bk_plugins/pluginscenter/exceptions.py @@ -54,6 +54,8 @@ class ErrorCodes: QUERY_REPO_OVERVIEW_DATA_ERROR = ErrorCode(_("查询代码仓库概览数据异常")) # 日志查询异常 QUERY_ES_ERROR = ErrorCode(_("日志系统异常, 请稍后重试")) + QUERY_LOG_FAILED = ErrorCode(_("查询日志失败")) + QUERY_REQUEST_ERROR = ErrorCode(_("查询日志失败,请检查查询条件")) # 可见范围修改失败 VISIBLE_RANGE_UPDATE_FAIELD = ErrorCode(_("可见范围修改失败")) diff --git a/apiserver/paasng/paasng/bk_plugins/pluginscenter/log/__init__.py b/apiserver/paasng/paasng/bk_plugins/pluginscenter/log/__init__.py index 5f4ddfbb0e..364a87b272 100644 --- a/apiserver/paasng/paasng/bk_plugins/pluginscenter/log/__init__.py +++ b/apiserver/paasng/paasng/bk_plugins/pluginscenter/log/__init__.py @@ -25,6 +25,7 @@ from paasng.accessories.log.client import instantiate_log_client as instantiate_bksaas_log_client from paasng.accessories.log.constants import LogType from paasng.accessories.log.models import ProcessLogQueryConfig +from paasng.accessories.log.shim import setup_env_log_model from paasng.bk_plugins.pluginscenter.definitions import ElasticSearchParams from paasng.bk_plugins.pluginscenter.log.client import ( FieldBucketData, @@ -249,7 +250,10 @@ def _instantiate_log_client( search_params = cast(ElasticSearchParams, pd.log_config.ingress) return log_client, search_params # 由于 bk-saas 接入了日志平台, 每个应用独立的日志查询配置, 因此需要访问 PaaS 的数据库获取配置信息 - env = Application.objects.get(code=instance.id).get_app_envs("prod") + # 插件开发中心只部署主模块的生产环境 + env = Application.objects.get(code=instance.id).envs.get(environment="prod", module__is_default=True) + # 初始化 env log 模型, 保证数据库对象存在且是 settings 中的最新配置 + setup_env_log_model(env) if log_type == LogType.INGRESS: log_config = ProcessLogQueryConfig.objects.select_process_irrelevant(env).ingress search_params = log_config.search_params diff --git a/apiserver/paasng/paasng/bk_plugins/pluginscenter/log/client.py b/apiserver/paasng/paasng/bk_plugins/pluginscenter/log/client.py index 46804936da..3045046484 100644 --- a/apiserver/paasng/paasng/bk_plugins/pluginscenter/log/client.py +++ b/apiserver/paasng/paasng/bk_plugins/pluginscenter/log/client.py @@ -14,7 +14,7 @@ # # We undertake not to change the open source license (MIT license) applicable # to the current version of the project delivered to anyone in the future. - +import logging from functools import reduce from operator import add from typing import Dict, List, Protocol, Tuple @@ -34,11 +34,14 @@ ) from paasng.bk_plugins.pluginscenter.exceptions import error_codes from paasng.bk_plugins.pluginscenter.log.constants import DEFAULT_LOG_BATCH_SIZE +from paasng.bk_plugins.pluginscenter.log.exceptions import BkLogApiError from paasng.bk_plugins.pluginscenter.thirdparty.utils import make_client from paasng.utils.es_log.misc import count_filters_options from paasng.utils.es_log.models import FieldFilter from paasng.utils.es_log.search import SmartSearch +logger = logging.getLogger(__name__) + class LogClientProtocol(Protocol): """LogClient protocol, all log search backend should abide this protocol""" @@ -115,7 +118,11 @@ def _call_api(self, data, timeout: int): data["bkdata_authentication_method"] = self.config.bkdataAuthenticationMethod if self.config.bkdataDataToken: data["bkdata_data_token"] = self.config.bkdataDataToken - return self.client.call(data=data, timeout=timeout) + resp = self.client.call(data=data, timeout=timeout) + if not resp.get("result"): + logger.error(f"query bk log error: {resp['message']}") + raise BkLogApiError(resp["message"]) + return resp class ESLogClient: diff --git a/apiserver/paasng/paasng/bk_plugins/pluginscenter/log/exceptions.py b/apiserver/paasng/paasng/bk_plugins/pluginscenter/log/exceptions.py new file mode 100644 index 0000000000..3cdf0321e8 --- /dev/null +++ b/apiserver/paasng/paasng/bk_plugins/pluginscenter/log/exceptions.py @@ -0,0 +1,32 @@ +# -*- coding: utf-8 -*- +# TencentBlueKing is pleased to support the open source community by making +# 蓝鲸智云 - PaaS 平台 (BlueKing - PaaS System) available. +# Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. +# Licensed under the MIT License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://opensource.org/licenses/MIT +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +# either express or implied. See the License for the specific language governing permissions and +# limitations under the License. +# +# We undertake not to change the open source license (MIT license) applicable +# to the current version of the project delivered to anyone in the future. + + +class BkLogGatewayServiceError(Exception): + """This error indicates that there's something wrong when operating bk_log's + API Gateway resource. It's a wrapper class of API SDK's original exceptions + """ + + def __init__(self, message: str): + super().__init__(message) + self.message = message + + +class BkLogApiError(BkLogGatewayServiceError): + """When calling the bk_log api, bk_log returns an error message, + which needs to be captured and displayed to the user on the page + """ diff --git a/apiserver/paasng/paasng/bk_plugins/pluginscenter/views.py b/apiserver/paasng/paasng/bk_plugins/pluginscenter/views.py index 3681259797..b286d56ca6 100644 --- a/apiserver/paasng/paasng/bk_plugins/pluginscenter/views.py +++ b/apiserver/paasng/paasng/bk_plugins/pluginscenter/views.py @@ -26,6 +26,7 @@ from django.utils.translation import get_language from django.utils.translation import gettext_lazy as _ from drf_yasg.utils import swagger_auto_schema +from elasticsearch.exceptions import RequestError from rest_framework import mixins, status from rest_framework.filters import OrderingFilter, SearchFilter from rest_framework.pagination import LimitOffsetPagination @@ -49,6 +50,7 @@ submit_create_approval_ticket, submit_visible_range_ticket, ) +from paasng.bk_plugins.pluginscenter.log.exceptions import BkLogApiError from paasng.bk_plugins.pluginscenter.models import ( OperationRecord, PluginBasicInfoDefinition, @@ -993,15 +995,23 @@ def query_standard_output_logs(self, request, pd_id, plugin_id): slz.is_valid(raise_exception=True) query_params = slz.validated_data - logs = log_api.query_standard_output_logs( - pd=plugin.pd, - instance=plugin, - operator=request.user.username, - time_range=query_params["smart_time_range"], - query_string=data["query"]["query_string"], - limit=query_params["limit"], - offset=query_params["offset"], - ) + try: + logs = log_api.query_standard_output_logs( + pd=plugin.pd, + instance=plugin, + operator=request.user.username, + time_range=query_params["smart_time_range"], + query_string=data["query"]["query_string"], + limit=query_params["limit"], + offset=query_params["offset"], + ) + except (RequestError, BkLogApiError) as e: + # 用户输入数据不符合 ES 语法等报错,不需要记录到 Sentry,仅打 error 日志即可 + logger.error("request error when querying stdout log: %s", e) # noqa: TRY400 + raise error_codes.QUERY_REQUEST_ERROR + except Exception: + logger.exception("Failed to query stdout log") + raise error_codes.QUERY_LOG_FAILED.f(_("查询标准输出日志失败,请稍后再试。")) return Response(data=serializers.StandardOutputLogsSLZ(logs).data) @swagger_auto_schema( @@ -1021,17 +1031,25 @@ def query_structure_logs(self, request, pd_id, plugin_id): slz.is_valid(raise_exception=True) query_params = slz.validated_data - logs = log_api.query_structure_logs( - pd=plugin.pd, - instance=plugin, - operator=request.user.username, - time_range=query_params["smart_time_range"], - query_string=data["query"]["query_string"], - terms=data["query"]["terms"], - exclude=data["query"]["exclude"], - limit=query_params["limit"], - offset=query_params["offset"], - ) + try: + logs = log_api.query_structure_logs( + pd=plugin.pd, + instance=plugin, + operator=request.user.username, + time_range=query_params["smart_time_range"], + query_string=data["query"]["query_string"], + terms=data["query"]["terms"], + exclude=data["query"]["exclude"], + limit=query_params["limit"], + offset=query_params["offset"], + ) + except (RequestError, BkLogApiError) as e: + # 用户输入数据不符合 ES 语法等报错,不需要记录到 Sentry,仅打 error 日志即可 + logger.error("request error when querying structure log: %s", e) # noqa: TRY400 + raise error_codes.QUERY_REQUEST_ERROR + except Exception: + logger.exception("Failed to query structure log") + raise error_codes.QUERY_LOG_FAILED.f(_("查询结构化日志失败,请稍后再试。")) return Response(data=serializers.StructureLogsSLZ(logs).data) @swagger_auto_schema( @@ -1051,15 +1069,23 @@ def query_ingress_logs(self, request, pd_id, plugin_id): slz.is_valid(raise_exception=True) query_params = slz.validated_data - logs = log_api.query_ingress_logs( - pd=plugin.pd, - instance=plugin, - operator=request.user.username, - time_range=query_params["smart_time_range"], - query_string=data["query"]["query_string"], - limit=query_params["limit"], - offset=query_params["offset"], - ) + try: + logs = log_api.query_ingress_logs( + pd=plugin.pd, + instance=plugin, + operator=request.user.username, + time_range=query_params["smart_time_range"], + query_string=data["query"]["query_string"], + limit=query_params["limit"], + offset=query_params["offset"], + ) + except (RequestError, BkLogApiError) as e: + # 用户输入数据不符合 ES 语法等报错,不需要记录到 Sentry,仅打 error 日志即可 + logger.error("request error when querying ingress log: %s", e) # noqa: TRY400 + raise error_codes.QUERY_REQUEST_ERROR + except Exception: + logger.exception("Failed to query ingress log") + raise error_codes.QUERY_LOG_FAILED.f(_("查询访问日志失败,请稍后再试。")) return Response(data=serializers.IngressLogSLZ(logs).data) @swagger_auto_schema( @@ -1081,14 +1107,22 @@ def aggregate_date_histogram( slz.is_valid(raise_exception=True) query_params = slz.validated_data - date_histogram = log_api.aggregate_date_histogram( - pd=plugin.pd, - instance=plugin, - log_type=log_type, - operator=request.user.username, - time_range=query_params["smart_time_range"], - query_string=data["query"]["query_string"], - ) + try: + date_histogram = log_api.aggregate_date_histogram( + pd=plugin.pd, + instance=plugin, + log_type=log_type, + operator=request.user.username, + time_range=query_params["smart_time_range"], + query_string=data["query"]["query_string"], + ) + except (RequestError, BkLogApiError) as e: + # 用户输入数据不符合 ES 语法等报错,不需要记录到 Sentry,仅打 error 日志即可 + logger.error("failed to aggregate time-based histogram: %s", e) # noqa: TRY400 + raise error_codes.QUERY_REQUEST_ERROR + except Exception: + logger.exception("failed to aggregate time-based histogram") + raise error_codes.QUERY_LOG_FAILED.f(_("聚合时间直方图失败,请稍后再试。")) return Response(data=serializers.DateHistogramSLZ(date_histogram).data) @swagger_auto_schema( @@ -1099,7 +1133,7 @@ def aggregate_date_histogram( def aggregate_fields_filters( self, request, pd_id, plugin_id, log_type: Literal["standard_output", "structure", "ingress"] ): - """查询日志基于时间分布的直方图""" + """统计日志的字段分布""" plugin = self.get_plugin_instance() slz = serializers.PluginLogQueryBodySLZ(data=request.data) @@ -1110,16 +1144,24 @@ def aggregate_fields_filters( slz.is_valid(raise_exception=True) query_params = slz.validated_data - fields_filters = log_api.aggregate_fields_filters( - pd=plugin.pd, - instance=plugin, - log_type=log_type, - operator=request.user.username, - time_range=query_params["smart_time_range"], - query_string=data["query"]["query_string"], - terms=data["query"]["terms"], - exclude=data["query"]["exclude"], - ) + try: + fields_filters = log_api.aggregate_fields_filters( + pd=plugin.pd, + instance=plugin, + log_type=log_type, + operator=request.user.username, + time_range=query_params["smart_time_range"], + query_string=data["query"]["query_string"], + terms=data["query"]["terms"], + exclude=data["query"]["exclude"], + ) + except (RequestError, BkLogApiError) as e: + # 用户输入数据不符合 ES 语法等报错,不需要记录到 Sentry,仅打 error 日志即可 + logger.error("request error when aggregating log fields: %s", e) # noqa: TRY400 + raise error_codes.QUERY_REQUEST_ERROR + except Exception: + logger.exception("Failed to aggregate log fields") + raise error_codes.QUERY_LOG_FAILED.f(_("统计日志的字段失败,请稍后再试。")) return Response(data=serializers.LogFieldFilterSLZ(fields_filters, many=True).data)