Skip to content

Commit

Permalink
feature: 可观测性建设 (close TencentBlueKing#603)
Browse files Browse the repository at this point in the history
  • Loading branch information
zhangzhw8 committed Mar 22, 2022
1 parent 2ea03fb commit ab5533c
Show file tree
Hide file tree
Showing 9 changed files with 236 additions and 2 deletions.
148 changes: 148 additions & 0 deletions apps/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,151 @@
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
"""

import json
import threading
from typing import Collection

import MySQLdb
from django.conf import settings
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation import dbapi
from opentelemetry.instrumentation.celery import CeleryInstrumentor
from opentelemetry.instrumentation.django import DjangoInstrumentor
from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
from opentelemetry.instrumentation.logging import LoggingInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.instrumentation.requests import RequestsInstrumentor
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import ReadableSpan, TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.trace.sampling import ALWAYS_OFF, DEFAULT_OFF
from opentelemetry.trace import Span, Status, StatusCode


def requests_callback(span: Span, response):
"""处理蓝鲸格式返回码"""
try:
json_result = response.json()
except Exception: # pylint: disable=broad-except
return
if not isinstance(json_result, dict):
return

# NOTE: esb got a result, but apigateway /iam backend / search-engine got not result
code = json_result.get("code", 0)
span.set_attribute("result_code", code)
span.set_attribute("result_message", json_result.get("message", ""))
span.set_attribute("result_errors", str(json_result.get("errors", "")))
try:
request_id = (
# new esb and apigateway
response.headers.get("x-bkapi-request-id")
# iam backend
or response.headers.get("x-request-id")
# old esb
or json_result.get("request_id", "")
)
if request_id:
span.set_attribute("bk.request_id", request_id)
except Exception: # pylint: disable=broad-except
pass

if code in [0, "0", "00"]:
span.set_status(Status(StatusCode.OK))
else:
span.set_status(Status(StatusCode.ERROR))


def django_response_hook(span, request, response):
if hasattr(response, "data"):
result = response.data
else:
try:
result = json.loads(response.content)
except Exception: # pylint: disable=broad-except
return
if not isinstance(result, dict):
return
span.set_attribute("result_code", result.get("code", 0))
span.set_attribute("result_message", result.get("message", ""))
span.set_attribute("result_errors", result.get("errors", ""))
result = result.get("result", True)
if result:
span.set_status(Status(StatusCode.OK))
return
span.set_status(Status(StatusCode.ERROR))


class LazyBatchSpanProcessor(BatchSpanProcessor):
def __init__(self, *args, **kwargs):
super(LazyBatchSpanProcessor, self).__init__(*args, **kwargs)
# 停止默认线程
self.done = True
with self.condition:
self.condition.notify_all()
self.worker_thread.join()
self.done = False
self.worker_thread = None

def on_end(self, span: ReadableSpan) -> None:
if self.worker_thread is None:
self.worker_thread = threading.Thread(target=self.worker, daemon=True)
self.worker_thread.start()
super(LazyBatchSpanProcessor, self).on_end(span)


class BluekingInstrumentor(BaseInstrumentor):
has_instrument = False

def _uninstrument(self, **kwargs):
pass

def _instrument(self, **kwargs):
"""Instrument the library"""
if self.has_instrument:
return
otlp_exporter = OTLPSpanExporter(endpoint=settings.OTLP_GRPC_HOST)
span_processor = LazyBatchSpanProcessor(otlp_exporter)

# periord task not sampler
sampler = DEFAULT_OFF
if settings.IS_CELERY_BEAT:
sampler = ALWAYS_OFF

tracer_provider = TracerProvider(
resource=Resource.create(
{
"service.name": settings.APP_CODE,
"bk_data_id": settings.OTLP_BK_DATA_ID,
}
),
sampler=sampler,
)

tracer_provider.add_span_processor(span_processor)
trace.set_tracer_provider(tracer_provider)
DjangoInstrumentor().instrument(response_hook=django_response_hook)
RedisInstrumentor().instrument()
# ElasticsearchInstrumentor().instrument()
RequestsInstrumentor().instrument(tracer_provider=tracer_provider, span_callback=requests_callback)
CeleryInstrumentor().instrument(tracer_provider=tracer_provider)
LoggingInstrumentor().instrument()
dbapi.wrap_connect(
__name__,
MySQLdb,
"connect",
"mysql",
{
"database": "db",
"port": "port",
"host": "host",
"user": "user",
},
tracer_provider=tracer_provider,
)
self.has_instrument = True

def instrumentation_dependencies(self) -> Collection[str]:
return []
4 changes: 4 additions & 0 deletions apps/node_man/apps.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from django.conf import settings
from django.db import ProgrammingError, connection

from apps import BluekingInstrumentor
from common.log import logger


Expand All @@ -26,8 +27,11 @@ def ready(self):
"""
初始化部分配置,主要目的是为了SaaS和后台共用部分配置
"""

from apps.node_man.models import GlobalSettings

if settings.ENABLE_OTEL_TRACE:
BluekingInstrumentor().instrument()
if GlobalSettings._meta.db_table not in connection.introspection.table_names():
# 初次部署表不存在时跳过DB写入操作
logger.info(f"{GlobalSettings._meta.db_table} not exists, skip fetch_esb_api_key before migrate.")
Expand Down
1 change: 1 addition & 0 deletions apps/node_man/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,7 @@ class JobStatusType(object):
REMOVED = "REMOVED"
FILTERED = "FILTERED"
IGNORED = "IGNORED"
PROCESSING_STATUS = [PENDING, RUNNING]

@classmethod
def get_choices(cls):
Expand Down
3 changes: 2 additions & 1 deletion apps/node_man/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
QueryGlobalSettingsException,
UrlNotReachableError,
)
from apps.prometheus.models import export_job_prometheus_mixin
from apps.utils import files, orm
from common.log import logger
from pipeline.parser import PipelineParser
Expand Down Expand Up @@ -837,7 +838,7 @@ class Meta:
verbose_name_plural = _("安装通道")


class Job(models.Model):
class Job(export_job_prometheus_mixin(), models.Model):
"""任务信息"""

created_by = models.CharField(_("操作人"), max_length=45, default="")
Expand Down
10 changes: 10 additions & 0 deletions apps/prometheus/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# -*- coding: utf-8 -*-
"""
TencentBlueKing is pleased to support the open source community by making 蓝鲸智云-节点管理(BlueKing-BK-NODEMAN) available.
Copyright (C) 2017-2022 THL A29 Limited, a Tencent company. All rights reserved.
Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at https://opensource.org/licenses/MIT
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
"""
43 changes: 43 additions & 0 deletions apps/prometheus/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
"""
TencentBlueKing is pleased to support the open source community by making 蓝鲸智云-节点管理(BlueKing-BK-NODEMAN) available.
Copyright (C) 2017-2022 THL A29 Limited, a Tencent company. All rights reserved.
Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at https://opensource.org/licenses/MIT
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
"""

from django_prometheus.conf import NAMESPACE
from prometheus_client import Counter

from apps.node_man import constants

job_model_updates = Counter(
"django_job_model_updates_total",
"Number of update operations by model.",
["job_type", "status"],
namespace=NAMESPACE,
)


def export_job_prometheus_mixin():
"""任务模型埋点"""

class Mixin:
_original_status = None
status = None
job_type = None

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._origin_status = self.status

def _do_update(self, *args, **kwargs):
# 任务状态有变更,且目标状态不是处理中的,则进行埋点
if self._origin_status != self.status and self.status not in constants.JobStatusType.PROCESSING_STATUS:
job_model_updates.labels(self.job_type, self.status).inc()
return super()._do_update(*args, **kwargs)

return Mixin
11 changes: 11 additions & 0 deletions config/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,11 @@
+ ("django_prometheus.middleware.PrometheusAfterMiddleware",)
)

# open telemetry
ENABLE_OTEL_TRACE = get_type_env("ENABLE_OTEL_TRACE", _type=bool, default=False)
OTLP_GRPC_HOST = get_type_env("OTLP_GRPC_HOST", _type=str, default="")
OTLP_BK_DATA_ID = get_type_env("OTLP_BK_DATA_ID", _type=int, default=0)

# 单元测试豁免登录
if "test" in sys.argv:
index = MIDDLEWARE.index("blueapps.account.middlewares.LoginRequiredMiddleware")
Expand Down Expand Up @@ -223,6 +228,12 @@
CELERY_DEFAULT_EXCHANGE = "default"
CELERY_DEFAULT_ROUTING_KEY = "default"

IS_CELERY = False
IS_CELERY_BEAT = False
if "celery" in sys.argv:
IS_CELERY = True
if "beat" in sys.argv:
IS_CELERY_BEAT = True
# ===============================================================================
# 项目配置
# ===============================================================================
Expand Down
3 changes: 3 additions & 0 deletions dev_log/2.2.8/durant_202203221829.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
---
feature:
- "可观测性建设 (close #603)"
15 changes: 14 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -82,4 +82,17 @@ asgiref==3.4.1

asyncssh==2.8.1

django-prometheus==2.2.0
# prometheus
django-prometheus==2.2.0

# opentelemetry
opentelemetry-api==1.5.0
opentelemetry-sdk==1.5.0
opentelemetry-exporter-otlp==1.5.0
opentelemetry-instrumentation-django==0.24b0
opentelemetry-instrumentation-elasticsearch==0.24b0
opentelemetry-instrumentation-dbapi==0.24b0
opentelemetry-instrumentation-redis==0.24b0
opentelemetry-instrumentation-requests==0.24b0
opentelemetry-instrumentation-celery==0.24b0
opentelemetry-instrumentation-logging==0.24b0

0 comments on commit ab5533c

Please sign in to comment.