diff --git a/ChatQnA/README.md b/ChatQnA/README.md
index 40fdac003a..50fd79d324 100644
--- a/ChatQnA/README.md
+++ b/ChatQnA/README.md
@@ -70,11 +70,11 @@ To set up environment variables for deploying ChatQnA services, follow these ste
 # on Gaudi
 cd GenAIExamples/ChatQnA/docker_compose/intel/hpu/gaudi/
 source ./set_env.sh
-export no_proxy="Your_No_Proxy",chatqna-gaudi-ui-server,chatqna-gaudi-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,guardrails
+export no_proxy="Your_No_Proxy",chatqna-gaudi-ui-server,chatqna-gaudi-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,guardrails,jaeger,prometheus,grafana,gaudi-node-exporter-1
 # on Xeon
 cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
 source ./set_env.sh
-export no_proxy="Your_No_Proxy",chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service
+export no_proxy="Your_No_Proxy",chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,jaeger,prometheus,grafana,xeon-node-exporter-1
 # on Nvidia GPU
 cd GenAIExamples/ChatQnA/docker_compose/nvidia/gpu
 source ./set_env.sh
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/README.md b/ChatQnA/docker_compose/intel/cpu/xeon/README.md
index f8475e94d0..6ba093216e 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/README.md
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/README.md
@@ -59,8 +59,10 @@ docker compose up -d
 To enable Open Telemetry Tracing, compose.telemetry.yaml file need to be merged along with default compose.yaml file.
 CPU example with Open Telemetry feature:
+> NOTE: To get the supported Grafana dashboards, run download_opea_dashboard.sh first, as shown in the commands below.
+
 ```bash
-cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
+./grafana/dashboards/download_opea_dashboard.sh
 docker compose -f compose.yaml -f compose.telemetry.yaml up -d
 ```
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose.telemetry.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose.telemetry.yaml
index 4da33d6d50..4456fee747 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose.telemetry.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose.telemetry.yaml
@@ -4,10 +4,19 @@ services:
   tei-embedding-service:
     command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
+    environment:
+      - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
   tei-reranking-service:
     command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
+    environment:
+      - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
+#  vllm-service:
+#    command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80 --otlp-traces-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
+  chatqna-xeon-backend-server:
+    environment:
+      - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
   jaeger:
-    image: jaegertracing/all-in-one:latest
+    image: jaegertracing/all-in-one:1.67.0
     container_name: jaeger
     ports:
       - "16686:16686"
@@ -21,7 +30,51 @@ services:
       https_proxy: ${https_proxy}
       COLLECTOR_ZIPKIN_HOST_PORT: 9411
     restart: unless-stopped
-  chatqna-xeon-backend-server:
+  prometheus:
+    image: prom/prometheus:v2.52.0
+    container_name: prometheus
+    user: root
+    volumes:
+      - ./prometheus.yaml:/etc/prometheus/prometheus.yaml
+      - ./prometheus_data:/prometheus
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yaml'
+    ports:
+      - '9090:9090'
+    ipc: host
+    restart: unless-stopped
+  grafana:
+    image: grafana/grafana:11.0.0
+    container_name: grafana
+    volumes:
+      - ./grafana_data:/var/lib/grafana
+      - ./grafana/dashboards:/var/lib/grafana/dashboards
+      - ./grafana/provisioning:/etc/grafana/provisioning
+    user: root
     environment:
-      - ENABLE_OPEA_TELEMETRY=true
-      - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
+      GF_SECURITY_ADMIN_PASSWORD: admin
+      GF_RENDERING_CALLBACK_URL: http://grafana:3000/
+      GF_LOG_FILTERS: rendering:debug
+    depends_on:
+      - prometheus
+    ports:
+      - '3000:3000'
+    ipc: host
+    restart: unless-stopped
+  node-exporter:
+    image: prom/node-exporter
+    container_name: node-exporter
+    volumes:
+      - /proc:/host/proc:ro
+      - /sys:/host/sys:ro
+      - /:/rootfs:ro
+    command:
+      - '--path.procfs=/host/proc'
+      - '--path.sysfs=/host/sys'
+      - --collector.filesystem.ignored-mount-points
+      - "^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)"
+    ports:
+      - 9100:9100
+    restart: always
+    deploy:
+      mode: global
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.telemetry.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.telemetry.yaml
index 2ba1375398..dfd263d305 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.telemetry.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_tgi.telemetry.yaml
@@ -4,12 +4,21 @@ services:
   tei-embedding-service:
     command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
+    environment:
+      - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
   tei-reranking-service:
     command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
+    environment:
+      - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
   tgi-service:
     command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
+    environment:
+      - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
+  chatqna-xeon-backend-server:
+    environment:
+      - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
   jaeger:
-    image: jaegertracing/all-in-one:latest
+    image: jaegertracing/all-in-one:1.67.0
     container_name: jaeger
     ports:
       - "16686:16686"
@@ -23,7 +32,51 @@ services:
       https_proxy: ${https_proxy}
       COLLECTOR_ZIPKIN_HOST_PORT: 9411
     restart: unless-stopped
-  chatqna-xeon-backend-server:
+  prometheus:
+    image: prom/prometheus:v2.52.0
+    container_name: prometheus
+    user: root
+    volumes:
+      - ./prometheus.yaml:/etc/prometheus/prometheus.yaml
+      - ./prometheus_data:/prometheus
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yaml'
+    ports:
+      - '9090:9090'
+    ipc: host
+    restart: unless-stopped
+  grafana:
+    image: grafana/grafana:11.0.0
+    container_name: grafana
+    volumes:
+      - ./grafana_data:/var/lib/grafana
+      - ./grafana/dashboards:/var/lib/grafana/dashboards
+      - ./grafana/provisioning:/etc/grafana/provisioning
+    user: root
     environment:
-      - ENABLE_OPEA_TELEMETRY=true
-      - TELEMETRY_ENDPOINT=${TELEMETRY_ENDPOINT}
+      GF_SECURITY_ADMIN_PASSWORD: admin
+      GF_RENDERING_CALLBACK_URL: http://grafana:3000/
+      GF_LOG_FILTERS: rendering:debug
+    depends_on:
+      - prometheus
+    ports:
+      - '3000:3000'
+    ipc: host
+    restart: unless-stopped
+  node-exporter:
+    image: prom/node-exporter
+    container_name: node-exporter
+    volumes:
+      - /proc:/host/proc:ro
+      - /sys:/host/sys:ro
+      - /:/rootfs:ro
+    command:
+      - '--path.procfs=/host/proc'
+      - '--path.sysfs=/host/sys'
+      - --collector.filesystem.ignored-mount-points
+      - "^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)"
+    ports:
+      - 9100:9100
+    restart: always
+    deploy:
+      mode: global
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/grafana/dashboards/download_opea_dashboard.sh b/ChatQnA/docker_compose/intel/cpu/xeon/grafana/dashboards/download_opea_dashboard.sh
new file mode 100644
index 0000000000..9b603c0403
--- /dev/null
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/grafana/dashboards/download_opea_dashboard.sh
@@ -0,0 +1,6 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/vllm_grafana.json
+wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/tgi_grafana.json
+wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/node_grafana.json
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/grafana/provisioning/dashboards/local.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/grafana/provisioning/dashboards/local.yaml
new file mode 100644
index 0000000000..13922a769b
--- /dev/null
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/grafana/provisioning/dashboards/local.yaml
@@ -0,0 +1,14 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: 1
+
+providers:
+- name: 'default'
+  orgId: 1
+  folder: ''
+  type: file
+  disableDeletion: false
+  updateIntervalSeconds: 10 #how often Grafana will scan for changed dashboards
+  options:
+    path: /var/lib/grafana/dashboards
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/grafana/provisioning/datasources/datasource.yml b/ChatQnA/docker_compose/intel/cpu/xeon/grafana/provisioning/datasources/datasource.yml
new file mode 100644
index 0000000000..109fc0978f
--- /dev/null
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/grafana/provisioning/datasources/datasource.yml
@@ -0,0 +1,54 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# config file version
+apiVersion: 1
+
+# list of datasources that should be deleted from the database
+deleteDatasources:
+  - name: Prometheus
+    orgId: 1
+
+# list of datasources to insert/update depending
+# what's available in the database
+datasources:
+  # name of the datasource. Required
+- name: Prometheus
+  # datasource type. Required
+  type: prometheus
+  # access mode. direct or proxy. Required
+  access: proxy
+  # org id. will default to orgId 1 if not specified
+  orgId: 1
+  # url
+  url: http://prometheus:9090
+  # database password, if used
+  password:
+  # database user, if used
+  user:
+  # database name, if used
+  database:
+  # enable/disable basic auth
+  basicAuth: false
+  # basic auth username, if used
+  basicAuthUser:
+  # basic auth password, if used
+  basicAuthPassword:
+  # enable/disable with credentials headers
+  withCredentials:
+  # mark as default datasource. Max one per org
+  isDefault: true
+  # fields that will be converted to json and stored in json_data
+  jsonData:
+    httpMethod: GET
+    graphiteVersion: "1.1"
+    tlsAuth: false
+    tlsAuthWithCACert: false
+  # json object of data that will be encrypted.
+  secureJsonData:
+    tlsCACert: "..."
+    tlsClientCert: "..."
+    tlsClientKey: "..."
+  version: 1
+  # allow users to edit datasources from the UI.
+  editable: true
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/prometheus.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/prometheus.yaml
new file mode 100644
index 0000000000..79746aea05
--- /dev/null
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/prometheus.yaml
@@ -0,0 +1,43 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+# [IP_ADDR]:{PORT_OUTSIDE_CONTAINER} -> {PORT_INSIDE_CONTAINER} / {PROTOCOL}
+global:
+  scrape_interval: 5s
+  external_labels:
+    monitor: "my-monitor"
+scrape_configs:
+  - job_name: "prometheus"
+    static_configs:
+      - targets: ["prometheus:9090"]
+  - job_name: "vllm"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["vllm-service:80"]
+  - job_name: "tgi"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["tgi-service:80"]
+  - job_name: "tei-embedding"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["tei-embedding-server:80"]
+  - job_name: "tei-reranking"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["tei-reranking-server:80"]
+  - job_name: "retriever"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["retriever-redis-server:7000"]
+  - job_name: "dataprep-redis-service"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["dataprep-redis-server:5000"]
+  - job_name: "chatqna-backend-server"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["chatqna-xeon-backend-server:8888"]
+  - job_name: "prometheus-node-exporter"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["node-exporter:9100"]
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/set_env.sh b/ChatQnA/docker_compose/intel/cpu/xeon/set_env.sh
index 1d287c8648..f59f2314a7 100755
--- a/ChatQnA/docker_compose/intel/cpu/xeon/set_env.sh
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/set_env.sh
@@ -18,3 +18,5 @@ export LOGFLAG=""
 export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')
 export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
 export TELEMETRY_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
+# Set no proxy
+export no_proxy="$no_proxy,chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,jaeger,prometheus,grafana,node-exporter,$JAEGER_IP"
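The Xeon changes above wire TEI, TGI/vLLM, and the backend into Jaeger and add Prometheus, Grafana, and node-exporter on the host ports published in compose.telemetry.yaml. A minimal post-deploy sanity check, assuming the stack runs on the deploy host with those default ports (`host_ip` here is a placeholder, not a variable defined by the compose files):

```bash
# Smoke test for the telemetry stack; ports come from the compose files above.
host_ip=${host_ip:-localhost}

curl -s "http://${host_ip}:9090/-/healthy"            # Prometheus liveness
curl -s "http://${host_ip}:9090/api/v1/targets" \
  | grep -o '"health":"[a-z]*"' | sort | uniq -c       # summary of scrape target health
curl -s "http://${host_ip}:3000/api/health"            # Grafana (admin password "admin" per GF_SECURITY_ADMIN_PASSWORD)
curl -s "http://${host_ip}:16686/api/services"         # Jaeger query API: services that reported traces
curl -s "http://${host_ip}:9100/metrics" | head        # node-exporter host metrics
```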
no_proxy="$no_proxy,chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm-service,jaeger,prometheus,grafana,node-exporter,$JAEGER_IP" diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/README.md b/ChatQnA/docker_compose/intel/hpu/gaudi/README.md index bd5c634903..119b1ac8c9 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/README.md +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/README.md @@ -66,7 +66,10 @@ docker compose up -d To enable Open Telemetry Tracing, compose.telemetry.yaml file need to be merged along with default compose.yaml file. +> NOTE : To get supported Grafana Dashboard, please run download_opea_dashboard.sh following below commands. + ```bash +./grafana/dashboards/download_opea_dashboard.sh docker compose -f compose.yaml -f compose.telemetry.yaml up -d ``` diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose.telemetry.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose.telemetry.yaml index 97c71720cc..62154bcb1d 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose.telemetry.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose.telemetry.yaml @@ -7,7 +7,7 @@ services: tei-reranking-service: command: --model-id ${RERANK_MODEL_ID} --auto-truncate --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT jaeger: - image: jaegertracing/all-in-one:latest + image: jaegertracing/all-in-one:1.67.0 container_name: jaeger ports: - "16686:16686" @@ -21,6 +21,67 @@ services: https_proxy: ${https_proxy} COLLECTOR_ZIPKIN_HOST_PORT: 9411 restart: unless-stopped + prometheus: + image: prom/prometheus:v2.52.0 + container_name: prometheus + user: root + volumes: + - ./prometheus.yaml:/etc/prometheus/prometheus.yaml + - ./prometheus_data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yaml' + ports: + - '9090:9090' + ipc: host + restart: unless-stopped + grafana: + image: grafana/grafana:11.0.0 + container_name: grafana + volumes: + - ./grafana_data:/var/lib/grafana + - ./grafana/dashboards:/var/lib/grafana/dashboards + - ./grafana/provisioning:/etc/grafana/provisioning + user: root + environment: + GF_SECURITY_ADMIN_PASSWORD: admin + GF_RENDERING_CALLBACK_URL: http://grafana:3000/ + GF_LOG_FILTERS: rendering:debug + depends_on: + - prometheus + ports: + - '3000:3000' + ipc: host + restart: unless-stopped + node-exporter: + image: prom/node-exporter + container_name: node-exporter + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + command: + - '--path.procfs=/host/proc' + - '--path.sysfs=/host/sys' + - --collector.filesystem.ignored-mount-points + - "^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)" + ports: + - 9100:9100 + restart: always + deploy: + mode: global + gaudi-exporter: + image: vault.habana.ai/gaudi-metric-exporter/metric-exporter:1.19.2-32 + container_name: gaudi-exporter + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + - /dev:/dev + ports: + - 41611:41611 + restart: always + deploy: + mode: global chatqna-gaudi-backend-server: environment: - ENABLE_OPEA_TELEMETRY=true diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.telemetry.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.telemetry.yaml index 0da707ebc0..64edd63064 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.telemetry.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.telemetry.yaml @@ -9,7 +9,7 @@ 
   tgi-service:
     command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
   jaeger:
-    image: jaegertracing/all-in-one:latest
+    image: jaegertracing/all-in-one:1.67.0
     container_name: jaeger
     ports:
       - "16686:16686"
@@ -23,6 +23,67 @@ services:
       https_proxy: ${https_proxy}
       COLLECTOR_ZIPKIN_HOST_PORT: 9411
     restart: unless-stopped
+  prometheus:
+    image: prom/prometheus:v2.52.0
+    container_name: prometheus
+    user: root
+    volumes:
+      - ./prometheus.yaml:/etc/prometheus/prometheus.yaml
+      - ./prometheus_data:/prometheus
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yaml'
+    ports:
+      - '9090:9090'
+    ipc: host
+    restart: unless-stopped
+  grafana:
+    image: grafana/grafana:11.0.0
+    container_name: grafana
+    volumes:
+      - ./grafana_data:/var/lib/grafana
+      - ./grafana/dashboards:/var/lib/grafana/dashboards
+      - ./grafana/provisioning:/etc/grafana/provisioning
+    user: root
+    environment:
+      GF_SECURITY_ADMIN_PASSWORD: admin
+      GF_RENDERING_CALLBACK_URL: http://grafana:3000/
+      GF_LOG_FILTERS: rendering:debug
+    depends_on:
+      - prometheus
+    ports:
+      - '3000:3000'
+    ipc: host
+    restart: unless-stopped
+  node-exporter:
+    image: prom/node-exporter
+    container_name: node-exporter
+    volumes:
+      - /proc:/host/proc:ro
+      - /sys:/host/sys:ro
+      - /:/rootfs:ro
+    command:
+      - '--path.procfs=/host/proc'
+      - '--path.sysfs=/host/sys'
+      - --collector.filesystem.ignored-mount-points
+      - "^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)"
+    ports:
+      - 9100:9100
+    restart: always
+    deploy:
+      mode: global
+  gaudi-exporter:
+    image: vault.habana.ai/gaudi-metric-exporter/metric-exporter:1.19.2-32
+    container_name: gaudi-exporter
+    volumes:
+      - /proc:/host/proc:ro
+      - /sys:/host/sys:ro
+      - /:/rootfs:ro
+      - /dev:/dev
+    ports:
+      - 41611:41611
+    restart: always
+    deploy:
+      mode: global
   chatqna-gaudi-backend-server:
     environment:
       - ENABLE_OPEA_TELEMETRY=true
diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/grafana/dashboards/download_opea_dashboard.sh b/ChatQnA/docker_compose/intel/hpu/gaudi/grafana/dashboards/download_opea_dashboard.sh
new file mode 100644
index 0000000000..93ba5d7454
--- /dev/null
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/grafana/dashboards/download_opea_dashboard.sh
@@ -0,0 +1,7 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/vllm_grafana.json
+wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/tgi_grafana.json
+wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/node_grafana.json
+wget https://raw.githubusercontent.com/opea-project/GenAIEval/refs/heads/main/evals/benchmark/grafana/gaudi_grafana.json
diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/grafana/provisioning/dashboards/local.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/grafana/provisioning/dashboards/local.yaml
new file mode 100644
index 0000000000..13922a769b
--- /dev/null
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/grafana/provisioning/dashboards/local.yaml
@@ -0,0 +1,14 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: 1
+
+providers:
+- name: 'default'
+  orgId: 1
+  folder: ''
+  type: file
+  disableDeletion: false
+  updateIntervalSeconds: 10 #how often Grafana will scan for changed dashboards
+  options:
+    path: /var/lib/grafana/dashboards
diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/grafana/provisioning/datasources/datasource.yml b/ChatQnA/docker_compose/intel/hpu/gaudi/grafana/provisioning/datasources/datasource.yml
new file mode 100644
index 0000000000..109fc0978f
--- /dev/null
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/grafana/provisioning/datasources/datasource.yml
@@ -0,0 +1,54 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# config file version
+apiVersion: 1
+
+# list of datasources that should be deleted from the database
+deleteDatasources:
+  - name: Prometheus
+    orgId: 1
+
+# list of datasources to insert/update depending
+# what's available in the database
+datasources:
+  # name of the datasource. Required
+- name: Prometheus
+  # datasource type. Required
+  type: prometheus
+  # access mode. direct or proxy. Required
+  access: proxy
+  # org id. will default to orgId 1 if not specified
+  orgId: 1
+  # url
+  url: http://prometheus:9090
+  # database password, if used
+  password:
+  # database user, if used
+  user:
+  # database name, if used
+  database:
+  # enable/disable basic auth
+  basicAuth: false
+  # basic auth username, if used
+  basicAuthUser:
+  # basic auth password, if used
+  basicAuthPassword:
+  # enable/disable with credentials headers
+  withCredentials:
+  # mark as default datasource. Max one per org
+  isDefault: true
+  # fields that will be converted to json and stored in json_data
+  jsonData:
+    httpMethod: GET
+    graphiteVersion: "1.1"
+    tlsAuth: false
+    tlsAuthWithCACert: false
+  # json object of data that will be encrypted.
+  secureJsonData:
+    tlsCACert: "..."
+    tlsClientCert: "..."
+    tlsClientKey: "..."
+  version: 1
+  # allow users to edit datasources from the UI.
+  editable: true
diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/prometheus.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/prometheus.yaml
new file mode 100644
index 0000000000..8816f4ec68
--- /dev/null
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/prometheus.yaml
@@ -0,0 +1,47 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+# [IP_ADDR]:{PORT_OUTSIDE_CONTAINER} -> {PORT_INSIDE_CONTAINER} / {PROTOCOL}
+global:
+  scrape_interval: 5s
+  external_labels:
+    monitor: "my-monitor"
+scrape_configs:
+  - job_name: "prometheus"
+    static_configs:
+      - targets: ["prometheus:9090"]
+  - job_name: "vllm"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["vllm-gaudi-server:80"]
+  - job_name: "tgi"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["tgi-gaudi-server:80"]
+  - job_name: "tei-embedding"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["tei-embedding-gaudi-server:80"]
+  - job_name: "tei-reranking"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["tei-reranking-gaudi-server:80"]
+  - job_name: "retriever"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["retriever:7000"]
+  - job_name: "dataprep-redis-service"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["dataprep-redis-service:5000"]
+  - job_name: "chatqna-backend-server"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["chatqna-gaudi-backend-server:8888"]
+  - job_name: "prometheus-node-exporter"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["node-exporter:9100"]
+  - job_name: "prometheus-gaudi-exporter"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["gaudi-exporter:41611"]
diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/set_env.sh b/ChatQnA/docker_compose/intel/hpu/gaudi/set_env.sh
index 27339c478f..b0cba1834c 100644
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/set_env.sh
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/set_env.sh
@@ -19,3 +19,4 @@ export LOGFLAG=""
 export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')
 export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
 export TELEMETRY_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
+export no_proxy="$no_proxy,chatqna-gaudi-ui-server,chatqna-gaudi-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-gaudi-server,vllm-gaudi-server,guardrails,jaeger,prometheus,grafana,node-exporter,gaudi-exporter,$JAEGER_IP"
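For reference, the end-to-end Gaudi flow implied by the README and set_env.sh hunks above is sketched below. This is a sketch only: it assumes the default vLLM deployment (for TGI, merge the corresponding compose_tgi files instead) and that the downloaded dashboard JSON files end up under ./grafana/dashboards, which Grafana mounts and provisions.

```bash
# Sketch of the Gaudi telemetry bring-up, per the README and set_env.sh changes above.
cd GenAIExamples/ChatQnA/docker_compose/intel/hpu/gaudi/
source ./set_env.sh                                # exports JAEGER_IP, TELEMETRY_ENDPOINT, and the no_proxy additions
./grafana/dashboards/download_opea_dashboard.sh    # fetch the vllm/tgi/node/gaudi dashboards for Grafana provisioning
docker compose -f compose.yaml -f compose.telemetry.yaml up -d

# Quick check that the Gaudi metric exporter is publishing metrics (port 41611 per prometheus.yaml).
curl -s "http://localhost:41611/metrics" | head
```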