Skip to content

Commit

Permalink
[v1.0] Add docker metric support (#113)
Browse files Browse the repository at this point in the history
* stresscli/metrics: Add support to collect service metrics for Docker deployed
workload

Signed-off-by: Cathy Zhang <cathy.zhang@intel.com>

* Utilize symbol @ as a delimiter for metric file name

Utilize symbol '@' as a delimiter between the service name and pod name
within the generaged metric filename to ensure that a given service name
does not erroneously correspond to an excessive number of unrelated
files.

Signed-off-by: Cathy Zhang <cathy.zhang@intel.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Cathy Zhang <cathy.zhang@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
bjzhjing and pre-commit-ci[bot] authored Sep 6, 2024
1 parent 102fcdd commit cff0a36
Show file tree
Hide file tree
Showing 6 changed files with 160 additions and 20 deletions.
5 changes: 3 additions & 2 deletions evals/benchmark/benchmark.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,9 @@ test_cases:
e2e:
run_test: true
service_name: "chatqna-backend-server-svc" # Replace with your service name
service_list: # Replace with your k8s service names for metrics collection,
# activate if deployment_type is k8s and collect_service_metric is true
service_list: # Replace with your k8s service names if deploy with k8s
# or container names if deploy with Docker for metrics collection,
# activate if collect_service_metric is true
- "chatqna-tei"
- "chatqna-teirerank"

Expand Down
57 changes: 42 additions & 15 deletions evals/benchmark/stresscli/commands/load_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,13 +66,29 @@ def locust_runtests(kubeconfig, profile):
click.echo(f"Load test results saved to {base_folder}")


def collect_metrics(collector, namespace, services, output_dir):
collector.start_collecting_data(
namespace=namespace,
services=services,
output_dir=output_dir,
restart_pods_flag=False,
)
def collect_metrics(collector, services, output_dir, namespace=None):
"""Collect metrics from the specified services and output directory.
Args:
collector: The metrics collector object.
services (list): A list of services to collect metrics from.
output_dir (str): The directory where metrics will be saved.
namespace (str, optional): The namespace for collecting metrics. Defaults to None.
"""
if namespace:
# If namespace is provided, call with namespace
collector.start_collecting_data(
namespace=namespace,
services=services,
output_dir=output_dir,
restart_pods_flag=False,
)
else:
# If namespace is not provided, call without namespace
collector.start_collecting_data(
services=services,
output_dir=output_dir,
)


def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, index):
Expand Down Expand Up @@ -154,20 +170,31 @@ def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, in
print(f"Running test: {' '.join(cmd)}")
namespace = runspec["namespace"]

if service_metric and runspec["deployment-type"] == "k8s":
from .metrics import MetricsCollector
from .metrics_util import export_metric

collector = MetricsCollector()
if service_metric:
services = global_settings.get("service-list") or []
collect_metrics(collector, namespace, services, start_output_folder)
if runspec["deployment-type"] == "k8s":
from .metrics import MetricsCollector

collector = MetricsCollector()
collect_metrics(collector, services, start_output_folder, namespace)
elif runspec["deployment-type"] == "docker":
from .metrics_docker import DockerMetricsCollector

collector = DockerMetricsCollector()
collect_metrics(collector, services, start_output_folder)

runspec["starttest_time"] = datetime.now().isoformat()
result = subprocess.run(cmd, capture_output=True, text=True)
runspec["endtest_time"] = datetime.now().isoformat()

if service_metric and runspec["deployment-type"] == "k8s":
collect_metrics(collector, namespace, services, end_output_folder)
if service_metric:
from .metrics_util import export_metric

services = global_settings.get("service-list") or []
if runspec["deployment-type"] == "k8s":
collect_metrics(collector, services, end_output_folder, namespace)
elif runspec["deployment-type"] == "docker":
collect_metrics(collector, services, end_output_folder)
export_metric(start_output_folder, end_output_folder, metrics_output_folder, metrics_output, services)

with open(json_output, "w") as json_file:
Expand Down
2 changes: 1 addition & 1 deletion evals/benchmark/stresscli/commands/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def start_collecting_data(self, namespace, services, output_dir="/data", restart
pod_port = self.get_pod_port(pod_info)
metrics = self.collect_metrics(pod_ip, pod_port, metrics_path)
if metrics:
pod_output_path = os.path.join(output_dir, f"{service_name}_{pod_name}_{timestamp}.txt")
pod_output_path = os.path.join(output_dir, f"{service_name}@{pod_name}_{timestamp}.txt")
logging.debug(f"Writing metrics to {pod_output_path}")
with open(pod_output_path, "w") as f:
f.write(metrics)
Expand Down
111 changes: 111 additions & 0 deletions evals/benchmark/stresscli/commands/metrics_docker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import json
import logging
import os
import time

import requests

import docker

# Setup logs
log_level = os.getenv("LOG_LEVEL", "ERROR").upper()
logging.basicConfig(level=getattr(logging, log_level))


class DockerMetricsCollector:
def __init__(self):
self.docker_client = docker.from_env()

def get_docker_container(self, container_name):
"""Retrieve Docker container information."""
try:
container = self.docker_client.containers.get(container_name)
logging.info(f"Found Docker container {container_name}")
return container
except docker.errors.NotFound:
logging.error(f"Container {container_name} not found.")
return None

def get_exposed_port(self, container):
"""Get the port exposed to the external environment by the Docker container."""
try:
# Retrieve ports in JSON format
ports_json = container.attrs["NetworkSettings"]["Ports"]
logging.debug(f"Container ports: {ports_json}")

# Parse the ports to find the host port
for container_port, host_infos in ports_json.items():
for host_info in host_infos:
host_ip = host_info["HostIp"]
host_port = host_info["HostPort"]

# Use localhost if the port is mapped to 0.0.0.0 or empty
if host_ip in ["0.0.0.0", ""]:
logging.debug(
f"Found host port {host_port} for container port {container_port} (mapped to all interfaces)"
)
return ("localhost", host_port)
else:
logging.debug(
f"Found host port {host_port} for container port {container_port} (mapped to {host_ip})"
)
return (host_ip, host_port)

logging.error("No valid host port found.")
return (None, None)
except KeyError as e:
logging.error(f"Error retrieving ports: {e}")
return (None, None)

def collect_metrics(self, container_name, metrics_path="/metrics"):
"""Collect metrics from the Docker container."""
container = self.get_docker_container(container_name)
if container:
try:
host_ip, port = self.get_exposed_port(container) # Get the exposed port
if not port:
logging.error(f"Cannot determine port for container {container_name}")
return None

# Construct the URL
service_url = f"http://{host_ip}:{port}{metrics_path}"
logging.debug(f"Collecting metrics from {service_url}")
response = requests.get(service_url)
response.raise_for_status()
return response.text
except requests.RequestException as e:
logging.error(f"Error collecting metrics from {container_name}: {e}")
return None

def start_collecting_data(self, services, output_dir="/data"):
"""Start collecting metrics from services."""
timestamp = int(time.time())
for container_name in services:
metrics = self.collect_metrics(container_name)
if metrics:
output_path = os.path.join(output_dir, f"{container_name}@{timestamp}.txt")
logging.debug(f"Writing Docker metrics to {output_path}")
with open(output_path, "w") as f:
f.write(metrics)
else:
logging.error(f"No metrics collected for container {container_name}")
return {"status": "success"}


if __name__ == "__main__":
docker_collector = DockerMetricsCollector()
result = docker_collector.start_collecting_data(
services=[
"llm-tgi-server",
"retriever-redis-server",
"embedding-tei-server",
"tei-embedding-server",
"tgi-service",
"tei-reranking-server",
],
output_dir="/path/to/data",
)
print(result)
4 changes: 2 additions & 2 deletions evals/benchmark/stresscli/commands/metrics_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,8 @@ def calculate_diff(start_dir, end_dir, output_dir, services=None):
services = [services]

for service_name in services:
# Create a regex pattern to match files starting with the service_name followed by a non-alphanumeric character
pattern = rf"^{re.escape(service_name)}[^a-zA-Z].*\.txt$"
# Create a regex pattern to match files starting with the service_name followed by symbol @
pattern = rf"^{re.escape(service_name)}@.*\.txt$"

start_service_files = [f for f in start_files if re.match(pattern, f)]
end_service_files = [f for f in end_files if re.match(pattern, f)]
Expand Down
1 change: 1 addition & 0 deletions evals/benchmark/stresscli/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
click
deepdiff
docker
flask
kubernetes
locust
Expand Down

0 comments on commit cff0a36

Please sign in to comment.