diff --git a/.github/workflows/build-push-image.yml b/.github/workflows/build-push-image-from-main.yml similarity index 87% rename from .github/workflows/build-push-image.yml rename to .github/workflows/build-push-image-from-main.yml index 89de0f1..cf771f7 100644 --- a/.github/workflows/build-push-image.yml +++ b/.github/workflows/build-push-image-from-main.yml @@ -1,4 +1,4 @@ -name: Build and Push Container Image +name: Build and Push Latest Container Image on: workflow_dispatch: @@ -25,7 +25,6 @@ jobs: uses: docker/metadata-action@v5 with: images: quay.io/autopilot/autopilot - tags: ${{ steps.meta.outputs.tags }} - name: Log into registry if: github.event_name != 'pull_request' @@ -40,4 +39,4 @@ jobs: with: context: autopilot-daemon push: ${{ github.event_name != 'pull_request' }} - tags: ${{ steps.meta.outputs.tags }} + tags: "latest" diff --git a/.github/workflows/publish-release.yml b/.github/workflows/publish-release.yml index 4c2cdad..1873e3c 100644 --- a/.github/workflows/publish-release.yml +++ b/.github/workflows/publish-release.yml @@ -1,7 +1,7 @@ -name: Publish Release +# This is a basic workflow to help you get started with Actions + +name: Create New Release - Quay and Helm on: - push: - tags: '*' workflow_dispatch: jobs: @@ -19,7 +19,6 @@ jobs: run: | git config user.name "$GITHUB_ACTOR" git config user.email "$GITHUB_ACTOR@users.noreply.github.com" - - name: Install Helm uses: azure/setup-helm@v3 with: @@ -33,5 +32,46 @@ jobs: with: pages_branch: gh-pages charts_dir: helm-charts + skip_existing: true packages_with_index: true token: ${{ secrets.GITHUB_TOKEN }} + + docker: + runs-on: ubuntu-latest + steps: + - name: Remove unnecessary files + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /usr/local/lib/android + + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Read helm chart version + run: echo "CHART_VERSION=$(grep '^version:' helm-charts/autopilot/Chart.yaml | cut -d ":" -f2 | tr -d ' ')" >> $GITHUB_ENV + + - name: Checkout + uses: actions/checkout@v4 + + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: quay.io/autopilot/autopilot + tags: ${{ env.CHART_VERSION }} + + - name: Log into registry + uses: docker/login-action@v3 + with: + registry: quay.io + username: ${{ secrets.QUAY_USERNAME }} + password: ${{ secrets.QUAY_PASSWORD }} + + - name: Build and push + uses: docker/build-push-action@v5 + with: + context: autopilot-daemon + push: true + tags: ${{ steps.meta.outputs.tags }} \ No newline at end of file diff --git a/HEALTH_CHECKS.md b/HEALTH_CHECKS.md new file mode 100644 index 0000000..8ff04b1 --- /dev/null +++ b/HEALTH_CHECKS.md @@ -0,0 +1,65 @@ +# Health Checks + +Here is a breakdown of the existing health checks: + +1. **PCIe Bandwidth Check (pciebw)** + - Description : Host-to-device connection speeds, one measurement per GPU. Codebase in tag [v12.4.1](https://github.com/NVIDIA/cuda-samples/tree/master/Samples/1_Utilities/bandwidthTest) + - Outputs: Pass/fail results based on PCIe bandwidth thresholds. + - Implementation: Compares bandwidth results to a threshold (e.g., 8 GB/s). If the measured bandwidth falls below the threshold, it triggers a failure. +2. **GPU Memory Check (remapped)** + - Description: Information from nvidia-smi regarding GPU memory remapped rows. + - Outputs: Reports the state of GPU memory (normal/faulty). + - Implementation: Analyzes remapped rows information to assess potential GPU memory issues. +3. 
**GPU Memory Bandwidth Performance (gpumem)** + - Description: Memory bandwidth measurements using DAXPY and DGEMM. + - Outputs: Performance metrics (e.g., TFlops, power). + - Implementation: CUDA code that evaluates memory bandwidth and flags deviations from expected performance values. +4. **GPU Diagnostics (dcgm)** + - Description: Runs NVIDIA DCGM diagnostics using dcgmi diag. + - Outputs: Diagnostic results (pass/fail). + - Implementation: Analyzes GPU health, including memory, power, and thermal performance. +5. **PVC Create/Delete (pvc)** + - Description: Given a storage class, tests if a PVC can be created and deleted. + - Output: pass/fail depending on the success or failure of creation and deletion of a PVC. If either operation fails, the result is a failure. + - Implementation: creation and deletion of a PVC through the K8s APIs. +6. **Network Reachability Check (ping)** + - Description: Pings between nodes to assess connectivity. + - Outputs: Pass/fail based on ping success. + - Implementation: all-to-all reachability test. +7. **Network Bandwidth Check (iperf)** + - Description: Tests network bandwidth by launching clients and servers on multiple interfaces through iperf3. Results are aggregated per interface. Further details can be found in [the dedicated page](autopilot-daemon/network/README.md). + - Outputs: Aggregate bandwidth on each interface, per node (in Gb/s). + - Implementation: launches clients and servers on multiple interfaces and runs a ring topology over all the network interfaces found on the pod that are exposed by network controllers like multi-nic CNI, which exposes fast network interfaces in the pods requesting them. Does not run on `eth0`. + +These checks are configured to run periodically (e.g., hourly), and results are accessible via Prometheus, direct API queries or labels on the worker nodes. + +## Deep Diagnostics and Node Labeling + +Autopilot runs health checks periodically on GPU nodes, and if any of the health checks returns an error, the node is labeled with `autopilot.ibm.com/gpuhealth: ERR`. Otherwise, the label is set as `PASS`. + +More extensive tests, namely DCGM diagnostics level 3, are also executed automatically, but only on nodes that have free GPUs. This deeper analysis is needed to reveal problems in the GPUs that can be found only by running the level 3 DCGM diagnostic. +This type of diagnostics can help decide whether the worker node should be used for running workloads or not. To facilitate this task, Autopilot will label nodes with the key `autopilot.ibm.com/dcgm.level.3`. + +If errors are found during the level 3 diagnostics, the label `autopilot.ibm.com/dcgm.level.3` will contain detailed information about the error in the following format: + +`ERR_Year-Month-Date_Hour.Minute.UTC_Diagnostic_Test.gpuID,Diagnostic_Test.gpuID,...` + +- `ERR`: An indicator that an error has occurred +- `Year-Month-Date_Hour.Minute.UTC`: Timestamp of completed diagnostics +- `Diagnostic_Test`: Name of the test that has failed (formatted to replace spaces with underscores) +- `gpuID`: ID of the GPU where the failure has occurred + +**Example:** `autopilot.ibm.com/dcgm.level.3=ERR_2024-10-10_19.12.03UTC_page_retirement_row_remap.0` + +If there are no errors, the value is set to `PASS_Year-Month-Date_Hour.Minute.UTC`. + +### Logs and Metrics + +All health check results are exported through Prometheus, but they can also be found in each pod's logs. + +All metrics are accessible through Prometheus and Grafana dashboards.
The gauge exposed is `autopilot_health_checks` and can be customized with the following filters: + +- `check`, select one or more specific health checks +- `node`, filter by node name +- `cpumodel` and `gpumodel`, for heterogeneous clusters +- `deviceid` to select specific GPUs, when available diff --git a/README.md b/README.md index 44c2b9b..a806535 100644 --- a/README.md +++ b/README.md @@ -58,189 +58,12 @@ By default, the periodic checks list contains PCIe, rows remapping, GPUs power, Results from health checks are exported as Prometheus Gauges, so that users and admins can easily check the status of the system on Grafana. -## Deep Diagnostics and Node Labeling +Detailed description of all the health checks, can be found in [HEALTH_CHECKS.md](HEALTH_CHECKS.md). -Autopilot runs health checks periodically and labels the nodes with `autopilot.ibm.com/gpuhealth: ERR` is any of the GPU health checks returns an error. Otherwise, health is set as `PASS`. +## Install -Also, more extensive tests, namely DCGM diagnostics level 3, are also executed automatically only on nodes that have free GPUs. This deeper analysis is needed to reveal problems in the GPUs that can be found only after running level 3 DCGM diagnostic. -This type of diagnostics can help deciding if the worker node should be used for running workloads or not. To facilitate this task, Autopilot will label nodes with key `autopilot.ibm.com/dcgm.level.3`. +To learn how to install Autopilot, please refer to [SETUP.md](SETUP.md) -If errors are found, the label `autopilot.ibm.com/dcgm.level.3` will contain the value `ERR`, a timestamp, the test(s) that failed and the GPU id(s) if available. Otherwise, the value is set to `PASS_timestamp`. +## Usage -### Logs and Metrics - -All health checks results are exported through Prometheus, but they can be also found in each pod's logs. - -All metrics are accessible through Prometheus and Grafana dashboards. The gauge exposed is `autopilot_health_checks` and can be customized with the following filters: - -- `check`, select one or more specific health checks -- `node`, filter by node name -- `cpumodel` and `gpumodel`, for heterogeneous clusters -- `deviceid` to select specific GPUs, when available - -## Install Autopilot - -Autopilot can be installed through Helm and need admin privileges to create objects like services, serviceaccounts, namespaces and relevant RBAC. - -**NOTE**: this install procedure does NOT allow the use of `--create-namespace` or `--namespace=autopilot` in the `helm` command. This is because our helm chart, creates a namespace with a label, namely, we are creating a namespace with the label `openshift.io/cluster-monitoring: "true"`, so that Prometheus can scrape metrics. This applies to OpenShift clusters **only**. It is not possible, in Helm, to create namespaces with labels or annotations through the `--create-namespace` parameter, so we decided to create the namespace ourselves. - -Therefore, we recommend one of the following options, which are mutually exclusive: - -- Option 1: use `--create-namespace` with `--namespace=autopilot` in the helm cli AND disable namespace creation in the helm chart config file `namespace.create=false`. **If on OpenShift**, then manually label the namespace for Prometheus with `label ns autopilot "openshift.io/cluster-monitoring"=`. -- Option 2: use `namespace.create=true` in the helm chart config file BUT NOT use `--create-namespace` in the helm cli. 
Can still use `--namespace` in the helm cli but it should be set to something else (i.e., `default`). -- Option 3: create the namespace by hand `kubectl create namespace autopilot`, use `--namespace autopilot` in the helm cli and set `namespace.create=false` in the helm config file. **If on OpenShift**, then manually label the namespace for Prometheus with `label ns autopilot "openshift.io/cluster-monitoring"=`. - -In the next release, we will remove the namespace from the Helm templates and will add OpenShift-only configurations separately. - -### Requirements - -- Need to install `helm-git` plugin - -```bash -helm plugin install https://github.com/aslafy-z/helm-git --version 0.15.1 -``` - -### Helm Chart customization - -Helm charts values and how-to for customization can be found [here](https://github.com/IBM/autopilot/tree/main/autopilot-daemon/helm-charts/autopilot). - -### Install - -1) Add autopilot repo - -```bash -helm repo add autopilot git+https://github.com/IBM/autopilot.git@autopilot-daemon/helm-charts/autopilot?ref=gh-pages -``` - -2) Install autopilot (idempotent command). The config file is for customizing the helm values. Namespace is where the helm chart will live, not the namespace where Autopilot runs - -```bash -helm upgrade autopilot autopilot/autopilot --install --namespace= -f your-config.yml -``` - -The controllers should show up in the selected namespace - -```bash -kubectl get po -n autopilot -``` - -```bash -NAME READY STATUS RESTARTS AGE -autopilot-daemon-autopilot-g7j6h 1/1 Running 0 70m -autopilot-daemon-autopilot-g822n 1/1 Running 0 70m -autopilot-daemon-autopilot-x6h8d 1/1 Running 0 70m -autopilot-daemon-autopilot-xhntv 1/1 Running 0 70m -``` - -### Uninstall - -```bash - helm uninstall autopilot # -n -``` - -## Manually Query the Autopilot Service - -Autopilot provides a `/status` handler that can be queried to get the entire system status, meaning that it will run all the tests on all the nodes. Autopilot is reachable by service name `autopilot-healthchecks.autopilot.svc` in-cluster only, meaning it can be reached from a pod running in the cluster, or through port forwarding (see below). - -Health check names are `pciebw`, `dcgm`, `remapped`, `ping`, `iperf`, `pvc`, `gpumem`. - -For example, using port forwarding to localhost or by exposing the service - -```bash -kubectl port-forward service/autopilot-healthchecks 3333:3333 -n autopilot -# or kubectl expose service autopilot-healthchecks -n autopilot -``` - -If using port forward, then launch `curl` on another terminal - -```bash -curl "http://localhost:3333/status?check=pciebw&host=nodename1" -``` - -Alternatively, retrieve the route with `kubectl get routes autopilot-healthchecks -n autopilot` - -```bash -curl "http:///status?check=pciebw&host=nodename1" -``` - -All tests can be tailored by a combination of: - -- `host=`, to run all tests on a specific node or on a comma separated list of nodes. -- `check=`, to run a single test (`pciebw`, `dcgm`, `remapped`, `gpumem`, `ping`, `iperf` or `all`) or a list of comma separated tests. When no parameters are specified, only `pciebw`, `dcgm`, `remapped`, `ping` tests are run. -- `batch=<#hosts>`, how many hosts to check at a single moment. Requests to the batch are run in parallel asynchronously. Batching is done to avoid running too many requests in parallel when the number of worker nodes increases. Defaults to all nodes. - -Some health checks provide further customization. 
- -### DCGM - -This test runs `dcgmi diag`, and we support only `r` as [parameter](https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/dcgm-diagnostics.html#command-line-options). - -The default is `1`, but can customize it by `/status?check=dcgm&r=2`. - -### Network Bandwidth Validation with IPERF - -As part of this workload, Autopilot will generate the Ring Workload and then start `iperf3 servers` on each interface on each Autopilot pod based on the configuration options provided by the user. Only after the `iperf3 servers` are started, Autopilot will begin executing the workload by starting `iperf3 clients` based on the configuration options provided by the user. All results are logged back to the user. - -- For each network interface on each node, an `iperf3 server` is started. The number of `iperf3 servers` is dependent on the `number of clients` intended on being run. For example, if the `number of clients` is `8`, then there will be `8` `iperf3 servers` started per interface on a unique `port`. - -- For each timestep, all `pairs` are executed simultaneously. For each pair some `number of clients` are started in parallel and will run for `5 seconds` using `zero-copies` against a respective `iperf3 server` -- Metrics such `minimum`, `maximum`, `mean`, `aggregate` bitrates and transfers are tracked. The results are stored both as `JSON` in the respective `pod` as well as summarized and dumped into the `pod logs`. -- Invocation from the exposed Autopilot API is as follows below: - -```bash -# Invoked via the `status` handle: -curl "http://autopilot-healthchecks-autopilot./status?check=iperf&workload=ring&pclients=&startport=" - -# Invoked via the `iperf` handle directly: -curl "http://autopilot-healthchecks-autopilot./iperf?workload=ring&pclients=&startport=" -``` - -### Concrete Example - -In this example, we target one node and check the pcie bandwidth and use the port-forwarding method. -In this scenario, we have a value lower than `8GB/s`, which results in an alert. This error will be exported to the OpenShift web console and on Slack, if that is enabled by admins. - -```bash -curl "http://127.0.0.1:3333/status?check=pciebw" -``` - -The output of the command above, will be similar to the following (edited to save space): - -```bash -Checking status on all nodes -Autopilot Endpoint: 10.128.6.187 -Node: hostname -url(s): http://10.128.6.187:3333/status?host=hostname&check=pciebw -Response: -Checking system status of host hostname (localhost) - -[[ PCIEBW ]] Briefings completed. Continue with PCIe Bandwidth evaluation. -[[ PCIEBW ]] FAIL -Host hostname -12.3 12.3 12.3 12.3 5.3 12.3 12.3 12.3 - -Node Status: PCIE Failed -------------------------------------- - - -Autopilot Endpoint: 10.131.4.93 -Node: hostname2 -url(s): http://10.131.4.93:3333/status?host=hostname2&check=pciebw -Response: -Checking system status of host hostname2 (localhost) - -[[ PCIEBW ]] Briefings completed. Continue with PCIe Bandwidth evaluation. -[[ PCIEBW ]] SUCCESS -Host hostname2 -12.1 12.0 12.3 12.3 11.9 11.5 12.1 12.1 - -Node Status: Ok -------------------------------------- - -Node Summary: - -{'hostname': ['PCIE Failed'], - 'hostname2': ['Ok']} - -runtime: 31.845192193984985 sec -``` +To learn how to invoke health checks, please refer to [USAGE.md](USAGE.md). 
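For reference, the `autopilot_health_checks` gauge described in HEALTH_CHECKS.md above can be queried directly in Prometheus. A minimal sketch, mirroring the label names and the 4 GB/s threshold used by the alert rules later in this changeset; the node name is a hypothetical placeholder:

```promql
# Per-GPU PCIe bandwidth readings at or below 4 on one (hypothetical) node
sum(autopilot_health_checks{health="pciebw", node="worker-node-1"} <= 4) by (node, deviceid)
```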
diff --git a/SETUP.md b/SETUP.md new file mode 100644 index 0000000..5902972 --- /dev/null +++ b/SETUP.md @@ -0,0 +1,65 @@ + +# Install Autopilot + +Autopilot can be installed through Helm and needs enough privileges to create objects like services, serviceaccounts, namespaces and relevant RBAC. + +## Helm Chart customization + +Helm chart values and the how-to for customization can be found [here](helm-charts/autopilot/README.md). + +## Install + +1) Add the autopilot repo + +```bash +helm repo add autopilot https://ibm.github.io/autopilot/ +``` + +2) Install autopilot (idempotent command). The config file is for customizing the helm values and is not mandatory; if the default values work for you, omit the `-f`. The `--namespace` parameter specifies where the helm chart will be deployed + +```bash +helm upgrade autopilot autopilot/autopilot --install --namespace=autopilot --create-namespace -f your-config.yml +``` + +The controllers should show up in the selected namespace + +```bash +kubectl get po -n autopilot +``` + +```bash +NAME READY STATUS RESTARTS AGE +autopilot-daemon-autopilot-g7j6h 1/1 Running 0 70m +autopilot-daemon-autopilot-g822n 1/1 Running 0 70m +autopilot-daemon-autopilot-x6h8d 1/1 Running 0 70m +autopilot-daemon-autopilot-xhntv 1/1 Running 0 70m +``` + +## Uninstall + +```bash + helm uninstall autopilot -n autopilot + kubectl delete namespace autopilot +``` + +## Enabling Prometheus + +### Kubernetes Users + +The ServiceMonitor object is the one that enables Prometheus to scrape the metrics produced by Autopilot. +In order for Prometheus to find the right objects, the `ServiceMonitor` needs to be labeled with the Prometheus release name. It is usually `prometheus`, and that is the default added in the Autopilot release. +If that is not the case in your cluster, the correct release label can be found by checking the `ServiceMonitor` of Prometheus itself, or the name of the Prometheus helm chart. +Then, Autopilot's `ServiceMonitor` can be labeled with the following command + +```bash +kubectl label servicemonitors.monitoring.coreos.com -n autopilot autopilot-metrics-monitor release= +``` + +### OpenShift Users + +**If on OpenShift**, the `ServiceMonitor` labeling is not required. After completing the installation, manually label the namespace to enable metrics to be scraped by Prometheus with the following command: + +```bash +kubectl label ns autopilot openshift.io/cluster-monitoring=true +``` diff --git a/USAGE.md b/USAGE.md new file mode 100644 index 0000000..515b355 --- /dev/null +++ b/USAGE.md @@ -0,0 +1,113 @@ +# Manually Query the Autopilot Service + +Autopilot provides a `/status` handler that can be queried to get the entire system status, meaning that it will run all the tests on all the nodes. Autopilot is reachable by the service name `autopilot-healthchecks.autopilot.svc` in-cluster only, meaning it can be reached from a pod running in the cluster, or through port forwarding (see below). + +Health check names are `pciebw`, `dcgm`, `remapped`, `ping`, `iperf`, `pvc`, `gpumem`.
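As a quick complement to querying the service, the node labels that Autopilot maintains (described in HEALTH_CHECKS.md) can also be listed with `kubectl`. A small usage sketch; the label keys come from this changeset, the output depends on your cluster:

```bash
# Show the overall GPU health and the latest DCGM level 3 result for every node
kubectl get nodes -L autopilot.ibm.com/gpuhealth -L autopilot.ibm.com/dcgm.level.3
```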
+ +For example, using port forwarding to localhost or by exposing the service + +```bash +kubectl port-forward service/autopilot-healthchecks 3333:3333 -n autopilot +# or oc expose service autopilot-healthchecks -n autopilot in OpenShift +``` + +If using port forward, then launch `curl` on another terminal + +```bash +curl "http://localhost:3333/status?check=pciebw&host=nodename1" +``` + +Alternatively, retrieve the route with `kubectl get routes autopilot-healthchecks -n autopilot` +When using routes, it is recommended to [increase the timeout](https://docs.openshift.com/container-platform/4.10/networking/routes/route-configuration.html#nw-configuring-route-timeouts_route-configuration) with the following command + +```bash +oc annotate route autopilot-healthchecks -n autopilot --overwrite haproxy.router.openshift.io/timeout=30m +``` + +Then: + +```bash +curl "http:///status?check=pciebw&host=nodename1" +``` + +All tests can be tailored by a combination of: + +- `host=`, to run all tests on a specific node or on a comma separated list of nodes. +- `check=`, to run a single test (`pciebw`, `dcgm`, `remapped`, `gpumem`, `ping`, `iperf` or `all`) or a list of comma separated tests. When no parameters are specified, only `pciebw`, `dcgm`, `remapped`, `ping` tests are run. +- `job=`, run tests on nodes running a job labeled with `key=value` in a specific namespace. +- `nodelabel=`, run tests on nodes having the `key=value` label. +- `batch=<#hosts>`, how many hosts to check at a single moment. Requests to the batch are run in parallel asynchronously. Batching is done to avoid running too many requests in parallel when the number of worker nodes increases. Defaults to all nodes. + +Some health checks provide further customization. More details on all the tests can be found [here](https://github.com/IBM/autopilot/autopilot-daemon/HEALTH_CHECKS.md) + +## DCGM + +This test runs `dcgmi diag`, and we support only `r` as [parameter](https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/dcgm-diagnostics.html#command-line-options). + +The default is `1`, but can customize it by `/status?check=dcgm&r=2`. + +## Network Bandwidth Validation with IPERF + +As part of this workload, Autopilot will generate the Ring Workload and then start `iperf3 servers` on each interface on each Autopilot pod based on the configuration options provided by the user. Only after the `iperf3 servers` are started, Autopilot will begin executing the workload by starting `iperf3 clients` based on the configuration options provided by the user. All results are logged back to the user. + +- For each network interface on each node, an `iperf3 server` is started. The number of `iperf3 servers` is dependent on the `number of clients` intended on being run. For example, if the `number of clients` is `8`, then there will be `8` `iperf3 servers` started per interface on a unique `port`. + +- Invocation from the exposed Autopilot API is as follows below: + +```bash +# Invoked via the `status` handle: +curl "http://autopilot-healthchecks-autopilot./status?check=iperf&workload=ring&pclients=&startport=" + +# Invoked via the `iperf` handle directly: +curl "http://autopilot-healthchecks-autopilot./iperf?workload=ring&pclients=&startport=" +``` + +## Concrete Example + +In this example, we target one node and check the pcie bandwidth and use the port-forwarding method. +In this scenario, we have a value lower than `8GB/s`, which results in an alert. 
This error will be exported to the OpenShift web console and on Slack, if that is enabled by admins. + +```bash +curl "http://127.0.0.1:3333/status?check=pciebw" +``` + +The output of the command above, will be similar to the following (edited to save space): + +```bash +Checking status on all nodes +Autopilot Endpoint: 10.128.6.187 +Node: hostname +url(s): http://10.128.6.187:3333/status?host=hostname&check=pciebw +Response: +Checking system status of host hostname (localhost) + +[[ PCIEBW ]] Briefings completed. Continue with PCIe Bandwidth evaluation. +[[ PCIEBW ]] FAIL +Host hostname +12.3 12.3 12.3 12.3 5.3 12.3 12.3 12.3 + +Node Status: PCIE Failed +------------------------------------- + + +Autopilot Endpoint: 10.131.4.93 +Node: hostname2 +url(s): http://10.131.4.93:3333/status?host=hostname2&check=pciebw +Response: +Checking system status of host hostname2 (localhost) + +[[ PCIEBW ]] Briefings completed. Continue with PCIe Bandwidth evaluation. +[[ PCIEBW ]] SUCCESS +Host hostname2 +12.1 12.0 12.3 12.3 11.9 11.5 12.1 12.1 + +Node Status: Ok +------------------------------------- + +Node Summary: + +{'hostname': ['PCIE Failed'], + 'hostname2': ['Ok']} + +runtime: 31.845192193984985 sec +``` diff --git a/alertmanager/alertmanager.yaml b/alertmanager/alertmanager.yaml index 02462dd..ae44bb5 100644 --- a/alertmanager/alertmanager.yaml +++ b/alertmanager/alertmanager.yaml @@ -48,7 +48,6 @@ receivers: link_names: true send_resolved: true text: |- - {{ template "slack.default.text" .}} {{ range .Alerts -}} *Alert:* {{ .Annotations.title }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }} *Description:* {{ .Annotations.description }} @@ -56,6 +55,10 @@ receivers: {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` {{ end }} {{ end }} + title: >- + [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing + | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ + .CommonLabels.job }} - name: Default - name: 'null' - name: Watchdog diff --git a/alertmanager/alerts/healthchecks-alerts.yaml b/alertmanager/alerts/healthchecks-alerts.yaml index 0839e4d..baf7b84 100644 --- a/alertmanager/alerts/healthchecks-alerts.yaml +++ b/alertmanager/alerts/healthchecks-alerts.yaml @@ -9,10 +9,10 @@ spec: groups: - name: Alerts on GPU related issues rules: - - alert: LowPCIeBandwidth + - alert: AutopilotLowPCIeBandwidth annotations: description: | - GPU device {{ $labels.deviceid }} on node {{ $labels.node }} has a PCIE bandwidth of {{ $value }} {{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. + GPU device {{ $labels.deviceid }} on node {{ $labels.node }} has a PCIE bandwidth of {{ $value }}{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. summary: GPU with a PCIe bandwidth of 4 or less expr: | sum (autopilot_health_checks{health="pciebw"}<=4) by (node, deviceid, value) > 0 @@ -20,10 +20,10 @@ spec: labels: severity: warning alert: autopilot - - alert: DCGMLevel1Errors + - alert: AutopilotDCGMLevel1Errors annotations: description: | - GPUs on node {{ $labels.node }} have DCGM Level 1 failures '{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. 
+ GPUs on node {{ $labels.node }} have DCGM Level 1 failures{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. summary: GPUs have DCGM failures expr: | sum (autopilot_health_checks{health="dcgm"}==1) by (node) @@ -31,21 +31,21 @@ spec: labels: severity: warning alert: autopilot - - alert: GPUPowerSlowdownEnabled + - alert: AutopilotGPUPowerSlowdownEnabled annotations: description: | GPU device {{ $labels.deviceid }} on node {{ $labels.node }} has power slowdown enabled - summary: A GPU has power slowdown enabled {{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. + summary: A GPU has power slowdown enabled{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. expr: | sum (autopilot_health_checks{health="power-slowdown"}==1) by (node, deviceid) for: 1m labels: severity: warning alert: autopilot - - alert: RemappedRowsActive + - alert: AutopilotRemappedRowsActive annotations: description: | - GPU device {{ $labels.deviceid}} on node {{ $labels.node }} with incorrect remapped rows in memory {{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. + GPU device {{ $labels.deviceid}} on node {{ $labels.node }} with incorrect remapped rows in memory{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. summary: A GPU device has incorrect remapped rows expr: | sum (autopilot_health_checks{health="remapped"}==1) by (node, deviceid) @@ -53,49 +53,49 @@ spec: labels: severity: warning alert: autopilot - - alert: DCGMLevel3Errors + - alert: AutopilotDCGMLevel3Errors annotations: description: | - A node reported errors after running DCGM level 3 - check health of nodes {{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. + A node reported errors after running DCGM level 3 - check health of nodes{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. summary: Node {{ $labels.node }} has GPU errors expr: | - kube_node_labels{label_autopilot_ibm_com_dcgm_level_3=~".*ERR.*"} and kube_node_labels{label_autopilot_ibm_com_dcgm_level_3!~""} - for: 1m + kube_node_labels{label_autopilot_ibm_com_dcgm_level_3=~".*ERR.*"} and kube_node_labels{label_autopilot_ibm_com_dcgm_level_3!~""} and kube_node_labels{label_autopilot_ibm_com_dcgm_level_3!~".*page_retirement_row_remap.*"} + for: 5m labels: severity: critical alert: autopilot - name: Alerts on network related issues rules: - - alert: PingFailures + - alert: AutopilotPingFailures annotations: description: | - IP {{ $labels.deviceid }} on node {{ $labels.node }} is unreachable {{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. 
+ Node {{ $labels.node }} cannot reach IP {{ $labels.deviceid }}{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. summary: Node has unreachable IPs expr: | - sum (autopilot_health_checks{health="ping"}==1) by (deviceid) + sum (autopilot_health_checks{health="ping"} > 0) by (deviceid) for: 10m labels: severity: critical alert: autopilot - name: Alerts on PVC related issues rules: - - alert: PVCAlert + - alert: AutopilotPVCAlert annotations: description: | - PVC creation by Autopilot on node {{ $labels.node }} failed {{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. + PVC creation by Autopilot on node {{ $labels.node }} failed{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. summary: PVC cannot be created expr: | sum (autopilot_health_checks{health="pvc"}==1) by (node) - for: 1m + for: 5m labels: severity: critical alert: autopilot - name: Generic alert on periodic check failure rules: - - alert: GPUNodeHealth + - alert: AutopilotGPUNodeHealth annotations: description: | - Node {{ $labels.node }} reported errors after running Autopilot's periodic health checks {{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. + Node {{ $labels.node }} reported errors after running Autopilot's periodic health checks{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url) }}{{ end }}{{ end }}. summary: Node {{ $labels.node }} has errors expr: | kube_node_labels{label_autopilot_ibm_com_gpuhealth=~".*ERR.*"} and kube_node_labels{label_autopilot_ibm_com_gpuhealth!~""} @@ -105,14 +105,14 @@ spec: alert: autopilot - name: Alerts on Autopilot pods not ready rules: - - alert: AutopilotPodsFailing + - alert: AutopilotPodsNotReady annotations: - description: Autopilot pod on node {{ $labels.node }} are failing {{ with $console_url := "console_url" | query }}{{ if ne + description: Autopilot pod on node {{ $labels.node }} is not ready{{ with $console_url := "console_url" | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} on cluster {{ label "url" (first $console_url ) }}{{ end }}{{ end }}. 
- summary: Autopilot pod on node {{ $labels.node }} is failing - expr: count(kube_pod_info{} and on(pod) (kube_pod_container_status_waiting{namespace=~"autopilot.*"} > 0)) by (namespace) - for: 5m + summary: Autopilot pod on node {{ $labels.node }} is not ready + expr: count by (namespace) (kube_pod_info and on (pod) (kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", namespace=~"autopilot.*"} > 0 or kube_pod_container_status_terminated_reason{reason=~"Error", namespace=~"autopilot.*"} > 0)) + for: 15m labels: severity: critical alert: autopilot \ No newline at end of file diff --git a/autopilot-daemon/Dockerfile b/autopilot-daemon/Dockerfile index d2890b8..0a04218 100644 --- a/autopilot-daemon/Dockerfile +++ b/autopilot-daemon/Dockerfile @@ -42,8 +42,6 @@ RUN apt -y update && apt -y upgrade && DEBIAN_FRONTEND="noninteractive" TZ="Am git \ && apt -y clean && apt -y autoremove -# Add capabilities for ping -RUN setcap cap_net_raw,cap_net_admin+p /bin/ping RUN add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" && apt -y update && apt install -y datacenter-gpu-manager # add ca-certificates (Alpine commands, previous base image) # RUN apk update && apk --no-cache add ca-certificates @@ -55,6 +53,8 @@ RUN add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/r # cd perftest && \ # ./autogen.sh && ./configure && \ # make && make install +# Add capabilities for ping +RUN setcap cap_net_raw,cap_net_admin+p /bin/ping RUN useradd -ms /bin/bash autopilot && usermod -g root autopilot @@ -99,7 +99,10 @@ COPY gpu-power/power-throttle.sh /home/autopilot/gpu-power/power-throttle.sh # Last touches RUN pip install --upgrade pip && pip install kubernetes netifaces aiohttp[speedups] -RUN apt -y update && apt install -y vim && apt -y clean && apt -y autoremove +RUN apt -y update && apt install -y vim curl && apt -y clean && apt -y autoremove RUN chmod 755 /usr/local/bin/autopilot && chown -hR autopilot /home/autopilot && chmod -R g=u /home/autopilot RUN chmod 777 /tmp + + + CMD ["/usr/local/bin/autopilot"] diff --git a/autopilot-daemon/helm-charts/autopilot/templates/prometheus_clusterrole_patch.yaml b/autopilot-daemon/helm-charts/autopilot/templates/prometheus_clusterrole_patch.yaml deleted file mode 100644 index 6f24c21..0000000 --- a/autopilot-daemon/helm-charts/autopilot/templates/prometheus_clusterrole_patch.yaml +++ /dev/null @@ -1,37 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: prometheus-k8s-autopilot -rules: -- apiGroups: [""] - resources: - - nodes - - nodes/metrics - - services - - endpoints - - pods - verbs: ["get", "list", "watch"] -- apiGroups: [""] - resources: - - configmaps - verbs: ["get"] -- apiGroups: - - networking.k8s.io - resources: - - ingresses - verbs: ["get", "list", "watch"] -- nonResourceURLs: ["/metrics"] - verbs: ["get"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: prometheus-k8s-autopilot -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: prometheus-k8s-autopilot -subjects: - - kind: ServiceAccount - name: prometheus-k8s - namespace: openshift-monitoring diff --git a/autopilot-daemon/network/README.md b/autopilot-daemon/network/README.md new file mode 100644 index 0000000..a34ff2c --- /dev/null +++ b/autopilot-daemon/network/README.md @@ -0,0 +1,55 @@ +# Network Validation Tests + +Autopilot provides two network validation tests: + +- Reachability: runs `ping` against all 
network interfaces available in all the Autopilot pods +- Bandwidth: runs `iperf3` to validate the network bandwidth available. + +## Iperf + +This test, in its current form, is primarily for running `TCP` `data plane` `port-to-port` network workloads to gather key performance statistics. It performs a `Ring Traversal` (or, as we call it, a `ring workload`) through all network interfaces (net1-X interfaces) at varying intensity (number of simultaneous clients & servers per interface). In future versions of Autopilot, more workloads and customization of the workloads may be provided. + +### Ring workload +A "Ring Workload", in our case, is similar to the commonly known "Ring Topology" in that the execution calls flow sequentially in a particular _direction_ that forms a "ring"-like pattern. _Most importantly, none of the compute infrastructure is actually configured in a ring; we merely develop workloads that resemble a ring pattern._ The motivation for these workloads is to achieve full line rate throughput on a port-by-port (in our case network interfaces net1-X) basis for a single logical cluster. + +Assume we have the following set of nodes `[A,B,C]`. We can create a `ring` starting from node `A` that flows in the direction of `C`: + +```console +A -> B +B -> C +C -> A +``` + +In our case, a "Ring Workload" will exhaust all starting points. We call these iterations `timesteps`. In a compute infrastructure with `n` nodes, there will be `n-1` total timesteps. Said differently, there are `n-1` possible starting points that form a ring such that no node flows to itself. Each of the pairs in a given timestep executes in parallel. + +```console +Timestep 1: +------------ +A -> B +B -> C +C -> A + +Timestep 2: +------------ +A -> C +B -> A +C -> B +``` + +As part of this workload, Autopilot will generate the Ring Workload and then start `iperf3 servers` on each interface on each Autopilot pod based on the configuration options provided by the user. Only after the `iperf3 servers` are started does Autopilot begin executing the workload by starting `iperf3 clients` based on the configuration options provided by the user. All results are logged back to the user. + +For each network interface on each node, an `iperf3 server` is started. The number of `iperf3 servers` depends on the `number of clients` intended to be run. For example, if the `number of clients` is `8`, then there will be `8` `iperf3 servers` started per interface, each on a unique `port`. + +For each timestep, all `pairs` are executed simultaneously. For each pair, some `number of clients` are started in parallel and will run for `5 seconds` using `zero-copies` against a respective `iperf3 server`. + +Metrics such as the `minimum`, `maximum`, `mean`, and `aggregate` bitrates and transfers are tracked for both the `sender` and the `receiver` for each `client -> server` execution. The results are stored both as `JSON` in the respective `pod` as well as summarized and dumped into the `pod logs`.
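To make the timestep construction concrete, the following is a minimal Python sketch of how such a ring schedule can be derived for `n` nodes. It is illustrative only, not the generator Autopilot actually ships; the node names and function name are hypothetical:

```python
from typing import Dict, List, Tuple


def ring_schedule(nodes: List[str]) -> Dict[int, List[Tuple[str, str]]]:
    """Return n-1 timesteps; in timestep k, node i sends to node (i + k) mod n."""
    n = len(nodes)
    return {
        k: [(nodes[i], nodes[(i + k) % n]) for i in range(n)]
        for k in range(1, n)  # n-1 timesteps, so no node ever pairs with itself
    }


if __name__ == "__main__":
    for step, pairs in ring_schedule(["A", "B", "C"]).items():
        print(f"Timestep {step}: " + ", ".join(f"{src} -> {dst}" for src, dst in pairs))
```

Running the sketch on `[A,B,C]` reproduces the two timesteps shown above.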
+ +Invocation from the exposed Autopilot API is as follows below: + +```bash + # Invoked via the `status` handle: +curl "http://autopilot-healthchecks-autopilot./status?check=iperf&workload=ring&pclients=&startport=" + + # Invoked via the `iperf` handle directly: +curl "http://autopilot-healthchecks-autopilot./iperf?workload=ring&pclients=&startport=" +``` diff --git a/autopilot-daemon/network/iperf3_start_servers.py b/autopilot-daemon/network/iperf3_start_servers.py index a441e29..6ab611d 100644 --- a/autopilot-daemon/network/iperf3_start_servers.py +++ b/autopilot-daemon/network/iperf3_start_servers.py @@ -26,13 +26,47 @@ def main(): num_server = args["numservers"] port = args["startport"] + interfaces = [] + entrylist = json.loads('{}') - interfaces = [ - iface - for iface in netifaces.interfaces() - if "net" in iface and iface not in ("lo", "eth0") - ] + try: + config.load_incluster_config() + v1 = client.CoreV1Api() + except: + log.error("Failed to load Kubernetes CoreV1API.") + exit(1) + try: + autopilot_pods = v1.list_namespaced_pod( + namespace=AUTOPILOT_NAMESPACE, field_selector="metadata.name="+CURR_POD_NAME + ) + except ApiException as e: + log.error( + "Exception when calling CoreV1Api->list_namespaced_pod: %s\n" % e + ) + exit(1) + + pod = autopilot_pods.items[0] + try: + entrylist = json.loads( + pod.metadata.annotations["k8s.v1.cni.cncf.io/network-status"] + ) + except KeyError: + log.info( + f'Key k8s.v1.cni.cncf.io/network-status not found on pod "{CURR_POD_NAME}" on "{CURR_WORKER_NODE_NAME}"') + if len(entrylist) > 0: + interfaces = [ + iface + for iface in netifaces.interfaces() + if "net" in iface and iface not in ("lo", "eth0", "tunl0") + ] + else: + interfaces = [ + iface + for iface in netifaces.interfaces() + if iface not in ("lo", "tunl0") + ] + if not interfaces: log.error( f'Secondary nics not found for "{CURR_POD_NAME}" on "{CURR_WORKER_NODE_NAME}".' 
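For context on the annotation parsed in the script above (and in the ping and workload scripts later in this changeset), `k8s.v1.cni.cncf.io/network-status` holds a JSON list in which each entry carries the `interface` and `ips` keys these scripts rely on. An illustrative, made-up example of such a value (network names and addresses are hypothetical):

```json
[
  { "name": "k8s-pod-network", "interface": "eth0", "ips": ["10.128.6.187"], "default": true },
  { "name": "default/multi-nic-network", "interface": "net1", "ips": ["192.168.1.10"] }
]
```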
diff --git a/autopilot-daemon/network/metrics-entrypoint.py b/autopilot-daemon/network/metrics-entrypoint.py deleted file mode 100644 index 9aed0e5..0000000 --- a/autopilot-daemon/network/metrics-entrypoint.py +++ /dev/null @@ -1,48 +0,0 @@ -import os -import subprocess -import sys - - - -def main(): - print("[[ MULTINIC-CNI-STATUS ]] Evaluating reachability of Multi-NIC CNI.") - nodename = os.getenv("NODE_NAME") - command = ['python3', './network/read_status.py', nodename] - timeout_s = 30 - try: - result = subprocess.run(command, text=True, capture_output=True, timeout=timeout_s) - except subprocess.TimeoutExpired: - print("Multi-NIC CNI health checker is not reachable - network reachability test cannot run") - sys.exit(0) - - if result.stderr: - print(result.stderr) - print("Multi-NIC CNI health checker is not reachable - network reachability test cannot run") - sys.exit(0) - else: - output = result.stdout - print(output) - if "OK" in output: - print("[[ MULTINIC-CNI-STATUS ]] SUCCESS") - else: - print("[[ MULTINIC-CNI-STATUS ]] FAIL") - print("Host ", os.getenv("NODE_NAME")) - if "cannot" in output: - print("Multi-NIC CNI health checker is not reachable - network reachability test cannot run") - sys.exit(0) - - connectable = output.split("Connectable network devices: ")[1] - devices = int(connectable.split("/")[0]) - if devices == 2: - lastline = nodename + " 1 1" - elif devices == 1: - lastline = nodename + " 1 0" - elif devices == 0: - lastline = nodename + " 0 0" - else: - lastline = "Cannot determine connectable devices" - - print("\n" + lastline) - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/autopilot-daemon/network/network_workload.py b/autopilot-daemon/network/network_workload.py index cdb8b34..76f236f 100644 --- a/autopilot-daemon/network/network_workload.py +++ b/autopilot-daemon/network/network_workload.py @@ -36,21 +36,35 @@ def get_all_ifaces(self): "Exception when calling CoreV1Api->list_namespaced_pod: %s\n" % e ) exit(1) - + entrylist = json.loads('{}') for pod in autopilot_pods.items: - entrylist = json.loads( - pod.metadata.annotations["k8s.v1.cni.cncf.io/network-status"] - ) + try: + entrylist = json.loads( + pod.metadata.annotations["k8s.v1.cni.cncf.io/network-status"] + ) + except KeyError: + log.info( + f'Key k8s.v1.cni.cncf.io/network-status not found on pod "{CURR_POD_NAME}" on "{CURR_WORKER_NODE_NAME}"') if len(entrylist) > 0: for entry in entrylist: try: iface = entry["interface"] except KeyError: - self.log.info("Interface key not found, assigning default.") - iface = "default" + self.log.info("Interface key name not found, assigning 'k8s-pod-network'.") + iface = "k8s-pod-network" if address_map.get(iface) == None: address_map[iface] = [] address_map.get(iface).append((pod.spec.node_name, entry["ips"])) + else: + pod_ips = pod.status.pod_i_ps + if pod_ips != None: + iface = "default" + if address_map.get(iface) == None: + address_map[iface] = [] + ips = [] + for pod_ip in pod_ips: + ips.append(pod_ip.ip) + address_map.get(iface).append((pod.spec.node_name, ips)) if len(address_map) == 0: self.log.error("No interfaces found. 
FAIL.") diff --git a/autopilot-daemon/network/ping-entrypoint.py b/autopilot-daemon/network/ping-entrypoint.py index e1cccaf..25230f3 100644 --- a/autopilot-daemon/network/ping-entrypoint.py +++ b/autopilot-daemon/network/ping-entrypoint.py @@ -54,20 +54,16 @@ async def main(): print("Exception when calling CoreV1Api->list_namespaced_pod: %s\n" % e) exit() - for pod in autopilot_pods.items: - if not 'k8s.v1.cni.cncf.io/network-status' in pod.metadata.annotations: - print("[PING] Pod", pod.metadata.name, "misses network annotation. Skip node", pod.spec.node_name) - # run through all pods and create a map of all interfaces print("Creating a list of interfaces and IPs") + entrylist = json.loads('{}') for pod in autopilot_pods.items: if pod.spec.node_name != nodename_self and (allnodes or (pod.spec.node_name in nodemap.keys())): try: entrylist = json.loads(pod.metadata.annotations['k8s.v1.cni.cncf.io/network-status']) except KeyError: - print("Key k8s.v1.cni.cncf.io/network-status not found on pod", pod.metadata.name, "- Skipping node", pod.spec.node_name) - continue - else: + print("Key k8s.v1.cni.cncf.io/network-status not found on pod", pod.metadata.name, "- node", pod.spec.node_name) + if len(entrylist) > 0 : node={} nodes[pod.spec.node_name] = node for entry in entrylist: @@ -81,6 +77,22 @@ async def main(): 'ips': entry['ips'], 'pod': pod.metadata.name } + else: + node={} + nodes[pod.spec.node_name] = node + pod_ips = pod.status.pod_i_ps + if pod_ips != None: + iface = "default" + ifaces = ifaces | {iface} + iplist = [] + for pod_ip in pod_ips: + iplist.append(pod_ip.ip) + node[iface] = { + 'ips': iplist, + 'pod': pod.metadata.name + } + + if len(nodes.keys()) == 0: print("[PING] No nodes found. ABORT") @@ -99,7 +111,8 @@ async def main(): continue for index, ip in enumerate(ips): command = ['ping',ip,'-t','45','-c','10'] - clients.append((subprocess.Popen(command, start_new_session=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE), nodename, ip, "net-"+str(index))) + indexed_iface = iface+("-"+str(index) if len(ips)>1 else "") + clients.append((subprocess.Popen(command, start_new_session=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE), nodename, ip, indexed_iface)) for c in clients: try: c[0].wait(50) @@ -111,7 +124,7 @@ async def main(): stdout, stderr = c[0].communicate() if stderr: print("[PING] output parse exited with error: " + stderr) - print("FAIL") + fail = True else: if "Unreachable" in stdout or "100% packet loss" in stdout: print("Node", c[1], c[2], c[3], "1") @@ -128,24 +141,36 @@ def check_local_ifaces(): pod_list = kubeapi.list_namespaced_pod(namespace=namespace_self, field_selector="metadata.name="+podname) ips = [] iface_count = 0 - pod_self = pod_list.items[0] + pod_self = pod_list.items[0] + entrylist = json.loads('{}') + ip_addresses = [netifaces.ifaddresses(iface)[netifaces.AF_INET][0]['addr'] for iface in netifaces.interfaces() if netifaces.AF_INET in netifaces.ifaddresses(iface)] try: entrylist = json.loads(pod_self.metadata.annotations['k8s.v1.cni.cncf.io/network-status']) except KeyError: - print("Key k8s.v1.cni.cncf.io/network-status not found on pod", pod_self.metadata.name, "- Skipping node", pod_self.spec.node_name) - for entry in entrylist: - try: - iface=entry['interface'] - except KeyError: - continue - ips.append(entry['ips']) - iface_count += len(entry['ips']) - ifaces = netifaces.interfaces() - ifaces.remove('lo') - - if iface_count != len(ifaces) : - print("[PING] IFACES count inconsistent. 
Pod annotation reports", ips, ", not found in the pod among", ifaces, "ABORT") - exit() + print("Key k8s.v1.cni.cncf.io/network-status not found on pod", pod_self.metadata.name, "- node", pod_self.spec.node_name) + if len(entrylist) > 0: + for entry in entrylist: + try: + iface=entry['interface'] + except KeyError: + continue + for ip in entry['ips']: + if ip not in ip_addresses: + print("[PING] IFACES count inconsistent. Pod annotation reports", entry['ips'], ", not found in the pod among", ip_addresses, "ABORT") + exit() + ips.append(entry['ips']) + iface_count += len(entry['ips']) + else: + pod_ips = pod_self.status.pod_i_ps + if pod_ips != None: + for pod_ip in pod_ips: + if pod_ip.ip not in ip_addresses: + print("[PING] IFACES count inconsistent. Pod annotation reports", pod_ip.ip, ", not found in the pod among", ip_addresses, "ABORT") + exit() + ips.append(pod_ip.ip) + iface_count += len(pod_ips) + + def get_job_nodes(nodelist): v1 = client.CoreV1Api() diff --git a/autopilot-daemon/pkg/handlers/healthchecks.go b/autopilot-daemon/pkg/handlers/healthchecks.go index 3d2e24d..5e01edf 100644 --- a/autopilot-daemon/pkg/handlers/healthchecks.go +++ b/autopilot-daemon/pkg/handlers/healthchecks.go @@ -326,12 +326,14 @@ func runPing(nodelist string, jobName string, nodelabel string) (*[]byte, error) for _, line := range lines { if strings.HasPrefix(line, "Node") { entry := strings.Split(line, " ") - if entry[len(entry)-1] == "1" { - utils.HchecksGauge.WithLabelValues("ping", os.Getenv("NODE_NAME"), utils.CPUModel, utils.GPUModel, entry[1]).Set(1) - klog.Info("Observation: ", entry[1], " ", entry[2], " ", entry[3], " Unreachable") - unreach_nodes[entry[1]] = append(unreach_nodes[entry[1]], entry[2]) - } else { - utils.HchecksGauge.WithLabelValues("ping", os.Getenv("NODE_NAME"), utils.CPUModel, utils.GPUModel, entry[1]).Set(0) + if _, exists := unreach_nodes[entry[1]]; !exists { + if entry[len(entry)-1] == "1" { + utils.HchecksGauge.WithLabelValues("ping", os.Getenv("NODE_NAME"), utils.CPUModel, utils.GPUModel, entry[1]).Set(float64(1)) + klog.Info("Observation: ", entry[1], " ", entry[2], " ", entry[3], " Unreachable") + unreach_nodes[entry[1]] = append(unreach_nodes[entry[1]], entry[2]) + } else { + utils.HchecksGauge.WithLabelValues("ping", os.Getenv("NODE_NAME"), utils.CPUModel, utils.GPUModel, entry[1]).Set(float64(0)) + } } } } diff --git a/autopilot-daemon/helm-charts/autopilot/.helmignore b/helm-charts/autopilot/.helmignore similarity index 100% rename from autopilot-daemon/helm-charts/autopilot/.helmignore rename to helm-charts/autopilot/.helmignore diff --git a/autopilot-daemon/helm-charts/autopilot/Chart.yaml b/helm-charts/autopilot/Chart.yaml similarity index 98% rename from autopilot-daemon/helm-charts/autopilot/Chart.yaml rename to helm-charts/autopilot/Chart.yaml index d0be5ff..ceae232 100644 --- a/autopilot-daemon/helm-charts/autopilot/Chart.yaml +++ b/helm-charts/autopilot/Chart.yaml @@ -15,7 +15,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: v1.9.0 +version: v2.0.0 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. 
Versions are not expected to diff --git a/autopilot-daemon/helm-charts/autopilot/README.md b/helm-charts/autopilot/README.md similarity index 57% rename from autopilot-daemon/helm-charts/autopilot/README.md rename to helm-charts/autopilot/README.md index 832f4c1..1fdc7ff 100644 --- a/autopilot-daemon/helm-charts/autopilot/README.md +++ b/helm-charts/autopilot/README.md @@ -1,27 +1,31 @@ # Helm Chart Customization +## Latest tag + +At every PR merge, we automatically build the `latest` tag that can be pulled by using `quay.io/autopilot/autopilot:latest`. + +This tag contains the latest changes and it must be considered as a dev image. For stable releases, always refer to the published ones. + +## Customize Helm chart + Autopilot is set to run on NVidia GPU nodes. It is possible to run it on heterogeneous nodes (i.e., CPU only and GPU only), GPU only nodes or CPU only nodes. + ```yaml onlyOnGPUNodes: true ``` + Running on GPU nodes only, will: -1) add the `nvidia.com/gpu.present: 'true'` label and + +1) add the `nvidia.com/gpu.present: 'true'` label and 2) enable the init container, which checks on the nvidia device plug-in to be setup Alternatively, `onlyOnGPUNodes` can be set to false and Autopilot will run on all worker nodes, regardless of the accelerators. - -By default, it will create a namespace named `autopilot` where to run the components. Users workloads do not run in the autopilot namespace. The creation of the namespace can be disabled by setting `create` to false in the namespace block of the `Values.yaml` file. - -```yaml -namespace: - create: true - name: autopilot -``` +Notice that, in this heterogeneous case, the GPU health checks will error out in the non-GPU nodes. If you do not want to create a new namespace and use an existing one, then set `create: false` and specify the namespace name. -On OpenShift, please ntice that you **must** label the namespace `oc label ns openshift.io/cluster-monitoring=true` to have Prometheus scrape metrics from Autopilot. +On OpenShift, please notice that you **must** label the namespace `oc label ns openshift.io/cluster-monitoring=true` to have Prometheus scrape metrics from Autopilot. -- To pull the image from a private registry, the admin needs to add `imagePullSecret` data in one of the helm charts. It is possible to avoid the creation of the pull secret by setting the value `create` to false in the imagePullSecret block, and by setting the name of the one that will be used (i.e., `autopilot-pull-secret`). +- To pull the image from a private registry, i.e., in case of development, the admin needs to add `imagePullSecret` data in one of the helm charts. It is possible to avoid the creation of the pull secret by setting the value `create` to false in the imagePullSecret block, and by setting the name of the one that will be used (i.e., `autopilot-pull-secret`). ```yaml pullSecrets: @@ -34,7 +38,7 @@ pullSecrets: ```yaml repeat: # periodic health checks timer (default 1h) -intrusive: # deeper diagnostic timer (default 4h, 0 to disable) +invasive: # deeper diagnostic timer (default 4h, 0 to disable) ``` - PCIe bandwidth critical value is defaulted to 4GB/s. It can be customized by changing the following @@ -62,8 +66,12 @@ env: value: "example-storage-class" ``` -All these values can be saved in a `config.yaml` file, which can be passed to the `helm` install command +All these values can be saved in a `config.yaml` file. 
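To make this concrete, a hypothetical `config.yaml` combining the values discussed above could look like the sketch below. The keys mirror the chart values referenced elsewhere in this changeset; the specific values, and the assumption that the timers take plain hour counts, are illustrative only:

```yaml
# Illustrative overrides only - adjust to your cluster
onlyOnGPUNodes: true   # run the health checks on GPU nodes only
PCIeBW: 4              # PCIe bandwidth threshold below which links are flagged
repeat: 1              # periodic health checks timer (default 1h per this README)
invasive: 4            # deeper diagnostics timer (default 4h, 0 to disable)
image:
  tag: latest          # dev image; stable releases default to the chart appVersion
```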
+ +## Install + +If you have your own configuration file, it can be passed to the `helm` install command with the `-f` parameter. If you want to install the default values, just omit the parameter. ```bash -helm upgrade autopilot autopilot/autopilot --install --namespace= -f your-config.yml -``` \ No newline at end of file +helm upgrade autopilot autopilot/autopilot --install --namespace=autopilot --create-namespace <-f your-config.yml> +``` diff --git a/autopilot-daemon/helm-charts/autopilot/templates/NOTES.txt b/helm-charts/autopilot/templates/NOTES.txt similarity index 100% rename from autopilot-daemon/helm-charts/autopilot/templates/NOTES.txt rename to helm-charts/autopilot/templates/NOTES.txt diff --git a/autopilot-daemon/helm-charts/autopilot/templates/_helpers.tpl b/helm-charts/autopilot/templates/_helpers.tpl similarity index 100% rename from autopilot-daemon/helm-charts/autopilot/templates/_helpers.tpl rename to helm-charts/autopilot/templates/_helpers.tpl diff --git a/autopilot-daemon/helm-charts/autopilot/templates/autopilot.yaml b/helm-charts/autopilot/templates/autopilot.yaml similarity index 88% rename from autopilot-daemon/helm-charts/autopilot/templates/autopilot.yaml rename to helm-charts/autopilot/templates/autopilot.yaml index 4eee82e..70eddc8 100644 --- a/autopilot-daemon/helm-charts/autopilot/templates/autopilot.yaml +++ b/helm-charts/autopilot/templates/autopilot.yaml @@ -4,7 +4,6 @@ metadata: labels: app: autopilot name: {{ printf "%s" .Chart.Name }} - namespace: {{ .Values.namespace.name }} spec: selector: matchLabels: @@ -39,32 +38,29 @@ spec: command: - sh - -c - image: {{ printf "%s:%s" .Values.image.repository .Values.image.tag }} + image: {{ .Values.image.repository }}:{{ default .Chart.AppVersion .Values.image.tag }} imagePullPolicy: Always name: device-plugin-validation securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL runAsNonRoot: true + runAsUser: 1000910000 {{- end}} containers: - - image: {{ printf "%s:%s" .Values.image.repository .Values.image.tag }} + - image: {{ .Values.image.repository }}:{{ default .Chart.AppVersion .Values.image.tag }} command: - sh - -c - | - iperf3 -s -p 6310 -D /usr/local/bin/autopilot --port {{ .Values.service.port }} --loglevel={{ .Values.loglevel }} --bw {{ .Values.PCIeBW }} --w {{ .Values.repeat }} --invasive-check-timer {{ .Values.invasive }} imagePullPolicy: {{ .Values.image.pullPolicy }} name: autopilot securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL runAsNonRoot: true + runAsUser: 1000910000 + capabilities: + add: + - NET_RAW + - NET_ADMIN env: {{- range .Values.env }} - name: {{ .name }} diff --git a/autopilot-daemon/helm-charts/autopilot/templates/metrics_service.yaml b/helm-charts/autopilot/templates/metrics_service.yaml similarity index 83% rename from autopilot-daemon/helm-charts/autopilot/templates/metrics_service.yaml rename to helm-charts/autopilot/templates/metrics_service.yaml index 704e536..0bc3af6 100644 --- a/autopilot-daemon/helm-charts/autopilot/templates/metrics_service.yaml +++ b/helm-charts/autopilot/templates/metrics_service.yaml @@ -4,7 +4,6 @@ metadata: labels: app: autopilot name: autopilot-metrics-service - namespace: {{ .Values.namespace.name }} spec: ports: - name: http diff --git a/autopilot-daemon/helm-charts/autopilot/templates/pullsecret.yaml b/helm-charts/autopilot/templates/pullsecret.yaml similarity index 84% rename from autopilot-daemon/helm-charts/autopilot/templates/pullsecret.yaml rename to 
helm-charts/autopilot/templates/pullsecret.yaml index 22aece9..e5acd23 100644 --- a/autopilot-daemon/helm-charts/autopilot/templates/pullsecret.yaml +++ b/helm-charts/autopilot/templates/pullsecret.yaml @@ -5,6 +5,5 @@ data: kind: Secret metadata: name: {{ .Values.pullSecrets.name }} - namespace: {{ .Values.namespace.name }} type: kubernetes.io/dockerconfigjson {{- end}} \ No newline at end of file diff --git a/autopilot-daemon/helm-charts/autopilot/templates/service.yaml b/helm-charts/autopilot/templates/service.yaml similarity index 86% rename from autopilot-daemon/helm-charts/autopilot/templates/service.yaml rename to helm-charts/autopilot/templates/service.yaml index db6ca8b..1f4bc12 100644 --- a/autopilot-daemon/helm-charts/autopilot/templates/service.yaml +++ b/helm-charts/autopilot/templates/service.yaml @@ -4,7 +4,6 @@ metadata: labels: app: autopilot name: autopilot-healthchecks - namespace: {{ .Values.namespace.name }} annotations: {{- toYaml .Values.serviceAnnotations | nindent 4 }} spec: @@ -21,7 +20,6 @@ metadata: labels: app: autopilot name: autopilot-readinessprobe - namespace: {{ .Values.namespace.name }} spec: ports: - port: 8080 diff --git a/autopilot-daemon/helm-charts/autopilot/templates/serviceaccount.yaml b/helm-charts/autopilot/templates/serviceaccount.yaml similarity index 59% rename from autopilot-daemon/helm-charts/autopilot/templates/serviceaccount.yaml rename to helm-charts/autopilot/templates/serviceaccount.yaml index a3ab3d0..cead03f 100644 --- a/autopilot-daemon/helm-charts/autopilot/templates/serviceaccount.yaml +++ b/helm-charts/autopilot/templates/serviceaccount.yaml @@ -1,26 +1,43 @@ -{{- if .Values.namespace.create -}} -kind: Namespace -apiVersion: v1 +{{ if .Capabilities.APIVersions.Has "security.openshift.io/v1" -}} +kind: SecurityContextConstraints +apiVersion: security.openshift.io/v1 metadata: - labels: - openshift.io/cluster-monitoring: "true" - name: {{ .Values.namespace.name }} -spec: - finalizers: - - kubernetes -{{- end }} + name: scc-autopilot +allowPrivilegedContainer: true +runAsUser: + type: RunAsAny +seLinuxContext: + type: RunAsAny +fsGroup: + type: RunAsAny +supplementalGroups: + type: RunAsAny +users: +- system:serviceaccount:{{ .Release.Namespace }}:autopilot +allowedCapabilities: +- 'NET_RAW' +- 'NET_ADMIN' +volumes: +- configMap +- csi +- downwardAPI +- emptyDir +- ephemeral +- hostPath +- persistentVolumeClaim +- projected +- secret +{{ end -}} --- apiVersion: v1 kind: ServiceAccount metadata: name: autopilot - namespace: {{ .Values.namespace.name }} --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: name: autopilot - namespace: {{ .Values.namespace.name }} rules: - apiGroups: [""] resources: ["endpoints"] @@ -45,11 +62,10 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: name: autopilot - namespace: {{ .Values.namespace.name }} subjects: - kind: ServiceAccount + namespace: {{ .Release.Namespace }} name: autopilot - namespace: {{ .Values.namespace.name }} roleRef: kind: ClusterRole name: autopilot diff --git a/autopilot-daemon/helm-charts/autopilot/templates/monitor.yaml b/helm-charts/autopilot/templates/servicemonitor.yaml similarity index 78% rename from autopilot-daemon/helm-charts/autopilot/templates/monitor.yaml rename to helm-charts/autopilot/templates/servicemonitor.yaml index 21370e2..c04b2c8 100644 --- a/autopilot-daemon/helm-charts/autopilot/templates/monitor.yaml +++ b/helm-charts/autopilot/templates/servicemonitor.yaml @@ -1,4 +1,5 @@ # Prometheus Monitor Service 
(Metrics) +{{ if .Capabilities.APIVersions.Has "monitoring.coreos.com/v1" -}} apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: @@ -6,8 +7,8 @@ metadata: app: autopilot app.kubernetes.io/name: servicemonitor app.kubernetes.io/component: metrics + release: prometheus name: autopilot-metrics-monitor - namespace: {{ .Values.namespace.name }} spec: endpoints: - path: /metrics @@ -16,3 +17,4 @@ spec: selector: matchLabels: app: autopilot +{{ end -}} diff --git a/autopilot-daemon/helm-charts/autopilot/values.yaml b/helm-charts/autopilot/values.yaml similarity index 94% rename from autopilot-daemon/helm-charts/autopilot/values.yaml rename to helm-charts/autopilot/values.yaml index 3385ef0..96fbed7 100644 --- a/autopilot-daemon/helm-charts/autopilot/values.yaml +++ b/helm-charts/autopilot/values.yaml @@ -1,14 +1,8 @@ # Default values for the Autopilot DaemonSet. # This is a YAML-formatted file. # Declare variables to be passed into your templates. - -namespace: - create: true - name: autopilot - image: - repository: quay.io/autopilot/autopilot - tag: v1.9.0 + repository: quay.io/autopilot/autopilot pullPolicy: Always # Bandwidth threshold below which PCIe links are considered broken (Gb/s) diff --git a/utility-tools/Autopilot-Grafana-Dashboard.json b/utility-tools/Autopilot-Grafana-Dashboard.json deleted file mode 100644 index af928d5..0000000 --- a/utility-tools/Autopilot-Grafana-Dashboard.json +++ /dev/null @@ -1,780 +0,0 @@ -{ - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "id": 1, - "iteration": 1689354875983, - "links": [], - "panels": [ - { - "collapsed": false, - "datasource": "OpenShift_Prometheus", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 22, - "panels": [], - "title": "Single Node Stats", - "type": "row" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": null, - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 1 - }, - "hiddenSeries": false, - "id": 20, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "autopilot_health_checks{node=\"$node\",health=\"$health\",deviceid=\"$deviceid\"}", - "interval": "", - "legendFormat": "{{ health }} for device {{ deviceid }} on {{ node }}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Single Node Metrics", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": 
true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "collapsed": false, - "datasource": "OpenShift_Prometheus", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 9 - }, - "id": 24, - "panels": [], - "title": "All Nodes", - "type": "row" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "OpenShift_Prometheus", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 1, - "fillGradient": 3, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 10 - }, - "hiddenSeries": false, - "id": 6, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "autopilot_health_checks{health=\"pciebw\"}", - "hide": false, - "interval": "", - "legendFormat": "GPU {{ deviceid }} - {{ node }}", - "refId": "A" - }, - { - "exemplar": true, - "expr": "autopilot_health_checks{health=\"pciebw\",node=\"$node\"}", - "hide": true, - "instant": true, - "interval": "", - "legendFormat": "GPU {{ deviceid }} - {{ node }}", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "PCIe Bandwidths (Gauge)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "datasource": "OpenShift_Prometheus", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": "" - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 24, - "x": 0, - "y": 18 - }, - "id": 14, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" - }, - "pluginVersion": "7.5.17", - "targets": [ - { - "exemplar": true, - "expr": "autopilot_health_checks{health=\"pciebw\"} < 4", - "interval": "", - "legendFormat": "GPU {{deviceid}} - {{node}}", - "refId": "A" - } - ], - "title": "GPUs with low PCIeBW (less than 4 GB/s)", - "type": "stat" - }, - { - "datasource": "OpenShift_Prometheus", - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 0 - }, - { - "color": "#EAB839", - "value": 1 - }, - { - "color": "green", - "value": 2 - } - ] - }, - "unit": "string" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 27 - }, - 
"id": 4, - "options": { - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true, - "text": {} - }, - "pluginVersion": "7.5.17", - "targets": [ - { - "exemplar": true, - "expr": "autopilot_health_report_total{health=\"netdevice\"}", - "format": "time_series", - "hide": true, - "interval": "", - "legendFormat": "NIC {{ deviceid }}", - "refId": "A" - }, - { - "exemplar": true, - "expr": "group (avg(autopilot_health_report_total{health=\"netdevice\"})) by (node))", - "hide": true, - "interval": "", - "legendFormat": "{{ node }} ", - "refId": "B" - }, - { - "exemplar": true, - "expr": "sum(autopilot_health_checks{health=\"net-reach\"})by(node) < 2", - "hide": false, - "interval": "", - "legendFormat": "{{ node }}", - "refId": "C" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Faulty Secondary NIC by Node", - "type": "gauge" - }, - { - "datasource": "OpenShift_Prometheus", - "description": "Checks remapped rows on GPUs. If no remapped rows, then value is 0. 1 otherwise.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 1 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 27 - }, - "id": 8, - "options": { - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true, - "text": {} - }, - "pluginVersion": "7.5.17", - "targets": [ - { - "exemplar": true, - "expr": "sum(autopilot_health_checks{health=\"remapped\"})by(node)>0", - "hide": false, - "interval": "", - "legendFormat": " {{ node }}", - "refId": "B" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Faulty Remapped Rows by Node", - "type": "gauge" - }, - { - "collapsed": true, - "datasource": "OpenShift_Prometheus", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 35 - }, - "id": 26, - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "OpenShift_Prometheus", - "description": "", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 11, - "x": 0, - "y": 27 - }, - "hiddenSeries": false, - "id": 12, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum_over_time(scrape_series_added{job=\"autopilot-metrics-service\"}[1h]) ", - "interval": "", - "legendFormat": "{{pod}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Approximate number of new series in this scrape", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, 
- "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "OpenShift_Prometheus", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 27 - }, - "hiddenSeries": false, - "id": 10, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "topk(10, sum without(instance)(sum_over_time(scrape_series_added[1h])))", - "hide": true, - "interval": "", - "legendFormat": "", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum_over_time(scrape_samples_scraped{job=\"autopilot-metrics-service\"}[1h])", - "hide": false, - "interval": "", - "legendFormat": "{{pod}}", - "refId": "C" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Number of samples exposed", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - } - ], - "title": "Timeseries Stats", - "type": "row" - } - ], - "refresh": "1m", - "schemaVersion": 27, - "style": "dark", - "tags": [], - "templating": { - "list": [ - { - "allValue": null, - "current": { - "selected": false, - "text": "llm-dev-ocp-dp4kx-worker-3-with-secondary-4ll7b", - "value": "llm-dev-ocp-dp4kx-worker-3-with-secondary-4ll7b" - }, - "datasource": "OpenShift_Prometheus", - "definition": "label_values(nodename)", - "description": null, - "error": null, - "hide": 0, - "includeAll": false, - "label": "Node", - "multi": false, - "name": "node", - "options": [], - "query": { - "query": "label_values(nodename)", - "refId": "StandardVariableQuery" - }, - "refresh": 1, - "regex": "/.*-worker-*/", - "skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "selected": false, - "text": "pciebw", - "value": "pciebw" - }, - "datasource": "OpenShift_Prometheus", - "definition": "label_values(health)", - "description": null, - "error": null, - "hide": 0, - "includeAll": false, - "label": "Health Check", - "multi": false, - "name": "health", - "options": [], - "query": { - "query": "label_values(health)", - "refId": "StandardVariableQuery" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "selected": false, - "text": "3", - "value": "3" - }, - "datasource": 
"OpenShift_Prometheus", - "definition": "label_values(deviceid)", - "description": "GPU (0-7), NIC (1-2)", - "error": null, - "hide": 0, - "includeAll": false, - "label": "Device ID", - "multi": false, - "name": "deviceid", - "options": [], - "query": { - "query": "label_values(deviceid)", - "refId": "StandardVariableQuery" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 3, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-30m", - "to": "now" - }, - "timepicker": {}, - "timezone": "", - "title": "Autopilot", - "uid": "Ny3de_UVz", - "version": 8 -} diff --git a/utility-tools/cronjobs/pcie-cronjob.yaml b/utility-tools/cronjobs/pcie-cronjob.yaml deleted file mode 100644 index ea1202f..0000000 --- a/utility-tools/cronjobs/pcie-cronjob.yaml +++ /dev/null @@ -1,42 +0,0 @@ -apiVersion: batch/v1 -kind: CronJob -metadata: - name: pcie-bw-cronjob - labels: - autopilot: "" - gpu-pcie-bw: "" -spec: - schedule: "30 * * * *" - concurrencyPolicy: Replace - startingDeadlineSeconds: 120 - successfulJobsHistoryLimit: 1 - failedJobsHistoryLimit: 1 - jobTemplate: - metadata: - labels: - autopilot: "" - gpu-pcie-bw: "" - spec: - parallelism: 9 - template: - spec: - imagePullSecrets: - - name: all-icr-io - containers: - - name: main - image: busybox - imagePullPolicy: IfNotPresent - command: - - /bin/sh - - -c - - date; echo Another instance of batch job - resources: - limits: - cpu: 1m - memory: 5G - nvidia.com/gpu: 8 - requests: - cpu: 1m - memory: 5G - nvidia.com/gpu: 8 - restartPolicy: Never diff --git a/utility-tools/system-check/README.md b/utility-tools/system-check/README.md deleted file mode 100644 index f264392..0000000 --- a/utility-tools/system-check/README.md +++ /dev/null @@ -1,48 +0,0 @@ -# Complete Health Check Assessment - -This Helm chart can be used to launch a Pod that runs the health checks on all the nodes. - -The health checks are: - -- PCIe Bandwidth on all GPUs, host to device -- Check on existing GPUs remapped rows -- Check on correct availability of Secondary Nics - -This workload is a Python program listing the Autopilot endpoints belonging to the Autopilot Kubernetes Service that exposes the health checks API. - -This program needs to deploy a ClusterRole and ClusterRoleBinding required to list the endpoints, therefore this workload can be deployed only if the necessary privileges are granted (i.e., cluster admins). - -## Installation - -The Helm chart can be configured by updating the following values in `values.yaml`: - -- `namespace` where to run the Pod. The namespace needs to have a valid `ImagePullSecret` to get images from `us.icr.io` or `icr.io` -- `imagePullSecret` defaulted to `all-icr-io` -- `autopilotService` is the name of the Service that exposes the health checks endpoints. It is defaulted to `autopilot-healthchecks` -- `autopilotNamespace` is the namespace where the Autopilot daemons are running. It is defaulted to `autopilot` -- `targetNode` to run the test(s) on a specific node only, rather than on the entire system -- `testType` is the type of test that will run i.e. pciebw, nic, remapped, or all -- `batchSize` is the number of nodes running a health check per processor. It is defaulted to `1` node per processor -- `workload` runs desired health checks on nodes where a workload is deployed. Ex: "namespace:job-name=my-job" or "namespace:app=my-app". It is defaulted to None. 
- -To deploy the Pod: - -```bash -helm install system-check utility-tools/system-check/charts/ -``` - -Logs can be streamed with - -```bash -kubectl logs -f system-check -``` - -All the health checks expose metrics that can be plotted through Grafana dashboards. A `json` file for a set of predefined dashboards in Autopilot, can be found in the `utiliy-tools` directory in this repository. - -## Uninstall - -To uninstall: - -```bash -helm uninstall system-check -``` diff --git a/utility-tools/system-check/charts/Chart.yaml b/utility-tools/system-check/charts/Chart.yaml deleted file mode 100644 index 1bc1446..0000000 --- a/utility-tools/system-check/charts/Chart.yaml +++ /dev/null @@ -1,6 +0,0 @@ -apiVersion: v2 -name: generator -description: A Helm chart to run autopilot tests standalone -type: application -version: 0.1.0 -appVersion: "0.1.0" \ No newline at end of file diff --git a/utility-tools/system-check/charts/templates/system-check.yaml b/utility-tools/system-check/charts/templates/system-check.yaml deleted file mode 100644 index 9fa8215..0000000 --- a/utility-tools/system-check/charts/templates/system-check.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: autopilot-endpoint-reader -rules: -- apiGroups: [""] - resources: ["endpoints"] - verbs: ["get", "list"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: autopilot-endpoint-reader -subjects: -- kind: ServiceAccount - name: autopilot-endpoint-reader # Name is case sensitive - namespace: {{ .Values.namespace }} -roleRef: - kind: ClusterRole - name: autopilot-endpoint-reader - apiGroup: rbac.authorization.k8s.io ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: autopilot-endpoint-reader - namespace: {{ .Values.namespace }} ---- -apiVersion: v1 -kind: Pod -metadata: - name: system-check -spec: - restartPolicy: Never - serviceAccountName: autopilot-endpoint-reader - imagePullSecrets: - - name: {{ .Values.imagePullSecret }} - # nodeSelector: - # kubernetes.io/hostname: {{ .Values.nodename }} - containers: - - image: {{ .Values.image }} - name: system-check - args: ['--service={{ .Values.autopilotService }}', '--namespace={{ .Values.autopilotNamespace }}', '--nodes={{ .Values.targetNode }}', '--check={{ .Values.testType }}', '--batchSize={{ .Values.batchSize }}', '--wkload={{ .Values.workload }}'] - env: - - name: MY_NODE_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - resources: - limits: - cpu: 4 - memory: 500M - requests: - cpu: 4 - memory: 500M diff --git a/utility-tools/system-check/charts/values.yaml b/utility-tools/system-check/charts/values.yaml deleted file mode 100644 index 6a1a1bd..0000000 --- a/utility-tools/system-check/charts/values.yaml +++ /dev/null @@ -1,9 +0,0 @@ -image: us.icr.io/cil15-shared-registry/autopilot/run-healthchecks:v2 -namespace: # namespace where to deploy the helm chart. -imagePullSecret: "all-icr-io" -autopilotService: "autopilot-healthchecks" # default=autopilot-healthchecks. -autopilotNamespace: "autopilot" # default=autopilot. -targetNode: # node where to run the health check i.e. all, a specific node name, or list of node names. default=all unless workload provided. -testType: # type of test that will run i.e. pciebw, nic, remapped, or all. default=all -batchSize: # number of nodes running a health check per processor. default=1 -workload: # workload namespace and label with key and value. Ex: "namespace:job-name=my-job" or "namespace:app=my-app". Default is set to None.