Skip to content

Commit

Permalink
RHOAIENG-16076: tests(gha): install a cri-o backed kubernetes for run…
Browse files Browse the repository at this point in the history
…ning Makefile tests (opendatahub-io#783)

* RHOAIENG-16076: tests(gha): install a cri-o backed kubernetes for running tests

* Update .github/workflows/build-notebooks-TEMPLATE.yaml

Co-authored-by: Jan Stourac <jstourac@redhat.com>

---------

Co-authored-by: Jan Stourac <jstourac@redhat.com>
(cherry picked from commit 117a479)
  • Loading branch information
jiridanek committed Dec 18, 2024
1 parent b4b6a7b commit 3c2fb93
Show file tree
Hide file tree
Showing 5 changed files with 257 additions and 0 deletions.
115 changes: 115 additions & 0 deletions .github/workflows/build-notebooks-TEMPLATE.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,121 @@ jobs:
# Print the locally stored images together with their digests into the job log
- name: Show podman images information
  run: podman images --digests

# Runs has_tests.py, which writes the `tests` step output (true/false) that the
# `if:` guards of the cri-o / Kubernetes steps read.
- name: "Check if we have tests or not"
  id: have-tests
  # Hand the input over through the environment instead of interpolating it
  # into the script text, so its value is never parsed as shell code and is
  # always passed as a single argument.
  run: ci/cached-builds/has_tests.py --target "$TARGET"
  env:
    TARGET: ${{ inputs.target }}

# https://cri-o.io/
# Installs cri-o plus kubelet/kubeadm/kubectl from pkgs.k8s.io, installs the
# repository's own CNI bridge and cri-o configuration, then starts cri-o.
- name: Install cri-o
  if: ${{ steps.have-tests.outputs.tests == 'true' }}
  run: |
    set -Eeuxo pipefail
    sudo apt-get update
    sudo apt-get install -y software-properties-common curl

    # The keyring directory is not guaranteed to exist on every runner image;
    # create it so the `gpg --dearmor -o` calls below have somewhere to write
    sudo mkdir -p -m 755 /etc/apt/keyrings

    # Kubernetes apt repository (kubelet, kubeadm, kubectl)
    curl -fsSL https://pkgs.k8s.io/core:/stable:/$KUBERNETES_VERSION/deb/Release.key | \
      sudo gpg --dearmor --batch --yes -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg
    echo "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/$KUBERNETES_VERSION/deb/ /" | \
      sudo tee /etc/apt/sources.list.d/kubernetes.list

    # cri-o apt repository
    curl -fsSL https://pkgs.k8s.io/addons:/cri-o:/stable:/$CRIO_VERSION/deb/Release.key | \
      sudo gpg --dearmor --batch --yes -o /etc/apt/keyrings/cri-o-apt-keyring.gpg
    echo "deb [signed-by=/etc/apt/keyrings/cri-o-apt-keyring.gpg] https://pkgs.k8s.io/addons:/cri-o:/stable:/$CRIO_VERSION/deb/ /" | \
      sudo tee /etc/apt/sources.list.d/cri-o.list

    sudo apt-get update
    sudo apt-get install -y cri-o kubelet kubeadm kubectl

    # Drop whatever CNI configuration is present and install our own copy of
    # 11-crio-ipv4-bridge.conflist, so we don't need a pod network and just
    # use the default bridge
    sudo rm -rf /etc/cni/net.d/*
    # cat /etc/cni/net.d/11-crio-ipv4-bridge.conflist
    # https://github.com/containerd/containerd/blob/main/script%2Fsetup%2Finstall-cni
    # https://www.cni.dev/plugins/current/main/bridge/
    sudo cp ci/cached-builds/11-crio-ipv4-bridge.conflist /etc/cni/net.d/11-crio-ipv4-bridge.conflist
    sudo cp ci/cached-builds/crio.conf /etc/crio/crio.conf.d/
    sudo systemctl start crio.service
  env:
    # Select the pkgs.k8s.io repository streams used in the URLs above
    CRIO_VERSION: v1.30
    KUBERNETES_VERSION: v1.30

# Diagnostic step: only runs after an earlier step has failed
- name: "Show crio debug data (on failure)"
  if: ${{ failure() && steps.have-tests.outputs.tests == 'true' }}
  run: |
    set -Eeuxo pipefail
    # tolerate a non-zero exit from `status` so the journal is still printed
    sudo systemctl status crio.service || true
    sudo journalctl -xeu crio.service
# Done early on purpose: listing images is a good check that cri-o is not
# completely broken
- name: Show crio images information
  if: ${{ steps.have-tests.outputs.tests == 'true' }}
  run: sudo crictl images

# Prepares the host networking, bootstraps a single-node cluster with kubeadm
# and makes the admin kubeconfig available to the current (non-root) user.
- name: Install Kubernetes cluster
  if: ${{ steps.have-tests.outputs.tests == 'true' }}
  run: |
    set -Eeuxo pipefail
    sudo swapoff -a
    sudo modprobe br_netfilter
    sudo sysctl -w net.ipv4.ip_forward=1

    # Was getting strange DNS resolution errors from pods that don't seem to want to go away sometimes:
    #   Resolving raw.githubusercontent.com (raw.githubusercontent.com)... failed: Name or service not known.
    #   wget: unable to resolve host address ‘raw.githubusercontent.com’
    # Here's what helped:
    #   https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/#known-issues
    #   https://github.com/kubernetes/kubernetes/blob/e4c1f980b76fecece30c2f77885a7117192170a6/CHANGELOG/CHANGELOG-1.30.md?plain=1#L1454
    #   https://github.com/canonical/microk8s/issues/68#issuecomment-404923563
    sudo ufw allow in on cni0
    sudo ufw allow out on cni0
    sudo ufw default allow routed
    sudo iptables -P FORWARD ACCEPT
    # 10.85.0.0/16 is the pod subnet configured in the conflist and in kubeadm.yaml
    sudo iptables -t nat -A POSTROUTING -s 10.85.0.0/16 -o eth0 -j MASQUERADE

    # https://kubernetes.io/docs/setup/production-environment/tools/kubeadm/create-cluster-kubeadm
    sudo kubeadm init --config=ci/cached-builds/kubeadm.yaml

    # Non-interactive copy: there is nobody to answer a `cp -i` prompt in CI,
    # and a stale kubeconfig must not survive a fresh `kubeadm init`
    mkdir -p "$HOME/.kube"
    sudo cp /etc/kubernetes/admin.conf "$HOME/.kube/config"
    sudo chown "$(id -u):$(id -g)" "$HOME/.kube/config"
# Diagnostic step: only runs after an earlier step has failed
- name: "Show kubelet debug data (on failure)"
  if: ${{ failure() && steps.have-tests.outputs.tests == 'true' }}
  run: |
    set -Eeuxo pipefail
    # tolerate a non-zero exit from `status` so the journal is still printed
    sudo systemctl status kubelet || true
    sudo journalctl -xeu kubelet
    # List all Kubernetes containers known to cri-o, leaving out the pause containers
    sudo crictl --runtime-endpoint unix:///var/run/crio/crio.sock ps -a | grep kube | grep -v pause
    # Once you have found the failing container, you can inspect its logs with:
    # crictl --runtime-endpoint unix:///var/run/crio/crio.sock logs CONTAINERID
# The trailing `-` removes the control-plane taint from every node
- name: "Untaint the master"
  if: ${{ steps.have-tests.outputs.tests == 'true' }}
  run: kubectl taint nodes --all node-role.kubernetes.io/control-plane-

- name: "Show nodes status and wait for readiness"
  if: ${{ steps.have-tests.outputs.tests == 'true' }}
  run: |
    kubectl describe nodes
    # if the wait fails, describe the nodes once more and still fail the step
    kubectl wait --for=condition=Ready nodes --all --timeout=100s || (kubectl describe nodes && false)
# Blocks until every deployment is Available and every pod is Ready, in all
# namespaces, giving each wait up to 100 seconds
- name: "Wait for pods to be running"
  if: ${{ steps.have-tests.outputs.tests == 'true' }}
  run: |
    set -Eeuxo pipefail
    kubectl wait deployments --all --all-namespaces --for=condition=Available --timeout=100s
    kubectl wait pods --all --all-namespaces --for=condition=Ready --timeout=100s
- name: Run Trivy vulnerability scanner
if: ${{ steps.resolve-target.outputs.target }}
run: |
Expand Down
22 changes: 22 additions & 0 deletions ci/cached-builds/11-crio-ipv4-bridge.conflist
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"cniVersion": "1.0.0",
"name": "crio",
"plugins": [
{
"type": "bridge",
"bridge": "cni0",
"isGateway": true,
"ipMasq": true,
"hairpinMode": true,
"ipam": {
"type": "host-local",
"routes": [
{ "dst": "0.0.0.0/0" }
],
"ranges": [
[{ "subnet": "10.85.0.0/16" }]
]
}
}
]
}
17 changes: 17 additions & 0 deletions ci/cached-builds/crio.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# https://github.com/cri-o/cri-o/blob/main/docs/crio.conf.5.md

[crio]
storage_driver = "overlay"
# storage_option = [ "overlay.mountopt=nodev,metacopy=on" ]

# reuse podman's container storage because we have huge images that don't fit on disk twice
root = "/home/runner/.local/share/containers/storage"
# has to be the same as root!
runroot = "/home/runner/.local/share/containers/storage"

# https://stackoverflow.com/questions/62408028/kubelet-failed-to-createpodsandbox-for-coredns-failed-to-set-bridge-addr-c
[crio.network]
# the default IPs in /etc/cni/net.d/11-crio-ipv4-bridge.conflist conflict with flannel;
# in older versions of Kubernetes the kubelet also touched the CNI configuration, now only the container runtime does
# c.f. https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/network-plugins/#installation
#network_dir = "/etc/cni/net.d-kube/"
58 changes: 58 additions & 0 deletions ci/cached-builds/has_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/usr/bin/env python3
import argparse
import json
import os
import pathlib
import typing
import unittest

import gha_pr_changed_files

"""Determines whether we have deploy Makefile tests for this target or not
https://github.com/openshift/release/blob/master/ci-operator/config/opendatahub-io/notebooks/opendatahub-io-notebooks-main.yaml#L1485
"""


class Args(argparse.Namespace):
    """Typed view of the parsed command line, so editors can autocomplete ``args``."""

    # value given to the --target option
    target: str


def main() -> None:
    """Command-line entry point.

    Parses ``--target``, asks ``check_tests`` whether that target has tests and
    prints the result. When the ``GITHUB_ACTIONS`` environment variable is set,
    the result is also appended as ``tests=true`` / ``tests=false`` to the file
    named by ``GITHUB_OUTPUT``.
    """
    # the program name shown in usage/error messages must be this script's name
    parser = argparse.ArgumentParser("has_tests.py")
    # required: without a target there is nothing to check, and argparse gives
    # a clear usage error instead of a later AttributeError on None
    parser.add_argument("--target", type=str, required=True,
                        help="Makefile target to check for tests")
    args = typing.cast(Args, parser.parse_args())

    has_tests = check_tests(args.target)

    if "GITHUB_ACTIONS" in os.environ:
        # append mode: other output entries already in the file are kept
        with open(os.environ["GITHUB_OUTPUT"], "at") as f:
            # json.dumps renders the bool as lowercase true/false
            print(f"tests={json.dumps(has_tests)}", file=f)

    print(f"{has_tests=}")


def check_tests(target: str) -> bool:
    """Report whether deploy Makefile tests exist for ``target``.

    A few target families are excluded up front. For everything else the answer
    is whether the last build directory reported by
    ``gha_pr_changed_files.analyze_build_directories`` contains
    ``kustomize/base/kustomization.yaml``.

    :param target: name of the target, e.g. ``jupyter-minimal-ubi9-python-3.9``
    :return: ``True`` when the kustomization file exists; ``False`` otherwise,
        including when no build directories are reported
    """
    # we don't have specific tests for -minimal-, ... in ci-operator/config
    if target.startswith(("rocm-jupyter-minimal-", "rocm-jupyter-datascience-")):
        return False
    # RHOAIENG-8388: Intel tensorflow notebook failed to get tested on OCP-CI
    if "-intel-" in target:
        return False

    build_dirs = gha_pr_changed_files.analyze_build_directories(target)
    # TODO: check only the last directory (the top level layer) for now
    for top_layer in reversed(build_dirs):  # (!) reversed, so the last directory comes first
        manifest = pathlib.Path(gha_pr_changed_files.PROJECT_ROOT) / top_layer / "kustomize/base/kustomization.yaml"
        return manifest.is_file()
    return False


class TestCheckTests(unittest.TestCase):
    """Exercises ``check_tests`` with concrete target names."""

    def test_has_tests(self):
        # assertIs keeps the strict `is True` / `is False` comparison, but is
        # not stripped under `python -O` and reports the actual value on failure
        self.assertIs(check_tests("base-c9s-python-3.11"), False)
        self.assertIs(check_tests("jupyter-minimal-ubi9-python-3.9"), True)


# Run the command-line entry point only when executed as a script, not on import
if __name__ == "__main__":
    main()
45 changes: 45 additions & 0 deletions ci/cached-builds/kubeadm.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
---
# Node-level settings for `kubeadm init`; passed via --config by the CI workflow.
# kubeadm config print init-defaults > kubeadm.yaml
# kubeadm init --cri-socket=/var/run/crio/crio.sock

# https://kubernetes.io/docs/reference/config-api/kubeadm-config.v1beta3/
# https://kubernetes.io/docs/reference/config-api/kubeadm-config.v1beta4/
apiVersion: kubeadm.k8s.io/v1beta3
bootstrapTokens:
  - groups:
      - system:bootstrappers:kubeadm:default-node-token
    # NOTE(review): presumably the placeholder token from `init-defaults`;
    # confirm this file is only ever used for throwaway CI clusters
    token: abcdef.0123456789abcdef
    ttl: 24h0m0s
    usages:
      - signing
      - authentication
kind: InitConfiguration
localAPIEndpoint:
  bindPort: 6443
nodeRegistration:
  kubeletExtraArgs:
    # Need to have enough disk space for Kubelet, so move root-dir on the LVM volume
    # Note: the internets discourage from changing the default because storage plugins may then struggle
    # https://cep.dev/posts/adventure-trying-change-kubelet-rootdir/
    root-dir: "/home/runner/.local/share/containers/kubelet-root-dir"
  # talk to cri-o rather than any other container runtime
  criSocket: unix:///var/run/crio/crio.sock
  imagePullPolicy: IfNotPresent
  # NOTE(review): presumably leaves kubeadm's default control-plane taint in
  # place; the workflow removes that taint in a later step - confirm
  taints: null
---
# Cluster-wide settings for `kubeadm init`
apiServer:
  timeoutForControlPlane: 4m0s
apiVersion: kubeadm.k8s.io/v1beta3
certificatesDir: /etc/kubernetes/pki
clusterName: kubernetes
controllerManager: {}
dns: {}
etcd:
  local:
    dataDir: /var/lib/etcd
imageRepository: registry.k8s.io
kind: ClusterConfiguration
networking:
  dnsDomain: cluster.local
  # this matches the default in /etc/cni/net.d/11-crio-ipv4-bridge.conflist
  # (and the subnet in ci/cached-builds/11-crio-ipv4-bridge.conflist)
  podSubnet: 10.85.0.0/16
scheduler: {}

0 comments on commit 3c2fb93

Please sign in to comment.