Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add workaround and update for intel-bootc #780

Merged
merged 1 commit into from
Sep 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 33 additions & 7 deletions training/intel-bootc/Containerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ARG DRIVER_TOOLKIT_IMAGE="quay.io/ai-lab/nvidia-builder:latest"
ARG DRIVER_TOOLKIT_IMAGE="quay.io/ai-lab/intel-builder:latest"
ARG BASEIMAGE="quay.io/centos-bootc/centos-bootc:stream9"

FROM ${DRIVER_TOOLKIT_IMAGE} as builder
Expand Down Expand Up @@ -40,6 +40,10 @@ COPY --from=builder /home/builder/usr/src/habanalabs-${DRIVER_VERSION}/drivers/i
COPY --from=builder /home/builder/usr/src/habanalabs-${DRIVER_VERSION}/drivers/net/ethernet/intel/hbl_cn/habanalabs_cn.ko /tmp/extra/habanalabs_cn.ko
COPY --from=builder /home/builder/usr/src/habanalabs-${DRIVER_VERSION}/drivers/net/ethernet/intel/hbl_en/habanalabs_en.ko /tmp/extra/habanalabs_en.ko
COPY --from=builder /home/builder/lib/firmware/habanalabs /tmp/firmware/habanalabs
COPY --from=builder /home/builder/usr/bin/hl-smi /usr/bin/hl-smi


COPY duplicated/common/usr /usr

RUN . /etc/os-release \
&& export OS_VERSION_MAJOR=$(echo ${VERSION} | cut -d'.' -f 1) \
Expand All @@ -49,16 +53,15 @@ RUN . /etc/os-release \
&& mv /tmp/firmware/habanalabs /lib/firmware \
&& depmod -a ${KERNEL_VERSION}.${TARGET_ARCH}

RUN dnf install -y ${EXTRA_RPM_PACKAGES} \
RUN mv /etc/selinux /etc/selinux.tmp \
dnf install -y ${EXTRA_RPM_PACKAGES} \
cloud-init \
skopeo \
rsync \
&& dnf clean all \
&& mv /etc/selinux.tmp /etc/selinux \
&& ln -s ../cloud-init.target /usr/lib/systemd/system/default.target.wants

ARG INSTRUCTLAB_IMAGE="quay.io/ai-lab/instructlab-intel:latest"

ARG SSHPUBKEY

# The --build-arg "SSHPUBKEY=$(cat ~/.ssh/id_rsa.pub)" option inserts your
# public key into the image, allowing root access via ssh.
Expand All @@ -68,10 +71,33 @@ RUN if [ -n "${SSHPUBKEY}" ]; then \
echo ${SSHPUBKEY} > /usr/ssh/root.keys && chmod 0600 /usr/ssh/root.keys; \
fi

# Prepull the instructlab image
RUN if [ -f "/run/.input/instructlab-intel/oci-layout" ]; then \
# Setup /usr/lib/containers/storage as an additional store for images.
# Remove once the base images have this set by default.
# Also make sure not to duplicate if a base image already has it specified.
RUN grep -q /usr/lib/containers/storage /etc/containers/storage.conf || \
sed -i -e '/additionalimage.*/a "/usr/lib/containers/storage",' \
/etc/containers/storage.conf

COPY duplicated/ilab-wrapper/ilab /usr/bin/ilab
RUN chmod +x /usr/bin/ilab

ARG INSTRUCTLAB_IMAGE="quay.io/ai-lab/instructlab-intel:latest"
ARG INSTRUCTLAB_IMAGE_PULL_SECRET="instructlab-intel-pull"


# Added for running as an OCI Container to prevent Overlay on Overlay issues.
VOLUME /var/lib/containers

RUN --mount=type=secret,id=${INSTRUCTLAB_IMAGE_PULL_SECRET}/.dockerconfigjson \
if [ -f "/run/.input/instructlab-intel/oci-layout" ]; then \
IID=$(podman --root /usr/lib/containers/storage pull oci:/run/.input/instructlab-intel) && \
podman --root /usr/lib/containers/storage image tag ${IID} ${INSTRUCTLAB_IMAGE}; \
elif [ -f "/run/secrets/${INSTRUCTLAB_IMAGE_PULL_SECRET}/.dockerconfigjson" ]; then \
IID=$(sudo podman --root /usr/lib/containers/storage pull --authfile /run/secrets/${INSTRUCTLAB_IMAGE_PULL_SECRET}/.dockerconfigjson ${INSTRUCTLAB_IMAGE}); \
else \
IID=$(sudo podman --root /usr/lib/containers/storage pull ${INSTRUCTLAB_IMAGE}); \
fi
RUN podman system reset --force 2>/dev/null

LABEL image_version_id="${IMAGE_VERSION_ID}"

Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[Unit]
Description=Check for available RHEL AI upgrade
ConditionPathExists=/run/ostree-booted
After=network-online.target
StartLimitIntervalSec=400
StartLimitBurst=3

[Service]
Type=oneshot
ExecStart=/usr/libexec/upgrade-informer
Restart=on-failure
RestartSec=90
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[Unit]
Description=Runs upgrade informer periodically
ConditionPathExists=/run/ostree-booted

[Timer]
OnBootSec=1h
OnUnitInactiveSec=1day
RandomizedDelaySec=2h

[Install]
WantedBy=timers.target
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/bin/bash

# Run the command and capture its output
output=$(bootc upgrade --check | sed -e 1q)
message_file="/etc/motd.d/upgrade-message"
bootc_auth="/etc/ostree/auth.json"

if [[ $output == Update\ available* ]]; then
if [[ ! -f $message_file ]]; then
echo "New version was found"
bootc_image=$(awk '{print $4}' <<< "$output")
# If auth file exists we should use it
auth_params=""
if [[ -f $bootc_auth ]]; then
auth_params="--authfile $bootc_auth"
fi

# Get image version
# shellcheck disable=SC2086
image_version_id=$(skopeo inspect --format json $auth_params "$bootc_image" | jq -r '.Labels | .["image_version_id"] // empty')

# If upgrade available, write the output to the file
cat > $message_file << EOF
** Attention! **
** A new $image_version_id version is available **
** In order to apply it run: bootc upgrade --apply
** Please note that the system will reboot after the upgrade **
EOF
fi
else
echo "No upgrade was found"
rm $message_file 2> /dev/null
fi

echo "Finished running upgrade informer"
145 changes: 145 additions & 0 deletions training/intel-bootc/duplicated/ilab-wrapper/ilab
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#!/bin/bash

echo-err() { echo "$@" >&2; }

verify_range() {
subuid_range="$1"
username="$2"
NUMBER_OF_MATCHING_SUBUID_RANGES=$(if [[ -z "$subuid_range" ]]; then echo 0; else wc -l <<<"$subuid_range"; fi)

if [[ "$NUMBER_OF_MATCHING_SUBUID_RANGES" == 0 ]]; then
echo-err "No /etc/subuid range found for user $username ($UID)"
exit 1
elif [[ "$NUMBER_OF_MATCHING_SUBUID_RANGES" != 1 ]]; then
# TODO: Handle multiple subuid ranges. But for now, hard fail
echo-err "Multiple /etc/subuid ranges found for user $username ($UID), this is currently unsupported:"
echo-err "$subuid_range"
exit 1
fi
}

check_insights() {
if [[ -f /etc/insights-client/machine-id ]]; then
return
fi
if [[ -f /etc/ilab/insights-opt-out ]]; then
return
fi
local ID
eval "$(grep ^ID= /etc/os-release)"
if [[ "$ID" != "rhel" ]]; then
return
fi
cat << EOF
This host is not connected to Red Hat Insights.

To connect this host to Red Hat Insights run the following command:
sudo rhc connect --organization <org_id> --activation-key <your_activation_key>

To generate an Activation Key:
https://console.redhat.com/insights/connector/activation-keys (this page will also display your Organization ID).

For more information on Red Hat Insights, please visit:
https://docs.redhat.com/en/documentation/subscription_central/1-latest/html/getting_started_with_activation_keys_on_the_hybrid_cloud_console/assembly-creating-managing-activation-keys
EOF
exit 1
}

check_insights

# Template values replaced by container build
CONTAINER_DEVICE="__REPLACE_CONTAINER_DEVICE__"
IMAGE_NAME="__REPLACE_IMAGE_NAME__"

ENTRYPOINT="ilab"
PARAMS=("$@")

if [[ -n "$ILAB_HOME" ]]; then
HOME="$ILAB_HOME"
fi

for dir in "$HOME/.cache" "$HOME/.config" "$HOME/.local"; do
mkdir -p "$dir"
done

if [[ "$1" = "shell" ]]; then
ENTRYPOINT=bash
PARAMS=()
fi

# If you need to mount additional volumes into the container, you can specify them
# using the ILAB_ADDITIONAL_MOUNTS environment variable.
#
# Example ILAB_ADDITIONAL_MOUNTS usage:
#
# ILAB_ADDITIONAL_MOUNTS="/host/path:/container/path /host/path2:/container/path2"
#
# If your path contains spaces, you can use quotes:
#
# ILAB_ADDITIONAL_MOUNTS="/host/path:/container/path '/host/path with spaces':/container/path"
ADDITIONAL_MOUNTS=()
if [ -n "${ILAB_ADDITIONAL_MOUNTS}" ]; then
# (eval is used here to allow the user to specify mounts that might have spaces in them)
eval "ADDITIONAL_MOUNTS=(${ILAB_ADDITIONAL_MOUNTS})"
fi
ADDITIONAL_MOUNT_OPTIONS=()
for PODMAN_MOUNT in "${ADDITIONAL_MOUNTS[@]}"; do
ADDITIONAL_MOUNT_OPTIONS+=("-v" "$PODMAN_MOUNT")
done

# Add pull-secret to additional mounts
# In case of normal user, /run/user is used (XDG_RUNTIME_DIR), if root, it will be /run/containers
for authfile in \
"${XDG_RUNTIME_DIR}/containers/auth.json" \
/run/user/${UID}/containers/auth.json \
/run/containers/${UID}/auth.json
do
if [[ -f "$authfile" ]]; then
ADDITIONAL_MOUNT_OPTIONS+=("-v" "$authfile:/run/containers/0/auth.json")
break
fi
done

# We run the container as sudo in order to be able to access the root container
# storage, which has the ilab image pre-pulled. But for security reasons we map
# root UID 0 inside the container to the current user's UID (and all the other
# subuids to the user's /etc/subuid range) so that we're effectively running
# the container as the current user.
#
# In the future, we will run podman as the current user, once we figure a
# reasonable way for the current user to access the root's user container
# storage.
if [[ "$UID" == 0 ]]; then
# If we're already running as root, we don't need to map any UIDs
IMPERSONATE_CURRENT_USER_PODMAN_FLAGS=()
else
CURRENT_USER_NAME=$(id --user --name)
CURRENT_USER_SUBUID_RANGE=$(awk \
--field-separator ':' \
--assign current_user="$CURRENT_USER_NAME" \
--assign current_uid="$UID" \
'$1 == current_user || $1 == current_uid {print $2 ":" $3}' \
/etc/subuid)

verify_range "$CURRENT_USER_SUBUID_RANGE" "$CURRENT_USER_NAME"

IMPERSONATE_CURRENT_USER_PODMAN_FLAGS=("--uidmap" "0:$UID" "--uidmap" "1:$CURRENT_USER_SUBUID_RANGE")
fi

PRESERVE_ENV="VLLM_LOGGING_LEVEL,NCCL_DEBUG,HOME,HF_TOKEN"
PODMAN_COMMAND=("sudo" "--preserve-env=$PRESERVE_ENV" "podman" "run" "--rm" "-it"
"${IMPERSONATE_CURRENT_USER_PODMAN_FLAGS[@]}"
"--device" "${CONTAINER_DEVICE}"
"--security-opt" "label=disable" "--net" "host"
"--shm-size" "10G"
"--pids-limit" "-1"
"-v" "$HOME:$HOME"
"${ADDITIONAL_MOUNT_OPTIONS[@]}"
"--env" "VLLM_LOGGING_LEVEL"
"--env" "HOME"
"--env" "NCCL_DEBUG"
"--entrypoint" "$ENTRYPOINT"
"--env" "HF_TOKEN"
"${IMAGE_NAME}")

exec "${PODMAN_COMMAND[@]}" "${PARAMS[@]}"