Commit 34f74a7

Authored Sep 19, 2024

Intel Gaudi: update to 1.17.1 with Python 3.11 (instructlab#2211)

Update to the latest Intel Gaudi software, 1.17.1. The new release comes with PyTorch 2.3.1a0, Python 3.11, and official RHEL 9.4 support.

**Checklist:**

- [x] **Commit Message Formatting**: Commit titles and messages follow the guidelines in the [conventional commits](https://www.conventionalcommits.org/en/v1.0.0/#summary) specification.
- [x] [Changelog](https://github.com/instructlab/instructlab/blob/main/CHANGELOG.md) updated with breaking and/or notable changes for the next minor release.
- [x] Documentation has been updated, if necessary.
- [ ] Unit tests have been added, if necessary.
- [ ] Integration tests have been added, if necessary.

Approved-by: cdoern
Approved-by: jaideepr97
Approved-by: leseb
2 parents e189f4e + c0b49ca commit 34f74a7

File tree

8 files changed: +58 -72 lines
 

CHANGELOG.md (+2)

```diff
@@ -14,6 +14,8 @@
 * `list`
 * `sysinfo`
 * `test`
+* Intel Gaudi software has been updated to 1.17.1 with Python 3.11 and
+  Torch 2.3.1 support.
 
 ## v0.18.1
```

Makefile (+1 -1)

```diff
@@ -28,7 +28,7 @@ HPU_CONTAINERFILE = $(CURDIR)/containers/hpu/Containerfile
 HPU_CONTEXT_DIR = $(CURDIR)/containers/hpu
 HPU_DEPS = \
     $(HPU_CONTAINERFILE) \
-    $(CURDIR)/requirements-hpu.txt \
+    $(CURDIR)/requirements/hpu.txt \
     $(COMMON_DEPS) \
     $(NULL)
```

containers/hpu/Containerfile (+23 -24)

```diff
@@ -1,40 +1,43 @@
-ARG HABANA_VERSION=1.16.2
-ARG BASEIMAGE=vault.habana.ai/gaudi-docker/${HABANA_VERSION}/rhel9.2/habanalabs/pytorch-installer-2.2.2
+ARG HABANA_VERSION=1.17.1
+ARG BASEIMAGE=vault.habana.ai/gaudi-docker/${HABANA_VERSION}/rhel9.4/habanalabs/pytorch-installer-2.3.1
 
 FROM ${BASEIMAGE} AS runtime
 # base image has PyTorch fork with Habana plugins in self-compiled Python 3.10
-ARG PYTHON=python3.10
+ARG PYTHON=python3.11
 
 ENV PYTHON="${PYTHON}" \
     APP_ROOT="/opt/app-root"
 ENV PIP_DISABLE_PIP_VERSION_CHECK=1 \
     PIP_NO_COMPILE=1 \
+    PIP_NO_CACHE_DIR=off \
     PS1="(app-root) \w\$ " \
     VIRTUAL_ENV="${APP_ROOT}" \
     PATH="${APP_ROOT}/bin:${PATH}"
 
+# Gaudi container has Torch and habanalabs plugins in system Python.
+# Use system site packages and replace shebang, so scripts like torchrun
+# pick up the virtual env.
 RUN ${PYTHON} -m venv --upgrade-deps --system-site-packages ${VIRTUAL_ENV} && \
+    sed -i '1s:#!/usr/bin/python.*:#!/usr/bin/env python3:' /usr/local/bin/* && \
     mkdir ${VIRTUAL_ENV}/src && \
     find ${VIRTUAL_ENV} -name __pycache__ | xargs rm -rf
 
 COPY containers/sitecustomize.py ${VIRTUAL_ENV}/lib/${PYTHON}/site-packages/
 COPY containers/bin/debug-* ${VIRTUAL_ENV}/bin/
 
-# -mno-avx: work around a build problem with llama-cpp-python and gcc.
-# flash-attn is compiled from source, bitsandbytes has a manylinux wheel
-COPY requirements.txt requirements-hpu.txt /tmp
-RUN sed 's/\[.*\]//' /tmp/requirements.txt >/tmp/constraints.txt && \
-    export PIP_NO_CACHE_DIR=off; \
-    ${VIRTUAL_ENV}/bin/pip install -U wheel pip && \
-    CMAKE_ARGS="-DLLAMA_NATIVE=off" \
-    FORCE_CMAKE=1 \
-    ${VIRTUAL_ENV}/bin/pip install --no-binary llama_cpp_python -c /tmp/constraints.txt llama_cpp_python && \
-    ${VIRTUAL_ENV}/bin/pip install -r /tmp/requirements.txt -r /tmp/requirements-hpu.txt && \
-    rm /tmp/constraints.txt && \
+COPY . /tmp/instructlab
+RUN CMAKE_ARGS="-DLLAMA_NATIVE=off" \
+    ${VIRTUAL_ENV}/bin/pip install "/tmp/instructlab[hpu]" && \
     find ${VIRTUAL_ENV} -name __pycache__ | xargs rm -rf
 
-COPY . /tmp/instructlab
-RUN ${VIRTUAL_ENV}/bin/pip install "/tmp/instructlab[hpu]" && \
+# install Intel Gaudi fork of DeepSpeed
+RUN ${VIRTUAL_ENV}/bin/pip uninstall -y deepspeed && \
+    ${VIRTUAL_ENV}/bin/pip install --no-build-isolation git+https://github.com/HabanaAI/DeepSpeed.git@1.17.1 && \
+    find ${VIRTUAL_ENV} -name __pycache__ | xargs rm -rf
+
+# install Intel Gaudi fork of vLLM
+RUN VLLM_TARGET_DEVICE=hpu \
+    ${VIRTUAL_ENV}/bin/pip install --no-build-isolation git+https://github.com/HabanaAI/vllm-fork.git@v0.5.3.post1-Gaudi-1.17.0 && \
     pip list && \
     find ${VIRTUAL_ENV} -name __pycache__ | xargs rm -rf
 
@@ -43,14 +46,10 @@ WORKDIR "${HOME}"
 VOLUME ["/opt/app-root/src"]
 CMD ["/bin/bash"]
 
-
-# default values, override with `-e HABANA_VISIBLE_MODULES="0,1"`
-ENV TSAN_OPTIONS='ignore_noninstrumented_modules=1' \
-    HABANA_VISIBLE_MODULES="all" \
-    PT_HPU_LAZY_MODE=0 \
-    PT_HPU_ENABLE_EAGER_CACHE=TRUE \
-    PT_HPU_EAGER_4_STAGE_PIPELINE_ENABLE=TRUE \
-    PT_ENABLE_INT64_SUPPORT=1
+# https://docs.habana.ai/en/latest/PyTorch/Reference/Runtime_Flags.html
+# use eager mode / torch.compile()
+ENV PT_HPU_LAZY_MODE=0 \
+    PT_HPU_ENABLE_EAGER_CACHE=TRUE
 # workaround for race condition in libgomp / oneMKL (HS-1795)
 ENV OMP_NUM_THREADS=8
```
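Since the venv is created with `--system-site-packages`, the Habana-patched PyTorch from the base image's system Python stays importable under `/opt/app-root`. A minimal smoke test, assuming the image built as above (file name and expected output are illustrative):

```python
# smoke_test.py -- hypothetical check, run inside the container as:
#   /opt/app-root/bin/python smoke_test.py
import sys

import torch  # comes from the Gaudi base image via --system-site-packages

print(sys.prefix)         # expected: /opt/app-root (the virtual env)
print(torch.__version__)  # expected: a 2.3.1a0+git... pre-release build
```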

docs/habana-gaudi.md (+5 -13)

`````diff
@@ -6,9 +6,9 @@
 
 ## System requirements
 
-- RHEL 9 on `x86_64` (tested with RHEL 9.3 and patched installer)
+- RHEL 9 on `x86_64` (tested with RHEL 9.4)
 - Intel Gaudi 2 device
-- [Habana Labs](https://docs.habana.ai/en/latest/index.html) software stack (tested with 1.16.2)
+- [Habana Labs](https://docs.habana.ai/en/latest/index.html) software stack (tested with 1.17.1)
 - software from Habana Vault for [RHEL](https://vault.habana.ai/ui/native/rhel) and [PyTorch](https://vault.habana.ai/ui/native/gaudi-pt-modules)
 - software [HabanaAI GitHub](https://github.com/HabanaAI/) org like [optimum-habana](https://github.com/HabanaAI/optimum-habana-fork) fork
 
@@ -28,7 +28,7 @@ sudo dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.
 ```ini
 [vault]
 name=Habana Vault
-baseurl=https://vault.habana.ai/artifactory/rhel/9/9.2
+baseurl=https://vault.habana.ai/artifactory/rhel/9/9.4
 enabled=1
 repo_gpgcheck=0
 ```
@@ -86,7 +86,7 @@ hl-smi
 +=============================================================================+
 ````
 
-See [Intel Gaudi SW Stack for RHEL 9.2](https://docs.habana.ai/en/latest/shared/Install_Driver_and_Firmware.html)
+See [Intel Gaudi SW Stack for RHEL 9.4](https://docs.habana.ai/en/latest/shared/Install_Driver_and_Firmware.html)
 for detailed documentation.
 
 ## Other tools
@@ -106,15 +106,7 @@ curl -O https://vault.habana.ai/artifactory/gaudi-installer/1.15.1/habanalabs-in
 chmod +x habanalabs-installer.sh
 ```
 
-> **NOTE**
->
-> Habana Labs Installer 1.15.1 only supports RHEL 9.2 and will fail on 9.3+. You can hack around the limitation by patching the installer:
->
-> ```shell
-> sed -i 's/OS_VERSION=\$VERSION_ID/OS_VERSION=9.2/' habanalabs-installer.sh
-> ```
-
-Install dependencies (use `--verbose` for verbose logging). This will install several RPM packages, download Intel compilers + libraries, download + compile Python 3.10, and more.
+Install dependencies (use `--verbose` for verbose logging). This will install several RPM packages, download Intel compilers + libraries, and more.
 
 ```shell
 export MAKEFLAGS="-j$(nproc)"
`````

requirements.txt (+5 -11)

```diff
@@ -13,11 +13,7 @@ instructlab-sdg>=0.3.0
 instructlab-training>=0.4.1
 llama_cpp_python[server]==0.2.79
 mlx>=0.5.1,<0.6.0; sys_platform == 'darwin' and platform_machine == 'arm64'
-# HabanaLabs / Intel Gaudi env comes with Python 3.10 and slightly older
-# versions of some dependencies. Use '3.10' as an indicator.
-# Habana installer has NumPy 1.23.5
-numpy>=1.23.5,<2.0.0 ; python_version == '3.10'
-numpy>=1.26.4,<2.0.0 ; python_version >= '3.11'
+numpy>=1.26.4,<2.0.0
 openai>=1.13.3
 peft>=0.9.0
 prompt-toolkit>=3.0.38
@@ -31,13 +27,11 @@ sentencepiece>=0.2.0
 # "old" version required for vLLM on CUDA to build
 tokenizers>=0.11.1
 toml>=0.10.2
-# Habana Labs 1.16.2 has PyTorch 2.2.2a0+gitxxx pre-release
-torch>=2.2.2a0,<2.4.0 ; python_version == '3.10'
-torch>=2.3.0,<2.4.0 ; python_version >= '3.11'
+# Habana Labs 1.17.1 has PyTorch 2.3.1a0+gitxxx pre-release
+torch>=2.3.0,<2.4.0
 tqdm>=4.66.2
-# 'optimum' for Intel Gaudi needs transformers <4.41.0,>=4.40.0
-transformers>=4.40.0 ; python_version == '3.10'
-transformers>=4.41.2 ; python_version >= '3.11'
+# 'optimum' for Intel Gaudi needs transformers <4.44.0,>=4.43.0
+transformers>=4.41.2
 trl>=0.9.4
 wandb>=0.16.4
 xdg-base-dirs>=6.0.1
```
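The dropped duplicate pins relied on PEP 508 environment markers (`python_version == '3.10'`), which pip evaluates against the running interpreter; with Gaudi now on Python 3.11, one unmarked pin per package suffices. A minimal sketch of marker evaluation using the `packaging` library (the same library the tests below use):

```python
from packaging.markers import Marker

# pip evaluates markers like this one against the current interpreter,
# which is how a single requirements.txt used to vary pins per Python version.
marker = Marker("python_version == '3.10'")
print(marker.evaluate())  # True only under a Python 3.10 interpreter
```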

requirements/hpu.txt (+5 -7)

```diff
@@ -1,15 +1,13 @@
 # Dependencies for Intel Gaudi / Habana Labs HPU devices
 #
 
-optimum>=1.20.1
-optimum-habana>=1.12.0
-# Habana Labs 1.16.2 has NumPy 1.23.5
-numpy>=1.23.5,<2.0.0
+optimum>=1.21.0
+optimum-habana>=1.13.1
 # Habana Labs 1.16.2 has PyTorch 2.2.2a0+gitxxx pre-release
-torch>=2.2.2a0,<2.4.0
+torch>=2.3.1a0,<2.4.0
 # Habana Labs frameworks
-habana-torch-plugin>=1.16.2
-habana_gpu_migration>=1.16.2
+habana-torch-plugin>=1.17.1
+habana_gpu_migration>=1.17.1
 # additional Habana Labs packages (installed, but not used)
 #habana-media-loader
 #habana-pyhlml
```
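The `>=2.3.1a0` lower bound is deliberate: a pre-release version inside a specifier opts that specifier into matching pre-releases, so the Gaudi PyTorch pre-release build satisfies the pin. A quick illustration with `packaging`:

```python
from packaging.specifiers import SpecifierSet

# A pre-release lower bound enables pre-release matching for the whole set,
# so Gaudi's torch 2.3.1a0 build is accepted.
print(SpecifierSet(">=2.3.1a0,<2.4.0").contains("2.3.1a0"))  # True
# Without a pre-release in the specifiers, pre-releases are excluded by default.
print(SpecifierSet(">=2.3.0,<2.4.0").contains("2.3.1a0"))    # False
```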

src/instructlab/train/linux_train.py (+4 -9)

```diff
@@ -2,9 +2,9 @@
 
 # Standard
 from pathlib import Path
-from typing import Optional
 import logging
 import os
+import typing
 
 # Third Party
 # https://huggingface.co/docs/datasets/en/package_reference/loading_methods#datasets.load_dataset
@@ -53,9 +53,6 @@
 # Habana implementations of SFT Trainer
 # https://huggingface.co/docs/optimum/habana/index
 from optimum.habana import GaudiConfig, GaudiTrainingArguments
-from optimum.habana.transformers.generation.configuration_utils import (
-    GaudiGenerationConfig,
-)
 from optimum.habana.trl import GaudiSFTTrainer
 
 # silence warning: "Call mark_step function will not have any effect"
@@ -162,7 +159,7 @@ def linux_train(
     train_file: Path,
     test_file: Path,
     model_name: str,
-    num_epochs: Optional[int] = None,
+    num_epochs: int | None,
     train_device: str = "cpu",
     four_bit_quant: bool = False,
     output_dir: Path = Path("training_results"),
@@ -346,6 +343,7 @@ def model_generate(user, **kwargs):
     tokenizer.padding_side = "right"
     per_device_train_batch_size = 1
     max_seq_length = 300
+    generate_kwargs: dict[str, typing.Any]
 
     if device.type == "hpu":
         # Intel Gaudi trainer
@@ -384,10 +382,7 @@ def model_generate(user, **kwargs):
             args=training_arguments,
             gaudi_config=gaudi_config,
         )
-        generate_kwargs = {
-            # TODO: check generation config parameters?
-            "generation_config": GaudiGenerationConfig(),
-        }
+        generate_kwargs = {}
     else:
         training_arguments = TrainingArguments(
             output_dir=output_dir,
```
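The bare `generate_kwargs: dict[str, typing.Any]` annotation declares the variable's type before the `if device.type == "hpu"` split, so each branch can assign a plain dict without the type checker narrowing to the first branch's inferred type. A hypothetical reduction of the pattern (function name and values invented for illustration):

```python
import typing


def pick_generate_kwargs(is_hpu: bool) -> dict[str, typing.Any]:
    # Annotate once, assign in each branch: mypy checks both assignments
    # against the declared type instead of inferring from the first one.
    generate_kwargs: dict[str, typing.Any]
    if is_hpu:
        generate_kwargs = {}
    else:
        generate_kwargs = {"max_new_tokens": 256}
    return generate_kwargs
```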

tests/test_package.py (+13 -7)

```diff
@@ -7,6 +7,7 @@
 
 # Third Party
 from packaging.requirements import Requirement
+from packaging.version import Version
 import pytest
 
 PKG_NAME = "instructlab"
@@ -15,9 +16,8 @@
 # special cases
 EXTRA_CHECKS = {
     "hpu": {
-        "numpy": "1.23.5",
-        "torch": "2.2.2a0",
-        "transformers": "4.40.2",
+        "torch": Version("2.3.1a0"),
+        "transformers": Version("4.43.0"),
     }
 }
 
@@ -44,8 +44,8 @@ def test_require_no_url_req():
 @pytest.mark.parametrize("hw_extra", sorted(HW_EXTRAS))
 @pytest.mark.parametrize("py_version", ["3.10", "3.11"])
 def test_package_conflict(py_version: str, hw_extra: str) -> None:
-    if py_version == "3.11" and hw_extra == "hpu":
-        pytest.skip("Intel Gaudi is not supported on 3.11")
+    if py_version != "3.11" and hw_extra == "hpu":
+        pytest.skip("Intel Gaudi only supports 3.11")
 
     base: dict[str, Requirement] = {}
     hw: dict[str, Requirement] = {}
@@ -72,12 +72,18 @@ def test_package_conflict(py_version: str, hw_extra: str) -> None:
             continue
         for specifier in hwreq.specifier:
             # naive check for common version conflicts
+            # allow pre-releases for Gaudi
             if specifier.operator in {"~=", "==", "<=", ">="}:
-                assert basereq.specifier.contains(specifier.version), (basereq, hwreq)
+                version: Version = Version(specifier.version)
+                assert basereq.specifier.contains(
+                    version, prereleases=version.is_prerelease
+                ), (basereq, hwreq)
 
     # verify special cases against base requirements
     if hw_extra in EXTRA_CHECKS:
         for name, basereq in base.items():
             extra_check = EXTRA_CHECKS[hw_extra].get(name)
             if extra_check is not None:
-                assert basereq.specifier.contains(extra_check), (basereq, extra_check)
+                assert basereq.specifier.contains(
+                    extra_check, prereleases=extra_check.is_prerelease
+                ), (basereq, extra_check)
```
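`packaging` specifier sets exclude pre-releases by default, which is why the test now passes `prereleases=` explicitly when a hardware extra pins a pre-release such as torch 2.3.1a0. A minimal sketch of the behavior the assertions rely on:

```python
from packaging.specifiers import SpecifierSet
from packaging.version import Version

base = SpecifierSet(">=2.3.0,<2.4.0")  # base requirements.txt style pin
hpu = Version("2.3.1a0")               # Gaudi's pre-release torch build

print(base.contains(hpu))                                 # False: excluded by default
print(base.contains(hpu, prereleases=hpu.is_prerelease))  # True: explicit opt-in
```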
