
Add Gaudi Backend #3055


Merged: 5 commits, Feb 28, 2025
5 changes: 5 additions & 0 deletions .gitignore
@@ -23,3 +23,8 @@ server/fbgemmm

.direnv/
.venv/

# Gaudi auto-generated files
hl-smi_log*.txt
.graph_dumps
out
127 changes: 127 additions & 0 deletions Dockerfile_gaudi
@@ -0,0 +1,127 @@
# These arguments are required to build the image
ARG HABANA_VERSION
ARG PYTORCH_VERSION

# Rust builder
FROM lukemathwalker/cargo-chef:latest-rust-1.80 AS chef
WORKDIR /usr/src

ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

FROM chef AS planner
COPY Cargo.lock Cargo.lock
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder

ENV PYO3_PYTHON="/root/.local/bin/python" \
    PYTHON_SYS_EXECUTABLE="/root/.local/bin/python" \
    PYO3_PYTHON_VERSION="3.10"

RUN curl -LsSf https://astral.sh/uv/install.sh | sh \
    && . $HOME/.local/bin/env \
    && uv python install 3.10 --default --preview \
    && test -f /root/.local/bin/python || (echo "Python 3.10 not found at /root/.local/bin/python" && exit 1)

RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP

COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --profile release-opt --recipe-path recipe.json

ARG GIT_SHA
ARG DOCKER_LABEL

COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo build --profile release-opt

# Text Generation Inference base image
ARG HABANA_VERSION
ARG PYTORCH_VERSION

FROM vault.habana.ai/gaudi-docker/${HABANA_VERSION}/ubuntu22.04/habanalabs/pytorch-installer-${PYTORCH_VERSION}:latest AS base

ENV ATTENTION=default
ENV PREFIX_CACHING=0
ENV PREFILL_CHUNKING=0

# Text Generation Inference base env
ENV HF_HOME=/data \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80

# Assert that Python 3.10 is installed as the launcher is compiled with Python 3.10
RUN python3.10 --version || (echo "Python 3.10 is not installed" && exit 1)

# libssl.so.1.1 is not installed on Ubuntu 22.04 by default, install it
RUN wget http://nz2.archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb && \
    dpkg -i ./libssl1.1_1.1.1f-1ubuntu2_amd64.deb

WORKDIR /usr/src

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    libssl-dev \
    ca-certificates \
    make \
    curl \
    git \
    && rm -rf /var/lib/apt/lists/*

# Install server
COPY proto proto
COPY backends/gaudi/server server
COPY backends/gaudi/server/Makefile server/Makefile
ARG HABANA_VERSION
RUN cd server && \
    make gen-server && \
    pip install --no-deps -r requirements.txt && \
    bash ./dill-0.3.8-patch.sh && \
    pip install outlines~=0.0.34 && \
    pip install "git+https://github.com/HabanaAI/DeepSpeed.git@${HABANA_VERSION}" && \
    BUILD_CUDA_EXT=0 pip install git+https://github.com/AutoGPTQ/AutoGPTQ.git@097dd04e --no-build-isolation && \
    pip install . --no-cache-dir

# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher


# AWS Sagemaker compatible image
FROM base AS sagemaker

COPY sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]

# Final image
FROM base

ENV HF_HUB_ENABLE_HF_TRANSFER 1
ENV HABANA_VISIBLE_DEVICES all
ENV OMPI_MCA_btl_vader_single_copy_mechanism NONE

COPY backends/gaudi/tgi-entrypoint.sh /tgi-entrypoint.sh
RUN chmod +x /tgi-entrypoint.sh

ENTRYPOINT ["/tgi-entrypoint.sh"]
CMD ["--json-output"]
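
For reference, the image can also be built manually from the repository root. `HABANA_VERSION` and `PYTORCH_VERSION` are required build arguments; the invocation below is a sketch using the defaults from `backends/gaudi/Makefile`:

```bash
# Manual build from the repository root; the version values mirror the
# Makefile defaults and should match your installed Habana stack.
docker build -t tgi-gaudi -f Dockerfile_gaudi . \
    --build-arg HABANA_VERSION=1.19.0 \
    --build-arg PYTORCH_VERSION=2.5.1
```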
49 changes: 49 additions & 0 deletions backends/gaudi/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
mkfile_dir := $(dir $(mkfile_path))
root_dir := "${mkfile_dir}/../.."

HABANA_VERSION := 1.19.0
PYTORCH_VERSION := 2.5.1

.PHONY: image run-local-dev-container install-dependencies install-server install-router install-launcher local-dev-install

image:
	docker build -t tgi-gaudi -f ${root_dir}/Dockerfile_gaudi ${root_dir} --build-arg HABANA_VERSION=$(HABANA_VERSION) --build-arg PYTORCH_VERSION=$(PYTORCH_VERSION)

run-local-dev-container:
	docker run -it \
		--runtime=habana \
		--ipc=host \
		--cap-add=sys_nice \
		--net=host \
		-e HABANA_VISIBLE_DEVICES=all \
		-e OMPI_MCA_btl_vader_single_copy_mechanism=none \
		-e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
		-e HF_TOKEN=`cat /home/ubuntu/.cache/huggingface/token` \
		-e LOG_LEVEL=debug \
		-e PORT=8080 \
		-v /home/ubuntu/.cache/huggingface:/data \
		-v $(PWD):/text-generation-inference \
		-w /text-generation-inference \
		vault.habana.ai/gaudi-docker/$(HABANA_VERSION)/ubuntu22.04/habanalabs/pytorch-installer-$(PYTORCH_VERSION):latest

install-dependencies:
	pip install git+https://github.com/HabanaAI/DeepSpeed.git@$(HABANA_VERSION)
	pip install outlines~=0.0.34
	curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y

install-server:
	make -C ${root_dir}/backends/gaudi/server install PROTO_PATH=../../../proto/v3

install-router:
	make -C ${root_dir} install-router

install-launcher:
	make -C ${root_dir} install-launcher

# use `source` to put the Rust toolchain on PATH
local-dev-install: install-dependencies
	bash -c 'source "$$HOME/.cargo/env" && \
	make install-server && \
	make install-router && \
	make install-launcher'
98 changes: 98 additions & 0 deletions backends/gaudi/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# Text-generation-inference - Gaudi backend

## Description

This is the Text Generation Inference (TGI) backend for Intel Gaudi. It packages the TGI server optimized for Gaudi hardware.

## Build your own image

The simplest way to build TGI with the Gaudi backend is to use the provided `Makefile`:

Option 1: From the project root directory:
```bash
make -C backends/gaudi image
```

Option 2: From the Gaudi backend directory:
```bash
cd backends/gaudi
make image
```
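
The Habana and PyTorch versions default to the values in `backends/gaudi/Makefile` (1.19.0 and 2.5.1 here). Since these are plain `make` variables, they can be overridden on the command line, for example:

```bash
# Build against a different Habana/PyTorch combination if needed:
make -C backends/gaudi image HABANA_VERSION=1.19.0 PYTORCH_VERSION=2.5.1
```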

You can now run the server with the following command:

Option 1: Sharded:
```bash
model=meta-llama/Llama-3.1-8B-Instruct
hf_token=$(cat ${HOME}/.cache/huggingface/token)
volume=${HOME}/.cache/huggingface

docker run --runtime=habana --ipc=host --cap-add=sys_nice \
-p 8080:80 -v $volume:/data \
-e LOG_LEVEL=debug -e HF_TOKEN=$hf_token \
tgi-gaudi --model-id $model \
--sharded true --num-shard 8 \
--max-input-tokens 512 --max-total-tokens 1024 --max-batch-size 8 --max-batch-prefill-tokens 2048
```

Option 2: Non-sharded:
```bash
model=meta-llama/Llama-3.1-8B-Instruct
hf_token=$(cat ${HOME}/.cache/huggingface/token)
volume=${HOME}/.cache/huggingface

docker run --runtime=habana --ipc=host --cap-add=sys_nice \
-p 8080:80 -v $volume:/data \
-e LOG_LEVEL=debug -e HF_TOKEN=$hf_token \
tgi-gaudi --model-id $model \
--max-input-tokens 512 --max-total-tokens 1024 --max-batch-size 4 --max-batch-prefill-tokens 2048
```

## Contributing

### Local Development

This is useful if you want to run the server locally for better debugging.
```bash
make -C backends/gaudi run-local-dev-container
```

Then run the following command inside the container to install TGI for Gaudi:
```bash
make -C backends/gaudi local-dev-install
```

Add Rust to your PATH:
```bash
. "$HOME/.cargo/env"
```

Option 1: Run the server (sharded model):
```bash
LOG_LEVEL=debug text-generation-launcher \
--model-id meta-llama/Llama-3.1-8B-Instruct \
--sharded true \
--num-shard 8 \
--max-input-tokens 512 \
--max-total-tokens 1024 \
--max-batch-size 8 \
--max-batch-prefill-tokens 2048
```

Option 2: Run the server (non-sharded model):
```bash
LOG_LEVEL=debug text-generation-launcher \
--model-id meta-llama/Llama-3.1-8B-Instruct \
--max-input-tokens 512 \
--max-total-tokens 1024 \
--max-batch-size 4 \
--max-batch-prefill-tokens 2048
```

You can then test the server with the following curl command from another terminal (it can run outside the container):
```bash
curl 127.0.0.1:8080/generate \
-X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
-H 'Content-Type: application/json'
```
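
For token-by-token output, TGI's standard `/generate_stream` route should work as well, assuming the Gaudi backend keeps the stock TGI routes:

```bash
# Streaming variant of the same request (assumes the standard TGI
# /generate_stream endpoint is available in this backend).
curl 127.0.0.1:8080/generate_stream \
    -X POST \
    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
    -H 'Content-Type: application/json'
```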