Merge pull request vllm-project#3 from ilya-lavrenov/docker-file
Added dockerfile with vLLM + openvino
ilya-lavrenov authored Mar 18, 2024
2 parents e913d6b + 5b0db2b commit 354ca31
Showing 5 changed files with 81 additions and 9 deletions.
61 changes: 61 additions & 0 deletions Dockerfile.openvino
@@ -0,0 +1,61 @@
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.

#################### BASE BUILD IMAGE ####################
FROM ubuntu:22.04 AS dev

RUN apt-get update -y && \
    apt-get install -y python3-pip git
WORKDIR /workspace

# build and install OpenVINO
RUN git clone --recurse-submodules -b pytorch_module_extension https://github.com/slyalin/openvino.git
RUN /workspace/openvino/install_build_dependencies.sh
RUN cmake -DCPACK_GENERATOR=DEB -DENABLE_PYTHON=ON -DENABLE_PYTHON_PACKAGING=ON -DENABLE_CPPLINT=OFF \
    -DENABLE_INTEL_GPU=OFF -DENABLE_TEMPLATE=OFF -DENABLE_AUTO=OFF -DENABLE_HETERO=OFF -DENABLE_AUTO_BATCH=OFF \
    -DENABLE_OV_TF_FRONTEND=OFF -DENABLE_OV_ONNX_FRONTEND=OFF -DENABLE_OV_TF_LITE_FRONTEND=OFF -DENABLE_OV_PADDLE_FRONTEND=OFF \
    -S /workspace/openvino -B /workspace/openvino_build
RUN python3 -m pip install -r /workspace/openvino/src/bindings/python/wheel/requirements-dev.txt
RUN cmake --build /workspace/openvino_build --parallel 8
RUN cmake -P /workspace/openvino_build/cmake_install.cmake

# build and install OpenVINO Contrib with PagedAttention
RUN git clone --branch paged-attention https://github.com/ilya-lavrenov/openvino_contrib.git
RUN cmake -DCUSTOM_OPERATIONS=paged_attention -DCMAKE_INSTALL_PREFIX=/usr \
    -S /workspace/openvino_contrib/modules/custom_operations/ -B /workspace/paged_attention_build/
RUN cmake --build /workspace/paged_attention_build/ --parallel 8
RUN cmake -P /workspace/openvino_build/cmake_install.cmake

# Install OpenVINO tokenizers
RUN PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://storage.openvinotoolkit.org/simple/wheels/nightly" python3 -m pip install openvino-tokenizers
#################### BASE BUILD IMAGE ####################


#################### EXTENSION BUILD IMAGE ####################
FROM dev AS build

COPY requirements-build.txt /workspace/vllm/
COPY requirements-openvino.txt /workspace/vllm/

# install build dependencies
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
# install runtime dependencies
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-openvino.txt

COPY vllm/ /workspace/vllm/vllm
COPY setup.py /workspace/vllm/

RUN cmake -P /workspace/paged_attention_build/cmake_install.cmake
RUN python3 -m pip install --no-build-isolation /workspace/vllm/
#################### EXTENSION BUILD IMAGE ####################


#################### OPENAI API SERVER ####################
# openai api server alternative
FROM build AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install accelerate

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################
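For reference: with this file saved at the repository root, the image would typically be built with "docker build -f Dockerfile.openvino -t vllm-openvino ." and started with "docker run -p 8000:8000 vllm-openvino --model <model-id>". The image tag and port here are illustrative, not part of the commit; any arguments after the image name are forwarded to the api_server ENTRYPOINT.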
2 changes: 2 additions & 0 deletions requirements-openvino.txt
@@ -8,3 +8,5 @@ prometheus_client >= 0.18.0
 torch >= 2.1.2
 transformers >= 4.38.0 # Required for Gemma.
 openvino==2024.1.0
+optimum-intel[nncf,openvino]
+outlines >= 0.0.27
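The new optimum-intel[nncf,openvino] dependency backs the is_openvino_optimum_intel() path added in vllm/utils.py below. A minimal sketch of what that package provides, assuming a small example model (the model id and prompt are illustrative, not from this commit):

# Hedged sketch of optimum-intel usage; model id and prompt are illustrative.
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer

model_id = "facebook/opt-125m"  # hypothetical example model
tokenizer = AutoTokenizer.from_pretrained(model_id)
# export=True converts the Hugging Face checkpoint to OpenVINO IR on load
model = OVModelForCausalLM.from_pretrained(model_id, export=True)

inputs = tokenizer("OpenVINO makes inference", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))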
6 changes: 2 additions & 4 deletions setup.py
@@ -51,7 +51,6 @@ def _is_openvino() -> bool:
         import openvino
     except ImportError:
         openvino_available = False
-    openvino_available = os.getenv("VLLM_OPENVINO", "0") == "1"
     return openvino_available

 # Compiler flags.
@@ -124,9 +123,8 @@ def get_neuronxcc_version():


 def get_openvino_version():
-    # import openvino
-    # return openvino.__version__[:8]
-    return "2024.1.0"
+    import openvino
+    return openvino.__version__[:8]

 def get_nvcc_cuda_version(cuda_dir: str) -> Version:
     """Get the CUDA version from nvcc.
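The new get_openvino_version() works by slicing openvino.__version__, which prefixes the release triplet to longer build metadata. A quick illustration (the full string is a representative example of OpenVINO's version format, not taken from this build):

# The [:8] slice keeps only the "YYYY.M.P" release prefix of the version string.
full_version = "2024.1.0-15008-f4afc983258-releases/2024/1"  # example format
print(full_version[:8])  # -> 2024.1.0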
8 changes: 7 additions & 1 deletion vllm/engine/async_llm_engine.py
@@ -325,7 +325,12 @@ def from_engine_args(cls,
         # Create the engine configs.
         engine_configs = engine_args.create_engine_configs()
         parallel_config = engine_configs[2]
-        if parallel_config.worker_use_ray or engine_args.engine_use_ray:
+        device_config = engine_configs[4]
+
+        if device_config.is_openvino:
+            from vllm.executor.openvino_executor import OpenVINOExecutorAsync
+            executor_class = OpenVINOExecutorAsync
+        elif parallel_config.worker_use_ray or engine_args.engine_use_ray:
             initialize_ray_cluster(parallel_config)
             from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync
             executor_class = RayGPUExecutorAsync
@@ -334,6 +339,7 @@ def from_engine_args(cls,
                 "Ray is required if parallel_config.world_size > 1.")
             from vllm.executor.gpu_executor import GPUExecutorAsync
             executor_class = GPUExecutorAsync
+
         # Create the async LLM engine.
         engine = cls(parallel_config.worker_use_ray,
                      engine_args.engine_use_ray,
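The hunk above extends vLLM's executor-selection dispatch: an executor class is chosen from the device and parallel configs, then handed to the engine constructor. A self-contained sketch of that control flow, using simplified stand-in classes rather than the real vLLM types:

# Simplified stand-ins for the executor-selection logic above; these are
# illustrative classes, not vLLM's actual configs or executors.
from dataclasses import dataclass

@dataclass
class DeviceConfig:
    is_openvino: bool = False

@dataclass
class ParallelConfig:
    worker_use_ray: bool = False
    world_size: int = 1

class OpenVINOExecutorAsync: ...
class RayGPUExecutorAsync: ...
class GPUExecutorAsync: ...

def select_executor_class(device_config, parallel_config):
    # OpenVINO is checked first, so it takes precedence over the Ray/GPU paths
    if device_config.is_openvino:
        return OpenVINOExecutorAsync
    elif parallel_config.worker_use_ray:
        return RayGPUExecutorAsync
    else:
        assert parallel_config.world_size == 1, (
            "Ray is required if parallel_config.world_size > 1.")
        return GPUExecutorAsync

assert select_executor_class(DeviceConfig(is_openvino=True),
                             ParallelConfig()) is OpenVINOExecutorAsync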
13 changes: 9 additions & 4 deletions vllm/utils.py
@@ -127,15 +127,20 @@ def is_neuron() -> bool:
     return transformers_neuronx is not None

 def is_openvino() -> bool:
+    is_openvino_available = True
     try:
         import openvino
     except ImportError:
-        openvino = None
-    return openvino is not None
+        is_openvino_available = False
+    return is_openvino_available

 def is_openvino_optimum_intel() -> bool:
-    openvino_optimum_intel = True if os.getenv('VLLM_OPENVINO_OPTIMUM', "0") == "1" else False
-    return is_openvino() and openvino_optimum_intel
+    is_optimum_intel_available = is_openvino()
+    try:
+        import optimum.intel
+    except:
+        is_optimum_intel_available = False
+    return is_optimum_intel_available

 def get_max_shared_memory_bytes(gpu: int = 0) -> int:
     """Returns the maximum shared memory per thread block in bytes."""
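Both helpers now follow the same import-probe idiom: optimistically set a flag, attempt the import, and flip the flag on failure. A generic variant for reference (the helper name is ours, not part of the commit):

# Generic import-probe; importlib.util.find_spec checks availability without
# executing the target module. The helper name is illustrative.
import importlib.util

def module_available(name: str) -> bool:
    try:
        return importlib.util.find_spec(name) is not None
    except ModuleNotFoundError:  # raised when a parent package is missing
        return False

print(module_available("openvino"))       # True only if openvino is installed
print(module_available("optimum.intel"))  # True only if optimum-intel is installed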
