From b4609123bad02c74ca39568b98542d11700f9922 Mon Sep 17 00:00:00 2001 From: Wovchena Date: Wed, 5 Jun 2024 14:59:34 +0400 Subject: [PATCH] Split text samples to sepparate folders --- .github/workflows/causal_lm_cpp.yml | 64 +++++++------- CMakeLists.txt | 16 +++- .../causal_lm => samples}/cpp/README.md | 0 .../cpp/beam_search_causal_lm/CMakeLists.txt | 14 +++ samples/cpp/beam_search_causal_lm/README.md | 50 +++++++++++ .../beam_search_causal_lm.cpp | 0 samples/cpp/chat_sample/CMakeLists.txt | 14 +++ samples/cpp/chat_sample/README.md | 50 +++++++++++ .../cpp/chat_sample}/chat_sample.cpp | 0 samples/cpp/greedy_causal_lm/CMakeLists.txt | 14 +++ samples/cpp/greedy_causal_lm/README.md | 50 +++++++++++ .../greedy_causal_lm}/greedy_causal_lm.cpp | 0 .../cpp/multinomial_causal_lm/CMakeLists.txt | 14 +++ samples/cpp/multinomial_causal_lm/README.md | 50 +++++++++++ .../multinomial_causal_lm.cpp | 0 .../prompt_lookup_decoding_lm/CMakeLists.txt | 18 ++++ .../cpp/prompt_lookup_decoding_lm/README.md | 52 +++++++++++ .../prompt_lookup_decoding_lm.cpp | 1 + .../cpp/requirements.txt | 0 .../speculative_decoding_lm/CMakeLists.txt | 18 ++++ samples/cpp/speculative_decoding_lm/README.md | 57 ++++++++++++ .../speculative_decoding_lm.cpp | 0 src/README.md | 81 ++++++++++++++++++ .../causal_lm/cpp => src}/beam_idx-drop.gif | Bin .../causal_lm/cpp => src}/beam_idx-fork.gif | Bin src/cpp/src/llm_pipeline.cpp | 4 +- .../causal_lm/cpp => src}/stateful.jpg | Bin .../causal_lm/cpp => src}/stateless.jpg | Bin text_generation/causal_lm/cpp/CMakeLists.txt | 63 -------------- 29 files changed, 532 insertions(+), 98 deletions(-) rename {text_generation/causal_lm => samples}/cpp/README.md (100%) create mode 100644 samples/cpp/beam_search_causal_lm/CMakeLists.txt create mode 100644 samples/cpp/beam_search_causal_lm/README.md rename {text_generation/causal_lm/cpp => samples/cpp/beam_search_causal_lm}/beam_search_causal_lm.cpp (100%) create mode 100644 samples/cpp/chat_sample/CMakeLists.txt create mode 100644 samples/cpp/chat_sample/README.md rename {text_generation/causal_lm/cpp => samples/cpp/chat_sample}/chat_sample.cpp (100%) create mode 100644 samples/cpp/greedy_causal_lm/CMakeLists.txt create mode 100644 samples/cpp/greedy_causal_lm/README.md rename {text_generation/causal_lm/cpp => samples/cpp/greedy_causal_lm}/greedy_causal_lm.cpp (100%) create mode 100644 samples/cpp/multinomial_causal_lm/CMakeLists.txt create mode 100644 samples/cpp/multinomial_causal_lm/README.md rename {text_generation/causal_lm/cpp => samples/cpp/multinomial_causal_lm}/multinomial_causal_lm.cpp (100%) create mode 100644 samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt create mode 100644 samples/cpp/prompt_lookup_decoding_lm/README.md rename {text_generation/causal_lm/cpp => samples/cpp/prompt_lookup_decoding_lm}/prompt_lookup_decoding_lm.cpp (99%) rename {text_generation/causal_lm => samples}/cpp/requirements.txt (100%) create mode 100644 samples/cpp/speculative_decoding_lm/CMakeLists.txt create mode 100644 samples/cpp/speculative_decoding_lm/README.md rename {text_generation/causal_lm/cpp => samples/cpp/speculative_decoding_lm}/speculative_decoding_lm.cpp (100%) rename {text_generation/causal_lm/cpp => src}/beam_idx-drop.gif (100%) rename {text_generation/causal_lm/cpp => src}/beam_idx-fork.gif (100%) rename {text_generation/causal_lm/cpp => src}/stateful.jpg (100%) rename {text_generation/causal_lm/cpp => src}/stateless.jpg (100%) delete mode 100644 text_generation/causal_lm/cpp/CMakeLists.txt diff --git 
a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index d78a574e7..4d8804a1f 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -3,7 +3,8 @@ on: pull_request: paths: - .github/workflows/causal_lm_cpp.yml - - text_generation/causal_lm/cpp/* + - src/* + - samples/* - thirdparty/openvino_tokenizers - "!**.md" concurrency: @@ -28,7 +29,7 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt + python -m pip install --upgrade-strategy eager -r ./samples/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model openlm-research/open_llama_3b_v2 open_llama_3b_v2 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ @@ -36,7 +37,7 @@ jobs: - name: greedy_causal_lm run: | source ./ov/setupvars.sh - ./build/text_generation/causal_lm/cpp/greedy_causal_lm ./open_llama_3b_v2/ "return 0" + ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./open_llama_3b_v2/ "return 0" cpp-beam_search_causal_lm-ubuntu: runs-on: ubuntu-20.04 @@ -55,7 +56,7 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt + python -m pip install --upgrade-strategy eager -r ./samples/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ @@ -64,7 +65,7 @@ jobs: run: | source ./ov/setupvars.sh - timeout 25s ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" > ./pred.txt + timeout 25s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -80,7 +81,7 @@ jobs: " echo "Why is the Sun yellow?" 
passed - timeout 25s ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ 69 > ./pred.txt + timeout 25s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ 69 > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -96,7 +97,7 @@ jobs: " echo "69" passed - timeout 25s ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ Hi > ./pred.txt + timeout 25s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ Hi > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -112,7 +113,7 @@ jobs: " echo "Hi" passed - timeout 25s ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "return 0" > ./pred.txt + timeout 25s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "return 0" > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -128,7 +129,7 @@ jobs: " echo "return 0" passed - ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "你好! 你好嗎?" > ./pred.txt + timeout 25s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "你好! 你好嗎?" > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -144,7 +145,7 @@ jobs: " echo "你好! 你好嗎?" passed - timeout 1m ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Alan Turing was a" "return 0" "你好! 你好嗎?" > ./pred.txt + timeout 1m ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Alan Turing was a" "return 0" "你好! 你好嗎?" > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -183,7 +184,7 @@ jobs: shell: cmd run: | call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat - python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt + python -m pip install --upgrade-strategy eager -r ./samples/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ @@ -193,7 +194,7 @@ jobs: run: | call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat set PATH=.\build\openvino_genai\;%PATH% - .\build\text_generation\causal_lm\cpp\Release\beam_search_causal_lm.exe .\TinyLlama-1.1B-Chat-v1.0\ "69" > .\pred.txt + .\build\samples\cpp\beam_search_causal_lm\Release\beam_search_causal_lm.exe .\TinyLlama-1.1B-Chat-v1.0\ "69" > .\pred.txt echo import transformers > ref.py echo predictions = open('pred.txt', 'r').read() >> ref.py @@ -224,7 +225,7 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt + python -m pip install --upgrade-strategy eager -r ./samples/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ @@ -232,7 +233,7 @@
jobs: - name: Compare run: | source ./ov/setupvars.sh - timeout 50s ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./Qwen-7B-Chat/ 69 > ./pred.txt + timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./Qwen-7B-Chat/ 69 > ./pred.txt cpp-beam_search_causal_lm-Qwen1_5-7B-Chat: runs-on: ubuntu-20.04-16-cores @@ -251,7 +252,7 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt + python -m pip install --upgrade-strategy eager -r ./samples/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen1.5-7B-Chat Qwen1.5-7B-Chat cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ @@ -259,7 +260,7 @@ jobs: - name: Run run: | source ./ov/setupvars.sh - timeout 50s ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好!" > ./pred_qwen15.txt + timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好!" > ./pred_qwen15.txt cpp-beam_search_causal_lm-Phi-2: runs-on: ubuntu-20.04-16-cores @@ -278,7 +279,7 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt + python -m pip install --upgrade-strategy eager -r ./samples/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-2 phi-2 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ @@ -286,7 +287,7 @@ jobs: - name: Compare run: | source ./ov/setupvars.sh - timeout 50s ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./phi-2/ 69 > ./pred.txt + timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./phi-2/ 69 > ./pred.txt cpp-beam_search_causal_lm-notus-7b-v1: runs-on: ubuntu-20.04-16-cores @@ -305,7 +306,7 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt + python -m pip install --upgrade-strategy eager -r ./samples/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model argilla/notus-7b-v1 notus-7b-v1 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ @@ -313,7 +314,7 @@ jobs: - name: Compare run: | source ./ov/setupvars.sh - timeout 50s ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./notus-7b-v1/ 69 > ./pred.txt + timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./notus-7b-v1/ 69 > ./pred.txt cpp-speculative_decoding_lm-ubuntu: runs-on: ubuntu-20.04-16-cores @@ -332,7 +333,7 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt + python -m pip install --upgrade-strategy eager -r ./samples/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url 
https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b @@ -341,8 +342,8 @@ jobs: - name: run and compare run: | source ./ov/setupvars.sh - ./build/text_generation/causal_lm/cpp/speculative_decoding_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_speculative.txt - ./build/text_generation/causal_lm/cpp/greedy_causal_lm ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt + ./build/samples/cpp/speculative_decoding_lm/speculative_decoding_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_speculative.txt + ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() @@ -370,7 +371,7 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt + python -m pip install --upgrade-strategy eager -r ./samples/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ @@ -386,8 +387,8 @@ jobs: Question: Can you please add 2 and 3 A:' > ./prompt.txt - ./build/text_generation/causal_lm/cpp/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_prompt_lookup.txt - ./build/text_generation/causal_lm/cpp/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_greedy.txt + ./build/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_prompt_lookup.txt + ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_greedy.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() @@ -414,7 +415,7 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt + python -m pip install --upgrade-strategy eager -r ./samples/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-1_5 phi-1_5 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ @@ -422,8 +423,8 @@ jobs: - name: Run Generation run: | source ./ov/setupvars.sh - timeout 50s ./build/text_generation/causal_lm/cpp/greedy_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_greedy.txt - timeout 50s ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_beam.txt + timeout 50s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_greedy.txt + timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_beam.txt - name: Compare run: | python -c " @@ -458,7 +459,7 @@ jobs: - name: Download, convert and build run: | source
./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt + python -m pip install --upgrade-strategy eager -r ./samples/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model ikala/redpajama-3b-chat redpajama-3b-chat cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ @@ -467,7 +468,8 @@ jobs: - name: Run Generation run: | source ./ov/setupvars.sh - timeout 50s ./build/text_generation/causal_lm/cpp/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt + + timeout 50s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt - name: Compare run: | python -c " diff --git a/CMakeLists.txt b/CMakeLists.txt index f67ba52e4..b8cad76f5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,9 +21,21 @@ project(OpenVINOGenAI VERSION 2024.2.0.0) add_subdirectory(./thirdparty/) add_subdirectory(src) -add_subdirectory(text_generation/causal_lm/cpp) +add_subdirectory(samples/cpp/beam_search_causal_lm/) +add_subdirectory(samples/cpp/chat_sample/) +add_subdirectory(samples/cpp/greedy_causal_lm/) +add_subdirectory(samples/cpp/multinomial_causal_lm/) +add_subdirectory(samples/cpp/prompt_lookup_decoding_lm/) +add_subdirectory(samples/cpp/speculative_decoding_lm/) -install(DIRECTORY text_generation/causal_lm/cpp/ DESTINATION samples/cpp/causal_lm COMPONENT cpp_samples_genai) +install(DIRECTORY + ./samples/cpp/beam_search_causal_lm + ./samples/cpp/chat_sample + ./samples/cpp/greedy_causal_lm + ./samples/cpp/multinomial_causal_lm + # Don't install prompt_lookup_decoding_lm and speculative_decoding_lm because they don't use the openvino_genai library and aren't verified yet. + DESTINATION samples/cpp/ COMPONENT cpp_samples_genai) +install(FILES ./samples/cpp/requirements.txt DESTINATION samples/cpp/ COMPONENT cpp_samples_genai) install(FILES LICENSE DESTINATION licensing COMPONENT licensing_genai RENAME LICENSE-GENAI) install(FILES third-party-programs.txt DESTINATION licensing COMPONENT licensing_genai RENAME third-party-programs-genai.txt) if(MSVC AND NOT DEFINED CPACK_GENERATOR) diff --git a/text_generation/causal_lm/cpp/README.md b/samples/cpp/README.md similarity index 100% rename from text_generation/causal_lm/cpp/README.md rename to samples/cpp/README.md diff --git a/samples/cpp/beam_search_causal_lm/CMakeLists.txt b/samples/cpp/beam_search_causal_lm/CMakeLists.txt new file mode 100644 index 000000000..9728eee3b --- /dev/null +++ b/samples/cpp/beam_search_causal_lm/CMakeLists.txt @@ -0,0 +1,14 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +find_package(OpenVINOGenAI REQUIRED PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alongside OpenVINO.
+) +add_executable(beam_search_causal_lm beam_search_causal_lm.cpp) +target_link_libraries(beam_search_causal_lm PRIVATE openvino::genai) +target_compile_features(beam_search_causal_lm PRIVATE cxx_std_17) +install(TARGETS beam_search_causal_lm + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/beam_search_causal_lm/README.md b/samples/cpp/beam_search_causal_lm/README.md new file mode 100644 index 000000000..b5ad4581a --- /dev/null +++ b/samples/cpp/beam_search_causal_lm/README.md @@ -0,0 +1,50 @@ +# Text generation C++ sample that supports most popular models like LLaMA 2 + +This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options, in order to encourage the reader to explore and modify the source code. It's only possible to change the device for inference to a different one, GPU for example, from the command line interface. The sample features `ov::genai::LLMPipeline` and configures it to use multiple beam groups. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/254-llm-chatbot) which provides an example of an LLM-powered Chatbot in Python. + +## Install OpenVINO + +Install [OpenVINO Archives >= 2024.2](docs.openvino.ai/install). `master` and possibly the latest `releases/*` branch correspond to not-yet-released OpenVINO versions. https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/ can be used for early testing of these branches. `` below refers to the extraction location. + +## Install OpenVINOGenAI + +Follow [../../../src/README.md](../../../src/README.md). + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +#### Linux/macOS + +```sh +source /setupvars.sh +python3 -m pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +#### Windows + +```bat +\setupvars.bat +python -m pip install --upgrade-strategy eager -r requirements.txt +optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +## Run + +### Usage: +`beam_search_causal_lm ""` + +### Examples: + +#### Linux/MacOS: +`./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?"` + +#### Windows: +`.\build\samples\cpp\beam_search_causal_lm\Release\beam_search_causal_lm .\TinyLlama-1.1B-Chat-v1.0\ "Why is the Sun yellow?"` + +To enable Unicode characters for Windows cmd, open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. + +Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. + +See [../../../src/README.md#supported-models](../../../src/README.md#supported-models) for the list of supported models.
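+
+### Illustrative API sketch
+
+The following is a condensed sketch of what this sample does with `ov::genai::LLMPipeline`, not a copy of `beam_search_causal_lm.cpp`; the `GenerationConfig` field names used here (`num_beams`, `num_beam_groups`, `num_return_sequences`, `max_new_tokens`) are assumptions that should be checked against the installed OpenVINO GenAI headers:
+
+```cpp
+#include <iostream>
+#include "openvino/genai/llm_pipeline.hpp"
+
+int main(int argc, char* argv[]) {
+    // argv[1] is the model directory, argv[2] is the prompt.
+    ov::genai::LLMPipeline pipe(argv[1], "CPU");  // change "CPU" to "GPU" to run on a dGPU
+    ov::genai::GenerationConfig config = pipe.get_generation_config();
+    config.max_new_tokens = 100;
+    config.num_beam_groups = 3;        // grouped beam search
+    config.num_beams = 15;
+    config.num_return_sequences = 15;  // print every finished beam
+    ov::genai::DecodedResults results = pipe.generate(argv[2], config);
+    for (const std::string& text : results.texts) {
+        std::cout << text << '\n';
+    }
+}
+```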
diff --git a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp b/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp similarity index 100% rename from text_generation/causal_lm/cpp/beam_search_causal_lm.cpp rename to samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp diff --git a/samples/cpp/chat_sample/CMakeLists.txt b/samples/cpp/chat_sample/CMakeLists.txt new file mode 100644 index 000000000..347ff4382 --- /dev/null +++ b/samples/cpp/chat_sample/CMakeLists.txt @@ -0,0 +1,14 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +find_package(OpenVINOGenAI REQUIRED PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alongside OpenVINO. +) +add_executable(chat_sample chat_sample.cpp) +target_link_libraries(chat_sample PRIVATE openvino::genai) +target_compile_features(chat_sample PRIVATE cxx_std_17) +install(TARGETS chat_sample + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/chat_sample/README.md b/samples/cpp/chat_sample/README.md new file mode 100644 index 000000000..97f8a10cb --- /dev/null +++ b/samples/cpp/chat_sample/README.md @@ -0,0 +1,50 @@ +# C++ chat_sample that supports most popular models like LLaMA 2 + +This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options, in order to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `ov::genai::LLMPipeline` and configures it for the chat scenario. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/254-llm-chatbot) which provides an example of an LLM-powered Chatbot in Python. + +## Install OpenVINO + +Install [OpenVINO Archives >= 2024.2](docs.openvino.ai/install). `master` and possibly the latest `releases/*` branch correspond to not-yet-released OpenVINO versions. https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/ can be used for early testing of these branches. `` below refers to the extraction location. + +## Install OpenVINOGenAI + +Follow [../../../src/README.md](../../../src/README.md). + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +#### Linux/macOS + +```sh +source /setupvars.sh +python3 -m pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +#### Windows + +```bat +\setupvars.bat +python -m pip install --upgrade-strategy eager -r requirements.txt +optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +## Run + +### Usage: +`chat_sample ` + +### Examples: + +#### Linux/MacOS: +`./build/samples/cpp/chat_sample/chat_sample ./TinyLlama-1.1B-Chat-v1.0/` + +#### Windows: +`.\build\samples\cpp\chat_sample\Release\chat_sample .\TinyLlama-1.1B-Chat-v1.0\` + +To enable Unicode characters for Windows cmd, open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot.
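+
+### Illustrative API sketch
+
+A minimal chat loop, roughly what `chat_sample.cpp` implements; the `start_chat`/`finish_chat` methods and the callback-based streamer of `ov::genai::LLMPipeline` are assumptions to verify against the installed OpenVINO GenAI headers:
+
+```cpp
+#include <functional>
+#include <iostream>
+#include <string>
+#include "openvino/genai/llm_pipeline.hpp"
+
+int main(int argc, char* argv[]) {
+    ov::genai::LLMPipeline pipe(argv[1], "CPU");  // argv[1] is the model directory
+    ov::genai::GenerationConfig config = pipe.get_generation_config();
+    config.max_new_tokens = 100;
+    // Print tokens as soon as they are generated; returning false means "keep generating".
+    std::function<bool(std::string)> streamer = [](std::string word) {
+        std::cout << word << std::flush;
+        return false;
+    };
+    pipe.start_chat();  // keep the KV-cache of previous turns between generate() calls
+    std::string prompt;
+    while (std::getline(std::cin, prompt) && !prompt.empty()) {
+        pipe.generate(prompt, config, streamer);
+        std::cout << "\n----------\n";
+    }
+    pipe.finish_chat();
+}
+```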
+ +Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. + +See [../../../src/README.md#supported-models](../../../src/README.md#supported-models) for the list of supported models. diff --git a/text_generation/causal_lm/cpp/chat_sample.cpp b/samples/cpp/chat_sample/chat_sample.cpp similarity index 100% rename from text_generation/causal_lm/cpp/chat_sample.cpp rename to samples/cpp/chat_sample/chat_sample.cpp diff --git a/samples/cpp/greedy_causal_lm/CMakeLists.txt b/samples/cpp/greedy_causal_lm/CMakeLists.txt new file mode 100644 index 000000000..c49203608 --- /dev/null +++ b/samples/cpp/greedy_causal_lm/CMakeLists.txt @@ -0,0 +1,14 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +find_package(OpenVINOGenAI REQUIRED PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alongside OpenVINO. +) +add_executable(greedy_causal_lm greedy_causal_lm.cpp) +target_link_libraries(greedy_causal_lm PRIVATE openvino::genai) +target_compile_features(greedy_causal_lm PRIVATE cxx_std_17) +install(TARGETS greedy_causal_lm + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/greedy_causal_lm/README.md b/samples/cpp/greedy_causal_lm/README.md new file mode 100644 index 000000000..5e78917a3 --- /dev/null +++ b/samples/cpp/greedy_causal_lm/README.md @@ -0,0 +1,50 @@ +# Text generation C++ greedy_causal_lm that supports most popular models like LLaMA 2 + +This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options, in order to encourage the reader to explore and modify the source code. It's only possible to change the device for inference to a different one, GPU for example, from the command line interface. The sample features `ov::genai::LLMPipeline` and configures it to run the simplest deterministic greedy sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/254-llm-chatbot) which provides an example of an LLM-powered Chatbot in Python. + +## Install OpenVINO + +Install [OpenVINO Archives >= 2024.2](docs.openvino.ai/install). `master` and possibly the latest `releases/*` branch correspond to not-yet-released OpenVINO versions. https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/ can be used for early testing of these branches. `` below refers to the extraction location. + +## Install OpenVINOGenAI + +Follow [../../../src/README.md](../../../src/README.md). + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.
+ +#### Linux/macOS + +```sh +source /setupvars.sh +python3 -m pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +#### Windows + +```bat +\setupvars.bat +python -m pip install --upgrade-strategy eager -r requirements.txt +optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +## Run + +### Usage: +`greedy_causal_lm ""` + +### Examples: + +#### Linux/MacOS: +`./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?"` + +#### Windows: +`.\build\samples\cpp\greedy_causal_lm\Release\greedy_causal_lm .\TinyLlama-1.1B-Chat-v1.0\ "Why is the Sun yellow?"` + +To enable Unicode characters for Windows cmd, open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. + +Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. + +See [../../../src/README.md#supported-models](../../../src/README.md#supported-models) for the list of supported models. diff --git a/text_generation/causal_lm/cpp/greedy_causal_lm.cpp b/samples/cpp/greedy_causal_lm/greedy_causal_lm.cpp similarity index 100% rename from text_generation/causal_lm/cpp/greedy_causal_lm.cpp rename to samples/cpp/greedy_causal_lm/greedy_causal_lm.cpp diff --git a/samples/cpp/multinomial_causal_lm/CMakeLists.txt b/samples/cpp/multinomial_causal_lm/CMakeLists.txt new file mode 100644 index 000000000..1d79af25d --- /dev/null +++ b/samples/cpp/multinomial_causal_lm/CMakeLists.txt @@ -0,0 +1,14 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +find_package(OpenVINOGenAI REQUIRED PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alongside OpenVINO. +) +add_executable(multinomial_causal_lm multinomial_causal_lm.cpp) +target_link_libraries(multinomial_causal_lm PRIVATE openvino::genai) +target_compile_features(multinomial_causal_lm PRIVATE cxx_std_17) +install(TARGETS multinomial_causal_lm + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/multinomial_causal_lm/README.md b/samples/cpp/multinomial_causal_lm/README.md new file mode 100644 index 000000000..f32d2d653 --- /dev/null +++ b/samples/cpp/multinomial_causal_lm/README.md @@ -0,0 +1,50 @@ +# Text generation C++ multinomial_causal_lm that supports most popular models like LLaMA 2 + +This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options, in order to encourage the reader to explore and modify the source code. It's only possible to change the device for inference to a different one, GPU for example, from the command line interface. The sample features `ov::genai::LLMPipeline` and configures it to run the random sampling algorithm.
There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/254-llm-chatbot) which provides an example of an LLM-powered Chatbot in Python. + +## Install OpenVINO + +Install [OpenVINO Archives >= 2024.2](docs.openvino.ai/install). `master` and possibly the latest `releases/*` branch correspond to not-yet-released OpenVINO versions. https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/ can be used for early testing of these branches. `` below refers to the extraction location. + +## Install OpenVINOGenAI + +Follow [../../../src/README.md](../../../src/README.md). + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +#### Linux/macOS + +```sh +source /setupvars.sh +python3 -m pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +#### Windows + +```bat +\setupvars.bat +python -m pip install --upgrade-strategy eager -r requirements.txt +optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +## Run + +### Usage: +`multinomial_causal_lm ""` + +### Examples: + +#### Linux/MacOS: +`./build/samples/cpp/multinomial_causal_lm/multinomial_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?"` + +#### Windows: +`.\build\samples\cpp\multinomial_causal_lm\Release\multinomial_causal_lm .\TinyLlama-1.1B-Chat-v1.0\ "Why is the Sun yellow?"` + +To enable Unicode characters for Windows cmd, open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. + +Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. + +See [../../../src/README.md#supported-models](../../../src/README.md#supported-models) for the list of supported models.
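+
+### Illustrative API sketch
+
+A condensed sketch of what this sample configures, not a copy of `multinomial_causal_lm.cpp`; the sampling-related `GenerationConfig` fields (`do_sample`, `top_p`, `top_k`, `temperature`) and the streaming callback are assumptions to verify against the installed OpenVINO GenAI headers:
+
+```cpp
+#include <functional>
+#include <iostream>
+#include <string>
+#include "openvino/genai/llm_pipeline.hpp"
+
+int main(int argc, char* argv[]) {
+    // argv[1] is the model directory, argv[2] is the prompt.
+    ov::genai::LLMPipeline pipe(argv[1], "CPU");
+    ov::genai::GenerationConfig config = pipe.get_generation_config();
+    config.do_sample = true;    // switch from greedy decoding to random sampling
+    config.top_p = 0.9f;        // nucleus sampling: keep the smallest token set with 90% probability mass
+    config.top_k = 30;          // consider at most 30 candidate tokens per step
+    config.temperature = 0.7f;  // <1 sharpens the distribution, >1 flattens it
+    config.max_new_tokens = 100;
+    // Stream subwords to stdout as they are sampled; returning false means "keep generating".
+    std::function<bool(std::string)> streamer = [](std::string subword) {
+        std::cout << subword << std::flush;
+        return false;
+    };
+    ov::genai::DecodedResults results = pipe.generate(argv[2], config, streamer);
+    std::cout << '\n';
+}
+```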
diff --git a/text_generation/causal_lm/cpp/multinomial_causal_lm.cpp b/samples/cpp/multinomial_causal_lm/multinomial_causal_lm.cpp similarity index 100% rename from text_generation/causal_lm/cpp/multinomial_causal_lm.cpp rename to samples/cpp/multinomial_causal_lm/multinomial_causal_lm.cpp diff --git a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt new file mode 100644 index 000000000..1fff62a1a --- /dev/null +++ b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt @@ -0,0 +1,18 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +if(TARGET openvino_tokenizers) + set(OPENVINO_TOKENIZERS_PATH $) +else() + message(FATAL_ERROR "prompt_lookup_decoding_lm must be compiled as part of OpenVINOGenAI to have the path to openvino_tokenizers hardcoded.") +endif() +find_package(OpenVINO REQUIRED COMPONENTS Runtime) +find_package(TBB REQUIRED COMPONENTS tbb) +add_executable(prompt_lookup_decoding_lm prompt_lookup_decoding_lm.cpp) +target_link_libraries(prompt_lookup_decoding_lm PRIVATE openvino::runtime TBB::tbb) +target_compile_definitions(prompt_lookup_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH="${OPENVINO_TOKENIZERS_PATH}") +target_compile_features(prompt_lookup_decoding_lm PRIVATE cxx_std_17) +install(TARGETS prompt_lookup_decoding_lm + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/prompt_lookup_decoding_lm/README.md b/samples/cpp/prompt_lookup_decoding_lm/README.md new file mode 100644 index 000000000..5d7472005 --- /dev/null +++ b/samples/cpp/prompt_lookup_decoding_lm/README.md @@ -0,0 +1,52 @@ +# prompt_lookup_decoding_lm C++ sample that supports most popular models like LLaMA 2 + +[Prompt Lookup decoding](https://github.com/apoorvumang/prompt-lookup-decoding) is an [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) technique where the draft model is replaced with simple string matching in the prompt to generate candidate token sequences. This method is highly effective for input-grounded generation (summarization, document QA, multi-turn chat, code editing), where there is high n-gram overlap between the LLM input (prompt) and the LLM output. This could be entity names, phrases, or code chunks that the LLM directly copies from the input while generating the output. Prompt lookup exploits this pattern to speed up autoregressive decoding in LLMs. This results in significant speedups with no effect on output quality. + +This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options, in order to encourage the reader to explore and modify the source code. Loading `openvino_tokenizers` to `ov::Core` enables tokenization. Run `optimum-cli` to generate IRs for the samples. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/254-llm-chatbot) which provides an example of an LLM-powered Chatbot in Python. + +## Install OpenVINO + +Install [OpenVINO Archives >= 2024.2](docs.openvino.ai/install). `master` and possibly the latest `releases/*` branch correspond to not-yet-released OpenVINO versions. https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/ can be used for early testing of these branches. `` below refers to the extraction location.
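+
+### How the candidate lookup works (illustrative)
+
+The string-matching idea described above can be reduced to a few lines. The following is a self-contained illustration of the n-gram lookup, not the code used by `prompt_lookup_decoding_lm.cpp`:
+
+```cpp
+#include <algorithm>
+#include <cstdint>
+#include <iostream>
+#include <vector>
+
+// Return up to num_candidates tokens that followed the most recent earlier occurrence
+// of the last ngram_size tokens. An empty result means no match was found and the
+// caller falls back to ordinary autoregressive decoding for this step.
+std::vector<int64_t> propose_candidates(const std::vector<int64_t>& tokens,
+                                        size_t ngram_size,
+                                        size_t num_candidates) {
+    if (tokens.size() <= ngram_size) {
+        return {};
+    }
+    const size_t suffix_begin = tokens.size() - ngram_size;
+    // Walk backwards so that the most recent match wins.
+    for (size_t start = suffix_begin; start-- > 0;) {
+        if (std::equal(tokens.begin() + start, tokens.begin() + start + ngram_size,
+                       tokens.begin() + suffix_begin)) {
+            const size_t copy_from = start + ngram_size;
+            const size_t copy_to = std::min(copy_from + num_candidates, tokens.size());
+            return std::vector<int64_t>(tokens.begin() + copy_from, tokens.begin() + copy_to);
+        }
+    }
+    return {};
+}
+
+int main() {
+    // The trailing bigram {3, 5} also occurs at the beginning of the sequence,
+    // so the tokens that followed it there ({7, 9, 11}) become draft candidates
+    // for the main model to verify in a single inference request.
+    std::vector<int64_t> tokens{3, 5, 7, 9, 11, 3, 5};
+    for (int64_t token : propose_candidates(tokens, 2, 3)) {
+        std::cout << token << ' ';  // prints: 7 9 11
+    }
+    std::cout << '\n';
+}
+```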
+ +## Install OpenVINOGenAI + +Follow [../../../src/README.md](../../../src/README.md). + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +#### Linux/macOS + +```sh +source /setupvars.sh +python3 -m pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +#### Windows + +```bat +\setupvars.bat +python -m pip install --upgrade-strategy eager -r requirements.txt +optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +## Run + +### Usage: +`prompt_lookup_decoding_lm ""` + +### Examples: + +#### Linux/MacOS: +`./build/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "return 0;"` + +#### Windows: +`.\build\samples\cpp\prompt_lookup_decoding_lm\Release\prompt_lookup_decoding_lm .\TinyLlama-1.1B-Chat-v1.0\ "return 0;"` + +To enable Unicode characters for Windows cmd, open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. + +Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. + +See [../../../src/README.md#supported-models](../../../src/README.md#supported-models) for the list of supported models. diff --git a/text_generation/causal_lm/cpp/prompt_lookup_decoding_lm.cpp b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp similarity index 99% rename from text_generation/causal_lm/cpp/prompt_lookup_decoding_lm.cpp rename to samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp index 5060b8864..cd6de3775 100644 --- a/text_generation/causal_lm/cpp/prompt_lookup_decoding_lm.cpp +++ b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp @@ -1,6 +1,7 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 +#include #include #include diff --git a/text_generation/causal_lm/cpp/requirements.txt b/samples/cpp/requirements.txt similarity index 100% rename from text_generation/causal_lm/cpp/requirements.txt rename to samples/cpp/requirements.txt diff --git a/samples/cpp/speculative_decoding_lm/CMakeLists.txt b/samples/cpp/speculative_decoding_lm/CMakeLists.txt new file mode 100644 index 000000000..e18ffec97 --- /dev/null +++ b/samples/cpp/speculative_decoding_lm/CMakeLists.txt @@ -0,0 +1,18 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +if(TARGET openvino_tokenizers) + set(OPENVINO_TOKENIZERS_PATH $) +else() + message(FATAL_ERROR "speculative_decoding_lm must be compiled as part of OpenVINOGenAI to have the path to openvino_tokenizers hardcoded.") +endif() +find_package(OpenVINO REQUIRED COMPONENTS Runtime) +find_package(TBB REQUIRED COMPONENTS tbb) +add_executable(speculative_decoding_lm speculative_decoding_lm.cpp) +target_link_libraries(speculative_decoding_lm PRIVATE openvino::runtime TBB::tbb) +target_compile_definitions(speculative_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH="${OPENVINO_TOKENIZERS_PATH}")
+target_compile_features(speculative_decoding_lm PRIVATE cxx_std_17) +install(TARGETS speculative_decoding_lm + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/speculative_decoding_lm/README.md b/samples/cpp/speculative_decoding_lm/README.md new file mode 100644 index 000000000..a89b5a4b2 --- /dev/null +++ b/samples/cpp/speculative_decoding_lm/README.md @@ -0,0 +1,57 @@ +# speculative_decoding_lm C++ sample that supports most popular models like LLaMA 2 + +Speculative decoding (or [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) in HF terminology) is a recent technique that allows speeding up token generation when an additional smaller draft model is used alongside the main model. + +Speculative decoding works the following way. The draft model predicts the next K tokens one by one in an autoregressive manner, while the main model validates these predictions and corrects them if necessary. We go through each predicted token, and if a difference is detected between the draft and main model, we stop and keep the last token predicted by the main model. Then the draft model gets the latest main prediction and again tries to predict the next K tokens, repeating the cycle. + +This approach reduces the need for multiple infer requests to the main model, enhancing performance. For instance, in more predictable parts of text generation, the draft model can, in best-case scenarios, generate the next K tokens that exactly match the target. In that case they are validated in a single inference request to the main model (which is bigger, more accurate but slower) instead of running K subsequent requests. More details can be found in the original papers https://arxiv.org/pdf/2211.17192.pdf and https://arxiv.org/pdf/2302.01318.pdf. + +This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options, in order to encourage the reader to explore and modify the source code. Loading `openvino_tokenizers` to `ov::Core` enables tokenization. Run `optimum-cli` to generate IRs for the samples. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/254-llm-chatbot) which provides an example of an LLM-powered Chatbot in Python. + +## Install OpenVINO + +Install [OpenVINO Archives >= 2024.2](docs.openvino.ai/install). `master` and possibly the latest `releases/*` branch correspond to not-yet-released OpenVINO versions. https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/ can be used for early testing of these branches. `` below refers to the extraction location. + +## Install OpenVINOGenAI + +Follow [../../../src/README.md](../../../src/README.md). + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.
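+
+### How the draft verification works (illustrative)
+
+The acceptance rule described above (keep draft tokens while they agree with the main model, then take the main model's token at the first mismatch) can be sketched independently of OpenVINO. This is a self-contained illustration, not the code used by `speculative_decoding_lm.cpp`; the `main_tokens` vector stands in for the main model's argmax predictions at the draft positions, which the real sample obtains from a single batched inference request:
+
+```cpp
+#include <cstdint>
+#include <iostream>
+#include <vector>
+
+// Verify K draft tokens against the main model's own predictions for the same positions.
+// Returns the accepted tokens: the matching prefix plus the main model's token at the
+// first mismatch, so at least one new token is produced per main-model request.
+std::vector<int64_t> verify_draft(const std::vector<int64_t>& draft_tokens,
+                                  const std::vector<int64_t>& main_tokens) {
+    std::vector<int64_t> accepted;
+    for (size_t i = 0; i < draft_tokens.size() && i < main_tokens.size(); ++i) {
+        if (draft_tokens[i] == main_tokens[i]) {
+            accepted.push_back(draft_tokens[i]);  // match: accept the draft token for free
+        } else {
+            accepted.push_back(main_tokens[i]);   // mismatch: keep the main model's token and stop
+            return accepted;
+        }
+    }
+    return accepted;  // best case: all K draft tokens matched
+}
+
+int main() {
+    // The draft model proposed 5 tokens; the main model agrees on the first three.
+    std::vector<int64_t> draft{11, 22, 33, 44, 55};
+    std::vector<int64_t> main_prediction{11, 22, 33, 40, 50};
+    for (int64_t token : verify_draft(draft, main_prediction)) {
+        std::cout << token << ' ';  // prints: 11 22 33 40
+    }
+    std::cout << '\n';
+}
+```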
+ +#### Linux/macOS + +```sh +source /setupvars.sh +python3 -m pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +optimum-cli export openvino --trust-remote-code --weight-format fp16 --model meta-llama/Llama-2-7b-chat-hf Llama-2-7b-chat-hf +``` + +#### Windows + +```bat +\setupvars.bat +python -m pip install --upgrade-strategy eager -r requirements.txt +optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +optimum-cli export openvino --trust-remote-code --weight-format fp16 --model meta-llama/Llama-2-7b-chat-hf Llama-2-7b-chat-hf +``` + +## Run + +### Usage: +`speculative_decoding_lm ""` + +### Examples: + +#### Linux/MacOS: +`./build/samples/cpp/speculative_decoding_lm/speculative_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ ./Llama-2-7b-chat-hf/ "Why is the Sun yellow?"` + +#### Windows: +`.\build\samples\cpp\speculative_decoding_lm\Release\speculative_decoding_lm .\TinyLlama-1.1B-Chat-v1.0\ .\Llama-2-7b-chat-hf\ "Why is the Sun yellow?"` + +To enable Unicode characters for Windows cmd, open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. + +Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. + +See [../../../src/README.md#supported-models](../../../src/README.md#supported-models) for the list of supported models. diff --git a/text_generation/causal_lm/cpp/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp similarity index 100% rename from text_generation/causal_lm/cpp/speculative_decoding_lm.cpp rename to samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp diff --git a/src/README.md b/src/README.md index 854908684..e80947a3c 100644 --- a/src/README.md +++ b/src/README.md @@ -163,3 +163,84 @@ int main(int argc, char* argv[]) { std::cout << pipe.generate("The Sun is yellow bacause", custom_streamer); } ``` + +## How it works + +### Stateful LLM + +A common LLM inference optimisation is the introduction of a past KV (key/value) cache. This cache is represented by the corresponding inputs and outputs in a model originally implemented in a DL framework (e.g. PyTorch models from HuggingFace). To optimize it further and simplify usage, the model is transformed to a stateful form. This transformation improves inference performance and decreases the amount of allocated runtime memory in long-running text generation scenarios. It is achieved by hiding inputs and outputs of the model that represent past KV-cache tensors and handling them inside the model in a more efficient way. The cache is still accessible through the state API. This is opposed to the stateless model approach, which requires manipulating these inputs and outputs explicitly. An introduction to stateful models can be found in https://docs.openvino.ai/2023.3/openvino_docs_OV_UG_stateful_models_intro.html. + +Hiding the KV-cache introduces a peculiarity for the beam search algorithm. Beam search suggests batched inference of multiple beams. The design described here so far would result in generating multiple independent sequences of tokens. The beam search algorithm, on the other hand, requires removing some of the ongoing beams and splitting other beams into multiple branches.
Beam removal requires deleting the corresponding KV-cache entry, and beam splitting requires copying the corresponding KV-cache values. + +To provide the possibility to implement beam search without accessing the model's internal state, a stateful LLM converted with `optimum-intel` or [llm_bench](../llm_bench/python/) introduces an additional 1-dimensional `beam_idx` input. `beam_idx` must contain indexes of the batch elements that are intended to be selected and to evolve during the next beam search iteration. There's only one beam when the generation starts. That beam corresponds to the initial prompt. `beam_idx` must have the values `[0, 0]` to keep the initial beam and introduce its copy. The dynamic batch size makes it possible to change the number of beams dynamically. `beam_idx` must have `[1]` as the value to remove the zeroth sequence and keep the second beam only. + +Assume there are two running beams. To proceed with generating both beams at the next iteration, `beam_idx` values must be `[0, 1]`, pointing to batch elements `0` and `1`. To drop the last beam and split the other beam in two, `beam_idx` must be set to `[0, 0]`. This results in utilizing only the part of the KV cache corresponding to the zeroth element in the batch. The process of selecting the proper entries in the cache is called Cache Reorder. + +![](beam_idx-fork.gif) +![](beam_idx-drop.gif) + +The images below represent stateless and stateful LLM pipelines. The model has 4 inputs: +1. `input_ids` contains the next selected token +2. `attention_mask` is filled with `1` +3. `position_ids` encodes the position of the token currently being generated in the sequence +4. `beam_idx` selects beams + +The model has one output, `logits`, describing the predicted distribution over the next tokens, and there is a KV-cache state. + +![](stateless.jpg) +![](stateful.jpg) + +## Supported models + +1. chatglm + 1. https://huggingface.co/THUDM/chatglm2-6b - refer to + [chatglm2-6b - AttributeError: can't set attribute](../llm_bench/python/doc/NOTES.md#chatglm2-6b---attributeerror-cant-set-attribute) + in case of `AttributeError` + 2. https://huggingface.co/THUDM/chatglm3-6b +2. LLaMA 2 (requires access request submission on its Hugging Face page to be downloaded) + 1. https://huggingface.co/meta-llama/Llama-2-13b-chat-hf + 2. https://huggingface.co/meta-llama/Llama-2-13b-hf + 3. https://huggingface.co/meta-llama/Llama-2-7b-chat-hf + 4. https://huggingface.co/meta-llama/Llama-2-7b-hf + 5. https://huggingface.co/meta-llama/Llama-2-70b-chat-hf + 6. https://huggingface.co/meta-llama/Llama-2-70b-hf +3. [Llama2-7b-WhoIsHarryPotter](https://huggingface.co/microsoft/Llama2-7b-WhoIsHarryPotter) +4. OpenLLaMA + 1. https://huggingface.co/openlm-research/open_llama_13b + 2. https://huggingface.co/openlm-research/open_llama_3b + 3. https://huggingface.co/openlm-research/open_llama_3b_v2 + 4. https://huggingface.co/openlm-research/open_llama_7b + 5. https://huggingface.co/openlm-research/open_llama_7b_v2 +5. [TinyLlama](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0) +6. Qwen + 1. https://huggingface.co/Qwen/Qwen-7B-Chat + 2. https://huggingface.co/Qwen/Qwen-7B-Chat-Int4 - refer to + [Qwen-7B-Chat-Int4 - Torch not compiled with CUDA enabled](../llm_bench/python/doc/NOTES.md#qwen-7b-chat-int4---torch-not-compiled-with-cuda-enabled) + in case of `AssertionError` + 3. https://huggingface.co/Qwen/Qwen1.5-7B-Chat + 4. https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GPTQ-Int4 +7. Dolly + 1. https://huggingface.co/databricks/dolly-v2-3b +8. Phi + 1.
https://huggingface.co/microsoft/phi-2 + 2. https://huggingface.co/microsoft/phi-1_5 +9. [notus-7b-v1](https://huggingface.co/argilla/notus-7b-v1) +10. [zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) +11. [redpajama-3b-chat](https://huggingface.co/ikala/redpajama-3b-chat) +12. [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) +13. [Gemma-2B-it](https://huggingface.co/google/gemma-2b-it) + +This pipeline can work with other similar topologies produced by `optimum-intel` with the same model signature. After the conversion, the model is required to have the following inputs: +1. `input_ids` contains the tokens +2. `attention_mask` is filled with `1` +3. `beam_idx` selects beams +4. `position_ids` (optional) encodes the position of the token currently being generated in the sequence + +and a single `logits` output. + +Some models may require access request submission on their Hugging Face page to be downloaded. + +If https://huggingface.co/ is down, the conversion step won't be able to download the models. + +> [!NOTE] +> Models should belong to the same family and have the same tokenizers. diff --git a/text_generation/causal_lm/cpp/beam_idx-drop.gif b/src/beam_idx-drop.gif similarity index 100% rename from text_generation/causal_lm/cpp/beam_idx-drop.gif rename to src/beam_idx-drop.gif diff --git a/text_generation/causal_lm/cpp/beam_idx-fork.gif b/src/beam_idx-fork.gif similarity index 100% rename from text_generation/causal_lm/cpp/beam_idx-fork.gif rename to src/beam_idx-fork.gif diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 22347582c..e8451dbff 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -27,7 +27,7 @@ ov::genai::GenerationConfig from_config_json_if_exists(const std::filesystem::pa ov::genai::GenerationConfig config; if (std::filesystem::exists(model_path / generation_config_fname)) { - config = ov::genai::GenerationConfig(model_path / generation_config_fname); + config = ov::genai::GenerationConfig((model_path / generation_config_fname).string()); } // if eos_token_ids is undefined try to load it from config.json @@ -404,7 +404,7 @@ ov::genai::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( const ov::AnyMap& config ): m_model_runner{ov::Core{}.compile_model(path / "openvino_model.xml", device, config).create_infer_request()}, - m_tokenizer(path), + m_tokenizer(path.string()), m_generation_config{from_config_json_if_exists(path)}, m_chat_template{chat_template_from_tokenizer_json_if_exists(path)} { diff --git a/text_generation/causal_lm/cpp/stateful.jpg b/src/stateful.jpg similarity index 100% rename from text_generation/causal_lm/cpp/stateful.jpg rename to src/stateful.jpg diff --git a/text_generation/causal_lm/cpp/stateless.jpg b/src/stateless.jpg similarity index 100% rename from text_generation/causal_lm/cpp/stateless.jpg rename to src/stateless.jpg diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt deleted file mode 100644 index afdd4c48b..000000000 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright (C) 2023-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -cmake_minimum_required(VERSION 3.15) -project(causal_lm) - -if(TARGET openvino_tokenizers) - set(OPENVINO_TOKENIZERS_PATH $) -else() - set(OPENVINO_TOKENIZERS_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../bin/openvino_tokenizers.dll) # TODO: I'll go away after the generate() gets a way to find openvino_tokenizers -endif() -
-find_package(OpenVINOGenAI REQUIRED PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. -) - -add_executable(greedy_causal_lm greedy_causal_lm.cpp) -target_link_libraries(greedy_causal_lm PRIVATE openvino::genai) -set_target_properties(greedy_causal_lm PROPERTIES CXX_STANDARD 17) -set_target_properties(greedy_causal_lm PROPERTIES CXX_STANDARD_REQUIRED ON) - -add_executable(beam_search_causal_lm beam_search_causal_lm.cpp) -target_link_libraries(beam_search_causal_lm PRIVATE openvino::genai) -set_target_properties(beam_search_causal_lm PROPERTIES CXX_STANDARD 17) -set_target_properties(beam_search_causal_lm PROPERTIES CXX_STANDARD_REQUIRED ON) - -add_executable(speculative_decoding_lm speculative_decoding_lm.cpp) -target_compile_definitions(speculative_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH="${OPENVINO_TOKENIZERS_PATH}") -target_include_directories(speculative_decoding_lm PRIVATE ./) -find_package(OpenVINO REQUIRED COMPONENTS Runtime) -target_link_libraries(speculative_decoding_lm PRIVATE openvino::runtime) -set_target_properties(speculative_decoding_lm PROPERTIES CXX_STANDARD 17) -set_target_properties(speculative_decoding_lm PROPERTIES CXX_STANDARD_REQUIRED ON) -find_package(TBB REQUIRED COMPONENTS tbb) -target_link_libraries(speculative_decoding_lm PRIVATE TBB::tbb) - -add_executable(prompt_lookup_decoding_lm prompt_lookup_decoding_lm.cpp) -target_compile_definitions(prompt_lookup_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH="${OPENVINO_TOKENIZERS_PATH}") -target_include_directories(prompt_lookup_decoding_lm PRIVATE ./) -find_package(OpenVINO REQUIRED COMPONENTS Runtime) -target_link_libraries(prompt_lookup_decoding_lm PRIVATE openvino::runtime) -set_target_properties(prompt_lookup_decoding_lm PROPERTIES CXX_STANDARD 17) -set_target_properties(prompt_lookup_decoding_lm PROPERTIES CXX_STANDARD_REQUIRED ON) -find_package(TBB REQUIRED COMPONENTS tbb) -target_link_libraries(prompt_lookup_decoding_lm PRIVATE TBB::tbb) - -add_executable(chat_sample chat_sample.cpp) -target_link_libraries(chat_sample PRIVATE openvino::genai) -target_include_directories(chat_sample PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") -set_target_properties(chat_sample PROPERTIES CXX_STANDARD 17) -set_target_properties(chat_sample PROPERTIES CXX_STANDARD_REQUIRED ON) - -add_executable(multinomial_causal_lm multinomial_causal_lm.cpp) -target_link_libraries(multinomial_causal_lm PRIVATE openvino::genai) -target_include_directories(multinomial_causal_lm PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") -set_target_properties(multinomial_causal_lm PROPERTIES CXX_STANDARD 17) -set_target_properties(multinomial_causal_lm PROPERTIES CXX_STANDARD_REQUIRED ON) - -install(TARGETS greedy_causal_lm beam_search_causal_lm speculative_decoding_lm prompt_lookup_decoding_lm chat_sample multinomial_causal_lm - RUNTIME DESTINATION samples_bin/ - COMPONENT samples_bin - EXCLUDE_FROM_ALL)