diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 9ab4587c2..789167949 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -1,5 +1,9 @@ version: 2 updates: + - package-ecosystem: "pip" + directory: "./" + schedule: + interval: "weekly" - package-ecosystem: "pip" directory: "image_generation/stable_diffusion_1_5/cpp/scripts/" schedule: diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index df03bab7c..b86f49af3 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -23,21 +23,21 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model openlm-research/open_llama_3b_v2 open_llama_3b_v2 - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - name: greedy_causal_lm run: | source ./ov/setupvars.sh - ./build/greedy_causal_lm ./open_llama_3b_v2/ "return 0" + ./build/text_generation/causal_lm/cpp/greedy_causal_lm ./open_llama_3b_v2/ "return 0" cpp-beam_search_causal_lm-ubuntu: runs-on: ubuntu-20.04 @@ -51,22 +51,22 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - name: Compare run: | source ./ov/setupvars.sh - timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" 
> ./pred.txt + timeout 25s ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -74,7 +74,7 @@ jobs: tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') tokenized = tokenizer('Why is the Sun yellow?', return_tensors='pt') for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): - ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref=}" from predictions') @@ -82,7 +82,7 @@ jobs: " echo "Why is the Sun yellow?" passed - timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ 69 > ./pred.txt + timeout 25s ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ 69 > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -90,7 +90,7 @@ jobs: tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') tokenized = tokenizer('69', return_tensors='pt') for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): - ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref=}" from predictions') @@ -98,7 +98,7 @@ jobs: " echo "69" passed - timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ Hi > ./pred.txt + timeout 25s ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ Hi > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -106,7 +106,7 @@ jobs: tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') tokenized = tokenizer('Hi', return_tensors='pt') for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): - ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref=}" from predictions') @@ -114,7 +114,7 @@ jobs: " echo "Hi" passed - timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "return 0" > ./pred.txt + timeout 25s ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "return 0" > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ 
-122,7 +122,7 @@ jobs: tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') tokenized = tokenizer('return 0', return_tensors='pt') for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): - ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref=}" from predictions') @@ -130,7 +130,7 @@ jobs: " echo "return 0" passed - ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "你好! 你好嗎?" > ./pred.txt + ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "你好! 你好嗎?" > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -138,7 +138,7 @@ jobs: tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') tokenized = tokenizer('你好! 你好嗎?', return_tensors='pt') for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): - ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref=}" from predictions') @@ -146,7 +146,7 @@ jobs: " echo "你好! 你好嗎?" passed - timeout 1m ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Alan Turing was a" "return 0" "你好! 你好嗎?" > ./pred.txt + timeout 1m ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "Alan Turing was a" "return 0" "你好! 你好嗎?" 
> ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -160,7 +160,7 @@ jobs: for prompt in prompts: tokenized = tokenizer(prompt, return_tensors='pt') for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): - ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref=}" from predictions') @@ -179,29 +179,30 @@ jobs: - name: Install OpenVINO shell: bash run: | - curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/windows/w_openvino_toolkit_windows_2024.1.0.15008.f4afc983258_x86_64.zip + curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/windows/w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64.zip unzip ov.zip - name: Download, convert and build shell: cmd run: | - call w_openvino_toolkit_windows_2024.1.0.15008.f4afc983258_x86_64\setupvars.bat + call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - name: Compare shell: cmd run: | - call w_openvino_toolkit_windows_2024.1.0.15008.f4afc983258_x86_64\setupvars.bat - - .\build\Release\beam_search_causal_lm.exe .\TinyLlama-1.1B-Chat-v1.0\ "69" > .\pred.txt + call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat + set PATH=.\build\openvino_genai\;%PATH% + .\build\text_generation\causal_lm\cpp\Release\beam_search_causal_lm.exe .\TinyLlama-1.1B-Chat-v1.0\ "69" > .\pred.txt + echo import transformers > ref.py echo predictions = open('pred.txt', 'r').read() >> ref.py echo tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') >> ref.py echo tokenized = tokenizer('69', return_tensors='pt') >> ref.py echo for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): >> ref.py - echo ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' >> ref.py + echo ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) >> ref.py echo idx = predictions.find(ref) >> ref.py echo if -1 == idx: >> ref.py echo raise RuntimeError(f'Missing "{ref=}" from predictions') >> ref.py @@ -220,21 +221,21 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - 
curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - name: Compare run: | source ./ov/setupvars.sh - timeout 50s ./build/beam_search_causal_lm ./Qwen-7B-Chat/ 69 > ./pred.txt + timeout 50s ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./Qwen-7B-Chat/ 69 > ./pred.txt cpp-beam_search_causal_lm-Qwen1_5-7B-Chat: runs-on: ubuntu-20.04-16-cores @@ -248,21 +249,21 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen1.5-7B-Chat Qwen1.5-7B-Chat - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - name: Run run: | source ./ov/setupvars.sh - timeout 50s ./build/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好!" > ./pred_qwen15.txt + timeout 50s ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好!" 
> ./pred_qwen15.txt cpp-beam_search_causal_lm-Phi-2: runs-on: ubuntu-20.04-16-cores @@ -276,21 +277,21 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-2 phi-2 - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j 15 - name: Compare run: | source ./ov/setupvars.sh - timeout 50s ./build/beam_search_causal_lm ./phi-2/ 69 > ./pred.txt + timeout 50s ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./phi-2/ 69 > ./pred.txt cpp-beam_search_causal_lm-notus-7b-v1: runs-on: ubuntu-20.04-16-cores @@ -304,21 +305,21 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model argilla/notus-7b-v1 notus-7b-v1 - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - name: Compare run: | source ./ov/setupvars.sh - timeout 50s ./build/beam_search_causal_lm ./notus-7b-v1/ 69 > ./pred.txt + timeout 50s ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./notus-7b-v1/ 69 > ./pred.txt cpp-speculative_decoding_lm-ubuntu: runs-on: ubuntu-20.04-16-cores @@ -332,23 +333,23 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl 
https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - name: run and compare run: | source ./ov/setupvars.sh - ./build/speculative_decoding_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_speculative.txt - ./build/greedy_causal_lm ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt + ./build/text_generation/causal_lm/cpp/speculative_decoding_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_speculative.txt + ./build/text_generation/causal_lm/cpp/greedy_causal_lm ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() @@ -371,16 +372,16 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - name: run and compare run: | @@ -393,8 +394,8 @@ jobs: Question: Can you please add 2 and 3 A:' > ./prompt.txt - ./build/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_prompt_lookup.txt - ./build/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_greedy.txt + ./build/text_generation/causal_lm/cpp/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_prompt_lookup.txt + ./build/text_generation/causal_lm/cpp/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_greedy.txt python -c " with 
open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() @@ -416,22 +417,22 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-1_5 phi-1_5 - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j 15 - name: Run Generation run: | source ./ov/setupvars.sh - timeout 50s ./build/greedy_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_greedy.txt - timeout 50s ./build/beam_search_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_beam.txt + timeout 50s ./build/text_generation/causal_lm/cpp/greedy_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_greedy.txt + timeout 50s ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_beam.txt - name: Compare run: | python -c " @@ -441,7 +442,7 @@ jobs: tokenizer = transformers.AutoTokenizer.from_pretrained('microsoft/phi-1_5') tokenized = tokenizer('Alan Turing was a', return_tensors='pt') for output in transformers.AutoModelForCausalLM.from_pretrained('microsoft/phi-1_5').generate(**tokenized, max_length=100, do_sample=False): - ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref=}" from predictions') @@ -461,22 +462,22 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model ikala/redpajama-3b-chat redpajama-3b-chat - cmake 
-DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - run: source ./ov/setupvars.sh && convert_tokenizer ./redpajama-3b-chat/ --output ./redpajama-3b-chat/ --with-detokenizer --trust-remote-code - name: Run Generation run: | source ./ov/setupvars.sh - timeout 50s ./build/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt + timeout 50s ./build/text_generation/causal_lm/cpp/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt - name: Compare run: | python -c " @@ -486,7 +487,7 @@ jobs: tokenizer = transformers.AutoTokenizer.from_pretrained('ikala/redpajama-3b-chat') tokenized = tokenizer('Alan Turing was a', return_tensors='pt') for output in transformers.AutoModelForCausalLM.from_pretrained('ikala/redpajama-3b-chat').generate(**tokenized, max_length=100, do_sample=False): - ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref}" from predictions') diff --git a/.github/workflows/genai_package.yml b/.github/workflows/genai_package.yml new file mode 100644 index 000000000..ea9119dbc --- /dev/null +++ b/.github/workflows/genai_package.yml @@ -0,0 +1,63 @@ +name: genai_package +on: pull_request +jobs: + ubuntu_genai_package: + strategy: + matrix: + build-type: [Release, Debug] + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: 3.8 + - run: mkdir ./ov/ + - run: curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh + - run: sudo apt-get install libtbb-dev + - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ + - run: source ./ov/setupvars.sh && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j + - run: source ./ov/setupvars.sh && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix ov + - run: ov/samples/cpp/build_samples.sh -i ${{ github.workspace }}/s\ pace + if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build + - run: source ./ov/setupvars.sh && python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt + if: ${{ 'Release' == matrix.build-type }} + - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + if: ${{ 'Release' == matrix.build-type }} + - run: source ./ov/setupvars.sh && optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 + if: ${{ 'Release' == matrix.build-type }} + - run: source ./ov/setupvars.sh && timeout 50s ${{ github.workspace }}/s\ pace/samples_bin/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "" + if: ${{ 'Release' == matrix.build-type }} + + windows_genai_package: + strategy: + matrix: + build-type: [Release, Debug] + runs-on: windows-latest + defaults: + run: + shell: cmd + steps: + - uses: 
actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: 3.8 + - run: curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/windows/w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64.zip + - run: unzip ov.zip + - run: call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ + - run: call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j + - run: call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64 + - run: call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\samples\cpp\build_samples_msvc.bat -i "${{ github.workspace }}/samples_install" + if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build + - run: call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat && python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt + if: ${{ 'Release' == matrix.build-type }} + - run: call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + if: ${{ 'Release' == matrix.build-type }} + - run: call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat && optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 + if: ${{ 'Release' == matrix.build-type }} + - run: call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat && "${{ github.workspace }}/samples_install/samples_bin/greedy_causal_lm" .\TinyLlama-1.1B-Chat-v1.0\ "" + if: ${{ 'Release' == matrix.build-type }} diff --git a/.github/workflows/genai_python_lib.yml b/.github/workflows/genai_python_lib.yml new file mode 100644 index 000000000..ece3b113e --- /dev/null +++ b/.github/workflows/genai_python_lib.yml @@ -0,0 +1,62 @@ +name: genai_python_lib +on: pull_request +jobs: + ubuntu_genai_python_lib: + # A tokenizers' dependency fails to compile on ubuntu-20 n CenOS7 env + runs-on: ubuntu-22.04 + env: + # A tokenizers' dependency fails to compile with Ninja in CenOS7 env + CMAKE_GENERATOR: Unix Makefiles + CMAKE_BUILD_PARALLEL_LEVEL: null + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: 3.8 + - run: mkdir ./ov/ + - run: curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_centos7_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz # Install CentOS7 instead of Ubuntu to match PyPI distribution ABI + - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh + - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + - run: source ./ov/setupvars.sh && cmake --build ./build/ --config Release -j + # GitHub Actions already provides what is listed in ./requirements-build.txt but the internal + # build system doesn't. Install ./requirements-build.txt to detect possible conflicts. 
+ - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./requirements-build.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --verbose --verbose --verbose + - run: source ./ov/setupvars.sh && PYTHONPATH=./build/ python -c "from openvino_genai import LLMPipeline" + - run: source ./ov/setupvars.sh && python -m pip install . --config-settings=build-dir="build" --verbose --verbose --verbose + - run: python -c "from openvino_genai import LLMPipeline" + - name: GenAI Python API tests + run: | + cd ./tests/python_tests/ + python -m pip install -r requirements.txt + models=$(python list_test_models.py) + echo "$models" | while read -r model_name model_path; do + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model "$model_name" "$model_path" + done + GENAI_BUILD_DIR=../../build python -m pytest test_generate_api.py + + windows_genai_python_lib: + runs-on: windows-latest + env: + CMAKE_BUILD_PARALLEL_LEVEL: null + defaults: + run: + shell: cmd + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: 3.8 + - run: curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/windows/w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64.zip + - run: unzip ov.zip + # GitHub Actions already provides what is listed in ./requirements-build.txt but the internal + # build system doesn't. Install ./requirements-build.txt to detect possible conflicts. + - run: call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./requirements-build.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --verbose --verbose --verbose + - run: call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat && python -m pip install . --verbose --verbose --verbose # --verbose is additive, and can be used up to 3 times. + - run: python -c "from openvino_genai import LLMPipeline" + - run: call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + - run: call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat && cmake --build ./build/ --config Release -j + - run: set "PYTHONPATH=./build/" && call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat && python -c "from openvino_genai import LLMPipeline" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. 
diff --git a/.github/workflows/lcm_dreamshaper_cpp.yml b/.github/workflows/lcm_dreamshaper_cpp.yml index de0615357..17ad925b5 100644 --- a/.github/workflows/lcm_dreamshaper_cpp.yml +++ b/.github/workflows/lcm_dreamshaper_cpp.yml @@ -40,15 +40,15 @@ jobs: run: | conda activate openvino_lcm_cpp conda update -c conda-forge --all - conda install -c conda-forge openvino=2024.1.0 c-compiler cxx-compiler git make cmake + conda install -c conda-forge -c conda-forge/label/openvino_dev openvino==2024.2.0.dev20240513 c-compiler cxx-compiler git make cmake conda env config vars set LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH - name: Install python dependencies working-directory: ${{ env.working_directory }} run: | conda activate openvino_lcm_cpp - python -m pip install -r requirements.txt python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] + python -m pip install -r requirements.txt - name: Download and convert model and tokenizer working-directory: ${{ env.working_directory }} @@ -85,15 +85,15 @@ jobs: run: | conda activate openvino_lcm_cpp conda update -c conda-forge --all - conda install -c conda-forge openvino=2024.1.0 c-compiler cxx-compiler git make cmake + conda install -c conda-forge -c conda-forge/label/openvino_dev openvino==2024.2.0.dev20240513 c-compiler cxx-compiler git make cmake conda env config vars set LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH - name: Install python dependencies working-directory: ${{ env.working_directory }} run: | conda activate openvino_lcm_cpp - python -m pip install -r requirements.txt python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] + python -m pip install -r requirements.txt - name: Download and convert model and tokenizer working-directory: ${{ env.working_directory }} diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml index 38a2022e1..db28fad79 100644 --- a/.github/workflows/stable_diffusion_1_5_cpp.yml +++ b/.github/workflows/stable_diffusion_1_5_cpp.yml @@ -39,15 +39,15 @@ jobs: - name: Install OpenVINO and other conda dependencies run: | conda activate openvino_sd_cpp - conda install -c conda-forge openvino=2024.1.0 c-compiler cxx-compiler git make cmake + conda install -c conda-forge -c conda-forge/label/openvino_dev openvino==2024.2.0.dev20240513 c-compiler cxx-compiler git make cmake conda env config vars set LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH - name: Install python dependencies working-directory: ${{ env.working_directory }} run: | conda activate openvino_sd_cpp - python -m pip install -r requirements.txt python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] + python -m pip install -r requirements.txt - name: Download and convert model and tokenizer working-directory: ${{ env.working_directory }} @@ -83,14 +83,14 @@ jobs: - name: Install OpenVINO and other conda dependencies run: | conda activate openvino_sd_cpp - conda install -c conda-forge openvino=2024.1.0 c-compiler cxx-compiler git make cmake + conda install -c conda-forge -c conda-forge/label/openvino_dev openvino==2024.2.0.dev20240513 c-compiler cxx-compiler git make cmake - name: Install python dependencies working-directory: ${{ env.working_directory }} run: | conda activate openvino_sd_cpp - python -m pip install -r requirements.txt python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] + python -m pip install -r requirements.txt - name: Download and convert model and tokenizer working-directory: ${{ 
env.working_directory }} diff --git a/.gitignore b/.gitignore index ae479f4fa..5c88a00fd 100644 --- a/.gitignore +++ b/.gitignore @@ -38,4 +38,6 @@ CMakeUserPresets.json # Python-specific *.?env* *.pyc -__pycache__ \ No newline at end of file +__pycache__ + +*.so \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000..0148ca6dd --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,55 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +cmake_minimum_required(VERSION 3.15) + +# Multi config generators such as Visual Studio ignore CMAKE_BUILD_TYPE. Multi config generators are configured with +# CMAKE_CONFIGURATION_TYPES, but limiting options in it completely removes such build options +get_property(GENERATOR_IS_MULTI_CONFIG_VAR GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG) +if(NOT GENERATOR_IS_MULTI_CONFIG_VAR AND NOT DEFINED CMAKE_BUILD_TYPE) + message(STATUS "CMAKE_BUILD_TYPE is not defined, 'Release' will be used") + # Setting CMAKE_BUILD_TYPE as CACHE must go before project(). Otherwise project() sets its value and set() doesn't take an effect + set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel ...") +endif() + +project(OpenVINOGenAI VERSION 2024.2.0.0) + +add_subdirectory(./thirdparty/openvino_tokenizers/ "${CMAKE_CURRENT_BINARY_DIR}/openvino_tokenizers/") +# Put binaries to a single dir to mimic package structure. +set_target_properties(openvino_tokenizers PROPERTIES + # Generator expressions to disable appending a per-configuration subdirectory (Release, Debug). + # ARCHIVE_OUTPUT is irrelevant. It's here just to keep all the artifacts in one place. + ARCHIVE_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" +) +if(TARGET core_tokenizers) + set_target_properties(core_tokenizers PROPERTIES + ARCHIVE_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + ) +else() + # Prebuilt dependencies + if(WIN32) + set(extra_libs "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/lib/core_tokenizers.dll" + "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/third_party/lib/icudt70.dll" + "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/third_party/lib/icuuc70.dll") + elseif(LINUX) + set(extra_libs "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/lib/libcore_tokenizers.so") + elseif(APPLE) + set(extra_libs "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/lib/libcore_tokenizers.dylib") + endif() + add_custom_command(OUTPUT "${extra_libs}" + COMMAND "${CMAKE_COMMAND}" -E copy "${extra_libs}" "${CMAKE_BINARY_DIR}/openvino_genai/" + DEPENDS openvino_tokenizers) +endif() +add_subdirectory(src) +add_subdirectory(text_generation/causal_lm/cpp) + +install(DIRECTORY text_generation/causal_lm/cpp/ DESTINATION samples/cpp/causal_lm COMPONENT cpp_samples_genai) +install(FILES LICENSE DESTINATION licensing COMPONENT licensing_genai RENAME LICENSE-GENAI) +install(FILES third-party-programs.txt DESTINATION licensing COMPONENT licensing_genai RENAME third-party-programs-genai.txt) +set(CPACK_GENERATOR "ZIP") +include(CPack) diff --git a/image_generation/lcm_dreamshaper_v7/cpp/README.md b/image_generation/lcm_dreamshaper_v7/cpp/README.md index 6077b8a1c..a7b19ae4e 100644 --- 
a/image_generation/lcm_dreamshaper_v7/cpp/README.md +++ b/image_generation/lcm_dreamshaper_v7/cpp/README.md @@ -18,7 +18,7 @@ Prepare a python environment and install dependencies: conda create -n openvino_lcm_cpp python==3.10 conda activate openvino_lcm_cpp conda update -c conda-forge --all -conda install -c conda-forge openvino=2024.1.0 c-compiler cxx-compiler git make cmake +conda install -c conda-forge openvino=2024.2.0 c-compiler cxx-compiler git make cmake # Ensure that Conda standard libraries are used conda env config vars set LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH ``` diff --git a/image_generation/stable_diffusion_1_5/cpp/README.md b/image_generation/stable_diffusion_1_5/cpp/README.md index fb01326ea..81ccd0c29 100644 --- a/image_generation/stable_diffusion_1_5/cpp/README.md +++ b/image_generation/stable_diffusion_1_5/cpp/README.md @@ -18,7 +18,7 @@ Prepare a python environment and install dependencies: ```shell conda create -n openvino_sd_cpp python==3.10 conda activate openvino_sd_cpp -conda install -c conda-forge openvino=2024.1.0 c-compiler cxx-compiler git make cmake +conda install -c conda-forge openvino=2024.2.0 c-compiler cxx-compiler git make cmake # Ensure that Conda standard libraries are used conda env config vars set LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH ``` diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..dbab15506 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,41 @@ +[project] +name = "openvino_genai" +version = "2024.2.0.0" +description = "Python bindings for https://github.com/openvinotoolkit/openvino.genai" +requires-python = ">=3.8" +readme = {file = "text_generation/causal_lm/cpp/README.md", content-type="text/markdown"} +license = {text = "OSI Approved :: Apache Software License"} +authors = [ + { name = "OpenVINO Developers", email = "openvino@intel.com" }, +] +classifiers = [ + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +dependencies = [ + "openvino_tokenizers~=2024.2.0.0" +] + +[tool.scikit-build] +cmake.source-dir = "./" +cmake.build-type = "Release" +install.components = ["wheel_genai"] +sdist.cmake = true +wheel.packages = ["src/python/openvino_genai"] +wheel.install-dir = "openvino_genai" +wheel.build-tag = "000" +wheel.license-files = ["LICENSE", "SECURITY.md", "third-party-programs.txt"] + +[[tool.scikit-build.generate]] +path = "openvino_genai/__version__.py" +template = ''' +__version__ = "${version}" +''' + +[build-system] +# TODO: add build.tool-args = ["--parallel"] after scikit-build-core is updated to 0.9.4+. 
+requires = ["scikit-build-core~=0.8.0", "cmake~=3.23"] # See https://github.com/openvinotoolkit/openvino_tokenizers/pull/123 +build-backend = "scikit_build_core.build" diff --git a/requirements-build.txt b/requirements-build.txt new file mode 100644 index 000000000..d75687fa2 --- /dev/null +++ b/requirements-build.txt @@ -0,0 +1 @@ +build~=1.2.1 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 000000000..d15483687 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,13 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +# Find OpenVINODeveloperPackage first to compile with SDL flags +find_package(OpenVINODeveloperPackage QUIET + PATHS "${OpenVINO_DIR}") +if(NOT OpenVINODeveloperPackage_FOUND) + find_package(OpenVINO REQUIRED COMPONENTS Runtime) +endif() + +add_subdirectory(cpp) +add_subdirectory(python) diff --git a/src/README.md b/src/README.md new file mode 100644 index 000000000..06a649a75 --- /dev/null +++ b/src/README.md @@ -0,0 +1,163 @@ +# OpenVINO Generate API + +## Usage + +First of all you need to convert your model with optimum-cli +``` sh +optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weight-format fp16 --trust-remote-code "TinyLlama-1.1B-Chat-v1.0" +pip install openvino-genai +``` + +`LLMPipeline` is the main object used for decoding. You can construct it straight away from the folder with the converted model. It will automatically load the main model, tokenizer, detokenizer and default generation configuration. + +### Python + +A minimalist example: +```python +import openvino_genai as ov_genai +pipe = ov_genai.LLMPipeline(model_path, "CPU") +print(pipe.generate("The Sun is yellow bacause")) +``` + +Calling generate with custom generation config parameters, e.g. config for grouped beam search +```python +import openvino_genai as ov_genai +pipe = ov_genai.LLMPipeline(model_path, "CPU") + +result = pipe.generate("The Sun is yellow bacause", max_new_tokens=30, num_groups=3, group_size=5, diversity_penalty=1.5) +print(result) +``` + +output: +``` +'it is made up of carbon atoms. The carbon atoms are arranged in a linear pattern, which gives the yellow color. 
The arrangement of carbon atoms in' +``` + +A simple chat in Python: +```python +import openvino_genai as ov_genai +pipe = ov_genai.LLMPipeline(model_path) + +config = {'num_groups': 3, 'group_size': 5, 'diversity_penalty': 1.5} +pipe.set_generation_config(config) + +pipe.start_chat() +while True: +    print('question:') +    prompt = input() +    if prompt == 'Stop!': +        break +    print(pipe(prompt)) +pipe.finish_chat() +``` + +Test to compare with Huggingface outputs + +### C++ + +A minimalist example: +```cpp +#include "openvino/genai/llm_pipeline.hpp" +#include <iostream> + +int main(int argc, char* argv[]) { + std::string model_path = argv[1]; + ov::genai::LLMPipeline pipe(model_path, "CPU"); + std::cout << pipe.generate("The Sun is yellow because"); +} +``` + +Using Group Beam Search Decoding: +```cpp +#include "openvino/genai/llm_pipeline.hpp" +#include <iostream> + +int main(int argc, char* argv[]) { + std::string model_path = argv[1]; + ov::genai::LLMPipeline pipe(model_path, "CPU"); + + ov::genai::GenerationConfig config = pipe.get_generation_config(); + config.max_new_tokens = 256; + config.num_groups = 3; + config.group_size = 5; + config.diversity_penalty = 1.0f; + + std::cout << pipe.generate("The Sun is yellow because", config); +} +``` + +A simple chat in C++ using grouped beam search decoding: +``` cpp +#include "openvino/genai/llm_pipeline.hpp" +#include <iostream> + +int main(int argc, char* argv[]) { + std::string prompt; + + std::string model_path = argv[1]; + ov::genai::LLMPipeline pipe(model_path, "CPU"); + + ov::genai::GenerationConfig config = pipe.get_generation_config(); + config.max_new_tokens = 256; + config.num_groups = 3; + config.group_size = 5; + config.diversity_penalty = 1.0f; + + pipe.start_chat(); + for (;;) { + std::cout << "question:\n"; + std::getline(std::cin, prompt); + if (prompt == "Stop!") + break; + + std::cout << "answer:\n"; + auto answer = pipe(prompt, config); + std::cout << answer << std::endl; + } + pipe.finish_chat(); +} +``` + +Streaming example with a lambda function: +``` cpp +#include "openvino/genai/llm_pipeline.hpp" +#include <iostream> + +int main(int argc, char* argv[]) { + std::string model_path = argv[1]; + ov::genai::LLMPipeline pipe(model_path, "CPU"); + + auto streamer = [](std::string word) { std::cout << word << std::flush; }; + std::cout << pipe.generate("The Sun is yellow because", streamer); +} +``` + +Streaming with a custom class: +``` cpp +#include "openvino/genai/streamer_base.hpp" +#include "openvino/genai/llm_pipeline.hpp" +#include <iostream> + +class CustomStreamer: public ov::genai::StreamerBase { +public: + void put(int64_t token) { + /* custom decoding/tokens processing code + tokens_cache.push_back(token); + std::string text = m_tokenizer.decode(tokens_cache); + ...
+ */ + }; + + void end() { + /* custom finalization */ + }; +}; + +int main(int argc, char* argv[]) { + CustomStreamer custom_streamer; + + std::string model_path = argv[1]; + ov::genai::LLMPipeline pipe(model_path, "CPU"); + std::cout << pipe.generate("The Sun is yellow because", custom_streamer); +} +``` diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt new file mode 100644 index 000000000..ec909de27 --- /dev/null +++ b/src/cpp/CMakeLists.txt @@ -0,0 +1,103 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +# Dependencies + +include(FetchContent) + +FetchContent_Declare(nlohmann_json + URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz + URL_HASH SHA256=0d8ef5af7f9794e3263480193c491549b2ba6cc74bb018906202ada498a79406) +FetchContent_MakeAvailable(nlohmann_json) + +function(ov_genai_build_jinja2cpp) + FetchContent_Declare(jinja2cpp + URL https://github.com/jinja2cpp/Jinja2Cpp/archive/9ae7e1fc45d707e1686dd425a154d30963801944.tar.gz + URL_HASH SHA256=aa41ae425225623ba91be5de3ef1e0d942e682d519311e6235b04b4e7d880e01) + + FetchContent_GetProperties(jinja2cpp) + if(NOT jinja2cpp_POPULATED) + FetchContent_Populate(jinja2cpp) + + set(BUILD_SHARED_LIBS OFF) + set(JINJA2CPP_INSTALL OFF CACHE BOOL "") + set(JINJA2CPP_CXX_STANDARD 17 CACHE STRING "") + set(JINJA2CPP_BUILD_SHARED OFF CACHE BOOL "") + set(JINJA2CPP_USE_REGEX "std" CACHE STRING "") + set(JINJA2CPP_WITH_JSON_BINDINGS "none" CACHE STRING "") + set(JINJA2CPP_STRICT_WARNINGS OFF CACHE BOOL "") + set(JINJA2CPP_PIC ON CACHE BOOL "") + + add_subdirectory("${jinja2cpp_SOURCE_DIR}" "${jinja2cpp_BINARY_DIR}" EXCLUDE_FROM_ALL) + endif() +endfunction() + +ov_genai_build_jinja2cpp() + +# Library + +file(GLOB SOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp") + +set(TARGET_NAME genai) +add_library(${TARGET_NAME} SHARED ${SOURCE_FILES}) +add_library(openvino::genai ALIAS ${TARGET_NAME}) + +target_include_directories(${TARGET_NAME} + PUBLIC "$" "$") + +target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime PRIVATE nlohmann_json::nlohmann_json jinja2cpp) + +target_compile_features(${TARGET_NAME} PUBLIC cxx_std_17) + +# Extract two last digits from CMAKE_PROJECT_VERSION_MAJOR because SOVERSION can only contain up to 4 symbols.
+string(REGEX MATCH [=[[0-9][0-9]$]=] MAJOR_SUFFIX ${CMAKE_PROJECT_VERSION_MAJOR}) +set_target_properties(${TARGET_NAME} PROPERTIES + OUTPUT_NAME openvino_genai + VERSION ${CMAKE_PROJECT_VERSION} + SOVERSION ${MAJOR_SUFFIX}${CMAKE_PROJECT_VERSION_MINOR}${CMAKE_PROJECT_VERSION_PATCH} + ARCHIVE_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" +) + +find_package(Python3 REQUIRED COMPONENTS Interpreter Development) +install(TARGETS ${TARGET_NAME} + LIBRARY DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR} + RUNTIME DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR}) + +# - Windows: `\runtime\bin\intel64\Release\` +# - MacOS_x86: `/runtime/lib/intel64/Release` +# - MacOS_arm64: `/runtime/lib/arm64/Release/` +# - Linux_x86: `/runtime/lib/intel64/` +# - Linux_arm64: `/runtime/lib/aarch64/` +string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" ARCH_DIR) +if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") + set(ARCH_DIR intel64) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(arm64.*|aarch64.*|AARCH64.*|ARM64.*)") + if(APPLE) + set(ARCH_DIR "arm64") + else() + set(ARCH_DIR "aarch64") + endif() +elseif(ARCH_DIR STREQUAL "x86_64" OR ARCH_DIR STREQUAL "amd64" # Windows detects Intel's 64-bit CPU as AMD64 + OR CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64") + set(ARCH_DIR intel64) +endif() +if(MSVC OR APPLE) + set(ARCH_DIR ${ARCH_DIR}/${CMAKE_BUILD_TYPE}) +endif() +install(TARGETS ${TARGET_NAME} EXPORT OpenVINOGenAITargets + LIBRARY DESTINATION runtime/lib/${ARCH_DIR} COMPONENT core_genai + NAMELINK_COMPONENT core_genai_dev + ARCHIVE DESTINATION runtime/lib/${ARCH_DIR} COMPONENT core_genai_dev + RUNTIME DESTINATION runtime/bin/${ARCH_DIR} COMPONENT core_genai + INCLUDES DESTINATION runtime/include) +install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION runtime/include COMPONENT core_genai_dev) +install(EXPORT OpenVINOGenAITargets FILE OpenVINOGenAITargets.cmake NAMESPACE openvino:: DESTINATION runtime/cmake) +include(CMakePackageConfigHelpers) +configure_package_config_file(OpenVINOGenAIConfig.cmake.in "${CMAKE_BINARY_DIR}/OpenVINOGenAIConfig.cmake" INSTALL_DESTINATION runtime/cmake) +install(FILES "${CMAKE_BINARY_DIR}/OpenVINOGenAIConfig.cmake" "${CMAKE_BINARY_DIR}/OpenVINOGenAIConfig.cmake" DESTINATION runtime/cmake COMPONENT core_genai_dev) +include(CMakePackageConfigHelpers) +write_basic_package_version_file("${CMAKE_BINARY_DIR}/OpenVINOGenAIConfigVersion.cmake" VERSION ${CMAKE_PROJECT_VERSION} COMPATIBILITY AnyNewerVersion) +export(EXPORT OpenVINOGenAITargets FILE "${CMAKE_BINARY_DIR}/OpenVINOGenAITargets.cmake" NAMESPACE openvino::) diff --git a/src/cpp/OpenVINOGenAIConfig.cmake.in b/src/cpp/OpenVINOGenAIConfig.cmake.in new file mode 100644 index 000000000..18c0bb4e4 --- /dev/null +++ b/src/cpp/OpenVINOGenAIConfig.cmake.in @@ -0,0 +1,10 @@ +@PACKAGE_INIT@ + +include(CMakeFindDependencyMacro) +find_dependency(OpenVINO COMPONENTS Runtime) + +if(NOT TARGET genai) + include("${CMAKE_CURRENT_LIST_DIR}/OpenVINOGenAITargets.cmake") +endif() + +check_required_components(openvino_genai) diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp new file mode 100644 index 000000000..0da478a39 --- /dev/null +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -0,0 +1,107 @@ +// 
Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include + +#include "openvino/runtime/compiled_model.hpp" +#include "openvino/runtime/infer_request.hpp" +#include "openvino/genai/tokenizer.hpp" + +namespace ov { +namespace genai { + +/** + * @brief controls the stopping condition for grouped beam search. The following values are possible: + * "early" stops as soon as there are `num_beams` complete candidates. + "heuristic" stops when it is unlikely to find better candidates. + "never" stops when there cannot be better candidates. + */ +enum class StopCriteria { early, heuristic, never }; + +/** + * @brief Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group + * and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will + * be used while greedy and beam search parameters will not affect decoding at all. + * + * Generic parameters: + * @param max_length the maximum length the generated tokens can have. Corresponds to the length of the input prompt + + * `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set. + * @param max_new_tokens the maximum number of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. + * @param ignore_eos if set to true, then generation will not stop even if <eos> token is met. + * @param pad_token_id token_id of <pad> (padding) + * @param bos_token_id token_id of <bos> (beginning of sentence) + * @param eos_token_id token_id of <eos> (end of sentence) + * @param bos_token <bos> token string representation + * @param eos_token <eos> token string representation + * + * Beam search specific parameters: + * @param num_beams number of beams for beam search. 1 disables beam search. + * @param num_beam_groups number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. + * @param diversity_penalty this value is subtracted from a beam's score if it generates the same token as any beam from another group at a + * particular time. See https://arxiv.org/pdf/1909.05858. + * @param length_penalty exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to + * the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log + * likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while + * `length_penalty` < 0.0 encourages shorter sequences. + * @param num_return_sequences the number of sequences to return for grouped beam search decoding. + * @param no_repeat_ngram_size if set to int > 0, all ngrams of that size can only occur once. + * @param stop_criteria controls the stopping condition for grouped beam search. It accepts the following values: + * "early", where the generation stops as soon as there are `num_beams` complete candidates; "heuristic", where a + * heuristic is applied and the generation stops when it is very unlikely to find better candidates; + * "never", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). + * + * Random sampling parameters: + * @param temperature the value used to modulate token probabilities for random sampling.
+ * @param top_p if set to a float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. + * @param top_k the number of highest probability vocabulary tokens to keep for top-k-filtering. + * @param do_sample whether or not to use multinomial random sampling. + * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty. + */ +class OPENVINO_GENAI_EXPORTS GenerationConfig { +public: + GenerationConfig() = default; + explicit GenerationConfig(std::string json_path); + + // Generic + size_t max_new_tokens = SIZE_MAX; + size_t max_length = SIZE_MAX; + bool ignore_eos = false; + + // Beam search specific + size_t num_beam_groups = 1; + size_t num_beams = 1; + float diversity_penalty = 1.0f; + float length_penalty = 1.0f; + size_t num_return_sequences = 1; + size_t no_repeat_ngram_size = std::numeric_limits<size_t>::max(); + StopCriteria stop_criteria = StopCriteria::heuristic; + + // Multinomial + float temperature = 1.0f; + float top_p = 1.0f; + size_t top_k = 50; + bool do_sample = false; + float repetition_penalty = 1.0f; + + // special tokens + int64_t pad_token_id = 0; + int64_t bos_token_id = 1; + int64_t eos_token_id = 2; + + // used for chat scenario + std::string bos_token = "<s>"; + std::string eos_token = "</s>"; + + size_t get_max_new_tokens(size_t prompt_length = 0) const; + bool is_greedy_decoding() const; + bool is_beam_search() const; + bool is_multinomial() const; + static GenerationConfig anymap_to_generation_config(const ov::AnyMap& config_map = {}); +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp new file mode 100644 index 000000000..d16ec0dc8 --- /dev/null +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -0,0 +1,217 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include + +#include "openvino/core/any.hpp" +#include "openvino/genai/generation_config.hpp" +#include "openvino/genai/tokenizer.hpp" +#include "openvino/genai/streamer_base.hpp" + +namespace ov { +namespace genai { + +using StreamerVariant = std::variant<std::function<void (std::string)>, std::shared_ptr<StreamerBase>>; +using OptionalGenerationConfig = std::optional<GenerationConfig>; +using OptionalStreamerVariant = std::optional<StreamerVariant>; + +/** +* @brief Structure to store resulting batched tokens and scores for each batch sequence +* +* @param tokens sequence of resulting tokens +* @param scores scores for each sequence +*/ +class EncodedResults { +public: + std::vector<std::vector<int64_t>> tokens; + std::vector<float> scores; +}; + +/** +* @brief Structure to store resulting batched text outputs and scores for each batch +* +* @param texts vector of resulting sequences +* @param scores scores for each sequence +*/ +class DecodedResults { +public: + std::vector<std::string> texts; + std::vector<float> scores; + + // @brief Convert DecodedResults to a vector of strings. + // @return A std::vector<std::string> containing the texts from the DecodedResults object. + operator std::vector<std::string>() const { + return texts; + } + + // @brief Overloads operator<< to output the contents of DecodedResults. + // @return A reference to the output stream with the concatenated texts.
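+ // Example (editor's sketch, not part of the original header): given "DecodedResults res = pipe.generate(prompts, config);", writing "std::cout << res;" prints each generated text on its own line via the operator<< defined below.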
+ friend std::ostream& operator<<(std::ostream& os, const DecodedResults& dr) { + for (size_t i = 0; i < dr.texts.size(); ++i) { + os << dr.texts[i]; + if (i != dr.texts.size() - 1) { + os << std::endl; + } + } + return os; + } +}; + +/** +* @brief This class is used for generation with LLMs. + */ +class OPENVINO_GENAI_EXPORTS LLMPipeline { +public: + /** + * @brief Constructs an LLMPipeline from xml/bin files, tokenizers and configuration in the same dir. + * + * @param model_path Path to the dir model xml/bin files, tokenizers and generation_configs.json + * @param device optional device + * @param plugin_config optional plugin_config + * @param ov_tokenizers_path optional path to an extension to add. Empty adds openvino_tokenizers from openvini_genai library folder. + */ + LLMPipeline(const std::string& path, const std::string& device="CPU", + const ov::AnyMap& plugin_config={}, + const std::string& ov_tokenizers_path=""); + + /** + * @brief Constructs a LLMPipeline when ov::Tokenizer is initialized manually using file from the different dirs. + * + * @param model_path Path to the dir with model, tokenizer .xml/.bin files, and generation_configs.json + * @param tokenizer manually initialized ov::Tokenizer + * @param device optional device + * @param plugin_config optional plugin_config + */ + LLMPipeline( + const std::string& model_path, + const ov::genai::Tokenizer& tokenizer, + const std::string& device="CPU", + const ov::AnyMap& plugin_config = {} + ); + + ~LLMPipeline(); + + /** + * @brief High level generate for the input with a single prompt which encodes inputs and returns decoded output + * + * @param text input prompt + * @param generation_config optional GenerationConfig + * @param streamer optional streamer + * @return std::string decoded resulting text + */ + std::string generate(std::string text, OptionalGenerationConfig generation_config=std::nullopt, OptionalStreamerVariant streamer=std::nullopt); + + template + util::EnableIfAllStringAny generate( + std::string text, + Properties&&... properties) { + return generate(text, AnyMap{std::forward(properties)...}); + } + std::string generate(std::string text, const ov::AnyMap& config); + + template + util::EnableIfAllStringAny generate( + ov::Tensor input_ids, + Properties&&... properties) { + return generate(input_ids, AnyMap{std::forward(properties)...}); + } + EncodedResults generate(ov::Tensor input_ids, const ov::AnyMap& config); + + /** + * @brief High level generate for batched prompts which encodes inputs and returns decoded outputs. + * Streamer cannot be used for multibatch inputs. + * + * @param text input prompt + * @param generation_config optional GenerationConfig + * @return DecodedResults a structure with resulting texts & scores + */ + DecodedResults generate(const std::vector& texts, OptionalGenerationConfig generation_config); + + /** + * @brief Low level generate to be called with already encoded input_ids tokens. + * Streamer cannot be used for multibatch inputs. 
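+ * A possible call sequence (editor's sketch, names hypothetical): auto [input_ids, attention_mask] = pipe.get_tokenizer().encode(prompt); EncodedResults out = pipe.generate(input_ids, attention_mask, config);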
+ * + * @param input_ids encoded input prompt tokens + * @param attention_mask optional attention_mask + * @param generation_config optional GenerationConfig + * @param streamer optional streamer + * @return EncodedResults a structure with resulting tokens and scores + * @throws Exception if the stremaer is set for inputs_ids with multiple batches + */ + EncodedResults generate(ov::Tensor input_ids, + std::optional attention_mask, + OptionalGenerationConfig generation_config=std::nullopt, + OptionalStreamerVariant streamer=std::nullopt); + + template + util::EnableIfAllStringAny operator()( + InputsType text, + Properties&&... properties) { + return generate(text, AnyMap{std::forward(properties)...}); + } + + DecodedResults operator()(const std::vector& text, OptionalGenerationConfig generation_config=std::nullopt) { + return generate(text, generation_config); + } + + std::string operator()( + std::string text, + OptionalGenerationConfig generation_config=std::nullopt, + OptionalStreamerVariant streamer=std::nullopt + ) { + return generate(text, generation_config, streamer); + } + + ov::genai::Tokenizer get_tokenizer(); + GenerationConfig get_generation_config() const; + void set_generation_config(const GenerationConfig& generation_config); + + void start_chat(); + void finish_chat(); + void reset_state(); + std::string apply_chat_template(std::string prompt, std::string role = "user") const; +private: + class LLMPipelineImpl; + std::unique_ptr m_pimpl; +}; + +/* + * utils that allow to use generate and operator() in the following way: + * pipe.generate(input_ids, ov::max_new_tokens(200), ov::temperature(1.0f),...) + * pipe(text, ov::max_new_tokens(200), ov::temperature(1.0f),...) +*/ +static constexpr ov::Property max_new_tokens{"max_new_tokens"}; +static constexpr ov::Property max_length{"max_length"}; +static constexpr ov::Property ignore_eos{"ignore_eos"}; + +static constexpr ov::Property num_beam_groups{"num_beam_groups"}; +static constexpr ov::Property num_beams{"num_beams"}; +static constexpr ov::Property diversity_penalty{"diversity_penalty"}; +static constexpr ov::Property length_penalty{"length_penalty"}; +static constexpr ov::Property num_return_sequences{"num_return_sequences"}; +static constexpr ov::Property no_repeat_ngram_size{"no_repeat_ngram_size"}; +static constexpr ov::Property stop_criteria{"stop_criteria"}; + +static constexpr ov::Property temperature{"temperature"}; +static constexpr ov::Property top_p{"top_p"}; +static constexpr ov::Property top_k{"top_k"}; +static constexpr ov::Property do_sample{"do_sample"}; +static constexpr ov::Property repetition_penalty{"repetition_penalty"}; + + +static constexpr ov::Property pad_token_id{"pad_token_id"}; +static constexpr ov::Property bos_token_id{"bos_token_id"}; +static constexpr ov::Property eos_token_id{"eos_token_id"}; + +static constexpr ov::Property bos_token{"bos_token"}; +static constexpr ov::Property eos_token{"eos_token"}; + +// only lambda streamer can be set via ov::streamer(),... 
syntactic sugar, +// because std::variant<std::function<void (std::string)>, std::shared_ptr<StreamerBase>> can not be stored in AnyMap +static constexpr ov::Property<std::function<void (std::string)>> streamer{"streamer"}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/streamer_base.hpp b/src/cpp/include/openvino/genai/streamer_base.hpp new file mode 100644 index 000000000..7731b51c1 --- /dev/null +++ b/src/cpp/include/openvino/genai/streamer_base.hpp @@ -0,0 +1,30 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "openvino/genai/tokenizer.hpp" + +namespace ov { +namespace genai { + +/** + * @brief base class for streamers. In order to use it, inherit from this class and implement the put and end methods + * + * @param m_tokenizer tokenizer +*/ +class StreamerBase { +public: + Tokenizer m_tokenizer; + explicit StreamerBase(Tokenizer tokenizer): m_tokenizer(tokenizer) {} + StreamerBase() = default; + + /// @brief put is called every time a new token is decoded + virtual void put(int64_t token) = 0; + + /// @brief end is called at the end of generation. It can be used to flush the cache if your own streamer has one + virtual void end() = 0; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp new file mode 100644 index 000000000..e0214fcfb --- /dev/null +++ b/src/cpp/include/openvino/genai/tokenizer.hpp @@ -0,0 +1,83 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include +#include "openvino/genai/visibility.hpp" + +namespace ov { +namespace genai { + +/** +* @brief The class is used to encode prompts and decode resulting tokens +*/ +class OPENVINO_GENAI_EXPORTS Tokenizer { +public: + /** + * @brief ov::genai::Tokenizer constructor. + * @param tokenizers_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizers_path + * @param device device. Currently only 'CPU' is supported + * @param ov_tokenizers_path optional path to the openvino_tokenizers extension library + */ + Tokenizer(const std::string& tokenizers_path, const std::string& device="CPU", const std::string& ov_tokenizers_path=""); + + /** + * @brief encode a single prompt + * @return pair of [input_ids, attention_mask] + */ + std::pair<ov::Tensor, ov::Tensor> encode(const std::string prompt); + + /** + * @brief encode batch of prompts. Left padding will be applied by default + * @param prompts vector storing batch of prompts + * @return pair of [input_ids, attention_mask] + */ + std::pair<ov::Tensor, ov::Tensor> encode(std::vector<std::string>& prompts); + std::pair<ov::Tensor, ov::Tensor> encode(std::vector<std::string>&& prompts); + std::pair<ov::Tensor, ov::Tensor> encode(std::initializer_list<std::string>& prompts); + + /** + * @brief decode sequence of tokens + * @param tokens vector storing tokens + * @return sequence string + */ + std::string decode(std::vector<int64_t> tokens); + + /** + * @brief decode tokens. + * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len] + * @return vector of std::string, with size = batch_size + */ + std::vector<std::string> decode(ov::Tensor tokens); + + /** + * @brief batched decoding of tokens. + * @param tokens vector of vectors with tokens, tokens.size() is equal to batch_size + * @return vector of std::string, with size equal to batch_size + */ + std::vector<std::string> decode(std::vector<std::vector<int64_t>> tokens); + + // information about BOS/EOS/PAD tokens should be public, + // they are used at least in StreamerBase descendants + int64_t get_bos_token_id() const; + int64_t get_eos_token_id() const; + int64_t get_pad_token_id() const; + + // Also need write access to set these tokens when they are not successfully read from xml rt_info.
+ // In the latter case values can be read from config.json in LLMPipeline + void set_bos_token_id(int64_t); + void set_eos_token_id(int64_t); + void set_pad_token_id(int64_t); + + Tokenizer() = default; + ~Tokenizer(); +private: + class TokenizerImpl; + std::shared_ptr m_pimpl; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/visibility.hpp b/src/cpp/include/openvino/genai/visibility.hpp new file mode 100644 index 000000000..6a8cf756e --- /dev/null +++ b/src/cpp/include/openvino/genai/visibility.hpp @@ -0,0 +1,10 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/core/visibility.hpp" + +#ifdef genai_EXPORTS +# define OPENVINO_GENAI_EXPORTS OPENVINO_CORE_EXPORTS +#else +# define OPENVINO_GENAI_EXPORTS OPENVINO_CORE_IMPORTS +#endif // genai_EXPORTS diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp new file mode 100644 index 000000000..e2e95262d --- /dev/null +++ b/src/cpp/src/generation_config.cpp @@ -0,0 +1,108 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include +#include +#include "openvino/genai/generation_config.hpp" +#include "utils.hpp" + + +namespace ov { +namespace genai { + +GenerationConfig::GenerationConfig(std::string json_path) { + using ov::genai::utils::read_json_param; + + std::ifstream f(json_path); + OPENVINO_ASSERT(f.is_open(), "Failed to open '" + json_path + "' with generation config"); + + nlohmann::json data = nlohmann::json::parse(f); + + read_json_param(data, "max_new_tokens", max_new_tokens); + read_json_param(data, "max_length", max_length); + // note that ignore_eos is not present in HF GenerationConfig + read_json_param(data, "num_beam_groups", num_beam_groups); + read_json_param(data, "num_beams", num_beams); + read_json_param(data, "diversity_penalty", diversity_penalty); + read_json_param(data, "length_penalty", length_penalty); + read_json_param(data, "num_return_sequences", num_return_sequences); + read_json_param(data, "no_repeat_ngram_size", no_repeat_ngram_size); + read_json_param(data, "temperature", temperature); + read_json_param(data, "top_p", top_p); + read_json_param(data, "top_k", top_k); + read_json_param(data, "do_sample", do_sample); + read_json_param(data, "repetition_penalty", repetition_penalty); + read_json_param(data, "pad_token_id", pad_token_id); + read_json_param(data, "bos_token_id", bos_token_id); + read_json_param(data, "eos_token_id", eos_token_id); + read_json_param(data, "bos_token", bos_token); + read_json_param(data, "eos_token", eos_token); + + if (data.contains("early_stopping")) { + auto field_type = data["early_stopping"].type(); + if (field_type == nlohmann::json::value_t::string && data["early_stopping"] == "never") { + stop_criteria = StopCriteria::never; + } else if (field_type == nlohmann::json::value_t::boolean && data["early_stopping"] == true) { + stop_criteria = StopCriteria::early; + } else if (field_type == nlohmann::json::value_t::boolean && data["early_stopping"] == false) { + stop_criteria = StopCriteria::heuristic; + } + } + + +} + +GenerationConfig GenerationConfig::anymap_to_generation_config(const ov::AnyMap& config_map) { + using ov::genai::utils::read_anymap_param; + + GenerationConfig config; + read_anymap_param(config_map, "max_new_tokens", config.max_new_tokens); + read_anymap_param(config_map, "max_length", config.max_length); + read_anymap_param(config_map, "ignore_eos", config.ignore_eos); + 
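+ // parameters not present in the map keep the default values defined in GenerationConfig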
read_anymap_param(config_map, "num_beam_groups", config.num_beam_groups); + read_anymap_param(config_map, "num_beams", config.num_beams); + read_anymap_param(config_map, "diversity_penalty", config.diversity_penalty); + read_anymap_param(config_map, "length_penalty", config.length_penalty); + read_anymap_param(config_map, "num_return_sequences", config.num_return_sequences); + read_anymap_param(config_map, "no_repeat_ngram_size", config.no_repeat_ngram_size); + read_anymap_param(config_map, "stop_criteria", config.stop_criteria); + read_anymap_param(config_map, "temperature", config.temperature); + read_anymap_param(config_map, "top_p", config.top_p); + read_anymap_param(config_map, "top_k", config.top_k); + read_anymap_param(config_map, "do_sample", config.do_sample); + read_anymap_param(config_map, "repetition_penalty", config.repetition_penalty); + read_anymap_param(config_map, "pad_token_id", config.pad_token_id); + read_anymap_param(config_map, "bos_token_id", config.bos_token_id); + read_anymap_param(config_map, "eos_token_id", config.eos_token_id); + read_anymap_param(config_map, "bos_token", config.bos_token); + read_anymap_param(config_map, "eos_token", config.eos_token); + + return config; +} + +size_t GenerationConfig::get_max_new_tokens(size_t prompt_length) const { + // max_new_tokens has priority over max_length, only if max_new_tokens was not specified use max_length + if (max_new_tokens != SIZE_MAX) { + return max_new_tokens; + } else { + return max_length - prompt_length; + } +} + +bool GenerationConfig::is_greedy_decoding() const { + return !do_sample && !is_beam_search(); +} + +bool GenerationConfig::is_beam_search() const { + return num_beams > 1; +} + +bool GenerationConfig::is_multinomial() const { + return do_sample; +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/generation_config_helper.hpp b/src/cpp/src/generation_config_helper.hpp new file mode 100644 index 000000000..e69de29bb diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp new file mode 100644 index 000000000..51e8023b4 --- /dev/null +++ b/src/cpp/src/greedy_decoding.cpp @@ -0,0 +1,130 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/llm_pipeline.hpp" +#include "utils.hpp" + +namespace ov { +namespace genai { + +EncodedResults greedy_decoding( + ov::InferRequest& m_model_runner, + ov::Tensor input_ids, + ov::Tensor attention_mask, + const ov::genai::GenerationConfig generation_config, + const std::shared_ptr streamer, + const bool is_chat_conversation +) { + + ov::Shape prompts_shape = input_ids.get_shape(); + size_t batch_size = prompts_shape[0]; + size_t prompt_len = prompts_shape[1]; + + auto kv_cache_len = m_model_runner.query_state()[0].get_state().get_shape()[2]; + + // todo: make this work even if position_ids are not specified + auto position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()}; + utils::initialize_position_ids(position_ids, attention_mask, kv_cache_len); + + EncodedResults results; + results.scores.resize(batch_size); + results.tokens.resize(batch_size); + std::fill(results.scores.begin(), results.scores.end(), 0); + + if (is_chat_conversation && kv_cache_len > 0) { + auto attentions_mask_history = m_model_runner.get_tensor("attention_mask"); + + size_t new_prompt_len = attention_mask.get_shape()[1]; + size_t context_len = attentions_mask_history.get_shape()[1]; + ov::Tensor new_attention_mask = ov::Tensor{ov::element::i64, {1, context_len + new_prompt_len}}; + 
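+ // the loops below concatenate the attention mask accumulated over previous chat turns with the mask of the newly tokenized prompt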
+ for (size_t i = 0; i < context_len; ++i) { + auto r = attentions_mask_history.data()[i]; + new_attention_mask.data()[i] = attentions_mask_history.data()[i]; + } + for (size_t i = context_len; i < context_len + new_prompt_len; ++i) { + auto r = attention_mask.data()[i]; + new_attention_mask.data()[i] = attention_mask.data()[i - context_len]; + } + m_model_runner.set_tensor("attention_mask", new_attention_mask); + } else { + m_model_runner.set_tensor("attention_mask", attention_mask); + } + + auto atten_shape = attention_mask.get_shape(); + auto pos_shape = position_ids.get_shape(); + auto input_ids_shape = input_ids.get_shape(); + + m_model_runner.set_tensor("input_ids", input_ids); + m_model_runner.set_tensor("position_ids", position_ids); + + m_model_runner.get_tensor("beam_idx").set_shape({batch_size}); + auto beam_data = m_model_runner.get_tensor("beam_idx").data(); + std::iota(beam_data, beam_data + batch_size, 0); + + size_t max_tokens = generation_config.get_max_new_tokens(prompt_len); + + m_model_runner.infer(); + auto logits = m_model_runner.get_tensor("logits"); + ov::Shape logits_shape = logits.get_shape(); + size_t seq_len = logits_shape[1], vocab_size = logits_shape[2]; + m_model_runner.get_tensor("input_ids").set_shape({batch_size, 1}); + + std::vector token_iter_results(batch_size); // results of a single infer request + std::vector eos_met(batch_size, 0); // use int because can not use std::all_of with vector + for (size_t batch = 0; batch < batch_size; ++batch) { + auto res = utils::softmax(logits, batch); + auto out_token = res.first; + results.tokens[batch].emplace_back(res.first); + results.scores[batch] += res.second; + + token_iter_results[batch] = out_token; + eos_met[batch] = (out_token == generation_config.eos_token_id); + m_model_runner.get_tensor("input_ids").data()[batch] = out_token; + } + if (streamer) + streamer->put(token_iter_results[0]); + + bool all_are_eos = std::all_of(eos_met.begin(), eos_met.end(), [](int elem) { return elem == 1; }); + if (!generation_config.ignore_eos && all_are_eos) + return results; + + for (size_t i = 0; i < max_tokens - 1; ++i) { + utils::update_position_ids(m_model_runner.get_tensor("position_ids"), m_model_runner.get_tensor("attention_mask")); + m_model_runner.set_tensor("attention_mask", utils::extend_attention(m_model_runner.get_tensor("attention_mask"))); + + // todo: consider replacing with start_async and run callback right after that + m_model_runner.infer(); + auto logits = m_model_runner.get_tensor("logits"); + ov::Shape logits_shape = logits.get_shape(); + size_t seq_len = logits_shape[1], vocab_size = logits_shape[2]; + + std::vector token_iter_results(batch_size); // results of a single infer request + std::vector eos_met(batch_size, 0); // use int because can not use std::all_of with vector + for (size_t batch = 0; batch < batch_size; ++batch) { + + auto res = ov::genai::utils::softmax(logits, batch); + auto out_token = res.first; + results.tokens[batch].emplace_back(res.first); + results.scores[batch] += res.second; + + token_iter_results[batch] = out_token; + eos_met[batch] = (out_token == generation_config.eos_token_id); + + m_model_runner.get_tensor("input_ids").data()[batch] = out_token; + } + if (streamer) + streamer->put(token_iter_results[0]); + + // stop generation when EOS is met in all batches + bool all_are_eos = std::all_of(eos_met.begin(), eos_met.end(), [](int elem) { return elem == 1; }); + if (!generation_config.ignore_eos && all_are_eos) + break; + } + if (streamer) + streamer->end(); + return 
results; +} + +} //namespace genai +} //namespace ov \ No newline at end of file diff --git a/text_generation/causal_lm/cpp/group_beam_searcher.hpp b/src/cpp/src/group_beam_searcher.cpp similarity index 69% rename from text_generation/causal_lm/cpp/group_beam_searcher.hpp rename to src/cpp/src/group_beam_searcher.cpp index 6c97c869a..96138cec6 100644 --- a/text_generation/causal_lm/cpp/group_beam_searcher.hpp +++ b/src/cpp/src/group_beam_searcher.cpp @@ -2,6 +2,10 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include "openvino/genai/llm_pipeline.hpp" +#include "utils.hpp" + +namespace { // Modifyed Knuth–Morris–Pratt algorithm which returns tokens following after every needle occurance in haystack std::vector kmp_search(const std::vector& haystack, const std::vector& needle) { @@ -80,16 +84,14 @@ bool greater(const Beam& left, const Beam& right) { return left.score > right.score; } -enum class StopCriteria { early, heuristic, never }; - struct Parameters { std::vector> prompts; - int64_t eos_token; + int64_t eos_token_id; size_t n_groups = 3; size_t group_size = 5; float diversity_penalty = 1.0; size_t max_new_tokens = 20; - StopCriteria stop_criteria = StopCriteria::heuristic; + ov::genai::StopCriteria stop_criteria = ov::genai::StopCriteria::heuristic; float length_penalty = 1.0; size_t no_repeat_ngram_size = std::numeric_limits::max(); @@ -107,7 +109,7 @@ struct Group { beam.score /= std::pow(float(beam.tokens.size()), parameters.length_penalty); // HF implementation counts eos_token for length penalty calculation - if (beam.tokens.back() == parameters.eos_token) { + if (beam.tokens.back() == parameters.eos_token_id) { beam.tokens.pop_back(); } @@ -126,15 +128,15 @@ struct Group { float best_sum_logprobs = ongoing.front().score; float worst_score = min_heap.front().score; switch (parameters.stop_criteria) { - case StopCriteria::early: + case ov::genai::StopCriteria::early: done = true; return; - case StopCriteria::heuristic: { + case ov::genai::StopCriteria::heuristic: { float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), parameters.length_penalty); done = worst_score >= highest_attainable_score; return; } - case StopCriteria::never: { + case ov::genai::StopCriteria::never: { size_t length = parameters.length_penalty > 0.0 ? 
parameters.max_new_tokens : cur_len; float highest_attainable_score = best_sum_logprobs / std::pow(float(length), parameters.length_penalty); done = worst_score >= highest_attainable_score; @@ -267,7 +269,7 @@ struct GroupBeamSearcher { std::partial_sort(candidates.begin(), to_sort, candidates.end(), greater); group->ongoing.clear(); for (size_t cand_idx = 0; cand_idx < candidates.size(); ++cand_idx) { - if (parameters.eos_token == candidates.at(cand_idx).tokens.back()) { + if (parameters.eos_token_id == candidates.at(cand_idx).tokens.back()) { // If beam_token does not belong to top num_beams tokens, it should not be added if (cand_idx >= parameters.group_size) { continue; @@ -313,3 +315,126 @@ std::vector>> finalize(GroupBeamSearcher&& group_b return finalized; } + +void initialize_inputs(const ov::Tensor& input_ids, const ov::Tensor& attention_mask, ov::InferRequest& request) { + request.set_tensor("input_ids", input_ids); + request.set_tensor("attention_mask", attention_mask); + + ov::Shape input_shape = input_ids.get_shape(); + + ov::Tensor position_ids = request.get_tensor("position_ids"); + position_ids.set_shape(input_shape); + ov::genai::utils::initialize_position_ids(position_ids, attention_mask); + + ov::Tensor beam_idx = request.get_tensor("beam_idx"); + beam_idx.set_shape({input_shape.at(0)}); + std::fill_n(beam_idx.data(), input_shape.at(0), 0); +} + + +void update_attention_mask_with_beams(ov::Tensor&& attention_mask, std::vector next_beams) { + ov::Tensor original_mask{ov::element::i64, attention_mask.get_shape()}; + ov::Shape original_shape = original_mask.get_shape(); + attention_mask.copy_to(original_mask); + + ov::Shape new_shape{next_beams.size(), original_mask.get_shape().at(1) + 1}; + attention_mask.set_shape(new_shape); + + for (size_t beam_id = 0; beam_id < next_beams.size(); beam_id++) { + const size_t original_prompt_offset = next_beams.at(beam_id) * original_shape.at(1); + const size_t result_prompt_offset = beam_id * new_shape.at(1); + + int64_t* dest = attention_mask.data() + result_prompt_offset; + const int64_t* src = original_mask.data() + original_prompt_offset; + + std::memcpy(dest, src, original_shape.at(1) * sizeof(int64_t)); + attention_mask.data()[result_prompt_offset + new_shape.at(1) - 1] = 1; + } +} + +void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention_mask) { + const size_t batch_size = attention_mask.get_shape().at(0); + const size_t sequence_length = attention_mask.get_shape().at(1); + position_ids.set_shape({batch_size, 1}); + + for (size_t batch = 0; batch < batch_size; batch++) { + int64_t* mask_start = attention_mask.data() + batch * sequence_length; + position_ids.data()[batch] = std::accumulate(mask_start, mask_start + sequence_length - 1, 0); + } +} + +} // namespace + + +namespace ov { +namespace genai { + +EncodedResults beam_search(ov::InferRequest& lm, ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig config) { + OPENVINO_ASSERT(config.num_beams % config.num_beam_groups == 0, "number of beams should be divisible by number of groups"); + + // Initialize beam search + const int64_t* prompt_data = input_ids.data(); + std::vector> prompts; + prompts.reserve(input_ids.get_shape().at(0)); + for (size_t batch = 0; batch < input_ids.get_shape().at(0); batch++) { + size_t sequence_length = input_ids.get_shape().at(1); + size_t batch_offset = batch * sequence_length; + const int64_t* prompt_start = prompt_data + batch_offset; + prompts.push_back(std::vector{prompt_start, prompt_start + 
sequence_length}); + } + + initialize_inputs(input_ids, attention_mask, lm); + + Parameters parameters{std::move(prompts)}; + parameters.max_new_tokens = config.max_new_tokens; + parameters.eos_token_id = config.eos_token_id; + parameters.n_groups = config.num_beam_groups; + parameters.group_size = config.num_beams / config.num_beam_groups; + parameters.diversity_penalty = config.diversity_penalty; + parameters.length_penalty = config.length_penalty; + parameters.stop_criteria = config.stop_criteria; + parameters.no_repeat_ngram_size = config.no_repeat_ngram_size; + GroupBeamSearcher group_beam_searcher{parameters}; + + std::vector next_tokens; + std::vector next_beams; + + for (size_t length_count = 0; length_count < parameters.max_new_tokens; ++length_count) { + lm.infer(); + + std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(lm.get_tensor("logits")); + if (next_tokens.empty()) { + break; + } + size_t batch_size = next_tokens.size(); + // Set pointers + lm.set_tensor("input_ids", ov::Tensor{ov::element::i64, {batch_size, 1}, next_tokens.data()}); + lm.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {batch_size}, next_beams.data()}); + // Set auxiliary inputs + update_attention_mask_with_beams(lm.get_tensor("attention_mask"), next_beams); + update_position_ids(lm.get_tensor("position_ids"), lm.get_tensor("attention_mask")); + } + + std::vector beams; + for (const std::vector>& prompt_group : finalize(std::move(group_beam_searcher))) { + for (const std::vector group : prompt_group) { + for (const Beam& beam : group) { + beams.emplace_back(beam); + } + } + } + + // return sorted scores + auto compare_scores = [](Beam left, Beam right) { return (left.score > right.score); }; + std::sort(beams.begin(), beams.end(), compare_scores); + + ov::genai::EncodedResults results; + for (auto beam = beams.begin(); beam != beams.begin() + config.num_return_sequences; ++beam) { + results.scores.emplace_back(beam->score); + results.tokens.emplace_back(beam->tokens); + } + return results; +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp new file mode 100644 index 000000000..3f4b9f3f8 --- /dev/null +++ b/src/cpp/src/llm_pipeline.cpp @@ -0,0 +1,417 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include + +#include +#include +#include + +#include +#include "openvino/genai/generation_config.hpp" +#include "openvino/genai/llm_pipeline.hpp" +#include "utils.hpp" +#include "text_callback_streamer.hpp" + +#ifdef _WIN32 +# include +# define MAX_ABS_PATH _MAX_PATH +# define get_absolute_path(result, path) _fullpath(result, path.c_str(), MAX_ABS_PATH) +#else +# include +# include +# define MAX_ABS_PATH PATH_MAX +# define get_absolute_path(result, path) realpath(path.c_str(), result) +namespace { +std::string get_absolute_file_path(const std::string& path) { + std::string absolutePath; + absolutePath.resize(MAX_ABS_PATH); + std::ignore = get_absolute_path(&absolutePath[0], path); + if (!absolutePath.empty()) { + // on Linux if file does not exist or no access, function will return NULL, but + // `absolutePath` will contain resolved path + absolutePath.resize(absolutePath.find('\0')); + return std::string(absolutePath); + } + std::stringstream ss; + ss << "Can't get absolute file path for [" << path << "], err = " << strerror(errno); + throw std::runtime_error(ss.str()); +} +} +#endif + +namespace { + +ov::genai::GenerationConfig 
from_config_json_if_exists(const std::string& path) { + constexpr char generation_config_fname[] = "generation_config.json"; + constexpr char config_fname[] = "config.json"; + if (std::filesystem::exists(path + "/" + generation_config_fname)) { + return ov::genai::GenerationConfig(path + "/" + generation_config_fname); + } else if (std::filesystem::exists(path + "/" + config_fname)) { + // some models (e.g. google/gemma-*) do not have generation_config.json, but have config.json + // and special tokens are stored there. + std::ifstream file(path + "/" + config_fname); + if (!file.is_open()) + return ov::genai::GenerationConfig{}; + + nlohmann::json data = nlohmann::json::parse(file); + using ov::genai::utils::read_json_param; + ov::genai::GenerationConfig config; + + read_json_param(data, "pad_token_id", config.pad_token_id); + read_json_param(data, "bos_token_id", config.bos_token_id); + read_json_param(data, "eos_token_id", config.eos_token_id); + return config; + + } + return ov::genai::GenerationConfig{}; +} + +std::string from_tokenizer_json_if_exists(const std::string& path) { + std::string res = ""; + + if (!std::filesystem::exists(path)) + return res; + + std::ifstream file(path + "/tokenizer_config.json"); + if (!file.is_open()) + return res; + + ov::genai::utils::read_json_param(nlohmann::json::parse(file), "chat_template", res); + return res; +} + +std::filesystem::path with_openvino_tokenizers(const std::filesystem::path& path) { +#ifdef _WIN32 + constexpr char tokenizers[] = "openvino_tokenizers.dll"; +#elif __linux__ + constexpr char tokenizers[] = "libopenvino_tokenizers.so"; +#elif __APPLE__ + constexpr char tokenizers[] = "libopenvino_tokenizers.dylib"; +#endif + return path.parent_path() / tokenizers; +} + +std::string get_ov_genai_library_path() { +#ifdef _WIN32 + CHAR genai_library_path[MAX_PATH]; + HMODULE hm = NULL; + if (!GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, + reinterpret_cast(get_ov_genai_library_path), + &hm)) { + std::stringstream ss; + ss << "GetModuleHandle returned " << GetLastError(); + throw std::runtime_error(ss.str()); + } + GetModuleFileNameA(hm, (LPSTR)genai_library_path, sizeof(genai_library_path)); + return std::string(genai_library_path); +#elif defined(__APPLE__) || defined(__linux__) || defined(__EMSCRIPTEN__) + Dl_info info; + dladdr(reinterpret_cast(get_ov_genai_library_path), &info); + return get_absolute_file_path(info.dli_fname).c_str(); +#else +# error "Unsupported OS" +#endif // _WIN32 +} + +} + +namespace ov { +namespace genai { + +ov::genai::EncodedResults greedy_decoding( + ov::InferRequest& model_runner, + ov::Tensor prompts, + ov::Tensor attentin_mask, + const GenerationConfig sampling_params, + const std::shared_ptr streamer, + const bool is_chat_conversation = false +); + +ov::genai::EncodedResults multinominal_decoding( + ov::InferRequest& model_runner, + ov::Tensor prompts, + ov::Tensor attentin_mask, + GenerationConfig sampling_params, + std::shared_ptr streamer +); + +EncodedResults beam_search(ov::InferRequest& lm, ov::Tensor prompts, ov::Tensor attentin_mask, GenerationConfig config); + + +class LLMPipeline::LLMPipelineImpl { +public: + ov::InferRequest m_model_runner; + Tokenizer m_tokenizer; + GenerationConfig m_generation_config; + std::string m_chat_template = ""; + bool is_chat_conversation = false; + + LLMPipelineImpl( + const std::string& model_path, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& plugin_config + 
); + + LLMPipelineImpl( + const std::string& path, + const std::string& device, + const ov::AnyMap& config, + const std::string& ov_tokenizers_path="" + ); + + GenerationConfig generation_config() const; + + std::string generate(std::string text, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer); + DecodedResults generate(std::vector texts, OptionalGenerationConfig generation_config); + EncodedResults generate(ov::Tensor input_ids, std::optional attention_mask, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer); + + std::string apply_chat_template(std::string prompt, std::string role = "user") const; +}; + +} // namespace genai +} // namespace ov + +using namespace std; + + +ov::genai::LLMPipeline::LLMPipeline( + const std::string& model_path, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& plugin_config +) { + m_pimpl = make_unique(model_path, tokenizer, device, plugin_config); +} + +ov::genai::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( + const std::string& model_path, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& plugin_config +): m_tokenizer(tokenizer) { + ov::Core core; + + std::string full_path = model_path; + if (!ov::genai::utils::is_xml(full_path)) + full_path += "/openvino_model.xml"; + try { + m_model_runner = core.compile_model(full_path, device, plugin_config).create_infer_request(); + } catch (...) { + OPENVINO_THROW("Cannot compile_model from path " + full_path); + } +} + +ov::genai::LLMPipeline::LLMPipeline( + const std::string& path, + const std::string& device, + const ov::AnyMap& config, + const std::string& ov_tokenizers_path +) { + m_pimpl = make_unique(path, device, config, ov_tokenizers_path); +} + +ov::genai::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( + const std::string& path, + const std::string& device, + const ov::AnyMap& config, + const std::string& ov_tokenizers_path +): + m_model_runner{ov::Core{}.compile_model(path + "/openvino_model.xml", device, config).create_infer_request()}, + m_tokenizer{ + ov_tokenizers_path.empty() + ? Tokenizer(path, device, with_openvino_tokenizers(get_ov_genai_library_path()).string()) + : Tokenizer(path, device, ov_tokenizers_path) + }, + m_generation_config{from_config_json_if_exists(path)}, + m_chat_template{from_tokenizer_json_if_exists(path)} + {} + +ov::genai::GenerationConfig ov::genai::LLMPipeline::LLMPipelineImpl::generation_config() const { + return m_generation_config; +} + +ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const { + return m_pimpl->generation_config(); +} + +std::string ov::genai::LLMPipeline::LLMPipelineImpl::generate( + std::string text, + OptionalGenerationConfig generation_config, + OptionalStreamerVariant streamer +) { + GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config; + + if (is_chat_conversation) { + text = apply_chat_template(text); + } + auto kv_cache_len = m_model_runner.query_state()[0].get_state().get_shape()[2]; + + // previous prompt generation in chat dialog stops with the end of sentence token, + // need to append this token to the current prompt + if (is_chat_conversation && kv_cache_len > 0) { + text = config.eos_token + text; + } + + auto [input_ids, attention_mask] = m_tokenizer.encode(text); + + // todo: W/A If sentence begins with a specfial tokens (, , etc.) openvino_tokenizer inserts 2 special extra tokens and "▁", + // but HF does not do that. 
Moreover openvino_tokenizer always inserts but in chat scenario HF does not do that because skip_special_tokens=True. + // Need to remove both of that tokens manually to get exact token by token alignment with HF + auto size = input_ids.get_shape(); + int64_t* inputs_data = input_ids.data(); + std::vector tmp_ids(inputs_data, inputs_data + input_ids.get_size()); // todo: works only for batch 1 + // tmp_ids.erase(tmp_ids.begin()); + + auto attention_mask_data = attention_mask.data(); + std::vector tmp_attn_mask(attention_mask_data, attention_mask_data + attention_mask.get_size()); + // tmp_attn_mask.erase(tmp_attn_mask.begin()); + + std::vector prefixes_to_exclude = {config.eos_token, config.bos_token}; + auto prefix_match = [&text](std::string prefix) { return text.substr(0, prefix.length()) == prefix; }; + if (std::any_of(prefixes_to_exclude.begin(), prefixes_to_exclude.end(), prefix_match)) { + tmp_ids.erase(tmp_ids.begin()); + tmp_attn_mask.erase(tmp_attn_mask.begin()); + } + + input_ids = ov::Tensor(input_ids.get_element_type(), {1, tmp_ids.size()}); + for (size_t i = 0; i < tmp_ids.size(); i++) + input_ids.data()[i] = tmp_ids.data()[i]; + attention_mask = ov::Tensor(attention_mask.get_element_type(), {1, tmp_attn_mask.size()}); + for (size_t i = 0; i < tmp_attn_mask.size(); i++) + attention_mask.data()[i] = tmp_attn_mask.data()[i]; + + auto generate_results = generate(input_ids, attention_mask, config, streamer); + return m_tokenizer.decode(generate_results.tokens)[0]; +} + +ov::genai::DecodedResults ov::genai::LLMPipeline::generate(const std::vector& texts, OptionalGenerationConfig generation_config) { + return m_pimpl->generate(texts, generation_config); +} + +ov::genai::DecodedResults ov::genai::LLMPipeline::LLMPipelineImpl::generate(std::vector texts, OptionalGenerationConfig generation_config) { + auto [input_ids, attention_mask] = m_tokenizer.encode(texts); + + auto generate_results = generate(input_ids, attention_mask, generation_config, {}); + + return {m_tokenizer.decode(generate_results.tokens), generate_results.scores}; +} + +ov::genai::EncodedResults ov::genai::LLMPipeline::LLMPipeline::generate(ov::Tensor input_ids, + std::optional attention_mask, + OptionalGenerationConfig generation_config, + OptionalStreamerVariant streamer) { + return m_pimpl->generate(input_ids, attention_mask, generation_config, streamer); +} + +ov::genai::EncodedResults ov::genai::LLMPipeline::LLMPipelineImpl::generate( + ov::Tensor input_ids, + std::optional attention_mask, OptionalGenerationConfig generation_config, + OptionalStreamerVariant streamer +) { + ov::genai::EncodedResults result; + GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config; + + std::shared_ptr streamer_ptr; + if (!streamer.has_value()){ + streamer_ptr = nullptr; + } else if (auto streamer_obj = std::get_if>(&*streamer)) { + streamer_ptr = *streamer_obj; + } else if (auto callback = std::get_if>(&*streamer)) { + streamer_ptr = std::make_shared(m_tokenizer, *callback); + } + auto batch_size = input_ids.get_shape().at(0); + if ((batch_size != 1 || !(config.is_greedy_decoding() || config.is_multinomial())) && streamer_ptr) { + OPENVINO_THROW("Currently streaming is possible only with batch size=1 and greedy or multinomial decoding"); + } + + auto attention_mask_data = attention_mask.has_value() ? 
*attention_mask : ov::genai::utils::init_attention_mask(input_ids); + + if (config.is_greedy_decoding()) { + result = ov::genai::greedy_decoding(m_model_runner, input_ids, attention_mask_data, config, streamer_ptr, is_chat_conversation); + } else if (config.is_beam_search()) { + result = beam_search(m_model_runner, input_ids, attention_mask_data, config); + } else if (config.is_multinomial()) { + result = multinominal_decoding(m_model_runner, input_ids, attention_mask_data, config, streamer_ptr); + } else { + OPENVINO_THROW("No decoding algorithm found for provided configuration parameters."); + } + + if (!is_chat_conversation) + m_model_runner.reset_state(); + + return result; +} + +std::string ov::genai::LLMPipeline::generate(std::string text, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer) { + return m_pimpl->generate(text, generation_config, streamer); +} + +std::string ov::genai::LLMPipeline::generate(std::string text, const ov::AnyMap& config_map) { + OptionalStreamerVariant streamer; + auto config = GenerationConfig::anymap_to_generation_config(config_map); + if (config_map.count("streamer")) { + streamer = config_map.at("streamer").as>(); + } + + return m_pimpl->generate(text, config, streamer); +} + +ov::genai::EncodedResults ov::genai::LLMPipeline::generate(ov::Tensor input_ids, const ov::AnyMap& config_map) { + OptionalStreamerVariant streamer; + auto config = GenerationConfig::anymap_to_generation_config(config_map); + if (config_map.count("streamer")) { + streamer = config_map.at("streamer").as>(); + } + + std::optional attention_mask; + return m_pimpl->generate(input_ids, attention_mask, config, streamer); +} + +ov::genai::Tokenizer ov::genai::LLMPipeline::get_tokenizer() { + return m_pimpl->m_tokenizer; +} + +std::string ov::genai::LLMPipeline::apply_chat_template(std::string prompt, std::string role) const { + return m_pimpl->apply_chat_template(prompt, role); +} + +std::string ov::genai::LLMPipeline::LLMPipelineImpl::apply_chat_template(std::string prompt, std::string role) const { + jinja2::TemplateEnv env; + env.GetSettings().lstripBlocks = true; + env.GetSettings().trimBlocks = true; + jinja2::Template tpl(&env); + tpl.Load(m_chat_template); + + jinja2::ValuesMap message {{"role", role}, {"content", prompt}}; + jinja2::ValuesMap params = { + {"messages", jinja2::ValuesList({message})}, + {"bos_token", m_generation_config.bos_token}, + {"eos_token", m_generation_config.eos_token}, + {"add_generation_prompt", true}, + }; + + return tpl.RenderAsString(params).value(); +} + +void ov::genai::LLMPipeline::start_chat() { + m_pimpl->is_chat_conversation = true; +} + +void ov::genai::LLMPipeline::finish_chat() { + m_pimpl->is_chat_conversation = false; + reset_state(); +} + +void ov::genai::LLMPipeline::reset_state() { + m_pimpl->m_model_runner.reset_state(); +} + +void ov::genai::LLMPipeline::set_generation_config(const GenerationConfig& generation_config) { + m_pimpl->m_generation_config = generation_config; +} + +ov::genai::LLMPipeline::~LLMPipeline() = default; diff --git a/src/cpp/src/multinomial_decoding.cpp b/src/cpp/src/multinomial_decoding.cpp new file mode 100644 index 000000000..3dcdcdeb7 --- /dev/null +++ b/src/cpp/src/multinomial_decoding.cpp @@ -0,0 +1,262 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include +#include +#include + +#include "generation_config_helper.hpp" +#include "openvino/genai/llm_pipeline.hpp" +#include "utils.hpp" + + +namespace 
{ + +struct TokenIdScore { + int64_t id; + float score; + + bool operator<(const TokenIdScore& other) const { + return score < other.score; + } + + bool operator>(const TokenIdScore& other) const { + return score > other.score; + } +}; + +void apply_softmax_inplace(std::vector& tokens) { + float max_score = std::max_element(tokens.begin(), tokens.end())->score; + float sum = 0.f; + + for (auto& token : tokens) { + float s = std::exp(token.score - max_score); + token.score = s; + sum += s; + } + + float inv_sum = 1.f / sum; + + for (auto& token : tokens) { + token.score *= inv_sum; + } +} + +TokenIdScore* sample_top_p(TokenIdScore* first, TokenIdScore* last, float top_p) { + // sort score + std::sort(first, last, std::greater()); + + int tokens_size = last - first; + std::vector token_scores(tokens_size); + for (size_t i = 0; i < tokens_size; i++) { + token_scores[i] = first[i]; + } + + // calculate softmax + apply_softmax_inplace(token_scores); + + float prefix_sum = 0.0f; + + // top_p + for (size_t i = 0; i < tokens_size; i++) { + prefix_sum += token_scores[i].score; + if (prefix_sum >= top_p) { + return first + (i + 1); + } + } + + return last; +} + +void apply_repetition_penalty(float* first, float* last, const std::vector& input_ids, float penalty) { + const float inv_penalty = 1.f / penalty; + const int vocab_size = last - first; + std::vector occurrence(vocab_size, false); + for (const int64_t id : input_ids) { + if (!occurrence[id]) { + first[id] *= (first[id] > 0) ? inv_penalty : penalty; + } + occurrence[id] = true; + } +} + +void apply_inv_temperature(float* first, float* last, float inv_temperature) { + for (float* it = first; it != last; it++) { + *it *= inv_temperature; + } +} + +struct RandomSampling { + const size_t top_k; + const float top_p; + const float inv_temperature; + const float repetition_penalty; + + std::mt19937 gen{std::random_device{}()}; + + RandomSampling(ov::genai::GenerationConfig generation_config) + : top_k{generation_config.top_k}, + top_p{generation_config.top_p}, + inv_temperature{1.f / generation_config.temperature}, + repetition_penalty{generation_config.repetition_penalty} { + // parameters validation + OPENVINO_ASSERT(generation_config.top_k > 0, + "top_k must be a strictly positive, but got ", + generation_config.top_p); + OPENVINO_ASSERT(generation_config.top_p > 0 || generation_config.top_p < 1.0f, + "top_p must be a positive float > 0 and < 1, but got ", + generation_config.top_p); + OPENVINO_ASSERT(generation_config.temperature > 0, + "Temperature must be a strictly positive float, but got ", + generation_config.temperature); + OPENVINO_ASSERT(generation_config.repetition_penalty > 0, + "Repetition penalty must be a strictly positive float, but got ", + generation_config.repetition_penalty); + } + + TokenIdScore get_out_token(float* logits, size_t vocab_size, const std::vector& tokens) { + // logits pre-process + if (repetition_penalty != 1.0f) { + apply_repetition_penalty(logits, logits + vocab_size, tokens, repetition_penalty); + } + + if (inv_temperature != 1.0f) { + apply_inv_temperature(logits, logits + vocab_size, inv_temperature); + } + + std::vector token_scores(vocab_size); + for (size_t i = 0; i < vocab_size; i++) { + token_scores[i] = TokenIdScore{int64_t(i), logits[i]}; + } + + // top_k sampling + if (0 < top_k && top_k < token_scores.size()) { + std::nth_element(token_scores.data(), + token_scores.data() + top_k, + token_scores.data() + token_scores.size(), + std::greater()); + token_scores.resize(top_k); + } + + // top_p sampling 
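+ // (nucleus sampling: sample_top_p keeps the smallest prefix of the score-sorted tokens whose softmaxed probability mass reaches top_p; e.g. for probabilities {0.5, 0.3, 0.1, 0.1} and top_p = 0.8 only the first two tokens survive)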
+ if (0.f < top_p && top_p < 1.0f) { + auto pos = sample_top_p(token_scores.data(), token_scores.data() + token_scores.size(), top_p); + token_scores.resize(pos - token_scores.data()); + } + + // sample next token + apply_softmax_inplace(token_scores); + for (size_t i = 0; i < token_scores.size(); i++) { + logits[i] = token_scores[i].score; + } + + std::discrete_distribution<> dist(logits, logits + token_scores.size()); + return token_scores[dist(gen)]; + } +}; +} // namespace + +namespace ov { +namespace genai { + +ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner, + ov::Tensor input_ids, + ov::Tensor attention_mask, + ov::genai::GenerationConfig config, + std::shared_ptr streamer) { + ov::Shape prompts_shape = input_ids.get_shape(); + size_t batch_size = prompts_shape[0]; + + OPENVINO_ASSERT(batch_size == 1, "Only batch size = 1 supported for multinomial decoding"); + + size_t prompt_len = prompts_shape[1]; + + ov::genai::EncodedResults results; + results.scores.resize(batch_size, 0); + results.tokens.resize(batch_size); + + // Initialize inputs + m_model_runner.set_tensor("input_ids", input_ids); + m_model_runner.set_tensor("attention_mask", attention_mask); + + ov::Tensor position_ids = m_model_runner.get_tensor("position_ids"); + position_ids.set_shape(input_ids.get_shape()); + std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); + + // Input values are persistent between inference calls. + // That allows to set values, which aren't going to change, only once + m_model_runner.get_tensor("beam_idx").set_shape({batch_size}); + m_model_runner.get_tensor("beam_idx").data()[0] = 0; + + m_model_runner.infer(); + + auto logits_tensor = m_model_runner.get_tensor("logits"); + + int64_t sequence_offset = logits_tensor.get_shape().at(1) - 1; + size_t vocab_size = logits_tensor.get_shape().back(); + + float* logits = logits_tensor.data() + sequence_offset * vocab_size; + + const int64_t* input_ids_data = input_ids.data(); + + std::vector tokens{input_ids_data, input_ids_data + input_ids.get_size()}; + + RandomSampling sampling{config}; + + TokenIdScore out_token = sampling.get_out_token(logits, vocab_size, tokens); + + tokens.push_back(out_token.id); + results.tokens[0].push_back(out_token.id); + results.scores[0] += out_token.score; + + if (streamer) { + streamer->put(out_token.id); + } + + if (!config.ignore_eos && out_token.id == config.eos_token_id) { + return results; + } + + m_model_runner.get_tensor("input_ids").set_shape({batch_size, 1}); + m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1}); + + size_t max_new_tokens = config.get_max_new_tokens(prompt_len); + + for (size_t i = 0; i < max_new_tokens - 1; i++) { + ov::genai::utils::update_position_ids(m_model_runner.get_tensor("position_ids"), + m_model_runner.get_tensor("attention_mask")); + m_model_runner.set_tensor("attention_mask", + ov::genai::utils::extend_attention(m_model_runner.get_tensor("attention_mask"))); + + m_model_runner.get_tensor("input_ids").data()[0] = out_token.id; + + m_model_runner.infer(); + + logits = m_model_runner.get_tensor("logits").data(); + out_token = sampling.get_out_token(logits, vocab_size, tokens); + + tokens.push_back(out_token.id); + results.tokens[0].push_back(out_token.id); + results.scores[0] += out_token.score; + + if (streamer) { + streamer->put(out_token.id); + } + + if (!config.ignore_eos && out_token.id == config.eos_token_id) { + break; + } + } + + if (streamer) { + streamer->end(); + } + + return results; +} +} // 
namespace genai +} // namespace ov diff --git a/src/cpp/src/text_callback_streamer.cpp b/src/cpp/src/text_callback_streamer.cpp new file mode 100644 index 000000000..bb2bec09d --- /dev/null +++ b/src/cpp/src/text_callback_streamer.cpp @@ -0,0 +1,75 @@ +#include "text_callback_streamer.hpp" + +namespace ov { +namespace genai { + +TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback, bool print_eos_token) { + m_tokenizer = tokenizer; + m_print_eos_token = print_eos_token; + on_decoded_text_callback = callback; + m_enabled = true; +} + +TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, bool print_eos_token) { + m_tokenizer = tokenizer; + m_print_eos_token = print_eos_token; +} + +void TextCallbackStreamer::put(int64_t token) { + std::stringstream res; + // do nothing if token is met and if print_eos_token=false + if (!m_print_eos_token && token == m_tokenizer.get_eos_token_id()) + return; + + m_tokens_cache.push_back(token); + std::string text = m_tokenizer.decode(m_tokens_cache); + if (!text.empty() && '\n' == text.back()) { + // Flush the cache after the new line symbol + res << std::string_view{text.data() + print_len, text.size() - print_len}; + m_tokens_cache.clear(); + print_len = 0; + on_finalized_text(res.str()); + return; + } + if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { + // Don't print incomplete text + on_finalized_text(res.str()); + return; + } + res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; + print_len = text.size(); + on_finalized_text(res.str()); + return; +} + +void TextCallbackStreamer::end() { + std::stringstream res; + std::string text = m_tokenizer.decode(m_tokens_cache); + res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; + m_tokens_cache.clear(); + print_len = 0; + on_finalized_text(res.str()); +} + +void TextCallbackStreamer::set_tokenizer(Tokenizer tokenizer) { + this->m_tokenizer = tokenizer; +} + +void TextCallbackStreamer::set_callback(std::function callback) { + on_decoded_text_callback = callback; + m_enabled = true; +} + +void TextCallbackStreamer::set_callback() { + on_decoded_text_callback = [](std::string words){}; + m_enabled = false; +} + +void TextCallbackStreamer::on_finalized_text(const std::string& subword) { + if (m_enabled) { + on_decoded_text_callback(subword); + } +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text_callback_streamer.hpp b/src/cpp/src/text_callback_streamer.hpp new file mode 100644 index 000000000..3834dd01b --- /dev/null +++ b/src/cpp/src/text_callback_streamer.hpp @@ -0,0 +1,37 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include "openvino/genai/streamer_base.hpp" +#include "openvino/genai/tokenizer.hpp" + +namespace ov { +namespace genai { + +class TextCallbackStreamer: public StreamerBase { +public: + void put(int64_t token) override; + void end() override; + + TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback, bool print_eos_token = false); + TextCallbackStreamer(const Tokenizer& tokenizer, bool print_eos_token = false); + TextCallbackStreamer() = default; + ~TextCallbackStreamer() = default; + + void set_tokenizer(Tokenizer tokenizer); + void set_callback(std::function callback); + void set_callback(); + + std::function on_decoded_text_callback = [](std::string words){}; + bool m_enabled = false; + int64_t m_eos_token; +private: + bool m_print_eos_token = 
false; + Tokenizer m_tokenizer; + std::vector m_tokens_cache; + size_t print_len = 0; + void on_finalized_text(const std::string& subword); +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp new file mode 100644 index 000000000..11ca3d353 --- /dev/null +++ b/src/cpp/src/tokenizer.cpp @@ -0,0 +1,201 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "openvino/genai/tokenizer.hpp" +#include "utils.hpp" + +namespace { + +// todo: remove when openvino-tokenizers will support left padding +std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask, int64_t pad_token) { + const size_t batch_size = input_ids.get_shape()[0]; + const size_t sequence_length = input_ids.get_shape()[1]; + int64_t* inputs_data = input_ids.data(); + int64_t* attention_mask_data = attention_mask.data(); + + for (size_t batch = 0; batch < batch_size; batch++) { + const size_t batch_offset = batch * sequence_length; + + // last token in the sequence is not a PAD_TOKEN, skipping + if (inputs_data[batch_offset + sequence_length - 1] != pad_token) + continue; + + size_t pad_tokens_number = 0; + for (int i = sequence_length - 1; i >= 0; i--) { + const size_t token_offset = batch_offset + i; + + if (inputs_data[token_offset] == pad_token) + continue; + + if (pad_tokens_number == 0) + pad_tokens_number = sequence_length - i - 1; + + std::swap(inputs_data[token_offset], inputs_data[token_offset + pad_tokens_number]); + std::swap(attention_mask_data[token_offset], attention_mask_data[token_offset + pad_tokens_number]); + } + } + + return {input_ids, attention_mask}; +} + +} + +namespace ov { +namespace genai { + +class Tokenizer::TokenizerImpl { +public: + ov::InferRequest m_tokenize_request; + ov::InferRequest m_detokenizer_request; + int64_t m_pad_token_id = 0; + int64_t m_bos_token_id = 1; + int64_t m_eos_token_id = 2; + + TokenizerImpl() = default; + TokenizerImpl(std::string tokenizers_path, const std::string device, const std::string& ov_tokenizers_path) { + ov::Core core; + + if (ov::genai::utils::is_xml(tokenizers_path)) + OPENVINO_THROW("tokenizers_path should be a path to a dir not a xml file"); + + core.add_extension(ov_tokenizers_path); + std::shared_ptr tokenizer_model, detokenizer_model; + try { + tokenizer_model = core.read_model(tokenizers_path + "/openvino_tokenizer.xml"); + detokenizer_model = core.read_model(tokenizers_path + "/openvino_detokenizer.xml"); + } catch (...) { + OPENVINO_THROW("Cannot compile tokenizer and/or detokenizer. 
Please check that " + "openvino_tokenizer.xml and openvino_detokenizer.xml exist in \"" + tokenizers_path + "\""); + } + m_tokenize_request = core.compile_model(tokenizer_model, device).create_infer_request(); + m_detokenizer_request = core.compile_model(detokenizer_model, device).create_infer_request(); + + auto rt_info = tokenizer_model->get_rt_info(); + if (rt_info.count("eos_token_id") > 0) + m_eos_token_id = rt_info["eos_token_id"].as(); + if (rt_info.count("bos_token_id") > 0) + m_bos_token_id = rt_info["bos_token_id"].as(); + if (rt_info.count("pad_token_id") > 0) + m_pad_token_id = rt_info["pad_token_id"].as(); + } + + std::pair encode(std::string prompt) { + size_t batch_size = 1; + m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt}); + m_tokenize_request.infer(); + return {m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask")}; + } + + std::pair encode(std::vector& prompts) { + m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()}); + auto size_ = m_tokenize_request.get_input_tensor().get_shape(); + m_tokenize_request.infer(); + pad_left(m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask"), m_pad_token_id); + + // todo: fix mask filled with '2' instead of '0' + // https://github.com/openvinotoolkit/openvino_tokenizers/pull/90 should've fixed this + ov::Tensor attention_mask = m_tokenize_request.get_tensor("attention_mask"); + int64_t* attention_mask_data = attention_mask.data(); + std::replace(attention_mask_data, attention_mask_data + attention_mask.get_size(), 2, 0); + + return {m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask")}; + } + + std::string decode(std::vector tokens) { + size_t batch_size = 1; + m_detokenizer_request.set_input_tensor(ov::Tensor{ov::element::i64, {batch_size, tokens.size()}, tokens.data()}); + m_detokenizer_request.infer(); + return m_detokenizer_request.get_output_tensor().data()[0]; + } + + std::vector decode(ov::Tensor tokens) { + m_detokenizer_request.set_input_tensor(tokens); + auto shape = tokens.get_shape(); + auto data = tokens.data(); + m_detokenizer_request.infer(); + auto res = m_detokenizer_request.get_output_tensor(); + + std::vector strings; + for (int i = 0; i < res.get_shape()[0]; ++i) { + strings.emplace_back(res.data()[i]); + } + return strings; + } + + std::vector decode(std::vector> lines) { + // todo: implement calling detokenizer in a single batch + std::vector results; + for (auto& line: lines){ + ov::Tensor tokens = ov::Tensor{ov::element::i64, {1, line.size()}, line.data()}; + m_detokenizer_request.set_input_tensor(tokens); + m_detokenizer_request.infer(); + auto res = m_detokenizer_request.get_output_tensor(); + auto res_str = res.data()[0]; + results.emplace_back(res_str); + } + + return results; + } +}; + +Tokenizer::Tokenizer(const std::string& tokenizers_path, const std::string& device, const std::string& ov_tokenizers_path) { + m_pimpl = std::make_shared(tokenizers_path, device, ov_tokenizers_path); +} + +std::pair Tokenizer::encode(const std::string prompt) { + return m_pimpl->encode(std::move(prompt)); +} + +std::pair Tokenizer::encode(std::vector& prompts) { + return m_pimpl->encode(prompts); +} + +std::pair Tokenizer::encode(std::vector&& prompts) { + return m_pimpl->encode(prompts); +} + +std::pair Tokenizer::encode(std::initializer_list& text) { + return encode(std::vector(text.begin(), text.end())); +} + 
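The encode()/decode() wrappers above are enough for a stand-alone round trip. A minimal sketch, not part of this patch, assuming the library is built; the model directory and the openvino_tokenizers extension library path are placeholders, and the three-argument constructor form is the one shown in this patch:

    #include <iostream>
    #include <string>
    #include <vector>

    #include "openvino/genai/tokenizer.hpp"

    int main() {
        // Directory with openvino_tokenizer.xml / openvino_detokenizer.xml,
        // plus the openvino_tokenizers extension library (placeholder paths).
        ov::genai::Tokenizer tok("./TinyLlama-1.1B-Chat-v1.0/", "CPU",
                                 "./libopenvino_tokenizers.so");

        // encode() returns an {input_ids, attention_mask} pair of ov::Tensor.
        auto [input_ids, attention_mask] = tok.encode(std::string{"Why is the Sun yellow?"});
        (void)attention_mask;  // not needed for a plain round trip

        // Copy the ids out of the tensor and decode them back to text.
        const int64_t* data = input_ids.data<int64_t>();
        std::vector<int64_t> ids(data, data + input_ids.get_size());
        std::cout << ids.size() << " tokens, eos id " << tok.get_eos_token_id() << '\n';
        std::cout << tok.decode(ids) << '\n';
        return 0;
    }
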
+std::string Tokenizer::decode(std::vector tokens) { + return m_pimpl->decode(tokens); +} + +std::vector Tokenizer::decode(ov::Tensor tokens) { + return m_pimpl->decode(tokens); +} + +std::vector Tokenizer::decode(std::vector> lines) { + return m_pimpl->decode(lines); +} + +int64_t Tokenizer::get_bos_token_id() const { + return m_pimpl->m_bos_token_id; +} + +int64_t Tokenizer::get_eos_token_id() const { + return m_pimpl->m_eos_token_id; +} + +int64_t Tokenizer::get_pad_token_id() const { + return m_pimpl->m_pad_token_id; +} + +void Tokenizer::set_pad_token_id(int64_t pad_token_id) { + m_pimpl->m_pad_token_id = pad_token_id; +} + +void Tokenizer::set_bos_token_id(int64_t bos_token_id) { + m_pimpl->m_bos_token_id = bos_token_id; +} + +void Tokenizer::set_eos_token_id(int64_t eos_token_id) { + m_pimpl->m_eos_token_id = eos_token_id; +} + +Tokenizer::~Tokenizer() = default; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp new file mode 100644 index 000000000..8111dc5c9 --- /dev/null +++ b/src/cpp/src/utils.cpp @@ -0,0 +1,151 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "utils.hpp" + +namespace ov { +namespace genai { +namespace utils { + +Tensor init_attention_mask(Tensor& position_ids) { + auto shape = position_ids.get_shape(); + auto attention_mask = ov::Tensor{position_ids.get_element_type(), shape}; + std::fill_n(attention_mask.data(), shape[0] * shape[1], 1); + return attention_mask; +} + +void print_tensor(const ov::Tensor& tensor) { + std::vector res; + + auto t_shape = tensor.get_shape(); + std::cout << "["; + for (size_t i = 0; i < t_shape[0]; ++i) { + std::cout << "|"; + for (size_t j = 0; j < t_shape[1]; ++j) { + if (tensor.get_element_type() == ov::element::i64) { + res.emplace_back(tensor.data()[t_shape[1] * i + j]); + std::cout << tensor.data()[t_shape[1] * i + j] << " "; + } + } + std::cout << "|"; + } + std::cout << "]" << std::endl; +} + +bool is_xml(const std::string& path) { return path.compare(path.length() - 4, 4, ".xml") == 0;} + +std::pair softmax(const ov::Tensor& logits, const size_t batch_idx) { + if (logits.get_shape()[0] <= batch_idx) { + OPENVINO_THROW("logits batch size doesn't match the number of beams"); + } + + size_t vocab_size = logits.get_shape().back(); + size_t batch_offset = batch_idx * logits.get_shape()[1] * vocab_size; + size_t sequence_offset = (logits.get_shape()[1] - 1) * vocab_size; + const float* logits_data = logits.data() + batch_offset + sequence_offset; + + int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data; + float max_logit = logits_data[out_token]; + + float log_sum = std::log( + std::accumulate(logits_data, logits_data + vocab_size, 0.0f, [max_logit](float accumulated, float to_add) { + return accumulated + std::exp(to_add - max_logit); + })); + return {out_token, log_sum}; +} + +void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask, int64_t start_pos) { + const size_t batch_size = attention_mask.get_shape()[0]; + const size_t seq_length = attention_mask.get_shape()[1]; + + const int64_t* attention_mask_data = attention_mask.data(); + int64_t* position_ids_data = position_ids.data(); + + for (size_t batch = 0; batch < batch_size; batch++) { + size_t sum = start_pos; + for (size_t i = 0; i < seq_length; i++) { + const size_t element_offset = batch * seq_length + i; + position_ids_data[element_offset] = sum; + if (attention_mask_data[element_offset] == 1) { + sum 
+= 1; + } + } + } +} + +void initialize_beam_inputs(const ov::Tensor& input_ids, const ov::Tensor& attention_mask, ov::InferRequest& request) { + request.set_tensor("input_ids", input_ids); + request.set_tensor("attention_mask", attention_mask); + + ov::Shape input_shape = input_ids.get_shape(); + + ov::Tensor position_ids = request.get_tensor("position_ids"); + position_ids.set_shape(input_shape); + initialize_position_ids(position_ids, attention_mask); + + ov::Tensor beam_idx = request.get_tensor("beam_idx"); + beam_idx.set_shape({input_shape.at(0)}); + std::fill_n(beam_idx.data(), input_shape.at(0), 0); +} + + +void set_attention_mask(ov::Tensor&& attention_mask, std::vector next_beams) { + ov::Tensor original_mask{ov::element::i64, attention_mask.get_shape()}; + ov::Shape original_shape = original_mask.get_shape(); + attention_mask.copy_to(original_mask); + + ov::Shape new_shape{next_beams.size(), original_mask.get_shape().at(1) + 1}; + attention_mask.set_shape(new_shape); + + for (size_t beam_id = 0; beam_id < next_beams.size(); beam_id++) { + const size_t original_prompt_offset = next_beams.at(beam_id) * original_shape.at(1); + const size_t result_prompt_offset = beam_id * new_shape.at(1); + + int64_t* dest = attention_mask.data() + result_prompt_offset; + const int64_t* src = original_mask.data() + original_prompt_offset; + + std::memcpy(dest, src, original_shape.at(1) * sizeof(int64_t)); + attention_mask.data()[result_prompt_offset + new_shape.at(1) - 1] = 1; + } +} + +/** + * Set position ids tensor data for next token inference based on provided attention mask + * Supports multi batch + * Supports sparse attention_mask + */ +void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention_mask) { + const size_t batch_size = attention_mask.get_shape().at(0); + const size_t atten_length = attention_mask.get_shape().at(1); + position_ids.set_shape({batch_size, 1}); + + for (size_t batch = 0; batch < batch_size; batch++) { + int64_t* start = attention_mask.data() + batch * atten_length; + // todo: be careful with start + atten_length, probably need to replace with start + atten_length -1 + position_ids.data()[batch] = std::accumulate(start, start + atten_length, 0); + } +} + +/** + * Get attention mask tensor for next token inference + * Supports multi batch + * Supports sparse attention_mask + */ +ov::Tensor extend_attention(ov::Tensor attention_mask) { + auto shape = attention_mask.get_shape(); + auto batch_size = shape[0]; + auto seq_len = shape[1]; + + ov::Tensor new_atten_mask = ov::Tensor{attention_mask.get_element_type(), {batch_size, seq_len + 1}}; + auto old_data = attention_mask.data(); + auto new_data = new_atten_mask.data(); + for (size_t batch = 0; batch < batch_size; ++batch) { + std::memcpy(new_data + batch * (seq_len + 1), old_data + batch * seq_len, seq_len * sizeof(int64_t)); + new_data[batch * (seq_len + 1) + seq_len] = 1; + } + return new_atten_mask; +} + +} // namespace utils +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp new file mode 100644 index 000000000..4559a8962 --- /dev/null +++ b/src/cpp/src/utils.hpp @@ -0,0 +1,65 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include + +namespace ov { +namespace genai { +namespace utils { + +Tensor init_attention_mask(Tensor& position_ids); + +void print_tensor(const ov::Tensor& tensor); + +std::pair softmax(const ov::Tensor& logits, const size_t batch_idx); + +void 
initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask, int64_t start_pos = 0); + +ov::Tensor extend_attention(ov::Tensor attention_mask); + +void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention_mask); + +bool is_xml(const std::string& path); + +template +struct json_type_traits {}; + +template <> +struct json_type_traits { static constexpr auto json_value_t = nlohmann::json::value_t::number_integer; }; + +template <> +struct json_type_traits { static constexpr auto json_value_t = nlohmann::json::value_t::number_integer; }; + +template <> +struct json_type_traits { static constexpr auto json_value_t = nlohmann::json::value_t::number_unsigned; }; + +template <> +struct json_type_traits { static constexpr auto json_value_t = nlohmann::json::value_t::number_float; }; + +template <> +struct json_type_traits { static constexpr auto json_value_t = nlohmann::json::value_t::string; }; + +template <> +struct json_type_traits { static constexpr auto json_value_t = nlohmann::json::value_t::boolean; }; + +template +void read_json_param(const nlohmann::json& data, const std::string& name, T& param) { + if (data.contains(name) && data[name].type() == json_type_traits::json_value_t) { + param = data[name]; + } +} + +template +void read_anymap_param(const ov::AnyMap& config_map, const std::string& name, T& param) { + if (config_map.count(name)) { + param = config_map.at(name).as(); + } +} + +} // namespace utils +} // namespace genai +} // namespace ov + diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt new file mode 100644 index 000000000..d64b6c61f --- /dev/null +++ b/src/python/CMakeLists.txt @@ -0,0 +1,52 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +include(FetchContent) +FetchContent_Declare( + pybind11 + URL https://github.com/pybind/pybind11/archive/refs/tags/v2.12.0.tar.gz + URL_HASH SHA256=bf8f242abd1abcd375d516a7067490fb71abd79519a282d22b6e4d19282185a7 +) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) +FetchContent_GetProperties(pybind11) +if(NOT pybind11_POPULATED) + FetchContent_Populate(pybind11) + add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR}) +endif() + +pybind11_add_module(py_generate_pipeline py_generate_pipeline.cpp) +target_link_libraries(py_generate_pipeline PRIVATE openvino::genai) +set_target_properties(py_generate_pipeline PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" +) +file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/__init__.py" DESTINATION "${CMAKE_BINARY_DIR}/openvino_genai/") +write_file("${CMAKE_BINARY_DIR}/openvino_genai/__version__.py" "__version__ = \"${CMAKE_PROJECT_VERSION}\"") + +# setting RPATH / LC_RPATH depending on platform +if(LINUX) + # to find libopenvino_genai.so in the same folder + set(rpaths "$ORIGIN") +elseif(APPLE) + # to find libopenvino_genai.dylib in the same folder + set(rpaths "@loader_path") + if(DEFINED SKBUILD) + # in case we build pip package, we need to refer to libopenvino.dylib from 'openvino' package + list(APPEND rpaths "@loader_path/../openvino/libs") + endif() +endif() + +if(rpaths) + set_target_properties(py_generate_pipeline PROPERTIES INSTALL_RPATH "${rpaths}") +endif() + +find_package(Python3 REQUIRED COMPONENTS Interpreter Development) +install(FILES "${CMAKE_BINARY_DIR}/openvino_genai/__init__.py" "${CMAKE_BINARY_DIR}/openvino_genai/__version__.py" DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR}) +install(TARGETS 
genai py_generate_pipeline LIBRARY DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR}) + +# wheel_genai component is used for wheel generation in pyproject.toml. +# Exclude wheel_genai from normal packaging because there's pygenai_X_Y component for that. +install(TARGETS genai py_generate_pipeline + LIBRARY DESTINATION . COMPONENT wheel_genai + RUNTIME DESTINATION . COMPONENT wheel_genai + EXCLUDE_FROM_ALL) diff --git a/src/python/openvino_genai/__init__.py b/src/python/openvino_genai/__init__.py new file mode 100644 index 000000000..e069157fa --- /dev/null +++ b/src/python/openvino_genai/__init__.py @@ -0,0 +1,14 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import openvino # add_dll_directory for openvino lib +import os +from .__version__ import __version__ + + +if hasattr(os, "add_dll_directory"): + os.add_dll_directory(os.path.dirname(__file__)) + +from .py_generate_pipeline import LLMPipeline, Tokenizer, GenerationConfig, DecodedResults, EncodedResults + +__all__ = ['LLMPipeline', 'Tokenizer', 'GenerationConfig', 'DecodedResults', 'EncodedResults'] diff --git a/src/python/openvino_genai/__version__.py b/src/python/openvino_genai/__version__.py new file mode 100644 index 000000000..79da913d6 --- /dev/null +++ b/src/python/openvino_genai/__version__.py @@ -0,0 +1,2 @@ +# Will be overwritten by pyproject.toml or cmake. +__version__ = "0.0.0.0" diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp new file mode 100644 index 000000000..d1f8c5b3c --- /dev/null +++ b/src/python/py_generate_pipeline.cpp @@ -0,0 +1,225 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include "openvino/genai/llm_pipeline.hpp" + +#ifdef _WIN32 +# include +# define MAX_ABS_PATH _MAX_PATH +# define get_absolute_path(result, path) _fullpath(result, path.c_str(), MAX_ABS_PATH) +#else +# include +# include +# define MAX_ABS_PATH PATH_MAX +# define get_absolute_path(result, path) realpath(path.c_str(), result) +namespace { +std::string get_absolute_file_path(const std::string& path) { + std::string absolutePath; + absolutePath.resize(MAX_ABS_PATH); + std::ignore = get_absolute_path(&absolutePath[0], path); + if (!absolutePath.empty()) { + // on Linux if file does not exist or no access, function will return NULL, but + // `absolutePath` will contain resolved path + absolutePath.resize(absolutePath.find('\0')); + return std::string(absolutePath); + } + std::stringstream ss; + ss << "Can't get absolute file path for [" << path << "], err = " << strerror(errno); + throw std::runtime_error(ss.str()); +} +} +#endif + +namespace py = pybind11; +using ov::genai::LLMPipeline; +using ov::genai::Tokenizer; +using ov::genai::GenerationConfig; +using ov::genai::EncodedResults; +using ov::genai::DecodedResults; +using ov::genai::StopCriteria; +using ov::genai::StreamerBase; + +namespace { +void str_to_stop_criteria(GenerationConfig& config, const std::string& stop_criteria_str){ + if (stop_criteria_str == "early") config.stop_criteria = StopCriteria::early; + else if (stop_criteria_str == "never") config.stop_criteria = StopCriteria::never; + else if (stop_criteria_str == "heuristic") config.stop_criteria = StopCriteria::heuristic; + else OPENVINO_THROW(stop_criteria_str + " is incorrect value of stop_criteria. " + "Allowed values are: \"early\", \"never\", \"heuristic\". 
"); +} + +std::string stop_criteria_to_str(const GenerationConfig& config) { + switch (config.stop_criteria) { + case StopCriteria::early: return "early"; + case StopCriteria::heuristic: return "heuristic"; + case StopCriteria::never: return "never"; + default: throw std::runtime_error("Incorrect stop_criteria"); + } +} + +void update_config_from_kwargs(GenerationConfig& config, const py::kwargs& kwargs) { + if (kwargs.contains("max_new_tokens")) config.max_new_tokens = kwargs["max_new_tokens"].cast(); + if (kwargs.contains("max_length")) config.max_length = kwargs["max_length"].cast(); + if (kwargs.contains("ignore_eos")) config.ignore_eos = kwargs["ignore_eos"].cast(); + if (kwargs.contains("num_beam_groups")) config.num_beam_groups = kwargs["num_beam_groups"].cast(); + if (kwargs.contains("num_beams")) config.num_beams = kwargs["num_beams"].cast(); + if (kwargs.contains("diversity_penalty")) config.diversity_penalty = kwargs["diversity_penalty"].cast(); + if (kwargs.contains("length_penalty")) config.length_penalty = kwargs["length_penalty"].cast(); + if (kwargs.contains("num_return_sequences")) config.num_return_sequences = kwargs["num_return_sequences"].cast(); + if (kwargs.contains("no_repeat_ngram_size")) config.no_repeat_ngram_size = kwargs["no_repeat_ngram_size"].cast(); + if (kwargs.contains("stop_criteria")) str_to_stop_criteria(config, kwargs["stop_criteria"].cast()); + if (kwargs.contains("temperature")) config.temperature = kwargs["temperature"].cast(); + if (kwargs.contains("top_p")) config.top_p = kwargs["top_p"].cast(); + if (kwargs.contains("top_k")) config.top_k = kwargs["top_k"].cast(); + if (kwargs.contains("do_sample")) config.do_sample = kwargs["do_sample"].cast(); + if (kwargs.contains("repetition_penalty")) config.repetition_penalty = kwargs["repetition_penalty"].cast(); + if (kwargs.contains("pad_token_id")) config.pad_token_id = kwargs["pad_token_id"].cast(); + if (kwargs.contains("bos_token_id")) config.bos_token_id = kwargs["bos_token_id"].cast(); + if (kwargs.contains("eos_token_id")) config.eos_token_id = kwargs["eos_token_id"].cast(); + if (kwargs.contains("eos_token")) config.eos_token = kwargs["eos_token"].cast(); + if (kwargs.contains("bos_token")) config.bos_token = kwargs["bos_token"].cast(); +} + +// operator() and generate methods are identical, operator() is just an alias for generate +std::string call_with_kwargs(LLMPipeline& pipeline, const std::string& text, const py::kwargs& kwargs) { + // Create a new GenerationConfig instance and initialize from kwargs + GenerationConfig config = pipeline.get_generation_config(); + update_config_from_kwargs(config, kwargs); + return pipeline(text, config); +} + +std::string call_with_config(LLMPipeline& pipe, const std::string& text, const GenerationConfig& config) { + std::shared_ptr streamer; + return pipe(text, config); +} + +std::filesystem::path with_openvino_tokenizers(const std::filesystem::path& path) { +#ifdef _WIN32 + constexpr char tokenizers[] = "openvino_tokenizers.dll"; +#elif __linux__ + constexpr char tokenizers[] = "libopenvino_tokenizers.so"; +#elif __APPLE__ + constexpr char tokenizers[] = "libopenvino_tokenizers.dylib"; +#endif + return path.parent_path() / tokenizers; +} + +std::string get_ov_genai_bindings_path() { +#ifdef _WIN32 + CHAR genai_library_path[MAX_PATH]; + HMODULE hm = NULL; + if (!GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, + reinterpret_cast(get_ov_genai_bindings_path), + &hm)) { + std::stringstream ss; + ss << 
"GetModuleHandle returned " << GetLastError(); + throw std::runtime_error(ss.str()); + } + GetModuleFileNameA(hm, (LPSTR)genai_library_path, sizeof(genai_library_path)); + return std::string(genai_library_path); +#elif defined(__APPLE__) || defined(__linux__) || defined(__EMSCRIPTEN__) + Dl_info info; + dladdr(reinterpret_cast(get_ov_genai_bindings_path), &info); + return get_absolute_file_path(info.dli_fname).c_str(); +#else +# error "Unsupported OS" +#endif // _WIN32 +} + +std::string ov_tokenizers_module_path() { + // Try a path relative to build artifacts folder first. + std::filesystem::path from_library = with_openvino_tokenizers(get_ov_genai_bindings_path()); + if (std::filesystem::exists(from_library)) { + return from_library.string(); + } + return py::str(py::module_::import("openvino_tokenizers").attr("_ext_path")); +} +} + +PYBIND11_MODULE(py_generate_pipeline, m) { + m.doc() = "Pybind11 binding for LLM Pipeline"; + + py::class_(m, "LLMPipeline") + .def(py::init(), + py::arg("model_path"), py::arg("tokenizer"), py::arg("device") = "CPU", + py::arg("plugin_config") = ov::AnyMap{}) + .def(py::init(), + py::arg("path"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap{}, py::arg("ov_tokenizers_path") = ov_tokenizers_module_path()) + .def("__call__", py::overload_cast(&call_with_kwargs)) + .def("__call__", py::overload_cast(&call_with_config)) + .def("generate", py::overload_cast(&call_with_kwargs)) + .def("generate", py::overload_cast(&call_with_config)) + + // todo: if input_ids is a ov::Tensor/numpy tensor + // todo: implement calling generate/operator() with StreamerBase or lambda streamer + // signature to be implemented: + // EncodedResults generate(ov::Tensor input_ids, + // std::optional attention_mask, + // OptionalGenerationConfig generation_config=nullopt, + // OptionalStreamerVariant streamer=nullopt); + + + .def("get_tokenizer", &LLMPipeline::get_tokenizer) + .def("start_chat", &LLMPipeline::start_chat) + .def("finish_chat", &LLMPipeline::finish_chat) + .def("reset_state", &LLMPipeline::reset_state) + .def("get_generation_config", &LLMPipeline::get_generation_config, py::return_value_policy::copy) + .def("set_generation_config", &LLMPipeline::set_generation_config) + .def("apply_chat_template", &LLMPipeline::apply_chat_template); + + // Binding for Tokenizer + py::class_(m, "Tokenizer") + .def(py::init<>()) + .def(py::init(), + py::arg("tokenizers_path"), + py::arg("device") = "CPU", + py::arg("ov_tokenizers_path") = py::str(ov_tokenizers_module_path())) + + // todo: implement encode/decode when for numpy inputs and outputs + .def("encode", py::overload_cast(&Tokenizer::encode), "Encode a single prompt") + // TODO: common.h(1106...) 
template argument deduction/substitution failed: + // .def("encode", py::overload_cast&>(&Tokenizer::encode), "Encode multiple prompts") + .def("decode", py::overload_cast>(&Tokenizer::decode), "Decode a list of tokens") + .def("decode", py::overload_cast(&Tokenizer::decode), "Decode a tensor of tokens") + .def("decode", py::overload_cast>>(&Tokenizer::decode), "Decode multiple lines of tokens"); + + // Binding for GenerationConfig + py::class_(m, "GenerationConfig") + .def(py::init<>()) + .def(py::init()) + .def_readwrite("max_new_tokens", &GenerationConfig::max_new_tokens) + .def_readwrite("max_length", &GenerationConfig::max_length) + .def_readwrite("ignore_eos", &GenerationConfig::ignore_eos) + .def_readwrite("num_beam_groups", &GenerationConfig::num_beam_groups) + .def_readwrite("num_beams", &GenerationConfig::num_beams) + .def_readwrite("diversity_penalty", &GenerationConfig::diversity_penalty) + .def_readwrite("length_penalty", &GenerationConfig::length_penalty) + .def_readwrite("num_return_sequences", &GenerationConfig::num_return_sequences) + .def_readwrite("no_repeat_ngram_size", &GenerationConfig::no_repeat_ngram_size) + .def_property("stop_criteria", &stop_criteria_to_str, &str_to_stop_criteria) + .def_readwrite("temperature", &GenerationConfig::temperature) + .def_readwrite("top_p", &GenerationConfig::top_p) + .def_readwrite("top_k", &GenerationConfig::top_k) + .def_readwrite("do_sample", &GenerationConfig::do_sample) + .def_readwrite("repetition_penalty", &GenerationConfig::repetition_penalty) + .def_readwrite("pad_token_id", &GenerationConfig::pad_token_id) + .def_readwrite("bos_token_id", &GenerationConfig::bos_token_id) + .def_readwrite("eos_token_id", &GenerationConfig::eos_token_id) + .def_readwrite("eos_token", &GenerationConfig::eos_token) + .def_readwrite("bos_token", &GenerationConfig::bos_token); + + py::class_(m, "DecodedResults") + .def(py::init<>()) + .def_readwrite("texts", &DecodedResults::texts) + .def_readwrite("scores", &DecodedResults::scores); + + py::class_(m, "EncodedResults") + .def(py::init<>()) + .def_readwrite("tokens", &EncodedResults::tokens) + .def_readwrite("scores", &EncodedResults::scores); + +} diff --git a/tests/python_tests/list_test_models.py b/tests/python_tests/list_test_models.py new file mode 100644 index 000000000..a24a4fd13 --- /dev/null +++ b/tests/python_tests/list_test_models.py @@ -0,0 +1,24 @@ +def models_list(): + model_ids = [ + ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "TinyLlama-1.1B-Chat-v1.0"), + # ("google/gemma-2b-it", "gemma-2b-it"), + # ("google/gemma-7b-it", "gemma-7b-it"), + # ("meta-llama/Llama-2-7b-chat-hf", "Llama-2-7b-chat-hf"), + # ("meta-llama/Llama-2-13b-chat-hf", "Llama-2-13b-chat-hf"), + # ("openlm-research/open_llama_3b", "open_llama_3b"), + # ("openlm-research/open_llama_7b", "open_llama_7b"), + # ("databricks/dolly-v2-3b", "dolly-v2-3b"), + # ("databricks/dolly-v2-12b", "dolly-v2-12b"), + # ("mistralai/Mistral-7B-v0.1", "Mistral-7B-v0.1"), + # ("ikala/redpajama-3b-chat", "redpajama-3b-chat"), + # ("microsoft/phi-1_5", "phi-1_5/"), + # ("Qwen/Qwen1.5-7B-Chat", "Qwen1.5-7B-Chat"), + ] + import os + prefix = os.getenv('GENAI_MODELS_PATH_PREFIX', '') + return [(model_id, os.path.join(prefix, model_path)) for model_id, model_path in model_ids] + + +if __name__ == "__main__": + for model_id, model_path in models_list(): + print(model_id, model_path) diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt new file mode 100644 index 000000000..e536fd531 --- /dev/null +++ 
b/tests/python_tests/requirements.txt @@ -0,0 +1,4 @@ +pytest +transformers +torch +optimum-intel[openvino] @ git+https://github.com/huggingface/optimum-intel.git@fb1b35bef23242d65b2fb057c4a7ac78a7cfd4c3 \ No newline at end of file diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py new file mode 100644 index 000000000..5ac977f1b --- /dev/null +++ b/tests/python_tests/test_generate_api.py @@ -0,0 +1,116 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import pytest +from list_test_models import models_list + + +@pytest.fixture(scope="module", params=models_list()) +def model_fixture(request): + model_id, path = request.param + from transformers import AutoTokenizer, AutoModelForCausalLM + tokenizer = AutoTokenizer.from_pretrained(model_id) + model = AutoModelForCausalLM.from_pretrained(model_id) + return model_id, path, tokenizer, model + +def run_hf_ov_genai_comparison(model_fixture, generation_config, prompt): + import openvino_genai as ov_genai + model_id, path, tokenizer, model = model_fixture + + generation_config_hf = generation_config.copy() + # in OpenVINO GenAI this parameter is called stop_criteria, + # while in HF it's called early_stopping. + # HF values True, False and "never" correspond to OV GenAI values "early", "heuristic" and "never" + if generation_config_hf.get('stop_criteria'): + generation_config_hf['early_stopping'] = stop_criteria_map()[generation_config_hf.pop('stop_criteria')] + + encoded_prompt = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=True) + hf_encoded_output = model.generate(encoded_prompt, **generation_config_hf) + hf_output = tokenizer.decode(hf_encoded_output[0, encoded_prompt.shape[1]:]) + + device = 'CPU' + # pipe = ov_genai.LLMPipeline(path, device) + + pipe = ov_genai.LLMPipeline(path, device) + + ov_output = pipe.generate(prompt, **generation_config) + + if hf_output != ov_output: + print(f'hf_output: {hf_output}') + print(f'ov_output: {ov_output}') + + assert hf_output == ov_output + + +def stop_criteria_map(): + return {"never": "never", "early": True, "heuristic": False} + +test_cases = [ + (dict(max_new_tokens=20, do_sample=False), 'table is made of'), # generation_config, prompt + (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=20, diversity_penalty=1.0), 'table is made of'), + # (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=20, diversity_penalty=1.0), 'Alan Turing was a'), + # (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=30, diversity_penalty=1.0), 'Alan Turing was a'), + # (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'table is made of'), + # (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'The Sun is yellow because'), + # (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.5), 'The Sun is yellow because'), +] +@pytest.mark.parametrize("generation_config,prompt", test_cases) +def test_greedy_decoding(model_fixture, generation_config, prompt): + run_hf_ov_genai_comparison(model_fixture, generation_config, prompt) + + +prompts = ['The Sun is yellow because', 'Alan Turing was a', 'table is made of'] +@pytest.mark.parametrize("num_beam_groups", [2, 3, 8]) +@pytest.mark.parametrize("group_size", [5, 3, 10]) +@pytest.mark.parametrize("max_new_tokens", [20, 15]) 
+@pytest.mark.parametrize("diversity_penalty", [1.0, 1.5]) +@pytest.mark.parametrize("prompt", prompts) +@pytest.mark.skip # temporarily +def test_beam_search_decoding(model_fixture, num_beam_groups, group_size, + max_new_tokens, diversity_penalty, prompt): + generation_config = dict( + num_beam_groups=num_beam_groups, + num_beams=num_beam_groups * group_size, + diversity_penalty=diversity_penalty, + num_return_sequences=num_beam_groups * group_size, + max_new_tokens=max_new_tokens, + ) + run_hf_ov_genai_comparison(model_fixture, generation_config, prompt) + + +@pytest.mark.parametrize("stop_criteria", ["never", "early", "heuristic"]) +@pytest.mark.parametrize("prompt", prompts) +@pytest.mark.parametrize("max_new_tokens", [20, 40, 300]) +@pytest.mark.skip # temporarily +def test_stop_criteria(model_fixture, stop_criteria, prompt, max_new_tokens): + # todo: for long sentences early stop_criteria fails + if (stop_criteria == 'early' and max_new_tokens >= 300): + pytest.skip() + generation_config = dict( + num_beam_groups=2, + num_beams=2 * 3, + diversity_penalty=1.0, + num_return_sequences=2 * 3, + max_new_tokens=max_new_tokens, + stop_criteria=stop_criteria, + ) + run_hf_ov_genai_comparison(model_fixture, generation_config, prompt) + + +# test long sequences +@pytest.mark.parametrize("num_beam_groups", [2]) +@pytest.mark.parametrize("group_size", [5]) +@pytest.mark.parametrize("max_new_tokens", [800, 2000]) +@pytest.mark.parametrize("diversity_penalty", [1.0]) +@pytest.mark.parametrize("prompt", prompts) +@pytest.mark.skip # will be enabled in nightly since are computationally expensive +def test_beam_search_long_sentences(model_fixture, num_beam_groups, group_size, + max_new_tokens, diversity_penalty, prompt): + generation_config = dict( + num_beam_groups=num_beam_groups, + num_beams=num_beam_groups * group_size, + diversity_penalty=1.0, + num_return_sequences=num_beam_groups * group_size, + max_new_tokens=max_new_tokens, + ) + run_hf_ov_genai_comparison(model_fixture, generation_config, prompt) diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt index 6da39c6ab..afdd4c48b 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -4,25 +4,29 @@ cmake_minimum_required(VERSION 3.15) project(causal_lm) -add_subdirectory(../../../thirdparty/openvino_tokenizers/ "${CMAKE_CURRENT_BINARY_DIR}/openvino_tokenizers/") +if(TARGET openvino_tokenizers) + set(OPENVINO_TOKENIZERS_PATH $) +else() + set(OPENVINO_TOKENIZERS_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../bin/openvino_tokenizers.dll) # TODO: I'll go away after the generate() gets a way to find openvino_tokenizers +endif() + +find_package(OpenVINOGenAI REQUIRED PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. 
+) add_executable(greedy_causal_lm greedy_causal_lm.cpp) -target_compile_definitions(greedy_causal_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") -find_package(OpenVINO REQUIRED COMPONENTS Runtime) -target_link_libraries(greedy_causal_lm PRIVATE openvino::runtime) +target_link_libraries(greedy_causal_lm PRIVATE openvino::genai) set_target_properties(greedy_causal_lm PROPERTIES CXX_STANDARD 17) set_target_properties(greedy_causal_lm PROPERTIES CXX_STANDARD_REQUIRED ON) add_executable(beam_search_causal_lm beam_search_causal_lm.cpp) -target_compile_definitions(beam_search_causal_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") -target_include_directories(beam_search_causal_lm PRIVATE ./) -find_package(OpenVINO REQUIRED COMPONENTS Runtime) -target_link_libraries(beam_search_causal_lm PRIVATE openvino::runtime) +target_link_libraries(beam_search_causal_lm PRIVATE openvino::genai) set_target_properties(beam_search_causal_lm PROPERTIES CXX_STANDARD 17) set_target_properties(beam_search_causal_lm PROPERTIES CXX_STANDARD_REQUIRED ON) add_executable(speculative_decoding_lm speculative_decoding_lm.cpp) -target_compile_definitions(speculative_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") +target_compile_definitions(speculative_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH="${OPENVINO_TOKENIZERS_PATH}") target_include_directories(speculative_decoding_lm PRIVATE ./) find_package(OpenVINO REQUIRED COMPONENTS Runtime) target_link_libraries(speculative_decoding_lm PRIVATE openvino::runtime) @@ -32,7 +36,7 @@ find_package(TBB REQUIRED COMPONENTS tbb) target_link_libraries(speculative_decoding_lm PRIVATE TBB::tbb) add_executable(prompt_lookup_decoding_lm prompt_lookup_decoding_lm.cpp) -target_compile_definitions(prompt_lookup_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH=\"$\") +target_compile_definitions(prompt_lookup_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH="${OPENVINO_TOKENIZERS_PATH}") target_include_directories(prompt_lookup_decoding_lm PRIVATE ./) find_package(OpenVINO REQUIRED COMPONENTS Runtime) target_link_libraries(prompt_lookup_decoding_lm PRIVATE openvino::runtime) @@ -40,3 +44,20 @@ set_target_properties(prompt_lookup_decoding_lm PROPERTIES CXX_STANDARD 17) set_target_properties(prompt_lookup_decoding_lm PROPERTIES CXX_STANDARD_REQUIRED ON) find_package(TBB REQUIRED COMPONENTS tbb) target_link_libraries(prompt_lookup_decoding_lm PRIVATE TBB::tbb) + +add_executable(chat_sample chat_sample.cpp) +target_link_libraries(chat_sample PRIVATE openvino::genai) +target_include_directories(chat_sample PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") +set_target_properties(chat_sample PROPERTIES CXX_STANDARD 17) +set_target_properties(chat_sample PROPERTIES CXX_STANDARD_REQUIRED ON) + +add_executable(multinomial_causal_lm multinomial_causal_lm.cpp) +target_link_libraries(multinomial_causal_lm PRIVATE openvino::genai) +target_include_directories(multinomial_causal_lm PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") +set_target_properties(multinomial_causal_lm PROPERTIES CXX_STANDARD 17) +set_target_properties(multinomial_causal_lm PROPERTIES CXX_STANDARD_REQUIRED ON) + +install(TARGETS greedy_causal_lm beam_search_causal_lm speculative_decoding_lm prompt_lookup_decoding_lm chat_sample multinomial_causal_lm + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp b/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp index 110ac4717..474537de1 100644 --- a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp +++ 
b/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp @@ -1,232 +1,36 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include -#include +#include namespace { - -enum SPECIAL_TOKEN { PAD_TOKEN = 2 }; - -std::string detokenize(ov::InferRequest& detokenizer, const std::vector& tokens) { - constexpr size_t BATCH_SIZE = 1; - ov::Tensor inp = detokenizer.get_input_tensor(); - inp.set_shape({BATCH_SIZE, tokens.size()}); - for (size_t idx = 0; idx < tokens.size(); ++idx) { - inp.data()[idx] = tokens.at(idx); - } - detokenizer.infer(); - return detokenizer.get_output_tensor().data()[0]; -} - -std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask) { - const size_t batch_size = input_ids.get_shape().at(0); - const size_t sequence_length = input_ids.get_shape().at(1); - int64_t* inputs_data = input_ids.data(); - int64_t* attention_mask_data = attention_mask.data(); - - for (size_t batch = 0; batch < batch_size; batch++) { - const size_t batch_offset = batch * sequence_length; - - // last token in the sequence is not a PAD_TOKEN, skipping - if (inputs_data[batch_offset + sequence_length - 1] != SPECIAL_TOKEN::PAD_TOKEN) { - continue; - } - - size_t pad_tokens_number = 0; - for (int i = sequence_length - 1; i >= 0; i--) { - const size_t token_offset = batch_offset + i; - - if (inputs_data[token_offset] == SPECIAL_TOKEN::PAD_TOKEN) { - continue; - } - - if (pad_tokens_number == 0) { - pad_tokens_number = sequence_length - i - 1; - } - - std::swap(inputs_data[token_offset], inputs_data[token_offset + pad_tokens_number]); - std::swap(attention_mask_data[token_offset], attention_mask_data[token_offset + pad_tokens_number]); - } - } - - return {input_ids, attention_mask}; + enum SPECIAL_TOKEN { PAD_TOKEN = 2 }; } -std::pair tokenize(ov::InferRequest& tokenizer, std::vector prompts) { - tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()}); - - tokenizer.infer(); - - pad_left(tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")); - - // fix mask filled with '2' instead of '0' - ov::Tensor attention_mask = tokenizer.get_tensor("attention_mask"); - int64_t* attention_mask_data = attention_mask.data(); - std::replace(attention_mask_data, attention_mask_data + attention_mask.get_size(), 2, 0); - - return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")}; -} - -void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask) { - const size_t batch_size = attention_mask.get_shape().at(0); - const size_t sequence_length = attention_mask.get_shape().at(1); - - const int64_t* attention_mask_data = attention_mask.data(); - int64_t* position_ids_data = position_ids.data(); - - for (size_t batch = 0; batch < batch_size; batch++) { - const size_t batch_offset = batch * sequence_length; - size_t sum = 0; - - for (size_t i = 0; i < sequence_length; i++) { - const size_t element_offset = batch_offset + i; - position_ids_data[element_offset] = sum; - if (attention_mask_data[element_offset] == 1) { - sum += 1; - } - } - } -} - -void initialize_inputs(const ov::Tensor& input_ids, const ov::Tensor& attention_mask, ov::InferRequest& request) { - request.set_tensor("input_ids", input_ids); - request.set_tensor("attention_mask", attention_mask); - - ov::Shape input_shape = input_ids.get_shape(); - - ov::Tensor position_ids = request.get_tensor("position_ids"); - position_ids.set_shape(input_shape); - initialize_position_ids(position_ids, attention_mask); - - 
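The position-id initialization removed from this sample (and re-added earlier in this patch as ov::genai::utils::initialize_position_ids) simply counts attended tokens per row, so with left padding the first real token still gets position 0. A standalone sketch of that rule, using plain vectors instead of ov::Tensor:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Each position id equals the number of attended (mask == 1) tokens seen
    // so far in the row; padded positions repeat the current count.
    std::vector<int64_t> position_ids_from_mask(const std::vector<int64_t>& mask) {
        std::vector<int64_t> position_ids(mask.size());
        int64_t sum = 0;
        for (size_t i = 0; i < mask.size(); ++i) {
            position_ids[i] = sum;
            if (mask[i] == 1)
                sum += 1;
        }
        return position_ids;
    }

    int main() {
        // Left-padded row: two PAD tokens followed by three real tokens.
        for (int64_t id : position_ids_from_mask({0, 0, 1, 1, 1}))
            std::cout << id << ' ';  // prints: 0 0 0 1 2
        std::cout << '\n';
        return 0;
    }
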
ov::Tensor beam_idx = request.get_tensor("beam_idx"); - beam_idx.set_shape({input_shape.at(0)}); - std::fill_n(beam_idx.data(), input_shape.at(0), 0); -} - -void set_attention_mask(ov::Tensor&& attention_mask, std::vector next_beams) { - ov::Tensor original_mask{ov::element::i64, attention_mask.get_shape()}; - ov::Shape original_shape = original_mask.get_shape(); - attention_mask.copy_to(original_mask); - - ov::Shape new_shape{next_beams.size(), original_mask.get_shape().at(1) + 1}; - attention_mask.set_shape(new_shape); - - for (size_t beam_id = 0; beam_id < next_beams.size(); beam_id++) { - const size_t original_prompt_offset = next_beams.at(beam_id) * original_shape.at(1); - const size_t result_prompt_offset = beam_id * new_shape.at(1); - - int64_t* dest = attention_mask.data() + result_prompt_offset; - const int64_t* src = original_mask.data() + original_prompt_offset; - - std::memcpy(dest, src, original_shape.at(1) * sizeof(int64_t)); - attention_mask.data()[result_prompt_offset + new_shape.at(1) - 1] = 1; - } -} - -void set_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention_mask) { - const size_t batch_size = attention_mask.get_shape().at(0); - const size_t sequence_length = attention_mask.get_shape().at(1); - position_ids.set_shape({batch_size, 1}); - - for (size_t batch = 0; batch < batch_size; batch++) { - int64_t* mask_start = attention_mask.data() + batch * sequence_length; - position_ids.data()[batch] = std::accumulate(mask_start, mask_start + sequence_length - 1, 0); - } -} - -std::vector prompts_arguments_to_vector(int argc, char* argv[]) { - std::vector prompts; - prompts.reserve(argc - 2); - for (size_t i = 2; i < argc; i++) { - prompts.push_back(std::string{argv[i]}); - } - return prompts; -} - -} // namespace - int main(int argc, char* argv[]) try { if (argc < 3) { throw std::runtime_error(std::string{"Usage: "} + argv[0] + " '' ['' ...]"); } - - // Compile models - ov::Core core; - core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt - // Read the tokenizer model information from the file to later get the runtime information - auto tokenizer_model = core.read_model(std::string{argv[1]} + "/openvino_tokenizer.xml"); - // tokenizer and detokenizer work on CPU only - ov::InferRequest tokenizer = core.compile_model(tokenizer_model, "CPU").create_infer_request(); - ov::InferRequest detokenizer = - core.compile_model(std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request(); - // The model can be compiled for GPU as well - ov::InferRequest lm = - core.compile_model(std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request(); - - auto [input_ids, attention_mask] = tokenize(tokenizer, prompts_arguments_to_vector(argc, argv)); - - // Initialize beam search - const int64_t* prompt_data = input_ids.data(); - std::vector> prompts; - prompts.reserve(input_ids.get_shape().at(0)); - for (size_t batch = 0; batch < input_ids.get_shape().at(0); batch++) { - size_t sequence_length = input_ids.get_shape().at(1); - size_t batch_offset = batch * sequence_length; - const int64_t* prompt_start = prompt_data + batch_offset; - prompts.push_back(std::vector{prompt_start, prompt_start + sequence_length}); - } - - // Get the runtime info from the tokenizer model that we read earlier - auto rt_info = tokenizer_model->get_rt_info(); // Get the runtime info for the model - int64_t SPECIAL_EOS_TOKEN; - - if (rt_info.count("eos_token_id") > 0) { // check if the runtime information has a valid EOS 
token ID - SPECIAL_EOS_TOKEN = rt_info["eos_token_id"].as(); - - } else { - throw std::runtime_error("EOS token ID not found in model's runtime information."); - } - - Parameters parameters{std::move(prompts), SPECIAL_EOS_TOKEN}; - GroupBeamSearcher group_beam_searcher{parameters}; - - initialize_inputs(input_ids, attention_mask, lm); - - std::vector next_tokens; - std::vector next_beams; - - for (size_t length_count = 0; length_count < parameters.max_new_tokens; ++length_count) { - lm.infer(); - - std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(lm.get_tensor("logits")); - if (next_tokens.empty()) { - break; - } - size_t batch_size = next_tokens.size(); - // Set pointers - lm.set_tensor("input_ids", ov::Tensor{ov::element::i64, {batch_size, 1}, next_tokens.data()}); - lm.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {batch_size}, next_beams.data()}); - // Set auxiliary inputs - set_attention_mask(lm.get_tensor("attention_mask"), next_beams); - set_position_ids(lm.get_tensor("position_ids"), lm.get_tensor("attention_mask")); - } - - for (const std::vector>& prompt_group : finalize(std::move(group_beam_searcher))) { - std::cout << "Prompt:\n"; - for (const std::vector group : prompt_group) { - std::cout << "Group:\n"; - for (const Beam& beam : group) { - std::cout << beam.score << ": " << detokenize(detokenizer, beam.tokens) << '\n'; - } - } - } - // Model is stateful which means that context (kv-cache) which belongs to a particular - // text sequence is accumulated inside the model during the generation loop above. - // This context should be reset before processing the next text sequence. - // While it is not required to reset context in this sample as only one batch of sequences is processed, - // it is called for education purposes: - lm.reset_state(); + auto prompts = std::vector(argv + 2, argv + argc); + + std::string model_path = argv[1]; + std::string device = "CPU"; // GPU can be used as well + + ov::genai::LLMPipeline pipe(model_path, device); + ov::genai::GenerationConfig config = pipe.get_generation_config(); + config.max_new_tokens = 20; + config.num_beam_groups = 3; + config.num_beams = 15; + config.num_return_sequences = config.num_beams * prompts.size(); + + // workaround until pad_token_id is not written into IR + pipe.get_tokenizer().set_pad_token_id(PAD_TOKEN); + + auto beams = pipe.generate(prompts, config); + for (int i = 0; i < beams.scores.size(); i++) + std::cout << beams.scores[i] << ": " << beams.texts[i] << '\n'; + + return 0; } catch (const std::exception& error) { std::cerr << error.what() << '\n'; return EXIT_FAILURE; diff --git a/text_generation/causal_lm/cpp/chat_sample.cpp b/text_generation/causal_lm/cpp/chat_sample.cpp new file mode 100644 index 000000000..3e215e520 --- /dev/null +++ b/text_generation/causal_lm/cpp/chat_sample.cpp @@ -0,0 +1,50 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "openvino/genai/llm_pipeline.hpp" + +using namespace std; + +std::vector questions = { + "1+1=", + "what was the previous answer?", + "Why is the sky blue?", + "4+10=", + "What is Intel OpenVINO?", + "Can you briefly summarize what I asked you about during this session?", +}; + +int main(int argc, char* argv[]) try { + std::string prompt; + std::string accumulated_str = ""; + + std::string model_path = argv[1]; + ov::genai::LLMPipeline pipe(model_path, "CPU"); + + ov::genai::GenerationConfig config = pipe.get_generation_config(); + config.max_new_tokens = 10000; + auto 
streamer = [](std::string word) { std::cout << word << std::flush; }; + + pipe.start_chat(); + for (size_t i = 0; i < questions.size(); i++) { + // std::getline(std::cin, prompt); + prompt = questions[i]; + + std::cout << "question:\n"; + cout << prompt << endl; + + // auto answer_str = pipe(prompt, config, streamer); + auto answer_str = pipe.generate(prompt, ov::genai::max_new_tokens(10000), ov::genai::streamer(streamer)); + accumulated_str += answer_str; + + cout << "\n----------\n"; + } + pipe.finish_chat(); +} catch (const std::exception& error) { + std::cerr << error.what() << '\n'; + return EXIT_FAILURE; +} catch (...) { + std::cerr << "Non-exception object thrown\n"; + return EXIT_FAILURE; +} diff --git a/text_generation/causal_lm/cpp/greedy_causal_lm.cpp b/text_generation/causal_lm/cpp/greedy_causal_lm.cpp index d75d32d0e..0fea9b36d 100644 --- a/text_generation/causal_lm/cpp/greedy_causal_lm.cpp +++ b/text_generation/causal_lm/cpp/greedy_causal_lm.cpp @@ -1,129 +1,27 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include - -namespace { -std::pair tokenize(ov::InferRequest& tokenizer, std::string&& prompt) { - constexpr size_t BATCH_SIZE = 1; - tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {BATCH_SIZE}, &prompt}); - tokenizer.infer(); - return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")}; -} - -std::string detokenize(ov::InferRequest& detokenizer, std::vector& tokens) { - constexpr size_t BATCH_SIZE = 1; - detokenizer.set_input_tensor(ov::Tensor{ov::element::i64, {BATCH_SIZE, tokens.size()}, tokens.data()}); - detokenizer.infer(); - return detokenizer.get_output_tensor().data()[0]; -} - -// The following reasons require TextStreamer to keep a cache of previous tokens: -// detokenizer removes starting ' '. 
For example detokenize(tokenize(" a")) == "a", -// but detokenize(tokenize("prefix a")) == "prefix a" -// 1 printable token may consist of 2 token ids: detokenize(incomplete_token_idx) == "�" -struct TextStreamer { - ov::InferRequest detokenizer; - std::vector token_cache; - size_t print_len = 0; - - void put(int64_t token) { - token_cache.push_back(token); - std::string text = detokenize(detokenizer, token_cache); - if (!text.empty() && '\n' == text.back()) { - // Flush the cache after the new line symbol - std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; - token_cache.clear(); - print_len = 0; - return; - } - if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { - // Don't print incomplete text - return; - } - std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; - print_len = text.size(); - } - - void end() { - std::string text = detokenize(detokenizer, token_cache); - std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; - token_cache.clear(); - print_len = 0; - } -}; -} +#include "openvino/genai/llm_pipeline.hpp" int main(int argc, char* argv[]) try { - if (argc != 3) { - throw std::runtime_error(std::string{"Usage: "} + argv[0] + " ''"); - } - // Compile models - ov::Core core; - core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt - //Read the tokenizer model information from the file to later get the runtime information - auto tokenizer_model = core.read_model(std::string{argv[1]} + "/openvino_tokenizer.xml"); - // tokenizer and detokenizer work on CPU only - ov::InferRequest tokenizer = core.compile_model( - tokenizer_model, "CPU").create_infer_request(); - auto [input_ids, attention_mask] = tokenize(tokenizer, argv[2]); - ov::InferRequest detokenizer = core.compile_model( - std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request(); - // The model can be compiled for GPU as well - ov::InferRequest lm = core.compile_model( - std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request(); - auto seq_len = input_ids.get_size(); - - // Initialize inputs - lm.set_tensor("input_ids", input_ids); - lm.set_tensor("attention_mask", attention_mask); - ov::Tensor position_ids = lm.get_tensor("position_ids"); - position_ids.set_shape(input_ids.get_shape()); - std::iota(position_ids.data(), position_ids.data() + seq_len, 0); - constexpr size_t BATCH_SIZE = 1; - // Input values are persistent between inference calls. 
- // That allows to set values, which aren't going to change, only once - lm.get_tensor("beam_idx").set_shape({BATCH_SIZE}); - lm.get_tensor("beam_idx").data()[0] = 0; - lm.infer(); - size_t vocab_size = lm.get_tensor("logits").get_shape().back(); - float* logits = lm.get_tensor("logits").data() + (seq_len - 1) * vocab_size; - int64_t out_token = std::max_element(logits, logits + vocab_size) - logits; - - lm.get_tensor("input_ids").set_shape({BATCH_SIZE, 1}); - position_ids.set_shape({BATCH_SIZE, 1}); - TextStreamer text_streamer{std::move(detokenizer)}; - - // Get the runtime info from the tokenizer model that we read earlier - auto rt_info = tokenizer_model->get_rt_info(); //Get the runtime info for the model - int64_t SPECIAL_EOS_TOKEN; + if (3 > argc || argc > 4) + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " \"\" "); - if (rt_info.count("eos_token_id") > 0) { //check if the runtime information has a valid EOS token ID - SPECIAL_EOS_TOKEN = rt_info["eos_token_id"].as(); - } else { - throw std::runtime_error("EOS token ID not found in model's runtime information."); - } - - int max_sequence_length = 100; - while (out_token != SPECIAL_EOS_TOKEN && seq_len < max_sequence_length) { - ++seq_len; - lm.get_tensor("input_ids").data()[0] = out_token; - lm.get_tensor("attention_mask").set_shape({BATCH_SIZE, seq_len}); - std::fill_n(lm.get_tensor("attention_mask").data(), seq_len, 1); - position_ids.data()[0] = int64_t(seq_len - 1); - lm.start_async(); - text_streamer.put(out_token); - lm.wait(); - logits = lm.get_tensor("logits").data(); - out_token = std::max_element(logits, logits + vocab_size) - logits; - } - text_streamer.end(); - // Model is stateful which means that context (kv-cache) which belongs to a particular - // text sequence is accumulated inside the model during the generation loop above. - // This context should be reset before processing the next text sequence. 
-    // While it is not required to reset context in this sample as only one sequence is processed,
-    // it is called for education purposes:
-    lm.reset_state();
+    std::string model_path = argv[1];
+    std::string prompt = argv[2];
+
+    // GPU can be used as well
+    std::string device = "CPU";
+    if (argc > 3) device = argv[3];
+
+    ov::genai::LLMPipeline pipe(model_path, device);
+    ov::genai::GenerationConfig config = pipe.get_generation_config();
+    config.max_new_tokens = 100;
+    config.do_sample = false;
+    auto streamer = [](std::string subword){std::cout << subword << std::flush;};
+
+    // since streamer is set results will be printed each time a new token is generated
+    pipe.generate(prompt, config, streamer);
 } catch (const std::exception& error) {
     std::cerr << error.what() << '\n';
     return EXIT_FAILURE;
diff --git a/text_generation/causal_lm/cpp/multinomial_causal_lm.cpp b/text_generation/causal_lm/cpp/multinomial_causal_lm.cpp
new file mode 100644
index 000000000..ffbfc6b2c
--- /dev/null
+++ b/text_generation/causal_lm/cpp/multinomial_causal_lm.cpp
@@ -0,0 +1,37 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "openvino/genai/llm_pipeline.hpp"
+
+int main(int argc, char* argv[]) try {
+    if (3 > argc || argc > 4)
+        throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> \"<PROMPT>\" <DEVICE>");
+
+    std::string model_path = argv[1];
+    std::string prompt = argv[2];
+
+    // GPU can be used as well
+    std::string device = "CPU";
+    if (argc > 3) {
+        device = argv[3];
+    }
+
+    ov::genai::LLMPipeline pipe(model_path, device);
+    ov::genai::GenerationConfig config = pipe.get_generation_config();
+    config.max_new_tokens = 100;
+    config.do_sample = true;
+    config.top_p = 0.9;
+    config.top_k = 30;
+    auto streamer = [](std::string subword) {
+        std::cout << subword << std::flush;
+    };
+
+    // since streamer is set results will be printed each time a new token is generated
+    pipe.generate(prompt, config, streamer);
+} catch (const std::exception& error) {
+    std::cerr << error.what() << '\n';
+    return EXIT_FAILURE;
+} catch (...) {
+    std::cerr << "Non-exception object thrown\n";
+    return EXIT_FAILURE;
+}
diff --git a/third-party-programs.txt b/third-party-programs.txt
new file mode 100644
index 000000000..e418d7b5e
--- /dev/null
+++ b/third-party-programs.txt
@@ -0,0 +1,417 @@
+OpenVINO GenAI Third Party Programs File
+
+This file contains the list of third party software ("third party programs")
+contained in the Intel software and their required notices and/or license
+terms. This third party software, even if included with the distribution of
+the Intel software, may be governed by separate license terms, including
+without limitation, third party license terms, other Intel software license
+terms, and open source software license terms. These separate license terms
+govern your use of the third party programs as set forth in the
+"third-party-programs.txt" or other similarly-named text file.
+
+Third party programs and their corresponding required notices and/or license
+terms are listed below.
+
+-------------------------------------------------------------
+
+Jinja2Cpp
+
+Mozilla Public License Version 2.0
+==================================
+
+1. Definitions
+--------------
+
+1.1. "Contributor"
+    means each individual or legal entity that creates, contributes to
+    the creation of, or owns Covered Software.
+
+1.2. 
"Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. "Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. 
Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. +Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. 
If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. 
If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. * +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. 
Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. + +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. + +------------------------------------------------------------- + +JSON for Modern C++ (https://github.com/nlohmann/json) + +MIT License + +Copyright (c) 2013-2022 Niels Lohmann + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers
index c75450346..b1cced808 160000
--- a/thirdparty/openvino_tokenizers
+++ b/thirdparty/openvino_tokenizers
@@ -1 +1 @@
-Subproject commit c754503462f569b648b598d57ff91ea57bb8deb1
+Subproject commit b1cced808312a8017a405811ede887364cdebd6e
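
Note on the API used by the new samples: both greedy_causal_lm.cpp and multinomial_causal_lm.cpp follow the same pattern introduced by this patch: construct an ov::genai::LLMPipeline from a model directory and a device string, adjust the ov::genai::GenerationConfig returned by get_generation_config(), and pass a streamer callback to generate(). The sketch below is a minimal illustration of that shared shape, not part of the patch; it assumes only the calls visible above (the pipeline constructor, the config fields max_new_tokens, do_sample, top_p, top_k, and the streamer overload of generate), and the run_sample helper is hypothetical.

// Hypothetical helper, not part of this patch: it runs one prompt through an
// ov::genai::LLMPipeline with a caller-supplied adjustment of the generation config.
#include <cstdlib>
#include <functional>
#include <iostream>
#include <string>

#include "openvino/genai/llm_pipeline.hpp"

void run_sample(const std::string& model_path,
                const std::string& prompt,
                const std::string& device,
                const std::function<void(ov::genai::GenerationConfig&)>& tweak) {
    ov::genai::LLMPipeline pipe(model_path, device);  // same constructor the samples use
    ov::genai::GenerationConfig config = pipe.get_generation_config();
    config.max_new_tokens = 100;                      // shared default in both samples
    tweak(config);                                    // greedy vs. multinomial differences go here
    auto streamer = [](std::string subword) { std::cout << subword << std::flush; };
    // With a streamer set, tokens are printed as soon as they are generated.
    pipe.generate(prompt, config, streamer);
}

int main(int argc, char* argv[]) {
    if (argc < 3) {
        return EXIT_FAILURE;
    }
    std::string device = argc > 3 ? argv[3] : "CPU";
    // Greedy decoding, as in greedy_causal_lm.cpp:
    run_sample(argv[1], argv[2], device, [](ov::genai::GenerationConfig& c) { c.do_sample = false; });
    // Multinomial sampling, as in multinomial_causal_lm.cpp:
    run_sample(argv[1], argv[2], device, [](ov::genai::GenerationConfig& c) {
        c.do_sample = true;
        c.top_p = 0.9;
        c.top_k = 30;
    });
    return EXIT_SUCCESS;
}

The two samples differ only in the config adjustment (deterministic greedy decoding versus top-p/top-k multinomial sampling), which is what the callback parameter captures here.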