From 2d1fa3b33fc3308f4cce9917829ad24346cc0901 Mon Sep 17 00:00:00 2001
From: Zlobin Vladimir
Date: Wed, 17 Jul 2024 15:51:54 +0400
Subject: [PATCH 1/9] Add Llama3 (#620)

Co-authored-by: Yaroslav Tarkan
---
 samples/cpp/beam_search_causal_lm/README.md     |  2 +-
 samples/cpp/chat_sample/README.md               |  2 +-
 samples/cpp/greedy_causal_lm/README.md          |  2 +-
 samples/cpp/multinomial_causal_lm/README.md     |  2 +-
 samples/cpp/prompt_lookup_decoding_lm/README.md |  2 +-
 samples/cpp/speculative_decoding_lm/README.md   |  2 +-
 samples/python/beam_search_causal_lm/README.md  |  2 +-
 samples/python/chat_sample/README.md            |  2 +-
 samples/python/greedy_causal_lm/README.md       |  2 +-
 samples/python/multinomial_causal_lm/README.md  |  2 +-
 src/docs/SUPPORTED_MODELS.md                    | 14 +++++++++++++-
 11 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/samples/cpp/beam_search_causal_lm/README.md b/samples/cpp/beam_search_causal_lm/README.md
index a10428891..82232c42f 100644
--- a/samples/cpp/beam_search_causal_lm/README.md
+++ b/samples/cpp/beam_search_causal_lm/README.md
@@ -1,4 +1,4 @@
-# Text generation C++ sample that supports most popular models like LLaMA 2
+# Text generation C++ sample that supports most popular models like LLaMA 3
 
 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. It's only possible to change the device for inference to a different one, GPU for example, from the command line interface. The sample features `ov::genai::LLMPipeline` and configures it to use multiple beam groups. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python.

diff --git a/samples/cpp/chat_sample/README.md b/samples/cpp/chat_sample/README.md
index 4baa8385e..8a24b2000 100644
--- a/samples/cpp/chat_sample/README.md
+++ b/samples/cpp/chat_sample/README.md
@@ -1,4 +1,4 @@
-# C++ chat_sample that supports most popular models like LLaMA 2
+# C++ chat_sample that supports most popular models like LLaMA 3
 
 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `ov::genai::LLMPipeline` and configures it for the chat scenario. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python.

diff --git a/samples/cpp/greedy_causal_lm/README.md b/samples/cpp/greedy_causal_lm/README.md
index 3c0758ee6..c0a7d5f3c 100644
--- a/samples/cpp/greedy_causal_lm/README.md
+++ b/samples/cpp/greedy_causal_lm/README.md
@@ -1,4 +1,4 @@
-# Text generation C++ greedy_causal_lm that supports most popular models like LLaMA 2
+# Text generation C++ greedy_causal_lm that supports most popular models like LLaMA 3
 
 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `ov::genai::LLMPipeline` and configures it to run the simplest deterministic greedy sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python.
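For orientation on the samples retitled above: "multiple beam groups" refers to the `LLMPipeline` generation configuration. The sketch below is not code from this patch; it shows the idea through the Python `openvino_genai` binding, with an illustrative model directory, and the exact `GenerationConfig` fields should be checked against the installed release.

```python
import openvino_genai

# Directory with a model already exported to OpenVINO IR; the name is illustrative.
pipe = openvino_genai.LLMPipeline("TinyLlama-1.1B-Chat-v1.0", "CPU")

config = openvino_genai.GenerationConfig()
config.max_new_tokens = 100
config.num_beams = 15           # total beams across all groups
config.num_beam_groups = 3      # beams are split into groups to diversify results
config.diversity_penalty = 1.0  # discourages groups from repeating each other's tokens
config.num_return_sequences = config.num_beams

print(pipe.generate("What is OpenVINO?", config))
```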
diff --git a/samples/cpp/multinomial_causal_lm/README.md b/samples/cpp/multinomial_causal_lm/README.md
index 731d03e3c..447857991 100644
--- a/samples/cpp/multinomial_causal_lm/README.md
+++ b/samples/cpp/multinomial_causal_lm/README.md
@@ -1,4 +1,4 @@
-# Text generation C++ multinomial_causal_lm that supports most popular models like LLaMA 2
+# Text generation C++ multinomial_causal_lm that supports most popular models like LLaMA 3
 
 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `ov::genai::LLMPipeline` and configures it to run the random sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python.

diff --git a/samples/cpp/prompt_lookup_decoding_lm/README.md b/samples/cpp/prompt_lookup_decoding_lm/README.md
index 980c0cd19..89a5e2c58 100644
--- a/samples/cpp/prompt_lookup_decoding_lm/README.md
+++ b/samples/cpp/prompt_lookup_decoding_lm/README.md
@@ -1,4 +1,4 @@
-# prompt_lookup_decoding_lm C++ sample that supports most popular models like LLaMA 2
+# prompt_lookup_decoding_lm C++ sample that supports most popular models like LLaMA 3
 
 [Prompt Lookup decoding](https://github.com/apoorvumang/prompt-lookup-decoding) is an [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) technique where the draft model is replaced with simple string matching against the prompt to generate candidate token sequences. This method is highly effective for input-grounded generation (summarization, document QA, multi-turn chat, code editing), where there is high n-gram overlap between the LLM input (prompt) and the LLM output. This could be entity names, phrases, or code chunks that the LLM directly copies from the input while generating the output. Prompt lookup exploits this pattern to speed up autoregressive decoding in LLMs. This results in significant speedups with no effect on output quality.

diff --git a/samples/cpp/speculative_decoding_lm/README.md b/samples/cpp/speculative_decoding_lm/README.md
index 7abcb6782..c86bd8b61 100644
--- a/samples/cpp/speculative_decoding_lm/README.md
+++ b/samples/cpp/speculative_decoding_lm/README.md
@@ -1,4 +1,4 @@
-# speculative_decoding_lm C++ sample that supports most popular models like LLaMA 2
+# speculative_decoding_lm C++ sample that supports most popular models like LLaMA 3
 
 Speculative decoding (or [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) in HF terminology) is a recent technique that speeds up token generation by running an additional, smaller draft model alongside the main model.
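To make the two techniques above concrete: speculative decoding lets a small draft model propose a few tokens that the main model then verifies in one batched pass, and prompt lookup is the same loop with the draft model replaced by n-gram matching against the prompt. The following is a toy sketch of the greedy accept/verify logic in plain Python with stub models; it is not code from these samples, which implement this in C++ on top of OpenVINO inference requests.

```python
from typing import Callable, List

def speculative_step(
    tokens: List[int],
    draft_next: Callable[[List[int]], int],  # greedy next token from the draft model
    main_next: Callable[[List[int]], int],   # greedy next token from the main model
    k: int = 4,
) -> List[int]:
    # 1. The draft model cheaply proposes k candidate tokens, one at a time.
    ctx = list(tokens)
    candidates: List[int] = []
    for _ in range(k):
        t = draft_next(ctx)
        candidates.append(t)
        ctx.append(t)

    # 2. The main model verifies the candidates. In a real implementation all
    #    k positions are scored in one batched forward pass; that batching,
    #    not this loop, is where the speedup comes from.
    ctx = list(tokens)
    accepted: List[int] = []
    for t in candidates:
        expected = main_next(ctx)
        if expected != t:
            accepted.append(expected)  # first mismatch: keep the main model's token
            break
        accepted.append(t)             # match: the draft token is accepted for free
        ctx.append(t)
    else:
        accepted.append(main_next(ctx))  # every candidate matched: one bonus token

    return tokens + accepted

# Tiny smoke test with stub "models" that count upwards until they disagree.
if __name__ == "__main__":
    draft = lambda ctx: ctx[-1] + 1
    main = lambda ctx: ctx[-1] + 1 if len(ctx) < 6 else 0
    print(speculative_step([1, 2, 3], draft, main))  # [1, 2, 3, 4, 5, 6, 0]
```

Because every kept token is one the main model would have produced anyway, the output is unchanged; only the number of expensive main-model passes drops.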
diff --git a/samples/python/beam_search_causal_lm/README.md b/samples/python/beam_search_causal_lm/README.md
index ff5286d01..5e80aa69d 100644
--- a/samples/python/beam_search_causal_lm/README.md
+++ b/samples/python/beam_search_causal_lm/README.md
@@ -1,4 +1,4 @@
-# Text generation Python sample that supports most popular models like LLaMA 2
+# Text generation Python sample that supports most popular models like LLaMA 3
 
 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. It's only possible to change the device for inference to a different one, GPU for example, from the command line interface. The sample features `openvino_genai.LLMPipeline` and configures it to use multiple beam groups. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python.

diff --git a/samples/python/chat_sample/README.md b/samples/python/chat_sample/README.md
index 34d71fab8..983789d0e 100644
--- a/samples/python/chat_sample/README.md
+++ b/samples/python/chat_sample/README.md
@@ -1,4 +1,4 @@
-# Python chat_sample that supports most popular models like LLaMA 2
+# Python chat_sample that supports most popular models like LLaMA 3
 
 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `openvino_genai.LLMPipeline` and configures it for the chat scenario. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python.

diff --git a/samples/python/greedy_causal_lm/README.md b/samples/python/greedy_causal_lm/README.md
index 7c87b04aa..97b044eb5 100644
--- a/samples/python/greedy_causal_lm/README.md
+++ b/samples/python/greedy_causal_lm/README.md
@@ -1,4 +1,4 @@
-# Text generation Python greedy_causal_lm that supports most popular models like LLaMA 2
+# Text generation Python greedy_causal_lm that supports most popular models like LLaMA 3
 
 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `openvino_genai.LLMPipeline` and configures it to run the simplest deterministic greedy sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python.
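The chat scenario mentioned in these Python READMEs keeps conversation state across turns. A minimal sketch of that usage, assuming the `openvino_genai` package and an illustrative model directory, might look like this:

```python
import openvino_genai

# Model directory is illustrative; any chat-tuned model exported to OpenVINO IR works.
pipe = openvino_genai.LLMPipeline("TinyLlama-1.1B-Chat-v1.0", "CPU")

# start_chat()/finish_chat() preserve the conversation state between turns,
# so each generate() call only needs to pass the new user message.
pipe.start_chat()
for prompt in ["What is OpenVINO?", "Does it support GPUs?"]:
    print(pipe.generate(prompt, max_new_tokens=100))
pipe.finish_chat()
```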
diff --git a/samples/python/multinomial_causal_lm/README.md b/samples/python/multinomial_causal_lm/README.md
index d76b93366..d39142f3d 100644
--- a/samples/python/multinomial_causal_lm/README.md
+++ b/samples/python/multinomial_causal_lm/README.md
@@ -1,4 +1,4 @@
-# Text generation Python multinomial_causal_lm that supports most popular models like LLaMA 2
+# Text generation Python multinomial_causal_lm that supports most popular models like LLaMA 3
 
 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `openvino_genai.LLMPipeline` and configures it to run the random sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python.

diff --git a/src/docs/SUPPORTED_MODELS.md b/src/docs/SUPPORTED_MODELS.md
index 0e6099db0..3eb2af17b 100644
--- a/src/docs/SUPPORTED_MODELS.md
+++ b/src/docs/SUPPORTED_MODELS.md
@@ -45,7 +45,19 @@
-      LlamaForCausalLM
+      LlamaForCausalLM
+      Llama 3
+
+
+
+
+
+
       Llama 2
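As a usage note for the Llama 3 row added above: the model is expected to flow through the same export-then-run path as the other `LlamaForCausalLM` models. A hedged sketch follows, with an illustrative model ID and output directory; the Hugging Face repository is gated, so Meta's license must be accepted first.

```python
# Hypothetical end-to-end flow for the newly listed Llama 3 entry. The export
# step is typically done once from the command line (paths are illustrative):
#
#   optimum-cli export openvino --model meta-llama/Meta-Llama-3-8B-Instruct llama3_ov
#
import openvino_genai

# Run the exported model with plain greedy decoding on CPU.
pipe = openvino_genai.LLMPipeline("llama3_ov", "CPU")
print(pipe.generate("Why is the sky blue?", max_new_tokens=120))
```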