Refactor openvino-tokenizers notebook (#2367)
Ticket: CVS-151219
yatarkan committed Sep 10, 2024
1 parent c657c43 commit e57fb8a
Showing 2 changed files with 114 additions and 62 deletions.
1 change: 1 addition & 0 deletions .ci/spellcheck/.pyspelling.wordlist.txt
@@ -182,6 +182,7 @@ DepthAnything
detections
detokenization
detokenizer
detokenizers
Dettmers
dev
detectron
175 changes: 113 additions & 62 deletions notebooks/openvino-tokenizers/openvino-tokenizers.ipynb
@@ -18,6 +18,7 @@
" - [Convert Tokenizer from HuggingFace Hub with CLI Tool](#Convert-Tokenizer-from_HuggingFace-Hub-with-CLI-Tool)\n",
" - [Convert Tokenizer from HuggingFace Hub with Python API](#Convert-Tokenizer-from-HuggingFace-Hub-with-Python-API)\n",
"- [Text Generation Pipeline with OpenVINO Tokenizers](#Text-Generation-Pipeline-with-OpenVINO-Tokenizers)\n",
"- [Text Generation Pipeline with OpenVINO GenAI and OpenVINO Tokenizers](#text-generation-pipeline-with-openvino-genai-and-openvino-tokenizers)\n",
"- [Merge Tokenizer into a Model](#Merge-Tokenizer-into-a-Model)\n",
"- [Conclusion](#Conclusion)\n",
"- [Links](#Links)\n",
@@ -65,13 +66,13 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "0c698c94-b852-4b06-b699-7d417fb55e10",
"metadata": {},
"outputs": [],
"source": [
"%pip install -Uq pip\n",
"%pip install --pre -Uq openvino-tokenizers[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly\n",
"%pip install -q -U \"openvino>=2024.3.0\" openvino-tokenizers[transformers] openvino-genai\n",
"%pip install \"numpy<2.0.0\" \"torch>=2.1\" --extra-index-url https://download.pytorch.org/whl/cpu"
]
},
@@ -155,7 +156,7 @@
" ]>,\n",
" <Model: 'detokenizer'\n",
" inputs[\n",
" <ConstOutput: names[Parameter_21] shape[?,?] type: i64>\n",
" <ConstOutput: names[Parameter_22] shape[?,?] type: i64>\n",
" ]\n",
" outputs[\n",
" <ConstOutput: names[string_output] shape[?] type: string>\n",
@@ -182,7 +183,9 @@
"id": "cd211b9f-37a9-4ae7-bc3e-4619643c08b8",
"metadata": {},
"source": [
"That way you get OpenVINO model objects. Use `save_model` function from OpenVINO to reuse converted tokenizers later:"
"That way you get OpenVINO model objects. Use `save_model` function from OpenVINO to reuse converted tokenizers later:\n",
"\n",
"> ⚠️ Import `openvino_tokenizers` will add all tokenizer-related operations to OpenVINO, after which you can work with saved tokenizers and detokenizers."
]
},
{
@@ -192,11 +195,14 @@
"metadata": {},
"outputs": [],
"source": [
"from openvino import save_model\n",
"import openvino as ov\n",
"\n",
"# This import is needed to add all tokenizer-related operations to OpenVINO\n",
"import openvino_tokenizers # noqa: F401\n",
"\n",
"\n",
"save_model(ov_tokenizer, tokenizer_dir / \"openvino_tokenizer.xml\")\n",
"save_model(ov_detokenizer, tokenizer_dir / \"openvino_detokenizer.xml\")"
"ov.save_model(ov_tokenizer, tokenizer_dir / \"openvino_tokenizer.xml\")\n",
"ov.save_model(ov_detokenizer, tokenizer_dir / \"openvino_detokenizer.xml\")"
]
},
{
@@ -219,15 +225,12 @@
"text": [
"Token ids: [[ 1 4321]\n",
" [ 1 6031]]\n",
"Detokenized text: ['<s> Test' '<s> strings']\n"
"Detokenized text: ['Test' 'strings']\n"
]
}
],
"source": [
"from openvino import compile_model\n",
"\n",
"\n",
"tokenizer, detokenizer = compile_model(ov_tokenizer), compile_model(ov_detokenizer)\n",
"tokenizer, detokenizer = ov.compile_model(ov_tokenizer), ov.compile_model(ov_detokenizer)\n",
"test_strings = [\"Test\", \"strings\"]\n",
"\n",
"token_ids = tokenizer(test_strings)[\"input_ids\"]\n",
@@ -255,23 +258,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Token ids: [[1, 4321], [1, 6031]]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-04-02 18:45:50.238827: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
"2024-04-02 18:45:50.275055: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"2024-04-02 18:45:50.909410: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Token ids: [[1, 4321], [1, 6031]]\n",
"Detokenized text: ['<s> Test', '<s> strings']\n"
]
}
@@ -307,11 +294,11 @@
"model_dir = Path(Path(model_id).name)\n",
"\n",
"if not model_dir.exists():\n",
" # converting the original model\n",
" # Converting the original model\n",
" # %pip install -U \"git+https://github.com/huggingface/optimum-intel.git\" \"nncf>=2.8.0\" onnx\n",
" # %optimum-cli export openvino -m $model_id --task text-generation-with-past $model_dir\n",
"\n",
" # load already converted model\n",
" # Load already converted model\n",
" from huggingface_hub import hf_hub_download\n",
"\n",
" hf_hub_download(\n",
@@ -336,15 +323,12 @@
"import numpy as np\n",
"from tqdm.notebook import trange\n",
"from pathlib import Path\n",
"from openvino_tokenizers import add_greedy_decoding\n",
"from openvino_tokenizers.constants import EOS_TOKEN_ID_NAME\n",
"from openvino import Core\n",
"\n",
"\n",
"core = Core()\n",
"core = ov.Core()\n",
"\n",
"# add the greedy decoding subgraph on top of LLM to get the most probable token as an output\n",
"ov_model = add_greedy_decoding(core.read_model(model_dir / \"openvino_model.xml\"))\n",
"ov_model = core.read_model(model_dir / \"openvino_model.xml\")\n",
"compiled_model = core.compile_model(ov_model)\n",
"infer_request = compiled_model.create_infer_request()"
]
@@ -354,7 +338,7 @@
"id": "f6f53a35-446d-4a07-9e67-5794a53b12ba",
"metadata": {},
"source": [
"The `infer_request` object provides control over the model's state - a Key-Value cache that speeds up inference by reducing computations Multiple inference requests can be created, and each request maintains a distinct and separate state.."
"The `infer_request` object provides control over the model's state - a Key-Value cache that speeds up inference by reducing computations. Multiple inference requests can be created, and each request maintains a distinct and separate state."
]
},
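The per-request state isolation described in this cell can be illustrated with a minimal sketch (it reuses the `compiled_model` object created in the previous cell; the request names are hypothetical):

```python
# Each InferRequest owns its own KV cache, so separate requests can run
# independent generation sessions over the same compiled model.
request_a = compiled_model.create_infer_request()
request_b = compiled_model.create_infer_request()

# Resetting one request's state does not affect the other.
request_a.reset_state()
```
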
{
@@ -366,12 +350,12 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "af5dd23fe83c4fed8ce6b7b0a8ed41e9",
"model_id": "9135a4ff0a7141949c4990d803862bc0",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/10 [00:00<?, ?it/s]"
" 0%| | 0/5 [00:00<?, ?it/s]"
]
},
"metadata": {},
@@ -384,12 +368,7 @@
"Prompt:\n",
"Quick brown fox jumped\n",
"Generated:\n",
"over the fence.\n",
"\n",
"\n",
"\n",
"\n",
"\n"
"over the fence.\n"
]
}
],
@@ -401,29 +380,35 @@
"if \"position_ids\" in (input.any_name for input in infer_request.model_inputs):\n",
" model_input[\"position_ids\"] = np.arange(model_input[\"input_ids\"].shape[1], dtype=np.int64)[np.newaxis, :]\n",
"\n",
"# no beam search, set idx to 0\n",
"# No beam search, set idx to 0\n",
"model_input[\"beam_idx\"] = np.array([0], dtype=np.int32)\n",
"# end of sentence token is that model signifies the end of text generation\n",
"# read EOS token ID from rt_info of tokenizer/detokenizer ov.Model object\n",
"\n",
"# End of sentence token is that model signifies the end of text generation\n",
"# Read EOS token ID from rt_info of tokenizer/detokenizer ov.Model object\n",
"eos_token = ov_tokenizer.get_rt_info(EOS_TOKEN_ID_NAME).value\n",
"\n",
"tokens_result = np.array([[]], dtype=np.int64)\n",
"\n",
"# reset KV cache inside the model before inference\n",
"# Reset KV cache inside the model before inference\n",
"infer_request.reset_state()\n",
"max_infer = 10\n",
"max_infer = 5\n",
"\n",
"for _ in trange(max_infer):\n",
" infer_request.start_async(model_input)\n",
" infer_request.wait()\n",
"\n",
" # get a prediction for the last token on the first inference\n",
" output_token = infer_request.get_output_tensor().data[:, -1:]\n",
" output_tensor = infer_request.get_output_tensor()\n",
"\n",
" # Get the most probable token\n",
" token_indices = np.argmax(output_tensor.data, axis=-1)\n",
" output_token = token_indices[:, -1:]\n",
"\n",
" # Concatenate previous tokens result with newly generated token\n",
" tokens_result = np.hstack((tokens_result, output_token))\n",
" if output_token[0, 0] == eos_token:\n",
" break\n",
"\n",
" # prepare input for new inference\n",
" # Prepare input for the next inference iteration\n",
" model_input[\"input_ids\"] = output_token\n",
" model_input[\"attention_mask\"] = np.hstack((model_input[\"attention_mask\"].data, [[1]]))\n",
" model_input[\"position_ids\"] = np.hstack(\n",
@@ -433,11 +418,61 @@
" )\n",
" )\n",
"\n",
"\n",
"text_result = detokenizer(tokens_result)[\"string_output\"]\n",
"print(f\"Prompt:\\n{text_input[0]}\")\n",
"print(f\"Generated:\\n{text_result[0]}\")"
]
},
{
"cell_type": "markdown",
"id": "691e7415",
"metadata": {},
"source": [
"## Text Generation Pipeline with OpenVINO GenAI and OpenVINO Tokenizers\n",
"[back to top ⬆️](#Table-of-contents:)\n",
"\n",
"[OpenVINO GenAI](https://github.com/openvinotoolkit/openvino.genai) is a flavor of OpenVINO, aiming to simplify running inference of generative AI models. It hides the complexity of the generation process and minimizes the amount of code required.\n",
"OpenVINO GenAI depends on [OpenVINO](https://github.com/openvinotoolkit/openvino) and [OpenVINO Tokenizers](https://github.com/openvinotoolkit/openvino_tokenizers).\n",
"\n",
"Firstly we need to create a pipeline with `LLMPipeline`. `LLMPipeline` is the main object used for text generation using LLM in OpenVINO GenAI API. You can construct it straight away from the folder where both converted model and tokenizer are located, e.g. `ov_genai.LLMPipeline(model_and_tokenizer_path)`. \n",
"\n",
"As the model and tokenizer are located in different directories, we create a `ov_genai.Tokenizer` object by providing the path to saved tokenizer. Then we will provide directory with model, tokenizer object and device for `LLMPipeline`. Lastly we run `generate` method and get the output in text format.\n",
"\n",
"Additionally, we can configure parameters for decoding. We can get the default config with `get_generation_config()`, setup parameters, and apply the updated version with `set_generation_config(config)` or put config directly to `generate()`. It's also possible to specify the needed options just as inputs in the `generate()` method, as shown below, e.g. we can add `max_new_tokens` to stop generation if a specified number of tokens is generated and the end of generation is not reached.\n",
"\n",
"Let's build the same text generation pipeline, but with simplified Python [OpenVINO Generate API](https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md). We will use the same model and tokenizer downloaded in previous steps."
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "ce5fb0bf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Prompt:\n",
"Quick brown fox jumped\n",
"Generated:\n",
"over the lazy dog.\n"
]
}
],
"source": [
"import openvino_genai as ov_genai\n",
"\n",
"genai_tokenizer = ov_genai.Tokenizer(str(tokenizer_dir))\n",
"pipe = ov_genai.LLMPipeline(str(model_dir), genai_tokenizer, \"CPU\")\n",
"\n",
"result = pipe.generate(text_input[0], max_new_tokens=max_infer)\n",
"\n",
"print(f\"Prompt:\\n{text_input[0]}\")\n",
"print(f\"Generated:\\n{result}\")"
]
},
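The generation-config flow described above could look like the following sketch (a hedged example reusing the `pipe` object from this cell; the parameter values are illustrative assumptions):

```python
# Fetch the default generation config, adjust it, and apply it back.
config = pipe.get_generation_config()
config.max_new_tokens = 32   # stop after 32 generated tokens
config.do_sample = True      # switch from greedy decoding to sampling
config.temperature = 0.7     # soften the next-token distribution
pipe.set_generation_config(config)

# Subsequent generate() calls use the updated defaults;
# keyword arguments can still override them per call.
print(pipe.generate("Quick brown fox jumped"))
```
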
{
"cell_type": "markdown",
"id": "7beb6ec6-2484-44ce-b61b-c7ae605dffee",
@@ -450,12 +485,13 @@
"\n",
"<center><img src=\"https://github.com/openvinotoolkit/openvino_notebooks/assets/51917466/d4ece285-e445-4b76-a1ab-356427900860\"></center>\n",
"\n",
"The OpenVINO Python API allows you to avoid this by using the `share_inputs` option during inference, but it requires additional input from a developer every time the model is inferred. Combining the models and tokenizers simplifies memory management."
"The OpenVINO Python API allows you to avoid this by using the `share_inputs` option during inference, but it requires additional input from a developer every time the model is inferred. Combining the models and tokenizers simplifies memory management.\n",
"Moreover, after the combining models inputs have changed - original model has three inputs (`input_ids`, `attention_mask`, `token_type_ids`) and combined model has only one input for text input prompt."
]
},
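A minimal sketch of the `share_inputs` option mentioned above (it reuses the `core` and `model_dir` objects defined earlier in the notebook; the input names and shapes assume the BERT-like classifier used in this section):

```python
import numpy as np

# Compile the original (not combined) model and create an inference request.
compiled = core.compile_model(core.read_model(model_dir / "openvino_model.xml"))
request = compiled.create_infer_request()

inputs = {
    "input_ids": np.array([[101, 102]], dtype=np.int64),
    "attention_mask": np.array([[1, 1]], dtype=np.int64),
    "token_type_ids": np.array([[0, 0]], dtype=np.int64),
}

# share_inputs lets OpenVINO read the caller's tensors directly instead of
# copying them into the request; the tensors must stay unchanged while
# inference runs.
request.infer(inputs, share_inputs=True)
```
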
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 45,
"id": "c044b56b-dae0-4fdb-97df-2aa555285f35",
"metadata": {},
"outputs": [],
@@ -470,35 +506,50 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 69,
"id": "c6a42d4c-1982-41b9-9612-aa19138518ac",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Logits: [[ 1.2007061 -1.4698029]]\n"
"Original OpenVINO model inputs:\n",
"<Output: names[input_ids] shape[?,?] type: i64>\n",
"<Output: names[attention_mask] shape[?,?] type: i64>\n",
"<Output: names[token_type_ids] shape[?,?] type: i64>\n",
"\n",
"Combined OpenVINO model inputs:\n",
"<Output: names[Parameter_4430] shape[?] type: string>\n",
"\n",
"Logits: [[ 1.2007061 -1.469803 ]]\n"
]
}
],
"source": [
"from openvino import Core, save_model\n",
"from openvino_tokenizers import connect_models\n",
"\n",
"\n",
"core = Core()\n",
"core = ov.Core()\n",
"text_input = [\"Free money!!!\"]\n",
"\n",
"ov_tokenizer = core.read_model(model_dir / \"openvino_tokenizer.xml\")\n",
"ov_model = core.read_model(model_dir / \"openvino_model.xml\")\n",
"combined_model = connect_models(ov_tokenizer, ov_model)\n",
"save_model(combined_model, model_dir / \"combined_openvino_model.xml\")\n",
"ov.save_model(combined_model, model_dir / \"combined_openvino_model.xml\")\n",
"\n",
"print(\"Original OpenVINO model inputs:\")\n",
"for input in ov_model.inputs:\n",
" print(input)\n",
"\n",
"print(\"\\nCombined OpenVINO model inputs:\")\n",
"for input in combined_model.inputs:\n",
" print(input)\n",
"\n",
"compiled_combined_model = core.compile_model(combined_model)\n",
"openvino_output = compiled_combined_model(text_input)\n",
"\n",
"print(f\"Logits: {openvino_output['logits']}\")"
"print(f\"\\nLogits: {openvino_output['logits']}\")"
]
},
{
@@ -538,7 +589,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.9.19"
},
"openvino_notebooks": {
"imageUrl": "https://github.com/openvinotoolkit/openvino_notebooks/assets/51917466/047f9167-a4ef-4d3d-a33b-d124541f9e2c",
