Refactor openvino-tokenizers notebook (#2367)
Ticket: CVS-151219
yatarkan committed Sep 10, 2024
1 parent c657c43 commit e57fb8a
Showing 2 changed files with 114 additions and 62 deletions.
1 change: 1 addition & 0 deletions .ci/spellcheck/.pyspelling.wordlist.txt
@@ -182,6 +182,7 @@ DepthAnything
detections
detokenization
detokenizer
detokenizers
Dettmers
dev
detectron
175 changes: 113 additions & 62 deletions notebooks/openvino-tokenizers/openvino-tokenizers.ipynb
@@ -18,6 +18,7 @@
" - [Convert Tokenizer from HuggingFace Hub with CLI Tool](#Convert-Tokenizer-from_HuggingFace-Hub-with-CLI-Tool)\n",
" - [Convert Tokenizer from HuggingFace Hub with Python API](#Convert-Tokenizer-from-HuggingFace-Hub-with-Python-API)\n",
"- [Text Generation Pipeline with OpenVINO Tokenizers](#Text-Generation-Pipeline-with-OpenVINO-Tokenizers)\n",
"- [Text Generation Pipeline with OpenVINO GenAI and OpenVINO Tokenizers](#text-generation-pipeline-with-openvino-genai-and-openvino-tokenizers)\n",
"- [Merge Tokenizer into a Model](#Merge-Tokenizer-into-a-Model)\n",
"- [Conclusion](#Conclusion)\n",
"- [Links](#Links)\n",
@@ -65,13 +66,13 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "0c698c94-b852-4b06-b699-7d417fb55e10",
"metadata": {},
"outputs": [],
"source": [
"%pip install -Uq pip\n",
"%pip install --pre -Uq openvino-tokenizers[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly\n",
"%pip install -q -U \"openvino>=2024.3.0\" openvino-tokenizers[transformers] openvino-genai\n",
"%pip install \"numpy<2.0.0\" \"torch>=2.1\" --extra-index-url https://download.pytorch.org/whl/cpu"
]
},
@@ -155,7 +156,7 @@
" ]>,\n",
" <Model: 'detokenizer'\n",
" inputs[\n",
" <ConstOutput: names[Parameter_21] shape[?,?] type: i64>\n",
" <ConstOutput: names[Parameter_22] shape[?,?] type: i64>\n",
" ]\n",
" outputs[\n",
" <ConstOutput: names[string_output] shape[?] type: string>\n",
@@ -182,7 +183,9 @@
"id": "cd211b9f-37a9-4ae7-bc3e-4619643c08b8",
"metadata": {},
"source": [
"That way you get OpenVINO model objects. Use `save_model` function from OpenVINO to reuse converted tokenizers later:"
"That way you get OpenVINO model objects. Use `save_model` function from OpenVINO to reuse converted tokenizers later:\n",
"\n",
"> ⚠️ Import `openvino_tokenizers` will add all tokenizer-related operations to OpenVINO, after which you can work with saved tokenizers and detokenizers."
]
},
{
@@ -192,11 +195,14 @@
"metadata": {},
"outputs": [],
"source": [
"from openvino import save_model\n",
"import openvino as ov\n",
"\n",
"# This import is needed to add all tokenizer-related operations to OpenVINO\n",
"import openvino_tokenizers # noqa: F401\n",
"\n",
"\n",
"save_model(ov_tokenizer, tokenizer_dir / \"openvino_tokenizer.xml\")\n",
"save_model(ov_detokenizer, tokenizer_dir / \"openvino_detokenizer.xml\")"
"ov.save_model(ov_tokenizer, tokenizer_dir / \"openvino_tokenizer.xml\")\n",
"ov.save_model(ov_detokenizer, tokenizer_dir / \"openvino_detokenizer.xml\")"
]
},
{
@@ -219,15 +225,12 @@
"text": [
"Token ids: [[ 1 4321]\n",
" [ 1 6031]]\n",
"Detokenized text: ['<s> Test' '<s> strings']\n"
"Detokenized text: ['Test' 'strings']\n"
]
}
],
"source": [
"from openvino import compile_model\n",
"\n",
"\n",
"tokenizer, detokenizer = compile_model(ov_tokenizer), compile_model(ov_detokenizer)\n",
"tokenizer, detokenizer = ov.compile_model(ov_tokenizer), ov.compile_model(ov_detokenizer)\n",
"test_strings = [\"Test\", \"strings\"]\n",
"\n",
"token_ids = tokenizer(test_strings)[\"input_ids\"]\n",
@@ -255,23 +258,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Token ids: [[1, 4321], [1, 6031]]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-04-02 18:45:50.238827: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
"2024-04-02 18:45:50.275055: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"2024-04-02 18:45:50.909410: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Token ids: [[1, 4321], [1, 6031]]\n",
"Detokenized text: ['<s> Test', '<s> strings']\n"
]
}
@@ -307,11 +294,11 @@
"model_dir = Path(Path(model_id).name)\n",
"\n",
"if not model_dir.exists():\n",
" # converting the original model\n",
" # Converting the original model\n",
" # %pip install -U \"git+https://github.com/huggingface/optimum-intel.git\" \"nncf>=2.8.0\" onnx\n",
" # %optimum-cli export openvino -m $model_id --task text-generation-with-past $model_dir\n",
"\n",
" # load already converted model\n",
" # Load already converted model\n",
" from huggingface_hub import hf_hub_download\n",
"\n",
" hf_hub_download(\n",
@@ -336,15 +323,12 @@
"import numpy as np\n",
"from tqdm.notebook import trange\n",
"from pathlib import Path\n",
"from openvino_tokenizers import add_greedy_decoding\n",
"from openvino_tokenizers.constants import EOS_TOKEN_ID_NAME\n",
"from openvino import Core\n",
"\n",
"\n",
"core = Core()\n",
"core = ov.Core()\n",
"\n",
"# add the greedy decoding subgraph on top of LLM to get the most probable token as an output\n",
"ov_model = add_greedy_decoding(core.read_model(model_dir / \"openvino_model.xml\"))\n",
"ov_model = core.read_model(model_dir / \"openvino_model.xml\")\n",
"compiled_model = core.compile_model(ov_model)\n",
"infer_request = compiled_model.create_infer_request()"
]
@@ -354,7 +338,7 @@
"id": "f6f53a35-446d-4a07-9e67-5794a53b12ba",
"metadata": {},
"source": [
"The `infer_request` object provides control over the model's state - a Key-Value cache that speeds up inference by reducing computations Multiple inference requests can be created, and each request maintains a distinct and separate state.."
"The `infer_request` object provides control over the model's state - a Key-Value cache that speeds up inference by reducing computations. Multiple inference requests can be created, and each request maintains a distinct and separate state."
]
},
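The per-request state isolation described in this cell can be illustrated with a minimal sketch (it reuses the `compiled_model` object created in the previous cell; the request names are hypothetical):

```python
# Each InferRequest owns its own KV cache, so separate requests can run
# independent generation sessions over the same compiled model.
request_a = compiled_model.create_infer_request()
request_b = compiled_model.create_infer_request()

# Resetting one request's state does not affect the other.
request_a.reset_state()
```
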
{
@@ -366,12 +350,12 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "af5dd23fe83c4fed8ce6b7b0a8ed41e9",
"model_id": "9135a4ff0a7141949c4990d803862bc0",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/10 [00:00<?, ?it/s]"
" 0%| | 0/5 [00:00<?, ?it/s]"
]
},
"metadata": {},
@@ -384,12 +368,7 @@
"Prompt:\n",
"Quick brown fox jumped\n",
"Generated:\n",
"over the fence.\n",
"\n",
"\n",
"\n",
"\n",
"\n"
"over the fence.\n"
]
}
],
@@ -401,29 +380,35 @@
"if \"position_ids\" in (input.any_name for input in infer_request.model_inputs):\n",
" model_input[\"position_ids\"] = np.arange(model_input[\"input_ids\"].shape[1], dtype=np.int64)[np.newaxis, :]\n",
"\n",
"# no beam search, set idx to 0\n",
"# No beam search, set idx to 0\n",
"model_input[\"beam_idx\"] = np.array([0], dtype=np.int32)\n",
"# end of sentence token is that model signifies the end of text generation\n",
"# read EOS token ID from rt_info of tokenizer/detokenizer ov.Model object\n",
"\n",
"# End of sentence token is that model signifies the end of text generation\n",
"# Read EOS token ID from rt_info of tokenizer/detokenizer ov.Model object\n",
"eos_token = ov_tokenizer.get_rt_info(EOS_TOKEN_ID_NAME).value\n",
"\n",
"tokens_result = np.array([[]], dtype=np.int64)\n",
"\n",
"# reset KV cache inside the model before inference\n",
"# Reset KV cache inside the model before inference\n",
"infer_request.reset_state()\n",
"max_infer = 10\n",
"max_infer = 5\n",
"\n",
"for _ in trange(max_infer):\n",
" infer_request.start_async(model_input)\n",
" infer_request.wait()\n",
"\n",
" # get a prediction for the last token on the first inference\n",
" output_token = infer_request.get_output_tensor().data[:, -1:]\n",
" output_tensor = infer_request.get_output_tensor()\n",
"\n",
" # Get the most probable token\n",
" token_indices = np.argmax(output_tensor.data, axis=-1)\n",
" output_token = token_indices[:, -1:]\n",
"\n",
" # Concatenate previous tokens result with newly generated token\n",
" tokens_result = np.hstack((tokens_result, output_token))\n",
" if output_token[0, 0] == eos_token:\n",
" break\n",
"\n",
" # prepare input for new inference\n",
" # Prepare input for the next inference iteration\n",
" model_input[\"input_ids\"] = output_token\n",
" model_input[\"attention_mask\"] = np.hstack((model_input[\"attention_mask\"].data, [[1]]))\n",
" model_input[\"position_ids\"] = np.hstack(\n",
@@ -433,11 +418,61 @@
" )\n",
" )\n",
"\n",
"\n",
"text_result = detokenizer(tokens_result)[\"string_output\"]\n",
"print(f\"Prompt:\\n{text_input[0]}\")\n",
"print(f\"Generated:\\n{text_result[0]}\")"
]
},
{
"cell_type": "markdown",
"id": "691e7415",
"metadata": {},
"source": [
"## Text Generation Pipeline with OpenVINO GenAI and OpenVINO Tokenizers\n",
"[back to top ⬆️](#Table-of-contents:)\n",
"\n",
"[OpenVINO GenAI](https://github.com/openvinotoolkit/openvino.genai) is a flavor of OpenVINO, aiming to simplify running inference of generative AI models. It hides the complexity of the generation process and minimizes the amount of code required.\n",
"OpenVINO GenAI depends on [OpenVINO](https://github.com/openvinotoolkit/openvino) and [OpenVINO Tokenizers](https://github.com/openvinotoolkit/openvino_tokenizers).\n",
"\n",
"Firstly we need to create a pipeline with `LLMPipeline`. `LLMPipeline` is the main object used for text generation using LLM in OpenVINO GenAI API. You can construct it straight away from the folder where both converted model and tokenizer are located, e.g. `ov_genai.LLMPipeline(model_and_tokenizer_path)`. \n",
"\n",
"As the model and tokenizer are located in different directories, we create a `ov_genai.Tokenizer` object by providing the path to saved tokenizer. Then we will provide directory with model, tokenizer object and device for `LLMPipeline`. Lastly we run `generate` method and get the output in text format.\n",
"\n",
"Additionally, we can configure parameters for decoding. We can get the default config with `get_generation_config()`, setup parameters, and apply the updated version with `set_generation_config(config)` or put config directly to `generate()`. It's also possible to specify the needed options just as inputs in the `generate()` method, as shown below, e.g. we can add `max_new_tokens` to stop generation if a specified number of tokens is generated and the end of generation is not reached.\n",
"\n",
"Let's build the same text generation pipeline, but with simplified Python [OpenVINO Generate API](https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md). We will use the same model and tokenizer downloaded in previous steps."
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "ce5fb0bf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Prompt:\n",
"Quick brown fox jumped\n",
"Generated:\n",
"over the lazy dog.\n"
]
}
],
"source": [
"import openvino_genai as ov_genai\n",
"\n",
"genai_tokenizer = ov_genai.Tokenizer(str(tokenizer_dir))\n",
"pipe = ov_genai.LLMPipeline(str(model_dir), genai_tokenizer, \"CPU\")\n",
"\n",
"result = pipe.generate(text_input[0], max_new_tokens=max_infer)\n",
"\n",
"print(f\"Prompt:\\n{text_input[0]}\")\n",
"print(f\"Generated:\\n{result}\")"
]
},
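The generation-config flow described above could look like the following sketch (a hedged example reusing the `pipe` object from this cell; the parameter values are illustrative assumptions):

```python
# Fetch the default generation config, adjust it, and apply it back.
config = pipe.get_generation_config()
config.max_new_tokens = 32   # stop after 32 generated tokens
config.do_sample = True      # switch from greedy decoding to sampling
config.temperature = 0.7     # soften the next-token distribution
pipe.set_generation_config(config)

# Subsequent generate() calls use the updated defaults;
# keyword arguments can still override them per call.
print(pipe.generate("Quick brown fox jumped"))
```
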
{
"cell_type": "markdown",
"id": "7beb6ec6-2484-44ce-b61b-c7ae605dffee",
@@ -450,12 +485,13 @@
"\n",
"<center><img src=\"https://github.com/openvinotoolkit/openvino_notebooks/assets/51917466/d4ece285-e445-4b76-a1ab-356427900860\"></center>\n",
"\n",
"The OpenVINO Python API allows you to avoid this by using the `share_inputs` option during inference, but it requires additional input from a developer every time the model is inferred. Combining the models and tokenizers simplifies memory management."
"The OpenVINO Python API allows you to avoid this by using the `share_inputs` option during inference, but it requires additional input from a developer every time the model is inferred. Combining the models and tokenizers simplifies memory management.\n",
"Moreover, after the combining models inputs have changed - original model has three inputs (`input_ids`, `attention_mask`, `token_type_ids`) and combined model has only one input for text input prompt."
]
},
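A minimal sketch of the `share_inputs` option mentioned above (it reuses the `core` and `model_dir` objects defined earlier in the notebook; the input names and shapes assume the BERT-like classifier used in this section):

```python
import numpy as np

# Compile the original (not combined) model and create an inference request.
compiled = core.compile_model(core.read_model(model_dir / "openvino_model.xml"))
request = compiled.create_infer_request()

inputs = {
    "input_ids": np.array([[101, 102]], dtype=np.int64),
    "attention_mask": np.array([[1, 1]], dtype=np.int64),
    "token_type_ids": np.array([[0, 0]], dtype=np.int64),
}

# share_inputs lets OpenVINO read the caller's tensors directly instead of
# copying them into the request; the tensors must stay unchanged while
# inference runs.
request.infer(inputs, share_inputs=True)
```
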
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 45,
"id": "c044b56b-dae0-4fdb-97df-2aa555285f35",
"metadata": {},
"outputs": [],
@@ -470,35 +506,50 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 69,
"id": "c6a42d4c-1982-41b9-9612-aa19138518ac",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Logits: [[ 1.2007061 -1.4698029]]\n"
"Original OpenVINO model inputs:\n",
"<Output: names[input_ids] shape[?,?] type: i64>\n",
"<Output: names[attention_mask] shape[?,?] type: i64>\n",
"<Output: names[token_type_ids] shape[?,?] type: i64>\n",
"\n",
"Combined OpenVINO model inputs:\n",
"<Output: names[Parameter_4430] shape[?] type: string>\n",
"\n",
"Logits: [[ 1.2007061 -1.469803 ]]\n"
]
}
],
"source": [
"from openvino import Core, save_model\n",
"from openvino_tokenizers import connect_models\n",
"\n",
"\n",
"core = Core()\n",
"core = ov.Core()\n",
"text_input = [\"Free money!!!\"]\n",
"\n",
"ov_tokenizer = core.read_model(model_dir / \"openvino_tokenizer.xml\")\n",
"ov_model = core.read_model(model_dir / \"openvino_model.xml\")\n",
"combined_model = connect_models(ov_tokenizer, ov_model)\n",
"save_model(combined_model, model_dir / \"combined_openvino_model.xml\")\n",
"ov.save_model(combined_model, model_dir / \"combined_openvino_model.xml\")\n",
"\n",
"print(\"Original OpenVINO model inputs:\")\n",
"for input in ov_model.inputs:\n",
" print(input)\n",
"\n",
"print(\"\\nCombined OpenVINO model inputs:\")\n",
"for input in combined_model.inputs:\n",
" print(input)\n",
"\n",
"compiled_combined_model = core.compile_model(combined_model)\n",
"openvino_output = compiled_combined_model(text_input)\n",
"\n",
"print(f\"Logits: {openvino_output['logits']}\")"
"print(f\"\\nLogits: {openvino_output['logits']}\")"
]
},
{
@@ -538,7 +589,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.9.19"
},
"openvino_notebooks": {
"imageUrl": "https://github.com/openvinotoolkit/openvino_notebooks/assets/51917466/047f9167-a4ef-4d3d-a33b-d124541f9e2c",
