Working OGA
jeremyfowers committed Aug 28, 2024
1 parent 6a5e685 commit d81877c
Showing 4 changed files with 79 additions and 27 deletions.
34 changes: 33 additions & 1 deletion setup.py
@@ -44,11 +44,43 @@
"GitPython>=3.1.40",
"psutil",
],
extras_require={"llm": {}, "llm-oga-igpu": {}, "llm-oga-npu": {}},
extras_require={
"llm": [
"tqdm",
"torch>=2.0.0",
"transformers",
"accelerate",
"py-cpuinfo",
"sentencepiece",
"datasets",
"fastapi",
],
"llm-oga-dml": [
"onnxruntime-directml==1.18.0",
"onnxruntime-genai-directml==0.2.0",
"tqdm",
"torch>=2.0.0",
"transformers",
"accelerate",
"py-cpuinfo",
"sentencepiece",
"datasets",
"fastapi",
],
"llm-oga-npu": [
"transformers",
"torch",
"onnx==1.16.0",
"onnxruntime==1.18.0",
"numpy==1.26.4",
]
},
classifiers=[],
entry_points={
"console_scripts": [
"turnkey=turnkeyml:turnkeycli",
"turnkey-llm=turnkeyml.llm:lemonadecli",
"lemonade=turnkeyml.llm:lemonadecli",
]
},
python_requires=">=3.8, <3.12",
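The new extras install with pip's standard optional-dependency syntax (e.g., `pip install -e .[llm-oga-dml]`). As a quick sanity check — a minimal sketch, not part of this commit — you can confirm that an extra's packages are importable; the import names `onnxruntime` and `onnxruntime_genai` for the two DirectML wheels are assumptions based on the usual naming of those packages:

```python
# Sketch: verify the "llm-oga-dml" extra is usable before running OGA tools.
# Assumed import names: onnxruntime-directml installs as "onnxruntime",
# onnxruntime-genai-directml installs as "onnxruntime_genai".
import importlib.util

required = ["onnxruntime", "onnxruntime_genai", "torch", "transformers"]
missing = [name for name in required if importlib.util.find_spec(name) is None]

if missing:
    raise SystemExit(
        f"Missing {missing}; install with: pip install -e .[llm-oga-dml]"
    )
print("All llm-oga-dml dependencies are importable.")
```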
51 changes: 40 additions & 11 deletions src/turnkeyml/llm/README.md
@@ -16,32 +16,40 @@ Contents:

1. Clone: `git clone https://github.com/onnx/turnkeyml.git`
1. Create and activate a conda environment:
1. `conda create -n lemon python=3.10`
1. `conda activate lemon`
1. `cd turnkeyml`
1. `conda create -n tk-llm python=3.10`
1. `conda activate tk-llm`
1. `cd turnkeyml` (where `turnkeyml` is the repo root of your TurnkeyML clone)
1. Install lemonade: `pip install -e .[llm]`
- or `pip install -e .[llm-og]` if you want to use `onnxruntime-genai`
- or `pip install -e .[llm-oga-dml]` if you want to use `onnxruntime-genai` (see [OGA](#install-onnxruntime-genai))
1. `lemonade -h` to explore the LLM tools

## Syntax

The `lemonade` CLI uses the same style of syntax as `turnkey`, but with a new set of LLM-specific tools. You can read about that syntax [here](https://github.com/onnx/turnkeyml#how-it-works).

## Chatting

To chat with your LLM try:

`lemonade -i facebook/opt-125m huggingface-load llm-prompt -p "Hello, my thoughts are"`

The LLM's response to your prompt will be printed to the screen. You can replace the `"Hello, my thoughts are"` with any prompt you like.
The LLM will run on CPU with your provided prompt, and its response will be printed to the screen. You can replace `"Hello, my thoughts are"` with any prompt you like.

You can also replace the `facebook/opt-125m` with any Huggingface checkpoint you like, including LLaMA-2, Phi-2, Qwen, Mamba, etc.

You can also set the `--device` argument in `huggingface-load` to load your LLM on a different device.

Run `lemonade huggingface-load -h` and `lemonade llm-prompt -h` to learn more about those tools.

## Accuracy

To measure the accuracy of an LLM in Torch eager mode, try this:
To measure the accuracy of an LLM using MMLU, try this:

`lemonade -i facebook/opt-125m huggingface-load accuracy-mmlu --tests management`

That command will run the management test from MMLU on your LLM and save the score to the lemonade cache at `~/.cache/lemonade`.
That command will run just the management test from MMLU on your LLM and save the score to the lemonade cache at `~/.cache/lemonade`.

Learn more about the options provided by a tool by calling `lemonade TOOL -h`, for example `lemonade accuracy-mmlu -h`.
You can run the full suite of MMLU subjects by omitting the `--tests` argument. You can learn more about this with `lemonade accuracy-mmlu -h`.
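For intuition about what the score means: MMLU is a multiple-choice benchmark, so accuracy is simply the fraction of questions where the model's top-ranked choice matches the answer key. A toy sketch of that scoring loop (illustrative only — not lemonade's implementation; `score_choice` is a stand-in for a real model's likelihood):

```python
# Toy MMLU-style scoring: accuracy = fraction of questions where the
# highest-scoring choice matches the answer key.
def score_choice(question: str, choice: str) -> float:
    # Stand-in for a real model's score: word overlap with the question.
    q_words = set(question.lower().replace("?", "").split())
    return float(len(q_words & set(choice.lower().split())))

questions = [
    {"q": "Which command installs a Python package?",
     "choices": ["pip install package", "rm -rf files"], "answer": 0},
    {"q": "Which command deletes files?",
     "choices": ["pip install package", "rm deletes files"], "answer": 1},
]

correct = 0
for item in questions:
    scores = [score_choice(item["q"], c) for c in item["choices"]]
    predicted = max(range(len(scores)), key=scores.__getitem__)
    correct += int(predicted == item["answer"])

print(f"accuracy: {correct / len(questions):.2f}")  # 1.00 on this toy set
```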

## Serving

@@ -74,13 +82,34 @@ print("Response:", state.response)

Lemonade supports specialized tools that each require their own setup steps. **Note:** These tools will only appear in `lemonade -h` if you run in an environment that has completed setup.

## Install OnnxRuntime-GenAI-DirectML
## Install OnnxRuntime-GenAI

To install support for [onnxruntime-genai](https://github.com/microsoft/onnxruntime-genai) (e.g., the `oga-load` Tool), use `pip install -e .[llm-oga-dml]` instead of the default installation command.

Next, you need to get an OGA model. Per the OGA instructions, we suggest Phi-3-Mini. Use the following command to download it from Hugging Face, and make sure to set your `--local-dir` to a folder under the `REPO_ROOT/src/turnkeyml/llm/tools/ort_genai/models` directory.

`huggingface-cli download microsoft/Phi-3-mini-4k-instruct-onnx --include directml/directml-int4-awq-block-128* --local-dir REPO_ROOT/src/turnkeyml/llm/tools/ort_genai/models/phi-3-mini-4k-instruct`

You can try it out with: `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 llm-prompt -p "Hello, my thoughts are"`

To install support for onnxruntime-genai (e.g., the `oga-load` Tool), use `pip install -e .[llm-og]` to install `lemonade`.
You can also try Phi-3-Mini-128k-Instruct with the following commands:

`huggingface-cli download microsoft/Phi-3-mini-128k-instruct-onnx --include directml/directml-int4-awq-block-128* --local-dir REPO_ROOT/src/turnkeyml/llm/tools/ort_genai/models/phi-3-mini-128k-instruct`

`lemonade -i microsoft/Phi-3-mini-128k-instruct oga-load --device igpu --dtype int4 llm-prompt -p "Hello, my thoughts are"`
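If you'd rather script these downloads than run `huggingface-cli` by hand, the same fetch can be expressed with `huggingface_hub` — a sketch assuming the `huggingface_hub` package is installed and that you run it from the repo root (the `REPO_ROOT` placeholder above):

```python
from huggingface_hub import snapshot_download

# Mirrors the huggingface-cli command above: fetch only the DirectML
# int4 AWQ files into the folder layout that oga-load expects.
snapshot_download(
    repo_id="microsoft/Phi-3-mini-4k-instruct-onnx",
    allow_patterns=["directml/directml-int4-awq-block-128*"],
    local_dir="src/turnkeyml/llm/tools/ort_genai/models/phi-3-mini-4k-instruct",
)
```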


> Note: no other models or devices are officially supported by `lemonade` on OGA at this time. Contributions appreciated!

## Install Ryzen AI NPU

To run your LLMs on Ryzen AI NPU, first install and set up the `ryzenai-transformers` conda environment. Then, install `lemonade` into `ryzenai-transformers`. The `ryzenai-npu-load` Tool will become available in that environment.
To run your LLMs on Ryzen AI NPU, first install and set up the `ryzenai-transformers` conda environment (see instructions [here](https://github.com/amd/RyzenAI-SW/tree/main/example/transformers)). Then, install `lemonade` into `ryzenai-transformers`. The `ryzenai-npu-load` Tool will become available in that environment.

You can try it out with: `lemonade -i meta-llama/Llama-2-7b-chat-hf ryzenai-npu-load --device DEVICE llm-prompt -p "Hello, my thoughts are"`

Where `DEVICE` is `"phx"` if you have a Ryzen AI 7xxx/8xxx processor, or `"stx"` if you have a Ryzen AI 3xx/9xxx processor.
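That processor-to-string mapping is easy to get backwards, so here is a tiny hypothetical helper (not part of lemonade) that encodes the mapping stated above:

```python
def ryzenai_device(series: str) -> str:
    """Map a Ryzen AI processor series to lemonade's --device string.

    Hypothetical helper encoding the note above; not a lemonade API.
    """
    if series in {"7xxx", "8xxx"}:
        return "phx"
    if series in {"3xx", "9xxx"}:
        return "stx"
    raise ValueError(f"Unsupported Ryzen AI series: {series}")

print(ryzenai_device("8xxx"))  # -> phx
```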

> Note: only `meta-llama/Llama-2-7b-chat-hf` and `microsoft/Phi-3-mini-4k-instruct` are supported by `lemonade` at this time. Contributions appreciated!

# Contributing

19 changes: 5 additions & 14 deletions src/turnkeyml/llm/tools/ort_genai/oga.py
@@ -190,14 +190,14 @@ def parser(add_help: bool = True) -> argparse.ArgumentParser:
parser.add_argument(
"-d",
"--device",
choices=["cpu", "igpu", "npu"],
choices=["igpu", "npu"],
default="igpu",
help="Which device to load the model on to (default: igpu)",
)

parser.add_argument(
"--dtype",
choices=["int4", "float16"],
choices=["int4"],
required=True,
help="Data type to load the model in",
)
@@ -209,26 +209,17 @@ def run(
state: State,
input: str = phi_3_mini_128k,
device: str = "igpu",
dtype: str = "float16",
dtype: str = "int4",
) -> State:

checkpoint = input

# Map of models[device][dtype][checkpoint] to the name of the model folder on disk
supported_models = {
"cpu": {
"int4": {phi_3_mini_128k: "Phi-3-mini-128k-instruct-onnx_int4_cpu"}
},
"igpu": {
"int4": {
llama_2: "llama2-7b-instruct-dml-int4-awq-block-128",
llama_3: "llama3-8b-instruct-dml-int4-awq-block-128",
phi_3_mini_128k: "Phi-3-mini-128k-instruct-onnx_int4_awq_block-128",
phi_3_mini_4k: "Phi-3-mini-4k-instruct-onnx_int4_awq_block-128",
},
"float16": {
phi_3_mini_128k: "microsoft_Phi-3-mini-128k-instruct",
phi_3_mini_4k: "microsoft_Phi-3-mini-4k-instruct",
phi_3_mini_128k: os.path.join("phi-3-mini-128k-instruct", "directml", "directml-int4-awq-block-128"),
phi_3_mini_4k: os.path.join("phi-3-mini-4k-instruct", "directml", "directml-int4-awq-block-128"),
},
},
"npu": {
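The `supported_models` map above resolves `models[device][dtype][checkpoint]` to a model folder on disk. A minimal sketch of how such a lookup might behave (illustrative; `resolve_model_folder` and `models_root` are hypothetical names, and the real resolution logic is not shown in this hunk):

```python
import os

# Illustrative lookup over a map shaped like supported_models:
# models[device][dtype][checkpoint] -> model folder on disk.
def resolve_model_folder(models, device, dtype, checkpoint, models_root):
    try:
        folder = models[device][dtype][checkpoint]
    except KeyError:
        raise ValueError(
            f"No OGA model for device={device!r}, dtype={dtype!r}, "
            f"checkpoint={checkpoint!r}"
        ) from None
    return os.path.join(models_root, folder)

models = {
    "igpu": {
        "int4": {
            "microsoft/Phi-3-mini-4k-instruct": os.path.join(
                "phi-3-mini-4k-instruct", "directml", "directml-int4-awq-block-128"
            ),
        },
    },
}
print(resolve_model_folder(
    models, "igpu", "int4", "microsoft/Phi-3-mini-4k-instruct", "models"
))
```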
2 changes: 1 addition & 1 deletion src/turnkeyml/version.py
@@ -1 +1 @@
__version__ = "4.0.0"
__version__ = "4.0.1"
