Merge branch 'main' into perplexity-pre-submit

nod-ai · Nov 21, 2024 · e88a3ea · e88a3ea
2 parents 93b6a2b + c8738b7
commit e88a3ea
Show file tree

Hide file tree

Showing 12 changed files with 397 additions and 82 deletions.
diff --git a/docs/developer_guide.md b/docs/developer_guide.md
@@ -3,6 +3,55 @@
 Each sub-project has its own developer guide. If you would like to work across
 projects, these instructions should help you get started:
 
+
+### Install Dependencies
+
+Install the shortfin build dependencies:
+```bash
+sudo apt update && sudo apt install -y clang lld
+```
+
+### Prepare your python environment
+
+Install the following system packages (for example with `sudo apt install`):
+
+```
+python-is-python3 python3-venv python3-dev
+```
+
+<details>
+
+<summary> Or, alternatively, use `pyenv` to manage a separate python installation for more control over its version: </summary>
+
+
+First, install pyenv and its dependencies.
+
+```bash
+sudo apt update; sudo apt install build-essential libssl-dev zlib1g-dev \
+libbz2-dev libreadline-dev libsqlite3-dev curl git \
+libncursesw5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev
+curl https://pyenv.run | bash
+```
+
+Then, make pyenv available by adding the below to your `~/.bashrc`:
+
+```bash
+export PYENV_ROOT="$HOME/.pyenv"
+command -v pyenv >/dev/null || export PATH="$PYENV_ROOT/bin:$PATH"
+eval "$(pyenv init -)"
+```
+
+Finally, install a pyenv-managed version of Python:
+
+```bash
+pyenv install 3.12 # or whichever python version you'd like
+pyenv local 3.12
+```
+
+Now, your python, pip, and venv should be managed by pyenv instead.
+
+</details>
+
 ### Setup a venv
 
 We recommend setting up a Python
@@ -54,8 +103,10 @@ See also: [nightly_releases.md](nightly_releases.md).
 ### Running tests
 
 ```bash
+pip install -r shortfin/requirements-tests.txt
 pytest sharktank
 pytest shortfin
+pytest app_tests/integration_tests
 ```
 
 ### Optional: pre-commits and developer settings

diff --git a/sharktank/sharktank/examples/export_paged_llm_v1.py b/sharktank/sharktank/examples/export_paged_llm_v1.py
@@ -167,11 +167,12 @@ def generate_batch_prefill(bs: int):
             model, llama_config.tensor_parallelism_size
         )
 
-        # We need to offset the indices for the cache
-        arg_affinities = {key + 3: arg_affinities[key] for key in arg_affinities}
+        if llama_config.tensor_parallelism_size > 1:
+            # We need to offset the indices for the cache
+            arg_affinities = {key + 3: arg_affinities[key] for key in arg_affinities}
 
-        for i in range(3):
-            arg_affinities[i] = DeviceAffinity("0")
+            for i in range(3):
+                arg_affinities[i] = DeviceAffinity("0")
 
         dynamic_shapes = {
             "tokens": {1: sl_dim},
@@ -244,12 +245,13 @@ def generate_batch_decode(bs: int):
             arg_affinities,
         ) = setup_cache(model, llama_config.tensor_parallelism_size)
 
-        # We need to offset the indices for the cache
-        arg_affinities = {key + 4: arg_affinities[key] for key in arg_affinities}
+        if llama_config.tensor_parallelism_size > 1:
+            # We need to offset the indices for the cache
+            arg_affinities = {key + 4: arg_affinities[key] for key in arg_affinities}
 
-        # Inputs have default affinity 0
-        for i in range(4):
-            arg_affinities[i] = DeviceAffinity("0")
+            # Inputs have default affinity 0
+            for i in range(4):
+                arg_affinities[i] = DeviceAffinity("0")
 
         dynamic_shapes = {
             "tokens": {},

diff --git a/sharktank/sharktank/layers/configs/llm_configs.py b/sharktank/sharktank/layers/configs/llm_configs.py
@@ -189,6 +189,7 @@ class T5Config:
     is_encoder_decoder: bool = True
     is_decoder: bool = False
     vocab_size: int = 32128
+    context_length: int = 512
     d_model: int = 512
     d_kv: int = 64
     d_ff: int = 2048
@@ -206,6 +207,7 @@ class T5Config:
     pad_token_id: int = 0
     eos_token_id: int = 1
     decoder_start_token_id: int = 0
+    context_length_padding_block_size: int = 16
 
     def __post_init__(self):
         self.is_gated_act = self.feed_forward_proj.startswith("gated-")
@@ -226,6 +228,7 @@ def from_gguf_properties(properties: dict[str, Any], **kwargs):
         )
 
         gguf_to_config_names_map = {
+            "t5.context_length": ["context_length"],
             "t5.embedding_length": ["d_model"],
             "t5.feed_forward_length": ["d_ff"],
             "t5.block_count": ["num_layers", "num_decoder_layers"],
@@ -245,6 +248,8 @@ def from_gguf_properties(properties: dict[str, Any], **kwargs):
                 for config_name in config_names
             }
         )
+        if "tokenizer.ggml.tokens" in properties:
+            all_kwargs["vocab_size"] = len(properties["tokenizer.ggml.tokens"])
         all_kwargs.update(kwargs)
 
         return T5Config(**all_kwargs)
diff --git a/sharktank/sharktank/models/t5/__init__.py b/sharktank/sharktank/models/t5/__init__.py
@@ -0,0 +1,8 @@
+# Copyright 2024 Advanced Micro Devices, Inc
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from .t5 import *
+from .export import *
diff --git a/sharktank/sharktank/models/t5/export.py b/sharktank/sharktank/models/t5/export.py
@@ -0,0 +1,97 @@
+# Copyright 2024 Advanced Micro Devices, Inc
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from typing import Union
+from pathlib import Path
+import torch
+
+from .t5 import T5Config, T5Encoder
+from ...types import Dataset
+from iree.turbine.aot import FxProgramsBuilder, export
+
+__all__ = [
+    "export_encoder_mlir",
+    "export_encoder_iree_parameters",
+    "prune_decoder_parameters",
+]
+
+
+def export_encoder_mlir(
+    model: Union[T5Encoder, Path, str],
+    batch_sizes: list[int],
+    mlir_output_path: str,
+):
+    """Export the T5 encoder to MLIR, one `forward_bs{N}` program per batch size.
+
+    The context-length dimension of `input_ids` is exported as a dynamic
+    dimension expressed as a multiple of
+    `config.context_length_padding_block_size`.
+
+    Args:
+      model: either the torch module or path to GGUF/IRPA.
+      batch_sizes: batch sizes to export a program for.
+      mlir_output_path: path the resulting MLIR is saved to.
+    """
+    if isinstance(model, (Path, str)):
+        dataset = Dataset.load(model)
+        config = T5Config.from_gguf_properties(
+            dataset.properties,
+            # TODO: add this property to our HuggingFace-to-GGUF conversion script.
+            # We currently use llama.cpp's converter and it can not make a distinction
+            # between T5 V1 and V1.1.
+            # V1 uses ReLU and V1.1 uses gated GeLU.
+            feed_forward_proj="gated-gelu",
+        )
+        model = T5Encoder(theta=dataset.root_theta, config=config)
+    else:
+        # An already constructed module carries its own config. Without this
+        # branch `config` would be unbound when a T5Encoder is passed in.
+        config = model.config
+
+    fxb = FxProgramsBuilder(model)
+
+    # Loop-invariant: the granularity of the dynamic context-length dimension.
+    block_size = config.context_length_padding_block_size
+    context_length_dim_idx = 1
+
+    for batch_size in batch_sizes:
+        sample_inputs = model.sample_inputs(batch_size)
+
+        sample_context_length = sample_inputs["input_ids"].shape[
+            context_length_dim_idx
+        ]
+        assert sample_context_length % block_size == 0, (
+            f"Context length {sample_context_length} is not a multiple of the "
+            f"padding block size {block_size}"
+        )
+        # The dynamic dimension is modeled as block_size * <number of blocks>,
+        # with the sample input's length as the upper bound.
+        context_length_block_dim = torch.export.Dim(
+            "block", max=sample_context_length // block_size
+        )
+        context_length_dim = block_size * context_length_block_dim
+        dynamic_shapes = {"input_ids": {context_length_dim_idx: context_length_dim}}
+
+        @fxb.export_program(
+            name=f"forward_bs{batch_size}",
+            args=tuple(sample_inputs.values()),
+            dynamic_shapes=dynamic_shapes,
+            strict=False,
+        )
+        def _(
+            model,
+            input_ids,
+        ):
+            return model(input_ids)
+
+    output = export(fxb, import_symbolic_shape_expressions=True)
+    output.save_mlir(mlir_output_path)
+
+
+def prune_decoder_parameters(dataset: Dataset):
+    """Strip decoder-only state from `dataset` in place.
+
+    Deletes the "dec" entry of the root theta tree and the
+    "t5.decoder_start_token_id" property. Entries that are already absent
+    are silently ignored.
+    """
+    # Decoder tensors/parameters.
+    try:
+        theta_tree = dataset.root_theta.tree
+        del theta_tree["dec"]
+    except KeyError:
+        pass
+
+    # Decoder-specific metadata.
+    try:
+        dataset_properties = dataset.properties
+        del dataset_properties["t5.decoder_start_token_id"]
+    except KeyError:
+        pass
+
+def export_encoder_iree_parameters(model_path: str, output_path: str):
+    """Load the dataset at `model_path`, prune its decoder entries and save it.
+
+    Args:
+      model_path: path of the dataset to load.
+      output_path: path the pruned dataset is saved to.
+    """
+    encoder_dataset = Dataset.load(model_path)
+    prune_decoder_parameters(encoder_dataset)
+    encoder_dataset.save(output_path)
diff --git a/sharktank/sharktank/models/t5/t5.py b/sharktank/sharktank/models/t5/t5.py
@@ -26,8 +26,20 @@
 )
 from ... import ops
 from ...types.theta import Theta
+from ...types.tensors import AnyTensor
 from ...layers import FFN, T5Config
 
+__all__ = [
+    "T5Config",
+    "T5LayerFF",
+    "T5Attention",
+    "T5SelfAttention",
+    "T5CrossAttention",
+    "T5Block",
+    "T5Stack",
+    "T5Encoder",
+]
+
 logger = logging.getLogger(__name__)
 
 
@@ -1044,6 +1056,22 @@ def __init__(self, theta: Theta, config: T5Config):
             theta=theta, config=encoder_config, embed_tokens=self.token_embedding
         )
 
+    @property
+    def config(self):
+        """Config of `self.encoder`."""
+        encoder = self.encoder
+        return encoder.config
+
+    def sample_inputs(self, batch_size: int) -> OrderedDict[str, AnyTensor]:
+        """Return sample inputs keyed by name.
+
+        The single "input_ids" entry is an uninitialized `torch.long` tensor of
+        shape `[batch_size, self.config.context_length]`.
+        """
+        input_ids_shape = [batch_size, self.config.context_length]
+        input_ids = torch.empty(size=input_ids_shape, dtype=torch.long)
+        return OrderedDict([("input_ids", input_ids)])
+
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,

diff --git a/sharktank/sharktank/types/gguf_interop/base.py b/sharktank/sharktank/types/gguf_interop/base.py
@@ -11,7 +11,7 @@
 import numpy as np
 import torch
 
-from gguf import GGUFReader, GGUFValueType
+from gguf import GGUFReader, GGUFValueType, ReaderField
 
 from iree.turbine.aot import (
     ExternalTensorTrait,
@@ -44,12 +44,26 @@ def _sanitize_scalar(scalar):
     return scalar
 
 
+def _load_array(field: ReaderField) -> list:
+    """Decode a GGUF array field into a Python list.
+
+    Only flat arrays are handled: `field.types` must hold exactly two entries,
+    the second being the element type. String elements are decoded as UTF-8;
+    scalar elements are passed through `_sanitize_scalar`.
+
+    Raises:
+      ValueError: if `field.types` does not have exactly two entries, or the
+        element type is neither a string nor a known scalar type.
+    """
+    if len(field.types) != 2:
+        raise ValueError(f"Unsupported array type {field.types}")
+    element_type = field.types[1]
+    # `field.data` holds the indices into `field.parts` of the array elements.
+    if element_type == GGUFValueType.STRING:
+        return [
+            str(bytes(field.parts[parts_index]), encoding="utf8")
+            for parts_index in field.data
+        ]
+    elif element_type in GGUFReader.gguf_scalar_to_np:
+        return [
+            _sanitize_scalar(field.parts[parts_index][0]) for parts_index in field.data
+        ]
+    else:
+        raise ValueError(f"Unsupported array element type {element_type}")
+
+
 def _load_properties(reader: GGUFReader) -> dict[str, Any]:
-    # TODO: Figure out what to do with tables.
-    tables: dict[str, Any] = {}
     properties: dict[str, Any] = {
         "schema": "GGUF",
-        # "tables": tables,
     }
 
     # Extract hyper-parameters. Adapted from gguf-dump.py
@@ -60,8 +74,10 @@ def _load_properties(reader: GGUFReader) -> dict[str, Any]:
                 properties[field.name] = str(bytes(field.parts[-1]), encoding="utf8")
             elif field.types[0] in reader.gguf_scalar_to_np:
                 properties[field.name] = _sanitize_scalar(field.parts[-1][0])
+        elif field.types[0] == GGUFValueType.ARRAY:
+            properties[field.name] = _load_array(field)
         else:
-            tables[field.name] = field.parts
+            raise ValueError(f"Invalid field type.")
     return properties
 
 

diff --git a/sharktank/sharktank/utils/iree.py b/sharktank/sharktank/utils/iree.py
@@ -6,7 +6,6 @@
 
 import iree.runtime
 from typing import List, Tuple, Optional, Union
-from copy import deepcopy
 from pathlib import Path
 import torch
 import numpy as np
@@ -91,14 +90,7 @@ def run_iree_module_function(
     )
     if trace_path_prefix is not None:
         for i, arg in enumerate(args):
-            # iree.runtime.DeviceArray.to_host() will cache the result and reuse it.
-            # In the meantime the "actual" device array may have changed.
-            # It kinda assumes immutable arrays.
-            # This should probably not be its behavior.
-            # See https://github.com/iree-org/iree/issues/18870.
-            # deepcopy also returns an numpy ndarray instead of DeviceArray.
-            arg_copy = deepcopy(arg)
-            np.save(f"{trace_path_prefix}{function_name}_arg{i}.npy", arg_copy)
+            np.save(f"{trace_path_prefix}{function_name}_arg{i}.npy", arg.to_host())
     results = invoker(*args)
     if isinstance(results, iree.runtime.DeviceArray):
         results = (results,)
@@ -107,10 +99,10 @@ def run_iree_module_function(
         for i, arg in enumerate(args):
             np.save(
                 f"{trace_path_prefix}{function_name}_arg{i}_post_call.npy",
-                deepcopy(arg),
+                arg.to_host(),
             )
         for i, arg in enumerate(results):
-            np.save(f"{trace_path_prefix}{function_name}_result{i}.npy", deepcopy(arg))
+            np.save(f"{trace_path_prefix}{function_name}_result{i}.npy", arg.to_host())
     return results
 
 
@@ -197,4 +189,4 @@ def call_torch_module_function(
 
 
 def iree_to_torch(*tensors: iree.runtime.DeviceArray) -> List[torch.Tensor]:
-    return [torch.tensor(deepcopy(tensor)) for tensor in tensors]
+    # Build one torch tensor per IREE device array from the result of its to_host() call.
+    return [torch.tensor(tensor.to_host()) for tensor in tensors]