From caac38fb3db9d60bc9cceaee90f9f92ea3e29764 Mon Sep 17 00:00:00 2001 From: HwH Date: Mon, 15 Apr 2024 18:20:48 +0800 Subject: [PATCH 01/52] minicpm-v --- vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/minicpmv.py | 582 +++++++++++++++++++++++++ 2 files changed, 583 insertions(+) create mode 100644 vllm/model_executor/models/minicpmv.py diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 17fc970568042..58090f46f7775 100755 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -42,6 +42,7 @@ "MptForCausalLM": ("mpt", "MPTForCausalLM"), "MPTForCausalLM": ("mpt", "MPTForCausalLM"), "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"), + "MiniCPMV": ("minicpmv", "MiniCPMV"), "OLMoForCausalLM": ("olmo", "OLMoForCausalLM"), "OPTForCausalLM": ("opt", "OPTForCausalLM"), "OrionForCausalLM": ("orion", "OrionForCausalLM"), diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py new file mode 100644 index 0000000000000..1b614bb711d71 --- /dev/null +++ b/vllm/model_executor/models/minicpmv.py @@ -0,0 +1,582 @@ +from typing import List, Optional + +import math +import numpy as np +import torch +from torch import nn +from torch.nn.init import trunc_normal_ +import torch.nn.functional as F +from torchvision.transforms import InterpolationMode +from torchvision import transforms +from functools import partial +from PIL import Image +import timm +from timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD + +from datetime import datetime + +from vllm.attention import AttentionMetadata +from vllm.config import VisionLanguageConfig +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import LinearMethodBase +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.models.minicpm import MiniCPMForCausalLM +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + + +# from .configuration_minicpm import MiniCPMVConfig +# from .resampler import Resampler + + +_KEYS_TO_MODIFY_MAPPING = { + "language_model.lm_head": "lm_head", + "language_model.model": "language_model", +} + + +def get_abs_pos(abs_pos, tgt_size): + # abs_pos: L, C + # tgt_size: (H, W) + # return: M, C + src_size = int(math.sqrt(abs_pos.size(0))) + # tgt_size = int(math.sqrt(tgt_size)) + dtype = abs_pos.dtype + + return F.interpolate( + abs_pos.float().reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2), + size=(tgt_size[0], tgt_size[1]), + mode="bicubic", + align_corners=False, + ).permute(0, 2, 3, 1).flatten(0, 2).to(dtype=dtype) + + +# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20 +def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + if isinstance(grid_size, int): + grid_h_size, grid_w_size = grid_size, grid_size + else: + grid_h_size, grid_w_size = grid_size[0], grid_size[1] + + grid_h = np.arange(grid_h_size, dtype=np.float32) + grid_w = np.arange(grid_w_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_h_size, 
grid_w_size]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float32) + omega /= embed_dim / 2. + omega = 1. / 10000 ** omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + +class Resampler(nn.Module): + """ + A 2D perceiver-resampler network with one cross attention layers by + (grid_size**2) learnable queries and 2d sincos pos_emb + Outputs: + A tensor with the shape of (grid_size**2, embed_dim) + """ + + def __init__( + self, + grid_size, + embed_dim, + num_heads, + kv_dim=None, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + adaptive=False + ): + super().__init__() + self.num_queries = grid_size ** 2 + self.embed_dim = embed_dim + self.num_heads = num_heads + self.adaptive = adaptive + + self.pos_embed = nn.Parameter( + torch.from_numpy(get_2d_sincos_pos_embed(embed_dim, grid_size)).float() + ).requires_grad_(False) + + self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim)) + trunc_normal_(self.query, std=.02) + + if kv_dim is not None and kv_dim != embed_dim: + self.kv_proj = nn.Linear(kv_dim, embed_dim, bias=False) + else: + self.kv_proj = nn.Identity() + + self.attn = nn.MultiheadAttention(embed_dim, num_heads) + self.ln_q = norm_layer(embed_dim) + self.ln_kv = norm_layer(embed_dim) + + self.ln_post = norm_layer(embed_dim) + self.proj = nn.Parameter((embed_dim ** -0.5) * torch.randn(embed_dim, embed_dim)) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward(self, x, tgt_size=None, attn_mask=None): + if self.adaptive: + pos_embed = torch.Tensor(get_2d_sincos_pos_embed(self.embed_dim, tgt_size)).float().to(device=x.device, dtype=x.dtype) + else: + pos_embed = get_abs_pos(self.pos_embed, tgt_size) + + x = self.kv_proj(x) + x = self.ln_kv(x).permute(1, 0, 2) + + N = x.shape[1] + q = self.ln_q(self.query) + out = self.attn( + self._repeat(q, N) + self.pos_embed.unsqueeze(1), + x + pos_embed.unsqueeze(1), + x, + attn_mask=attn_mask)[0] + x = out.permute(1, 0, 2) + + x = self.ln_post(x) + x = x @ self.proj + return x + + def _repeat(self, query, N: int): + return query.unsqueeze(1).repeat(1, N, 1) + + +class MiniCPMV(nn.Module): + + def __init__( + self, + config, + linear_method: Optional["LinearMethodBase"] = None + ): + super().__init__() + self.config = config + self.linear_method = linear_method + self.llm = MiniCPMForCausalLM(config, 
linear_method) + self.vpm = self.init_vision_module() + self.vpm.to(dtype=torch.bfloat16) + self.vision_dim = self.vpm.embed_dim + self.embed_dim = self.llm.config.hidden_size + self.resampler = self.init_resampler(self.embed_dim, self.vision_dim) + self.resampler.to(device="cuda", dtype=torch.bfloat16) + self.transform, self.inv_transform, self.resize_transform = self.init_transform() + + self.sampler = Sampler() + + def init_vision_module(self): + default_dtype = torch.get_default_dtype() + torch.set_default_dtype(torch.float16) + model = timm.create_model( + 'vit_so400m_patch14_siglip_384.webli', + pretrained=False, + num_classes=0, + dynamic_img_size=True, + dynamic_img_pad=True + ) + torch.set_default_dtype(default_dtype) + if isinstance(model, timm.models.VisionTransformer): + if model.attn_pool is not None: + model.attn_pool = torch.nn.Identity() + + if self.config.drop_vision_last_layer: + model.blocks = model.blocks[:-1] + + return model + + def init_resampler(self, embed_dim, vision_dim): + default_dtype = torch.get_default_dtype() + torch.set_default_dtype(torch.float16) + resampler = Resampler( + grid_size=int(math.sqrt(self.config.query_num)), + embed_dim=embed_dim, + num_heads=embed_dim // 128, + kv_dim=vision_dim, + adaptive=True + ) + torch.set_default_dtype(default_dtype) + return resampler + + def init_transform(self): + return transforms.Compose([ + transforms.Normalize(mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD) + ]), transforms.ToPILImage(), transforms.Compose([ + transforms.Resize((self.config.scale_resolution, self.config.scale_resolution), InterpolationMode, antialias=True) + ]) + + def get_vision_embedding(self, pixel_values): + res = [] + dtype = self.vpm.pos_embed.data.dtype + for pixel_value in pixel_values: + # V2.0 start + H, W = pixel_value.shape[-2:] + tgt_size = ( + math.ceil(H / self.vpm.patch_embed.patch_size[0]), math.ceil(W / self.vpm.patch_embed.patch_size[0])) + # V2.0 end + vision_embedding = self.vpm.forward_features(pixel_value.unsqueeze(0).type(dtype)) + if hasattr(self.vpm, 'num_prefix_tokens') and self.vpm.num_prefix_tokens > 0: + vision_embedding = vision_embedding[:, self.vpm.num_prefix_tokens:] + res.append(self.resampler(vision_embedding, tgt_size)) + return torch.vstack(res) + + def get_image_bound(self, input_ids, im_start_token_id, im_end_token_id, unk_token_id): + length = len(input_ids) + bound = [] + im_start_idx = -1 + flag = False + for x in range(length): + if input_ids[x] == im_start_token_id: + if flag is False: + flag = True + im_start_idx = x + 1 + elif input_ids[x] == im_end_token_id: + if flag is True: + flag = False + bound.append(im_start_idx) + bound.append(x - 1) + elif input_ids[x] != unk_token_id: + if flag is True: + flag = False + if len(bound) > 0: + bound = torch.tensor(bound).reshape(-1, 2) + return bound + + def ensure_divide(self, length, patch_size): + return max(round(length / patch_size) * patch_size, patch_size) + + def find_best_resize(self, original_size, scale_resolution, patch_size, allow_upscale=False): + width, height = original_size + if (width * height > scale_resolution * scale_resolution) or allow_upscale: + r = width / height + height = int(scale_resolution / math.sqrt(r)) + width = int(height * r) + best_width = self.ensure_divide(width, patch_size) + best_height = self.ensure_divide(height, patch_size) + return (best_width, best_height) + + def get_refine_size(self, original_size, grid, scale_resolution, patch_size, allow_upscale=False): + width, height = original_size + grid_x, 
grid_y = grid + + refine_width = self.ensure_divide(width, grid_x) + refine_height = self.ensure_divide(height, grid_y) + + grid_width = refine_width / grid_x + grid_height = refine_height / grid_y + + best_grid_size = self.find_best_resize( + (grid_width, grid_height), + scale_resolution, + patch_size, + allow_upscale=allow_upscale + ) + refine_size = (best_grid_size[0] * grid_x, best_grid_size[1] * grid_y) + return refine_size + + def split_to_patches(self, image, grid): + patches = [] + width, height = (image.shape[-1], image.shape[-2]) + grid_x = int(width / grid[0]) + grid_y = int(height / grid[1]) + for i in range(0, height, grid_y): + images = [] + for j in range(0, width, grid_x): + patch = image[:, i:i+grid_y, j:j+grid_x] + images.append(patch) + patches.append(images) + + return patches + + def slice_image(self, image: torch.Tensor, max_slice_nums=9, scale_resolution=448, patch_size=14, never_split=False): + original_size = (image.shape[-1], image.shape[-2]) + original_width, original_height = original_size + log_ratio = math.log(original_width / original_height) + ratio = original_width * original_height / (scale_resolution * scale_resolution) + multiple = min(math.ceil(ratio), max_slice_nums) + + source_image = None + best_grid = None + patches = [] + + if multiple <= 1 or never_split: + best_size = self.find_best_resize(original_size, scale_resolution, patch_size) + resize_transform = transforms.Compose([transforms.Resize((best_size[::-1]), InterpolationMode.BICUBIC, antialias=True), transforms.Normalize(mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD)]) + source_image = resize_transform(image) + else: + candidate_split_grids_nums = [] + for i in [multiple - 1, multiple, multiple + 1]: + if i == 1 or i > max_slice_nums: + continue + candidate_split_grids_nums.append(i) + + best_resize = self.find_best_resize(original_size, scale_resolution, patch_size) + resize_transform = transforms.Compose([transforms.Resize(best_resize[::-1], InterpolationMode.BICUBIC, antialias=True), transforms.Normalize(mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD)]) + source_image = resize_transform(image.clone()) + candidate_grids = [] + + # find best grid + for split_grids_nums in candidate_split_grids_nums: + m = 1 + while m <= split_grids_nums: + if split_grids_nums % m == 0: + candidate_grids.append([m, split_grids_nums // m]) + m += 1 + + best_grid = [1, 1] + min_error = float("inf") + for grid in candidate_grids: + error = abs(log_ratio - math.log(grid[0] / grid[1])) + if error < min_error: + best_grid = grid + min_error = error + + refine_size = self.get_refine_size( + original_size, best_grid, scale_resolution, patch_size, allow_upscale=True + ) + + resize_transform = transforms.Compose([transforms.Resize(refine_size[::-1], InterpolationMode.BICUBIC, antialias=True), transforms.Normalize(mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD)]) + refine_image = resize_transform(image.clone()) + patches = self.split_to_patches(refine_image, best_grid) + + return source_image, patches, best_grid + + def get_grid_placeholder(self, grid, query_num): + image_placeholder = [self.config.im_start_token_id] + \ + [self.config.unk_token_id] * query_num + \ + [self.config.im_end_token_id] + + cols = grid[0] + rows = grid[1] + slices = [] + for i in range(rows): + lines = [] + for j in range(cols): + lines += image_placeholder + slices = slices + lines + if i < rows - 1: + slices += [5] # \n + slice_placeholder = [self.config.slice_start_token_id] + slices + 
[self.config.slice_end_token_id] + return slice_placeholder + + def get_slice_image_placeholder(self, image): + image_placeholder = [self.config.im_start_token_id] + \ + [self.config.unk_token_id] * self.config.query_num + \ + [self.config.im_end_token_id] + slice_images = [] + + source_image, patches, best_grid = self.slice_image( + image, + self.config.max_slice_nums, # default: 9 + self.config.scale_resolution, # default: 448 + self.config.patch_size # default: 14 + ) + + slice_images.append(source_image) + final_placeholder = image_placeholder + + if len(patches) > 0: + for i in range(len(patches)): + for j in range(len(patches[0])): + slice_images.append(patches[i][j]) + + + final_placeholder += self.get_grid_placeholder( + best_grid, self.config.query_num + ) + return slice_images, final_placeholder + + def modify_input_ids(self, input_ids, place_holder, im_start_token_id, im_end_token_id): + place_holder = torch.tensor(place_holder + [5]).to(device=input_ids.device, dtype=input_ids.dtype) + start_idx = 0 + end_idx = 0 + for x in range(input_ids.shape[0]): + if input_ids[x] == im_start_token_id: + start_idx = x + elif input_ids[x] == im_end_token_id: + end_idx = x + input_ids = torch.cat([input_ids[:start_idx], place_holder, input_ids[end_idx + 1:-place_holder.shape[0]+2]], dim=0) + image_start_tokens = torch.where(input_ids == im_start_token_id)[0] + # 跳过 im_start + image_start_tokens += 1 + image_end_tokens = torch.where(input_ids == im_end_token_id)[0] + valid_image_nums = max(len(image_start_tokens), len(image_end_tokens)) + if image_start_tokens[:valid_image_nums].unsqueeze(-1).shape[0] == image_end_tokens[:valid_image_nums].unsqueeze(-1).shape[0]: + image_bound = torch.cat( + [ + image_start_tokens[:valid_image_nums].unsqueeze(-1), + image_end_tokens[:valid_image_nums].unsqueeze(-1), + ], + dim=1 + ) + else: + image_bound = torch.tensor([]).to(device=input_ids.device, dtype=input_ids.dtype) + return image_bound, input_ids + + def get_vllm_embedding(self, data, im_start_token_id, im_end_token_id, unk_token_id): + if 'vision_hidden_states' not in data: + pixel_values = data['pixel_values'] + if pixel_values is not None and len(pixel_values) > 0: + images, places_holder = self.get_slice_image_placeholder( + pixel_values[0] + ) + vision_hidden_states = self.get_vision_embedding(images) + else: + vision_hidden_states = torch.tensor([]).to(data['input_ids'].device) + else: + vision_hidden_states = data['vision_hidden_states'] + + + if data['pixel_values'] is not None: + image_bound, input_ids = self.modify_input_ids(data['input_ids'], places_holder, im_start_token_id, im_end_token_id) + else: + input_ids = data['input_ids'] + image_bound = [] + + vllm_embedding = self.llm.model.embed_tokens(input_ids) * self.llm.config.scale_emb + vision_hidden_states = vision_hidden_states.type(vllm_embedding.dtype) + if len(vision_hidden_states) > 0: + if len(image_bound) > 0: + image_indices = torch.stack( + [torch.arange(r[0], r[1], dtype=torch.long) for r in image_bound] + ).to(vllm_embedding.device) + vllm_embedding.scatter_( + 0, + image_indices.view(-1, 1).repeat(1, vllm_embedding.shape[-1]), + vision_hidden_states.view(-1, vision_hidden_states.shape[-1]) + ) + # if data['pixel_values'] is not None: + # print('input_ids:', input_ids.shape, input_ids) + # print('vision_hidden_states:', vision_hidden_states.shape, vision_hidden_states.mean()) + # print('vllm_embedding:', vllm_embedding.shape, vllm_embedding.mean()) + return vllm_embedding, vision_hidden_states + + def forward( + self, + 
input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + image_input: Optional[torch.Tensor] = None, + ): + vllm_embeddings, vision_hidden_states = self.get_vllm_embedding({ + 'pixel_values': image_input, + 'input_ids': input_ids + }, self.config.im_start_token_id, self.config.im_end_token_id, self.config.unk_token_id) + output = self.llm( + input_ids=None, + positions=positions, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + inputs_embeds=vllm_embeddings + ) + return output + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + return self.llm.compute_logits(hidden_states, sampling_metadata) + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights( + self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None + ): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision + ): + # for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): + # if key_to_modify in name: + # name = name.replace(key_to_modify, new_key) + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + use_default_weight_loading = False + if "vpm" in name or 'resampler' in name: + # We only do sharding for language model and + # not vision model for now. 
+ use_default_weight_loading = True + else: + for (param_name, weight_name, + shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + use_default_weight_loading = True + if use_default_weight_loading: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + + \ No newline at end of file From a619354e6b52f063980acdd832c0e184a2793825 Mon Sep 17 00:00:00 2001 From: HwH Date: Mon, 15 Apr 2024 19:22:06 +0800 Subject: [PATCH 02/52] fix format --- vllm/model_executor/models/minicpmv.py | 358 +++++++++++++------------ 1 file changed, 190 insertions(+), 168 deletions(-) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 1b614bb711d71..5c2c6ae3ecbc3 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -1,23 +1,18 @@ +import math +from functools import partial from typing import List, Optional -import math import numpy as np +import timm import torch +import torch.nn.functional as F +from timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD from torch import nn from torch.nn.init import trunc_normal_ -import torch.nn.functional as F -from torchvision.transforms import InterpolationMode from torchvision import transforms -from functools import partial -from PIL import Image -import timm -from timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD - -from datetime import datetime +from torchvision.transforms import InterpolationMode from vllm.attention import AttentionMetadata -from vllm.config import VisionLanguageConfig -from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import LinearMethodBase from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.models.minicpm import MiniCPMForCausalLM @@ -26,11 +21,9 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput - # from .configuration_minicpm import MiniCPMVConfig # from .resampler import Resampler - _KEYS_TO_MODIFY_MAPPING = { "language_model.lm_head": "lm_head", "language_model.model": "language_model", @@ -58,7 +51,8 @@ def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): """ grid_size: int of the grid height and width return: - pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + pos_embed: [grid_size*grid_size, embed_dim] or + [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) """ if isinstance(grid_size, int): grid_h_size, grid_w_size = grid_size, grid_size @@ -73,7 +67,8 @@ def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): grid = grid.reshape([2, 1, grid_h_size, grid_w_size]) pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) if cls_token: - pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], + axis=0) return pos_embed @@ -81,8 +76,10 @@ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): assert embed_dim % 2 == 0 # use half of dimensions to encode grid_h - emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) - emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, + 
grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, + grid[1]) # (H*W, D/2) emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) return emb @@ -97,7 +94,7 @@ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): assert embed_dim % 2 == 0 omega = np.arange(embed_dim // 2, dtype=np.float32) omega /= embed_dim / 2. - omega = 1. / 10000 ** omega # (D/2,) + omega = 1. / 10000**omega # (D/2,) pos = pos.reshape(-1) # (M,) out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product @@ -117,24 +114,22 @@ class Resampler(nn.Module): A tensor with the shape of (grid_size**2, embed_dim) """ - def __init__( - self, - grid_size, - embed_dim, - num_heads, - kv_dim=None, - norm_layer=partial(nn.LayerNorm, eps=1e-6), - adaptive=False - ): + def __init__(self, + grid_size, + embed_dim, + num_heads, + kv_dim=None, + norm_layer=partial, + adaptive=False): super().__init__() - self.num_queries = grid_size ** 2 + self.num_queries = grid_size**2 self.embed_dim = embed_dim self.num_heads = num_heads self.adaptive = adaptive self.pos_embed = nn.Parameter( - torch.from_numpy(get_2d_sincos_pos_embed(embed_dim, grid_size)).float() - ).requires_grad_(False) + torch.from_numpy(get_2d_sincos_pos_embed( + embed_dim, grid_size)).float()).requires_grad_(False) self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim)) trunc_normal_(self.query, std=.02) @@ -149,7 +144,8 @@ def __init__( self.ln_kv = norm_layer(embed_dim) self.ln_post = norm_layer(embed_dim) - self.proj = nn.Parameter((embed_dim ** -0.5) * torch.randn(embed_dim, embed_dim)) + self.proj = nn.Parameter( + (embed_dim**-0.5) * torch.randn(embed_dim, embed_dim)) self.apply(self._init_weights) @@ -164,7 +160,10 @@ def _init_weights(self, m): def forward(self, x, tgt_size=None, attn_mask=None): if self.adaptive: - pos_embed = torch.Tensor(get_2d_sincos_pos_embed(self.embed_dim, tgt_size)).float().to(device=x.device, dtype=x.dtype) + pos_embed = torch.Tensor( + get_2d_sincos_pos_embed(self.embed_dim, + tgt_size)).float().to(device=x.device, + dtype=x.dtype) else: pos_embed = get_abs_pos(self.pos_embed, tgt_size) @@ -173,11 +172,10 @@ def forward(self, x, tgt_size=None, attn_mask=None): N = x.shape[1] q = self.ln_q(self.query) - out = self.attn( - self._repeat(q, N) + self.pos_embed.unsqueeze(1), - x + pos_embed.unsqueeze(1), - x, - attn_mask=attn_mask)[0] + out = self.attn(self._repeat(q, N) + self.pos_embed.unsqueeze(1), + x + pos_embed.unsqueeze(1), + x, + attn_mask=attn_mask)[0] x = out.permute(1, 0, 2) x = self.ln_post(x) @@ -190,11 +188,9 @@ def _repeat(self, query, N: int): class MiniCPMV(nn.Module): - def __init__( - self, - config, - linear_method: Optional["LinearMethodBase"] = None - ): + def __init__(self, + config, + linear_method: Optional["LinearMethodBase"] = None): super().__init__() self.config = config self.linear_method = linear_method @@ -205,24 +201,21 @@ def __init__( self.embed_dim = self.llm.config.hidden_size self.resampler = self.init_resampler(self.embed_dim, self.vision_dim) self.resampler.to(device="cuda", dtype=torch.bfloat16) - self.transform, self.inv_transform, self.resize_transform = self.init_transform() - self.sampler = Sampler() def init_vision_module(self): default_dtype = torch.get_default_dtype() torch.set_default_dtype(torch.float16) - model = timm.create_model( - 'vit_so400m_patch14_siglip_384.webli', - pretrained=False, - num_classes=0, - dynamic_img_size=True, - dynamic_img_pad=True - ) + model = timm.create_model('vit_so400m_patch14_siglip_384.webli', + pretrained=False, + 
num_classes=0, + dynamic_img_size=True, + dynamic_img_pad=True) torch.set_default_dtype(default_dtype) - if isinstance(model, timm.models.VisionTransformer): - if model.attn_pool is not None: - model.attn_pool = torch.nn.Identity() + if isinstance( + model, + timm.models.VisionTransformer) and model.attn_pool is not None: + model.attn_pool = torch.nn.Identity() if self.config.drop_vision_last_layer: model.blocks = model.blocks[:-1] @@ -232,39 +225,34 @@ def init_vision_module(self): def init_resampler(self, embed_dim, vision_dim): default_dtype = torch.get_default_dtype() torch.set_default_dtype(torch.float16) - resampler = Resampler( - grid_size=int(math.sqrt(self.config.query_num)), - embed_dim=embed_dim, - num_heads=embed_dim // 128, - kv_dim=vision_dim, - adaptive=True - ) + resampler = Resampler(grid_size=int(math.sqrt(self.config.query_num)), + embed_dim=embed_dim, + num_heads=embed_dim // 128, + kv_dim=vision_dim, + adaptive=True) torch.set_default_dtype(default_dtype) return resampler - def init_transform(self): - return transforms.Compose([ - transforms.Normalize(mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD) - ]), transforms.ToPILImage(), transforms.Compose([ - transforms.Resize((self.config.scale_resolution, self.config.scale_resolution), InterpolationMode, antialias=True) - ]) - def get_vision_embedding(self, pixel_values): res = [] dtype = self.vpm.pos_embed.data.dtype for pixel_value in pixel_values: # V2.0 start H, W = pixel_value.shape[-2:] - tgt_size = ( - math.ceil(H / self.vpm.patch_embed.patch_size[0]), math.ceil(W / self.vpm.patch_embed.patch_size[0])) + tgt_size = (math.ceil(H / self.vpm.patch_embed.patch_size[0]), + math.ceil(W / self.vpm.patch_embed.patch_size[0])) # V2.0 end - vision_embedding = self.vpm.forward_features(pixel_value.unsqueeze(0).type(dtype)) - if hasattr(self.vpm, 'num_prefix_tokens') and self.vpm.num_prefix_tokens > 0: - vision_embedding = vision_embedding[:, self.vpm.num_prefix_tokens:] + vision_embedding = self.vpm.forward_features( + pixel_value.unsqueeze(0).type(dtype)) + if hasattr(self.vpm, + 'num_prefix_tokens') and self.vpm.num_prefix_tokens > 0: + vision_embedding = vision_embedding[:, self.vpm. 
+ num_prefix_tokens:] res.append(self.resampler(vision_embedding, tgt_size)) return torch.vstack(res) - def get_image_bound(self, input_ids, im_start_token_id, im_end_token_id, unk_token_id): + def get_image_bound(self, input_ids, im_start_token_id, im_end_token_id, + unk_token_id): length = len(input_ids) bound = [] im_start_idx = -1 @@ -289,9 +277,14 @@ def get_image_bound(self, input_ids, im_start_token_id, im_end_token_id, unk_tok def ensure_divide(self, length, patch_size): return max(round(length / patch_size) * patch_size, patch_size) - def find_best_resize(self, original_size, scale_resolution, patch_size, allow_upscale=False): + def find_best_resize(self, + original_size, + scale_resolution, + patch_size, + allow_upscale=False): width, height = original_size - if (width * height > scale_resolution * scale_resolution) or allow_upscale: + if (width * height > + scale_resolution * scale_resolution) or allow_upscale: r = width / height height = int(scale_resolution / math.sqrt(r)) width = int(height * r) @@ -299,7 +292,12 @@ def find_best_resize(self, original_size, scale_resolution, patch_size, allow_up best_height = self.ensure_divide(height, patch_size) return (best_width, best_height) - def get_refine_size(self, original_size, grid, scale_resolution, patch_size, allow_upscale=False): + def get_refine_size(self, + original_size, + grid, + scale_resolution, + patch_size, + allow_upscale=False): width, height = original_size grid_x, grid_y = grid @@ -309,12 +307,10 @@ def get_refine_size(self, original_size, grid, scale_resolution, patch_size, all grid_width = refine_width / grid_x grid_height = refine_height / grid_y - best_grid_size = self.find_best_resize( - (grid_width, grid_height), - scale_resolution, - patch_size, - allow_upscale=allow_upscale - ) + best_grid_size = self.find_best_resize((grid_width, grid_height), + scale_resolution, + patch_size, + allow_upscale=allow_upscale) refine_size = (best_grid_size[0] * grid_x, best_grid_size[1] * grid_y) return refine_size @@ -326,17 +322,23 @@ def split_to_patches(self, image, grid): for i in range(0, height, grid_y): images = [] for j in range(0, width, grid_x): - patch = image[:, i:i+grid_y, j:j+grid_x] + patch = image[:, i:i + grid_y, j:j + grid_x] images.append(patch) patches.append(images) - + return patches - def slice_image(self, image: torch.Tensor, max_slice_nums=9, scale_resolution=448, patch_size=14, never_split=False): + def slice_image(self, + image: torch.Tensor, + max_slice_nums=9, + scale_resolution=448, + patch_size=14, + never_split=False): original_size = (image.shape[-1], image.shape[-2]) original_width, original_height = original_size log_ratio = math.log(original_width / original_height) - ratio = original_width * original_height / (scale_resolution * scale_resolution) + ratio = original_width * original_height / (scale_resolution * + scale_resolution) multiple = min(math.ceil(ratio), max_slice_nums) source_image = None @@ -344,8 +346,15 @@ def slice_image(self, image: torch.Tensor, max_slice_nums=9, scale_resolution=44 patches = [] if multiple <= 1 or never_split: - best_size = self.find_best_resize(original_size, scale_resolution, patch_size) - resize_transform = transforms.Compose([transforms.Resize((best_size[::-1]), InterpolationMode.BICUBIC, antialias=True), transforms.Normalize(mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD)]) + best_size = self.find_best_resize(original_size, scale_resolution, + patch_size) + resize_transform = transforms.Compose([ + transforms.Resize((best_size[::-1]), + 
InterpolationMode.BICUBIC, + antialias=True), + transforms.Normalize(mean=IMAGENET_INCEPTION_MEAN, + std=IMAGENET_INCEPTION_STD) + ]) source_image = resize_transform(image) else: candidate_split_grids_nums = [] @@ -353,9 +362,16 @@ def slice_image(self, image: torch.Tensor, max_slice_nums=9, scale_resolution=44 if i == 1 or i > max_slice_nums: continue candidate_split_grids_nums.append(i) - - best_resize = self.find_best_resize(original_size, scale_resolution, patch_size) - resize_transform = transforms.Compose([transforms.Resize(best_resize[::-1], InterpolationMode.BICUBIC, antialias=True), transforms.Normalize(mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD)]) + + best_resize = self.find_best_resize(original_size, + scale_resolution, patch_size) + resize_transform = transforms.Compose([ + transforms.Resize(best_resize[::-1], + InterpolationMode.BICUBIC, + antialias=True), + transforms.Normalize(mean=IMAGENET_INCEPTION_MEAN, + std=IMAGENET_INCEPTION_STD) + ]) source_image = resize_transform(image.clone()) candidate_grids = [] @@ -366,7 +382,7 @@ def slice_image(self, image: torch.Tensor, max_slice_nums=9, scale_resolution=44 if split_grids_nums % m == 0: candidate_grids.append([m, split_grids_nums // m]) m += 1 - + best_grid = [1, 1] min_error = float("inf") for grid in candidate_grids: @@ -374,15 +390,23 @@ def slice_image(self, image: torch.Tensor, max_slice_nums=9, scale_resolution=44 if error < min_error: best_grid = grid min_error = error - - refine_size = self.get_refine_size( - original_size, best_grid, scale_resolution, patch_size, allow_upscale=True - ) - resize_transform = transforms.Compose([transforms.Resize(refine_size[::-1], InterpolationMode.BICUBIC, antialias=True), transforms.Normalize(mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD)]) + refine_size = self.get_refine_size(original_size, + best_grid, + scale_resolution, + patch_size, + allow_upscale=True) + + resize_transform = transforms.Compose([ + transforms.Resize(refine_size[::-1], + InterpolationMode.BICUBIC, + antialias=True), + transforms.Normalize(mean=IMAGENET_INCEPTION_MEAN, + std=IMAGENET_INCEPTION_STD) + ]) refine_image = resize_transform(image.clone()) patches = self.split_to_patches(refine_image, best_grid) - + return source_image, patches, best_grid def get_grid_placeholder(self, grid, query_num): @@ -399,8 +423,9 @@ def get_grid_placeholder(self, grid, query_num): lines += image_placeholder slices = slices + lines if i < rows - 1: - slices += [5] # \n - slice_placeholder = [self.config.slice_start_token_id] + slices + [self.config.slice_end_token_id] + slices += [5] # \n + slice_placeholder = [self.config.slice_start_token_id + ] + slices + [self.config.slice_end_token_id] return slice_placeholder def get_slice_image_placeholder(self, image): @@ -411,9 +436,9 @@ def get_slice_image_placeholder(self, image): source_image, patches, best_grid = self.slice_image( image, - self.config.max_slice_nums, # default: 9 - self.config.scale_resolution, # default: 448 - self.config.patch_size # default: 14 + self.config.max_slice_nums, # default: 9 + self.config.scale_resolution, # default: 448 + self.config.patch_size # default: 14 ) slice_images.append(source_image) @@ -423,15 +448,15 @@ def get_slice_image_placeholder(self, image): for i in range(len(patches)): for j in range(len(patches[0])): slice_images.append(patches[i][j]) - - + final_placeholder += self.get_grid_placeholder( - best_grid, self.config.query_num - ) + best_grid, self.config.query_num) return slice_images, final_placeholder - def 
modify_input_ids(self, input_ids, place_holder, im_start_token_id, im_end_token_id): - place_holder = torch.tensor(place_holder + [5]).to(device=input_ids.device, dtype=input_ids.dtype) + def modify_input_ids(self, input_ids, place_holder, im_start_token_id, + im_end_token_id): + place_holder = torch.tensor(place_holder + [5]).to( + device=input_ids.device, dtype=input_ids.dtype) start_idx = 0 end_idx = 0 for x in range(input_ids.shape[0]): @@ -439,60 +464,62 @@ def modify_input_ids(self, input_ids, place_holder, im_start_token_id, im_end_to start_idx = x elif input_ids[x] == im_end_token_id: end_idx = x - input_ids = torch.cat([input_ids[:start_idx], place_holder, input_ids[end_idx + 1:-place_holder.shape[0]+2]], dim=0) + input_ids = torch.cat([ + input_ids[:start_idx], place_holder, + input_ids[end_idx + 1:-place_holder.shape[0] + 2] + ], + dim=0) image_start_tokens = torch.where(input_ids == im_start_token_id)[0] # 跳过 im_start image_start_tokens += 1 image_end_tokens = torch.where(input_ids == im_end_token_id)[0] valid_image_nums = max(len(image_start_tokens), len(image_end_tokens)) - if image_start_tokens[:valid_image_nums].unsqueeze(-1).shape[0] == image_end_tokens[:valid_image_nums].unsqueeze(-1).shape[0]: - image_bound = torch.cat( - [ - image_start_tokens[:valid_image_nums].unsqueeze(-1), - image_end_tokens[:valid_image_nums].unsqueeze(-1), - ], - dim=1 - ) + if image_start_tokens[:valid_image_nums].unsqueeze( + -1).shape[0] == image_end_tokens[:valid_image_nums].unsqueeze( + -1).shape[0]: + image_bound = torch.cat([ + image_start_tokens[:valid_image_nums].unsqueeze(-1), + image_end_tokens[:valid_image_nums].unsqueeze(-1), + ], + dim=1) else: - image_bound = torch.tensor([]).to(device=input_ids.device, dtype=input_ids.dtype) + image_bound = torch.tensor([]).to(device=input_ids.device, + dtype=input_ids.dtype) return image_bound, input_ids - def get_vllm_embedding(self, data, im_start_token_id, im_end_token_id, unk_token_id): + def get_vllm_embedding(self, data, im_start_token_id, im_end_token_id, + unk_token_id): if 'vision_hidden_states' not in data: pixel_values = data['pixel_values'] if pixel_values is not None and len(pixel_values) > 0: images, places_holder = self.get_slice_image_placeholder( - pixel_values[0] - ) + pixel_values[0]) vision_hidden_states = self.get_vision_embedding(images) else: - vision_hidden_states = torch.tensor([]).to(data['input_ids'].device) + vision_hidden_states = torch.tensor([]).to( + data['input_ids'].device) else: vision_hidden_states = data['vision_hidden_states'] - if data['pixel_values'] is not None: - image_bound, input_ids = self.modify_input_ids(data['input_ids'], places_holder, im_start_token_id, im_end_token_id) + image_bound, input_ids = self.modify_input_ids( + data['input_ids'], places_holder, im_start_token_id, + im_end_token_id) else: input_ids = data['input_ids'] image_bound = [] - vllm_embedding = self.llm.model.embed_tokens(input_ids) * self.llm.config.scale_emb + vllm_embedding = self.llm.model.embed_tokens( + input_ids) * self.llm.config.scale_emb vision_hidden_states = vision_hidden_states.type(vllm_embedding.dtype) - if len(vision_hidden_states) > 0: - if len(image_bound) > 0: - image_indices = torch.stack( - [torch.arange(r[0], r[1], dtype=torch.long) for r in image_bound] - ).to(vllm_embedding.device) - vllm_embedding.scatter_( - 0, - image_indices.view(-1, 1).repeat(1, vllm_embedding.shape[-1]), - vision_hidden_states.view(-1, vision_hidden_states.shape[-1]) - ) - # if data['pixel_values'] is not None: - # print('input_ids:', 
input_ids.shape, input_ids) - # print('vision_hidden_states:', vision_hidden_states.shape, vision_hidden_states.mean()) - # print('vllm_embedding:', vllm_embedding.shape, vllm_embedding.mean()) + if len(vision_hidden_states) > 0 and len(image_bound) > 0: + image_indices = torch.stack([ + torch.arange(r[0], r[1], dtype=torch.long) for r in image_bound + ]).to(vllm_embedding.device) + vllm_embedding.scatter_( + 0, + image_indices.view(-1, 1).repeat(1, vllm_embedding.shape[-1]), + vision_hidden_states.view(-1, vision_hidden_states.shape[-1])) return vllm_embedding, vision_hidden_states def forward( @@ -503,17 +530,17 @@ def forward( attn_metadata: AttentionMetadata, image_input: Optional[torch.Tensor] = None, ): - vllm_embeddings, vision_hidden_states = self.get_vllm_embedding({ - 'pixel_values': image_input, - 'input_ids': input_ids - }, self.config.im_start_token_id, self.config.im_end_token_id, self.config.unk_token_id) - output = self.llm( - input_ids=None, - positions=positions, - kv_caches=kv_caches, - attn_metadata=attn_metadata, - inputs_embeds=vllm_embeddings - ) + vllm_embeddings, vision_hidden_states = self.get_vllm_embedding( + { + 'pixel_values': image_input, + 'input_ids': input_ids + }, self.config.im_start_token_id, self.config.im_end_token_id, + self.config.unk_token_id) + output = self.llm(input_ids=None, + positions=positions, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + inputs_embeds=vllm_embeddings) return output def compute_logits(self, hidden_states: torch.Tensor, @@ -527,16 +554,14 @@ def sample( ) -> Optional[SamplerOutput]: next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - - def load_weights( - self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None - ): + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): stacked_params_mapping = [ - # (param_name, shard_name, shard_id) + # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), ("qkv_proj", "k_proj", "k"), ("qkv_proj", "v_proj", "v"), @@ -545,11 +570,10 @@ def load_weights( ] params_dict = dict(self.named_parameters()) for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision - ): - # for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): - # if key_to_modify in name: - # name = name.replace(key_to_modify, new_key) + model_name_or_path, cache_dir, load_format, revision): + # for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): + # if key_to_modify in name: + # name = name.replace(key_to_modify, new_key) if "rotary_emb.inv_freq" in name: continue if ("rotary_emb.cos_cached" in name @@ -578,5 +602,3 @@ def load_weights( weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) - - \ No newline at end of file From 6f7e0efa2cce9e3076145e0fc63092c7b484747a Mon Sep 17 00:00:00 2001 From: HwH Date: Mon, 15 Apr 2024 19:26:53 +0800 Subject: [PATCH 03/52] add minicpmv example --- examples/minicpmv_example.py | 131 +++++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 examples/minicpmv_example.py diff --git a/examples/minicpmv_example.py b/examples/minicpmv_example.py new file mode 100644 index 0000000000000..e27e0e5aa02d3 --- /dev/null +++ b/examples/minicpmv_example.py @@ -0,0 +1,131 @@ +import torch +import math +from PIL import Image +from torchvision import 
transforms +from vllm import LLM, SamplingParams +from vllm.sequence import MultiModalData +from transformers import AutoConfig, AutoTokenizer + + +def slice_image( + image, max_slice_nums=9, scale_resolution=448, patch_size=14, never_split=False +): + original_size = image.size + original_width, original_height = original_size + log_ratio = math.log(original_width / original_height) + ratio = original_width * original_height / (scale_resolution * scale_resolution) + multiple = min(math.ceil(ratio), max_slice_nums) + + best_grid = None + + if multiple > 1 and not never_split: + candidate_split_grids_nums = [] + for i in [multiple - 1, multiple, multiple + 1]: + if i == 1 or i > max_slice_nums: + continue + candidate_split_grids_nums.append(i) + + # source image, down-sampling and ensure divided by patch_size + candidate_grids = [] + + # find best grid + for split_grids_nums in candidate_split_grids_nums: + m = 1 + while m <= split_grids_nums: + if split_grids_nums % m == 0: + candidate_grids.append([m, split_grids_nums // m]) + m += 1 + + best_grid = [1, 1] + min_error = float("inf") + for grid in candidate_grids: + error = abs(log_ratio - math.log(grid[0] / grid[1])) + if error < min_error: + best_grid = grid + min_error = error + + return best_grid + + +def get_grid_placeholder(grid, query_num): + image_placeholder = query_num + 2 + + cols = grid[0] + rows = grid[1] + slices = 0 + for i in range(rows): + lines = 0 + for j in range(cols): + lines += image_placeholder + if i < rows - 1: + slices += lines + 1 + else: + slices += lines + slice_placeholder = 2 + slices + return slice_placeholder + + +def get_slice_image_placeholder(config, image): + image_placeholder = config.query_num + 2 + + best_grid = slice_image( + image, + config.max_slice_nums, + config.scale_resolution, + config.patch_size, + ) + final_placeholder = image_placeholder + + if best_grid is not None: + final_placeholder += get_grid_placeholder( + best_grid, config.query_num + ) + + return final_placeholder - 1 + + +config = AutoConfig.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True) +tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True) + + +sampling_params = SamplingParams( + temperature=0.7, + top_p=0.8, + top_k=100, + seed=3472, + max_tokens=1024, + min_tokens=150, + # temperature=0, + # use_beam_search=True, + # length_penalty=1.2, + # best_of=3 +) +llm = LLM( + model="openbmb/MiniCPM-V-2", + image_input_type="pixel_values", + image_token_id=101, + image_input_shape="1,3,448,448", + image_feature_size=64, + gpu_memory_utilization=0.75, + trust_remote_code=True, +) + + +if __name__ == '__main__': + image = Image.open('./example.png').convert('RGB') + addtion_tokens = get_slice_image_placeholder(config, image) + image = transforms.Compose([transforms.ToTensor()])(img=image) + images = torch.stack([image]) + + prompt = "<用户>" + \ + "Provide an intricate description of the image,capturing its visual elements, including colors, shapes, textures, objects, and any people present." 
+ \ + "" + '' * addtion_tokens + + outputs = llm.generate( + prompt, + multi_modal_data=MultiModalData(type=MultiModalData.Type.IMAGE, data=images), + sampling_params=sampling_params + ) + + print(outputs[0].outputs[0].text) + From 75c2d3175e0c9abacaa099feafcdf1054ca6d8f9 Mon Sep 17 00:00:00 2001 From: HwH Date: Mon, 15 Apr 2024 19:29:54 +0800 Subject: [PATCH 04/52] fix format --- examples/minicpmv_example.py | 49 ++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/examples/minicpmv_example.py b/examples/minicpmv_example.py index e27e0e5aa02d3..1237276201fa4 100644 --- a/examples/minicpmv_example.py +++ b/examples/minicpmv_example.py @@ -1,21 +1,26 @@ -import torch import math + +import torch from PIL import Image from torchvision import transforms +from transformers import AutoConfig, AutoTokenizer + from vllm import LLM, SamplingParams from vllm.sequence import MultiModalData -from transformers import AutoConfig, AutoTokenizer -def slice_image( - image, max_slice_nums=9, scale_resolution=448, patch_size=14, never_split=False -): +def slice_image(image, + max_slice_nums=9, + scale_resolution=448, + patch_size=14, + never_split=False): original_size = image.size original_width, original_height = original_size log_ratio = math.log(original_width / original_height) - ratio = original_width * original_height / (scale_resolution * scale_resolution) + ratio = original_width * original_height / (scale_resolution * + scale_resolution) multiple = min(math.ceil(ratio), max_slice_nums) - + best_grid = None if multiple > 1 and not never_split: @@ -73,24 +78,23 @@ def get_slice_image_placeholder(config, image): config.max_slice_nums, config.scale_resolution, config.patch_size, - ) + ) final_placeholder = image_placeholder if best_grid is not None: - final_placeholder += get_grid_placeholder( - best_grid, config.query_num - ) + final_placeholder += get_grid_placeholder(best_grid, config.query_num) return final_placeholder - 1 -config = AutoConfig.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True) -tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True) - +config = AutoConfig.from_pretrained('openbmb/MiniCPM-V-2', + trust_remote_code=True) +tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2', + trust_remote_code=True) sampling_params = SamplingParams( - temperature=0.7, - top_p=0.8, + temperature=0.7, + top_p=0.8, top_k=100, seed=3472, max_tokens=1024, @@ -110,7 +114,6 @@ def get_slice_image_placeholder(config, image): trust_remote_code=True, ) - if __name__ == '__main__': image = Image.open('./example.png').convert('RGB') addtion_tokens = get_slice_image_placeholder(config, image) @@ -118,14 +121,12 @@ def get_slice_image_placeholder(config, image): images = torch.stack([image]) prompt = "<用户>" + \ - "Provide an intricate description of the image,capturing its visual elements, including colors, shapes, textures, objects, and any people present." + \ + "Provide an intricate description of the image." 
+ \ "" + '' * addtion_tokens - outputs = llm.generate( - prompt, - multi_modal_data=MultiModalData(type=MultiModalData.Type.IMAGE, data=images), - sampling_params=sampling_params - ) + outputs = llm.generate(prompt, + multi_modal_data=MultiModalData( + type=MultiModalData.Type.IMAGE, data=images), + sampling_params=sampling_params) print(outputs[0].outputs[0].text) - From 4b4c7f3c6a1b344ccb5a500de8bbaa11681f67cd Mon Sep 17 00:00:00 2001 From: HwH Date: Tue, 23 Apr 2024 15:42:04 +0800 Subject: [PATCH 05/52] add timm import hints --- vllm/model_executor/models/minicpm.py | 13 ++++++------- vllm/model_executor/models/minicpmv.py | 10 ++++++++-- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 49eda9c9a8112..acc4aec875354 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -442,13 +442,12 @@ def __init__( config.vocab_size) self.sampler = Sampler() - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - ) -> torch.Tensor: + def forward(self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + inputs_embeds: Optional[torch.Tensor] = None) -> torch.Tensor: hidden_states = self.model(input_ids, positions, kv_caches, attn_metadata) return hidden_states diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 5c2c6ae3ecbc3..07abba1ff0d0e 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -3,7 +3,11 @@ from typing import List, Optional import numpy as np -import timm + +try: + import timm +except ImportError: + raise ImportError('Please install timm==0.9.10') from ImportError import torch import torch.nn.functional as F from timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD @@ -114,12 +118,14 @@ class Resampler(nn.Module): A tensor with the shape of (grid_size**2, embed_dim) """ + default_norm_layer = partial(nn.LayerNorm, eps=1e-6) + def __init__(self, grid_size, embed_dim, num_heads, kv_dim=None, - norm_layer=partial, + norm_layer=default_norm_layer, adaptive=False): super().__init__() self.num_queries = grid_size**2 From 189d28e2c75c19c2268393863492c6564148511b Mon Sep 17 00:00:00 2001 From: HwH Date: Tue, 23 Apr 2024 17:00:42 +0800 Subject: [PATCH 06/52] adapt to new vllm version --- vllm/model_executor/models/minicpm.py | 16 +++++++++------- vllm/model_executor/models/minicpmv.py | 18 +++++++----------- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 03d67f55fdabc..f2ea0a3f281b7 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -441,14 +441,16 @@ def __init__( config.vocab_size) self.sampler = Sampler() - def forward(self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - inputs_embeds: Optional[torch.Tensor] = None) -> torch.Tensor: + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + input_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata) + attn_metadata, input_embeds) return 
hidden_states def compute_logits(self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 07abba1ff0d0e..2a2c478b2a825 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -1,6 +1,6 @@ import math from functools import partial -from typing import List, Optional +from typing import Iterable, List, Optional, Tuple import numpy as np @@ -19,10 +19,9 @@ from vllm.attention import AttentionMetadata from vllm.model_executor.layers.linear import LinearMethodBase from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.minicpm import MiniCPMForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) from vllm.sequence import SamplerOutput # from .configuration_minicpm import MiniCPMVConfig @@ -461,6 +460,8 @@ def get_slice_image_placeholder(self, image): def modify_input_ids(self, input_ids, place_holder, im_start_token_id, im_end_token_id): + if len(torch.where(input_ids == im_end_token_id)[0]) == 0: + return [], input_ids place_holder = torch.tensor(place_holder + [5]).to( device=input_ids.device, dtype=input_ids.dtype) start_idx = 0 @@ -546,7 +547,7 @@ def forward( positions=positions, kv_caches=kv_caches, attn_metadata=attn_metadata, - inputs_embeds=vllm_embeddings) + input_embeds=vllm_embeddings) return output def compute_logits(self, hidden_states: torch.Tensor, @@ -561,11 +562,7 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -575,8 +572,7 @@ def load_weights(self, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): + for name, loaded_weight in weights: # for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): # if key_to_modify in name: # name = name.replace(key_to_modify, new_key) From af353f2430d912081aef003eb8069661bc16061c Mon Sep 17 00:00:00 2001 From: HwH Date: Fri, 26 Apr 2024 10:49:23 +0800 Subject: [PATCH 07/52] add timm dependency to requirements-common.txt, change examples/minicpmv --- examples/minicpmv_example.py | 125 +++++++++++++++++++---------------- requirements-common.txt | 1 + 2 files changed, 69 insertions(+), 57 deletions(-) diff --git a/examples/minicpmv_example.py b/examples/minicpmv_example.py index 1237276201fa4..450e2eeac9666 100644 --- a/examples/minicpmv_example.py +++ b/examples/minicpmv_example.py @@ -1,5 +1,8 @@ import math - +try: + import timm +except ImportError: + raise ImportError('Please install timm==0.9.10') from ImportError import torch from PIL import Image from torchvision import transforms @@ -70,63 +73,71 @@ def get_grid_placeholder(grid, query_num): return slice_placeholder -def get_slice_image_placeholder(config, image): - image_placeholder = config.query_num + 2 +class MiniCPMV_VLLM: + def __init__(self) -> None: + self.config = AutoConfig.from_pretrained('openbmb/MiniCPM-V-2', + trust_remote_code=True) 
+ self.tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2', + trust_remote_code=True) + self.llm = LLM( + model="openbmb/MiniCPM-V-2", + image_input_type="pixel_values", + image_token_id=101, + image_input_shape="1,3,448,448", + image_feature_size=64, + gpu_memory_utilization=0.75, + trust_remote_code=True, + ) + + def get_slice_image_placeholder(self, image): + image_placeholder = self.config.query_num + 2 + + best_grid = slice_image( + image, + self.config.max_slice_nums, + self.config.scale_resolution, + self.config.patch_size, + ) + final_placeholder = image_placeholder + + if best_grid is not None: + final_placeholder += get_grid_placeholder(best_grid, self.config.query_num) + + return final_placeholder - 1 + + def generate(self, image, question, sampling_params): + addtion_tokens = self.get_slice_image_placeholder(image) + image = transforms.Compose([transforms.ToTensor()])(img=image) + images = torch.stack([image]) + + prompt = "<用户>" + \ + question + \ + "" + '' * addtion_tokens + + outputs = self.llm.generate(prompt, + multi_modal_data=MultiModalData( + type=MultiModalData.Type.IMAGE, data=images), + sampling_params=sampling_params) + return outputs[0].outputs[0].text - best_grid = slice_image( - image, - config.max_slice_nums, - config.scale_resolution, - config.patch_size, - ) - final_placeholder = image_placeholder - - if best_grid is not None: - final_placeholder += get_grid_placeholder(best_grid, config.query_num) - - return final_placeholder - 1 - - -config = AutoConfig.from_pretrained('openbmb/MiniCPM-V-2', - trust_remote_code=True) -tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2', - trust_remote_code=True) - -sampling_params = SamplingParams( - temperature=0.7, - top_p=0.8, - top_k=100, - seed=3472, - max_tokens=1024, - min_tokens=150, - # temperature=0, - # use_beam_search=True, - # length_penalty=1.2, - # best_of=3 -) -llm = LLM( - model="openbmb/MiniCPM-V-2", - image_input_type="pixel_values", - image_token_id=101, - image_input_shape="1,3,448,448", - image_feature_size=64, - gpu_memory_utilization=0.75, - trust_remote_code=True, -) if __name__ == '__main__': - image = Image.open('./example.png').convert('RGB') - addtion_tokens = get_slice_image_placeholder(config, image) - image = transforms.Compose([transforms.ToTensor()])(img=image) - images = torch.stack([image]) - - prompt = "<用户>" + \ - "Provide an intricate description of the image." + \ - "" + '' * addtion_tokens - - outputs = llm.generate(prompt, - multi_modal_data=MultiModalData( - type=MultiModalData.Type.IMAGE, data=images), - sampling_params=sampling_params) + model = MiniCPMV_VLLM() + + sampling_params = SamplingParams( + temperature=0.7, + top_p=0.8, + top_k=100, + seed=3472, + max_tokens=1024, + min_tokens=150, + # temperature=0, + # use_beam_search=True, + # length_penalty=1.2, + # best_of=3 + ) - print(outputs[0].outputs[0].text) + image = Image.open('./example.png').convert('RGB') + question = "Provide an intricate description of the image." + response = model.generate(image, question, sampling_params) + print(response) diff --git a/requirements-common.txt b/requirements-common.txt index 3cc7bba8f84db..52705411035c8 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -6,6 +6,7 @@ numpy requests py-cpuinfo transformers >= 4.40.0 # Required for StarCoder2 & Llava, Llama 3. +timm==0.9.10 tokenizers >= 0.19.1 # Required for Llama 3. 
 fastapi
 uvicorn[standard]

From 4204a02f0e3b0ee272dda7c1c41deb7292bac20d Mon Sep 17 00:00:00 2001
From: HwH
Date: Sun, 5 May 2024 23:59:42 +0800
Subject: [PATCH 08/52] merge latest main

---
 examples/minicpmv_example.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/examples/minicpmv_example.py b/examples/minicpmv_example.py
index 450e2eeac9666..5840540bd0446 100644
--- a/examples/minicpmv_example.py
+++ b/examples/minicpmv_example.py
@@ -1,8 +1,4 @@
 import math
-try:
-    import timm
-except ImportError:
-    raise ImportError('Please install timm==0.9.10') from ImportError
 import torch
 from PIL import Image
 from torchvision import transforms

From 8b63870a0f471bf097f17f5725574a809c02a8a2 Mon Sep 17 00:00:00 2001
From: HwH
Date: Mon, 6 May 2024 00:01:20 +0800
Subject: [PATCH 09/52] merge latest main

---
 examples/minicpmv_example.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/examples/minicpmv_example.py b/examples/minicpmv_example.py
index 5840540bd0446..1df3ff2fe5e0b 100644
--- a/examples/minicpmv_example.py
+++ b/examples/minicpmv_example.py
@@ -1,4 +1,5 @@
 import math
+
 import torch
 from PIL import Image
 from torchvision import transforms
@@ -70,11 +71,12 @@ def get_grid_placeholder(grid, query_num):
 
 
 class MiniCPMV_VLLM:
+
     def __init__(self) -> None:
         self.config = AutoConfig.from_pretrained('openbmb/MiniCPM-V-2',
-                                                trust_remote_code=True)
+                                                 trust_remote_code=True)
         self.tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2',
-                                                      trust_remote_code=True)
+                                                       trust_remote_code=True)
         self.llm = LLM(
             model="openbmb/MiniCPM-V-2",
             image_input_type="pixel_values",
@@ -97,7 +99,8 @@ def get_slice_image_placeholder(self, image):
         final_placeholder = image_placeholder
 
         if best_grid is not None:
-            final_placeholder += get_grid_placeholder(best_grid, self.config.query_num)
+            final_placeholder += get_grid_placeholder(best_grid,
+                                                      self.config.query_num)
 
         return final_placeholder - 1
 
@@ -111,9 +114,10 @@ def generate(self, image, question, sampling_params):
             "" + '' * addtion_tokens
 
         outputs = self.llm.generate(prompt,
-                                    multi_modal_data=MultiModalData(
-                                        type=MultiModalData.Type.IMAGE, data=images),
-                                    sampling_params=sampling_params)
+                                    multi_modal_data=MultiModalData(
+                                        type=MultiModalData.Type.IMAGE,
+                                        data=images),
+                                    sampling_params=sampling_params)
         return outputs[0].outputs[0].text

From b01948cd52a4249979d7319bcdfdb333a834821b Mon Sep 17 00:00:00 2001
From: HwH
Date: Fri, 24 May 2024 10:46:20 +0800
Subject: [PATCH 10/52] minicpmv_2 init

---
 vllm/model_executor/models/minicpmv.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 2a2c478b2a825..cc02050ef992d 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -16,6 +16,9 @@
 from torchvision import transforms
 from torchvision.transforms import InterpolationMode
 
+from vllm.config import CacheConfig, LoRAConfig
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
 from vllm.attention import AttentionMetadata
 from vllm.model_executor.layers.linear import LinearMethodBase
 from vllm.model_executor.layers.sampler import Sampler
@@ -195,11 +198,16 @@ class MiniCPMV(nn.Module):
 
     def __init__(self,
                  config,
-                 linear_method: Optional["LinearMethodBase"] = None):
+                 cache_config: Optional[CacheConfig] = None,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 lora_config: Optional[LoRAConfig] = None,
+                 ):
         super().__init__()
         self.config = config
-        self.linear_method = linear_method
-        self.llm = MiniCPMForCausalLM(config, linear_method)
+        self.llm = MiniCPMForCausalLM(config,
+                                      cache_config=cache_config,
+                                      quant_config=quant_config,
+                                      lora_config=lora_config)
         self.vpm = self.init_vision_module()
         self.vpm.to(dtype=torch.bfloat16)
         self.vision_dim = self.vpm.embed_dim

From 0b1be33eb7135870df9bd19fbc97571763ff3aa3 Mon Sep 17 00:00:00 2001
From: hezhihui
Date: Mon, 27 May 2024 10:06:42 +0800
Subject: [PATCH 11/52] Make changes based on the review

---
 examples/avatar.jpg | Bin 0 -> 70290 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 examples/avatar.jpg

diff --git a/examples/avatar.jpg b/examples/avatar.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..61dd46a0446700af760533fbdb3729d210e55d18
GIT binary patch
literal 70290
[base85-encoded binary JPEG data omitted]
[GIT binary patch data for examples/avatar.jpg omitted]

Date: Mon, 27 May 2024 10:07:51 +0800
Subject: [PATCH 12/52] Make changes based on the review

---
 examples/avatar.jpg                    | Bin 70290 -> 0 bytes
 requirements-common.txt                |   1 -
 vllm/model_executor/models/minicpmv.py |  29 +++++++++++++++++++++----
 3 files changed, 25 insertions(+), 5 deletions(-)
 delete mode 100644 examples/avatar.jpg

diff --git a/examples/avatar.jpg b/examples/avatar.jpg
deleted file mode 100644
index 61dd46a0446700af760533fbdb3729d210e55d18..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

[literal 70290: GIT binary patch data for examples/avatar.jpg omitted]

diff --git a/requirements-common.txt b/requirements-common.txt
--- a/requirements-common.txt
+++ b/requirements-common.txt
 transformers >= 4.40.0  # Required for StarCoder2 & Llava, Llama 3.
-timm==0.9.10
 tokenizers >= 0.19.1  # Required for Llama 3.
fastapi aiohttp diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index cc02050ef992d..030622fd78e39 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -1,3 +1,26 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only MiniCPM-V-2 model compatible with HuggingFace weights.""" import math from functools import partial from typing import Iterable, List, Optional, Tuple @@ -27,8 +50,6 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput -# from .configuration_minicpm import MiniCPMVConfig -# from .resampler import Resampler _KEYS_TO_MODIFY_MAPPING = { "language_model.lm_head": "lm_head", @@ -502,7 +523,7 @@ def modify_input_ids(self, input_ids, place_holder, im_start_token_id, dtype=input_ids.dtype) return image_bound, input_ids - def get_vllm_embedding(self, data, im_start_token_id, im_end_token_id, + def get_embedding(self, data, im_start_token_id, im_end_token_id, unk_token_id): if 'vision_hidden_states' not in data: pixel_values = data['pixel_values'] @@ -545,7 +566,7 @@ def forward( attn_metadata: AttentionMetadata, image_input: Optional[torch.Tensor] = None, ): - vllm_embeddings, vision_hidden_states = self.get_vllm_embedding( + vllm_embeddings, vision_hidden_states = self.get_embedding( { 'pixel_values': image_input, 'input_ids': input_ids From 7724d0e901c3a411871ce8536a143fe7d00db948 Mon Sep 17 00:00:00 2001 From: HwH Date: Mon, 27 May 2024 10:40:35 +0800 Subject: [PATCH 13/52] fix --- vllm/model_executor/models/minicpmv.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 030622fd78e39..3e85a03985be7 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -39,18 +39,16 @@ from torchvision import transforms from torchvision.transforms import InterpolationMode +from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, LoRAConfig from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.attention import AttentionMetadata -from vllm.model_executor.layers.linear import LinearMethodBase from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.minicpm import 
MiniCPMForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplerOutput - _KEYS_TO_MODIFY_MAPPING = { "language_model.lm_head": "lm_head", "language_model.model": "language_model", @@ -217,12 +215,13 @@ def _repeat(self, query, N: int): class MiniCPMV(nn.Module): - def __init__(self, - config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ): + def __init__( + self, + config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + ): super().__init__() self.config = config self.llm = MiniCPMForCausalLM(config, @@ -524,7 +523,7 @@ def modify_input_ids(self, input_ids, place_holder, im_start_token_id, return image_bound, input_ids def get_embedding(self, data, im_start_token_id, im_end_token_id, - unk_token_id): + unk_token_id): if 'vision_hidden_states' not in data: pixel_values = data['pixel_values'] if pixel_values is not None and len(pixel_values) > 0: From fe585135aeeefbbe6bd4de26e783af00e8cc8b12 Mon Sep 17 00:00:00 2001 From: hezhihui Date: Mon, 27 May 2024 11:11:30 +0800 Subject: [PATCH 14/52] fix:get model dtype from default_dtype --- vllm/model_executor/models/minicpmv.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 3e85a03985be7..7e8742b696102 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -229,11 +229,12 @@ def __init__( quant_config=quant_config, lora_config=lora_config) self.vpm = self.init_vision_module() - self.vpm.to(dtype=torch.bfloat16) + param_dtype = torch.get_default_dtype() + self.vpm.to(dtype=param_dtype) self.vision_dim = self.vpm.embed_dim self.embed_dim = self.llm.config.hidden_size self.resampler = self.init_resampler(self.embed_dim, self.vision_dim) - self.resampler.to(device="cuda", dtype=torch.bfloat16) + self.resampler.to(device="cuda", dtype=param_dtype) self.sampler = Sampler() def init_vision_module(self): From c9aacd8ecbcb23a0efc8e0efc02d198429192a4d Mon Sep 17 00:00:00 2001 From: hezhihui Date: Mon, 27 May 2024 14:53:56 +0800 Subject: [PATCH 15/52] delete redundant annotations --- vllm/model_executor/models/minicpmv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 7e8742b696102..b2fe0bbf489ad 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -506,7 +506,7 @@ def modify_input_ids(self, input_ids, place_holder, im_start_token_id, ], dim=0) image_start_tokens = torch.where(input_ids == im_start_token_id)[0] - # 跳过 im_start + image_start_tokens += 1 image_end_tokens = torch.where(input_ids == im_end_token_id)[0] valid_image_nums = max(len(image_start_tokens), len(image_end_tokens)) From e90c326c0a0abf35fc1f61bed31f808c7685ce41 Mon Sep 17 00:00:00 2001 From: hezhihui Date: Mon, 17 Jun 2024 15:41:31 +0800 Subject: [PATCH 16/52] add test for mnicpmv --- examples/minicpmv_example.py | 27 ++-- tests/models/test_minicpmv.py | 176 +++++++++++++++++++++++++ vllm/model_executor/models/minicpmv.py | 93 ++++++++----- 3 files changed, 253 insertions(+), 43 deletions(-) create mode 100644 tests/models/test_minicpmv.py diff --git a/examples/minicpmv_example.py b/examples/minicpmv_example.py index 
1df3ff2fe5e0b..6cfee5f0f77ae 100644 --- a/examples/minicpmv_example.py +++ b/examples/minicpmv_example.py @@ -6,7 +6,7 @@ from transformers import AutoConfig, AutoTokenizer from vllm import LLM, SamplingParams -from vllm.sequence import MultiModalData +from vllm.multimodal.image import ImagePixelData def slice_image(image, @@ -72,13 +72,13 @@ def get_grid_placeholder(grid, query_num): class MiniCPMV_VLLM: - def __init__(self) -> None: - self.config = AutoConfig.from_pretrained('openbmb/MiniCPM-V-2', + def __init__(self, model_name) -> None: + self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) - self.tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2', + self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) self.llm = LLM( - model="openbmb/MiniCPM-V-2", + model=model_name, image_input_type="pixel_values", image_token_id=101, image_input_shape="1,3,448,448", @@ -113,16 +113,17 @@ def generate(self, image, question, sampling_params): question + \ "" + '' * addtion_tokens - outputs = self.llm.generate(prompt, - multi_modal_data=MultiModalData( - type=MultiModalData.Type.IMAGE, - data=images), - sampling_params=sampling_params) + outputs = self.llm.generate({ + "prompt": prompt, + "multi_modal_data": ImagePixelData(images), + }, + sampling_params=sampling_params + ) return outputs[0].outputs[0].text if __name__ == '__main__': - model = MiniCPMV_VLLM() + model = MiniCPMV_VLLM("openbmb/MiniCPM-V-2") sampling_params = SamplingParams( temperature=0.7, @@ -137,7 +138,7 @@ def generate(self, image, question, sampling_params): # best_of=3 ) - image = Image.open('./example.png').convert('RGB') - question = "Provide an intricate description of the image." + image = Image.open('./images/example.png').convert('RGB') + question = "What is in this image?" response = model.generate(image, question, sampling_params) print(response) diff --git a/tests/models/test_minicpmv.py b/tests/models/test_minicpmv.py new file mode 100644 index 0000000000000..9a3301b3416d6 --- /dev/null +++ b/tests/models/test_minicpmv.py @@ -0,0 +1,176 @@ +from typing import List, Tuple + +import math +import torch +import pytest +from PIL import Image +from torchvision import transforms +from transformers import AutoTokenizer, AutoConfig, AutoModel + +from vllm import LLM, SamplingParams +from vllm.config import VisionLanguageConfig +from vllm.multimodal.image import ImagePixelData + +from ..conftest import IMAGE_FILES + + +IMAGE_PROMPT = "What is in this image?" 
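As a rough worked example of what the slicing helpers below compute (an illustrative sketch, not part of the patch; the 1000x700 input size is hypothetical, and scale_resolution=448, max_slice_nums=9, query_num=64 are the MiniCPM-V-2 defaults used elsewhere in this series):

import math

# Hypothetical input; the arithmetic mirrors slice_image / get_grid_placeholder below.
width, height = 1000, 700
scale_resolution, max_slice_nums, query_num = 448, 9, 64

ratio = width * height / (scale_resolution ** 2)   # ~3.49
multiple = min(math.ceil(ratio), max_slice_nums)   # 4 -> the image gets sliced
log_ratio = math.log(width / height)               # ~0.36

# Candidate grids come from 3, 4 and 5 slices; the grid whose aspect ratio is
# closest to the image's (in log space) wins, here [cols, rows] = [2, 2].
candidates = [[1, 3], [3, 1], [1, 4], [2, 2], [4, 1], [1, 5], [5, 1]]
best_grid = min(candidates,
                key=lambda g: abs(log_ratio - math.log(g[0] / g[1])))
assert best_grid == [2, 2]

# Placeholder budget: query_num + 2 tokens for the resized source image, plus
# 2 + (66 + 66 + 1) + (66 + 66) = 267 tokens for the 2x2 slice grid -> 333 in
# total; get_slice_image_placeholder below then returns 333 - 1 = 332.
source_tokens = query_num + 2                                   # 66
grid_tokens = 2 + (2 * source_tokens + 1) + 2 * source_tokens   # 267
assert source_tokens + grid_tokens == 333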
+ + +def slice_image(image, + max_slice_nums=9, + scale_resolution=448, + patch_size=14, + never_split=False): + original_size = image.size + original_width, original_height = original_size + log_ratio = math.log(original_width / original_height) + ratio = original_width * original_height / (scale_resolution * + scale_resolution) + multiple = min(math.ceil(ratio), max_slice_nums) + + best_grid = None + + if multiple > 1 and not never_split: + candidate_split_grids_nums = [] + for i in [multiple - 1, multiple, multiple + 1]: + if i == 1 or i > max_slice_nums: + continue + candidate_split_grids_nums.append(i) + + # source image, down-sampling and ensure divided by patch_size + candidate_grids = [] + + # find best grid + for split_grids_nums in candidate_split_grids_nums: + m = 1 + while m <= split_grids_nums: + if split_grids_nums % m == 0: + candidate_grids.append([m, split_grids_nums // m]) + m += 1 + + best_grid = [1, 1] + min_error = float("inf") + for grid in candidate_grids: + error = abs(log_ratio - math.log(grid[0] / grid[1])) + if error < min_error: + best_grid = grid + min_error = error + + return best_grid + + +def get_grid_placeholder(grid, query_num): + image_placeholder = query_num + 2 + + cols = grid[0] + rows = grid[1] + slices = 0 + for i in range(rows): + lines = 0 + for j in range(cols): + lines += image_placeholder + if i < rows - 1: + slices += lines + 1 + else: + slices += lines + slice_placeholder = 2 + slices + return slice_placeholder + + +class MiniCPMV_VLLM: + + def __init__(self, model_name) -> None: + self.config = AutoConfig.from_pretrained(model_name, + trust_remote_code=True) + self.tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + self.llm = LLM( + model=model_name, + image_input_type="pixel_values", + image_token_id=101, + image_input_shape="1,3,448,448", + image_feature_size=64, + gpu_memory_utilization=0.75, + trust_remote_code=True, + ) + + def get_slice_image_placeholder(self, image): + image_placeholder = self.config.query_num + 2 + + best_grid = slice_image( + image, + self.config.max_slice_nums, + self.config.scale_resolution, + self.config.patch_size, + ) + final_placeholder = image_placeholder + + if best_grid is not None: + final_placeholder += get_grid_placeholder(best_grid, + self.config.query_num) + + return final_placeholder - 1 + + def generate(self, image, question, sampling_params): + addtion_tokens = self.get_slice_image_placeholder(image) + image = transforms.Compose([transforms.ToTensor()])(img=image) + images = torch.stack([image]) + + prompt = "<用户>" + \ + question + \ + "" + '' * addtion_tokens + + outputs = self.llm.generate({ + "prompt": prompt, + "multi_modal_data": ImagePixelData(images), + }, + sampling_params=sampling_params + ) + return outputs[0].outputs[0].text + + +model_names = [ + "/data1/hezhihui/projects/MiniCPM-V-2" +] + + +def get_hf_results(model_name, image, question): + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + hf_model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.bfloat16) + hf_model = hf_model.to(device='cuda', dtype=torch.bfloat16) + hf_model.eval() + msgs = [{'role': 'user', 'content': question}] + res, _, _ = hf_model.chat( + image=image, + msgs=msgs, + context=None, + tokenizer=tokenizer, + sampling=False + ) + return res + + +def get_vllm_results(model_name, image, question): + model = MiniCPMV_VLLM(model_name) + sampling_params = SamplingParams( + use_beam_search=True, + length_penalty=1.2, + best_of=3, 
+ max_tokens=1024, + temperature=0 + ) + res = model.generate(image, question, sampling_params) + return res + + +@pytest.mark.parametrize("model_name", model_names) +@pytest.mark.parametrize("image", IMAGE_FILES) +def test_models(model_name, image) -> None: + if not torch.cuda.is_available(): + return + image = Image.open(image).convert("RGB") + hf_outputs = get_hf_results(model_name, image, IMAGE_PROMPT) + vllm_outputs = get_vllm_results(model_name, image, IMAGE_PROMPT) + # print(hf_outputs) + # print(vllm_outputs) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index b2fe0bbf489ad..bb5eb23af3788 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -38,6 +38,7 @@ from torch.nn.init import trunc_normal_ from torchvision import transforms from torchvision.transforms import InterpolationMode +from PIL import Image from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, LoRAConfig @@ -47,6 +48,8 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.minicpm import MiniCPMForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.image import get_dummy_image_data from vllm.sequence import SamplerOutput _KEYS_TO_MODIFY_MAPPING = { @@ -213,6 +216,8 @@ def _repeat(self, query, N: int): return query.unsqueeze(1).repeat(1, N, 1) +@MULTIMODAL_REGISTRY.register_image_pixel_input() +@MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data) class MiniCPMV(nn.Module): def __init__( @@ -236,6 +241,22 @@ def __init__( self.resampler = self.init_resampler(self.embed_dim, self.vision_dim) self.resampler.to(device="cuda", dtype=param_dtype) self.sampler = Sampler() + self.img2tensor_transform, self.tensor2img_transform = self.init_transform() + + def init_transform(self): + return transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize( + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD + ), + ] + ), \ + transforms.Compose( + [ + transforms.ToPILImage() + ] + ) def init_vision_module(self): default_dtype = torch.get_default_dtype() @@ -359,22 +380,23 @@ def split_to_patches(self, image, grid): patch = image[:, i:i + grid_y, j:j + grid_x] images.append(patch) patches.append(images) - return patches def slice_image(self, - image: torch.Tensor, + image_tensor: torch.Tensor, max_slice_nums=9, scale_resolution=448, patch_size=14, never_split=False): - original_size = (image.shape[-1], image.shape[-2]) + original_size = (image_tensor.shape[-1], image_tensor.shape[-2]) original_width, original_height = original_size log_ratio = math.log(original_width / original_height) ratio = original_width * original_height / (scale_resolution * scale_resolution) multiple = min(math.ceil(ratio), max_slice_nums) + image = self.tensor2img_transform(image_tensor) + source_image = None best_grid = None patches = [] @@ -382,14 +404,19 @@ def slice_image(self, if multiple <= 1 or never_split: best_size = self.find_best_resize(original_size, scale_resolution, patch_size) - resize_transform = transforms.Compose([ - transforms.Resize((best_size[::-1]), - InterpolationMode.BICUBIC, - antialias=True), - transforms.Normalize(mean=IMAGENET_INCEPTION_MEAN, - std=IMAGENET_INCEPTION_STD) - ]) - source_image = resize_transform(image) + # The resizing of torchvision is also avaliable in this funciton. 
+ # But there are slight deviations between the results of torchvision resizing and pillow image resizing. + # For the consistency with MiniCPM-V-2 in HF, we choose PIL resizing and this may take a little more time. + # + # resize_transform = transforms.Compose([ + # transforms.Resize((best_size[::-1]), + # InterpolationMode.BICUBIC, + # antialias=True), + # transforms.Normalize(mean=IMAGENET_INCEPTION_MEAN, + # std=IMAGENET_INCEPTION_STD) + # ]) + # source_image = resize_transform(image) + source_image = self.img2tensor_transform(image.resize(best_size, Image.Resampling.BICUBIC)).to(image_tensor.device) else: candidate_split_grids_nums = [] for i in [multiple - 1, multiple, multiple + 1]: @@ -399,14 +426,16 @@ def slice_image(self, best_resize = self.find_best_resize(original_size, scale_resolution, patch_size) - resize_transform = transforms.Compose([ - transforms.Resize(best_resize[::-1], - InterpolationMode.BICUBIC, - antialias=True), - transforms.Normalize(mean=IMAGENET_INCEPTION_MEAN, - std=IMAGENET_INCEPTION_STD) - ]) - source_image = resize_transform(image.clone()) + # resize_transform = transforms.Compose([ + # transforms.Resize(best_resize[::-1], + # InterpolationMode.BICUBIC, + # antialias=True), + # transforms.Normalize(mean=IMAGENET_INCEPTION_MEAN, + # std=IMAGENET_INCEPTION_STD) + # ]) + # source_image = resize_transform(image_tensor.clone()) + source_image = self.img2tensor_transform(image.copy().resize(best_resize, Image.Resampling.BICUBIC)).to(image_tensor.device) + candidate_grids = [] # find best grid @@ -431,14 +460,15 @@ def slice_image(self, patch_size, allow_upscale=True) - resize_transform = transforms.Compose([ - transforms.Resize(refine_size[::-1], - InterpolationMode.BICUBIC, - antialias=True), - transforms.Normalize(mean=IMAGENET_INCEPTION_MEAN, - std=IMAGENET_INCEPTION_STD) - ]) - refine_image = resize_transform(image.clone()) + # resize_transform = transforms.Compose([ + # transforms.Resize(refine_size[::-1], + # InterpolationMode.BICUBIC, + # antialias=True), + # transforms.Normalize(mean=IMAGENET_INCEPTION_MEAN, + # std=IMAGENET_INCEPTION_STD) + # ]) + # refine_image = resize_transform(image.clone()) + refine_image = self.img2tensor_transform(image.copy().resize(refine_size, Image.Resampling.BICUBIC)).to(image_tensor.device) patches = self.split_to_patches(refine_image, best_grid) return source_image, patches, best_grid @@ -564,12 +594,15 @@ def forward( positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, - image_input: Optional[torch.Tensor] = None, + **kwargs: object, ): + image_input = kwargs.pop("pixel_values", None) + if image_input is not None: + image_input = image_input.float() vllm_embeddings, vision_hidden_states = self.get_embedding( { - 'pixel_values': image_input, - 'input_ids': input_ids + "pixel_values": image_input, + "input_ids": input_ids }, self.config.im_start_token_id, self.config.im_end_token_id, self.config.unk_token_id) output = self.llm(input_ids=None, From 51cf257b0b48626199bde1d357cff16309b27cda Mon Sep 17 00:00:00 2001 From: hezhihui Date: Wed, 19 Jun 2024 16:21:09 +0800 Subject: [PATCH 17/52] add minicpmv support in --- vllm/multimodal/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index c6311d60e0bdd..bc11b7a66f3d4 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -79,6 +79,9 @@ def get_full_image_text_prompt(image_prompt: str, text_prompt: str, if config.hf_config.model_type in ("llava", "llava_next"): 
full_prompt = f"{image_prompt}\n{text_prompt}" + elif config.hf_config.model_type in ("minicpmv"): + # TODO: Needs the length of prompt to be changable in model. + full_prompt = f"<用户>{image_prompt}{text_prompt}" else: raise ValueError( f"Unsupported model type: {config.hf_config.model_type}") From 81d4437259ed9afe474cc1fe9112c7c9c8d1b5b8 Mon Sep 17 00:00:00 2001 From: HwH Date: Wed, 19 Jun 2024 17:36:03 +0800 Subject: [PATCH 18/52] format for minicpmv --- examples/minicpmv_example.py | 6 ++--- tests/models/test_minicpmv.py | 36 ++++++++++++-------------- vllm/model_executor/models/minicpmv.py | 23 ++++++++++------ 3 files changed, 35 insertions(+), 30 deletions(-) diff --git a/examples/minicpmv_example.py b/examples/minicpmv_example.py index 6cfee5f0f77ae..bac80d39160dc 100644 --- a/examples/minicpmv_example.py +++ b/examples/minicpmv_example.py @@ -113,12 +113,12 @@ def generate(self, image, question, sampling_params): question + \ "" + '' * addtion_tokens - outputs = self.llm.generate({ + outputs = self.llm.generate( + { "prompt": prompt, "multi_modal_data": ImagePixelData(images), }, - sampling_params=sampling_params - ) + sampling_params=sampling_params) return outputs[0].outputs[0].text diff --git a/tests/models/test_minicpmv.py b/tests/models/test_minicpmv.py index 09ca4ef97e2ae..4a546f56171bb 100644 --- a/tests/models/test_minicpmv.py +++ b/tests/models/test_minicpmv.py @@ -13,7 +13,6 @@ from ..conftest import IMAGE_FILES - IMAGE_PROMPT = "What is in this image?" @@ -121,33 +120,32 @@ def generate(self, image, question, sampling_params): question + \ "" + '' * addtion_tokens - outputs = self.llm.generate({ + outputs = self.llm.generate( + { "prompt": prompt, "multi_modal_data": ImagePixelData(images), }, - sampling_params=sampling_params - ) + sampling_params=sampling_params) return outputs[0].outputs[0].text, outputs[0].outputs[0].token_ids -model_names = [ - "openbmb/MiniCPM-V-2" -] +model_names = ["openbmb/MiniCPM-V-2"] def get_hf_results(model_name, image, question): - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - hf_model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.bfloat16) + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + hf_model = AutoModel.from_pretrained(model_name, + trust_remote_code=True, + torch_dtype=torch.bfloat16) hf_model = hf_model.to(device='cuda', dtype=torch.bfloat16) hf_model.eval() msgs = [{'role': 'user', 'content': question}] - outputs, _, _ = hf_model.chat( - image=image, - msgs=msgs, - context=None, - tokenizer=tokenizer, - sampling=False - ) + outputs, _, _ = hf_model.chat(image=image, + msgs=msgs, + context=None, + tokenizer=tokenizer, + sampling=False) output_ids = tokenizer.encode(outputs)[1:] return outputs, output_ids @@ -159,8 +157,7 @@ def get_vllm_results(model_name, image, question): # length_penalty=1.2, # best_of=3, max_tokens=1024, - temperature=0 - ) + temperature=0) outputs, output_ids = model.generate(image, question, sampling_params) return outputs, output_ids[:-1] @@ -172,7 +169,8 @@ def test_models(model_name, image) -> None: return image = Image.open(image).convert("RGB") hf_outputs, hf_output_ids = get_hf_results(model_name, image, IMAGE_PROMPT) - vllm_outputs, vllm_output_ids = get_vllm_results(model_name, image, IMAGE_PROMPT) + vllm_outputs, vllm_output_ids = get_vllm_results(model_name, image, + IMAGE_PROMPT) common_prefix_len = 0 for x in range(min(len(hf_output_ids), len(vllm_output_ids))): if hf_output_ids[x] != 
vllm_output_ids[x]: diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 987f247e5b8a0..6b57dc0f995e5 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -243,7 +243,8 @@ def __init__( self.resampler = self.init_resampler(self.embed_dim, self.vision_dim) self.resampler.to(device="cuda", dtype=param_dtype) self.sampler = Sampler() - self.img2tensor_transform, self.tensor2img_transform = self.init_transform() + self.img2tensor_transform, self.tensor2img_transform = \ + self.init_transform() def init_transform(self): return transforms.Compose( @@ -404,9 +405,11 @@ def slice_image(self, patches = [] if multiple <= 1 or never_split: - best_size = self.find_best_resize(original_size, scale_resolution, - patch_size, allow_upscale=True) - # The resizing of torchvision is also avaliable in this funciton. + best_size = self.find_best_resize(original_size, + scale_resolution, + patch_size, + allow_upscale=True) + # The resizing of torchvision is also avaliable in this funciton. # But there are slight deviations between the results of torchvision resizing and pillow image resizing. # For the consistency with MiniCPM-V-2 in HF, we choose PIL resizing and this may take a little more time. # @@ -418,7 +421,9 @@ def slice_image(self, # std=IMAGENET_INCEPTION_STD) # ]) # source_image = resize_transform(image) - source_image = self.img2tensor_transform(image.resize(best_size, Image.Resampling.BICUBIC)).to(image_tensor.device) + source_image = self.img2tensor_transform( + image.resize(best_size, + Image.Resampling.BICUBIC)).to(image_tensor.device) else: candidate_split_grids_nums = [] for i in [multiple - 1, multiple, multiple + 1]: @@ -436,7 +441,8 @@ def slice_image(self, # std=IMAGENET_INCEPTION_STD) # ]) # source_image = resize_transform(image_tensor.clone()) - source_image = self.img2tensor_transform(image.copy().resize(best_resize, Image.Resampling.BICUBIC)).to(image_tensor.device) + source_image = self.img2tensor_transform(image.copy().resize( + best_resize, Image.Resampling.BICUBIC)).to(image_tensor.device) candidate_grids = [] @@ -470,7 +476,8 @@ def slice_image(self, # std=IMAGENET_INCEPTION_STD) # ]) # refine_image = resize_transform(image.clone()) - refine_image = self.img2tensor_transform(image.copy().resize(refine_size, Image.Resampling.BICUBIC)).to(image_tensor.device) + refine_image = self.img2tensor_transform(image.copy().resize( + refine_size, Image.Resampling.BICUBIC)).to(image_tensor.device) patches = self.split_to_patches(refine_image, best_grid) return source_image, patches, best_grid @@ -607,7 +614,7 @@ def forward( "input_ids": input_ids }, self.config.im_start_token_id, self.config.im_end_token_id, self.config.unk_token_id) - + output = self.llm(input_ids=None, positions=positions, kv_caches=kv_caches, From 938a7419f60da032fb1c6fbc580872b00552a08e Mon Sep 17 00:00:00 2001 From: HwH Date: Wed, 19 Jun 2024 17:39:35 +0800 Subject: [PATCH 19/52] format minicpmv --- tests/models/test_minicpmv.py | 3 --- vllm/model_executor/models/minicpmv.py | 7 ++++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/models/test_minicpmv.py b/tests/models/test_minicpmv.py index 4a546f56171bb..cc1687c94a574 100644 --- a/tests/models/test_minicpmv.py +++ b/tests/models/test_minicpmv.py @@ -1,5 +1,3 @@ -from typing import List, Tuple - import math import torch import pytest @@ -8,7 +6,6 @@ from transformers import AutoTokenizer, AutoConfig, AutoModel from vllm import LLM, SamplingParams -from 
vllm.config import VisionLanguageConfig from vllm.multimodal.image import ImagePixelData from ..conftest import IMAGE_FILES diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 6b57dc0f995e5..259297d5d4ae9 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -37,7 +37,6 @@ from torch import nn from torch.nn.init import trunc_normal_ from torchvision import transforms -from torchvision.transforms import InterpolationMode from PIL import Image from vllm.attention import AttentionMetadata @@ -410,8 +409,10 @@ def slice_image(self, patch_size, allow_upscale=True) # The resizing of torchvision is also avaliable in this funciton. - # But there are slight deviations between the results of torchvision resizing and pillow image resizing. - # For the consistency with MiniCPM-V-2 in HF, we choose PIL resizing and this may take a little more time. + # But there are slight deviations between the results of + # torchvision resizing and pillow image resizing. + # For the consistency with MiniCPM-V-2 in HF, + # we choose PIL resizing and this may take a little more time. # # resize_transform = transforms.Compose([ # transforms.Resize((best_size[::-1]), From ff8499d4c06cff70019e0446382ae2d3a7db5ed6 Mon Sep 17 00:00:00 2001 From: HwH Date: Wed, 19 Jun 2024 17:46:38 +0800 Subject: [PATCH 20/52] format minicpmv --- tests/models/test_minicpmv.py | 5 +++-- vllm/model_executor/models/minicpmv.py | 10 +++++----- vllm/multimodal/utils.py | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/models/test_minicpmv.py b/tests/models/test_minicpmv.py index cc1687c94a574..ec1fc622596ff 100644 --- a/tests/models/test_minicpmv.py +++ b/tests/models/test_minicpmv.py @@ -1,9 +1,10 @@ import math -import torch + import pytest +import torch from PIL import Image from torchvision import transforms -from transformers import AutoTokenizer, AutoConfig, AutoModel +from transformers import AutoConfig, AutoModel, AutoTokenizer from vllm import LLM, SamplingParams from vllm.multimodal.image import ImagePixelData diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 259297d5d4ae9..bf5b1c996fc6a 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -33,20 +33,20 @@ raise ImportError('Please install timm==0.9.10') from ImportError import torch import torch.nn.functional as F +from PIL import Image from timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD from torch import nn from torch.nn.init import trunc_normal_ from torchvision import transforms -from PIL import Image from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, LoRAConfig, VisionLanguageConfig -from vllm.model_executor.models.vlm_base import VisionLanguageModelBase from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.minicpm import MiniCPMForCausalLM +from vllm.model_executor.models.vlm_base import VisionLanguageModelBase from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import get_dummy_image_data @@ -408,10 +408,10 @@ def slice_image(self, scale_resolution, patch_size, allow_upscale=True) - # The resizing of torchvision is also avaliable in this 
funciton. - # But there are slight deviations between the results of + # The resizing of torchvision is also available in this function. + # But there are slight deviations between the results of # torchvision resizing and pillow image resizing. - # For the consistency with MiniCPM-V-2 in HF, + # For the consistency with MiniCPM-V-2 in HF, # we choose PIL resizing and this may take a little more time. # # resize_transform = transforms.Compose([ diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 3187237f14c61..3208d0ae5ae02 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -80,7 +80,7 @@ def get_full_image_text_prompt(image_prompt: str, text_prompt: str, if config.hf_config.model_type in ("llava", "llava_next"): full_prompt = f"{image_prompt}\n{text_prompt}" elif config.hf_config.model_type in ("minicpmv"): - # TODO: Needs the length of prompt to be changable in model. + # TODO: Needs the length of prompt to be changeable in model. full_prompt = f"<用户>{image_prompt}{text_prompt}" elif config.hf_config.model_type == 'phi3_v': full_prompt = f"{image_prompt}\n{text_prompt}" From 123bdf091eb5e4df98073e177576e14142f78d94 Mon Sep 17 00:00:00 2001 From: hezhihui Date: Mon, 8 Jul 2024 10:37:21 +0800 Subject: [PATCH 21/52] changed for image processor --- examples/minicpmv_example.py | 186 ++++----------- tests/models/test_minicpmv.py | 284 +++++++++------------- vllm/model_executor/models/minicpmv.py | 312 +++---------------------- vllm/worker/model_runner.py | 23 +- 4 files changed, 210 insertions(+), 595 deletions(-) diff --git a/examples/minicpmv_example.py b/examples/minicpmv_example.py index bac80d39160dc..85fdbe2cf5f93 100644 --- a/examples/minicpmv_example.py +++ b/examples/minicpmv_example.py @@ -1,144 +1,54 @@ -import math - -import torch from PIL import Image -from torchvision import transforms -from transformers import AutoConfig, AutoTokenizer -from vllm import LLM, SamplingParams +from transformers import AutoImageProcessor from vllm.multimodal.image import ImagePixelData +from vllm import LLM, SamplingParams -def slice_image(image, - max_slice_nums=9, - scale_resolution=448, - patch_size=14, - never_split=False): - original_size = image.size - original_width, original_height = original_size - log_ratio = math.log(original_width / original_height) - ratio = original_width * original_height / (scale_resolution * - scale_resolution) - multiple = min(math.ceil(ratio), max_slice_nums) - - best_grid = None - - if multiple > 1 and not never_split: - candidate_split_grids_nums = [] - for i in [multiple - 1, multiple, multiple + 1]: - if i == 1 or i > max_slice_nums: - continue - candidate_split_grids_nums.append(i) - - # source image, down-sampling and ensure divided by patch_size - candidate_grids = [] - - # find best grid - for split_grids_nums in candidate_split_grids_nums: - m = 1 - while m <= split_grids_nums: - if split_grids_nums % m == 0: - candidate_grids.append([m, split_grids_nums // m]) - m += 1 - - best_grid = [1, 1] - min_error = float("inf") - for grid in candidate_grids: - error = abs(log_ratio - math.log(grid[0] / grid[1])) - if error < min_error: - best_grid = grid - min_error = error - - return best_grid - - -def get_grid_placeholder(grid, query_num): - image_placeholder = query_num + 2 - - cols = grid[0] - rows = grid[1] - slices = 0 - for i in range(rows): - lines = 0 - for j in range(cols): - lines += image_placeholder - if i < rows - 1: - slices += lines + 1 - else: - slices += lines - slice_placeholder = 2 + slices - return 
slice_placeholder - - -class MiniCPMV_VLLM: - - def __init__(self, model_name) -> None: - self.config = AutoConfig.from_pretrained(model_name, - trust_remote_code=True) - self.tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) - self.llm = LLM( - model=model_name, - image_input_type="pixel_values", - image_token_id=101, - image_input_shape="1,3,448,448", - image_feature_size=64, - gpu_memory_utilization=0.75, - trust_remote_code=True, - ) - - def get_slice_image_placeholder(self, image): - image_placeholder = self.config.query_num + 2 - - best_grid = slice_image( - image, - self.config.max_slice_nums, - self.config.scale_resolution, - self.config.patch_size, - ) - final_placeholder = image_placeholder - - if best_grid is not None: - final_placeholder += get_grid_placeholder(best_grid, - self.config.query_num) - - return final_placeholder - 1 - - def generate(self, image, question, sampling_params): - addtion_tokens = self.get_slice_image_placeholder(image) - image = transforms.Compose([transforms.ToTensor()])(img=image) - images = torch.stack([image]) - - prompt = "<用户>" + \ - question + \ - "" + '' * addtion_tokens - - outputs = self.llm.generate( - { - "prompt": prompt, - "multi_modal_data": ImagePixelData(images), - }, - sampling_params=sampling_params) - return outputs[0].outputs[0].text - - -if __name__ == '__main__': - model = MiniCPMV_VLLM("openbmb/MiniCPM-V-2") - - sampling_params = SamplingParams( - temperature=0.7, - top_p=0.8, - top_k=100, - seed=3472, - max_tokens=1024, - min_tokens=150, - # temperature=0, - # use_beam_search=True, - # length_penalty=1.2, - # best_of=3 - ) +IMAGES = [ + "/data1/hezhihui/vllm/examples/images/example.png", + "/data1/hezhihui/vllm/examples/images/375.jpg" +] + +MODEL_NAME = "/data1/hezhihui/projects/MiniCPM-V-2" + +image = Image.open(IMAGES[1]).convert("RGB") +image_processor = AutoImageProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True) + +llm = LLM( + model=MODEL_NAME, + image_input_type="pixel_values", + image_token_id=101, + image_input_shape="1, 3, 448, 488", + image_feature_size=64, + gpu_memory_utilization=0.75, + trust_remote_code=True +) + +prompt = "<用户>" + image_processor.get_slice_image_placeholder(image.size) \ + + "what kind of wine is this?" \ + + "" + +sampling_params = SamplingParams( + # temperature=0.7, + # top_p=0.8, + # top_k=100, + # seed=3472, + max_tokens=1024, + # min_tokens=150, + temperature=0, + use_beam_search=True, + # length_penalty=1.2, + best_of=3 +) + + +outputs = llm.generate( + { + "prompt": prompt, + "multi_modal_data": ImagePixelData(image) + }, + sampling_params=sampling_params +) +print(outputs[0].outputs[0].text) - image = Image.open('./images/example.png').convert('RGB') - question = "What is in this image?" 
- response = model.generate(image, question, sampling_params) - print(response) diff --git a/tests/models/test_minicpmv.py b/tests/models/test_minicpmv.py index ec1fc622596ff..61febe094bad3 100644 --- a/tests/models/test_minicpmv.py +++ b/tests/models/test_minicpmv.py @@ -1,176 +1,120 @@ -import math +from typing import List, Tuple import pytest -import torch -from PIL import Image -from torchvision import transforms -from transformers import AutoConfig, AutoModel, AutoTokenizer +from transformers import AutoTokenizer, AutoImageProcessor -from vllm import LLM, SamplingParams -from vllm.multimodal.image import ImagePixelData +from vllm.config import VisionLanguageConfig +from vllm.utils import is_cpu from ..conftest import IMAGE_FILES -IMAGE_PROMPT = "What is in this image?" - - -def slice_image(image, - max_slice_nums=9, - scale_resolution=448, - patch_size=14, - never_split=False): - original_size = image.size - original_width, original_height = original_size - log_ratio = math.log(original_width / original_height) - ratio = original_width * original_height / (scale_resolution * - scale_resolution) - multiple = min(math.ceil(ratio), max_slice_nums) - - best_grid = None - - if multiple > 1 and not never_split: - candidate_split_grids_nums = [] - for i in [multiple - 1, multiple, multiple + 1]: - if i == 1 or i > max_slice_nums: - continue - candidate_split_grids_nums.append(i) - - # source image, down-sampling and ensure divided by patch_size - candidate_grids = [] - - # find best grid - for split_grids_nums in candidate_split_grids_nums: - m = 1 - while m <= split_grids_nums: - if split_grids_nums % m == 0: - candidate_grids.append([m, split_grids_nums // m]) - m += 1 - - best_grid = [1, 1] - min_error = float("inf") - for grid in candidate_grids: - error = abs(log_ratio - math.log(grid[0] / grid[1])) - if error < min_error: - best_grid = grid - min_error = error - - return best_grid - - -def get_grid_placeholder(grid, query_num): - image_placeholder = query_num + 2 - - cols = grid[0] - rows = grid[1] - slices = 0 - for i in range(rows): - lines = 0 - for j in range(cols): - lines += image_placeholder - if i < rows - 1: - slices += lines + 1 - else: - slices += lines - slice_placeholder = 2 + slices - return slice_placeholder - - -class MiniCPMV_VLLM: - - def __init__(self, model_name) -> None: - self.config = AutoConfig.from_pretrained(model_name, - trust_remote_code=True) - self.tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) - self.llm = LLM( - model=model_name, - image_input_type="pixel_values", - image_token_id=101, - image_input_shape="1,3,448,448", - image_feature_size=64, - gpu_memory_utilization=0.75, - trust_remote_code=True, - ) - - def get_slice_image_placeholder(self, image): - image_placeholder = self.config.query_num + 2 - - best_grid = slice_image( - image, - self.config.max_slice_nums, - self.config.scale_resolution, - self.config.patch_size, - ) - final_placeholder = image_placeholder - - if best_grid is not None: - final_placeholder += get_grid_placeholder(best_grid, - self.config.query_num) - - return final_placeholder - 1 - - def generate(self, image, question, sampling_params): - addtion_tokens = self.get_slice_image_placeholder(image) - image = transforms.Compose([transforms.ToTensor()])(img=image) - images = torch.stack([image]) - - prompt = "<用户>" + \ - question + \ - "" + '' * addtion_tokens - - outputs = self.llm.generate( - { - "prompt": prompt, - "multi_modal_data": ImagePixelData(images), - }, - sampling_params=sampling_params) 
- return outputs[0].outputs[0].text, outputs[0].outputs[0].token_ids - - -model_names = ["openbmb/MiniCPM-V-2"] - - -def get_hf_results(model_name, image, question): - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) - hf_model = AutoModel.from_pretrained(model_name, - trust_remote_code=True, - torch_dtype=torch.bfloat16) - hf_model = hf_model.to(device='cuda', dtype=torch.bfloat16) - hf_model.eval() - msgs = [{'role': 'user', 'content': question}] - outputs, _, _ = hf_model.chat(image=image, - msgs=msgs, - context=None, - tokenizer=tokenizer, - sampling=False) - output_ids = tokenizer.encode(outputs)[1:] - return outputs, output_ids - - -def get_vllm_results(model_name, image, question): - model = MiniCPMV_VLLM(model_name) - sampling_params = SamplingParams( - use_beam_search=False, - # length_penalty=1.2, - # best_of=3, - max_tokens=1024, - temperature=0) - outputs, output_ids = model.generate(image, question, sampling_params) - return outputs, output_ids[:-1] - - -@pytest.mark.parametrize("model_name", model_names) -@pytest.mark.parametrize("image", IMAGE_FILES) -def test_models(model_name, image) -> None: - if not torch.cuda.is_available(): - return - image = Image.open(image).convert("RGB") - hf_outputs, hf_output_ids = get_hf_results(model_name, image, IMAGE_PROMPT) - vllm_outputs, vllm_output_ids = get_vllm_results(model_name, image, - IMAGE_PROMPT) - common_prefix_len = 0 - for x in range(min(len(hf_output_ids), len(vllm_output_ids))): - if hf_output_ids[x] != vllm_output_ids[x]: - break - common_prefix_len += 1 +pytestmark = pytest.mark.vlm + +# The image token is placed before "user" on purpose so that the test can pass +HF_IMAGE_PROMPTS = [ + "What's the content of the image?", # noqa: E501 + "What is the season?" +] + +assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES) + + +def iter_minicpmv_configs(model_name: str): + image_hw_to_feature_size = { + (448, 448): 64, + } + + for (h, w), f in image_hw_to_feature_size.items(): + for input_type, input_shape in [ + (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)), + ]: + yield (model_name, + VisionLanguageConfig(image_input_type=input_type, + image_feature_size=f, + image_token_id=101, + image_input_shape=input_shape, + image_processor=model_name, + image_processor_revision=None)) + + +model_and_vl_config = [ + *iter_minicpmv_configs("/data1/hezhihui/projects/MiniCPM-V-2"), +] + + +def vllm_to_hf_output(vllm_output: Tuple[List[int], str], + vlm_config: VisionLanguageConfig, model_id: str): + """Sanitize vllm output to be comparable with hf output. + The function reduces `input_ids` from 1, 32000, 32000, ..., 32000, + x1, x2, x3 ... to 1, 32000, x1, x2, x3 ... + It also reduces `output_str` from "bla" to "bla". 
+ """ + input_ids, output_str = vllm_output + image_token_id = vlm_config.image_token_id + + tokenizer = AutoTokenizer.from_pretrained(model_id) + image_token_str = tokenizer.decode(image_token_id) + + hf_input_ids = [ + input_id if input_id != image_token_id else 0 + for idx, input_id in enumerate(input_ids) + ] + hf_output_str = output_str \ + .replace(image_token_str * vlm_config.image_feature_size, "") \ + .replace("", " ").replace("<|user|>", "") \ + .replace("<|end|>\n<|assistant|>", " ") + + return hf_input_ids, hf_output_str + + +target_dtype = "bfloat16" + + +# TODO: Add test for `tensor_parallel_size` [ref: PR #3883] +# Since we use _attn_implementation="eager" for hf_runner, here is +# numeric difference for longer context and test can't pass +@pytest.mark.parametrize("model_and_config", model_and_vl_config) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [8]) +def test_models(hf_runner, vllm_runner, hf_images, vllm_images, + model_and_config, dtype: str, max_tokens: int) -> None: + """Inference result should be the same between hf and vllm. + + All the image fixtures for the test is under tests/images. + For huggingface runner, we provide the PIL images as input. + For vllm runner, we provide MultiModalData objects and corresponding + vision language config as input. + Note, the text input is also adjusted to abide by vllm contract. + The text output is sanitized to be able to compare with hf. + """ + model_id, vlm_config = model_and_config + + with hf_runner(model_id, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS, + max_tokens, + images=hf_images) + + image_processor = AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True) + vllm_image_prompts = [ + "<用户>" + image_processor.get_slice_image_placeholder(IMAGE_FILES[i].image.size) \ + + p + "" + for i, p in enumerate(HF_IMAGE_PROMPTS) + ] + + with vllm_runner(model_id, + max_model_len=2048, + dtype=dtype, + enforce_eager=True, + **vlm_config.as_cli_args_dict()) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts, + max_tokens, + images=vllm_images) + + for i in range(len(HF_IMAGE_PROMPTS)): + hf_output_ids, hf_output_str = hf_outputs[i] + vllm_output_ids, vllm_output_str = vllm_to_hf_output( + vllm_outputs[i], vlm_config, model_id) + assert hf_output_str == vllm_output_str, ( + f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") + assert hf_output_ids == vllm_output_ids, ( + f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index bf5b1c996fc6a..9695e093ce27d 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -242,23 +242,6 @@ def __init__( self.resampler = self.init_resampler(self.embed_dim, self.vision_dim) self.resampler.to(device="cuda", dtype=param_dtype) self.sampler = Sampler() - self.img2tensor_transform, self.tensor2img_transform = \ - self.init_transform() - - def init_transform(self): - return transforms.Compose( - [ - transforms.ToTensor(), - transforms.Normalize( - mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD - ), - ] - ), \ - transforms.Compose( - [ - transforms.ToPILImage() - ] - ) def init_vision_module(self): default_dtype = torch.get_default_dtype() @@ -295,7 +278,7 @@ def get_vision_embedding(self, pixel_values): dtype = self.vpm.pos_embed.data.dtype for pixel_value in pixel_values: # V2.0 start - H, W = 
pixel_value.shape[-2:] + H, W = pixel_value[0].shape[-2:] tgt_size = (math.ceil(H / self.vpm.patch_embed.patch_size[0]), math.ceil(W / self.vpm.patch_embed.patch_size[0])) # V2.0 end @@ -308,269 +291,31 @@ def get_vision_embedding(self, pixel_values): res.append(self.resampler(vision_embedding, tgt_size)) return torch.vstack(res) - def get_image_bound(self, input_ids, im_start_token_id, im_end_token_id, - unk_token_id): - length = len(input_ids) - bound = [] - im_start_idx = -1 - flag = False - for x in range(length): - if input_ids[x] == im_start_token_id: - if flag is False: - flag = True - im_start_idx = x + 1 - elif input_ids[x] == im_end_token_id: - if flag is True: - flag = False - bound.append(im_start_idx) - bound.append(x - 1) - elif input_ids[x] != unk_token_id: - if flag is True: - flag = False - if len(bound) > 0: - bound = torch.tensor(bound).reshape(-1, 2) - return bound - - def ensure_divide(self, length, patch_size): - return max(round(length / patch_size) * patch_size, patch_size) - - def find_best_resize(self, - original_size, - scale_resolution, - patch_size, - allow_upscale=False): - width, height = original_size - if (width * height > - scale_resolution * scale_resolution) or allow_upscale: - r = width / height - height = int(scale_resolution / math.sqrt(r)) - width = int(height * r) - best_width = self.ensure_divide(width, patch_size) - best_height = self.ensure_divide(height, patch_size) - return (best_width, best_height) - - def get_refine_size(self, - original_size, - grid, - scale_resolution, - patch_size, - allow_upscale=False): - width, height = original_size - grid_x, grid_y = grid - - refine_width = self.ensure_divide(width, grid_x) - refine_height = self.ensure_divide(height, grid_y) - - grid_width = refine_width / grid_x - grid_height = refine_height / grid_y - - best_grid_size = self.find_best_resize((grid_width, grid_height), - scale_resolution, - patch_size, - allow_upscale=allow_upscale) - refine_size = (best_grid_size[0] * grid_x, best_grid_size[1] * grid_y) - return refine_size - - def split_to_patches(self, image, grid): - patches = [] - width, height = (image.shape[-1], image.shape[-2]) - grid_x = int(width / grid[0]) - grid_y = int(height / grid[1]) - for i in range(0, height, grid_y): - images = [] - for j in range(0, width, grid_x): - patch = image[:, i:i + grid_y, j:j + grid_x] - images.append(patch) - patches.append(images) - return patches - - def slice_image(self, - image_tensor: torch.Tensor, - max_slice_nums=9, - scale_resolution=448, - patch_size=14, - never_split=False): - original_size = (image_tensor.shape[-1], image_tensor.shape[-2]) - original_width, original_height = original_size - log_ratio = math.log(original_width / original_height) - ratio = original_width * original_height / (scale_resolution * - scale_resolution) - multiple = min(math.ceil(ratio), max_slice_nums) - - image = self.tensor2img_transform(image_tensor) - - source_image = None - best_grid = None - patches = [] - - if multiple <= 1 or never_split: - best_size = self.find_best_resize(original_size, - scale_resolution, - patch_size, - allow_upscale=True) - # The resizing of torchvision is also available in this function. - # But there are slight deviations between the results of - # torchvision resizing and pillow image resizing. - # For the consistency with MiniCPM-V-2 in HF, - # we choose PIL resizing and this may take a little more time. 
- # - # resize_transform = transforms.Compose([ - # transforms.Resize((best_size[::-1]), - # InterpolationMode.BICUBIC, - # antialias=True), - # transforms.Normalize(mean=IMAGENET_INCEPTION_MEAN, - # std=IMAGENET_INCEPTION_STD) - # ]) - # source_image = resize_transform(image) - source_image = self.img2tensor_transform( - image.resize(best_size, - Image.Resampling.BICUBIC)).to(image_tensor.device) - else: - candidate_split_grids_nums = [] - for i in [multiple - 1, multiple, multiple + 1]: - if i == 1 or i > max_slice_nums: - continue - candidate_split_grids_nums.append(i) - - best_resize = self.find_best_resize(original_size, - scale_resolution, patch_size) - # resize_transform = transforms.Compose([ - # transforms.Resize(best_resize[::-1], - # InterpolationMode.BICUBIC, - # antialias=True), - # transforms.Normalize(mean=IMAGENET_INCEPTION_MEAN, - # std=IMAGENET_INCEPTION_STD) - # ]) - # source_image = resize_transform(image_tensor.clone()) - source_image = self.img2tensor_transform(image.copy().resize( - best_resize, Image.Resampling.BICUBIC)).to(image_tensor.device) - - candidate_grids = [] - - # find best grid - for split_grids_nums in candidate_split_grids_nums: - m = 1 - while m <= split_grids_nums: - if split_grids_nums % m == 0: - candidate_grids.append([m, split_grids_nums // m]) - m += 1 - - best_grid = [1, 1] - min_error = float("inf") - for grid in candidate_grids: - error = abs(log_ratio - math.log(grid[0] / grid[1])) - if error < min_error: - best_grid = grid - min_error = error - - refine_size = self.get_refine_size(original_size, - best_grid, - scale_resolution, - patch_size, - allow_upscale=True) - - # resize_transform = transforms.Compose([ - # transforms.Resize(refine_size[::-1], - # InterpolationMode.BICUBIC, - # antialias=True), - # transforms.Normalize(mean=IMAGENET_INCEPTION_MEAN, - # std=IMAGENET_INCEPTION_STD) - # ]) - # refine_image = resize_transform(image.clone()) - refine_image = self.img2tensor_transform(image.copy().resize( - refine_size, Image.Resampling.BICUBIC)).to(image_tensor.device) - patches = self.split_to_patches(refine_image, best_grid) - - return source_image, patches, best_grid - - def get_grid_placeholder(self, grid, query_num): - image_placeholder = [self.config.im_start_token_id] + \ - [self.config.unk_token_id] * query_num + \ - [self.config.im_end_token_id] - - cols = grid[0] - rows = grid[1] - slices = [] - for i in range(rows): - lines = [] - for j in range(cols): - lines += image_placeholder - slices = slices + lines - if i < rows - 1: - slices += [5] # \n - slice_placeholder = [self.config.slice_start_token_id - ] + slices + [self.config.slice_end_token_id] - return slice_placeholder - - def get_slice_image_placeholder(self, image): - image_placeholder = [self.config.im_start_token_id] + \ - [self.config.unk_token_id] * self.config.query_num + \ - [self.config.im_end_token_id] - slice_images = [] - - source_image, patches, best_grid = self.slice_image( - image, - self.config.max_slice_nums, # default: 9 - self.config.scale_resolution, # default: 448 - self.config.patch_size # default: 14 - ) - - slice_images.append(source_image) - final_placeholder = image_placeholder - - if len(patches) > 0: - for i in range(len(patches)): - for j in range(len(patches[0])): - slice_images.append(patches[i][j]) - - final_placeholder += self.get_grid_placeholder( - best_grid, self.config.query_num) - return slice_images, final_placeholder - - def modify_input_ids(self, input_ids, place_holder, im_start_token_id, - im_end_token_id): - if 
len(torch.where(input_ids == im_end_token_id)[0]) == 0: - return [], input_ids - place_holder = torch.tensor(place_holder + [5]).to( - device=input_ids.device, dtype=input_ids.dtype) - start_idx = 0 - end_idx = 0 - for x in range(input_ids.shape[0]): - if input_ids[x] == im_start_token_id: - start_idx = x - elif input_ids[x] == im_end_token_id: - end_idx = x - input_ids = torch.cat([ - input_ids[:start_idx], place_holder, - input_ids[end_idx + 1:-place_holder.shape[0] + 2] - ], - dim=0) + def get_image_bound(self, input_ids, im_start_token_id, im_end_token_id): image_start_tokens = torch.where(input_ids == im_start_token_id)[0] - image_start_tokens += 1 image_end_tokens = torch.where(input_ids == im_end_token_id)[0] - valid_image_nums = max(len(image_start_tokens), len(image_end_tokens)) - if image_start_tokens[:valid_image_nums].unsqueeze( - -1).shape[0] == image_end_tokens[:valid_image_nums].unsqueeze( - -1).shape[0]: - image_bound = torch.cat([ + valid_image_nums = min(len(image_start_tokens), len(image_end_tokens)) + if valid_image_nums == 0: + return [] + image_bound = torch.hstack( + [ image_start_tokens[:valid_image_nums].unsqueeze(-1), image_end_tokens[:valid_image_nums].unsqueeze(-1), - ], - dim=1) - else: - image_bound = torch.tensor([]).to(device=input_ids.device, - dtype=input_ids.dtype) - return image_bound, input_ids + ] + ) + + return image_bound - def get_embedding(self, data, im_start_token_id, im_end_token_id, - unk_token_id): + def get_embedding(self, data, im_start_token_id, im_end_token_id): + input_ids = data['input_ids'] if 'vision_hidden_states' not in data: - pixel_values = data['pixel_values'] - if pixel_values is not None and len(pixel_values) > 0: - images, places_holder = self.get_slice_image_placeholder( - pixel_values[0]) - vision_hidden_states = self.get_vision_embedding(images) + pixel_values_list = data['pixel_values'] + vision_hidden_states = [] + if pixel_values_list is not None: + for pixel_values in pixel_values_list: + if pixel_values is not None and len(pixel_values) > 0: + vision_hidden_states.append(self.get_vision_embedding(pixel_values)) else: vision_hidden_states = torch.tensor([]).to( data['input_ids'].device) @@ -578,17 +323,21 @@ def get_embedding(self, data, im_start_token_id, im_end_token_id, vision_hidden_states = data['vision_hidden_states'] if data['pixel_values'] is not None: - image_bound, input_ids = self.modify_input_ids( - data['input_ids'], places_holder, im_start_token_id, - im_end_token_id) + image_bound = self.get_image_bound(input_ids, + im_start_token_id, + im_end_token_id) else: - input_ids = data['input_ids'] image_bound = [] vlm_embedding = self.llm.model.embed_tokens( input_ids) * self.llm.config.scale_emb - vision_hidden_states = vision_hidden_states.type(vlm_embedding.dtype) + vision_hidden_states = [ + i.type(vlm_embedding.dtype) if isinstance(i, torch.Tensor) else i + for i in vision_hidden_states + ] + if len(vision_hidden_states) > 0 and len(image_bound) > 0: + vision_hidden_states = torch.cat(vision_hidden_states, dim=0) image_indices = torch.stack([ torch.arange(r[0], r[1], dtype=torch.long) for r in image_bound ]).to(vlm_embedding.device) @@ -607,14 +356,11 @@ def forward( **kwargs: object, ): image_input = kwargs.pop("pixel_values", None) - if image_input is not None: - image_input = image_input.float() vlm_embeddings, vision_hidden_states = self.get_embedding( { "pixel_values": image_input, "input_ids": input_ids - }, self.config.im_start_token_id, self.config.im_end_token_id, - self.config.unk_token_id) + }, 
self.config.im_start_token_id, self.config.im_end_token_id) output = self.llm(input_ids=None, positions=positions, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index d0baa4337f84a..f4bfdc1f52a12 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -637,10 +637,25 @@ def _prepare_model_input( else: lora_mapping = None - multi_modal_kwargs = { - k: torch.cat(v, dim=0).to(self.device) - for k, v in multi_modal_kwargs_list.items() - } + # MiniCPMV needs dynamic image sizes, which cannot be concatenated. + if "MiniCPMV" == self.model.__class__.__name__: + def to_device_recursive(value): + if isinstance(value, list): + new_value = [] + for v in value: + new_value += [to_device_recursive(v)] + return new_value + else: + return value.to(self.device) + multi_modal_kwargs = { + k: to_device_recursive(v) + for k, v in multi_modal_kwargs_list.items() + } + else: + multi_modal_kwargs = { + k: torch.cat(v, dim=0).to(self.device) + for k, v in multi_modal_kwargs_list.items() + } return ModelInput( input_tokens=input_tokens_tensor, From d9187c89ca1c3ac3980b425ea943fb1dbe206595 Mon Sep 17 00:00:00 2001 From: hezhihui Date: Wed, 10 Jul 2024 17:06:50 +0800 Subject: [PATCH 22/52] update processing minicpmv --- .../configs/Meta-Llama-3-70B-Instruct.yaml | 11 + ...-3-8B-Instruct-FP8-compressed-tensors.yaml | 11 + .../configs/Meta-Llama-3-8B-Instruct-FP8.yaml | 11 + ...3-8B-Instruct-INT8-compressed-tensors.yaml | 11 + .../configs/Meta-Llama-3-8B-Instruct.yaml | 11 + ...xtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml | 11 + .../Mixtral-8x7B-Instruct-v0.1-FP8.yaml | 11 + .../configs/Mixtral-8x7B-Instruct-v0.1.yaml | 11 + .../configs/Qwen2-57B-A14-Instruct.yaml | 11 + .../lm-eval-harness/configs/models-large.txt | 3 + .../lm-eval-harness/configs/models-small.txt | 4 + .../run-lm-eval-gsm-hf-baseline.sh | 46 + .../run-lm-eval-gsm-vllm-baseline.sh | 51 + .buildkite/lm-eval-harness/run-tests.sh | 59 + .../test_lm_eval_correctness.py | 55 + .buildkite/run-openvino-test.sh | 14 + Dockerfile.openvino | 26 + Dockerfile.ppc64le | 22 + csrc/cpu/cpu_types_vsx.hpp | 491 +++++++ csrc/cpu/cpu_types_x86.hpp | 515 +++++++ csrc/quantization/fp8/fp8_marlin.cu | 1308 +++++++++++++++++ docs/source/_templates/sections/header.html | 38 + .../input_processing_pipeline.rst | 20 + .../input_processing/model_inputs_index.rst | 39 + .../getting_started/openvino-installation.rst | 95 ++ .../models/enabling_multimodal_inputs.rst | 147 ++ .../quantization/supported_hardware.rst | 30 + docs/source/serving/faq.rst | 12 + examples/llava_next_example.py | 36 + examples/offline_inference_mlpspeculator.py | 58 + examples/openai_vision_api_client.py | 86 ++ examples/paligemma_example.py | 52 + requirements-mamba.txt | 3 + requirements-openvino.txt | 9 + .../distributed/test_multimodal_broadcast.py | 54 + tests/distributed/test_parallel_state.py | 57 + tests/distributed/test_pipeline_parallel.py | 149 ++ tests/distributed/test_shm_broadcast.py | 99 ++ tests/entrypoints/llm/__init__.py | 0 tests/entrypoints/llm/test_encode.py | 142 ++ tests/entrypoints/llm/test_generate.py | 142 ++ .../llm/test_generate_multiple_loras.py | 67 + tests/entrypoints/openai/__init__.py | 0 tests/entrypoints/openai/test_chat.py | 873 +++++++++++ tests/entrypoints/openai/test_completion.py | 724 +++++++++ tests/entrypoints/openai/test_embedding.py | 144 ++ .../openai/test_guided_processors.py | 111 ++ tests/entrypoints/openai/test_models.py | 69 + .../openai/test_oot_registration.py | 66 + tests/entrypoints/openai/test_run_batch.py
| 53 + tests/entrypoints/openai/test_vision.py | 270 ++++ tests/kernels/test_flashinfer.py | 248 ++++ tests/models/test_compressed_tensors.py | 49 + tests/models/test_jamba.py | 65 + tests/models/test_paligemma.py | 147 ++ tests/multimodal/test_mapper.py | 85 ++ tests/quantization/test_lm_head.py | 45 + .../e2e/test_integration_dist_tp2.py | 123 ++ .../e2e/test_integration_dist_tp4.py | 60 + tests/spec_decode/e2e/test_mlp_correctness.py | 216 +++ tests/tokenization/test_get_eos.py | 31 + tests/worker/test_model_input.py | 152 ++ vllm/attention/backends/openvino.py | 101 ++ .../device_communicators/shm_broadcast.py | 295 ++++ vllm/engine/async_timeout.py | 189 +++ vllm/executor/openvino_executor.py | 163 ++ vllm/inputs/__init__.py | 19 + vllm/inputs/data.py | 143 ++ vllm/inputs/registry.py | 209 +++ vllm/model_executor/layers/fused_moe/layer.py | 197 +++ .../schemes/compressed_tensors_w8a8_fp8.py | 87 ++ .../schemes/compressed_tensors_w8a8_int8.py | 85 ++ .../schemes/compressed_tensors_wNa16.py | 175 +++ .../layers/quantization/utils/w8a8_utils.py | 163 ++ vllm/model_executor/model_loader/openvino.py | 210 +++ vllm/model_executor/models/clip.py | 289 ++++ vllm/model_executor/models/deepseek_v2.py | 537 +++++++ vllm/model_executor/models/gemma2.py | 395 +++++ vllm/model_executor/models/interfaces.py | 144 ++ vllm/model_executor/models/jamba.py | 955 ++++++++++++ vllm/model_executor/models/mlp_speculator.py | 188 +++ vllm/model_executor/models/paligemma.py | 344 +++++ vllm/model_executor/models/utils.py | 41 + vllm/platforms/__init__.py | 18 + vllm/platforms/cuda.py | 34 + vllm/platforms/interface.py | 21 + vllm/platforms/rocm.py | 15 + vllm/spec_decode/draft_model_runner.py | 179 +++ vllm/spec_decode/mlp_speculator_worker.py | 86 ++ .../spec_decode/smaller_tp_proposer_worker.py | 149 ++ .../configs/mlp_speculator.py | 65 + vllm/worker/model_runner_base.py | 162 ++ vllm/worker/openvino_model_runner.py | 344 +++++ vllm/worker/openvino_worker.py | 354 +++++ 94 files changed, 13915 insertions(+) create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml create mode 100644 .buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml create mode 100644 .buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml create mode 100644 .buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml create mode 100644 .buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml create mode 100644 .buildkite/lm-eval-harness/configs/models-large.txt create mode 100644 .buildkite/lm-eval-harness/configs/models-small.txt create mode 100644 .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh create mode 100644 .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh create mode 100644 .buildkite/lm-eval-harness/run-tests.sh create mode 100644 .buildkite/lm-eval-harness/test_lm_eval_correctness.py create mode 100755 .buildkite/run-openvino-test.sh create mode 100644 Dockerfile.openvino create mode 100644 Dockerfile.ppc64le create mode 100644 csrc/cpu/cpu_types_vsx.hpp create mode 100644 csrc/cpu/cpu_types_x86.hpp create mode 100644 
csrc/quantization/fp8/fp8_marlin.cu create mode 100644 docs/source/_templates/sections/header.html create mode 100644 docs/source/dev/input_processing/input_processing_pipeline.rst create mode 100644 docs/source/dev/input_processing/model_inputs_index.rst create mode 100644 docs/source/getting_started/openvino-installation.rst create mode 100644 docs/source/models/enabling_multimodal_inputs.rst create mode 100644 docs/source/quantization/supported_hardware.rst create mode 100644 docs/source/serving/faq.rst create mode 100644 examples/llava_next_example.py create mode 100644 examples/offline_inference_mlpspeculator.py create mode 100644 examples/openai_vision_api_client.py create mode 100644 examples/paligemma_example.py create mode 100644 requirements-mamba.txt create mode 100644 requirements-openvino.txt create mode 100644 tests/distributed/test_multimodal_broadcast.py create mode 100644 tests/distributed/test_parallel_state.py create mode 100644 tests/distributed/test_pipeline_parallel.py create mode 100644 tests/distributed/test_shm_broadcast.py create mode 100644 tests/entrypoints/llm/__init__.py create mode 100644 tests/entrypoints/llm/test_encode.py create mode 100644 tests/entrypoints/llm/test_generate.py create mode 100644 tests/entrypoints/llm/test_generate_multiple_loras.py create mode 100644 tests/entrypoints/openai/__init__.py create mode 100644 tests/entrypoints/openai/test_chat.py create mode 100644 tests/entrypoints/openai/test_completion.py create mode 100644 tests/entrypoints/openai/test_embedding.py create mode 100644 tests/entrypoints/openai/test_guided_processors.py create mode 100644 tests/entrypoints/openai/test_models.py create mode 100644 tests/entrypoints/openai/test_oot_registration.py create mode 100644 tests/entrypoints/openai/test_run_batch.py create mode 100644 tests/entrypoints/openai/test_vision.py create mode 100644 tests/kernels/test_flashinfer.py create mode 100644 tests/models/test_compressed_tensors.py create mode 100644 tests/models/test_jamba.py create mode 100644 tests/models/test_paligemma.py create mode 100644 tests/multimodal/test_mapper.py create mode 100644 tests/quantization/test_lm_head.py create mode 100644 tests/spec_decode/e2e/test_integration_dist_tp2.py create mode 100644 tests/spec_decode/e2e/test_integration_dist_tp4.py create mode 100644 tests/spec_decode/e2e/test_mlp_correctness.py create mode 100644 tests/tokenization/test_get_eos.py create mode 100644 tests/worker/test_model_input.py create mode 100644 vllm/attention/backends/openvino.py create mode 100644 vllm/distributed/device_communicators/shm_broadcast.py create mode 100644 vllm/engine/async_timeout.py create mode 100644 vllm/executor/openvino_executor.py create mode 100644 vllm/inputs/__init__.py create mode 100644 vllm/inputs/data.py create mode 100644 vllm/inputs/registry.py create mode 100644 vllm/model_executor/layers/fused_moe/layer.py create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py create mode 100644 vllm/model_executor/layers/quantization/utils/w8a8_utils.py create mode 100644 vllm/model_executor/model_loader/openvino.py create mode 100644 vllm/model_executor/models/clip.py create mode 100644 vllm/model_executor/models/deepseek_v2.py create mode 100644 
vllm/model_executor/models/gemma2.py create mode 100644 vllm/model_executor/models/interfaces.py create mode 100644 vllm/model_executor/models/jamba.py create mode 100644 vllm/model_executor/models/mlp_speculator.py create mode 100644 vllm/model_executor/models/paligemma.py create mode 100644 vllm/model_executor/models/utils.py create mode 100644 vllm/platforms/__init__.py create mode 100644 vllm/platforms/cuda.py create mode 100644 vllm/platforms/interface.py create mode 100644 vllm/platforms/rocm.py create mode 100644 vllm/spec_decode/draft_model_runner.py create mode 100644 vllm/spec_decode/mlp_speculator_worker.py create mode 100644 vllm/spec_decode/smaller_tp_proposer_worker.py create mode 100644 vllm/transformers_utils/configs/mlp_speculator.py create mode 100644 vllm/worker/model_runner_base.py create mode 100644 vllm/worker/openvino_model_runner.py create mode 100644 vllm/worker/openvino_worker.py diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml new file mode 100644 index 0000000000000..fa6ea236ef04f --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5 +model_name: "meta-llama/Meta-Llama-3-70B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.892 + - name: "exact_match,flexible-extract" + value: 0.892 +limit: 250 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml new file mode 100644 index 0000000000000..e40f42a17c18f --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 250 -f 5 -t 1 +model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.752 + - name: "exact_match,flexible-extract" + value: 0.752 +limit: 250 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml new file mode 100644 index 0000000000000..7a89e8e0c76f2 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1 +model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.756 + - name: "exact_match,flexible-extract" + value: 0.752 +limit: 250 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml new file mode 100644 index 0000000000000..bc29002985969 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 
-t 1 +model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.728 + - name: "exact_match,flexible-extract" + value: 0.728 +limit: 250 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml new file mode 100644 index 0000000000000..fb4b4915ab955 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1 +model_name: "meta-llama/Meta-Llama-3-8B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.756 + - name: "exact_match,flexible-extract" + value: 0.752 +limit: 250 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml b/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml new file mode 100644 index 0000000000000..75a24e408e7ad --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml @@ -0,0 +1,11 @@ +# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8 +model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.86 + - name: "exact_match,flexible-extract" + value: 0.86 +limit: 250 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml new file mode 100644 index 0000000000000..436ec21924ca1 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml @@ -0,0 +1,11 @@ +# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4 +model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.624 + - name: "exact_match,flexible-extract" + value: 0.624 +limit: 250 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml new file mode 100644 index 0000000000000..dec9164d1b84e --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4 +model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.616 + - name: "exact_match,flexible-extract" + value: 0.632 +limit: 250 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml new file mode 100644 index 0000000000000..45d5efc8860f5 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml @@ -0,0 +1,11 @@ +# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4 +model_name: "Qwen/Qwen2-57B-A14B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.792 + - name: "exact_match,flexible-extract" + value: 0.824 +limit: 
250 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/models-large.txt b/.buildkite/lm-eval-harness/configs/models-large.txt new file mode 100644 index 0000000000000..2007dd2e1cfa1 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-large.txt @@ -0,0 +1,3 @@ +Meta-Llama-3-70B-Instruct.yaml +Mixtral-8x7B-Instruct-v0.1.yaml +Qwen2-57B-A14-Instruct.yaml diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt new file mode 100644 index 0000000000000..3300ca64f44b8 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-small.txt @@ -0,0 +1,4 @@ +Meta-Llama-3-8B-Instruct.yaml +Meta-Llama-3-8B-Instruct-FP8.yaml +Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml +Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh new file mode 100644 index 0000000000000..fdb8ec5393b36 --- /dev/null +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# We can use this script to compute baseline accuracy on GSM for transformers. +# +# Make sure you have lm-eval-harness installed: +# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10 + +usage() { + echo`` + echo "Runs lm eval harness on GSM8k using huggingface transformers." + echo "This pathway is intended to be used to create baselines for " + echo "our automated nm-test-accuracy workflow" + echo + echo "usage: ${0} " + echo + echo " -m - huggingface stub or local directory of the model" + echo " -b - batch size to run the evaluation at" + echo " -l - limit number of samples to run" + echo " -f - number of fewshot samples to use" + echo +} + +while getopts "m:b:l:f:" OPT; do + case ${OPT} in + m ) + MODEL="$OPTARG" + ;; + b ) + BATCH_SIZE="$OPTARG" + ;; + l ) + LIMIT="$OPTARG" + ;; + f ) + FEWSHOT="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +lm_eval --model hf \ + --model_args pretrained=$MODEL,parallelize=True \ + --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \ + --batch_size $BATCH_SIZE diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh new file mode 100644 index 0000000000000..933733e9c1edf --- /dev/null +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# We can use this script to compute baseline accuracy on GSM for vllm. +# We use this for fp8, which HF does not support. +# +# Make sure you have lm-eval-harness installed: +# pip install lm-eval==0.4.2 + +usage() { + echo`` + echo "Runs lm eval harness on GSM8k using huggingface transformers." + echo "This pathway is intended to be used to create baselines for " + echo "our automated nm-test-accuracy workflow" + echo + echo "usage: ${0} " + echo + echo " -m - huggingface stub or local directory of the model" + echo " -b - batch size to run the evaluation at" + echo " -l - limit number of samples to run" + echo " -f - number of fewshot samples to use" + echo " -t - tensor parallel size to run at" + echo +} + +while getopts "m:b:l:f:t:" OPT; do + case ${OPT} in + m ) + MODEL="$OPTARG" + ;; + b ) + BATCH_SIZE="$OPTARG" + ;; + l ) + LIMIT="$OPTARG" + ;; + f ) + FEWSHOT="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? 
) + usage + exit 1 + ;; + esac +done + +lm_eval --model vllm \ + --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true \ + --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \ + --batch_size $BATCH_SIZE diff --git a/.buildkite/lm-eval-harness/run-tests.sh b/.buildkite/lm-eval-harness/run-tests.sh new file mode 100644 index 0000000000000..b4fdde6dab425 --- /dev/null +++ b/.buildkite/lm-eval-harness/run-tests.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +usage() { + echo`` + echo "Runs lm eval harness on GSM8k using vllm and compares to " + echo "precomputed baseline (measured by HF transformers.)" + echo + echo "usage: ${0} " + echo + echo " -c - path to the test data config (e.g. configs/small-models.txt)" + echo " -t - tensor parallel size" + echo +} + +SUCCESS=0 + +while getopts "c:t:" OPT; do + case ${OPT} in + c ) + CONFIG="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +# Parse list of configs. +IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG + +for MODEL_CONFIG in "${MODEL_CONFIGS[@]}" +do + LOCAL_SUCCESS=0 + + echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE===" + + export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG} + export LM_EVAL_TP_SIZE=$TP_SIZE + pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$? + + if [[ $LOCAL_SUCCESS == 0 ]]; then + echo "=== PASSED MODEL: ${MODEL_CONFIG} ===" + else + echo "=== FAILED MODEL: ${MODEL_CONFIG} ===" + fi + + SUCCESS=$((SUCCESS + LOCAL_SUCCESS)) + +done + +if [ "${SUCCESS}" -eq "0" ]; then + exit 0 +else + exit 1 +fi diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py new file mode 100644 index 0000000000000..7fdce7b53bd7f --- /dev/null +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -0,0 +1,55 @@ +""" +LM eval harness on model to compare vs HF baseline computed offline. +Configs are found in configs/$MODEL.yaml + +* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml +* export LM_EVAL_TP_SIZE=4 +* pytest -s test_lm_eval_correctness.py +""" + +import os +from pathlib import Path + +import lm_eval +import numpy +import yaml + +RTOL = 0.02 +TEST_DATA_FILE = os.environ.get( + "LM_EVAL_TEST_DATA_FILE", + ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml") + +TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1) + + +def launch_lm_eval(eval_config): + model_args = f"pretrained={eval_config['model_name']}," \ + f"tensor_parallel_size={TP_SIZE}," \ + f"add_bos_token=true" + + results = lm_eval.simple_evaluate( + model="vllm", + model_args=model_args, + tasks=[task["name"] for task in eval_config["tasks"]], + num_fewshot=eval_config["num_fewshot"], + limit=eval_config["limit"], + batch_size="auto") + + return results + + +def test_lm_eval_correctness(): + eval_config = yaml.safe_load( + Path(TEST_DATA_FILE).read_text(encoding="utf-8")) + + # Launch eval requests. + results = launch_lm_eval(eval_config) + + # Confirm scores match ground truth. 
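+    # numpy.isclose(a, b, rtol=RTOL) passes when |a - b| <= atol + rtol * |b|
+    # (atol defaults to 1e-08), so with RTOL = 0.02 the tolerance is about 2% of
+    # the measured value: e.g. a measured score of 0.750 accepts ground truths
+    # in roughly [0.735, 0.765].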
+ for task in eval_config["tasks"]: + for metric in task["metrics"]: + ground_truth = metric["value"] + measured_value = results["results"][task["name"]][metric["name"]] + print(f'{task["name"]} | {metric["name"]}: ' + f'ground_truth={ground_truth} | measured={measured_value}') + assert numpy.isclose(ground_truth, measured_value, rtol=RTOL) diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh new file mode 100755 index 0000000000000..70e56596c4a86 --- /dev/null +++ b/.buildkite/run-openvino-test.sh @@ -0,0 +1,14 @@ +# This script builds the OpenVINO docker image and runs the offline inference inside the container. +# It serves as a sanity check for compilation and basic model usage. +set -ex + +# Try building the docker image +docker build -t openvino-test -f Dockerfile.openvino . + +# Setup cleanup +remove_docker_container() { docker rm -f openvino-test || true; } +trap remove_docker_container EXIT +remove_docker_container + +# Run the image and launch offline inference
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py diff --git a/Dockerfile.openvino b/Dockerfile.openvino new file mode 100644 index 0000000000000..9861997b451a9 --- /dev/null +++ b/Dockerfile.openvino @@ -0,0 +1,26 @@ +# The vLLM Dockerfile is used to construct a vLLM image that can be directly used +# to run the OpenAI compatible server. + +FROM ubuntu:22.04 AS dev + +RUN apt-get update -y && \ + apt-get install -y python3-pip git +WORKDIR /workspace + +# copy requirements +COPY requirements-build.txt /workspace/vllm/ +COPY requirements-common.txt /workspace/vllm/ +COPY requirements-openvino.txt /workspace/vllm/ + +COPY vllm/ /workspace/vllm/vllm +COPY setup.py /workspace/vllm/ + +# install build requirements +RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt +# build vLLM with OpenVINO backend +RUN PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/ + +COPY examples/ /workspace/vllm/examples +COPY benchmarks/ /workspace/vllm/benchmarks + +CMD ["/bin/bash"] diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le new file mode 100644 index 0000000000000..d4e4c483cada8 --- /dev/null +++ b/Dockerfile.ppc64le @@ -0,0 +1,22 @@ +FROM mambaorg/micromamba +ARG MAMBA_DOCKERFILE_ACTIVATE=1 +USER root + +RUN apt-get update -y && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 + +# Some packages in requirements-cpu are installed here +# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba +# Currently these may not be available for venv or pip directly +RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 pytorch-cpu=2.1.2 torchvision-cpu=0.16.2 && micromamba clean --all --yes + +COPY ./ /workspace/vllm + +WORKDIR /workspace/vllm + +# These packages will be in rocketce eventually +RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing + +RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install + +WORKDIR /vllm-workspace +ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git
a/csrc/cpu/cpu_types_vsx.hpp b/csrc/cpu/cpu_types_vsx.hpp new file mode 100644 index 0000000000000..b50bdadc5713d --- /dev/null +++ b/csrc/cpu/cpu_types_vsx.hpp @@ -0,0 +1,491 @@ + +#ifndef CPU_TYPES_VSX_HPP +#define CPU_TYPES_VSX_HPP + +#include +#include +#include + +namespace vec_op { + +// FIXME: FP16 is not fully supported in Torch-CPU +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) + +#ifndef CPU_OP_GUARD +#define CPU_KERNEL_GUARD_IN(NAME) +#define CPU_KERNEL_GUARD_OUT(NAME) +#else +#define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; +#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl; +#endif + +#define FORCE_INLINE __attribute__((always_inline)) inline + +namespace { +template +constexpr void unroll_loop_item(std::integer_sequence, F &&f) { + (f(std::integral_constant{}), ...); +} +}; // namespace + +template >> +constexpr void unroll_loop(F &&f) { + unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); +} + +template struct Vec { + constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } +}; + +typedef struct ss16x8x2_t { + __vector signed short val[2]; +} ss16x8x2_t; + +typedef struct ss16x8x4_t { + __vector signed short val[4]; +} ss16x8x4_t; + +typedef struct f32x4x2_t { + __vector float val[2]; +} f32x4x2_t; + +typedef struct f32x4x4_t { + __vector float val[4]; +} f32x4x4_t; + +struct FP32Vec8; +struct FP32Vec16; + +struct BF16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + __vector signed short reg; + + explicit BF16Vec8(const void *ptr) + : reg((__vector signed short)vec_xl(0, (__vector signed short *)ptr)) {} + + explicit BF16Vec8(const FP32Vec8 &); + + void save(void *ptr) const { *reinterpret_cast<__vector signed short *>(ptr) = reg; } +}; + +struct BF16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + ss16x8x2_t reg; + + explicit BF16Vec16(const void *ptr) { + // Load 256 bits in two parts + reg.val[0] = (__vector signed short)vec_xl(0, (signed short *)ptr); + reg.val[1] = (__vector signed short)vec_xl(16, (signed short *)ptr); + } + + explicit BF16Vec16(const FP32Vec16 &); + + void save(void *ptr) const { + // Save 256 bits in two parts + vec_xst(reg.val[0], 0, (signed short *)ptr); + vec_xst(reg.val[1], 16, (signed short *)ptr); + } +}; + +const static __vector signed short zero = vec_splats((signed short)0); + +struct BF16Vec32 : public Vec { + constexpr static int VEC_ELEM_NUM = 32; + + ss16x8x4_t reg; + explicit BF16Vec32(const void *ptr) + : reg(*reinterpret_cast(ptr)) {} + + explicit BF16Vec32(ss16x8x4_t data) : reg(data) {} + + explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({ + vec8_data.reg, + vec8_data.reg, + vec8_data.reg, + vec8_data.reg + }) {} + + void save(void *ptr) const { *reinterpret_cast(ptr) = reg; } +}; + +struct FP32Vec4 : public Vec { + constexpr static int VEC_ELEM_NUM = 4; + union AliasReg { + __vector float reg; + float values[VEC_ELEM_NUM]; + }; + + __vector float reg; + + explicit FP32Vec4(float v) : reg(vec_splats(v)) {} + + explicit FP32Vec4() : reg(vec_splats(0.0f)) {} + + explicit FP32Vec4(const float *ptr) : reg(vec_xl(0, ptr)) {} + + explicit FP32Vec4(__vector float data) : reg(data) {} + + explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} +}; + +struct 
FP32Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + union AliasReg { + f32x4x2_t reg; + float values[VEC_ELEM_NUM]; + }; + + f32x4x2_t reg; + + explicit FP32Vec8(float v) { + reg.val[0] = vec_splats(v); + reg.val[1] = vec_splats(v); + } + + explicit FP32Vec8() { + reg.val[0] = vec_splats(0.0f); + reg.val[1] = vec_splats(0.0f); + } + + explicit FP32Vec8(const float *ptr) { + reg.val[0] = vec_xl(0, ptr); + reg.val[1] = vec_xl(16, ptr); + } + + explicit FP32Vec8(f32x4x2_t data) : reg(data) {} + + explicit FP32Vec8(const FP32Vec8 &data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + } + + explicit FP32Vec8(const BF16Vec8 &v) { + reg.val[0] = (__vector float)vec_mergeh(zero, v.reg); + reg.val[1] = (__vector float)vec_mergel(zero, v.reg); + } + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float result = 0; + unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); + + return result; + } + + FP32Vec8 exp() const { + // TODO: Vectorize this + AliasReg ar; + ar.reg = reg; + f32x4x4_t ret; + ret.val[0][0] = std::exp(ar.values[0]); + ret.val[0][1] = std::exp(ar.values[1]); + ret.val[0][2] = std::exp(ar.values[2]); + ret.val[0][3] = std::exp(ar.values[3]); + ret.val[1][0] = std::exp(ar.values[4]); + ret.val[1][1] = std::exp(ar.values[5]); + ret.val[1][2] = std::exp(ar.values[6]); + ret.val[1][3] = std::exp(ar.values[7]); + return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); + } + + FP32Vec8 tanh() const { + // TODO: Vectorize this + AliasReg ar; + ar.reg = reg; + f32x4x4_t ret; + ret.val[0][0] = std::tanh(ar.values[0]); + ret.val[0][1] = std::tanh(ar.values[1]); + ret.val[0][2] = std::tanh(ar.values[2]); + ret.val[0][3] = std::tanh(ar.values[3]); + ret.val[1][0] = std::tanh(ar.values[4]); + ret.val[1][1] = std::tanh(ar.values[5]); + ret.val[1][2] = std::tanh(ar.values[6]); + ret.val[1][3] = std::tanh(ar.values[7]); + return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); + } + + FP32Vec8 er() const { + // TODO: Vectorize this + AliasReg ar; + ar.reg = reg; + f32x4x4_t ret; + ret.val[0][0] = std::erf(ar.values[0]); + ret.val[0][1] = std::erf(ar.values[1]); + ret.val[0][2] = std::erf(ar.values[2]); + ret.val[0][3] = std::erf(ar.values[3]); + ret.val[1][0] = std::erf(ar.values[4]); + ret.val[1][1] = std::erf(ar.values[5]); + ret.val[1][2] = std::erf(ar.values[6]); + ret.val[1][3] = std::erf(ar.values[7]); + return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); + } + + FP32Vec8 operator*(const FP32Vec8 &b) const { + return FP32Vec8({vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])}); + } + + FP32Vec8 operator+(const FP32Vec8 &b) const { + return FP32Vec8({vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])}); + } + + FP32Vec8 operator-(const FP32Vec8 &b) const { + return FP32Vec8({vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])}); + } + + FP32Vec8 operator/(const FP32Vec8 &b) const { + return FP32Vec8({vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])}); + } + + void save(float *ptr) const { + vec_xst(reg.val[0], 0, ptr); + vec_xst(reg.val[1], 16, ptr); + } +}; + +struct FP32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + union AliasReg { + f32x4x4_t reg; + float values[VEC_ELEM_NUM]; + }; + + f32x4x4_t reg; + + explicit FP32Vec16(float v) { + reg.val[0] = vec_splats(v); + reg.val[1] = vec_splats(v); + reg.val[2] = vec_splats(v); + reg.val[3] = vec_splats(v); + } + + explicit FP32Vec16() { + reg.val[0] = vec_splats(0.0f); + reg.val[1] = 
vec_splats(0.0f); + reg.val[2] = vec_splats(0.0f); + reg.val[3] = vec_splats(0.0f); + } + + explicit FP32Vec16(const float *ptr) { + reg.val[0] = vec_xl(0, ptr); + reg.val[1] = vec_xl(16, ptr); + reg.val[2] = vec_xl(32, ptr); + reg.val[3] = vec_xl(48, ptr); + } + + explicit FP32Vec16(f32x4x4_t data) : reg(data) {} + + explicit FP32Vec16(const FP32Vec16 &data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + reg.val[2] = data.reg.val[2]; + reg.val[3] = data.reg.val[3]; + } + + explicit FP32Vec16(const FP32Vec4 &data) { + reg.val[0] = data.reg; + reg.val[1] = data.reg; + reg.val[2] = data.reg; + reg.val[3] = data.reg; + } + + explicit FP32Vec16(const FP32Vec8 &data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + reg.val[2] = data.reg.val[0]; + reg.val[3] = data.reg.val[1]; + } + + explicit FP32Vec16(const BF16Vec16 &v) { + reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]); + reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]); + reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]); + reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]); + } + + explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + + FP32Vec16 operator*(const FP32Vec16 &b) const { + return FP32Vec16(f32x4x4_t({ + vec_mul(reg.val[0], b.reg.val[0]), + vec_mul(reg.val[1], b.reg.val[1]), + vec_mul(reg.val[2], b.reg.val[2]), + vec_mul(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator+(const FP32Vec16 &b) const { + return FP32Vec16(f32x4x4_t({ + vec_add(reg.val[0], b.reg.val[0]), + vec_add(reg.val[1], b.reg.val[1]), + vec_add(reg.val[2], b.reg.val[2]), + vec_add(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator-(const FP32Vec16 &b) const { + return FP32Vec16(f32x4x4_t({ + vec_sub(reg.val[0], b.reg.val[0]), + vec_sub(reg.val[1], b.reg.val[1]), + vec_sub(reg.val[2], b.reg.val[2]), + vec_sub(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator/(const FP32Vec16 &b) const { + return FP32Vec16(f32x4x4_t({ + vec_div(reg.val[0], b.reg.val[0]), + vec_div(reg.val[1], b.reg.val[1]), + vec_div(reg.val[2], b.reg.val[2]), + vec_div(reg.val[3], b.reg.val[3])})); + } + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float result = 0; + unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); + + return result; + } + + template float reduce_sub_sum(int idx) { + static_assert(VEC_ELEM_NUM % group_size == 0); + + AliasReg ar; + ar.reg = reg; + float result = 0; + const int start = idx * group_size; + unroll_loop( + [&result, &start, ar](int i) { result += ar.values[start + i]; }); + + return result; + } + + void save(float *ptr) const { + vec_xst(reg.val[0], 0, ptr); + vec_xst(reg.val[1], 16, ptr); + vec_xst(reg.val[2], 32, ptr); + vec_xst(reg.val[3], 48, ptr); + } +}; + +template struct VecType { using vec_type = void; }; + +template using vec_t = typename VecType::vec_type; + +template <> struct VecType { using vec_type = FP32Vec8; }; + +template <> struct VecType { using vec_type = BF16Vec8; }; + +template void storeFP32(float v, T *ptr) { *ptr = v; } + +inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { + acc = acc + a * b; +} + +template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { + c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = + reinterpret_cast(&v); + *ptr = *(v_ptr + 1); +} + +#ifndef __VEC_CLASS_FP_NAN +#define __VEC_CLASS_FP_NAN (1 << 6) +#endif + +const static __vector unsigned char omask = { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; +#ifndef _ARCH_PWR10 +const static 
__vector unsigned int bias = { 0x00007fff, 0x00007fff, 0x00007fff, 0x00007fff }; +const static __vector unsigned int nan = { 0x7fc00000, 0x7fc00000, 0x7fc00000, 0x7fc00000 }; +const static __vector unsigned int sh16 = { 16, 16, 16, 16 }; +const static __vector unsigned int one = { 1, 1, 1, 1 }; +#endif + +inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) { +#ifdef _ARCH_PWR10 + __vector signed short ret[2]; + ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]); + ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]); + reg = vec_perm(ret[0], ret[1], omask); +#elif defined(_ARCH_PWR9) + __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]); + __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]); + __vector unsigned int lsb0 = vec_sr(inp0, sh16); + __vector unsigned int lsb1 = vec_sr(inp1, sh16); + lsb0 = vec_and(lsb0, one); + lsb1 = vec_and(lsb1, one); + __vector unsigned int rnd0 = vec_add(lsb0, bias); + __vector unsigned int rnd1 = vec_add(lsb1, bias); + inp0 = vec_add(inp0, rnd0); + inp1 = vec_add(inp1, rnd1); + __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); + __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); + inp0 = vec_sel(inp0, nan, sel0); + inp1 = vec_sel(inp1, nan, sel1); + inp0 = vec_sr(inp0, sh16); + inp1 = vec_sr(inp1, sh16); + reg = (__vector signed short)vec_perm(inp0, inp1, omask); +#endif +} + +inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { +#ifdef _ARCH_PWR10 + __vector signed short ret[4]; + ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]); + ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]); + ret[2] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[2]); + ret[3] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[3]); + reg.val[0] = vec_perm(ret[0], ret[1], omask); + reg.val[1] = vec_perm(ret[2], ret[3], omask); +#elif defined(_ARCH_PWR9) + __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]); + __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]); + __vector unsigned int inp2 = (__vector unsigned int)(v.reg.val[2]); + __vector unsigned int inp3 = (__vector unsigned int)(v.reg.val[3]); + __vector unsigned int lsb0 = vec_sr(inp0, sh16); + __vector unsigned int lsb1 = vec_sr(inp1, sh16); + __vector unsigned int lsb2 = vec_sr(inp2, sh16); + __vector unsigned int lsb3 = vec_sr(inp3, sh16); + lsb0 = vec_and(lsb0, one); + lsb1 = vec_and(lsb1, one); + lsb2 = vec_and(lsb2, one); + lsb3 = vec_and(lsb3, one); + __vector unsigned int rnd0 = vec_add(lsb0, bias); + __vector unsigned int rnd1 = vec_add(lsb1, bias); + __vector unsigned int rnd2 = vec_add(lsb2, bias); + __vector unsigned int rnd3 = vec_add(lsb3, bias); + inp0 = vec_add(inp0, rnd0); + inp1 = vec_add(inp1, rnd1); + inp2 = vec_add(inp2, rnd2); + inp3 = vec_add(inp3, rnd3); + __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); + __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); + __vector __bool int sel2 = vec_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN); + __vector __bool int sel3 = vec_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN); + inp0 = vec_sel(inp0, nan, sel0); + inp1 = vec_sel(inp1, nan, sel1); + inp2 = vec_sel(inp2, nan, sel2); + inp3 = vec_sel(inp3, nan, sel3); + inp0 = vec_sr(inp0, sh16); 
+ inp1 = vec_sr(inp1, sh16); + inp2 = vec_sr(inp2, sh16); + inp3 = vec_sr(inp3, sh16); + reg.val[0] = (__vector signed short)vec_perm(inp0, inp1, omask); + reg.val[1] = (__vector signed short)vec_perm(inp2, inp3, omask); +#endif +} + +inline void prefetch(const void *addr) { + __asm__ __volatile__("dcbt 0, %0" : : "r"(addr) : "memory"); +} + +}; // namespace vec_op + +#endif diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp new file mode 100644 index 0000000000000..f50620a5287d4 --- /dev/null +++ b/csrc/cpu/cpu_types_x86.hpp @@ -0,0 +1,515 @@ + +#ifndef CPU_TYPES_X86_HPP +#define CPU_TYPES_X86_HPP + +#include +#include + +#ifndef __AVX2__ +static_assert(false, "AVX2 must be supported for the current implementation."); +#endif + +namespace vec_op { + +// FIXME: FP16 is not fully supported in Torch-CPU +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) + +#ifndef CPU_OP_GUARD +#define CPU_KERNEL_GUARD_IN(NAME) +#define CPU_KERNEL_GUARD_OUT(NAME) +#else +#define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; +#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl; +#endif + +#define FORCE_INLINE __attribute__((always_inline)) inline + +namespace { +template +constexpr void unroll_loop_item(std::integer_sequence, F &&f) { + (f(std::integral_constant{}), ...); +} +}; // namespace + +template >> +constexpr void unroll_loop(F &&f) { + unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); +} + +template struct Vec { + constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } +}; + +struct FP32Vec8; +struct FP32Vec16; + +#ifdef __AVX512FP16__ +struct FP16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + __m128h reg; + + explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {} + + explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {} + + explicit FP16Vec8(__m128h data) : reg(data) {} + + FP16Vec8 operator*(const FP16Vec8 &b) const { + return FP16Vec8(_mm_mul_ph(reg, b.reg)); + } + + FP16Vec8 operator+(const FP16Vec8 &b) const { + return FP16Vec8(_mm_add_ph(reg, b.reg)); + } + + FP16Vec8 operator-(const FP16Vec8 &b) const { + return FP16Vec8(_mm_sub_ph(reg, b.reg)); + } + + FP16Vec8 operator/(const FP16Vec8 &b) const { + return FP16Vec8(_mm_div_ph(reg, b.reg)); + } + + void save(void *ptr) const { _mm_storeu_ph(ptr, reg); } +}; +#endif + +struct BF16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + __m128i reg; + + explicit BF16Vec8(const void *ptr) + : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} + + explicit BF16Vec8(const FP32Vec8 &); + + void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } +}; + +struct BF16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + __m256i reg; + + explicit BF16Vec16(const void *ptr) + : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} + + explicit BF16Vec16(const FP32Vec16 &); + + void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } +}; + +#ifdef __AVX512F__ +struct BF16Vec32 : public Vec { + constexpr static int VEC_ELEM_NUM = 32; + + __m512i reg; + + explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {} + + explicit BF16Vec32(__m512i data) : reg(data) {} + + explicit BF16Vec32(BF16Vec8 &vec8_data) + : 
reg((__m512i)_mm512_inserti32x4( + _mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512( + (__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1), + (__m128i)vec8_data.reg, 2), + (__m128i)vec8_data.reg, 3)) {} + + void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; } +}; +#else +struct BF16Vec32 : public Vec { + constexpr static int VEC_ELEM_NUM = 32; + + __m256i reg_low; + __m256i reg_high; + + explicit BF16Vec32(const void *ptr) + : reg_low(_mm256_loadu_si256((__m256i const *)ptr)), + reg_high(_mm256_loadu_si256((__m256i const *)ptr + 1)) {} + + explicit BF16Vec32(__m256i low, __m256i high) : reg_low(low), + reg_high(high) {} + + explicit BF16Vec32(BF16Vec8 &vec8_data) + : reg_low((__m256i)_mm256_inserti32x4( + _mm256_castsi128_si256((__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1)), + reg_high((__m256i)_mm256_inserti32x4( + _mm256_castsi128_si256((__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1)) {} + + void save(void *ptr) const { + *reinterpret_cast<__m256i *>(ptr) = reg_low; + *reinterpret_cast<__m256i *>((__m256i *)ptr + 1) = reg_high; + } +}; +#endif + +struct FP32Vec4 : public Vec { + constexpr static int VEC_ELEM_NUM = 4; + union AliasReg { + __m128 reg; + float values[VEC_ELEM_NUM]; + }; + + __m128 reg; + + explicit FP32Vec4(float v) : reg(_mm_set1_ps(v)) {} + + explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {} + + explicit FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {} + + explicit FP32Vec4(__m128 data) : reg(data) {} + + explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} +}; + +struct FP32Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + union AliasReg { + __m256 reg; + float values[VEC_ELEM_NUM]; + }; + + __m256 reg; + + explicit FP32Vec8(float v) : reg(_mm256_set1_ps(v)) {} + + explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {} + + explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {} + + explicit FP32Vec8(__m256 data) : reg(data) {} + + explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {} + +#ifdef __AVX512FP16__ + explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {} +#endif + + explicit FP32Vec8(const BF16Vec8 &v) + : reg(_mm256_castsi256_ps( + _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {} + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float result = 0; + unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); + + return result; + } + + FP32Vec8 exp() const { + AliasReg ar; + ar.reg = reg; + return FP32Vec8(_mm256_set_ps(expf(ar.values[7]), expf(ar.values[6]), + expf(ar.values[5]), expf(ar.values[4]), + expf(ar.values[3]), expf(ar.values[2]), + expf(ar.values[1]), expf(ar.values[0]))); + } + + FP32Vec8 tanh() const { + AliasReg ar; + ar.reg = reg; + return FP32Vec8(_mm256_set_ps(tanhf(ar.values[7]), tanhf(ar.values[6]), + tanhf(ar.values[5]), tanhf(ar.values[4]), + tanhf(ar.values[3]), tanhf(ar.values[2]), + tanhf(ar.values[1]), tanhf(ar.values[0]))); + } + + FP32Vec8 er() const { + AliasReg ar; + ar.reg = reg; + return FP32Vec8(_mm256_set_ps(erf(ar.values[7]), erf(ar.values[6]), + erf(ar.values[5]), erf(ar.values[4]), + erf(ar.values[3]), erf(ar.values[2]), + erf(ar.values[1]), erf(ar.values[0]))); + } + + FP32Vec8 operator*(const FP32Vec8 &b) const { + return FP32Vec8(_mm256_mul_ps(reg, b.reg)); + } + + FP32Vec8 operator+(const FP32Vec8 &b) const { + return FP32Vec8(_mm256_add_ps(reg, b.reg)); + } + + FP32Vec8 operator-(const FP32Vec8 &b) const { + return FP32Vec8(_mm256_sub_ps(reg, b.reg)); + } + + FP32Vec8 operator/(const FP32Vec8 
&b) const { + return FP32Vec8(_mm256_div_ps(reg, b.reg)); + } + + void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); } +}; + +#ifdef __AVX512F__ +struct FP32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + union AliasReg { + __m512 reg; + float values[VEC_ELEM_NUM]; + }; + + __m512 reg; + + explicit FP32Vec16(float v) : reg(_mm512_set1_ps(v)) {} + + explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {} + + explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {} + + explicit FP32Vec16(__m512 data) : reg(data) {} + + explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {} + + explicit FP32Vec16(const FP32Vec4 &data) + : reg((__m512)_mm512_inserti32x4( + _mm512_inserti32x4( + _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg), + (__m128i)data.reg, 1), + (__m128i)data.reg, 2), + (__m128i)data.reg, 3)) {} + + explicit FP32Vec16(const FP32Vec8 &data) + : reg((__m512)_mm512_inserti32x8( + _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {} + + explicit FP32Vec16(const BF16Vec16 &v) + : reg(_mm512_castsi512_ps( + _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {} + + explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + + FP32Vec16 operator*(const FP32Vec16 &b) const { + return FP32Vec16(_mm512_mul_ps(reg, b.reg)); + } + + FP32Vec16 operator+(const FP32Vec16 &b) const { + return FP32Vec16(_mm512_add_ps(reg, b.reg)); + } + + FP32Vec16 operator-(const FP32Vec16 &b) const { + return FP32Vec16(_mm512_sub_ps(reg, b.reg)); + } + + FP32Vec16 operator/(const FP32Vec16 &b) const { + return FP32Vec16(_mm512_div_ps(reg, b.reg)); + } + + float reduce_sum() const { return _mm512_reduce_add_ps(reg); } + + template float reduce_sub_sum(int idx) { + static_assert(VEC_ELEM_NUM % group_size == 0); + constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); + __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size)); + return _mm512_mask_reduce_add_ps(mask, reg); + } + + void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); } +}; +#else +struct FP32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + union AliasReg { + __m256 reg; + float values[8]; + }; + + __m256 reg_low; + __m256 reg_high; + + explicit FP32Vec16(float v) : reg_low(_mm256_set1_ps(v)), + reg_high(_mm256_set1_ps(v)) {} + + explicit FP32Vec16() : reg_low(_mm256_set1_ps(0.0)), + reg_high(_mm256_set1_ps(0.0)) {} + + explicit FP32Vec16(const float *ptr) : reg_low(_mm256_loadu_ps(ptr)), + reg_high(_mm256_loadu_ps(ptr + 8)) {} + + explicit FP32Vec16(__m256 low, __m256 high) : reg_low(low), reg_high(high) {} + + explicit FP32Vec16(const FP32Vec16 &data) : reg_low(data.reg_low), + reg_high(data.reg_high) {} + + explicit FP32Vec16(const FP32Vec4 &data) + : reg_low((__m256)_mm256_inserti128_si256( + _mm256_castsi128_si256((__m128i)data.reg), + (__m128i)data.reg, 1)), + reg_high((__m256)_mm256_inserti128_si256( + _mm256_castsi128_si256((__m128i)data.reg), + (__m128i)data.reg, 1)) {} + + explicit FP32Vec16(const FP32Vec8 &data) + : reg_low(data.reg), reg_high(data.reg) {} + + explicit FP32Vec16(const BF16Vec16 &v) { + __m128i low = _mm256_extractf128_si256(v.reg, 0); + __m128i high = _mm256_extractf128_si256(v.reg, 1); + + __m256i v_low_epi32 = _mm256_cvtepu16_epi32(low); + __m256i v_high_epi32 = _mm256_cvtepu16_epi32(high); + + __m256i v_low_shifted = _mm256_bslli_epi128(v_low_epi32, 2); + __m256i v_high_shifted = _mm256_bslli_epi128(v_high_epi32, 2); + + reg_low = _mm256_castsi256_ps(v_low_shifted); + reg_high = 
_mm256_castsi256_ps(v_high_shifted); + } + + explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + + FP32Vec16 operator*(const FP32Vec16 &b) const { + return FP32Vec16(_mm256_mul_ps(reg_low, b.reg_low), + _mm256_mul_ps(reg_high, b.reg_high)); + } + + FP32Vec16 operator+(const FP32Vec16 &b) const { + return FP32Vec16(_mm256_add_ps(reg_low, b.reg_low), + _mm256_add_ps(reg_high, b.reg_high)); + } + + FP32Vec16 operator-(const FP32Vec16 &b) const { + return FP32Vec16(_mm256_sub_ps(reg_low, b.reg_low), + _mm256_sub_ps(reg_high, b.reg_high)); + } + + FP32Vec16 operator/(const FP32Vec16 &b) const { + return FP32Vec16(_mm256_div_ps(reg_low, b.reg_low), + _mm256_div_ps(reg_high, b.reg_high)); + } + + float reduce_sum() const { + FP32Vec8 low = FP32Vec8(reg_low); + FP32Vec8 high = FP32Vec8(reg_high); + return low.reduce_sum() + high.reduce_sum(); + } + + template float reduce_sub_sum(int idx) { + float sum = 0.0; + static_assert(VEC_ELEM_NUM % group_size == 0); + constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); + uint32_t mask = base_mask << (idx * group_size); + + AliasReg ar; + + auto func = [&sum, &mask, &ar](int i) { + int flag = mask & 0x1; + mask = mask >> 1; + if (flag != 0) sum += ar.values[i]; + }; + + ar.reg = reg_low; + unroll_loop(func); + + ar.reg = reg_high; + unroll_loop(func); + + return sum; + } + + void save(float *ptr) const { + _mm256_storeu_ps(ptr, reg_low); + _mm256_storeu_ps(ptr + 8, reg_high); + } +}; +#endif + +template struct VecType { using vec_type = void; }; + +template using vec_t = typename VecType::vec_type; + +template <> struct VecType { using vec_type = FP32Vec8; }; + +#ifdef __AVX512FP16__ +template <> struct VecType { using vec_type = FP16Vec16; }; +#endif + +template <> struct VecType { using vec_type = BF16Vec8; }; + +template void storeFP32(float v, T *ptr) { *ptr = v; } + +#ifdef __AVX512FP16__ +template <> inline void storeFP32(float v, c10::Half *ptr) { + *reinterpret_cast<_Float16 *>(ptr) = v; +} +#endif + +inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { + acc = acc + a * b; +} + +#ifdef __AVX512BF16__ +template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { + *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v); +} + +inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) + : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {} + +inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) + : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {} + +inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { + acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg); +} +#else +template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { + c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = + reinterpret_cast(&v); + *ptr = *(v_ptr + 1); +} + +#ifdef __AVX512F__ +inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) + : reg(_mm256_cvtepi32_epi16( + _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {} + +inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) + : reg(_mm512_cvtepi32_epi16( + _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {} +#else +namespace{ +__m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) { + __m256i ai = _mm256_castps_si256(a); + ai = _mm256_srli_epi32(ai, 16); + ai = _mm256_packus_epi32(ai, ai); + ai = _mm256_permute4x64_epi64(ai, 0b00111001); + return _mm256_extracti128_si256(ai, 0); +} +} + +inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) + : reg(FP32Vec8_to_BF16Vec8_avx2(v.reg)) {} + +inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { + BF16Vec8 low = BF16Vec8(FP32Vec8(v.reg_low)); + BF16Vec8 high = 
BF16Vec8(FP32Vec8(v.reg_high)); + reg = _mm256_insertf128_si256(_mm256_castsi128_si256(low.reg), high.reg, 1); +} +#endif // __AVX512F__ +#endif // __AVX512BF16__ + +inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); } + +}; // namespace vec_op + +#endif diff --git a/csrc/quantization/fp8/fp8_marlin.cu b/csrc/quantization/fp8/fp8_marlin.cu new file mode 100644 index 0000000000000..51ff071987f80 --- /dev/null +++ b/csrc/quantization/fp8/fp8_marlin.cu @@ -0,0 +1,1308 @@ +/* + * Modified by Neural Magic + * Copyright (C) Marlin.2024 Elias Frantar + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Adapted from https://github.com/IST-DASLab/marlin + */ + +#include "../gptq_marlin/gptq_marlin.cuh" +#include "../gptq_marlin/gptq_marlin_dtypes.cuh" + +using namespace gptq_marlin; + +#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \ + static_assert(std::is_same::value || \ + std::is_same::value, \ + "only float16 and bfloat16 is supported"); + +template +inline std::string str(T x) { + return std::to_string(x); +} + +namespace fp8_marlin { + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + +template shared + // fetch pipeline + const int group_blocks = -1 // number of consecutive 16x16 blocks + // with a separate quantization scale + > +__global__ void Marlin( + const int4* __restrict__ A, // fp16 input matrix of shape mxk + const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn + int4* __restrict__ C, // fp16 output buffer of shape mxn + const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape + // (k/groupsize)xn + int num_groups, // number of scale groups per output channel + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int* locks // extra global storage for barrier synchronization +) {} + +} // namespace fp8_marlin + +torch::Tensor fp8_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, + torch::Tensor& b_scales, torch::Tensor& workspace, + int64_t num_bits, int64_t size_m, int64_t size_n, + int64_t size_k) { + TORCH_CHECK_NOT_IMPLEMENTED(false, + "marlin_gemm(..) requires CUDA_ARCH >= 8.0"); + return torch::empty({1, 1}); +} + +#else + +// m16n8k16 tensor core mma instruction with fp16 inputs and fp32 +// output/accumulation. 
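+// For the m16n8k16 shape, each warp-wide mma.sync call multiplies a 16x16 FP16
+// (or BF16) tile of A (4 x .b32 registers per thread) by a 16x8 tile of B
+// (2 registers per thread) and accumulates into a 16x8 FP32 tile of C
+// (4 x .f32 registers per thread).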
+template +__device__ inline void mma(const typename ScalarType::FragA& a_frag, + const typename ScalarType::FragB& frag_b, + typename ScalarType::FragC& frag_c) { + const uint32_t* a = reinterpret_cast(&a_frag); + const uint32_t* b = reinterpret_cast(&frag_b); + float* c = reinterpret_cast(&frag_c); + if constexpr (std::is_same::value) { + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), + "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); + } else if constexpr (std::is_same::value) { + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), + "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); + } else { + STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t); + } +} + +// Instruction for loading a full 16x16 matrix fragment of operand A from shared +// memory, directly in tensor core layout. +template +__device__ inline void ldsm4(typename ScalarType::FragA& frag_a, + const void* smem_ptr) { + uint32_t* a = reinterpret_cast(&frag_a); + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n" + : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3]) + : "r"(smem)); +} + +// Fast FP8ToFp16/FP8ToBf16: Efficiently dequantize 8bit fp8_e4m3 values to fp16 +// bf16 Reference: +// - FP16: +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L53-L85 +// - BF16: +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L125-L175 +template +__device__ inline typename ScalarType::FragB dequant_8bit(int q) { + STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t); +} + +template <> +__device__ inline typename ScalarType::FragB dequant_8bit(int q) { + // Constants for FP8 (E4M3) and FP16 formats + constexpr int FP8_EXPONENT = 4, FP8_MANTISSA = 3, FP16_EXPONENT = 5; + constexpr int RIGHT_SHIFT = FP16_EXPONENT - FP8_EXPONENT; + + // Calculate MASK for extracting mantissa and exponent + constexpr int MASK1 = 0x80000000; + constexpr int MASK2 = MASK1 >> (FP8_EXPONENT + FP8_MANTISSA); + constexpr int MASK3 = MASK2 & 0x7fffffff; + constexpr int MASK = MASK3 | (MASK3 >> 16); + // Final MASK value: 0x7F007F00 + + // Extract and shift FP8 values to FP16 format + int Out1 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT); + int Out2 = ((q << 8) & 0x80008000) | (((q << 8) & MASK) >> RIGHT_SHIFT); + + // Construct and apply exponent bias + constexpr int BIAS_OFFSET = + (1 << (FP16_EXPONENT - 1)) - (1 << (FP8_EXPONENT - 1)); + const half2 bias_reg = __float2half2_rn(float(1 << BIAS_OFFSET)); + + // Convert to half2 and apply bias + typename ScalarType::FragB frag_b; + // Note: reverse indexing is intentional because weights are permuted + frag_b[1] = __hmul2(*reinterpret_cast(&Out1), bias_reg); + frag_b[0] = __hmul2(*reinterpret_cast(&Out2), bias_reg); + return frag_b; +} + +template <> +__device__ inline typename ScalarType::FragB +dequant_8bit(int q) { + // Constants for FP8 (E4M3) and BF16 formats + constexpr int FP8_EXPONENT = 4, FP8_MANTISSA 
= 3, BF16_EXPONENT = 8; + constexpr int RIGHT_SHIFT = BF16_EXPONENT - FP8_EXPONENT; + + // Calculate MASK for extracting mantissa and exponent + constexpr int MASK1 = 0x80000000; + constexpr int MASK2 = MASK1 >> (FP8_EXPONENT + FP8_MANTISSA); + constexpr int MASK3 = MASK2 & 0x7fffffff; + constexpr int MASK = MASK3 | (MASK3 >> 16); + // Final MASK value: 0x7F007F00 + + // Extract and shift FP8 values to BF16 format + int Out1 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT); + int Out2 = ((q << 8) & 0x80008000) | (((q << 8) & MASK) >> RIGHT_SHIFT); + + // Construct and apply exponent bias + constexpr int BIAS_OFFSET = + (1 << (BF16_EXPONENT - 1)) - (1 << (FP8_EXPONENT - 1)); + // Add 127 (float exponent bias) to BIAS_OFFSET and shift to float exponent + // position + constexpr uint32_t BIAS = (BIAS_OFFSET + 127) << 23; + const nv_bfloat162 bias_reg = + __float2bfloat162_rn(*reinterpret_cast(&BIAS)); + + // Convert to bfloat162 and apply bias + typename ScalarType::FragB frag_b; + // Note: reverse indexing is intentional because weights are permuted + frag_b[1] = __hmul2(*reinterpret_cast(&Out1), bias_reg); + frag_b[0] = __hmul2(*reinterpret_cast(&Out2), bias_reg); + return frag_b; +} + +// Multiply dequantized values by the corresponding quantization scale; used +// only for grouped quantization. +template +__device__ inline void scale(typename ScalarType::FragB& frag_b, + typename ScalarType::FragS& frag_s, + int i) { + using scalar_t2 = typename ScalarType::scalar_t2; + scalar_t2 s = + ScalarType::num2num2(reinterpret_cast(&frag_s)[i]); + frag_b[0] = __hmul2(frag_b[0], s); + frag_b[1] = __hmul2(frag_b[1], s); +} + +// Given 2 floats multiply by 2 scales (halves) +template +__device__ inline void scale_float(float* c, + typename ScalarType::FragS& s) { + scalar_t* s_ptr = reinterpret_cast(&s); + c[0] = __fmul_rn(c[0], ScalarType::num2float(s_ptr[0])); + c[1] = __fmul_rn(c[1], ScalarType::num2float(s_ptr[1])); +} + +// Wait until barrier reaches `count`, then lock for current threadblock. +__device__ inline void barrier_acquire(int* lock, int count) { + if (threadIdx.x == 0) { + int state = -1; + do + // Guarantee that subsequent writes by this threadblock will be visible + // globally. + asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" + : "=r"(state) + : "l"(lock)); + while (state != count); + } + __syncthreads(); +} + +// Release barrier and increment visitation count. +__device__ inline void barrier_release(int* lock, bool reset = false) { + __syncthreads(); + if (threadIdx.x == 0) { + if (reset) { + lock[0] = 0; + return; + } + int val = 1; + // Make sure that all writes since acquiring this barrier are visible + // globally, while releasing the barrier. 
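+    // This is implemented as an acq_rel fence followed by a relaxed atomic
+    // add on the lock counter, which barrier_acquire polls with an acquire
+    // load.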
+ asm volatile("fence.acq_rel.gpu;\n"); + asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n" + : + : "l"(lock), "r"(val)); + } +} + +template shared + // fetch pipeline + const int group_blocks = -1 // number of consecutive 16x16 blocks + // with a separate quantization scale + > +__global__ void Marlin( + const int4* __restrict__ A, // fp16 input matrix of shape mxk + const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn + int4* __restrict__ C, // fp16 output buffer of shape mxn + const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape + // (k/groupsize)xn + int num_groups, // number of scale groups per output channel + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int* locks // extra global storage for barrier synchronization +) { + // Each threadblock processes one "stripe" of the B matrix with (roughly) the + // same size, which might involve multiple column "slices" (of width 16 * + // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM + // example: + // 0 1 3 + // 0 2 3 + // 1 2 4 + // While this kind of partitioning makes things somewhat more complicated, it + // ensures good utilization of all SMs for many kinds of shape and GPU + // configurations, while requiring as few slow global cross-threadblock + // reductions as possible. + using Dtype = ScalarType; + using scalar_t2 = typename ScalarType::scalar_t2; + using FragA = typename ScalarType::FragA; + using FragB = typename ScalarType::FragB; + using FragC = typename ScalarType::FragC; + using FragS = typename ScalarType::FragS; + + constexpr int pack_factor = 32 / num_bits; + + // For larger GEMMs we run multiple batchsize 64 versions in parallel for a + // better partitioning with less reductions + int parallel = 1; + if (prob_m > 16 * thread_m_blocks) { + parallel = prob_m / (16 * thread_m_blocks); + prob_m = 16 * thread_m_blocks; + } + + int k_tiles = prob_k / 16 / thread_k_blocks; + int n_tiles = prob_n / 16 / thread_n_blocks; + int iters = div_ceil(k_tiles * n_tiles * parallel, gridDim.x); + + int slice_row = (iters * blockIdx.x) % k_tiles; + int slice_col_par = (iters * blockIdx.x) / k_tiles; + int slice_col = slice_col_par; + int slice_iters; // number of threadblock tiles in the current slice + int slice_count = + 0; // total number of active threadblocks in the current slice + int slice_idx; // index of threadblock in current slice; numbered bottom to + // top + + // We can easily implement parallel problem execution by just remapping + // indices and advancing global pointers + if (slice_col_par >= n_tiles) { + A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 8; + C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8; + locks += (slice_col_par / n_tiles) * n_tiles; + slice_col = slice_col_par % n_tiles; + } + + // Compute all information about the current slice which is required for + // synchronization. 
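+  // In particular, slice_count and slice_idx determine how many threadblocks
+  // take part in the global reduction for this column slice and in which
+  // order they acquire its lock.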
+ auto init_slice = [&]() { + slice_iters = + iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row); + if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0; + if (slice_iters == 0) return; + if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row; + slice_count = 1; + slice_idx = 0; + int col_first = iters * div_ceil(k_tiles * slice_col_par, iters); + if (col_first <= k_tiles * (slice_col_par + 1)) { + int col_off = col_first - k_tiles * slice_col_par; + slice_count = div_ceil(k_tiles - col_off, iters); + if (col_off > 0) slice_count++; + int delta_first = iters * blockIdx.x - col_first; + if (delta_first < 0 || (col_off == 0 && delta_first == 0)) + slice_idx = slice_count - 1; + else { + slice_idx = slice_count - 1 - delta_first / iters; + if (col_off > 0) slice_idx--; + } + } + if (slice_col == n_tiles) { + A += 16 * thread_m_blocks * prob_k / 8; + C += 16 * thread_m_blocks * prob_n / 8; + locks += n_tiles; + slice_col = 0; + } + }; + init_slice(); + + // A sizes/strides + + // stride of the A matrix in global memory + int a_gl_stride = prob_k / 8; + // stride of an A matrix tile in shared memory + constexpr int a_sh_stride = 16 * thread_k_blocks / 8; + // delta between subsequent A tiles in global memory + constexpr int a_gl_rd_delta_o = 16 * thread_k_blocks / 8; + // between subsequent accesses within a tile + int a_gl_rd_delta_i = a_gl_stride * (threads / a_gl_rd_delta_o); + // between shared memory writes + constexpr int a_sh_wr_delta = a_sh_stride * (threads / a_gl_rd_delta_o); + // between shared memory tile reads + constexpr int a_sh_rd_delta_o = 2 * ((threads / 32) / (thread_n_blocks / 4)); + // within a shared memory tile + constexpr int a_sh_rd_delta_i = a_sh_stride * 16; + // overall size of a tile + constexpr int a_sh_stage = a_sh_stride * (16 * thread_m_blocks); + // number of shared write iterations for a tile + constexpr int a_sh_wr_iters = div_ceil(a_sh_stage, a_sh_wr_delta); + + // B sizes/strides + int b_gl_stride = 16 * prob_n / (pack_factor * 4); + constexpr int b_sh_stride = ((thread_n_blocks * 16) * 16 / pack_factor) / 4; + constexpr int b_thread_vecs = num_bits == 4 ? 1 : 2; + constexpr int b_sh_stride_threads = b_sh_stride / b_thread_vecs; + + int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks; + int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride_threads); + constexpr int b_sh_wr_delta = threads * b_thread_vecs; + constexpr int b_sh_rd_delta = threads * b_thread_vecs; + constexpr int b_sh_stage = b_sh_stride * thread_k_blocks; + constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta; + + // Scale sizes/strides without act_order + int s_gl_stride = prob_n / 8; + constexpr int s_sh_stride = 16 * thread_n_blocks / 8; + + // Scale size/strides with act_order + constexpr int tb_k = 16 * thread_k_blocks; + constexpr int g_idx_stage = 0; + // constexpr int act_s_row_stride = 1; + // int act_s_col_stride = act_s_row_stride * num_groups; + int act_s_col_stride = 1; + int act_s_col_warp_stride = act_s_col_stride * 8; + int tb_n_warps = thread_n_blocks / 4; + int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps; + + // Global A read index of current thread. + int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + a_gl_rd += a_gl_rd_delta_o * slice_row; + // Shared write index of current thread. + int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + // Shared read index. 
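+  // (base index for the ldmatrix loads; the bank-conflict-avoiding XOR
+  //  transform is applied to it below via transform_a)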
+ int a_sh_rd = + a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16; + a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4)); + + int b_gl_rd = b_gl_stride * (threadIdx.x / b_sh_stride_threads) + + (threadIdx.x % b_sh_stride_threads) * b_thread_vecs; + b_gl_rd += b_sh_stride * slice_col; + b_gl_rd += b_gl_rd_delta_o * slice_row; + int b_sh_wr = threadIdx.x * b_thread_vecs; + int b_sh_rd = threadIdx.x * b_thread_vecs; + + // For act_order + int slice_k_start = tb_k * slice_row; + int slice_k_start_shared_fetch = slice_k_start; + int slice_n_offset = act_s_col_tb_stride * slice_col; + + // No act_order + int s_gl_rd = s_sh_stride * slice_col + threadIdx.x; + int s_sh_wr = threadIdx.x; + bool s_sh_wr_pred = threadIdx.x < s_sh_stride; + + // We scale a `half2` tile in row-major layout for column-wise quantization. + int s_sh_rd = + 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + (threadIdx.x % 32) % 4; + + // Precompute which thread should not read memory in which iterations; this is + // needed if there are more threads than required for a certain tilesize or + // when the batchsize is not a multiple of 16. + bool a_sh_wr_pred[a_sh_wr_iters]; + #pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) + a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m; + + // To ensure that writing and reading A tiles to/from shared memory, the + // latter in fragment format, is fully bank conflict free, we need to use a + // rather fancy XOR-based layout. The key here is that neither reads nor + // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the + // same shared memory banks. Further, it seems (based on NSight-Compute) that + // each warp must also write a consecutive memory segment? + auto transform_a = [&](int i) { + int row = i / a_gl_rd_delta_o; + return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row; + }; + // Since the computation of this remapping is non-trivial and, due to our main + // loop unrolls, all shared memory accesses are static, we simply precompute + // both transformed reads and writes. + int a_sh_wr_trans[a_sh_wr_iters]; + #pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) + a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr); + int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks]; + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) { + #pragma unroll + for (int j = 0; j < thread_m_blocks; j++) + a_sh_rd_trans[i][j] = + transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd); + } + + // Since B-accesses have non-constant stride they have to be computed at + // runtime; we break dependencies between subsequent accesses with a tile by + // maintining multiple pointers (we have enough registers), a tiny + // optimization. + const int4* B_ptr[b_sh_wr_iters]; + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) + B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd; + + extern __shared__ int4 sh[]; + // Shared memory storage for global fetch pipelines. + int4* sh_a = sh; + int4* sh_b = sh_a + (stages * a_sh_stage); + int4* sh_g_idx = sh_b + (stages * b_sh_stage); + int4* sh_s = sh_g_idx + (stages * g_idx_stage); + + // Register storage for double buffer of shared memory reads. + FragA frag_a[2][thread_m_blocks]; + I4 frag_b_quant[2][b_thread_vecs]; + FragC frag_c[thread_m_blocks][4][2]; + FragS frag_s[2][4]; + + // Zero accumulators. 
+ auto zero_accums = [&]() { + #pragma unroll + for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++) + reinterpret_cast(frag_c)[i] = 0; + }; + + int sh_first_group_id = -1; + int sh_num_groups = -1; + constexpr int sh_max_num_groups = 32; + + auto fetch_scales_to_shared = [&](bool is_async, int first_group_id, + int last_group_id) { + sh_first_group_id = first_group_id; + sh_num_groups = last_group_id - first_group_id + 1; + + if (sh_num_groups < sh_max_num_groups) { + sh_num_groups = sh_max_num_groups; + } + + if (sh_first_group_id + sh_num_groups > num_groups) { + sh_num_groups = num_groups - sh_first_group_id; + } + + int row_offset = first_group_id * s_gl_stride; + + if (is_async) { + for (int i = 0; i < sh_num_groups; i++) { + if (threadIdx.x < s_sh_stride) { + cp_async4_pred(&sh_s[(i * s_sh_stride) + threadIdx.x], + &scales_ptr[row_offset + (i * s_gl_stride) + + slice_n_offset + threadIdx.x]); + } + } + } else { + for (int i = 0; i < sh_num_groups; i++) { + if (threadIdx.x < s_sh_stride) { + sh_s[(i * s_sh_stride) + threadIdx.x] = + scales_ptr[row_offset + (i * s_gl_stride) + slice_n_offset + + threadIdx.x]; + } + } + } + }; + // Asynchronously fetch the next A, B and s tile from global to the next + // shared memory pipeline location. + auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) { + if (pred) { + int4* sh_a_stage = sh_a + a_sh_stage * pipe; + #pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) { + cp_async4_pred( + &sh_a_stage[a_sh_wr_trans[i]], + &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off], + a_sh_wr_pred[i]); + } + int4* sh_b_stage = sh_b + b_sh_stage * pipe; + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) { + #pragma unroll + for (int j = 0; j < b_thread_vecs; j++) { + cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr + j], B_ptr[i] + j); + } + + B_ptr[i] += b_gl_rd_delta_o; + } + } + // Insert a fence even when we are winding down the pipeline to ensure that + // waiting is also correct at this point. + cp_async_fence(); + }; + + // Wait until the next thread tile has been loaded to shared memory. + auto wait_for_stage = [&]() { + // We only have `stages - 2` active fetches since we are double buffering + // and can only issue the next fetch when it is guaranteed that the previous + // shared memory load is fully complete (as it may otherwise be + // overwritten). + cp_async_wait(); + __syncthreads(); + }; + + // Load the next sub-tile from the current location in the shared memory pipe + // into the current register buffer. + auto fetch_to_registers = [&](int k, int pipe) { + int4* sh_a_stage = sh_a + a_sh_stage * pipe; + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) + ldsm4(frag_a[k % 2][i], + &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]); + int4* sh_b_stage = sh_b + b_sh_stage * pipe; + + #pragma unroll + for (int i = 0; i < b_thread_vecs; i++) { + frag_b_quant[k % 2][i] = *reinterpret_cast( + &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd + i]); + } + }; + + bool is_same_group[stages]; + int same_group_id[stages]; + + auto init_same_group = [&](int pipe) { + is_same_group[pipe] = false; + same_group_id[pipe] = 0; + return; + }; + + // Execute the actual tensor core matmul of a sub-tile. + auto matmul = [&](int k) { + // We have the m dimension as the inner loop in order to encourage overlapping + // dequantization and matmul operations. 
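+    // Each j iteration dequantizes two packed 32-bit words (4 FP8 values
+    // each) and issues one mma per m block for each resulting fragment.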
+ #pragma unroll + for (int j = 0; j < 4; j++) { + FragB frag_b0; + FragB frag_b1; + + int* frag_b_quant_ptr = reinterpret_cast(frag_b_quant[k % 2]); + int b_quant_0 = frag_b_quant_ptr[j * 2 + 0]; + int b_quant_1 = frag_b_quant_ptr[j * 2 + 1]; + + frag_b0 = dequant_8bit(b_quant_0); + frag_b1 = dequant_8bit(b_quant_1); + + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { + mma(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]); + mma(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]); + } + } + }; + + // Since we slice across the k dimension of a tile in order to increase the + // number of warps while keeping the n dimension of a tile reasonable, we have + // multiple warps that accumulate their partial sums of the same output + // location; which we have to reduce over in the end. We do in shared memory. + auto thread_block_reduce = [&]() { + constexpr int red_off = threads / b_sh_stride_threads / 2; + if (red_off >= 1) { + int red_idx = threadIdx.x / b_sh_stride_threads; + constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2; + constexpr int red_sh_delta = b_sh_stride_threads; + int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) + + (threadIdx.x % b_sh_stride_threads); + + // Parallel logarithmic shared memory reduction. We make sure to avoid any + // unnecessary read or write iterations, e.g., for two warps we write only + // once by warp 1 and read only once by warp 0. + + #pragma unroll + for (int m_block = 0; m_block < thread_m_blocks; m_block++) { + #pragma unroll + for (int i = red_off; i > 0; i /= 2) { + if (i <= red_idx && red_idx < 2 * i) { + #pragma unroll + for (int j = 0; j < 4 * 2; j++) { + int red_sh_wr = + red_sh_delta * j + (red_sh_rd - red_sh_stride * i); + if (i < red_off) { + float* c_rd = + reinterpret_cast(&sh[red_sh_delta * j + red_sh_rd]); + float* c_wr = reinterpret_cast(&sh[red_sh_wr]); + #pragma unroll + for (int k = 0; k < 4; k++) + reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += + c_rd[k] + c_wr[k]; + } + sh[red_sh_wr] = + reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; + } + } + __syncthreads(); + } + if (red_idx == 0) { + #pragma unroll + for (int i = 0; i < 4 * 2; i++) { + float* c_rd = + reinterpret_cast(&sh[red_sh_delta * i + red_sh_rd]); + #pragma unroll + for (int j = 0; j < 4; j++) + reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += + c_rd[j]; + } + } + __syncthreads(); + } + } + }; + + // Since multiple threadblocks may process parts of the same column slice, we + // finally have to globally reduce over the results. As the striped + // partitioning minimizes the number of such reductions and our outputs are + // usually rather small, we perform this reduction serially in L2 cache. + auto global_reduce = [&](bool first = false, bool last = false) { + // We are very careful here to reduce directly in the output buffer to + // maximize L2 cache utilization in this step. To do this, we write out + // results in FP16 (but still reduce with FP32 compute). 
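+    // The reduction is serialized across threadblocks of the same column
+    // slice by the lock acquired around this call (barrier_acquire /
+    // barrier_release in the main loop).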
+ constexpr int active_threads = 32 * thread_n_blocks / 4; + if (threadIdx.x < active_threads) { + int c_gl_stride = prob_n / 8; + int c_gl_wr_delta_o = 8 * c_gl_stride; + int c_gl_wr_delta_i = 4 * (active_threads / 32); + int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) + + 4 * (threadIdx.x / 32) + threadIdx.x % 4; + c_gl_wr += (2 * thread_n_blocks) * slice_col; + constexpr int c_sh_wr_delta = active_threads; + int c_sh_wr = threadIdx.x; + + int row = (threadIdx.x % 32) / 4; + + if (!first) { + // Interestingly, doing direct global accesses here really seems to mess up + // the compiler and lead to slowdowns, hence we also use async-copies even + // though these fetches are not actually asynchronous. + #pragma unroll + for (int i = 0; i < thread_m_blocks * 4; i++) { + cp_async4_pred( + &sh[c_sh_wr + c_sh_wr_delta * i], + &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + + c_gl_wr_delta_i * (i % 2)], + i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m); + } + cp_async_fence(); + cp_async_wait<0>(); + } + + #pragma unroll + for (int i = 0; i < thread_m_blocks * 4; i++) { + if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) { + if (!first) { + int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta]; + #pragma unroll + for (int j = 0; j < 2 * 4; j++) { + reinterpret_cast( + &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] += + Dtype::num2float(reinterpret_cast(&c_red)[j]); + } + } + if (!last) { + int4 c; + #pragma unroll + for (int j = 0; j < 2 * 4; j++) { + reinterpret_cast(&c)[j] = + Dtype::float2num(reinterpret_cast( + &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]); + } + C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] = + c; + } + } + } + } + }; + + // Write out the reduce final result in the correct layout. We only actually + // reshuffle matrix fragments in this step, the reduction above is performed + // in fragment layout. 
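+  // Results are staged through shared memory as scalar_t2 pairs so that the
+  // final writes to C are coalesced int4 stores.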
+ auto write_result = [&]() { + int c_gl_stride = prob_n / 8; + constexpr int c_sh_stride = 2 * thread_n_blocks + 1; + int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks)); + constexpr int c_sh_rd_delta = + c_sh_stride * (threads / (2 * thread_n_blocks)); + + int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) + + (threadIdx.x % (2 * thread_n_blocks)); + c_gl_wr += (2 * thread_n_blocks) * slice_col; + int c_sh_wr = + (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4; + c_sh_wr += 32 * (threadIdx.x / 32); + int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) + + (threadIdx.x % (2 * thread_n_blocks)); + + int c_gl_wr_end = c_gl_stride * prob_m; + + // We first reorder in shared memory to guarantee the most efficient final + // global write patterns + auto write = [&](int idx, float c0, float c1, FragS& s) { + scalar_t2 res = + Dtype::nums2num2(Dtype::float2num(c0), Dtype::float2num(c1)); + + ((scalar_t2*)sh)[idx] = res; + }; + + if (threadIdx.x / 32 < thread_n_blocks / 4) { + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { + #pragma unroll + for (int j = 0; j < 4; j++) { + int wr = c_sh_wr + 8 * j; + write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0], + frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]); + write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2], + frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]); + write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0], + frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]); + write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2], + frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]); + } + c_sh_wr += 16 * (4 * c_sh_stride); + } + } + __syncthreads(); + + #pragma unroll + for (int i = 0; + i < div_ceil(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); + i++) { + if (c_gl_wr < c_gl_wr_end) { + C[c_gl_wr] = sh[c_sh_rd]; + c_gl_wr += c_gl_wr_delta; + c_sh_rd += c_sh_rd_delta; + } + } + }; + + // Start global fetch and register load pipelines. + auto start_pipes = [&]() { + + #pragma unroll + for (int i = 0; i < stages - 1; i++) { + fetch_to_shared(i, i, i < slice_iters); + } + + zero_accums(); + wait_for_stage(); + init_same_group(0); + fetch_to_registers(0, 0); + a_gl_rd += a_gl_rd_delta_o * (stages - 1); + slice_k_start_shared_fetch += tb_k * (stages - 1); + }; + if (slice_iters) { + start_pipes(); + } + + // Main loop. + while (slice_iters) { + // We unroll over both the global fetch and the register load pipeline to + // ensure all shared memory accesses are static. Note that both pipelines + // have even length meaning that the next iteration will always start at + // index 0. + + #pragma unroll + for (int pipe = 0; pipe < stages;) { + #pragma unroll + for (int k = 0; k < b_sh_wr_iters; k++) { + fetch_to_registers(k + 1, pipe % stages); + if (k == b_sh_wr_iters - 2) { + fetch_to_shared((pipe + stages - 1) % stages, pipe, + slice_iters >= stages); + pipe++; + wait_for_stage(); + init_same_group(pipe % stages); + } + matmul(k); + } + slice_iters--; + if (slice_iters == 0) { + break; + } + } + + a_gl_rd += a_gl_rd_delta_o * stages; + slice_k_start += tb_k * stages; + slice_k_start_shared_fetch += tb_k * stages; + + // Process results and, if necessary, proceed to the next column slice. + // While this pattern may not be the most readable, other ways of writing + // the loop seemed to noticeably worse performance after compilation. 
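+    // slice_iters == 0 means this threadblock has finished its part of the
+    // current column slice and runs the epilogue: load per-column scales,
+    // reduce, and (for the last block in the slice) write out the result.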
+ if (slice_iters == 0) { + cp_async_wait<0>(); + bool last = slice_idx == slice_count - 1; + // For per-column scales, we only fetch them here in the final step before + // write-out + if (s_sh_wr_pred) { + cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]); + } + cp_async_fence(); + + thread_block_reduce(); + + cp_async_wait<0>(); + __syncthreads(); + if (threadIdx.x / 32 < thread_n_blocks / 4) { + reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd + 0]; + reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; + } + + // For 8-bit channelwise, we apply the scale before the global reduction + // that converts the fp32 results to fp16 (so that we avoid possible + // overflow in fp16) + if (threadIdx.x / 32 < thread_n_blocks / 4) { + #pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { + #pragma unroll + for (int j = 0; j < 4; j++) { + scale_float(reinterpret_cast(&frag_c[i][j][0][0]), + frag_s[j / 2][2 * (j % 2) + 0]); + scale_float(reinterpret_cast(&frag_c[i][j][0][2]), + frag_s[j / 2][2 * (j % 2) + 0]); + + scale_float(reinterpret_cast(&frag_c[i][j][1][0]), + frag_s[j / 2][2 * (j % 2) + 1]); + scale_float(reinterpret_cast(&frag_c[i][j][1][2]), + frag_s[j / 2][2 * (j % 2) + 1]); + } + } + } + + if (slice_count > 1) { // only globally reduce if there is more than one + // block in a slice + barrier_acquire(&locks[slice_col], slice_idx); + global_reduce(slice_idx == 0, last); + barrier_release(&locks[slice_col], last); + } + if (last) // only the last block in a slice actually writes the result + write_result(); + slice_row = 0; + slice_col_par++; + slice_col++; + init_slice(); + if (slice_iters) { + a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) + B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles; + if (slice_col == 0) { + #pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride; + } + + // Update slice k/n for scales loading + s_gl_rd = s_sh_stride * slice_col + threadIdx.x; + + start_pipes(); + } + } + } +} + + #define __CALL_IF(NUM_BITS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, \ + THREAD_K_BLOCKS, GROUP_BLOCKS, NUM_THREADS) \ + else if (num_bits == NUM_BITS && thread_m_blocks == THREAD_M_BLOCKS && \ + thread_n_blocks == THREAD_N_BLOCKS && \ + thread_k_blocks == THREAD_K_BLOCKS && \ + group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS) { \ + cudaFuncSetAttribute( \ + Marlin, \ + cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \ + Marlin \ + <<>>( \ + A_ptr, B_ptr, C_ptr, s_ptr, num_groups, prob_m, prob_n, prob_k, \ + locks); \ + } + +typedef struct { + int thread_k; + int thread_n; + int num_threads; +} thread_config_t; + +typedef struct { + int max_m_blocks; + thread_config_t tb_cfg; +} exec_config_t; + +thread_config_t small_batch_thread_configs[] = { + // Ordered by priority + + // thread_k, thread_n, num_threads + {128, 128, 256}, + {64, 128, 128}, + {128, 64, 128}, +}; + +thread_config_t large_batch_thread_configs[] = { + // Ordered by priority + + // thread_k, thread_n, num_threads + {64, 256, 256}, + {64, 128, 128}, + {128, 64, 128}, + +}; + +int get_scales_cache_size(thread_config_t const& th_config, int prob_m, + int prob_n, int prob_k, int num_bits, + int group_size) { + int tb_n = th_config.thread_n; + + // Get max scale groups per thread-block + // Fixed for channelwise + int tb_groups = 1; + int tb_scales = tb_groups * tb_n * 2; + + return tb_scales * pipe_stages; +} + +bool is_valid_cache_size(thread_config_t const& th_config, int 
max_m_blocks, + int prob_m, int prob_n, int prob_k, int num_bits, + int scales_cache_size, int max_shared_mem) { + int pack_factor = 32 / num_bits; + + // Get B size + int tb_k = th_config.thread_k; + int tb_n = th_config.thread_n; + + int b_size = (tb_k * tb_n / pack_factor) * 4; + + // Get A size + int m_blocks = div_ceil(prob_m, 16); + int tb_max_m = 16; + + while (true) { + if (m_blocks >= max_m_blocks) { + tb_max_m *= max_m_blocks; + break; + } + + max_m_blocks--; + if (max_m_blocks == 0) { + TORCH_CHECK(false, "Unexpected m_blocks = ", m_blocks); + } + } + + int a_size = (tb_max_m * tb_k) * 2; + + float pipe_size = (a_size + b_size) * pipe_stages; + + TORCH_CHECK(max_shared_mem / 2 > scales_cache_size); // Sanity + + return pipe_size < 0.95f * (max_shared_mem - scales_cache_size); +} + +bool is_valid_config(thread_config_t const& th_config, int max_m_blocks, + int prob_m, int prob_n, int prob_k, int num_bits, + int group_size, int max_shared_mem) { + // Sanity + if (th_config.thread_k == -1 || th_config.thread_n == -1 || + th_config.num_threads == -1) { + return false; + } + + // Verify K/N are divisible by thread K/N + if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) { + return false; + } + + // Verify min for thread K/N + if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) { + return false; + } + + // num_threads must be at least 128 (= 4 warps) + if (th_config.num_threads < 128) { + return false; + } + + // Determine cache for scales + int scales_cache_size = get_scales_cache_size(th_config, prob_m, prob_n, + prob_k, num_bits, group_size); + + // Check that pipeline fits into cache + if (!is_valid_cache_size(th_config, max_m_blocks, prob_m, prob_n, prob_k, + num_bits, scales_cache_size, max_shared_mem)) { + return false; + } + + return true; +} + +exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k, + int num_bits, int group_size, + int max_shared_mem) { + int max_m_blocks = 4; + while (max_m_blocks > 0) { + if (prob_m <= 16) { + for (auto th_config : small_batch_thread_configs) { + if (is_valid_config(th_config, max_m_blocks, prob_m, prob_n, prob_k, + num_bits, group_size, max_shared_mem)) { + return exec_config_t{max_m_blocks, th_config}; + } + } + } else { + for (auto th_config : large_batch_thread_configs) { + if (is_valid_config(th_config, max_m_blocks, prob_m, prob_n, prob_k, + num_bits, group_size, max_shared_mem)) { + return exec_config_t{max_m_blocks, th_config}; + } + } + } + + max_m_blocks--; // Process less M blocks per invocation to reduce cache + // usage + } + + return exec_config_t{0, {-1, -1, -1}}; +} + + #define CALL_IF(NUM_BITS, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ + __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) + +template +void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s, int prob_m, + int prob_n, int prob_k, void* workspace, int num_bits, + int num_groups, int group_size, int dev, + cudaStream_t stream, int thread_k, int thread_n, int sms, + int max_par) { + TORCH_CHECK(num_bits == 8, "num_bits must be 8. 
Got = ", num_bits); + TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m, + ", ", prob_n, ", ", prob_k, "]"); + + int tot_m = prob_m; + int tot_m_blocks = div_ceil(tot_m, 16); + int pad = 16 * tot_m_blocks - tot_m; + + if (sms == -1) { + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev); + } + + int max_shared_mem = 0; + cudaDeviceGetAttribute(&max_shared_mem, + cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); + TORCH_CHECK(max_shared_mem > 0); + + // Set thread config + exec_config_t exec_cfg; + if (thread_k != -1 && thread_n != -1) { + // User-defined config + exec_cfg = + exec_config_t{4, thread_config_t{thread_k, thread_n, default_threads}}; + } else { + // Auto config + exec_cfg = determine_thread_config(prob_m, prob_n, prob_k, num_bits, + group_size, max_shared_mem); + } + + TORCH_CHECK( + exec_cfg.max_m_blocks > 0 && + is_valid_config(exec_cfg.tb_cfg, exec_cfg.max_m_blocks, prob_m, + prob_n, prob_k, num_bits, group_size, max_shared_mem), + "Invalid thread config: max_m_blocks = ", exec_cfg.max_m_blocks, + ", thread_k = ", exec_cfg.tb_cfg.thread_k, + ", thread_n = ", exec_cfg.tb_cfg.thread_n, + ", num_threads = ", exec_cfg.tb_cfg.num_threads, " for MKN = [", prob_m, + ", ", prob_k, ", ", prob_n, "] and num_bits = ", num_bits, + ", group_size = ", group_size, ", max_shared_mem = ", max_shared_mem); + + int num_threads = exec_cfg.tb_cfg.num_threads; + thread_k = exec_cfg.tb_cfg.thread_k; + thread_n = exec_cfg.tb_cfg.thread_n; + + int thread_k_blocks = thread_k / 16; + int thread_n_blocks = thread_n / 16; + + int blocks = sms; + + TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n, + " is not divisible by thread_n = ", thread_n); + TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k, + " is not divisible by thread_k = ", thread_k); + + int group_blocks = -1; + + const int4* A_ptr = (const int4*)A; + const int4* B_ptr = (const int4*)B; + int4* C_ptr = (int4*)C; + const int4* s_ptr = (const int4*)s; + + int* locks = (int*)workspace; + + // Main loop + for (int i = 0; i < tot_m_blocks; i += exec_cfg.max_m_blocks) { + int thread_m_blocks = tot_m_blocks - i; + prob_m = tot_m - 16 * i; + int par = 1; + if (thread_m_blocks > exec_cfg.max_m_blocks) { + // Note that parallel > 1 currently only works for inputs without any + // padding + par = (16 * thread_m_blocks - pad) / (16 * exec_cfg.max_m_blocks); + if (par > max_par) par = max_par; + prob_m = (16 * exec_cfg.max_m_blocks) * par; + i += exec_cfg.max_m_blocks * (par - 1); + thread_m_blocks = exec_cfg.max_m_blocks; + } + + // Define kernel configurations + if (false) { + } + CALL_IF(8, 32, 2, 256) + CALL_IF(8, 16, 4, 256) + CALL_IF(8, 8, 8, 256) + CALL_IF(8, 8, 4, 128) + CALL_IF(8, 4, 8, 128) + else { + TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " + + str(prob_n) + ", " + str(prob_k) + "]" + + ", num_groups = " + str(num_groups) + + ", group_size = " + str(group_size) + + ", thread_m_blocks = " + str(thread_m_blocks) + + ", thread_n_blocks = " + str(thread_n_blocks) + + ", thread_k_blocks = " + str(thread_k_blocks)); + } + + A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par; + C_ptr += 16 * thread_m_blocks * (prob_n / 8) * par; + } +} + +} // namespace fp8_marlin + +torch::Tensor fp8_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, + torch::Tensor& b_scales, torch::Tensor& workspace, + int64_t num_bits, int64_t size_m, int64_t size_n, + int64_t size_k) { + // Verify num_bits + TORCH_CHECK(num_bits == 8, "num_bits must be 8. 
Got = ", num_bits); + int pack_factor = 32 / num_bits; + + // Verify A + TORCH_CHECK(a.size(0) == size_m, "Shape mismatch: a.size(0) = ", a.size(0), + ", size_m = ", size_m); + TORCH_CHECK(a.size(1) == size_k, "Shape mismatch: a.size(1) = ", a.size(1), + ", size_k = ", size_k); + + // Verify B + TORCH_CHECK(size_k % gptq_marlin::tile_size == 0, "size_k = ", size_k, + " is not divisible by tile_size = ", gptq_marlin::tile_size); + TORCH_CHECK((size_k / gptq_marlin::tile_size) == b_q_weight.size(0), + "Shape mismatch: b_q_weight.size(0) = ", b_q_weight.size(0), + ", size_k = ", size_k, ", tile_size = ", gptq_marlin::tile_size); + TORCH_CHECK(b_q_weight.size(1) % gptq_marlin::tile_size == 0, + "b_q_weight.size(1) = ", b_q_weight.size(1), + " is not divisible by tile_size = ", gptq_marlin::tile_size); + int actual_size_n = + (b_q_weight.size(1) / gptq_marlin::tile_size) * pack_factor; + TORCH_CHECK(size_n == actual_size_n, "size_n = ", size_n, + ", actual_size_n = ", actual_size_n); + + // Verify device and strides + TORCH_CHECK(a.device().is_cuda(), "A is not on GPU"); + TORCH_CHECK(a.is_contiguous(), "A is not contiguous"); + + TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU"); + TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous"); + + TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU"); + TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous"); + + // Alloc buffers + const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); + auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device()); + torch::Tensor c = torch::empty({size_m, size_n}, options); + + // thread_k: `k` size of a thread_tile in `weights` (can usually be left as + // auto -1) + int thread_k = -1; + // thread_n: `n` size of a thread_tile in `weights` (can usually be left as + // auto -1) + int thread_n = -1; + // sms: number of SMs to use for the kernel (can usually be left as auto -1) + int sms = -1; + + // Detect groupsize and act_order + int num_groups = -1; + int group_size = -1; + + int b_rank = b_scales.sizes().size(); + TORCH_CHECK(b_rank == 2, "b_scales rank = ", b_rank, " is not 2"); + TORCH_CHECK(b_scales.size(1) == size_n, "b_scales dim 1 = ", b_scales.size(1), + " is not size_n = ", size_n); + // Channelwise only for FP8 + TORCH_CHECK(b_scales.size(0) == 1) + num_groups = b_scales.size(0); + + // Verify workspace size + TORCH_CHECK( + size_n % gptq_marlin::min_thread_n == 0, "size_n = ", size_n, + ", is not divisible by min_thread_n = ", gptq_marlin::min_thread_n); + int min_workspace_size = + (size_n / gptq_marlin::min_thread_n) * gptq_marlin::max_par; + TORCH_CHECK(workspace.numel() >= min_workspace_size, + "workspace.numel = ", workspace.numel(), + " is below min_workspace_size = ", min_workspace_size); + + int dev = a.get_device(); + if (a.scalar_type() == at::ScalarType::Half) { + fp8_marlin::marlin_mm_f16i4( + a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), + b_scales.data_ptr(), size_m, size_n, size_k, + workspace.data_ptr(), num_bits, num_groups, group_size, dev, + at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, + gptq_marlin::max_par); + } else if (a.scalar_type() == at::ScalarType::BFloat16) { + fp8_marlin::marlin_mm_f16i4( + a.data_ptr(), b_q_weight.data_ptr(), + c.data_ptr(), b_scales.data_ptr(), size_m, + size_n, size_k, workspace.data_ptr(), num_bits, num_groups, group_size, + dev, at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, + gptq_marlin::max_par); + } else { + TORCH_CHECK(false, 
"fp8_marlin_gemm only supports bfloat16 and float16"); + } + + return c; +} + +#endif diff --git a/docs/source/_templates/sections/header.html b/docs/source/_templates/sections/header.html new file mode 100644 index 0000000000000..cd5c4053e225f --- /dev/null +++ b/docs/source/_templates/sections/header.html @@ -0,0 +1,38 @@ + + +
+You are viewing the latest developer preview docs. Click here to view docs for the latest stable release.
diff --git a/docs/source/dev/input_processing/input_processing_pipeline.rst b/docs/source/dev/input_processing/input_processing_pipeline.rst new file mode 100644 index 0000000000000..e0c773781115f --- /dev/null +++ b/docs/source/dev/input_processing/input_processing_pipeline.rst @@ -0,0 +1,20 @@ +.. _input_processing_pipeline: + +Input Processing Pipeline +========================= + +1. Input data is passed to :class:`~vllm.LLMEngine` (or :class:`~vllm.AsyncLLMEngine`). + +2. Tokenize the data if necessary. + +3. Process the inputs using :meth:`INPUT_REGISTRY.process_input `. + + - For example, add placeholder tokens to reserve KV cache for multi-modal embeddings. + +4. Send the processed inputs to :class:`~vllm.executor.executor_base.ExecutorBase`. + +5. Distribute the inputs via :class:`~vllm.worker.worker_base.WorkerBase` to :class:`~vllm.worker.model_runner_base.ModelRunnerBase`. + +6. If the data contains multi-modal data, convert it into keyword arguments using :meth:`MULTIMODAL_REGISTRY.map_input `. + + - For example, convert a :class:`PIL.Image.Image` input to its pixel values for a vision language model. diff --git a/docs/source/dev/input_processing/model_inputs_index.rst b/docs/source/dev/input_processing/model_inputs_index.rst new file mode 100644 index 0000000000000..5d895837590ba --- /dev/null +++ b/docs/source/dev/input_processing/model_inputs_index.rst @@ -0,0 +1,39 @@ +.. _input_processing: + +Input Processing +================ + +.. currentmodule:: vllm.inputs + +Each model can override parts of vLLM's :ref:`input processing pipeline ` via +:data:`~vllm.inputs.INPUT_REGISTRY` and :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`. + +Currently, this mechanism is only utilized in :ref:`multi-modal ` models for preprocessing multi-modal input +data in addition to input prompt, but it can be extended to text-only language models when needed. + +Guides +++++++ + +.. toctree:: + :maxdepth: 1 + + input_processing_pipeline + +Module Contents ++++++++++++++++ + +LLM Engine Inputs +----------------- + +.. autoclass:: vllm.inputs.LLMInputs + :members: + :show-inheritance: + +Registry +-------- + +.. autodata:: vllm.inputs.INPUT_REGISTRY + +.. automodule:: vllm.inputs.registry + :members: + :show-inheritance: diff --git a/docs/source/getting_started/openvino-installation.rst b/docs/source/getting_started/openvino-installation.rst new file mode 100644 index 0000000000000..0d8e0b680ff0d --- /dev/null +++ b/docs/source/getting_started/openvino-installation.rst @@ -0,0 +1,95 @@ +.. _installation_openvino: + +Installation with OpenVINO +========================== + +vLLM powered by OpenVINO supports all LLM models from :doc:`vLLM supported models list <../models/supported_models>` and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support. OpenVINO vLLM backend supports the following advanced vLLM features: + +- Prefix caching (``--enable-prefix-caching``) +- Chunked prefill (``--enable-chunked-prefill``) + +**Table of contents**: + +- :ref:`Requirements ` +- :ref:`Quick start using Dockerfile ` +- :ref:`Build from source ` +- :ref:`Performance tips ` +- :ref:`Limitations ` + +.. _openvino_backend_requirements: + +Requirements +------------ + +* OS: Linux +* Instruction set architecture (ISA) requirement: at least AVX2. + +.. _openvino_backend_quick_start_dockerfile: + +Quick start using Dockerfile +---------------------------- + +.. code-block:: console + + $ docker build -f Dockerfile.openvino -t vllm-openvino-env . + $ docker run -it --rm vllm-openvino-env + +.. 
_install_openvino_backend_from_source: + +Install from source +------------------- + +- First, install Python. For example, on Ubuntu 22.04, you can run: + + .. code-block:: console + + $ sudo apt-get update -y + $ sudo apt-get install python3 + +- Second, install prerequisites vLLM OpenVINO backend installation: + + .. code-block:: console + + $ pip install --upgrade pip + $ pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu + +- Finally, install vLLM with OpenVINO backend: + + .. code-block:: console + + $ PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE=openvino python -m pip install -v . + +.. _openvino_backend_performance_tips: + +Performance tips +---------------- + +vLLM OpenVINO backend uses the following environment variables to control behavior: + +- ``VLLM_OPENVINO_KVCACHE_SPACE`` to specify the KV Cache size (e.g, ``VLLM_OPENVINO_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. + +- ``VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8`` to control KV cache precision. By default, FP16 / BF16 is used depending on platform. + +- ``VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON`` to enable U8 weights compression during model loading stage. By default, compression is turned off. + +To enable better TPOT / TTFT latency, you can use vLLM's chunked prefill feature (``--enable-chunked-prefill``). Based on the experiments, the recommended batch size is ``256`` (``--max-num-batched-tokens``) + +OpenVINO best known configuration is: + +.. code-block:: console + + $ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \ + python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256 + +.. _openvino_backend_limitations: + +Limitations +----------- + +- LoRA serving is not supported. + +- Only LLM models are currently supported. LLaVa and encoder-decoder models are not currently enabled in vLLM OpenVINO integration. + +- Tensor and pipeline parallelism are not currently enabled in vLLM integration. + +- Speculative sampling is not tested within vLLM integration. diff --git a/docs/source/models/enabling_multimodal_inputs.rst b/docs/source/models/enabling_multimodal_inputs.rst new file mode 100644 index 0000000000000..20be920b5f699 --- /dev/null +++ b/docs/source/models/enabling_multimodal_inputs.rst @@ -0,0 +1,147 @@ +.. _enabling_multimodal_inputs: + +Enabling Multimodal Inputs +========================== + +This document walks you through the steps to extend a vLLM model so that it accepts :ref:`multi-modal ` inputs. + +.. seealso:: + :ref:`adding_a_new_model` + + +1. Update the base vLLM model +----------------------------- + +It is assumed that you have already implemented the model in vLLM according to :ref:`these steps `. +Further update the model as follows: + +- Implement the :class:`~vllm.model_executor.models.interfaces.SupportsVision` interface. + + .. code-block:: diff + + + from vllm.model_executor.models.interfaces import SupportsVision + + - class YourModelForImage2Seq(nn.Module): + + class YourModelForImage2Seq(nn.Module, SupportsVision): + + .. 
note:: + The model class does not have to be named :code:`*ForCausalLM`. + Check out `the HuggingFace Transformers documentation `__ for some examples. + +- If you haven't already done so, reserve a keyword parameter in :meth:`~torch.nn.Module.forward` + for each input tensor that corresponds to a multi-modal input, as shown in the following example: + + .. code-block:: diff + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + + pixel_values: torch.Tensor, + ) -> SamplerOutput: + + +2. Register input mappers +------------------------- + +For each modality type that the model accepts as input, decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_input_mapper `. +This decorator accepts a function that maps multi-modal inputs to the keyword arguments you have previously defined in :meth:`~torch.nn.Module.forward`. + +.. code-block:: diff + + from vllm.model_executor.models.interfaces import SupportsVision + + from vllm.multimodal import MULTIMODAL_REGISTRY + + + @MULTIMODAL_REGISTRY.register_image_input_mapper() + class YourModelForImage2Seq(nn.Module, SupportsVision): + +A default mapper is available for each modality in the core vLLM library. This input mapper will be used if you do not provide your own function. + +.. seealso:: + :ref:`input_processing_pipeline` + + +3. Register maximum number of multi-modal tokens +------------------------------------------------ + +For each modality type that the model accepts as input, calculate the maximum possible number of tokens +and register it via :meth:`INPUT_REGISTRY.register_dummy_data `. + +.. code-block:: diff + + from vllm.inputs import INPUT_REGISTRY + from vllm.model_executor.models.interfaces import SupportsVision + from vllm.multimodal import MULTIMODAL_REGISTRY + + @MULTIMODAL_REGISTRY.register_image_input_mapper() + + @MULTIMODAL_REGISTRY.register_max_image_tokens() + @INPUT_REGISTRY.register_dummy_data() + class YourModelForImage2Seq(nn.Module, SupportsVision): + +Here are some examples: + +- Image inputs (static feature size): `LLaVA-1.5 Model `__ +- Image inputs (dynamic feature size): `LLaVA-NeXT Model `__ + +.. seealso:: + :ref:`input_processing_pipeline` + + +4. (Optional) Register dummy data +--------------------------------- + +During startup, dummy data is passed to the vLLM model to allocate memory. This only consists of text input by default, which may not be applicable to multi-modal models. +In such cases, you can define your own dummy data by registering a factory method via :meth:`INPUT_REGISTRY.register_dummy_data `. + +.. code-block:: diff + + from vllm.inputs import INPUT_REGISTRY + from vllm.model_executor.models.interfaces import SupportsVision + from vllm.multimodal import MULTIMODAL_REGISTRY + + @MULTIMODAL_REGISTRY.register_image_input_mapper() + @MULTIMODAL_REGISTRY.register_max_image_tokens() + + @INPUT_REGISTRY.register_dummy_data() + class YourModelForImage2Seq(nn.Module, SupportsVision): + +.. note:: + The dummy data should have the maximum possible number of multi-modal tokens, as described in the previous step. + +Here are some examples: + +- Image inputs (static feature size): `LLaVA-1.5 Model `__ +- Image inputs (dynamic feature size): `LLaVA-NeXT Model `__ + +.. seealso:: + :ref:`input_processing_pipeline` + + +5. 
(Optional) Register input processor +-------------------------------------- + +Sometimes, there is a need to process inputs at the :class:`~vllm.LLMEngine` level before they are passed to the model executor. +This is often due to the fact that unlike implementations in HuggingFace Transformers, the reshaping and/or expansion of multi-modal embeddings needs to take place outside model's :meth:`~torch.nn.Module.forward` call. +You can register input processors via :meth:`INPUT_REGISTRY.register_input_processor `. + +.. code-block:: diff + + from vllm.inputs import INPUT_REGISTRY + from vllm.model_executor.models.interfaces import SupportsVision + from vllm.multimodal import MULTIMODAL_REGISTRY + + @MULTIMODAL_REGISTRY.register_image_input_mapper() + @MULTIMODAL_REGISTRY.register_max_image_tokens() + @INPUT_REGISTRY.register_dummy_data() + + @INPUT_REGISTRY.register_input_processor() + class YourModelForImage2Seq(nn.Module, SupportsVision): + +A common use case of input processors is inserting placeholder tokens to leverage the vLLM framework for attention mask generation. +Here are some examples: + +- Insert static number of image tokens: `LLaVA-1.5 Model `__ +- Insert dynamic number of image tokens: `LLaVA-NeXT Model `__ + +.. seealso:: + :ref:`input_processing_pipeline` diff --git a/docs/source/quantization/supported_hardware.rst b/docs/source/quantization/supported_hardware.rst new file mode 100644 index 0000000000000..ecc330d866dbd --- /dev/null +++ b/docs/source/quantization/supported_hardware.rst @@ -0,0 +1,30 @@ +.. _supported_hardware_for_quantization: + +Supported Hardware for Quantization Kernels +=========================================== + +The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: + +============== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== +Implementation Volta Turing Ampere Ada Hopper AMD GPU Intel GPU x86 CPU AWS Inferentia Google TPU +============== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== +AQLM ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +AWQ ❌ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +DeepSpeedFP ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +FP8 ❌ ❌ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +Marlin ❌ ❌ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +GPTQ ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +SqueezeLLM ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +bitsandbytes ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +============== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== + +Notes: +^^^^^^ + +- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. +- "✅" indicates that the quantization method is supported on the specified hardware. +- "❌" indicates that the quantization method is not supported on the specified hardware. + +Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. + +For the most up-to-date information on hardware support and quantization methods, please check the `quantization directory `_ or consult with the vLLM development team. \ No newline at end of file diff --git a/docs/source/serving/faq.rst b/docs/source/serving/faq.rst new file mode 100644 index 0000000000000..7b0374be8adff --- /dev/null +++ b/docs/source/serving/faq.rst @@ -0,0 +1,12 @@ +Frequently Asked Questions +=========================== + + Q: How can I serve multiple models on a single port using the OpenAI API? 
+ +A: Assuming that you're referring to using OpenAI compatible server to serve multiple models at once, that is not currently supported, you can run multiple instances of the server (each serving a different model) at the same time, and have another layer to route the incoming request to the correct server accordingly. + +---------------------------------------- + + Q: Which model to use for offline inference embedding? + +A: If you want to use an embedding model, try: https://huggingface.co/intfloat/e5-mistral-7b-instruct. Instead models, such as Llama-3-8b, Mistral-7B-Instruct-v0.3, are generation models rather than an embedding model diff --git a/examples/llava_next_example.py b/examples/llava_next_example.py new file mode 100644 index 0000000000000..fd53a6def1a13 --- /dev/null +++ b/examples/llava_next_example.py @@ -0,0 +1,36 @@ +from io import BytesIO + +import requests +from PIL import Image + +from vllm import LLM, SamplingParams + + +def run_llava_next(): + llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=4096) + + prompt = "[INST] \nWhat is shown in this image? [/INST]" + url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg" + image = Image.open(BytesIO(requests.get(url).content)) + sampling_params = SamplingParams(temperature=0.8, + top_p=0.95, + max_tokens=100) + + outputs = llm.generate( + { + "prompt": prompt, + "multi_modal_data": { + "image": image + } + }, + sampling_params=sampling_params) + + generated_text = "" + for o in outputs: + generated_text += o.outputs[0].text + + print(f"LLM output:{generated_text}") + + +if __name__ == "__main__": + run_llava_next() diff --git a/examples/offline_inference_mlpspeculator.py b/examples/offline_inference_mlpspeculator.py new file mode 100644 index 0000000000000..5dec4a76afb2f --- /dev/null +++ b/examples/offline_inference_mlpspeculator.py @@ -0,0 +1,58 @@ +import gc +import time +from typing import List + +from vllm import LLM, SamplingParams + + +def time_generation(llm: LLM, prompts: List[str], + sampling_params: SamplingParams): + # Generate texts from the prompts. The output is a list of RequestOutput + # objects that contain the prompt, generated text, and other information. + # Warmup first + llm.generate(prompts, sampling_params) + llm.generate(prompts, sampling_params) + start = time.time() + outputs = llm.generate(prompts, sampling_params) + end = time.time() + print((end - start) / sum([len(o.outputs[0].token_ids) for o in outputs])) + # Print the outputs. + for output in outputs: + generated_text = output.outputs[0].text + print(f"text: {generated_text!r}") + + +if __name__ == "__main__": + + template = ( + "Below is an instruction that describes a task. Write a response " + "that appropriately completes the request.\n\n### Instruction:\n{}" + "\n\n### Response:\n") + + # Sample prompts. + prompts = [ + "Write about the president of the United States.", + ] + prompts = [template.format(prompt) for prompt in prompts] + # Create a sampling params object. 
+ sampling_params = SamplingParams(temperature=0.0, max_tokens=200) + + # Create an LLM without spec decoding + llm = LLM(model="meta-llama/Llama-2-13b-chat-hf") + + print("Without speculation") + time_generation(llm, prompts, sampling_params) + + del llm + gc.collect() + + # Create an LLM with spec decoding + llm = LLM( + model="meta-llama/Llama-2-13b-chat-hf", + speculative_model="ibm-fms/llama-13b-accelerator", + # These are currently required for MLPSpeculator decoding + use_v2_block_manager=True, + ) + + print("With speculation") + time_generation(llm, prompts, sampling_params) diff --git a/examples/openai_vision_api_client.py b/examples/openai_vision_api_client.py new file mode 100644 index 0000000000000..d4d9738a1f7bc --- /dev/null +++ b/examples/openai_vision_api_client.py @@ -0,0 +1,86 @@ +"""An example showing how to use vLLM to serve VLMs. + +Launch the vLLM server with the following command: +python -m vllm.entrypoints.openai.api_server \ + --model llava-hf/llava-1.5-7b-hf \ + --chat-template template_llava.jinja +""" +import base64 + +import requests +from openai import OpenAI + +# Modify OpenAI's API key and API base to use vLLM's API server. +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, +) + +models = client.models.list() +model = models.data[0].id + +image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + +# Use image url in the payload +chat_completion_from_url = client.chat.completions.create( + messages=[{ + "role": + "user", + "content": [ + { + "type": "text", + "text": "What’s in this image?" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + }, + ], + }], + model=model, +) + +result = chat_completion_from_url.choices[0].message.content +print(f"Chat completion output:{result}") + + +# Use base64 encoded image in the payload +def encode_image_base64_from_url(image_url: str) -> str: + """Encode an image retrieved from a remote url to base64 format.""" + + with requests.get(image_url) as response: + response.raise_for_status() + result = base64.b64encode(response.content).decode('utf-8') + + return result + + +image_base64 = encode_image_base64_from_url(image_url=image_url) +chat_completion_from_base64 = client.chat.completions.create( + messages=[{ + "role": + "user", + "content": [ + { + "type": "text", + "text": "What’s in this image?" + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image_base64}" + }, + }, + ], + }], + model=model, +) + +result = chat_completion_from_base64.choices[0].message.content +print(f"Chat completion output:{result}") diff --git a/examples/paligemma_example.py b/examples/paligemma_example.py new file mode 100644 index 0000000000000..b315eafe5dda4 --- /dev/null +++ b/examples/paligemma_example.py @@ -0,0 +1,52 @@ +import os +import subprocess + +from PIL import Image + +from vllm import LLM + +# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`. 
+# You can use `.buildkite/download-images.sh` to download them + + +def run_paligemma(): + llm = LLM(model="google/paligemma-3b-mix-224") + + prompt = "caption es" + + image = Image.open("images/stop_sign.jpg") + + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": { + "image": image + }, + }) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + +def main(): + run_paligemma() + + +if __name__ == "__main__": + # Download from s3 + s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/" + local_directory = "images" + + # Make sure the local directory exists or create it + os.makedirs(local_directory, exist_ok=True) + + # Use AWS CLI to sync the directory, assume anonymous access + subprocess.check_call([ + "aws", + "s3", + "sync", + s3_bucket_path, + local_directory, + "--no-sign-request", + ]) + main() diff --git a/requirements-mamba.txt b/requirements-mamba.txt new file mode 100644 index 0000000000000..1838e87d063da --- /dev/null +++ b/requirements-mamba.txt @@ -0,0 +1,3 @@ +# Mamba dependencies +mamba-ssm>=1.2.2 +causal-conv1d>=1.2.0 diff --git a/requirements-openvino.txt b/requirements-openvino.txt new file mode 100644 index 0000000000000..e555d52572541 --- /dev/null +++ b/requirements-openvino.txt @@ -0,0 +1,9 @@ +# Common dependencies +-r requirements-common.txt + +# OpenVINO dependencies +torch >= 2.1.2 +openvino ~= 2024.3.0.dev +optimum-intel[openvino] >= 1.17.2 + +triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error. diff --git a/tests/distributed/test_multimodal_broadcast.py b/tests/distributed/test_multimodal_broadcast.py new file mode 100644 index 0000000000000..8e0e8ecd675eb --- /dev/null +++ b/tests/distributed/test_multimodal_broadcast.py @@ -0,0 +1,54 @@ +"""Compare the outputs of HF and distributed vLLM when using greedy sampling. +The second test will hang if more than one test is run per command, so we need +to run the tests one by one. The solution is to pass arguments (model name) by +environment variables. 
+ +Run: +```sh +TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf \ + test_multimodal_broadcast.py +TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct \ + test_multimodal_broadcast.py +``` +""" +import os + +import pytest + +from vllm.utils import cuda_device_count_stateless + +model = os.environ["TEST_DIST_MODEL"] + +if model.startswith("llava-hf/llava"): + from ..models.test_llava import models, run_test +elif model.startswith("microsoft/Phi-3-vision"): + from ..models.test_phi3v import models, run_test +else: + raise NotImplementedError(f"Unsupported model: {model}") + + +@pytest.mark.parametrize("tensor_parallel_size", [2]) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models(hf_runner, vllm_runner, image_assets, + tensor_parallel_size: int, dtype: str, max_tokens: int, + num_logprobs: int) -> None: + if cuda_device_count_stateless() < tensor_parallel_size: + pytest.skip( + f"Need at least {tensor_parallel_size} GPUs to run the test.") + + distributed_executor_backend = os.getenv("DISTRIBUTED_EXECUTOR_BACKEND") + + run_test( + hf_runner, + vllm_runner, + image_assets, + model=models[0], + size_factors=[1.0], + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + ) diff --git a/tests/distributed/test_parallel_state.py b/tests/distributed/test_parallel_state.py new file mode 100644 index 0000000000000..3adcf6b61046d --- /dev/null +++ b/tests/distributed/test_parallel_state.py @@ -0,0 +1,57 @@ +from typing import Any, Dict + +import pytest +import torch + +from vllm.distributed.parallel_state import (_split_tensor_dict, + _update_nested_dict) + + +def test_split_tensor_dict(): + test_dict = { + "key_a": "a", + "key_b": torch.arange(8, dtype=torch.float32), + "key_c": { + "key_1": torch.arange(5, dtype=torch.float32), + "key_2": torch.tensor([], dtype=torch.float32), + "key_3": 123, + }, + "key_d": {}, + } + metadata_list, tensor_list = _split_tensor_dict(test_dict) + assert len(metadata_list) == 6 + assert torch.allclose(tensor_list[0], test_dict["key_b"]) + assert torch.allclose(tensor_list[1], test_dict["key_c"]["key_1"]) + assert torch.allclose(tensor_list[2], test_dict["key_c"]["key_2"]) + + +def test_split_tensor_dict_invalid_key(): + test_dict = { + "a%b": "a", + } + with pytest.raises(AssertionError): + _split_tensor_dict(test_dict) + + +def test_update_nested_dict(): + flattened_keys_values = [("key1%key2%key3", "value1"), + ("key1%key2%key4", "value2"), + ("key1%key5", "value3"), ("key6%key7", "value4"), + ("key8", "value5")] + res: Dict[str, Any] = {} + + for flat_key, value in flattened_keys_values: + _update_nested_dict(res, flat_key, value) + assert res == { + "key1": { + "key2": { + "key3": "value1", + "key4": "value2" + }, + "key5": "value3" + }, + "key6": { + "key7": "value4" + }, + "key8": "value5" + } diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py new file mode 100644 index 0000000000000..6072a2dd71800 --- /dev/null +++ b/tests/distributed/test_pipeline_parallel.py @@ -0,0 +1,149 @@ +import os + +import openai # use the official client for correctness check +import pytest +# using Ray for overall ease of process management, parallel requests, +# and debugging. 
+import ray + +from ..utils import VLLM_PATH, RemoteOpenAIServer + +# downloading lora to test lora requests + +# any model with a chat template should work here +MODEL_NAME = "meta-llama/Meta-Llama-3-8B" +EAGER_MODE = bool(int(os.getenv("EAGER_MODE", 0))) +CHUNKED_PREFILL = bool(int(os.getenv("CHUNKED_PREFILL", 0))) +TP_SIZE = int(os.getenv("TP_SIZE", 1)) +PP_SIZE = int(os.getenv("PP_SIZE", 1)) + +pytestmark = pytest.mark.asyncio + + +@pytest.fixture(scope="module") +def ray_ctx(): + ray.init(runtime_env={"working_dir": VLLM_PATH}) + yield + ray.shutdown() + + +@pytest.fixture(scope="module") +def server(ray_ctx): + args = [ + "--model", + MODEL_NAME, + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--pipeline-parallel-size", + str(PP_SIZE), + "--tensor-parallel-size", + str(TP_SIZE), + "--distributed-executor-backend", + "ray", + ] + if CHUNKED_PREFILL: + args += [ + "--enable-chunked-prefill", + ] + if EAGER_MODE: + args += [ + "--enforce-eager", + ] + return RemoteOpenAIServer(args, num_gpus=PP_SIZE * TP_SIZE) + + +@pytest.fixture(scope="module") +def client(server): + return server.get_async_client() + + +async def test_check_models(server, client: openai.AsyncOpenAI): + models = await client.models.list() + models = models.data + served_model = models[0] + assert served_model.id == MODEL_NAME + assert all(model.root == MODEL_NAME for model in models) + + +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_single_completion(server, client: openai.AsyncOpenAI, + model_name: str): + completion = await client.completions.create(model=model_name, + prompt="Hello, my name is", + max_tokens=5, + temperature=0.0) + + assert completion.id is not None + assert completion.choices is not None and len(completion.choices) == 1 + assert completion.choices[0].text is not None and len( + completion.choices[0].text) >= 5 + assert completion.choices[0].finish_reason == "length" + assert completion.usage == openai.types.CompletionUsage( + completion_tokens=5, prompt_tokens=6, total_tokens=11) + + # test using token IDs + completion = await client.completions.create( + model=MODEL_NAME, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + ) + assert completion.choices[0].text is not None and len( + completion.choices[0].text) >= 5 + + +@pytest.mark.parametrize( + # just test 1 lora hereafter + "model_name", + [MODEL_NAME], +) +async def test_batch_completions(server, client: openai.AsyncOpenAI, + model_name: str): + # test simple list + batch = await client.completions.create( + model=model_name, + prompt=["Hello, my name is", "Hello, my name is"], + max_tokens=5, + temperature=0.0, + ) + assert len(batch.choices) == 2 + assert batch.choices[0].text == batch.choices[1].text + + # test n = 2 + batch = await client.completions.create( + model=model_name, + prompt=["Hello, my name is", "Hello, my name is"], + n=2, + max_tokens=5, + temperature=0.0, + extra_body=dict( + # NOTE: this has to be true for n > 1 in vLLM, but not necessary + # for official client. 
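+            # Presumably this is required because greedy sampling
+            # (temperature=0.0) with n > 1 would otherwise be rejected or
+            # yield n identical sequences; beam search instead returns
+            # distinct candidates, which the assertions below rely on.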
+ use_beam_search=True), + ) + assert len(batch.choices) == 4 + assert batch.choices[0].text != batch.choices[ + 1].text, "beam search should be different" + assert batch.choices[0].text == batch.choices[ + 2].text, "two copies of the same prompt should be the same" + assert batch.choices[1].text == batch.choices[ + 3].text, "two copies of the same prompt should be the same" + + # test streaming + batch = await client.completions.create( + model=model_name, + prompt=["Hello, my name is", "Hello, my name is"], + max_tokens=5, + temperature=0.0, + stream=True, + ) + texts = [""] * 2 + async for chunk in batch: + assert len(chunk.choices) == 1 + choice = chunk.choices[0] + texts[choice.index] += choice.text + assert texts[0] == texts[1] diff --git a/tests/distributed/test_shm_broadcast.py b/tests/distributed/test_shm_broadcast.py new file mode 100644 index 0000000000000..2c2466f81bb8a --- /dev/null +++ b/tests/distributed/test_shm_broadcast.py @@ -0,0 +1,99 @@ +import multiprocessing +import random +import time +from typing import List + +import numpy as np +import torch.distributed as dist + +from vllm.distributed.device_communicators.shm_broadcast import ( + ShmRingBuffer, ShmRingBufferIO) +from vllm.utils import update_environment_variables + + +def get_arrays(n: int, seed: int = 0) -> List[np.ndarray]: + np.random.seed(seed) + sizes = np.random.randint(1, 10_000, n) + # on average, each array will have 5k elements + # with int64, each array will have 40kb + return [np.random.randint(1, 100, i) for i in sizes] + + +def distributed_run(fn, world_size): + number_of_processes = world_size + processes = [] + for i in range(number_of_processes): + env = {} + env['RANK'] = str(i) + env['LOCAL_RANK'] = str(i) + env['WORLD_SIZE'] = str(number_of_processes) + env['LOCAL_WORLD_SIZE'] = str(number_of_processes) + env['MASTER_ADDR'] = 'localhost' + env['MASTER_PORT'] = '12345' + p = multiprocessing.Process(target=fn, args=(env, )) + processes.append(p) + p.start() + + for p in processes: + p.join() + + for p in processes: + assert p.exitcode == 0 + + +def worker_fn_wrapper(fn): + # `multiprocessing.Process` cannot accept environment variables directly + # so we need to pass the environment variables as arguments + # and update the environment variables in the function + def wrapped_fn(env): + update_environment_variables(env) + dist.init_process_group(backend="gloo") + fn() + + return wrapped_fn + + +@worker_fn_wrapper +def worker_fn(): + writer_rank = 2 + broadcaster = ShmRingBufferIO.create_from_process_group( + dist.group.WORLD, 1024 * 1024, 2, writer_rank) + if dist.get_rank() == writer_rank: + seed = random.randint(0, 1000) + dist.broadcast_object_list([seed], writer_rank) + else: + recv = [None] + dist.broadcast_object_list(recv, writer_rank) + seed = recv[0] # type: ignore + dist.barrier() + # in case we find a race condition + # print the seed so that we can reproduce the error + print(f"Rank {dist.get_rank()} got seed {seed}") + # test broadcasting with about 400MB of data + N = 10_000 + if dist.get_rank() == writer_rank: + arrs = get_arrays(N, seed) + for x in arrs: + broadcaster.broadcast_object(x) + time.sleep(random.random() / 1000) + else: + arrs = get_arrays(N, seed) + for x in arrs: + y = broadcaster.broadcast_object(None) + assert np.array_equal(x, y) + time.sleep(random.random() / 1000) + dist.barrier() + + +def test_shm_broadcast(): + distributed_run(worker_fn, 4) + + +def test_singe_process(): + buffer = ShmRingBuffer(1, 1024, 4) + reader = ShmRingBufferIO(buffer, reader_rank=0) + 
writer = ShmRingBufferIO(buffer, reader_rank=-1) + writer.enqueue([0]) + writer.enqueue([1]) + assert reader.dequeue() == [0] + assert reader.dequeue() == [1] diff --git a/tests/entrypoints/llm/__init__.py b/tests/entrypoints/llm/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/llm/test_encode.py new file mode 100644 index 0000000000000..d1056a0490509 --- /dev/null +++ b/tests/entrypoints/llm/test_encode.py @@ -0,0 +1,142 @@ +import weakref +from typing import List + +import pytest + +from vllm import LLM, EmbeddingRequestOutput, PoolingParams + +from ...conftest import cleanup + +MODEL_NAME = "intfloat/e5-mistral-7b-instruct" + +PROMPTS = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +TOKEN_IDS = [ + # Using ID={0, 1, 2, 3} results in NaN values, + # so we add this offset of 1000 + [1000], + [1000, 1001], + [1000, 1002, 1001], + [1000, 1003, 1001, 1002], +] + + +@pytest.fixture(scope="module") +def llm(): + # pytest caches the fixture so we use weakref.proxy to + # enable garbage collection + llm = LLM(model=MODEL_NAME, + max_num_batched_tokens=32768, + tensor_parallel_size=1, + gpu_memory_utilization=0.75, + enforce_eager=True) + + with llm.deprecate_legacy_api(): + yield weakref.proxy(llm) + + del llm + + cleanup() + + +def assert_outputs_equal(o1: List[EmbeddingRequestOutput], + o2: List[EmbeddingRequestOutput]): + assert [o.outputs for o in o1] == [o.outputs for o in o2] + + +@pytest.mark.skip_global_cleanup +@pytest.mark.parametrize('prompt', PROMPTS) +def test_v1_v2_api_consistency_single_prompt_string(llm: LLM, prompt): + pooling_params = PoolingParams() + + with pytest.warns(DeprecationWarning, match="'prompts'"): + v1_output = llm.encode(prompts=prompt, pooling_params=pooling_params) + + v2_output = llm.encode(prompt, pooling_params=pooling_params) + assert_outputs_equal(v1_output, v2_output) + + v2_output = llm.encode({"prompt": prompt}, pooling_params=pooling_params) + assert_outputs_equal(v1_output, v2_output) + + +@pytest.mark.skip_global_cleanup +@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS) +def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM, + prompt_token_ids): + pooling_params = PoolingParams() + + with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"): + v1_output = llm.encode(prompt_token_ids=prompt_token_ids, + pooling_params=pooling_params) + + v2_output = llm.encode({"prompt_token_ids": prompt_token_ids}, + pooling_params=pooling_params) + assert_outputs_equal(v1_output, v2_output) + + +@pytest.mark.skip_global_cleanup +def test_v1_v2_api_consistency_multi_prompt_string(llm: LLM): + pooling_params = PoolingParams() + + with pytest.warns(DeprecationWarning, match="'prompts'"): + v1_output = llm.encode(prompts=PROMPTS, pooling_params=pooling_params) + + v2_output = llm.encode(PROMPTS, pooling_params=pooling_params) + assert_outputs_equal(v1_output, v2_output) + + v2_output = llm.encode( + [{ + "prompt": p + } for p in PROMPTS], + pooling_params=pooling_params, + ) + assert_outputs_equal(v1_output, v2_output) + + +@pytest.mark.skip_global_cleanup +def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM): + pooling_params = PoolingParams() + + with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"): + v1_output = llm.encode(prompt_token_ids=TOKEN_IDS, + pooling_params=pooling_params) + + v2_output = llm.encode( + [{ + "prompt_token_ids": p + } for p in 
TOKEN_IDS], + pooling_params=pooling_params, + ) + assert_outputs_equal(v1_output, v2_output) + + +@pytest.mark.skip_global_cleanup +def test_multiple_pooling_params(llm: LLM): + pooling_params = [ + PoolingParams(), + PoolingParams(), + PoolingParams(), + PoolingParams(), + ] + + # Multiple PoolingParams should be matched with each prompt + outputs = llm.encode(PROMPTS, pooling_params=pooling_params) + assert len(PROMPTS) == len(outputs) + + # Exception raised, if the size of params does not match the size of prompts + with pytest.raises(ValueError): + outputs = llm.encode(PROMPTS, pooling_params=pooling_params[:3]) + + # Single PoolingParams should be applied to every prompt + single_pooling_params = PoolingParams() + outputs = llm.encode(PROMPTS, pooling_params=single_pooling_params) + assert len(PROMPTS) == len(outputs) + + # pooling_params is None, default params should be applied + outputs = llm.encode(PROMPTS, pooling_params=None) + assert len(PROMPTS) == len(outputs) diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py new file mode 100644 index 0000000000000..57ac37f7ea8f7 --- /dev/null +++ b/tests/entrypoints/llm/test_generate.py @@ -0,0 +1,142 @@ +import weakref +from typing import List + +import pytest + +from vllm import LLM, RequestOutput, SamplingParams + +from ...conftest import cleanup + +MODEL_NAME = "facebook/opt-125m" + +PROMPTS = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +TOKEN_IDS = [ + [0], + [0, 1], + [0, 2, 1], + [0, 3, 1, 2], +] + + +@pytest.fixture(scope="module") +def llm(): + # pytest caches the fixture so we use weakref.proxy to + # enable garbage collection + llm = LLM(model=MODEL_NAME, + max_num_batched_tokens=4096, + tensor_parallel_size=1, + gpu_memory_utilization=0.10, + enforce_eager=True) + + with llm.deprecate_legacy_api(): + yield weakref.proxy(llm) + + del llm + + cleanup() + + +def assert_outputs_equal(o1: List[RequestOutput], o2: List[RequestOutput]): + assert [o.outputs for o in o1] == [o.outputs for o in o2] + + +@pytest.mark.skip_global_cleanup +@pytest.mark.parametrize('prompt', PROMPTS) +def test_v1_v2_api_consistency_single_prompt_string(llm: LLM, prompt): + sampling_params = SamplingParams(temperature=0.0, top_p=1.0) + + with pytest.warns(DeprecationWarning, match="'prompts'"): + v1_output = llm.generate(prompts=prompt, + sampling_params=sampling_params) + + v2_output = llm.generate(prompt, sampling_params=sampling_params) + assert_outputs_equal(v1_output, v2_output) + + v2_output = llm.generate({"prompt": prompt}, + sampling_params=sampling_params) + assert_outputs_equal(v1_output, v2_output) + + +@pytest.mark.skip_global_cleanup +@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS) +def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM, + prompt_token_ids): + sampling_params = SamplingParams(temperature=0.0, top_p=1.0) + + with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"): + v1_output = llm.generate(prompt_token_ids=prompt_token_ids, + sampling_params=sampling_params) + + v2_output = llm.generate({"prompt_token_ids": prompt_token_ids}, + sampling_params=sampling_params) + assert_outputs_equal(v1_output, v2_output) + + +@pytest.mark.skip_global_cleanup +def test_v1_v2_api_consistency_multi_prompt_string(llm: LLM): + sampling_params = SamplingParams(temperature=0.0, top_p=1.0) + + with pytest.warns(DeprecationWarning, match="'prompts'"): + v1_output = llm.generate(prompts=PROMPTS, + 
sampling_params=sampling_params) + + v2_output = llm.generate(PROMPTS, sampling_params=sampling_params) + assert_outputs_equal(v1_output, v2_output) + + v2_output = llm.generate( + [{ + "prompt": p + } for p in PROMPTS], + sampling_params=sampling_params, + ) + assert_outputs_equal(v1_output, v2_output) + + +@pytest.mark.skip_global_cleanup +def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM): + sampling_params = SamplingParams(temperature=0.0, top_p=1.0) + + with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"): + v1_output = llm.generate(prompt_token_ids=TOKEN_IDS, + sampling_params=sampling_params) + + v2_output = llm.generate( + [{ + "prompt_token_ids": p + } for p in TOKEN_IDS], + sampling_params=sampling_params, + ) + assert_outputs_equal(v1_output, v2_output) + + +@pytest.mark.skip_global_cleanup +def test_multiple_sampling_params(llm: LLM): + sampling_params = [ + SamplingParams(temperature=0.01, top_p=0.95), + SamplingParams(temperature=0.3, top_p=0.95), + SamplingParams(temperature=0.7, top_p=0.95), + SamplingParams(temperature=0.99, top_p=0.95), + ] + + # Multiple SamplingParams should be matched with each prompt + outputs = llm.generate(PROMPTS, sampling_params=sampling_params) + assert len(PROMPTS) == len(outputs) + + # Exception raised, if the size of params does not match the size of prompts + with pytest.raises(ValueError): + outputs = llm.generate(PROMPTS, sampling_params=sampling_params[:3]) + + # Single SamplingParams should be applied to every prompt + single_sampling_params = SamplingParams(temperature=0.3, top_p=0.95) + outputs = llm.generate(PROMPTS, sampling_params=single_sampling_params) + assert len(PROMPTS) == len(outputs) + + # sampling_params is None, default params should be applied + outputs = llm.generate(PROMPTS, sampling_params=None) + assert len(PROMPTS) == len(outputs) diff --git a/tests/entrypoints/llm/test_generate_multiple_loras.py b/tests/entrypoints/llm/test_generate_multiple_loras.py new file mode 100644 index 0000000000000..35eabf079964a --- /dev/null +++ b/tests/entrypoints/llm/test_generate_multiple_loras.py @@ -0,0 +1,67 @@ +import weakref + +import pytest +# downloading lora to test lora requests +from huggingface_hub import snapshot_download + +from vllm import LLM +from vllm.lora.request import LoRARequest + +from ...conftest import cleanup + +MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" + +PROMPTS = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +LORA_NAME = "typeof/zephyr-7b-beta-lora" + + +@pytest.fixture(scope="module") +def llm(): + # pytest caches the fixture so we use weakref.proxy to + # enable garbage collection + llm = LLM(model=MODEL_NAME, + tensor_parallel_size=1, + max_model_len=8192, + enable_lora=True, + max_loras=4, + max_lora_rank=64, + max_num_seqs=128, + enforce_eager=True) + + with llm.deprecate_legacy_api(): + yield weakref.proxy(llm) + + del llm + + cleanup() + + +@pytest.fixture(scope="module") +def zephyr_lora_files(): + return snapshot_download(repo_id=LORA_NAME) + + +@pytest.mark.skip_global_cleanup +def test_multiple_lora_requests(llm: LLM, zephyr_lora_files): + lora_request = [ + LoRARequest(LORA_NAME, idx + 1, zephyr_lora_files) + for idx in range(len(PROMPTS)) + ] + # Multiple SamplingParams should be matched with each prompt + outputs = llm.generate(PROMPTS, lora_request=lora_request) + assert len(PROMPTS) == len(outputs) + + # Exception raised, if the size of params does not match the size of prompts + with 
pytest.raises(ValueError): + outputs = llm.generate(PROMPTS, lora_request=lora_request[:1]) + + # Single LoRARequest should be applied to every prompt + single_lora_request = lora_request[0] + outputs = llm.generate(PROMPTS, lora_request=single_lora_request) + assert len(PROMPTS) == len(outputs) diff --git a/tests/entrypoints/openai/__init__.py b/tests/entrypoints/openai/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py new file mode 100644 index 0000000000000..3e80214f24dc5 --- /dev/null +++ b/tests/entrypoints/openai/test_chat.py @@ -0,0 +1,873 @@ +# imports for guided decoding tests +import json +import re +from typing import List + +import jsonschema +import openai # use the official client for correctness check +import pytest +# using Ray for overall ease of process management, parallel requests, +# and debugging. +import ray +import torch +# downloading lora to test lora requests +from huggingface_hub import snapshot_download +from openai import BadRequestError + +from ...utils import VLLM_PATH, RemoteOpenAIServer + +# any model with a chat template should work here +MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +# technically this needs Mistral-7B-v0.1 as base, but we're not testing +# generation quality here +LORA_NAME = "typeof/zephyr-7b-beta-lora" + +TEST_SCHEMA = { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "age": { + "type": "integer" + }, + "skills": { + "type": "array", + "items": { + "type": "string", + "maxLength": 10 + }, + "minItems": 3 + }, + "work history": { + "type": "array", + "items": { + "type": "object", + "properties": { + "company": { + "type": "string" + }, + "duration": { + "type": "string" + }, + "position": { + "type": "string" + } + }, + "required": ["company", "position"] + } + } + }, + "required": ["name", "age", "skills", "work history"] +} + +TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") + +TEST_CHOICE = [ + "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby", + "Swift", "Kotlin" +] + + +@pytest.fixture(scope="module") +def zephyr_lora_files(): + return snapshot_download(repo_id=LORA_NAME) + + +@pytest.fixture(scope="module") +def ray_ctx(): + ray.init(runtime_env={"working_dir": VLLM_PATH}) + yield + ray.shutdown() + + +@pytest.fixture(scope="module") +def server(zephyr_lora_files, ray_ctx): + return RemoteOpenAIServer([ + "--model", + MODEL_NAME, + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + # lora config below + "--enable-lora", + "--lora-modules", + f"zephyr-lora={zephyr_lora_files}", + f"zephyr-lora2={zephyr_lora_files}", + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + "--max-num-seqs", + "128", + ]) + + +@pytest.fixture(scope="module") +def client(server): + return server.get_async_client() + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + # first test base model, then test loras + "model_name", + [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], +) +async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": "user", + "content": "what is 1+1?" 
+ }] + + chat_completion = await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=5, + temperature=0.0, + logprobs=False) + + choice = chat_completion.choices[0] + assert choice.logprobs is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + # just test 1 lora hereafter + "model_name", + [MODEL_NAME, "zephyr-lora"], +) +async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": "user", + "content": "what is 1+1?" + }] + + chat_completion = await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=5, + temperature=0.0, + logprobs=True, + top_logprobs=0) + + choice = chat_completion.choices[0] + assert choice.logprobs is not None + assert choice.logprobs.content is not None + assert len(choice.logprobs.content[0].top_logprobs) == 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME, "zephyr-lora"], +) +async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": "user", + "content": "what is 1+1?" + }] + + chat_completion = await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=5, + temperature=0.0, + logprobs=True, + top_logprobs=5) + + choice = chat_completion.choices[0] + assert choice.logprobs is not None + assert choice.logprobs.content is not None + assert len(choice.logprobs.content[0].top_logprobs) == 5 + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME, "zephyr-lora"], +) +async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, + model_name: str): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": "user", + "content": "what is 1+1?" + }] + + # Default max_logprobs is 20, so this should raise an error + with pytest.raises((openai.BadRequestError, openai.APIError)): + stream = await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=10, + logprobs=True, + top_logprobs=21, + stream=True) + async for chunk in stream: + ... + + with pytest.raises(openai.BadRequestError): + await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=10, + logprobs=True, + top_logprobs=30, + stream=False) + + # the server should still work afterwards + chat_completion = await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=10, + stream=False) + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME, "zephyr-lora"], +) +async def test_single_chat_session(client: openai.AsyncOpenAI, + model_name: str): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": "user", + "content": "what is 1+1?" 
+ }] + + # test single completion + chat_completion = await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=10, + logprobs=True, + top_logprobs=5) + assert chat_completion.id is not None + assert len(chat_completion.choices) == 1 + + choice = chat_completion.choices[0] + assert choice.finish_reason == "length" + assert chat_completion.usage == openai.types.CompletionUsage( + completion_tokens=10, prompt_tokens=37, total_tokens=47) + + message = choice.message + assert message.content is not None and len(message.content) >= 10 + assert message.role == "assistant" + messages.append({"role": "assistant", "content": message.content}) + + # test multi-turn dialogue + messages.append({"role": "user", "content": "express your result in json"}) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + ) + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + # just test 1 lora hereafter + "model_name", + [MODEL_NAME, "zephyr-lora"], +) +async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": "user", + "content": "what is 1+1?" + }] + + # test single completion + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + ) + output = chat_completion.choices[0].message.content + stop_reason = chat_completion.choices[0].finish_reason + + # test streaming + stream = await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + stream=True, + ) + chunks: List[str] = [] + finish_reason_count = 0 + async for chunk in stream: + delta = chunk.choices[0].delta + if delta.role: + assert delta.role == "assistant" + if delta.content: + chunks.append(delta.content) + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + # finish reason should only return in last block + assert finish_reason_count == 1 + assert chunk.choices[0].finish_reason == stop_reason + assert delta.content + assert "".join(chunks) == output + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], +) +async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, + model_name: str): + messages = [{ + "role": "system", + "content": "You are a helpful assistant." + }, { + "role": "user", + "content": "What is the capital of France?" 
+ }] + + # Test stream=True, stream_options={"include_usage": False} + stream = await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + stream=True, + stream_options={"include_usage": False}) + async for chunk in stream: + assert chunk.usage is None + + # Test stream=True, stream_options={"include_usage": True} + stream = await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + stream=True, + stream_options={"include_usage": True}) + + async for chunk in stream: + if chunk.choices[0].finish_reason is None: + assert chunk.usage is None + else: + assert chunk.usage is None + final_chunk = await stream.__anext__() + assert final_chunk.usage is not None + assert final_chunk.usage.prompt_tokens > 0 + assert final_chunk.usage.completion_tokens > 0 + assert final_chunk.usage.total_tokens == ( + final_chunk.usage.prompt_tokens + + final_chunk.usage.completion_tokens) + assert final_chunk.choices == [] + + # Test stream=False, stream_options={"include_usage": None} + with pytest.raises(BadRequestError): + await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + stream=False, + stream_options={"include_usage": None}) + + # Test stream=False, stream_options={"include_usage": True} + with pytest.raises(BadRequestError): + await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + stream=False, + stream_options={"include_usage": True}) + + +# NOTE: Not sure why, but when I place this after `test_guided_regex_chat` +# (i.e. using the same ordering as in the Completions API tests), the test +# will fail on the second `guided_decoding_backend` even when I swap their order +# (ref: https://github.com/vllm-project/vllm/pull/5526#issuecomment-2173772256) +@pytest.mark.asyncio +@pytest.mark.parametrize("guided_decoding_backend", + ["outlines", "lm-format-enforcer"]) +async def test_guided_choice_chat(client: openai.AsyncOpenAI, + guided_decoding_backend: str): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": + "user", + "content": + "The best language for type-safe systems programming is " + }] + chat_completion = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_tokens=10, + extra_body=dict(guided_choice=TEST_CHOICE, + guided_decoding_backend=guided_decoding_backend)) + choice1 = chat_completion.choices[0].message.content + assert choice1 in TEST_CHOICE + + messages.append({"role": "assistant", "content": choice1}) + messages.append({ + "role": "user", + "content": "I disagree, pick another one" + }) + chat_completion = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_tokens=10, + extra_body=dict(guided_choice=TEST_CHOICE, + guided_decoding_backend=guided_decoding_backend)) + choice2 = chat_completion.choices[0].message.content + assert choice2 in TEST_CHOICE + assert choice1 != choice2 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("guided_decoding_backend", + ["outlines", "lm-format-enforcer"]) +async def test_guided_json_chat(client: openai.AsyncOpenAI, + guided_decoding_backend: str): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": + "user", + "content": + f"Give an example JSON for an employee profile that " + f"fits this schema: {TEST_SCHEMA}" + }] + chat_completion = await 
client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_tokens=1000, + extra_body=dict(guided_json=TEST_SCHEMA, + guided_decoding_backend=guided_decoding_backend)) + message = chat_completion.choices[0].message + assert message.content is not None + json1 = json.loads(message.content) + jsonschema.validate(instance=json1, schema=TEST_SCHEMA) + + messages.append({"role": "assistant", "content": message.content}) + messages.append({ + "role": + "user", + "content": + "Give me another one with a different name and age" + }) + chat_completion = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_tokens=1000, + extra_body=dict(guided_json=TEST_SCHEMA, + guided_decoding_backend=guided_decoding_backend)) + message = chat_completion.choices[0].message + assert message.content is not None + json2 = json.loads(message.content) + jsonschema.validate(instance=json2, schema=TEST_SCHEMA) + assert json1["name"] != json2["name"] + assert json1["age"] != json2["age"] + + +@pytest.mark.asyncio +@pytest.mark.parametrize("guided_decoding_backend", + ["outlines", "lm-format-enforcer"]) +async def test_guided_regex_chat(client: openai.AsyncOpenAI, + guided_decoding_backend: str): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": + "user", + "content": + f"Give an example IP address with this regex: {TEST_REGEX}" + }] + chat_completion = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_tokens=20, + extra_body=dict(guided_regex=TEST_REGEX, + guided_decoding_backend=guided_decoding_backend)) + ip1 = chat_completion.choices[0].message.content + assert ip1 is not None + assert re.fullmatch(TEST_REGEX, ip1) is not None + + messages.append({"role": "assistant", "content": ip1}) + messages.append({"role": "user", "content": "Give me a different one"}) + chat_completion = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_tokens=20, + extra_body=dict(guided_regex=TEST_REGEX, + guided_decoding_backend=guided_decoding_backend)) + ip2 = chat_completion.choices[0].message.content + assert ip2 is not None + assert re.fullmatch(TEST_REGEX, ip2) is not None + assert ip1 != ip2 + + +@pytest.mark.asyncio +async def test_guided_decoding_type_error(client: openai.AsyncOpenAI): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": + "user", + "content": + "The best language for type-safe systems programming is " + }] + + with pytest.raises(openai.BadRequestError): + _ = await client.chat.completions.create(model=MODEL_NAME, + messages=messages, + extra_body=dict(guided_regex={ + 1: "Python", + 2: "C++" + })) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("guided_decoding_backend", + ["outlines", "lm-format-enforcer"]) +async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, + guided_decoding_backend: str): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": + "user", + "content": + "The best language for type-safe systems programming is " + }] + chat_completion = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_tokens=10, + logprobs=True, + top_logprobs=5, + extra_body=dict(guided_choice=TEST_CHOICE, + guided_decoding_backend=guided_decoding_backend)) + + assert chat_completion.choices[0].logprobs is not None + assert chat_completion.choices[0].logprobs.content is not None + top_logprobs = 
chat_completion.choices[0].logprobs.content[0].top_logprobs + + # -9999.0 is the minimum logprob returned by OpenAI + for item in top_logprobs: + assert item.logprob >= -9999.0, f"Failed (top_logprobs={top_logprobs})" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("guided_decoding_backend", + ["outlines", "lm-format-enforcer"]) +async def test_named_tool_use(client: openai.AsyncOpenAI, + guided_decoding_backend: str): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": + "user", + "content": + f"Give an example JSON for an employee profile that " + f"fits this schema: {TEST_SCHEMA}" + }] + + # non-streaming + + chat_completion = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_tokens=1000, + tools=[{ + "type": "function", + "function": { + "name": "dummy_function_name", + "description": "This is a dummy function", + "parameters": TEST_SCHEMA + } + }], + tool_choice={ + "type": "function", + "function": { + "name": "dummy_function_name" + } + }) + message = chat_completion.choices[0].message + assert len(message.content) == 0 + json_string = message.tool_calls[0].function.arguments + json1 = json.loads(json_string) + jsonschema.validate(instance=json1, schema=TEST_SCHEMA) + + messages.append({"role": "assistant", "content": json_string}) + messages.append({ + "role": + "user", + "content": + "Give me another one with a different name and age" + }) + + # streaming + + stream = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_tokens=1000, + tools=[{ + "type": "function", + "function": { + "name": "dummy_function_name", + "description": "This is a dummy function", + "parameters": TEST_SCHEMA + } + }], + tool_choice={ + "type": "function", + "function": { + "name": "dummy_function_name" + } + }, + stream=True) + + output = [] + finish_reason_count = 0 + async for chunk in stream: + delta = chunk.choices[0].delta + if delta.role: + assert delta.role == "assistant" + assert delta.content is None or len(delta.content) == 0 + if delta.tool_calls: + output.append(delta.tool_calls[0].function.arguments) + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + # finish reason should only return in last block + assert finish_reason_count == 1 + json2 = json.loads("".join(output)) + jsonschema.validate(instance=json2, schema=TEST_SCHEMA) + assert json1["name"] != json2["name"] + assert json1["age"] != json2["age"] + + +@pytest.mark.asyncio +@pytest.mark.parametrize("guided_decoding_backend", ["outlines"]) +async def test_required_tool_use_not_yet_supported( + client: openai.AsyncOpenAI, guided_decoding_backend: str): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": + "user", + "content": + f"Give an example JSON for an employee profile that " + f"fits this schema: {TEST_SCHEMA}" + }] + + with pytest.raises(openai.BadRequestError): + await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_tokens=1000, + tools=[{ + "type": "function", + "function": { + "name": "dummy_function_name", + "description": "This is a dummy function", + "parameters": TEST_SCHEMA + } + }], + tool_choice="required") + + with pytest.raises(openai.BadRequestError): + await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_tokens=1000, + tools=[{ + "type": "function", + "function": { + "name": "dummy_function_name", + "description": "This is a dummy function", + "parameters": TEST_SCHEMA + } 
+ }], + tool_choice="auto") + + +@pytest.mark.asyncio +@pytest.mark.parametrize("guided_decoding_backend", ["outlines"]) +async def test_inconsistent_tool_choice_and_tools( + client: openai.AsyncOpenAI, guided_decoding_backend: str): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": + "user", + "content": + f"Give an example JSON for an employee profile that " + f"fits this schema: {TEST_SCHEMA}" + }] + + with pytest.raises(openai.BadRequestError): + await client.chat.completions.create(model=MODEL_NAME, + messages=messages, + max_tokens=1000, + tool_choice={ + "type": "function", + "function": { + "name": + "dummy_function_name" + } + }) + + with pytest.raises(openai.BadRequestError): + await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_tokens=1000, + tools=[{ + "type": "function", + "function": { + "name": "dummy_function_name", + "description": "This is a dummy function", + "parameters": TEST_SCHEMA + } + }], + tool_choice={ + "type": "function", + "function": { + "name": "nondefined_function_name" + } + }) + + +@pytest.mark.asyncio +async def test_response_format_json_object(client: openai.AsyncOpenAI): + for _ in range(2): + resp = await client.chat.completions.create( + model=MODEL_NAME, + messages=[{ + "role": + "user", + "content": ('what is 1+1? please respond with a JSON object, ' + 'the format is {"result": 2}') + }], + response_format={"type": "json_object"}) + + content = resp.choices[0].message.content + assert content is not None + + loaded = json.loads(content) + assert loaded == {"result": 2}, loaded + + +@pytest.mark.asyncio +async def test_extra_fields(client: openai.AsyncOpenAI): + with pytest.raises(BadRequestError) as exc_info: + await client.chat.completions.create( + model=MODEL_NAME, + messages=[{ + "role": "system", + "content": "You are a helpful assistant.", + "extra_field": "0", + }], # type: ignore + temperature=0, + seed=0) + + assert "extra_forbidden" in exc_info.value.message + + +@pytest.mark.asyncio +async def test_complex_message_content(client: openai.AsyncOpenAI): + resp = await client.chat.completions.create( + model=MODEL_NAME, + messages=[{ + "role": + "user", + "content": [{ + "type": + "text", + "text": + "what is 1+1? please provide the result without any other text." + }] + }], + temperature=0, + seed=0) + content = resp.choices[0].message.content + assert content == "2" + + +@pytest.mark.asyncio +async def test_custom_role(client: openai.AsyncOpenAI): + # Not sure how the model handles custom roles so we just check that + # both string and complex message content are handled in the same way + + resp1 = await client.chat.completions.create( + model=MODEL_NAME, + messages=[{ + "role": "my-custom-role", + "content": "what is 1+1?", + }], # type: ignore + temperature=0, + seed=0) + + resp2 = await client.chat.completions.create( + model=MODEL_NAME, + messages=[{ + "role": "my-custom-role", + "content": [{ + "type": "text", + "text": "what is 1+1?" 
+ }] + }], # type: ignore + temperature=0, + seed=0) + + content1 = resp1.choices[0].message.content + content2 = resp2.choices[0].message.content + assert content1 == content2 + + +@pytest.mark.asyncio +async def test_long_seed(client: openai.AsyncOpenAI): + for seed in [ + torch.iinfo(torch.long).min - 1, + torch.iinfo(torch.long).max + 1 + ]: + with pytest.raises(BadRequestError) as exc_info: + await client.chat.completions.create( + model=MODEL_NAME, + messages=[{ + "role": "system", + "content": "You are a helpful assistant.", + }], + temperature=0, + seed=seed) + + assert ("greater_than_equal" in exc_info.value.message + or "less_than_equal" in exc_info.value.message) diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py new file mode 100644 index 0000000000000..52a848b7831d5 --- /dev/null +++ b/tests/entrypoints/openai/test_completion.py @@ -0,0 +1,724 @@ +# imports for guided decoding tests +import json +import re +from typing import List + +import jsonschema +import openai # use the official client for correctness check +import pytest +# using Ray for overall ease of process management, parallel requests, +# and debugging. +import ray +import requests +# downloading lora to test lora requests +from huggingface_hub import snapshot_download +from openai import BadRequestError + +from vllm.transformers_utils.tokenizer import get_tokenizer + +from ...utils import VLLM_PATH, RemoteOpenAIServer + +# any model with a chat template should work here +MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +# technically this needs Mistral-7B-v0.1 as base, but we're not testing +# generation quality here +LORA_NAME = "typeof/zephyr-7b-beta-lora" + +TEST_SCHEMA = { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "age": { + "type": "integer" + }, + "skills": { + "type": "array", + "items": { + "type": "string", + "maxLength": 10 + }, + "minItems": 3 + }, + "work history": { + "type": "array", + "items": { + "type": "object", + "properties": { + "company": { + "type": "string" + }, + "duration": { + "type": "string" + }, + "position": { + "type": "string" + } + }, + "required": ["company", "position"] + } + } + }, + "required": ["name", "age", "skills", "work history"] +} + +TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") + +TEST_CHOICE = [ + "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby", + "Swift", "Kotlin" +] + + +@pytest.fixture(scope="module") +def zephyr_lora_files(): + return snapshot_download(repo_id=LORA_NAME) + + +@pytest.fixture(scope="module") +def ray_ctx(): + ray.init(runtime_env={"working_dir": VLLM_PATH}) + yield + ray.shutdown() + + +@pytest.fixture(scope="module") +def server(zephyr_lora_files, ray_ctx): + return RemoteOpenAIServer([ + "--model", + MODEL_NAME, + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + # lora config below + "--enable-lora", + "--lora-modules", + f"zephyr-lora={zephyr_lora_files}", + f"zephyr-lora2={zephyr_lora_files}", + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + "--max-num-seqs", + "128", + ]) + + +@pytest.fixture(scope="module") +def client(server): + return server.get_async_client() + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + # first test base model, then test loras + "model_name", + [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], +) +async def test_single_completion(client: openai.AsyncOpenAI, 
model_name: str): + completion = await client.completions.create(model=model_name, + prompt="Hello, my name is", + max_tokens=5, + temperature=0.0) + + assert completion.id is not None + assert completion.choices is not None and len(completion.choices) == 1 + + choice = completion.choices[0] + assert len(choice.text) >= 5 + assert choice.finish_reason == "length" + assert completion.usage == openai.types.CompletionUsage( + completion_tokens=5, prompt_tokens=6, total_tokens=11) + + # test using token IDs + completion = await client.completions.create( + model=MODEL_NAME, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + ) + assert len(completion.choices[0].text) >= 5 + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + # first test base model, then test loras + "model_name", + [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], +) +async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str): + # test using token IDs + completion = await client.completions.create( + model=MODEL_NAME, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + logprobs=None, + ) + choice = completion.choices[0] + assert choice.logprobs is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + # just test 1 lora hereafter + "model_name", + [MODEL_NAME, "zephyr-lora"], +) +async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str): + # test using token IDs + completion = await client.completions.create( + model=MODEL_NAME, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + logprobs=0, + ) + choice = completion.choices[0] + assert choice.logprobs is not None + assert choice.logprobs.token_logprobs is not None + assert choice.logprobs.top_logprobs is not None + assert len(choice.logprobs.top_logprobs[0]) == 1 + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME, "zephyr-lora"], +) +async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str): + # test using token IDs + completion = await client.completions.create( + model=MODEL_NAME, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + logprobs=5, + ) + choice = completion.choices[0] + assert choice.logprobs is not None + assert choice.logprobs.token_logprobs is not None + assert choice.logprobs.top_logprobs is not None + assert 5 <= len(choice.logprobs.top_logprobs[0]) <= 6 + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME, "zephyr-lora"], +) +async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, + model_name: str): + + with pytest.raises( + (openai.BadRequestError, openai.APIError)): # test using token IDs + await client.completions.create( + model=MODEL_NAME, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + # vLLM has higher default max_logprobs (20 instead of 5) to support + # both Completion API and Chat Completion API + logprobs=21, + ) + ... + with pytest.raises( + (openai.BadRequestError, openai.APIError)): # test using token IDs + stream = await client.completions.create( + model=MODEL_NAME, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + # vLLM has higher default max_logprobs (20 instead of 5) to support + # both Completion API and Chat Completion API + logprobs=30, + stream=True, + ) + async for chunk in stream: + ... 
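+        # Note: for the streaming request above, the invalid logprobs value
+        # may only be reported once the stream is consumed, which is
+        # presumably why the async-for loop is kept inside the pytest.raises
+        # block.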
+ + # the server should still work afterwards + completion = await client.completions.create( + model=model_name, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + ) + assert len(completion.choices[0].text) >= 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME, "zephyr-lora"], +) +async def test_completion_streaming(client: openai.AsyncOpenAI, + model_name: str): + prompt = "What is an LLM?" + + single_completion = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + ) + single_output = single_completion.choices[0].text + stream = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True) + chunks: List[str] = [] + finish_reason_count = 0 + async for chunk in stream: + chunks.append(chunk.choices[0].text) + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + # finish reason should only return in last block + assert finish_reason_count == 1 + assert chunk.choices[0].finish_reason == "length" + assert chunk.choices[0].text + assert "".join(chunks) == single_output + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], +) +async def test_completion_stream_options(client: openai.AsyncOpenAI, + model_name: str): + prompt = "What is the capital of France?" + + # Test stream=True, stream_options= + # {"include_usage": False, "continuous_usage_stats": False} + stream = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={ + "include_usage": False, + "continuous_usage_stats": + False, + }) + + async for chunk in stream: + assert chunk.usage is None + + # Test stream=True, stream_options= + # {"include_usage": False, "continuous_usage_stats": True} + stream = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={ + "include_usage": False, + "continuous_usage_stats": + True, + }) + async for chunk in stream: + assert chunk.usage is None + + # Test stream=True, stream_options= + # {"include_usage": True, "continuous_usage_stats": False} + stream = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={ + "include_usage": True, + "continuous_usage_stats": + False, + }) + async for chunk in stream: + if chunk.choices[0].finish_reason is None: + assert chunk.usage is None + else: + assert chunk.usage is None + final_chunk = await stream.__anext__() + assert final_chunk.usage is not None + assert final_chunk.usage.prompt_tokens > 0 + assert final_chunk.usage.completion_tokens > 0 + assert final_chunk.usage.total_tokens == ( + final_chunk.usage.prompt_tokens + + final_chunk.usage.completion_tokens) + assert final_chunk.choices == [] + + # Test stream=True, stream_options= + # {"include_usage": True, "continuous_usage_stats": True} + stream = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={ + "include_usage": True, + "continuous_usage_stats": + True, + }) + async for chunk in stream: + assert chunk.usage is not None + assert chunk.usage.prompt_tokens > 0 + assert chunk.usage.completion_tokens > 0 + assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens + + chunk.usage.completion_tokens) + if chunk.choices[0].finish_reason is not 
None: + final_chunk = await stream.__anext__() + assert final_chunk.usage is not None + assert final_chunk.usage.prompt_tokens > 0 + assert final_chunk.usage.completion_tokens > 0 + assert final_chunk.usage.total_tokens == ( + final_chunk.usage.prompt_tokens + + final_chunk.usage.completion_tokens) + assert final_chunk.choices == [] + + # Test stream=False, stream_options= + # {"include_usage": None} + with pytest.raises(BadRequestError): + await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": None}) + + # Test stream=False, stream_options= + # {"include_usage": True} + with pytest.raises(BadRequestError): + await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": True}) + + # Test stream=False, stream_options= + # {"continuous_usage_stats": None} + with pytest.raises(BadRequestError): + await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"continuous_usage_stats": None}) + + # Test stream=False, stream_options= + # {"continuous_usage_stats": True} + with pytest.raises(BadRequestError): + await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"continuous_usage_stats": True}) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + # just test 1 lora hereafter + "model_name", + [MODEL_NAME, "zephyr-lora"], +) +async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str): + # test both text and token IDs + for prompts in (["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2): + # test simple list + batch = await client.completions.create( + model=model_name, + prompt=prompts, + max_tokens=5, + temperature=0.0, + ) + assert len(batch.choices) == 2 + assert batch.choices[0].text == batch.choices[1].text + + # test n = 2 + batch = await client.completions.create( + model=model_name, + prompt=prompts, + n=2, + max_tokens=5, + temperature=0.0, + extra_body=dict( + # NOTE: this has to be true for n > 1 in vLLM, but not necessary + # for official client. 
+ use_beam_search=True), + ) + assert len(batch.choices) == 4 + assert batch.choices[0].text != batch.choices[ + 1].text, "beam search should be different" + assert batch.choices[0].text == batch.choices[ + 2].text, "two copies of the same prompt should be the same" + assert batch.choices[1].text == batch.choices[ + 3].text, "two copies of the same prompt should be the same" + + # test streaming + batch = await client.completions.create( + model=model_name, + prompt=prompts, + max_tokens=5, + temperature=0.0, + stream=True, + ) + texts = [""] * 2 + async for chunk in batch: + assert len(chunk.choices) == 1 + choice = chunk.choices[0] + texts[choice.index] += choice.text + assert texts[0] == texts[1] + + +@pytest.mark.asyncio +async def test_logits_bias(client: openai.AsyncOpenAI): + prompt = "Hello, my name is" + max_tokens = 5 + tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) + + # Test exclusive selection + token_id = 1000 + completion = await client.completions.create( + model=MODEL_NAME, + prompt=prompt, + max_tokens=max_tokens, + temperature=0.0, + logit_bias={str(token_id): 100}, + seed=42, + ) + assert len(completion.choices[0].text) >= 5 + response_tokens = tokenizer(completion.choices[0].text, + add_special_tokens=False)["input_ids"] + expected_tokens = tokenizer(tokenizer.decode([token_id] * 5), + add_special_tokens=False)["input_ids"] + assert all([ + response == expected + for response, expected in zip(response_tokens, expected_tokens) + ]) + + # Test ban + completion = await client.completions.create( + model=MODEL_NAME, + prompt=prompt, + max_tokens=max_tokens, + temperature=0.0, + ) + response_tokens = tokenizer(completion.choices[0].text, + add_special_tokens=False)["input_ids"] + first_response = completion.choices[0].text + completion = await client.completions.create( + model=MODEL_NAME, + prompt=prompt, + max_tokens=max_tokens, + temperature=0.0, + logit_bias={str(token): -100 + for token in response_tokens}, + ) + assert first_response != completion.choices[0].text + + +@pytest.mark.asyncio +@pytest.mark.parametrize("guided_decoding_backend", + ["outlines", "lm-format-enforcer"]) +async def test_guided_json_completion(client: openai.AsyncOpenAI, + guided_decoding_backend: str): + completion = await client.completions.create( + model=MODEL_NAME, + prompt=f"Give an example JSON for an employee profile " + f"that fits this schema: {TEST_SCHEMA}", + n=3, + temperature=1.0, + max_tokens=500, + extra_body=dict(guided_json=TEST_SCHEMA, + guided_decoding_backend=guided_decoding_backend)) + + assert completion.id is not None + assert len(completion.choices) == 3 + for i in range(3): + output_json = json.loads(completion.choices[i].text) + jsonschema.validate(instance=output_json, schema=TEST_SCHEMA) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("guided_decoding_backend", + ["outlines", "lm-format-enforcer"]) +async def test_guided_regex_completion(client: openai.AsyncOpenAI, + guided_decoding_backend: str): + completion = await client.completions.create( + model=MODEL_NAME, + prompt=f"Give an example IPv4 address with this regex: {TEST_REGEX}", + n=3, + temperature=1.0, + max_tokens=20, + extra_body=dict(guided_regex=TEST_REGEX, + guided_decoding_backend=guided_decoding_backend)) + + assert completion.id is not None + assert len(completion.choices) == 3 + for i in range(3): + assert re.fullmatch(TEST_REGEX, completion.choices[i].text) is not None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("guided_decoding_backend", + ["outlines", "lm-format-enforcer"]) 
+async def test_guided_choice_completion(client: openai.AsyncOpenAI, + guided_decoding_backend: str): + completion = await client.completions.create( + model=MODEL_NAME, + prompt="The best language for type-safe systems programming is ", + n=2, + temperature=1.0, + max_tokens=10, + extra_body=dict(guided_choice=TEST_CHOICE, + guided_decoding_backend=guided_decoding_backend)) + + assert completion.id is not None + assert len(completion.choices) == 2 + for i in range(2): + assert completion.choices[i].text in TEST_CHOICE + + +@pytest.mark.asyncio +async def test_guided_grammar(client: openai.AsyncOpenAI): + simple_sql_grammar = """ +start: select_statement + +select_statement: "SELECT" column "from" table "where" condition + +column: "col_1" | "col_2" +table: "table_1" | "table_2" +condition: column "=" number + +number: "1" | "2" +""" + + completion = await client.completions.create( + model=MODEL_NAME, + prompt=("Generate a sql state that select col_1 from " + "table_1 where it is equals to 1"), + temperature=1.0, + max_tokens=500, + extra_body=dict(guided_grammar=simple_sql_grammar)) + + content = completion.choices[0].text + + # use Lark to parse the output, and make sure it's a valid parse tree + from lark import Lark + parser = Lark(simple_sql_grammar) + parser.parse(content) + + # remove spaces for comparison b/c we removed them in the grammar + ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "") + + assert content.strip() == ground_truth + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + # first test base model, then test loras + "model_name", + [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], +) +@pytest.mark.parametrize("logprobs_arg", [1, 0]) +async def test_echo_logprob_completion(client: openai.AsyncOpenAI, + model_name: str, logprobs_arg: int): + tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) + # test using text and token IDs + for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]): + completion = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + echo=True, + logprobs=logprobs_arg) + + prompt_text = tokenizer.decode(prompt) if isinstance(prompt, + list) else prompt + assert re.search(r"^" + prompt_text, completion.choices[0].text) + logprobs = completion.choices[0].logprobs + assert logprobs is not None + assert len(logprobs.text_offset) > 5 + assert (len(logprobs.token_logprobs) > 5 + and logprobs.token_logprobs[0] is None) + assert (len(logprobs.top_logprobs) > 5 + and logprobs.top_logprobs[0] is None) + for top_logprobs in logprobs.top_logprobs[1:]: + assert max(logprobs_arg, + 1) <= len(top_logprobs) <= logprobs_arg + 1 + assert len(logprobs.tokens) > 5 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("guided_decoding_backend", + ["outlines", "lm-format-enforcer"]) +async def test_guided_decoding_type_error(client: openai.AsyncOpenAI, + guided_decoding_backend: str): + with pytest.raises(openai.BadRequestError): + _ = await client.completions.create( + model=MODEL_NAME, + prompt="Give an example JSON that fits this schema: 42", + extra_body=dict(guided_json=42, + guided_decoding_backend=guided_decoding_backend)) + + with pytest.raises(openai.BadRequestError): + _ = await client.completions.create( + model=MODEL_NAME, + prompt="Give an example string that fits this regex", + extra_body=dict(guided_regex=TEST_REGEX, guided_json=TEST_SCHEMA)) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_tokenize(client: openai.AsyncOpenAI, model_name: 
str): + base_url = str(client.base_url)[:-3].strip("/") + tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast") + + for add_special in [False, True]: + prompt = "This is a test prompt." + tokens = tokenizer.encode(prompt, add_special_tokens=add_special) + + response = requests.post(base_url + "/tokenize", + json={ + "add_special_tokens": add_special, + "model": model_name, + "prompt": prompt + }) + response.raise_for_status() + assert response.json() == { + "tokens": tokens, + "count": len(tokens), + "max_model_len": 8192 + } + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_detokenize(client: openai.AsyncOpenAI, model_name: str): + base_url = str(client.base_url)[:-3] + tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast") + + prompt = "This is a test prompt." + tokens = tokenizer.encode(prompt, add_special_tokens=False) + + response = requests.post(base_url + "detokenize", + json={ + "model": model_name, + "tokens": tokens + }) + response.raise_for_status() + assert response.json() == {"prompt": prompt} diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py new file mode 100644 index 0000000000000..f8aa1c9143a3b --- /dev/null +++ b/tests/entrypoints/openai/test_embedding.py @@ -0,0 +1,144 @@ +import base64 + +import numpy as np +import openai +import pytest +import ray + +from ...utils import VLLM_PATH, RemoteOpenAIServer + +EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct" + + +@pytest.fixture(scope="module") +def ray_ctx(): + ray.init(runtime_env={"working_dir": VLLM_PATH}) + yield + ray.shutdown() + + +@pytest.fixture(scope="module") +def embedding_server(ray_ctx): + return RemoteOpenAIServer([ + "--model", + EMBEDDING_MODEL_NAME, + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--enforce-eager", + "--max-model-len", + "8192", + "--enforce-eager", + ]) + + +@pytest.mark.asyncio +@pytest.fixture(scope="module") +def embedding_client(embedding_server): + return embedding_server.get_async_client() + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [EMBEDDING_MODEL_NAME], +) +async def test_single_embedding(embedding_client: openai.AsyncOpenAI, + model_name: str): + input_texts = [ + "The chef prepared a delicious meal.", + ] + + # test single embedding + embeddings = await embedding_client.embeddings.create( + model=model_name, + input=input_texts, + encoding_format="float", + ) + assert embeddings.id is not None + assert len(embeddings.data) == 1 + assert len(embeddings.data[0].embedding) == 4096 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens == 9 + assert embeddings.usage.total_tokens == 9 + + # test using token IDs + input_tokens = [1, 1, 1, 1, 1] + embeddings = await embedding_client.embeddings.create( + model=model_name, + input=input_tokens, + encoding_format="float", + ) + assert embeddings.id is not None + assert len(embeddings.data) == 1 + assert len(embeddings.data[0].embedding) == 4096 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens == 5 + assert embeddings.usage.total_tokens == 5 + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [EMBEDDING_MODEL_NAME], +) +async def test_batch_embedding(embedding_client: openai.AsyncOpenAI, + model_name: str): + # test List[str] + input_texts = [ + "The cat sat on the mat.", "A feline was resting on a rug.", + "Stars 
twinkle brightly in the night sky." + ] + embeddings = await embedding_client.embeddings.create( + model=model_name, + input=input_texts, + encoding_format="float", + ) + assert embeddings.id is not None + assert len(embeddings.data) == 3 + assert len(embeddings.data[0].embedding) == 4096 + + # test List[List[int]] + input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], + [25, 32, 64, 77]] + embeddings = await embedding_client.embeddings.create( + model=model_name, + input=input_tokens, + encoding_format="float", + ) + assert embeddings.id is not None + assert len(embeddings.data) == 4 + assert len(embeddings.data[0].embedding) == 4096 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens == 17 + assert embeddings.usage.total_tokens == 17 + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [EMBEDDING_MODEL_NAME], +) +async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI, + model_name: str): + input_texts = [ + "Hello my name is", + "The best thing about vLLM is that it supports many different models" + ] + + responses_float = await embedding_client.embeddings.create( + input=input_texts, model=model_name, encoding_format="float") + + responses_base64 = await embedding_client.embeddings.create( + input=input_texts, model=model_name, encoding_format="base64") + + decoded_responses_base64_data = [] + for data in responses_base64.data: + decoded_responses_base64_data.append( + np.frombuffer(base64.b64decode(data.embedding), + dtype="float").tolist()) + + assert responses_float.data[0].embedding == decoded_responses_base64_data[ + 0] + assert responses_float.data[1].embedding == decoded_responses_base64_data[ + 1] diff --git a/tests/entrypoints/openai/test_guided_processors.py b/tests/entrypoints/openai/test_guided_processors.py new file mode 100644 index 0000000000000..27568d3e7c26c --- /dev/null +++ b/tests/entrypoints/openai/test_guided_processors.py @@ -0,0 +1,111 @@ +# This unit test should be moved to a new +# tests/test_guided_decoding directory. 
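+#
+# The tests below treat a guided-decoding logits processor as a plain
+# callable: ``processor(generated_token_ids, logits)`` returns (or modifies in
+# place) a logits tensor of the same vocabulary-sized shape (32000 entries for
+# the zephyr tokenizer used here), re-weighting or masking entries so that
+# only continuations consistent with the regex/JSON schema stay likely. The
+# assertions only check that the shape is preserved and the values change.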
+import pytest +import torch +from transformers import AutoTokenizer + +from vllm.entrypoints.openai.protocol import CompletionRequest +from vllm.model_executor.guided_decoding import ( + get_guided_decoding_logits_processor) +from vllm.model_executor.guided_decoding.outlines_logits_processors import ( + JSONLogitsProcessor, RegexLogitsProcessor) + +TEST_SCHEMA = { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "age": { + "type": "integer" + }, + "skills": { + "type": "array", + "items": { + "type": "string", + "maxLength": 10 + }, + "minItems": 3 + }, + "work history": { + "type": "array", + "items": { + "type": "object", + "properties": { + "company": { + "type": "string" + }, + "duration": { + "type": "string" + }, + "position": { + "type": "string" + } + }, + "required": ["company", "position"] + } + } + }, + "required": ["name", "age", "skills", "work history"] +} + +TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") + + +def test_guided_logits_processors(): + """Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor.""" + tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta') + regex_LP = RegexLogitsProcessor(TEST_REGEX, tokenizer) + json_LP = JSONLogitsProcessor(TEST_SCHEMA, + tokenizer, + whitespace_pattern=None) + + token_ids = tokenizer.encode( + f"Give an example IPv4 address with this regex: {TEST_REGEX}") + tensor = torch.rand(32000) + original_tensor = torch.clone(tensor) + regex_LP(token_ids, tensor) + assert tensor.shape == original_tensor.shape + assert not torch.allclose(tensor, original_tensor) + + token_ids = tokenizer.encode( + f"Give an employee profile that fits this schema: {TEST_SCHEMA}") + tensor = torch.rand(32000) + original_tensor = torch.clone(tensor) + json_LP(token_ids, tensor) + assert tensor.shape == original_tensor.shape + assert not torch.allclose(tensor, original_tensor) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("backend", ["outlines", "lm-format-enforcer"]) +async def test_guided_logits_processor_black_box(backend: str): + tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta') + token_ids = tokenizer.encode( + f"Give an example IPv4 address with this regex: {TEST_REGEX}") + regex_request = CompletionRequest(model='test', + prompt=token_ids, + guided_regex=TEST_REGEX) + regex_lp = await get_guided_decoding_logits_processor( + backend, regex_request, tokenizer) + assert regex_lp is not None + tensor = torch.rand(32000) + original_tensor = torch.clone(tensor) + tensor = regex_lp(token_ids, tensor) + assert tensor.shape == original_tensor.shape + assert not torch.allclose(tensor, original_tensor) + + token_ids = tokenizer.encode( + f"Give an employee profile that fits this schema: {TEST_SCHEMA}") + json_request = CompletionRequest(model='test', + prompt=token_ids, + guided_json=TEST_SCHEMA) + json_lp = await get_guided_decoding_logits_processor( + backend, json_request, tokenizer) + assert json_lp is not None + tensor = torch.rand(32000) + original_tensor = torch.clone(tensor) + tensor = json_lp(token_ids, tensor) + assert tensor.shape == original_tensor.shape + assert not torch.allclose(tensor, original_tensor) diff --git a/tests/entrypoints/openai/test_models.py b/tests/entrypoints/openai/test_models.py new file mode 100644 index 0000000000000..914ef6e19e109 --- /dev/null +++ b/tests/entrypoints/openai/test_models.py @@ -0,0 +1,69 @@ +import openai # use the official client for correctness check +import pytest +# using Ray for 
overall ease of process management, parallel requests, +# and debugging. +import ray +# downloading lora to test lora requests +from huggingface_hub import snapshot_download + +from ...utils import VLLM_PATH, RemoteOpenAIServer + +# any model with a chat template should work here +MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +# technically this needs Mistral-7B-v0.1 as base, but we're not testing +# generation quality here +LORA_NAME = "typeof/zephyr-7b-beta-lora" + + +@pytest.fixture(scope="module") +def zephyr_lora_files(): + return snapshot_download(repo_id=LORA_NAME) + + +@pytest.fixture(scope="module") +def ray_ctx(): + ray.init(runtime_env={"working_dir": VLLM_PATH}) + yield + ray.shutdown() + + +@pytest.fixture(scope="module") +def server(zephyr_lora_files, ray_ctx): + return RemoteOpenAIServer([ + "--model", + MODEL_NAME, + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + # lora config below + "--enable-lora", + "--lora-modules", + f"zephyr-lora={zephyr_lora_files}", + f"zephyr-lora2={zephyr_lora_files}", + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + "--max-num-seqs", + "128", + ]) + + +@pytest.fixture(scope="module") +def client(server): + return server.get_async_client() + + +@pytest.mark.asyncio +async def test_check_models(client: openai.AsyncOpenAI): + models = await client.models.list() + models = models.data + served_model = models[0] + lora_models = models[1:] + assert served_model.id == MODEL_NAME + assert all(model.root == MODEL_NAME for model in models) + assert lora_models[0].id == "zephyr-lora" + assert lora_models[1].id == "zephyr-lora2" diff --git a/tests/entrypoints/openai/test_oot_registration.py b/tests/entrypoints/openai/test_oot_registration.py new file mode 100644 index 0000000000000..dbbda6de1fa09 --- /dev/null +++ b/tests/entrypoints/openai/test_oot_registration.py @@ -0,0 +1,66 @@ +import sys +import time + +import torch +from openai import OpenAI, OpenAIError + +from vllm import ModelRegistry +from vllm.model_executor.models.opt import OPTForCausalLM +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.utils import get_open_port + + +class MyOPTForCausalLM(OPTForCausalLM): + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + # this dummy model always predicts the first token + logits = super().compute_logits(hidden_states, sampling_metadata) + logits.zero_() + logits[:, 0] += 1.0 + return logits + + +def server_function(port): + # register our dummy model + ModelRegistry.register_model("OPTForCausalLM", MyOPTForCausalLM) + sys.argv = ["placeholder.py"] + \ + ("--model facebook/opt-125m --gpu-memory-utilization 0.10 " + f"--dtype float32 --api-key token-abc123 --port {port}").split() + import runpy + runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__') + + +def test_oot_registration_for_api_server(): + port = get_open_port() + ctx = torch.multiprocessing.get_context() + server = ctx.Process(target=server_function, args=(port, )) + server.start() + client = OpenAI( + base_url=f"http://localhost:{port}/v1", + api_key="token-abc123", + ) + while True: + try: + completion = client.chat.completions.create( + model="facebook/opt-125m", + messages=[{ + "role": "system", + "content": "You are a helpful assistant." + }, { + "role": "user", + "content": "Hello!" 
+                }],
+                temperature=0,
+            )
+            break
+        except OpenAIError as e:
+            if "Connection error" in str(e):
+                time.sleep(3)
+            else:
+                raise e
+    server.kill()
+    generated_text = completion.choices[0].message.content
+    # make sure only the first token is generated
+    rest = generated_text.replace("<s>", "")
+    assert rest == ""
diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py
new file mode 100644
index 0000000000000..5de28513ca391
--- /dev/null
+++ b/tests/entrypoints/openai/test_run_batch.py
@@ -0,0 +1,53 @@
+import subprocess
+import sys
+import tempfile
+
+from vllm.entrypoints.openai.protocol import BatchRequestOutput
+
+# ruff: noqa: E501
+INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
+{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
+
+INVALID_INPUT_BATCH = """{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
+{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
+
+
+def test_e2e():
+    with tempfile.NamedTemporaryFile(
+            "w") as input_file, tempfile.NamedTemporaryFile(
+                "r") as output_file:
+        input_file.write(INPUT_BATCH)
+        input_file.flush()
+        proc = subprocess.Popen([
+            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
+            input_file.name, "-o", output_file.name, "--model",
+            "NousResearch/Meta-Llama-3-8B-Instruct"
+        ], )
+        proc.communicate()
+        proc.wait()
+        assert proc.returncode == 0, f"{proc=}"
+
+        contents = output_file.read()
+        for line in contents.strip().split("\n"):
+            # Ensure that the output format conforms to the openai api.
+            # Validation should throw if the schema is wrong.
+            BatchRequestOutput.model_validate_json(line)
+
+
+def test_e2e_invalid_input():
+    """
+    Ensure that we fail when the input doesn't conform to the openai api.
+ """ + with tempfile.NamedTemporaryFile( + "w") as input_file, tempfile.NamedTemporaryFile( + "r") as output_file: + input_file.write(INVALID_INPUT_BATCH) + input_file.flush() + proc = subprocess.Popen([ + sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i", + input_file.name, "-o", output_file.name, "--model", + "NousResearch/Meta-Llama-3-8B-Instruct" + ], ) + proc.communicate() + proc.wait() + assert proc.returncode != 0, f"{proc=}" diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py new file mode 100644 index 0000000000000..b869717608d0f --- /dev/null +++ b/tests/entrypoints/openai/test_vision.py @@ -0,0 +1,270 @@ +from typing import Dict, List + +import openai +import pytest +import pytest_asyncio +import ray + +from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64 + +from ...utils import VLLM_PATH, RemoteOpenAIServer + +MODEL_NAME = "llava-hf/llava-1.5-7b-hf" +LLAVA_CHAT_TEMPLATE = VLLM_PATH / "examples/template_llava.jinja" +assert LLAVA_CHAT_TEMPLATE.exists() + +# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) +TEST_IMAGE_URLS = [ + "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", + "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", + "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", + "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", +] + + +@pytest.fixture(scope="module") +def ray_ctx(): + ray.init(runtime_env={"working_dir": VLLM_PATH}) + yield + ray.shutdown() + + +@pytest.fixture(scope="module") +def server(ray_ctx): + return RemoteOpenAIServer([ + "--model", + MODEL_NAME, + "--dtype", + "bfloat16", + "--max-model-len", + "4096", + "--enforce-eager", + "--chat-template", + str(LLAVA_CHAT_TEMPLATE), + ]) + + +@pytest.fixture(scope="module") +def client(server): + return server.get_async_client() + + +@pytest_asyncio.fixture(scope="session") +async def base64_encoded_image() -> Dict[str, str]: + return { + image_url: + encode_image_base64(await ImageFetchAiohttp.fetch_image(image_url)) + for image_url in TEST_IMAGE_URLS + } + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_single_chat_session_image(client: openai.AsyncOpenAI, + model_name: str, image_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "What's in this image?" 
+ }, + ], + }] + + # test single completion + chat_completion = await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=10, + logprobs=True, + top_logprobs=5) + assert len(chat_completion.choices) == 1 + + choice = chat_completion.choices[0] + assert choice.finish_reason == "length" + assert chat_completion.usage == openai.types.CompletionUsage( + completion_tokens=10, prompt_tokens=596, total_tokens=606) + + message = choice.message + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 10 + assert message.role == "assistant" + messages.append({"role": "assistant", "content": message.content}) + + # test multi-turn dialogue + messages.append({"role": "user", "content": "express your result in json"}) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + ) + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_single_chat_session_image_base64encoded( + client: openai.AsyncOpenAI, model_name: str, image_url: str, + base64_encoded_image: Dict[str, str]): + + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": + f"data:image/jpeg;base64,{base64_encoded_image[image_url]}" + } + }, + { + "type": "text", + "text": "What's in this image?" + }, + ], + }] + + # test single completion + chat_completion = await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=10, + logprobs=True, + top_logprobs=5) + assert len(chat_completion.choices) == 1 + + choice = chat_completion.choices[0] + assert choice.finish_reason == "length" + assert chat_completion.usage == openai.types.CompletionUsage( + completion_tokens=10, prompt_tokens=596, total_tokens=606) + + message = choice.message + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 10 + assert message.role == "assistant" + messages.append({"role": "assistant", "content": message.content}) + + # test multi-turn dialogue + messages.append({"role": "user", "content": "express your result in json"}) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + ) + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_chat_streaming_image(client: openai.AsyncOpenAI, + model_name: str, image_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "What's in this image?" 
+ }, + ], + }] + + # test single completion + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + ) + output = chat_completion.choices[0].message.content + stop_reason = chat_completion.choices[0].finish_reason + + # test streaming + stream = await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + stream=True, + ) + chunks: List[str] = [] + finish_reason_count = 0 + async for chunk in stream: + delta = chunk.choices[0].delta + if delta.role: + assert delta.role == "assistant" + if delta.content: + chunks.append(delta.content) + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + # finish reason should only return in last block + assert finish_reason_count == 1 + assert chunk.choices[0].finish_reason == stop_reason + assert delta.content + assert "".join(chunks) == output + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, + image_url: str): + + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "What's in this image?" + }, + ], + }] + + with pytest.raises(openai.BadRequestError): # test multi-image input + await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + ) + + # the server should still work afterwards + completion = await client.completions.create( + model=model_name, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + ) + completion = completion.choices[0].text + assert completion is not None and len(completion) >= 0 diff --git a/tests/kernels/test_flashinfer.py b/tests/kernels/test_flashinfer.py new file mode 100644 index 0000000000000..5211be6aef009 --- /dev/null +++ b/tests/kernels/test_flashinfer.py @@ -0,0 +1,248 @@ +from typing import List, Optional, Tuple + +import flashinfer +import pytest +import torch + +NUM_HEADS = [(16, 16), (32, 8), (64, 8)] +HEAD_SIZES = [128, 256] +BLOCK_SIZES = [16, 32] +DTYPES = [torch.float16, torch.bfloat16] +NUM_BLOCKS = 32768 # Large enough to test overflow in index calculation. 
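+
+# Layout notes for the tests below: the paged KV cache is allocated as
+# (NUM_BLOCKS, 2, block_size, num_kv_heads, head_size), where index 0/1 of the
+# second dimension holds keys/values, and block_tables[i] lists the block ids
+# backing sequence i. For example, kv_len=18 with block_size=16 occupies
+# ceil(18 / 16) = 2 blocks and leaves 18 % 16 = 2 tokens on the last page,
+# which is how kv_last_page_len is derived before calling begin_forward.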
+ + +def ref_paged_attn( + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + query_lens: List[int], + kv_lens: List[int], + block_tables: torch.Tensor, + scale: float, + sliding_window: Optional[int] = None, + soft_cap: Optional[float] = None, +) -> torch.Tensor: + num_seqs = len(query_lens) + block_tables = block_tables.cpu().numpy() + _, block_size, num_kv_heads, head_size = key_cache.shape + + outputs: List[torch.Tensor] = [] + start_idx = 0 + for i in range(num_seqs): + query_len = query_lens[i] + kv_len = kv_lens[i] + q = query[start_idx:start_idx + query_len] + q *= scale + + num_kv_blocks = (kv_len + block_size - 1) // block_size + block_indices = block_tables[i, :num_kv_blocks] + + k = key_cache[block_indices].view(-1, num_kv_heads, head_size) + k = k[:kv_len] + v = value_cache[block_indices].view(-1, num_kv_heads, head_size) + v = v[:kv_len] + + if q.shape[1] != k.shape[1]: + k = torch.repeat_interleave(k, q.shape[1] // k.shape[1], dim=1) + v = torch.repeat_interleave(v, q.shape[1] // v.shape[1], dim=1) + attn = torch.einsum("qhd,khd->hqk", q, k).float() + empty_mask = torch.ones(query_len, kv_len) + mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool() + if sliding_window is not None: + sliding_window_mask = torch.triu(empty_mask, + diagonal=kv_len - + (query_len + sliding_window) + + 1).bool().logical_not() + mask |= sliding_window_mask + if soft_cap is not None: + attn = soft_cap * torch.tanh(attn / soft_cap) + attn.masked_fill_(mask, float("-inf")) + attn = torch.softmax(attn, dim=-1).to(v.dtype) + out = torch.einsum("hqk,khd->qhd", attn, v) + + outputs.append(out) + start_idx += query_len + + return torch.cat(outputs, dim=0) + + +@pytest.mark.parametrize("kv_lens", [[1328, 18, 463], [1, 54, 293, 70]]) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("block_size", BLOCK_SIZES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0]) +@torch.inference_mode +def test_flashinfer_decode_with_paged_kv(kv_lens: List[int], + num_heads: Tuple[int, + int], head_size: int, + dtype: torch.dtype, block_size: int, + soft_cap: Optional[float]) -> None: + torch.set_default_device("cuda") + torch.cuda.manual_seed_all(0) + num_seqs = len(kv_lens) + num_query_heads = num_heads[0] + num_kv_heads = num_heads[1] + assert num_query_heads % num_kv_heads == 0 + max_kv_len = max(kv_lens) + scale = head_size**-0.5 + + query = torch.randn(num_seqs, num_query_heads, head_size, dtype=dtype) + key_value_cache = torch.randn(NUM_BLOCKS, + 2, + block_size, + num_kv_heads, + head_size, + dtype=dtype) + key_cache = key_value_cache[:, 0, :, :, :].squeeze(1) + value_cache = key_value_cache[:, 1, :, :, :].squeeze(1) + + max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size + block_tables = torch.randint(0, + NUM_BLOCKS, + (num_seqs, max_num_blocks_per_seq), + dtype=torch.int32) + + kv_indptr = [0] + kv_indices = [] + kv_last_page_lens = [] + for i in range(num_seqs): + seq_len = kv_lens[i] + assert seq_len > 0 + num_blocks = (seq_len + block_size - 1) // block_size + kv_indices.extend(block_tables[i, :num_blocks]) + kv_indptr.append(kv_indptr[-1] + num_blocks) + kv_last_page_len = seq_len % block_size + if kv_last_page_len == 0: + kv_last_page_len = block_size + kv_last_page_lens.append(kv_last_page_len) + + kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32) + kv_indices = torch.tensor(kv_indices, dtype=torch.int32) + 
kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32) + + workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8) + wrapper = flashinfer.\ + BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, "NHD") + wrapper.begin_forward(kv_indptr, + kv_indices, + kv_last_page_lens, + num_query_heads, + num_kv_heads, + head_size, + block_size, + "NONE", + data_type=dtype) + + output = wrapper.forward(query, key_value_cache, logits_soft_cap=soft_cap) + + ref_output = ref_paged_attn(query=query, + key_cache=key_cache, + value_cache=value_cache, + query_lens=[1] * num_seqs, + kv_lens=kv_lens, + block_tables=block_tables, + scale=scale, + soft_cap=soft_cap) + assert torch.allclose(output, ref_output, atol=1e-2, rtol=1e-2), \ + f"{torch.max(torch.abs(output - ref_output))}" + + +@pytest.mark.parametrize("seq_lens", [[(1, 1328), (5, 18), (129, 463)]]) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("block_size", BLOCK_SIZES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0]) +@torch.inference_mode +def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], + num_heads: Tuple[int, int], + head_size: int, dtype: torch.dtype, + block_size: int, + soft_cap: Optional[float]) -> None: + torch.set_default_device("cuda") + torch.cuda.manual_seed_all(0) + num_seqs = len(seq_lens) + query_lens = [x[0] for x in seq_lens] + kv_lens = [x[1] for x in seq_lens] + num_query_heads = num_heads[0] + num_kv_heads = num_heads[1] + assert num_query_heads % num_kv_heads == 0 + max_kv_len = max(kv_lens) + scale = head_size**-0.5 + + query = torch.randn(sum(query_lens), + num_query_heads, + head_size, + dtype=dtype) + key_value_cache = torch.randn(NUM_BLOCKS, + 2, + block_size, + num_kv_heads, + head_size, + dtype=dtype) + key_cache = key_value_cache[:, 0, :, :, :].squeeze(1) + value_cache = key_value_cache[:, 1, :, :, :].squeeze(1) + + # Normalize the scale of the key and value caches to mitigate + # numerical instability. 
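+    # Dividing K and V by sqrt(head_size) keeps the attention logits (and
+    # hence the outputs) small, reducing fp16/bf16 rounding error so the
+    # kernel output stays within the atol/rtol=1e-2 tolerance of the
+    # reference implementation checked at the end of this test.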
+ key_cache /= head_size**0.5 + value_cache /= head_size**0.5 + + max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size + block_tables = torch.randint(0, + NUM_BLOCKS, + (num_seqs, max_num_blocks_per_seq), + dtype=torch.int32) + + qo_indptr = [0] + kv_indptr = [0] + kv_indices = [] + kv_last_page_lens = [] + for i in range(num_seqs): + seq_len = kv_lens[i] + assert seq_len > 0 + num_blocks = (seq_len + block_size - 1) // block_size + kv_indices.extend(block_tables[i, :num_blocks]) + kv_indptr.append(kv_indptr[-1] + num_blocks) + kv_last_page_len = seq_len % block_size + if kv_last_page_len == 0: + kv_last_page_len = block_size + kv_last_page_lens.append(kv_last_page_len) + qo_indptr.append(qo_indptr[-1] + query_lens[i]) + + qo_indptr = torch.tensor(qo_indptr, dtype=torch.int32) + kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32) + kv_indices = torch.tensor(kv_indices, dtype=torch.int32) + kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32) + + workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8) + wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( + workspace_buffer, "NHD") + wrapper.begin_forward( + qo_indptr, + kv_indptr, + kv_indices, + kv_last_page_lens, + num_query_heads, + num_kv_heads, + head_size, + block_size, + ) + + output = wrapper.forward( + query, + key_value_cache, + logits_soft_cap=soft_cap, + ) + + ref_output = ref_paged_attn(query=query, + key_cache=key_cache, + value_cache=value_cache, + query_lens=query_lens, + kv_lens=kv_lens, + block_tables=block_tables, + scale=scale, + soft_cap=soft_cap) + assert torch.allclose(output, ref_output, atol=1e-2, rtol=1e-2), \ + f"{torch.max(torch.abs(output - ref_output))}" diff --git a/tests/models/test_compressed_tensors.py b/tests/models/test_compressed_tensors.py new file mode 100644 index 0000000000000..9a0054c5aff53 --- /dev/null +++ b/tests/models/test_compressed_tensors.py @@ -0,0 +1,49 @@ +"""Compares vllm vs sparseml for compressed-tensors + +Note: vllm and sparseml do not have bitwise correctness, +so in this test, we just confirm that the top selected +tokens of the are in the top 5 selections of each other. +""" + +import pytest + +from tests.quantization.utils import is_quant_method_supported + +from .utils import check_logprobs_close + +MODELS = [ + "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test", +] + +MAX_TOKENS = 32 +NUM_LOGPROBS = 5 + + +@pytest.mark.skipif( + not is_quant_method_supported("compressed-tensors"), + reason="compressed-tensors is not supported on this machine type.") +@pytest.mark.parametrize("model_name", MODELS) +def test_models( + vllm_runner, + hf_runner, + example_prompts, + model_name, +) -> None: + # Run sparseml. + with hf_runner(model_name=model_name, + is_sparseml_model=True) as sparseml_model: + + sparseml_outputs = sparseml_model.generate_greedy_logprobs_limit( + example_prompts, MAX_TOKENS, NUM_LOGPROBS) + + # Run vllm. 
+ with vllm_runner(model_name=model_name) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, MAX_TOKENS, NUM_LOGPROBS) + + check_logprobs_close( + outputs_0_lst=sparseml_outputs, + outputs_1_lst=vllm_outputs, + name_0="sparseml", + name_1="vllm", + ) diff --git a/tests/models/test_jamba.py b/tests/models/test_jamba.py new file mode 100644 index 0000000000000..d7e3a2fc4a71b --- /dev/null +++ b/tests/models/test_jamba.py @@ -0,0 +1,65 @@ +import pytest + +MODELS = ["ai21labs/Jamba-tiny-random"] + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("max_tokens", [20]) +def test_models( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, +) -> None: + # To pass the small model tests, we need full precision. + assert dtype == "float" + + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + + for i in range(len(example_prompts)): + hf_output_ids, hf_output_str = hf_outputs[i] + vllm_output_ids, vllm_output_str = vllm_outputs[i] + assert hf_output_str == vllm_output_str, ( + f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") + assert hf_output_ids == vllm_output_ids, ( + f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +def test_state_cleanup( + vllm_runner, + model: str, + dtype: str, + example_prompts, +) -> None: + # This test is for verifying that the Jamba state is cleaned up between + # steps, If its not cleaned, an error would be expected. + try: + with vllm_runner(model, dtype=dtype) as vllm_model: + for _ in range(10): + vllm_model.generate_greedy([example_prompts[0]] * 100, 1) + except ValueError: + pytest.fail("Jamba inner state wasn't cleaned up between states, " + "could be related to finished_requests_ids") + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +def test_model_print( + vllm_runner, + model: str, + dtype: str, +) -> None: + with vllm_runner(model, dtype=dtype) as vllm_model: + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. 
+ model_runner.model) diff --git a/tests/models/test_paligemma.py b/tests/models/test_paligemma.py new file mode 100644 index 0000000000000..2b1d3c5b43b44 --- /dev/null +++ b/tests/models/test_paligemma.py @@ -0,0 +1,147 @@ +from typing import List, Optional, Tuple, Type + +import pytest +from transformers import AutoTokenizer + +from vllm.multimodal.utils import rescale_image_size +from vllm.sequence import SampleLogprobs + +from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets +from .utils import check_logprobs_close + +pytestmark = pytest.mark.vlm + +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": "caption es", + "cherry_blossom": "What is in the picture?", + "boardwalk": "What is in the picture?", +}) + +IMAGE_TOKEN_ID = 257152 + +models = ["google/paligemma-3b-mix-224"] + + +def vllm_to_hf_output(vllm_output: Tuple[List[int], str, + Optional[SampleLogprobs]], + model: str): + """Sanitize vllm output to be comparable with hf output.""" + output_ids, output_str, out_logprobs = vllm_output + + tokenizer = AutoTokenizer.from_pretrained(model) + eos_token_id = tokenizer.eos_token_id + + hf_output_ids = [ + token_id for idx, token_id in enumerate(output_ids) + if token_id != IMAGE_TOKEN_ID or output_ids[idx - 1] != IMAGE_TOKEN_ID + ] + + hf_output_str = output_str + + if hf_output_ids[-1] == eos_token_id: + hf_output_str = hf_output_str + tokenizer.decode(eos_token_id) + + return hf_output_ids, hf_output_str, out_logprobs + + +def run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + image_assets: _ImageAssets, + model: str, + *, + size_factors: List[float], + dtype: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): + """Inference result should be the same between hf and vllm. + + All the image fixtures for the test is under tests/images. + For huggingface runner, we provide the PIL images as input. + For vllm runner, we provide MultiModalDataDict objects + and corresponding vision language config as input. + Note, the text input is also adjusted to abide by vllm contract. + The text output is sanitized to be able to compare with hf. + """ + images = [asset.pil_image for asset in image_assets] + + inputs_per_image = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default method). 
+ + # max_model_len should be greater than image_feature_size + with vllm_runner(model, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True) as vllm_model: + vllm_outputs_per_image = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs_per_image + ] + + with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model: + hf_outputs_per_image = [ + hf_model.generate_greedy_logprobs_limit(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs_per_image + ] + + for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, + vllm_outputs_per_image): + + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=[ + vllm_to_hf_output(vllm_output, model) + for vllm_output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, + dtype: str, max_tokens: int, num_logprobs: int) -> None: + run_test( + hf_runner, + vllm_runner, + image_assets, + model, + size_factors=size_factors, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py new file mode 100644 index 0000000000000..321566ad53a50 --- /dev/null +++ b/tests/multimodal/test_mapper.py @@ -0,0 +1,85 @@ +import numpy as np +import pytest +from transformers import CLIPImageProcessor, LlavaNextImageProcessor + +from vllm.config import ModelConfig +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.utils import rescale_image_size + + +@pytest.mark.parametrize("dtype", ["half", "float"]) +@pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0]) +def test_clip_image_processor(image_assets, dtype, size_factor): + MODEL_NAME = "llava-hf/llava-1.5-7b-hf" + + hf_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME) + assert isinstance(hf_processor, CLIPImageProcessor) + + model_config = ModelConfig( + model=MODEL_NAME, + tokenizer=MODEL_NAME, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype=dtype, + revision=None, + ) + + for asset in image_assets: + image = rescale_image_size(asset.pil_image, size_factor) + + hf_result = hf_processor.preprocess( + image, + return_tensors="pt", + ) + vllm_result = MULTIMODAL_REGISTRY.map_input( + model_config, + {"image": image}, + ) + + assert hf_result.keys() == vllm_result.keys() + for key, hf_tensor in hf_result.items(): + hf_arr: np.ndarray = hf_tensor.numpy() + vllm_arr: np.ndarray = vllm_result[key].numpy() + + assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}" + assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}" + + +@pytest.mark.parametrize("dtype", ["half", "float"]) +@pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0]) +def test_llava_next_image_processor(image_assets, dtype, size_factor): + MODEL_NAME = "llava-hf/llava-v1.6-vicuna-7b-hf" + + hf_processor = LlavaNextImageProcessor.from_pretrained(MODEL_NAME) + assert isinstance(hf_processor, 
LlavaNextImageProcessor) + + model_config = ModelConfig( + model=MODEL_NAME, + tokenizer=MODEL_NAME, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype=dtype, + revision=None, + ) + + for asset in image_assets: + image = rescale_image_size(asset.pil_image, size_factor) + + hf_result = hf_processor.preprocess( + image, + return_tensors="pt", + ) + vllm_result = MULTIMODAL_REGISTRY.map_input( + model_config, + {"image": image}, + ) + + assert hf_result.keys() == vllm_result.keys() + for key, hf_tensor in hf_result.items(): + hf_arr: np.ndarray = hf_tensor.numpy() + vllm_arr: np.ndarray = vllm_result[key].numpy() + + assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}" + assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}" diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py new file mode 100644 index 0000000000000..dd9a016807df9 --- /dev/null +++ b/tests/quantization/test_lm_head.py @@ -0,0 +1,45 @@ +"""Tests whether gptq models with quantized lm_head can be loaded. + +Run `pytest tests/quantization/test_quant_lm_head_true.py --forked`. +""" +from typing import Tuple + +import pytest +import torch + +from vllm.model_executor.layers.linear import UnquantizedLinearMethod +from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod +from vllm.model_executor.layers.quantization.gptq_marlin import ( + GPTQMarlinLinearMethod) +from vllm.model_executor.layers.quantization.marlin import MarlinLinearMethod + +PROMPT = "On the surface of Mars, we found" + +MODELS_QUANT = [( + "LnL-AI/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse", + True), ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", False), + ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", False)] + + +@pytest.mark.parametrize("model_lm_head_quant", MODELS_QUANT) +def test_lm_head( + vllm_runner, + model_lm_head_quant: Tuple[str, bool], +) -> None: + model, lm_head_quantized = model_lm_head_quant + vllm_model = vllm_runner(model, dtype=torch.float16, max_model_len=2048) + + lm_head_layer = (vllm_model.model.llm_engine.model_executor.driver_worker. + model_runner.model.lm_head) + + if lm_head_quantized: + assert isinstance( + lm_head_layer.linear_method, + (GPTQLinearMethod, GPTQMarlinLinearMethod, MarlinLinearMethod)) + else: + assert isinstance(lm_head_layer.linear_method, UnquantizedLinearMethod) + + print( + vllm_model.generate_greedy(prompts=["Hello my name is"], + max_tokens=10)[0][1]) + del vllm_model diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py new file mode 100644 index 0000000000000..859d4234c458f --- /dev/null +++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py @@ -0,0 +1,123 @@ +"""Tests which cover integration of the speculative decoding framework with +tensor parallelism. +""" + +import pytest +import torch + +from vllm.utils import is_hip + +from .conftest import run_greedy_equality_correctness_test + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "model": "JackFram/llama-68m", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True, + "tensor_parallel_size": 2, + + # Use AsyncLLM engine, so that the engine runs in its own process. + # Otherwise, since vLLM does not follow true SPMD, the test runner + # process will have both the engine and the rank0 worker. 
NCCL is not + # cleaned up properly, and its server host thread leaks, causing the + # second run of the test to fail with internal NCCL error. + "use_async": True, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 3, + }, + { + "speculative_model": "[ngram]", + "num_speculative_tokens": 5, + "ngram_prompt_lookup_max": 3, + }, +]) +@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize( + "output_len", + [ + # Use smaller output len for fast test. + 32, + ]) +@pytest.mark.parametrize("seed", [1]) +def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator, + batch_size: int, output_len: int): + """Verify greedy equality when tensor parallelism is used. + """ + if is_hip(): + pytest.skip("hip is not well-supported yet") + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True, + "tensor_parallel_size": 2, + + # Use AsyncLLM engine, so that the engine runs in its own process. + # Otherwise, since vLLM does not follow true SPMD, the test runner + # process will have both the engine and the rank0 worker. NCCL is not + # cleaned up properly, and its server host thread leaks, causing the + # second run of the test to fail with internal NCCL error. + "use_async": True, + }]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize( + "per_test_common_llm_kwargs, test_llm_kwargs", + [ + ( + { + # Use a small model for a fast test. + # Note this is repeated in the test body; to initialize a + # tokenizer. + "model": "JackFram/llama-68m", + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "speculative_draft_tensor_parallel_size": 1, + }), + ({ + "model": "ibm-granite/granite-3b-code-instruct", + }, { + "speculative_model": + "ibm-granite/granite-3b-code-instruct-accelerator", + "num_speculative_tokens": 5, + "speculative_draft_tensor_parallel_size": 1, + }) + ]) +@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize("seed", [1]) +def test_draft_model_tp_lt_target_model_tp2(test_llm_generator, + baseline_llm_generator, + batch_size: int): + """Verify spec decode works well with smaller tp for draft models. + """ + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=32, + force_output_len=True) diff --git a/tests/spec_decode/e2e/test_integration_dist_tp4.py b/tests/spec_decode/e2e/test_integration_dist_tp4.py new file mode 100644 index 0000000000000..56cb0147d9e4f --- /dev/null +++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py @@ -0,0 +1,60 @@ +"""Tests which cover integration of the speculative decoding framework with +tensor parallelism. +""" + +import pytest +import torch + +from .conftest import run_greedy_equality_correctness_test + + +@pytest.mark.skipif(torch.cuda.device_count() < 4, + reason="Need at least 4 GPUs to run the test.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Use a small model for a fast test. 
+ # Note this is repeated in the test body; to initialize a tokenizer. + "model": "JackFram/llama-68m", + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True, + "tensor_parallel_size": 4, + + # Use AsyncLLM engine, so that the engine runs in its own process. + # Otherwise, since vLLM does not follow true SPMD, the test runner + # process will have both the engine and the rank0 worker. NCCL is not + # cleaned up properly, and its server host thread leaks, causing the + # second run of the test to fail with internal NCCL error. + "use_async": True, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [ + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + }, +]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize( + "test_llm_kwargs", + [ + #TODO(wooyeon): add spec_draft_dp=2 case + { + "speculative_draft_tensor_parallel_size": 1, + }, + ]) +@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize("seed", [1]) +def test_draft_model_tp_lt_target_model_tp4(test_llm_generator, + baseline_llm_generator, + batch_size: int): + """Verify spec decode works well with smaller tp for draft models. + """ + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=32, + force_output_len=True) diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py new file mode 100644 index 0000000000000..dd67a7735a647 --- /dev/null +++ b/tests/spec_decode/e2e/test_mlp_correctness.py @@ -0,0 +1,216 @@ +"""This docstring details important information on the testing methodology. + +Most of the tests rely on "greedy equality", where we expect the output of +speculative decoding on a sequence to exactly match the output of normal non- +speculative decoding. + +Since speculative decoding with rejection sampling guarantees that the output +distribution matches the target model's output distribution (up to hardware +numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy +equality. + +However, we still need to verify below scenario could be passed: + * Batch size 1 greedy equality + * Batch size >1 greedy equality + * Test greedy equality under preemption + * Test greedy equality under various number of speculative tokens. + +With those tests, we can say at least, MLPSpeculator would not break the +correctess for the target model outputs. +""" + +import pytest + +from .conftest import run_greedy_equality_correctness_test + +# main model +MAIN_MODEL = "ibm-granite/granite-3b-code-instruct" + +# speculative model +SPEC_MODEL = "ibm-granite/granite-3b-code-instruct-accelerator" + +# max. number of speculative tokens: this corresponds to +# n_predict in the config.json of the speculator model. +MAX_SPEC_TOKENS = 5 + +# precision +PRECISION = "float32" + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True, + + # Print spec metrics. 
+ "disable_log_stats": False, + + # Precision + "dtype": PRECISION, + + # Main model + "model": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_model": SPEC_MODEL, + }, +]) +@pytest.mark.parametrize("output_len", [ + 128, +]) +@pytest.mark.parametrize("batch_size", [1, 32]) +@pytest.mark.parametrize("seed", [1]) +def test_mlp_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator, + batch_size: int, output_len: int): + """Verify greedy equality with different batch size.""" + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "block_size": 8, + # 2 for small prompt, 256//8 for generated. + "num_gpu_blocks_override": 2 + 256 // 8, + "max_model_len": (2 + 256 // 8) * 8, + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True, + + # Precision + "dtype": PRECISION, + + # Main model + "model": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_model": SPEC_MODEL, + }, +]) +@pytest.mark.parametrize( + "output_len", + [ + # Use small output len for fast test. + 128, + ]) +@pytest.mark.parametrize("batch_size", [4]) +@pytest.mark.parametrize("seed", [1]) +def test_mlp_e2e_greedy_correctness_with_preemption(baseline_llm_generator, + test_llm_generator, + batch_size: int, + output_len: int): + """Verify greedy equality, even when some sequences are preempted mid- + generation. + """ + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. + "use_v2_block_manager": True, + + # Precision + "dtype": PRECISION, + + # Main model + "model": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize( + "test_llm_kwargs", + [ + { + "speculative_model": SPEC_MODEL, + "num_speculative_tokens": k, + } + # Try a range of num. speculative tokens + for k in range(1, 1 + MAX_SPEC_TOKENS) + ]) +@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize( + "output_len", + [ + # Use smaller output len for fast test. + 32, + ]) +@pytest.mark.parametrize("seed", [1]) +def test_mlp_different_k(baseline_llm_generator, test_llm_generator, + batch_size: int, output_len: int): + """Verify that mlp speculative decoding produces exact equality + to without spec decode with different values of num_speculative_tokens. + """ + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Required for spec decode. 
+ "use_v2_block_manager": True, + + # Precision + "dtype": PRECISION, + + # Main model + "model": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", + [{ + "speculative_model": SPEC_MODEL, + "speculative_disable_by_batch_size": 4 + }]) +@pytest.mark.parametrize("batch_size", [1, 5]) +@pytest.mark.parametrize( + "output_len", + [ + # Use smaller output len for fast test. + 32, + ]) +@pytest.mark.parametrize("seed", [1]) +def test_mlp_disable_queue(baseline_llm_generator, test_llm_generator, + batch_size: int, output_len: int): + """Verify that mlp speculative decoding produces exact equality + to without spec decode when speculation is disabled for large + batch sizes. + """ + run_greedy_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + force_output_len=True) diff --git a/tests/tokenization/test_get_eos.py b/tests/tokenization/test_get_eos.py new file mode 100644 index 0000000000000..875ca19d3b4b7 --- /dev/null +++ b/tests/tokenization/test_get_eos.py @@ -0,0 +1,31 @@ +""" +This test file includes some cases where it is inappropriate to +only get the `eos_token_id` from the tokenizer as defined by +:meth:`vllm.LLMEngine._get_eos_token_id`. +""" +from vllm.transformers_utils.config import try_get_generation_config +from vllm.transformers_utils.tokenizer import get_tokenizer + + +def test_get_llama3_eos_token(): + model_name = "meta-llama/Meta-Llama-3-8B-Instruct" + + tokenizer = get_tokenizer(model_name) + assert tokenizer.eos_token_id == 128009 + + generation_config = try_get_generation_config(model_name, + trust_remote_code=False) + assert generation_config is not None + assert generation_config.eos_token_id == [128001, 128009] + + +def test_get_blip2_eos_token(): + model_name = "Salesforce/blip2-opt-2.7b" + + tokenizer = get_tokenizer(model_name) + assert tokenizer.eos_token_id == 2 + + generation_config = try_get_generation_config(model_name, + trust_remote_code=False) + assert generation_config is not None + assert generation_config.eos_token_id == 50118 diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py new file mode 100644 index 0000000000000..ae818ee360f19 --- /dev/null +++ b/tests/worker/test_model_input.py @@ -0,0 +1,152 @@ +import dataclasses +from typing import List, Tuple, Type + +import torch + +from vllm.attention import AttentionMetadata +from vllm.attention.backends.abstract import AttentionBackend +from vllm.model_executor import SamplingMetadata +from vllm.model_executor.pooling_metadata import PoolingMetadata +from vllm.worker.embedding_model_runner import ( + ModelInputForGPUWithPoolingMetadata) +from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata + + +class MockAttentionBackend(AttentionBackend): + + @staticmethod + def get_name() -> str: + raise NotImplementedError + + @staticmethod + def get_impl_cls(): + raise NotImplementedError + + @staticmethod + def get_metadata_cls() -> Type["AttentionMetadata"]: + return AttentionMetadata + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + raise NotImplementedError + + @staticmethod + def swap_blocks( + src_kv_cache: torch.Tensor, + dst_kv_cache: torch.Tensor, + src_to_dst: torch.Tensor, + ) -> None: + pass + + @staticmethod + def copy_blocks( + kv_caches: List[torch.Tensor], + src_to_dists: 
torch.Tensor, + ) -> None: + pass + + +def test_model_runner_input(): + sampling_metadata = SamplingMetadata( + ["seq_group"], + "selected_token_indices", + "categorized_sample_indices", + "num_prompts", + ) + attn_metadata = AttentionMetadata( + num_prefills=1, + num_prefill_tokens=2, + num_decode_tokens=3, + slot_mapping=torch.zeros(1), + ) + model_input = ModelInputForGPUWithSamplingMetadata( + input_tokens=torch.ones(10), + input_positions=torch.ones(10), + sampling_metadata=sampling_metadata, + attn_metadata=attn_metadata) + + assert isinstance(model_input, ModelInputForGPUWithSamplingMetadata) + + # Test round trip serialization. + tensor_dict = model_input.as_broadcastable_tensor_dict() + attn_backend = MockAttentionBackend() + received_model_input = ( + ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict( + tensor_dict, attn_backend=attn_backend)) + # Check that received copy has correct values. + assert isinstance(received_model_input, + ModelInputForGPUWithSamplingMetadata) + assert received_model_input.input_tokens is not None + assert ( + received_model_input.input_tokens == model_input.input_tokens).all() + assert received_model_input.input_positions is not None + assert (received_model_input.input_positions == model_input.input_positions + ).all() + assert received_model_input.multi_modal_kwargs is None + assert (received_model_input.multi_modal_kwargs == + model_input.multi_modal_kwargs) + assert received_model_input.lora_requests is None + assert received_model_input.lora_requests == model_input.lora_requests + assert received_model_input.lora_mapping is None + assert received_model_input.lora_mapping == model_input.lora_mapping + for field in dataclasses.fields(AttentionMetadata): + assert getattr(received_model_input.attn_metadata, field.name, + None) == getattr(attn_metadata, field.name, None) + # For sampling metadata, only selected_token_indices is copied. + assert (received_model_input.sampling_metadata.selected_token_indices == + sampling_metadata.selected_token_indices) + assert received_model_input.sampling_metadata.seq_groups is None + + +def test_embedding_model_runner_input(): + pooling_metadata = PoolingMetadata( + seq_groups=[[0]], + seq_data={}, + prompt_lens=[1], + ) + attn_metadata = AttentionMetadata( + num_prefills=1, + num_prefill_tokens=2, + num_decode_tokens=3, + slot_mapping=torch.zeros(1), + ) + model_input = ModelInputForGPUWithPoolingMetadata( + input_tokens=torch.ones(10), + input_positions=torch.ones(10), + pooling_metadata=pooling_metadata, + attn_metadata=attn_metadata) + + assert isinstance(model_input, ModelInputForGPUWithPoolingMetadata) + + # Test round trip serialization. + tensor_dict = model_input.as_broadcastable_tensor_dict() + attn_backend = MockAttentionBackend() + received_model_input = ( + ModelInputForGPUWithPoolingMetadata.from_broadcasted_tensor_dict( + tensor_dict, attn_backend=attn_backend)) + # Check that received copy has correct values. 
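+    # (The LoRA and multi-modal fields were never populated on model_input,
+    # so the received copy is expected to report them as None as well.)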
+ assert isinstance(received_model_input, + ModelInputForGPUWithPoolingMetadata) + assert received_model_input.input_tokens is not None + assert ( + received_model_input.input_tokens == model_input.input_tokens).all() + assert received_model_input.input_positions is not None + assert (received_model_input.input_positions == model_input.input_positions + ).all() + assert received_model_input.multi_modal_kwargs is None + assert (received_model_input.multi_modal_kwargs == + model_input.multi_modal_kwargs) + assert received_model_input.lora_requests is None + assert received_model_input.lora_requests == model_input.lora_requests + assert received_model_input.lora_mapping is None + assert received_model_input.lora_mapping == model_input.lora_mapping + for field in dataclasses.fields(AttentionMetadata): + assert getattr(received_model_input.attn_metadata, field.name, + None) == getattr(attn_metadata, field.name, None) + # Pooling metadata is not broadcast. + assert received_model_input.pooling_metadata is None diff --git a/vllm/attention/backends/openvino.py b/vllm/attention/backends/openvino.py new file mode 100644 index 0000000000000..0f21b50ad4dc7 --- /dev/null +++ b/vllm/attention/backends/openvino.py @@ -0,0 +1,101 @@ +from dataclasses import dataclass +from typing import List, Tuple + +import openvino as ov +import torch + +from vllm.attention.backends.abstract import (AttentionBackend, + AttentionMetadata) + + +class OpenVINOAttentionBackend(AttentionBackend): + + @staticmethod + def get_name() -> str: + return "openvino" + + @staticmethod + def get_impl_cls(): + # OpenVINO implements PagedAttention as part of the Optimum + # exported model + raise NotImplementedError + + @staticmethod + def make_metadata(*args, **kwargs) -> "AttentionMetadata": + raise NotImplementedError + + @staticmethod + def make_openvino_metadata(*args, **kwargs) -> "OpenVINOAttentionMetadata": + return OpenVINOAttentionMetadata(*args, **kwargs) + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + return (2, num_blocks, num_kv_heads, block_size, head_size) + + @staticmethod + def swap_blocks( + src_kv_cache: ov.Tensor, + dst_kv_cache: ov.Tensor, + src_to_dst: torch.Tensor, + ) -> None: + # OpenVINO currently supports only CPU, which does not require + # swap of KV cache blocks + raise NotImplementedError + + @staticmethod + def copy_blocks( + kv_caches: List[Tuple[ov.Tensor, ov.Tensor]], + src_to_dists: List[Tuple[int, int]], + ) -> None: + for src, dst in src_to_dists: + for key_cache, value_cache in kv_caches: + key_cache.data[dst, :] = key_cache.data[src, :] + value_cache.data[dst, :] = value_cache.data[src, :] + + +@dataclass +class OpenVINOAttentionMetadata: + """Metadata for OpenVINOAttentionBackend. 
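+
+    Illustrative example (block ids are arbitrary; assumes BLOCK_SIZE = 32):
+    a batch of two sequences, where sequence 0 already has 3 tokens in the
+    KV cache plus 2 newly scheduled tokens, and sequence 1 is a fresh
+    4-token prompt, would be described as
+    - past_lens = [3, 0]
+    - subsequence_begins = [0, 2, 6]
+    - block_indices = [7, 9] (one block per sequence)
+    - block_indices_begins = [0, 1, 2]
+    - max_context_len = 5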
+ + Basic terms used below: + - batch_size_in_sequences - total number of sequences to execute​ + - prompt_lens – per sequence size number of scheduled tokens​ + - batch_size_in_tokens = sum(prompt_lens)​ + - max_context_len = max(context_lens)​ + - max_num_blocks = div_up(max_context_len / BLOCK_SIZE)​ + - num_blocks – total number of blocks in block_indices​ + """ + + # Describes past KV cache size for each sequence within a batch + # Shape: [batch_size_in_sequences] + # Type: i32​ + past_lens: torch.Tensor + + # Describes start indices of input / speculative tokens from + # current sequences within a batch sequence​ + # Shape: [batch_size_in_sequences + 1]​ + # Type: i32 + subsequence_begins: torch.Tensor + + # Describes block tables for each sequence within a batch​ - + # indices along 0th dimension in key_cache and value_cache inputs​ + # Shape: [num_blocks] + # Type: i32​ + block_indices: torch.Tensor + + # Describes block tables for each sequence within a batch​ - + # for i-th element, it is an index in block_indices with the + # first block belonging to i-th sequence​ + # Shape: [batch_size_in_sequences + 1] + # Type: i32​ + block_indices_begins: torch.Tensor + + # Describes max context length + # Shape: scalar + # Type: i32 + max_context_len: torch.Tensor diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py new file mode 100644 index 0000000000000..bea205882d9d8 --- /dev/null +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -0,0 +1,295 @@ +import pickle +import time +from contextlib import contextmanager +from multiprocessing import shared_memory +from typing import Optional +from unittest.mock import patch + +import torch +import torch.distributed as dist +from torch.distributed import ProcessGroup + +import vllm.envs as envs +from vllm.logger import init_logger + +VLLM_RINGBUFFER_WARNING_INTERVAL = envs.VLLM_RINGBUFFER_WARNING_INTERVAL + +# time to wait if the queue is full or empty +# if we sleep for too short, it will consume too much CPU +# if we sleep for too long, it will slow down the writer/reader +# 0.1 us is a good balance +RINGBUFFER_SLEEP_INTERVAL = 1e-7 + +logger = init_logger(__name__) + + +class ShmRingBuffer: + + def __init__(self, + n_reader: int, + max_chunk_bytes: int, + max_chunks: int, + name: Optional[str] = None): + """ + A shared memory ring buffer implementation for broadcast communication. + Essentially, it is a queue where only one will `enqueue` and multiple + will `dequeue`. The max size of each item, together with the max number + of items that can be stored in the buffer are known in advance. + In this case, we don't need to synchronize the access to + the buffer. + + Buffer memory layout: + data metadata + | | + | (current_idx) | (current_idx) + v v + +-------------------------------+----------------------------------------+ + | chunk0 | chunk1 | ... | chunk | metadata0 | metadata1 | ... | metadata | + +-------------------------------+----------------------------------------+ + | max_chunks x max_chunk_bytes | max_chunks x (1 + n_reader) bytes | + + metadata memory layout: each byte is a flag, the first byte is the written + flag, and the rest are reader flags. The flags are set to 0 by default. + +--------------+--------------+--------------+-----+--------------+ + | written_flag | reader0_flag | reader1_flag | ... 
| readerN_flag | + +--------------+--------------+--------------+-----+--------------+ + + The state of metadata is as follows: + + (case 1) 0???...???: the block is not written yet, cannot read, can write + (case 2) 1000...000: the block is just written, can read, cannot write + (case 3) 1???...???: the block is written and read by some readers, can read if not read, cannot write + (case 4) 1111...111: the block is written and read by all readers, cannot read, can write + + State transition for readers: + + When a reader finds a block that it can read (case 2 or 3), it can yield the block for caller to read. + Only after the caller finishes reading the block, the reader can mark the block as read. + Readers only mark the block as read (from 0 to 1), the writer marks the block as ready to read (from 1 to 0). + + State transition for writer: + + When the writer writes to a block (case 1 or 4), it first resets the written flag to 0, converting either case + to case 1. Then it can yield the block for caller to write. After the caller finishes writing the block, the writer + can reset the reader flags to 0, and mark the block as written (from 0 to 1). + NOTE: the order is important here, first reset the reader flags (so that we are still in case 1), then mark the block as written. The state transition is atomic. If we do it in the reverse order, it will go through case 3 and then back to case 2, and readers might read the intermediate case 3, which is not correct. + + During creation, `name` is None and the buffer is created. We can pass the + created object to other processes by pickling it. The other processes will + get the name of the shared memory and open it, so that they can access the + same shared memory buffer. + """# noqa + self.n_reader = n_reader + self.metadata_size = 1 + n_reader + self.max_chunk_bytes = max_chunk_bytes + self.max_chunks = max_chunks + self.total_bytes_of_buffer = (self.max_chunk_bytes + + self.metadata_size) * self.max_chunks + self.data_offset = 0 + self.metadata_offset = self.max_chunk_bytes * self.max_chunks + + if name is None: + # we are creating a buffer + self.is_creator = True + self.shared_memory = shared_memory.SharedMemory( + create=True, size=self.total_bytes_of_buffer) + # initialize the metadata section to 0 + with memoryview(self.shared_memory.buf[self.metadata_offset:] + ) as metadata_buffer: + torch.frombuffer(metadata_buffer, dtype=torch.uint8).fill_(0) + else: + # we are opening an existing buffer + self.is_creator = False + # fix to https://stackoverflow.com/q/62748654/9191338 + # Python incorrectly tracks shared memory even if it is not + # created by the process. The following patch is a workaround. 
+ with patch("multiprocessing.resource_tracker.register", + lambda *args, **kwargs: None): + self.shared_memory = shared_memory.SharedMemory(name=name) + assert self.shared_memory.size == self.total_bytes_of_buffer + + def __reduce__(self): + return ( + self.__class__, + (self.n_reader, self.max_chunk_bytes, self.max_chunks, + self.shared_memory.name), + ) + + def __del__(self): + self.shared_memory.close() + if self.is_creator: + self.shared_memory.unlink() + + @contextmanager + def get_data(self, current_idx: int): + start = self.data_offset + current_idx * self.max_chunk_bytes + end = start + self.max_chunk_bytes + with memoryview(self.shared_memory.buf[start:end]) as buf: + yield buf + + @contextmanager + def get_metadata(self, current_idx: int): + start = self.metadata_offset + current_idx * self.metadata_size + end = start + self.metadata_size + with memoryview(self.shared_memory.buf[start:end]) as buf: + yield buf + + +class ShmRingBufferIO: + + def __init__(self, buffer: ShmRingBuffer, reader_rank: int): + self.buffer = buffer + self.reader_rank = reader_rank + self._is_writer = self.reader_rank == -1 + self._is_reader = not self._is_writer + if self._is_reader: + assert 0 <= self.reader_rank < buffer.n_reader, \ + (f"Invalid reader rank {self.reader_rank} for buffer" + f" created with {buffer.n_reader} readers") + self.current_idx = 0 + + @contextmanager + def acquire_write(self): + assert self._is_writer, "Only writers can acquire write" + start_time = time.monotonic() + n_warning = 1 + while True: + with self.buffer.get_metadata(self.current_idx) as metadata_buffer: + read_count = sum(metadata_buffer[1:]) + written_flag = metadata_buffer[0] + if written_flag and read_count != self.buffer.n_reader: + # this block is written and not read by all readers + # for writers, `self.current_idx` is the next block to write + # if this block is not ready to write, + # we need to wait until it is read by all readers + + # wait for a while + time.sleep(RINGBUFFER_SLEEP_INTERVAL) + + # if we wait for a long time, we should warn the user + if time.monotonic( + ) - start_time > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning: # noqa + logger.warning( + "No available block found in %s second. 
", + VLLM_RINGBUFFER_WARNING_INTERVAL) + n_warning += 1 + + continue + # found a block that is either + # (1) not written + # (2) read by all readers + + # mark the block as not written + metadata_buffer[0] = 0 + # let caller write to the buffer + with self.buffer.get_data(self.current_idx) as buf: + yield buf + + # caller has written to the buffer + # NOTE: order is important here + # first set the read flags to 0 + # then set the written flag to 1 + # otherwise, the readers may think they already read the block + for i in range(1, self.buffer.n_reader + 1): + # set read flag to 0, meaning it is not read yet + metadata_buffer[i] = 0 + # mark the block as written + metadata_buffer[0] = 1 + self.current_idx = (self.current_idx + + 1) % self.buffer.max_chunks + break + + @contextmanager + def acquire_read(self): + assert self._is_reader, "Only readers can acquire read" + start_time = time.monotonic() + n_warning = 1 + while True: + with self.buffer.get_metadata(self.current_idx) as metadata_buffer: + read_flag = metadata_buffer[self.reader_rank + 1] + written_flag = metadata_buffer[0] + if not written_flag or read_flag: + # this block is either + # (1) not written + # (2) already read by this reader + + # for readers, `self.current_idx` is the next block to read + # if this block is not ready, + # we need to wait until it is written + + # wait for a while + time.sleep(RINGBUFFER_SLEEP_INTERVAL) + + # if we wait for a long time, we should warn the user + if time.monotonic( + ) - start_time > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning: # noqa + logger.warning( + "No available block found in %s second. ", + VLLM_RINGBUFFER_WARNING_INTERVAL) + n_warning += 1 + + continue + # found a block that is not read by this reader + # let caller read from the buffer + with self.buffer.get_data(self.current_idx) as buf: + yield buf + + # caller has read from the buffer + # set the read flag + metadata_buffer[self.reader_rank + 1] = 1 + self.current_idx = (self.current_idx + + 1) % self.buffer.max_chunks + break + + def enqueue(self, obj): + assert self._is_writer, "Only writers can enqueue" + serialized_obj = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL) + if len(serialized_obj) > self.buffer.max_chunk_bytes: + raise RuntimeError( + f"{len(serialized_obj)=} larger than the allowed value " + f"{self.buffer.max_chunk_bytes}," + "Please increase the max_chunk_bytes parameter.") + with self.acquire_write() as buf: + buf[:len(serialized_obj)] = serialized_obj + + def dequeue(self): + assert self._is_reader, "Only readers can dequeue" + with self.acquire_read() as buf: + # no need to know the size of serialized object + # pickle format itself contains the size information internally + # see https://docs.python.org/3/library/pickle.html + obj = pickle.loads(buf) + return obj + + def broadcast_object(self, obj=None): + if self._is_writer: + self.enqueue(obj) + return obj + else: + return self.dequeue() + + @staticmethod + def create_from_process_group(pg: ProcessGroup, + max_chunk_bytes, + max_chunks, + writer_rank=0) -> "ShmRingBufferIO": + group_rank = dist.get_rank(pg) + group_world_size = dist.get_world_size(pg) + ranks_inside_group = list(range(group_world_size)) + global_ranks = dist.get_process_group_ranks(pg) + n_reader = group_world_size - 1 + buffer: ShmRingBuffer + if group_rank == writer_rank: + buffer = ShmRingBuffer(n_reader, max_chunk_bytes, max_chunks) + dist.broadcast_object_list([buffer], + src=global_ranks[writer_rank], + group=pg) + return ShmRingBufferIO(buffer, -1) + else: + recv = 
[None] + dist.broadcast_object_list(recv, + src=global_ranks[writer_rank], + group=pg) + buffer = recv[0] # type: ignore + rest_ranks = [r for r in ranks_inside_group if r != writer_rank] + return ShmRingBufferIO(buffer, rest_ranks.index(group_rank)) diff --git a/vllm/engine/async_timeout.py b/vllm/engine/async_timeout.py new file mode 100644 index 0000000000000..4b18426252127 --- /dev/null +++ b/vllm/engine/async_timeout.py @@ -0,0 +1,189 @@ +# Workaround for https://github.com/python/cpython/issues/86296 +# +# From https://github.com/aio-libs/async-timeout/blob/master/async_timeout/__init__.py +# Licensed under the Apache License (Apache-2.0) + +import asyncio +import enum +import sys +import warnings +from types import TracebackType +from typing import Any, Optional, Type + +if sys.version_info[:2] >= (3, 11): + from asyncio import timeout as asyncio_timeout +else: + + def asyncio_timeout(delay: Optional[float]) -> "Timeout": + """timeout context manager. + Useful in cases when you want to apply timeout logic around block + of code or in cases when asyncio.wait_for is not suitable. For example: + >>> async with timeout(0.001): + ... async with aiohttp.get('https://github.com') as r: + ... await r.text() + delay - value in seconds or None to disable timeout logic + """ + loop = asyncio.get_running_loop() + deadline = loop.time() + delay if delay is not None else None + return Timeout(deadline, loop) + + class _State(enum.Enum): + INIT = "INIT" + ENTER = "ENTER" + TIMEOUT = "TIMEOUT" + EXIT = "EXIT" + + class Timeout: + # Internal class, please don't instantiate it directly + # Use timeout() and timeout_at() public factories instead. + # + # Implementation note: `async with timeout()` is preferred + # over `with timeout()`. + # While technically the Timeout class implementation + # doesn't need to be async at all, + # the `async with` statement explicitly points that + # the context manager should be used from async function context. + # + # This design allows to avoid many silly misusages. + # + # TimeoutError is raised immediately when scheduled + # if the deadline is passed. + # The purpose is to time out as soon as possible + # without waiting for the next await expression. 
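+        #
+        # Example usage (illustrative; `step()` is a placeholder coroutine):
+        #
+        #     try:
+        #         async with asyncio_timeout(1.5) as cm:
+        #             await step()
+        #     except asyncio.TimeoutError:
+        #         assert cm.expired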
+ + __slots__ = ("_deadline", "_loop", "_state", "_timeout_handler") + + def __init__(self, deadline: Optional[float], + loop: asyncio.AbstractEventLoop) -> None: + self._loop = loop + self._state = _State.INIT + + self._timeout_handler = None # type: Optional[asyncio.Handle] + if deadline is None: + self._deadline = None # type: Optional[float] + else: + self.update(deadline) + + def __enter__(self) -> "Timeout": + warnings.warn( + "with timeout() is deprecated, use async with timeout()", + DeprecationWarning, + stacklevel=2, + ) + self._do_enter() + return self + + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc_val: Optional[BaseException], + exc_tb: Optional[TracebackType], + ) -> Optional[bool]: + self._do_exit(exc_type) + return None + + async def __aenter__(self) -> "Timeout": + self._do_enter() + return self + + async def __aexit__( + self, + exc_type: Optional[Type[BaseException]], + exc_val: Optional[BaseException], + exc_tb: Optional[TracebackType], + ) -> Optional[bool]: + self._do_exit(exc_type) + return None + + @property + def expired(self) -> bool: + """Is timeout expired during execution?""" + return self._state == _State.TIMEOUT + + @property + def deadline(self) -> Optional[float]: + return self._deadline + + def reject(self) -> None: + """Reject scheduled timeout if any.""" + # cancel is maybe better name but + # task.cancel() raises CancelledError in asyncio world. + if self._state not in (_State.INIT, _State.ENTER): + raise RuntimeError(f"invalid state {self._state.value}") + self._reject() + + def _reject(self) -> None: + if self._timeout_handler is not None: + self._timeout_handler.cancel() + self._timeout_handler = None + + def shift(self, delay: float) -> None: + """Advance timeout on delay seconds. + The delay can be negative. + Raise RuntimeError if shift is called when deadline is not scheduled + """ + deadline = self._deadline + if deadline is None: + raise RuntimeError( + "cannot shift timeout if deadline is not scheduled") + self.update(deadline + delay) + + def update(self, deadline: float) -> None: + """Set deadline to absolute value. + deadline argument points on the time in the same clock system + as loop.time(). + If new deadline is in the past the timeout is raised immediately. + Please note: it is not POSIX time but a time with + undefined starting base, e.g. the time of the system power on. 
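+
+            Example (illustrative)::
+
+                loop = asyncio.get_running_loop()
+                async with asyncio_timeout(1.0) as cm:
+                    cm.update(loop.time() + 5.0)  # push the deadline 5s out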
+ """ + if self._state == _State.EXIT: + raise RuntimeError( + "cannot reschedule after exit from context manager") + if self._state == _State.TIMEOUT: + raise RuntimeError("cannot reschedule expired timeout") + if self._timeout_handler is not None: + self._timeout_handler.cancel() + self._deadline = deadline + if self._state != _State.INIT: + self._reschedule() + + def _reschedule(self) -> None: + assert self._state == _State.ENTER + deadline = self._deadline + if deadline is None: + return + + now = self._loop.time() + if self._timeout_handler is not None: + self._timeout_handler.cancel() + + task = asyncio.current_task() + if deadline <= now: + self._timeout_handler = self._loop.call_soon( + self._on_timeout, task) + else: + self._timeout_handler = self._loop.call_at( + deadline, self._on_timeout, task) + + def _do_enter(self) -> None: + if self._state != _State.INIT: + raise RuntimeError(f"invalid state {self._state.value}") + self._state = _State.ENTER + self._reschedule() + + def _do_exit(self, exc_type: Optional[Type[BaseException]]) -> None: + if exc_type is asyncio.CancelledError and \ + self._state == _State.TIMEOUT: + self._timeout_handler = None + raise asyncio.TimeoutError + # timeout has not expired + self._state = _State.EXIT + self._reject() + return None + + def _on_timeout(self, task: "Optional[asyncio.Task[Any]]") -> None: + if task: + task.cancel() + self._state = _State.TIMEOUT + # drop the reference early + self._timeout_handler = None diff --git a/vllm/executor/openvino_executor.py b/vllm/executor/openvino_executor.py new file mode 100644 index 0000000000000..697d698b4edf7 --- /dev/null +++ b/vllm/executor/openvino_executor.py @@ -0,0 +1,163 @@ +from typing import List, Set, Tuple + +import openvino as ov +import openvino.properties.hint as hints +import torch + +import vllm.envs as envs +from vllm.config import CacheConfig, ModelConfig +from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, + make_async) + +logger = init_logger(__name__) + + +class OpenVINOExecutor(ExecutorBase): + + def _init_executor(self) -> None: + assert self.device_config.device_type == "openvino" + assert self.lora_config is None, "OpenVINO backend doesn't support LoRA" + self.model_config = _verify_and_get_model_config(self.model_config) + self.cache_config = _verify_and_get_cache_config(self.cache_config) + + # Instantiate the worker and load the model to CPU. + self._init_worker() + + def _init_worker(self): + from vllm.worker.openvino_worker import OpenVINOWorker + + assert ( + self.parallel_config.world_size == 1 + ), "OpenVINOExecutor only supports single CPU socket currently." 
+ + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + self.driver_worker = OpenVINOWorker( + model_config=self.model_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + device_config=self.device_config, + cache_config=self.cache_config, + load_config=self.load_config, + local_rank=0, + rank=0, + distributed_init_method=distributed_init_method, + lora_config=self.lora_config, + multimodal_config=self.multimodal_config, + kv_cache_dtype=self.cache_config.cache_dtype, + is_driver_worker=True, + ) + self.driver_worker.init_device() + self.driver_worker.load_model() + + def determine_num_available_blocks(self) -> Tuple[int, int]: + """Determine the number of available KV blocks by invoking the + underlying worker. + """ + return self.driver_worker.determine_num_available_blocks() + + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + """Initialize the KV cache by invoking the underlying worker.""" + # NOTE: We log here to avoid multiple logs when number of workers is + # greater than one. We could log in the engine, but not all executors + # have GPUs. + # NOTE: `cpu block` for OpenVINO backend is located on CPU memory but is + # referred as `gpu block`. Because we want to reuse the existing block + # management procedure. + logger.info("# CPU blocks: %d", num_gpu_blocks) + self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + + def execute_model( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + output = self.driver_worker.execute_model(execute_model_req) + return output + + def add_lora(self, lora_request: LoRARequest) -> bool: + return self.driver_worker.add_lora(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + return self.driver_worker.remove_lora(lora_id) + + def pin_lora(self, lora_id: int) -> bool: + return self.driver_worker.pin_lora(lora_id) + + def list_loras(self) -> Set[int]: + return self.driver_worker.list_loras() + + def check_health(self) -> None: + # OpenVINOExecutor will always be healthy as long as + # it's running. + return + + +class OpenVINOExecutorAsync(OpenVINOExecutor, ExecutorAsyncBase): + + async def execute_model_async( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + output = await make_async(self.driver_worker.execute_model + )(execute_model_req=execute_model_req, ) + return output + + async def check_health_async(self) -> None: + # OpenVINOExecutor will always be healthy as long as + # it's running. + return + + +def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: + if config.dtype != torch.float32: + logger.warning( + f"Only float32 dtype is supported on OpenVINO, casting from {config.dtype}." 
# noqa: G004, E501 + ) + config.dtype = torch.float32 + if not config.enforce_eager: + logger.warning( + "CUDA graph is not supported on OpenVINO backend, fallback to the " + "eager mode.") + config.enforce_eager = True + return config + + +def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig: + if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8": + logger.info("KV cache type is overried to u8 via " + "VLLM_OPENVINO_CPU_KV_CACHE_PRECISION env var.") + config.cache_dtype = ov.Type.u8 + else: + core = ov.Core() + inference_precision = core.get_property("CPU", + hints.inference_precision) + if inference_precision == ov.Type.bf16: + config.cache_dtype = ov.Type.bf16 + else: + config.cache_dtype = ov.Type.f16 + + if config.block_size != 32: + logger.info( + f"OpenVINO optimal block size is 32, overriding currently set {config.block_size}" # noqa: G004, E501 + ) + config.block_size = 32 + + kv_cache_space = envs.VLLM_OPENVINO_KVCACHE_SPACE + if kv_cache_space >= 0: + _GB = 1 << 30 + if kv_cache_space == 0: + config.openvino_kvcache_space_bytes = 4 * _GB # type: ignore + logger.warning( + "Environment variable VLLM_OPENVINO_KVCACHE_SPACE (GB) " + "for OpenVINO backend is not set, using 4 by default.") + else: + config.openvino_kvcache_space_bytes = kv_cache_space * _GB # type: ignore + else: + raise RuntimeError( + "Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE" + f" {kv_cache_space}, expect a positive integer value.") + + return config diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py new file mode 100644 index 0000000000000..d094156962955 --- /dev/null +++ b/vllm/inputs/__init__.py @@ -0,0 +1,19 @@ +from .data import (LLMInputs, ParsedText, ParsedTokens, PromptInputs, + PromptStrictInputs, TextPrompt, TextTokensPrompt, + TokensPrompt, parse_and_batch_prompt) +from .registry import InputContext, InputRegistry + +INPUT_REGISTRY = InputRegistry() +""" +The global :class:`~InputRegistry` which is used by :class:`~vllm.LLMEngine` +to dispatch data processing according to the target model. + +See also: + :ref:`input_processing_pipeline` +""" + +__all__ = [ + "ParsedText", "ParsedTokens", "parse_and_batch_prompt", "TextPrompt", + "TokensPrompt", "TextTokensPrompt", "PromptStrictInputs", "PromptInputs", + "LLMInputs", "INPUT_REGISTRY", "InputContext", "InputRegistry" +] diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py new file mode 100644 index 0000000000000..c6381fcc01e5f --- /dev/null +++ b/vllm/inputs/data.py @@ -0,0 +1,143 @@ +from typing import (TYPE_CHECKING, List, Literal, Optional, Sequence, + TypedDict, Union, cast, overload) + +from typing_extensions import NotRequired + +if TYPE_CHECKING: + from vllm.multimodal import MultiModalDataDict + + +class ParsedText(TypedDict): + content: str + is_tokens: Literal[False] + + +class ParsedTokens(TypedDict): + content: List[int] + is_tokens: Literal[True] + + +# https://github.com/vllm-project/vllm/pull/4028 +@overload +def parse_and_batch_prompt( + prompt: Union[str, List[str]]) -> Sequence[ParsedText]: + ... + + +@overload +def parse_and_batch_prompt( + prompt: Union[List[int], List[List[int]]]) -> Sequence[ParsedTokens]: + ... 
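+# A quick sketch of the expected results (TypedDicts shown as plain dicts);
+# these follow directly from the implementation below:
+#   parse_and_batch_prompt("Hi")          -> [{"content": "Hi", "is_tokens": False}]
+#   parse_and_batch_prompt(["a", "b"])    -> one ParsedText entry per string
+#   parse_and_batch_prompt([1, 2, 3])     -> [{"content": [1, 2, 3], "is_tokens": True}]
+#   parse_and_batch_prompt([[1, 2], [3]]) -> one ParsedTokens entry per inner list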
+ + +def parse_and_batch_prompt( + prompt: Union[str, List[str], List[int], List[List[int]]], +) -> Union[Sequence[ParsedText], Sequence[ParsedTokens]]: + if isinstance(prompt, str): + # case 1: a string + return [ParsedText(content=prompt, is_tokens=False)] + + if isinstance(prompt, list): + if len(prompt) == 0: + raise ValueError("please provide at least one prompt") + + if isinstance(prompt[0], str): + # case 2: array of strings + return [ + ParsedText(content=elem, is_tokens=False) + for elem in cast(List[str], prompt) + ] + if isinstance(prompt[0], int): + # case 3: array of tokens + elem = cast(List[int], prompt) + return [ParsedTokens(content=elem, is_tokens=True)] + if isinstance(prompt[0], list): + if len(prompt[0]) == 0: + raise ValueError("please provide at least one prompt") + + if isinstance(prompt[0][0], int): + # case 4: array of token arrays + return [ + ParsedTokens(content=elem, is_tokens=True) + for elem in cast(List[List[int]], prompt) + ] + + raise ValueError("prompt must be a string, array of strings, " + "array of tokens, or array of token arrays") + + +class TextPrompt(TypedDict): + """Schema for a text prompt.""" + + prompt: str + """The input text to be tokenized before passing to the model.""" + + multi_modal_data: NotRequired["MultiModalDataDict"] + """ + Optional multi-modal data to pass to the model, + if the model supports it. + """ + + +class TokensPrompt(TypedDict): + """Schema for a tokenized prompt.""" + + prompt_token_ids: List[int] + """A list of token IDs to pass to the model.""" + + multi_modal_data: NotRequired["MultiModalDataDict"] + """ + Optional multi-modal data to pass to the model, + if the model supports it. + """ + + +class TextTokensPrompt(TypedDict): + """It is assumed that :attr:`prompt` is consistent with + :attr:`prompt_token_ids`. This is currently used in + :class:`AsyncLLMEngine` for logging both the text and token IDs.""" + + prompt: str + """The prompt text.""" + + prompt_token_ids: List[int] + """The token IDs of the prompt.""" + + multi_modal_data: NotRequired["MultiModalDataDict"] + """ + Optional multi-modal data to pass to the model, + if the model supports it. + """ + + +PromptStrictInputs = Union[str, TextPrompt, TokensPrompt] +""" +The inputs to the LLM, which can take one of the following forms: + +- A text prompt (:class:`str` or :class:`TextPrompt`) +- A tokenized prompt (:class:`TokensPrompt`) +""" + +PromptInputs = Union[str, TextPrompt, TokensPrompt, TextTokensPrompt] +"""Same as :const:`PromptStrictInputs` but additionally accepts +:class:`TextTokensPrompt`.""" + + +class LLMInputs(TypedDict): + """ + The inputs in :class:`~vllm.LLMEngine` before they are + passed to the model executor. + """ + prompt_token_ids: List[int] + """The token IDs of the prompt.""" + + prompt: NotRequired[Optional[str]] + """ + The original prompt text corresponding to the token IDs, if available. + """ + + multi_modal_data: NotRequired[Optional["MultiModalDataDict"]] + """ + Optional multi-modal data to pass to the model, + if the model supports it. 
+ """ diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py new file mode 100644 index 0000000000000..4a7e5c5832917 --- /dev/null +++ b/vllm/inputs/registry.py @@ -0,0 +1,209 @@ +import functools +from dataclasses import dataclass +from typing import (TYPE_CHECKING, Callable, Dict, Optional, Tuple, Type, + TypeVar) + +from torch import nn +from transformers import PretrainedConfig + +from vllm.logger import init_logger + +from .data import LLMInputs + +if TYPE_CHECKING: + from vllm.config import ModelConfig, MultiModalConfig + from vllm.multimodal import MultiModalDataDict + from vllm.sequence import SequenceData + +logger = init_logger(__name__) + +C = TypeVar("C", bound=PretrainedConfig) + + +@dataclass(frozen=True) +class InputContext: + """ + Contains information about the model which may be used to + modify the inputs. + """ + + model_config: "ModelConfig" + """The configuration of the model.""" + + def get_multimodal_config(self) -> "MultiModalConfig": + """ + Get the multimodal configuration of the model. + + Raises: + ValueError: If the model is not multimodal. + """ + + multimodal_config = self.model_config.multimodal_config + if multimodal_config is None: + raise ValueError("No multimodal config found") + + return multimodal_config + + def get_hf_config(self, hf_config_type: Type[C]) -> C: + """ + Get the HuggingFace configuration + (:class:`transformers.PretrainedConfig`) of the model, + additionally checking its type. + + Raises: + TypeError: If the model is not of the specified type. + """ + + hf_config = self.model_config.hf_config + if not isinstance(hf_config, hf_config_type): + raise TypeError("Invalid type of HuggingFace config. " + f"Expected type: {hf_config_type}, but " + f"found type: {type(hf_config)}") + + return hf_config + + +N = TypeVar("N", bound=Type[nn.Module]) + +DummyDataFactory = Callable[[InputContext, int], + Tuple["SequenceData", + Optional["MultiModalDataDict"]]] +""" +Create dummy data to be inputted into the model. + +Note: + :data:`InputProcessor` is not applied to the dummy data. +""" + +InputProcessor = Callable[[InputContext, LLMInputs], LLMInputs] +"""Preprocess the inputs to the model.""" + + +class InputRegistry: + """ + A registry to dispatch data processing + according to the target model. + """ + + def __init__(self) -> None: + self._dummy_factories_by_model_type: Dict[Type[nn.Module], + DummyDataFactory] = {} + self._input_processors_by_model_type: Dict[Type[nn.Module], + InputProcessor] = {} + + def _default_dummy_data_factory( + self, + ctx: InputContext, + seq_len: int, + ) -> Tuple["SequenceData", Optional["MultiModalDataDict"]]: + """ + The default dummy data factory represents the longest possible text + that can be inputted to the model. + + Note: + :data:`InputProcessor` is not applied to the dummy data. + """ + # Avoid circular import + from vllm.sequence import SequenceData + + dummy_seq_data = SequenceData([0] * seq_len) + dummy_multi_modal_data = None + + return dummy_seq_data, dummy_multi_modal_data + + def register_dummy_data(self, factory: DummyDataFactory): + """ + Register a dummy data factory to a model class. + + During memory profiling, the provided function is invoked to create + dummy data to be inputted into the model. The resulting memory usage + should be an upper bound of what the model would use at inference time. + """ + + def wrapper(model_cls: N) -> N: + if model_cls in self._dummy_factories_by_model_type: + logger.warning( + "Model class %s already has dummy data " + "registered to %s. 
It is overwritten by the new one.", + model_cls, self) + + self._dummy_factories_by_model_type[model_cls] = factory + + return model_cls + + return wrapper + + def dummy_data_for_profiling(self, model_config: "ModelConfig", + seq_len: int): + """ + Create dummy data for profiling the memory usage of a model. + + The model is identified by ``model_config``. + + See also: + :ref:`enabling_multimodal_inputs` + """ + # Avoid circular import + from vllm.model_executor.model_loader import get_model_architecture + + model_cls, _ = get_model_architecture(model_config) + dummy_factory = self._dummy_factories_by_model_type \ + .get(model_cls, self._default_dummy_data_factory) + + return dummy_factory(InputContext(model_config), seq_len) + + def _default_input_processor(self, ctx: InputContext, + inputs: LLMInputs) -> LLMInputs: + """The default input processor is a no-op.""" + return inputs + + def register_input_processor(self, processor: InputProcessor): + """ + Register an input processor to a model class. + + The provided function is invoked on each input to the model. This + happens before :meth:`~vllm.multimodal.MultiModalRegistry.map_input`. + + See also: + :ref:`input_processing_pipeline` + """ + + def wrapper(model_cls: N) -> N: + if model_cls in self._input_processors_by_model_type: + logger.warning( + "Model class %s already has input processor " + "registered to %s. It is overwritten by the new one.", + model_cls, self) + + self._input_processors_by_model_type[model_cls] = processor + + return model_cls + + return wrapper + + def process_input(self, model_config: "ModelConfig", + inputs: LLMInputs) -> LLMInputs: + """ + Apply an input processor to an instance of model inputs. + + The model is identified by ``model_config``. + + See also: + :ref:`input_processing_pipeline` + """ + # Avoid circular import + from vllm.model_executor.model_loader import get_model_architecture + + model_cls, _ = get_model_architecture(model_config) + + processor = self._input_processors_by_model_type \ + .get(model_cls, self._default_input_processor) + + return processor(InputContext(model_config), inputs) + + def create_input_processor(self, model_config: "ModelConfig"): + """ + Create an input processor (see :meth:`process_input`) for a + specific model. 
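+
+        Example (illustrative; ``model_config`` and ``token_ids`` are assumed
+        to already exist)::
+
+            processor = INPUT_REGISTRY.create_input_processor(model_config)
+            processed = processor(LLMInputs(prompt_token_ids=token_ids))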
+ """ + return functools.partial(self.process_input, model_config) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py new file mode 100644 index 0000000000000..73cfcd7fc85f2 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -0,0 +1,197 @@ +from abc import abstractmethod +from typing import Optional + +import torch + +from vllm.distributed import (get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_reduce) +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.fused_moe import fused_moe +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, QuantizeMethodBase) +from vllm.model_executor.utils import set_weight_attrs + +logger = init_logger(__name__) + + +class FusedMoEMethodBase(QuantizeMethodBase): + + @abstractmethod + def create_weights(self, layer: torch.nn.Module, num_experts: int, + hidden_size: int, intermediate_size: int, + params_dtype: torch.dtype, **extra_weight_attrs): + raise NotImplementedError + + @abstractmethod + def apply(self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool = True) -> torch.Tensor: + raise NotImplementedError + + +class UnquantizedFusedMoEMethod(FusedMoEMethodBase): + """MoE method without quantization.""" + + def create_weights(self, layer: torch.nn.Module, num_experts: int, + hidden_size: int, intermediate_size: int, + params_dtype: torch.dtype, **extra_weight_attrs): + + # Fused gate_up_proj (column parallel) + w13_weight = torch.nn.Parameter(torch.empty(num_experts, + 2 * intermediate_size, + hidden_size, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + + # down_proj (row parallel) + w2_weight = torch.nn.Parameter(torch.empty(num_experts, + hidden_size, + intermediate_size, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + def apply(self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool = True) -> torch.Tensor: + + return fused_moe(x, + layer.w13_weight, + layer.w2_weight, + router_logits, + top_k, + renormalize=renormalize, + inplace=True) + + +class FusedMoE(torch.nn.Module): + """FusedMoE layer for MoE models. + + This layer contains both MergedColumnParallel weights (gate_up_proj / + w13) and RowParallelLinear weights (down_proj/ w2). + + Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We + copy that naming convention here and handle any remapping in the + load_weights function in each model implementation. + + Args: + num_experts: Number of experts in the model + top_k: Number of experts selected for each token + hidden_size: Input hidden state size of the transformer + intermediate_size: Intermediate size of the experts + params_dtype: Data type for the parameters. + reduce_results: Whether to all all_reduce on the output of the layer + renomalize: Whether to renormalize the logits in the fused_moe kernel + quant_config: Quantization configure. 
+ """ + + def __init__( + self, + num_experts: int, + top_k: int, + hidden_size: int, + intermediate_size: int, + params_dtype: Optional[torch.dtype] = None, + reduce_results: bool = False, + renormalize: bool = True, + quant_config: Optional[QuantizationConfig] = None, + tp_size: Optional[int] = None, + ): + super().__init__() + + if params_dtype is None: + params_dtype = torch.get_default_dtype() + + self.tp_size = (tp_size if tp_size is not None else + get_tensor_model_parallel_world_size()) + self.top_k = top_k + self.num_experts = num_experts + self.intermediate_size_per_partition = intermediate_size // self.tp_size + self.reduce_results = reduce_results + self.renormalize = renormalize + + if quant_config is None: + self.quant_method: Optional[QuantizeMethodBase] = ( + UnquantizedFusedMoEMethod()) + else: + self.quant_method = quant_config.get_quant_method(self) + assert self.quant_method is not None + + self.quant_method.create_weights( + layer=self, + num_experts=num_experts, + hidden_size=hidden_size, + intermediate_size=self.intermediate_size_per_partition, + params_dtype=params_dtype, + weight_loader=self.weight_loader) + + def weight_loader(self, param: torch.nn.Parameter, + loaded_weight: torch.Tensor, weight_name: str, + shard_id: int, expert_id: int): + param_data = param.data + + # FIXME(robertgshaw2-neuralmagic): Overfit to Mixtral. + # Follow up PR to enable fp8 for other MoE models. + if "input_scale" in weight_name or "w2.weight_scale" in weight_name: + if param_data[expert_id] != 1 and (param_data[expert_id] - + loaded_weight).abs() > 1e-5: + raise ValueError( + "input_scales of w1 and w3 of a layer " + f"must be equal. But got {param_data[expert_id]} " + f"vs. {loaded_weight}") + param_data[expert_id] = loaded_weight + # FIXME(robertgshaw2-neuralmagic): Overfit to Mixtral. + # Follow up PR to enable fp8 for other MoE models. + elif "weight_scale" in weight_name: + # We have to keep the weight scales of w1 and w3 because + # we need to re-quantize w1/w3 weights after weight loading. + assert "w1" in weight_name or "w3" in weight_name + shard_id = 0 if "w1" in weight_name else 1 + param_data[expert_id][shard_id] = loaded_weight + else: + tp_rank = get_tensor_model_parallel_rank() + shard_size = self.intermediate_size_per_partition + shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size) + + # w1, gate_proj case: Load into first shard of w13. + if shard_id == 0: + param_data[expert_id, + 0:shard_size, :] = loaded_weight[shard, :] + # w3, up_proj case: Load into second shard of w13. + elif shard_id == 2: + param_data[expert_id, shard_size:2 * + shard_size, :] = loaded_weight[shard, :] + # w2, down_proj case: Load into only shard of w2. + elif shard_id == 1: + param_data[expert_id, :, :] = loaded_weight[:, shard] + else: + raise ValueError( + f"Shard id must be in [0,1,2] but got {shard_id}") + + def forward(self, hidden_states: torch.Tensor, + router_logits: torch.Tensor): + assert self.quant_method is not None + + # Matrix multiply. 
+ final_hidden_states = self.quant_method.apply( + self, + x=hidden_states, + router_logits=router_logits, + top_k=self.top_k, + renormalize=self.renormalize) + + if self.reduce_results and self.tp_size > 1: + final_hidden_states = tensor_model_parallel_all_reduce( + final_hidden_states) + + return final_hidden_states diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py new file mode 100644 index 0000000000000..b93425fb2d629 --- /dev/null +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -0,0 +1,87 @@ +from typing import Callable, List, Optional + +import torch + +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme) +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + apply_fp8_linear, create_per_tensor_scale_param, cutlass_fp8_supported, + requantize_with_max_scale) +from vllm.model_executor.utils import set_weight_attrs + +__all__ = ["CompressedTensorsW8A8Fp8"] + + +class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): + + def __init__(self, input_dynamic: bool): + self.input_dynamic = input_dynamic + self.cutlass_fp8_supported = cutlass_fp8_supported() + + # W8A8-Fp8 kernels support only per-tensor and per-channel cases. + # So if we have a fused module (QKV, MLP) with per tensor scales (thus N + # scales being passed to the kernel), we requantize with a single scale. + def process_weights_after_loading(self, layer) -> None: + # Dequant -> Quant with max scale. + max_w_scale, weight = requantize_with_max_scale( + weight=layer.weight, + weight_scale=layer.weight_scale, + logical_widths=layer.logical_widths, + ) + + # Update layer with new values. 
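+        # (The weight is stored transposed so the fp8 matmul kernels can use
+        # it without a runtime transpose; requires_grad=False as these are
+        # inference-only parameters.)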
+ layer.weight = torch.nn.Parameter(weight.t(), requires_grad=False) + layer.weight_scale = torch.nn.Parameter(max_w_scale, + requires_grad=False) + if self.input_dynamic: + layer.input_scale = None + else: + layer.input_scale = torch.nn.Parameter(layer.input_scale.max(), + requires_grad=False) + + def create_weights(self, layer: torch.nn.Module, + output_partition_sizes: List[int], + input_size_per_partition: int, + params_dtype: torch.dtype, weight_loader: Callable, + **kwargs): + + del params_dtype + + output_size_per_partition = sum(output_partition_sizes) + layer.logical_widths = output_partition_sizes + + # WEIGHT + weight = torch.nn.Parameter(torch.empty(output_size_per_partition, + input_size_per_partition, + dtype=torch.float8_e4m3fn), + requires_grad=False) + layer.register_parameter("weight", weight) + set_weight_attrs(weight, { + "input_dim": 1, + "output_dim": 0, + "weight_loader": weight_loader, + }) + + # WEIGHT SCALE + weight_scale = create_per_tensor_scale_param( + output_partition_sizes, weight_loader=weight_loader) + layer.register_parameter("weight_scale", weight_scale) + + # INPUT SCALE + if not self.input_dynamic: + input_scale = create_per_tensor_scale_param( + output_partition_sizes, weight_loader=weight_loader) + layer.register_parameter("input_scale", input_scale) + + def apply_weights(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + + return apply_fp8_linear( + input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + input_scale=layer.input_scale, + bias=bias, + cutlass_fp8_supported=self.cutlass_fp8_supported) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py new file mode 100644 index 0000000000000..e70504ec51cb3 --- /dev/null +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py @@ -0,0 +1,85 @@ +from typing import Callable, List + +import torch +from torch.nn import Parameter + +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme) +from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( + QuantizationStrategy) +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + apply_int8_linear, convert_to_channelwise, create_per_channel_scale_param, + create_per_tensor_scale_param) +from vllm.model_executor.utils import set_weight_attrs + + +class CompressedTensorsW8A8Int8(CompressedTensorsScheme): + + def __init__(self, strategy: str, is_static_input_scheme: bool): + self.strategy = strategy + self.is_static_input_scheme = is_static_input_scheme + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + # WEIGHT + # Cutlass kernels need transposed weight. + weight = layer.weight + layer.weight = Parameter(weight.t(), requires_grad=False) + + # WEIGHT SCALE + # Cutlass kernels support only per-tensor and per-channel. + # If we have a fused module (QKV, MLP) with per tensor scales (thus N + # scales being passed to the kernel), convert to the per-channel case. 
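        # For example (hypothetical sizes): a fused QKV projection with
        # logical_widths = [4096, 1024, 1024] and per-tensor scales
        # [s_q, s_k, s_v] becomes a (6144, 1) channelwise scale tensor in
        # which rows 0:4096 repeat s_q, rows 4096:5120 repeat s_k, and
        # rows 5120:6144 repeat s_v.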
+ is_fused_module = len(self.logical_widths) > 1 + if is_fused_module and self.strategy == QuantizationStrategy.TENSOR: + ws_channelwise = convert_to_channelwise(layer.weight_scale, + self.logical_widths) + layer.weight_scale = Parameter(ws_channelwise, requires_grad=False) + + # INPUT SCALE + if self.is_static_input_scheme: + layer.input_scale = Parameter(layer.input_scale.max(), + requires_grad=False) + else: + layer.input_scale = None + + def create_weights(self, layer: torch.nn.Module, + output_partition_sizes: List[int], + input_size_per_partition: int, + params_dtype: torch.dtype, weight_loader: Callable, + **kwargs): + self.logical_widths = output_partition_sizes + + # WEIGHT + weight = Parameter(torch.empty(sum(output_partition_sizes), + input_size_per_partition, + dtype=torch.int8), + requires_grad=False) + layer.register_parameter("weight", weight) + set_weight_attrs(weight, { + "input_dim": 1, + "output_dim": 0, + "weight_loader": weight_loader, + }) + + # WEIGHT SCALE + layer_kwargs = {"weight_loader": weight_loader} + if self.strategy == QuantizationStrategy.CHANNEL: + scale = create_per_channel_scale_param(output_partition_sizes, + **layer_kwargs) + else: + assert self.strategy == QuantizationStrategy.TENSOR + scale = create_per_tensor_scale_param(output_partition_sizes, + **layer_kwargs) + layer.register_parameter("weight_scale", scale) + + # INPUT SCALE + if self.is_static_input_scheme: + scale = create_per_tensor_scale_param(output_partition_sizes, + **layer_kwargs) + layer.register_parameter("input_scale", scale) + + def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor): + return apply_int8_linear(input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + input_scale=layer.input_scale) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py new file mode 100644 index 0000000000000..2243260053ef5 --- /dev/null +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -0,0 +1,175 @@ +from typing import Callable, List, Optional + +import torch +from torch.nn import Parameter + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme) +from vllm.model_executor.layers.quantization.gptq_marlin import ( + GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, GPTQMarlinState, + marlin_permute_scales) +from vllm.model_executor.utils import set_weight_attrs + +__all__ = ["CompressedTensorsWNA16"] +WNA16_SUPPORTED_BITS = [4, 8] + + +class CompressedTensorsWNA16(CompressedTensorsScheme): + + def __init__(self, + strategy: str, + num_bits: int, + group_size: Optional[int] = None): + self.num_bits = num_bits + self.strategy = strategy + self.group_size = group_size + + if self.strategy == "group" and self.group_size is None: + raise ValueError( + "group_size must be given when using strategy group") + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + pass + + def create_weights(self, layer: torch.nn.Module, input_size: int, + output_partition_sizes: List[int], + input_size_per_partition: int, + params_dtype: torch.dtype, weight_loader: Callable, + **kwargs): + + pack_factor = 32 // self.num_bits + output_size_per_partition = sum(output_partition_sizes) + + if self.group_size is not None: + group_size = self.group_size + else: + group_size = input_size + + 
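        # Packing example (hypothetical shapes): with num_bits=4 the pack
        # factor is 32 // 4 = 8, so a logical (4096, 4096) 4-bit weight is
        # stored as a (4096, 512) int32 tensor, and with group_size=128 it is
        # accompanied by a (4096, 4096 // 128 = 32) scale tensor.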
weight_scale_dim = None + scales_and_zp_size = input_size // group_size + + if (input_size != input_size_per_partition + and self.group_size is not None): + weight_scale_dim = 1 + scales_and_zp_size = input_size_per_partition // group_size + + weight = Parameter( + torch.empty( + output_size_per_partition, + input_size_per_partition // pack_factor, + dtype=torch.int32, + ), + requires_grad=False, + ) + + set_weight_attrs( + weight, { + "input_dim": 1, + "output_dim": 0, + "packed_dim": 1, + "pack_factor": pack_factor, + "weight_loader": weight_loader + }) + layer.register_parameter("weight_packed", weight) + + weight_scale = Parameter( + torch.empty( + output_size_per_partition, + scales_and_zp_size, + dtype=params_dtype, + ), + requires_grad=False, + ) + + set_weight_attrs( + weight_scale, { + "weight_loader": weight_loader, + "input_dim": weight_scale_dim, + "output_dim": 0 + }) + layer.register_parameter("weight_scale", weight_scale) + + # A 2D array defining the original shape of the weights + # before packing + weight_shape = Parameter(torch.empty(2, dtype=torch.int64), + requires_grad=False) + + layer.register_parameter("weight_shape", weight_shape) + set_weight_attrs(weight_shape, { + "weight_loader": weight_loader, + "ignore_warning": True, + }) + + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + + layer.input_size = input_size + layer.marlin_state = GPTQMarlinState.REPACK + layer.is_k_full = True + layer.group_size = group_size + + max_workspace_size = ( + output_size_per_partition // + GPTQ_MARLIN_MIN_THREAD_N) * GPTQ_MARLIN_MAX_PARALLEL + + workspace = torch.zeros(max_workspace_size, + dtype=torch.int, + requires_grad=False) + layer.workspace = workspace + + def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor): + reshaped_x = x.reshape(-1, x.shape[-1]) + + size_m = reshaped_x.shape[0] + part_size_n = layer.output_size_per_partition + part_size_k = layer.input_size_per_partition + + out_shape = x.shape[:-1] + (part_size_n, ) + + if layer.marlin_state == GPTQMarlinState.REPACK: + layer.marlin_state = GPTQMarlinState.READY + + # Newly generated tensors need to replace existing tensors that are + # already registered as parameters by vLLM (and won't be freed) + def replace_tensor(name, new_t): + # It is important to use resize_() here since it ensures + # the same buffer is reused + getattr(layer, name).resize_(new_t.shape) + getattr(layer, name).copy_(new_t) + del new_t + + cur_device = layer.weight_packed.device + + # Reset g_idx related tensors + layer.g_idx = Parameter(torch.empty(0, + dtype=torch.int, + device=cur_device), + requires_grad=False) + layer.g_idx_sort_indices = Parameter(torch.empty( + 0, dtype=torch.int, device=cur_device), + requires_grad=False) + + # Repack weights + marlin_qweight = ops.gptq_marlin_repack( + layer.weight_packed.t().contiguous(), layer.g_idx_sort_indices, + part_size_k, part_size_n, self.num_bits) + + replace_tensor("weight_packed", marlin_qweight) + + # Permute scales + scales_size_k = part_size_k + scales_size_n = part_size_n + + marlin_scales = marlin_permute_scales( + layer.weight_scale.squeeze().t().contiguous(), scales_size_k, + scales_size_n, layer.group_size, self.num_bits) + replace_tensor("weight_scale", marlin_scales) + + output = ops.gptq_marlin_gemm(reshaped_x, layer.weight_packed, + layer.weight_scale, layer.g_idx, + layer.g_idx_sort_indices, + layer.workspace, self.num_bits, size_m, + part_size_n, part_size_k, + layer.is_k_full) + return 
output.reshape(out_shape) diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py new file mode 100644 index 0000000000000..81b7fdb7833d7 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -0,0 +1,163 @@ +from typing import List, Optional, Tuple, Union + +import torch +from torch.nn import Parameter + +from vllm import _custom_ops as ops +from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform + + +def cutlass_fp8_supported() -> bool: + capability = current_platform.get_device_capability() + capability = capability[0] * 10 + capability[1] + + return ops.cutlass_scaled_mm_supports_fp8(capability) + + +def per_tensor_dequantize( + tensor: torch.Tensor, inv_scale: Union[float, + torch.Tensor]) -> torch.Tensor: + fake_qweight = tensor.to(torch.float16) + dq_weight = fake_qweight * inv_scale + return dq_weight + + +def all_close_1d(x: torch.Tensor) -> bool: + assert len(x.shape) == 1 + return all(torch.allclose(x[0], x[i]) for i in range(x.shape[0])) + + +def create_per_tensor_scale_param( + output_partition_sizes: List[int], + **extra_weight_attrs, +) -> Parameter: + scale = Parameter(torch.empty(len(output_partition_sizes), + dtype=torch.float32), + requires_grad=False) + scale[:] = torch.finfo(torch.float32).min + set_weight_attrs(scale, { + "needs_scalar_to_array": True, + **extra_weight_attrs + }) + return scale + + +def create_per_channel_scale_param(output_partition_sizes: List[int], + **extra_weight_attrs) -> Parameter: + scale = Parameter(torch.empty((sum(output_partition_sizes), 1), + dtype=torch.float32), + requires_grad=False) + scale[:] = torch.finfo(torch.float32).min + set_weight_attrs(scale, {"output_dim": 0, **extra_weight_attrs}) + return scale + + +def convert_to_channelwise( + weight_scale: torch.Tensor, + logical_widths: List[int]) -> Tuple[torch.Tensor, torch.Tensor]: + # Create channelwise buffer + weight_scale_channel = torch.empty((sum(logical_widths), 1), + dtype=torch.float32, + device=weight_scale.device) + + # Expand each scale to match the size of each logical matrix. + start = 0 + for idx, logical_width in enumerate(logical_widths): + end = start + logical_width + weight_scale_channel[start:end, :] = weight_scale[idx] + start = end + + return weight_scale_channel + + +def requantize_with_max_scale( + weight: torch.Tensor, weight_scale: torch.Tensor, + logical_widths: List[int]) -> Tuple[torch.Tensor, torch.Tensor]: + # Max scale to be used for requanitzation. + max_w_scale = weight_scale.max() + + # QKV / MLP is fused in the on disk checkpoint if any of the + # weight scales are still set to the default since we initialize + # N weight scales for N shards but we only load 1 weight scale + # from disk in this case. Skip requantization in this case (since) + # we already are quantized with the single scale. + # * Sample Model: nm-testing/Phi-3-mini-128k-instruct-FP8 + unfused_module_in_checkpoint = (weight_scale[-1] > torch.finfo( + torch.float8_e4m3fn).min) + + # If unfused checkpoint, need requanize with the single scale. 
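    # Concretely, each logical shard i is first dequantized with its own
    # scale and then re-quantized with the shared maximum, roughly:
    #   w_i  <-  scaled_fp8_quant(w_i.to(fp16) * weight_scale[i], max_w_scale)
    # so that a single weight scale can be handed to the fused kernel.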
+ if unfused_module_in_checkpoint: + start = 0 + for idx, logical_width in enumerate(logical_widths): + end = start + logical_width + weight_dq = per_tensor_dequantize(weight[start:end, :], + weight_scale[idx]) + weight[start:end, :], _ = ops.scaled_fp8_quant( + weight_dq, max_w_scale) + start = end + + return max_w_scale, weight + + +def apply_fp8_linear( + input: torch.Tensor, + weight: torch.Tensor, + weight_scale: torch.Tensor, + input_scale: torch.Tensor, + bias: Optional[torch.Tensor] = None, + cutlass_fp8_supported: bool = True, +) -> torch.Tensor: + # ops.scaled_fp8_quant supports both dynamic and static quant. + # If dynamic, layer.input_scale is None and x_scale computed from x. + # If static, layer.input_scale is scalar and x_scale is input_scale. + + if bias is None and cutlass_fp8_supported: + qinput, x_scale = ops.scaled_fp8_quant(input, input_scale) + + # Fused GEMM_DQ + output = ops.cutlass_scaled_mm(qinput, + weight, + out_dtype=input.dtype, + scale_a=x_scale, + scale_b=weight_scale) + + else: + qinput, x_scale = ops.scaled_fp8_quant(input, + input_scale, + batch_dim_padding=17) + + # Fused GEMM_DQ -- note we padded the input above because + # torch._scaled_mm is more performant for matrices with + # batch dimension > 16. Note that this could change + # in the future. + output, _ = torch._scaled_mm(qinput, + weight, + out_dtype=input.dtype, + scale_a=x_scale, + scale_b=weight_scale, + bias=bias) + + return torch.narrow(output, 0, 0, input.shape[0]) + + +def apply_int8_linear( + input: torch.Tensor, + weight: torch.Tensor, + weight_scale: torch.Tensor, + input_scale: torch.Tensor, + bias: Optional[torch.Tensor] = None, +): + if bias is not None: + raise NotImplementedError("W8A8 with int8 does not yet support bias.") + + # ops.scaled_int8_quant supports both dynamic and static quant. + # * dynamic, layer.input_scale is None and x_scale computed from x. + # * static, layer.input_scale is scalar and x_scale is input_scale. 
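    # Illustratively, the quantization here is symmetric (no zero point):
    #   x_q = clamp(round(x / x_scale), -128, 127)
    # where x_scale is derived from x at runtime in the dynamic case and is
    # the checkpoint-provided input_scale in the static case.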
+ x_q, x_scale = ops.scaled_int8_quant(input, input_scale) + + return ops.cutlass_scaled_mm(x_q, + weight, + scale_a=x_scale, + scale_b=weight_scale, + out_dtype=input.dtype) diff --git a/vllm/model_executor/model_loader/openvino.py b/vllm/model_executor/model_loader/openvino.py new file mode 100644 index 0000000000000..5c522a61732a4 --- /dev/null +++ b/vllm/model_executor/model_loader/openvino.py @@ -0,0 +1,210 @@ +# ruff: noqa: SIM117 +from pathlib import Path +from typing import List, Optional, Tuple + +import openvino as ov +import torch +from huggingface_hub import HfApi +from openvino._offline_transformations import paged_attention_transformation +from optimum.intel import OVModelForCausalLM +from torch import nn + +import vllm.envs as envs +from vllm.attention.backends.openvino import OpenVINOAttentionMetadata +from vllm.config import DeviceConfig, ModelConfig +from vllm.logger import init_logger +from vllm.model_executor.layers.logits_processor import (LogitsProcessor, + _prune_hidden_states) +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import SamplerOutput + +logger = init_logger(__name__) + + +def _flattenize_inputs(inputs): + """ + Helper function for making nested inputs flattens + """ + flatten_inputs = [] + for input_data in inputs: + if input_data is None: + continue + if isinstance(input_data, (list, tuple)): + flatten_inputs.extend(_flattenize_inputs(input_data)) + elif isinstance(input_data, dict): + flatten_inputs.extend(_flattenize_inputs(list( + input_data.values()))) + else: + flatten_inputs.append(input_data) + return flatten_inputs + + +def _modify_cache_parameters(model: ov.Model, kv_cache_dtype: ov.Type, + is_cpu: bool): + # Apply hardware dependent modifications to KV tensors + for parameter in model.get_parameters(): + input = parameter.get_output_tensor(0) + input_names = input.get_names() + if len(input_names) != 1: + continue + input_name = next(iter(input_names)) + shape = parameter.get_partial_shape() + # use real block size if available, just a placeholder + # to provide the expected rank + x_size = 1 + num_blocks = ov.Dimension() + block_size = ov.Dimension() + head_size = ov.Dimension() + # TODO: Negotiate required layout with plugins (CPU is ~OK, GPU is TBD), + # pass more parameters to this function to set more static dimensions + if input_name.startswith("key_cache."): + cpu_shape = [num_blocks, shape[1], block_size, head_size] + gpu_shape = [ + num_blocks, + shape[1], + shape[2].get_length() // + x_size if shape[2].is_static else ov.Dimension(), + block_size, + x_size, + ] + elif input_name.startswith("value_cache."): + cpu_shape = [num_blocks, shape[1], block_size, head_size] + gpu_shape = [num_blocks, shape[1], shape[2], block_size] + else: + continue + parameter.set_partial_shape( + ov.PartialShape(cpu_shape if is_cpu else gpu_shape)) + parameter.set_element_type(kv_cache_dtype) + model.validate_nodes_and_infer_types() + + +def _require_model_export(model_id, revision=None, subfolder=None): + model_dir = Path(model_id) + if subfolder is not None: + model_dir = model_dir / subfolder + if model_dir.is_dir(): + return (not (model_dir / "openvino_model.xml").exists() + or not (model_dir / "openvino_model.bin").exists()) + + hf_api = HfApi() + try: + model_info = hf_api.model_info(model_id, revision=revision or "main") + normalized_subfolder = (None if subfolder is None else + Path(subfolder).as_posix()) + model_files = [ + file.rfilename for file in 
model_info.siblings + if normalized_subfolder is None + or file.rfilename.startswith(normalized_subfolder) + ] + ov_model_path = ("openvino_model.xml" if normalized_subfolder is None + else f"{normalized_subfolder}/openvino_model.xml") + return (ov_model_path not in model_files + or ov_model_path.replace(".xml", ".bin") not in model_files) + except Exception: + return True + + +class OpenVINOCasualLM(nn.Module): + + def __init__( + self, + model_config: ModelConfig, + device_config: DeviceConfig, + kv_cache_dtype: ov.Type, + ) -> None: + super().__init__() + self.logits_processor = LogitsProcessor( + model_config.hf_config.vocab_size, logits_as_input=True) + self.sampler = Sampler() + + export = _require_model_export(model_config.model) + if export: + logger.warning( + f"Provided model id {model_config.model} does not " # noqa: G004 + "contain OpenVINO IR, the model will be converted to IR with " + "default options. If you need to use specific options for " + "model conversion, use optimum-cli export openvino with " + "desired options.") + else: + logger.warning( + "OpenVINO IR is available for provided model id " # noqa: G004 + f"{model_config.model}. This IR will be used for inference " + "as-is, all possible options that may affect model conversion " + "are ignored.") + + load_in_8bit = envs.VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS + pt_model = OVModelForCausalLM.from_pretrained( + model_config.model, + export=export, + compile=False, + load_in_8bit=load_in_8bit, + trust_remote_code=model_config.trust_remote_code, + ) + + paged_attention_transformation(pt_model.model) + _modify_cache_parameters(pt_model.model, kv_cache_dtype, + device_config.device.type == "cpu") + + core = ov.Core() + ov_compiled = core.compile_model(pt_model.model, "CPU") + self.ov_request = ov_compiled.create_infer_request() + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[Tuple[ov.Tensor, ov.Tensor]], + attn_metadata: OpenVINOAttentionMetadata, + ) -> torch.Tensor: + flatten_kv_cache = _flattenize_inputs(kv_caches) + + inputs = [ + input_ids, + positions, + *flatten_kv_cache, + attn_metadata.past_lens, + attn_metadata.subsequence_begins, + attn_metadata.block_indices, + attn_metadata.block_indices_begins, + attn_metadata.max_context_len, + ] + + self.ov_request.start_async(inputs, share_inputs=True) + self.ov_request.wait() + + logits = torch.from_numpy(self.ov_request.get_tensor("logits").data) + + # TODO: remove 'view' once OpenVINO PA will drop 'seq_len' dimension + return logits.view(-1, logits.shape[-1]) + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + hidden_states = _prune_hidden_states(hidden_states, sampling_metadata) + logits = self.logits_processor(None, hidden_states, sampling_metadata) + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + +def get_model( + model_config: ModelConfig, + device_config: DeviceConfig, + kv_cache_dtype: ov.Type, + **kwargs, +) -> torch.nn.Module: + lora_config = kwargs.get("lora_config", None) + if lora_config: + raise ValueError( + "OpenVINO modeling does not support LoRA, " + "but LoRA is enabled. Support for this model may " + "be added in the future. 
If this is important to you, " + "please open an issue on github.") + + return OpenVINOCasualLM(model_config, device_config, kv_cache_dtype) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py new file mode 100644 index 0000000000000..d8fbf796b5d3a --- /dev/null +++ b/vllm/model_executor/models/clip.py @@ -0,0 +1,289 @@ +"""Minimal implementation of CLIPVisionModel intended to be only used +within a vision language model.""" +from typing import Optional + +import torch +import torch.nn as nn +from PIL import Image +from transformers import CLIPVisionConfig +from transformers.models.clip.modeling_clip import CLIPAttention + +from vllm.config import ModelConfig +from vllm.inputs import LLMInputs +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.multimodal.image import (cached_get_tokenizer, + repeat_and_pad_image_tokens) +from vllm.sequence import SequenceData + + +def get_clip_patch_grid_length(*, image_size: int, patch_size: int) -> int: + assert image_size % patch_size == 0 + return image_size // patch_size + + +def get_clip_num_patches(*, image_size: int, patch_size: int) -> int: + grid_length = get_clip_patch_grid_length(image_size=image_size, + patch_size=patch_size) + return grid_length * grid_length + + +def get_clip_image_feature_size(hf_config: CLIPVisionConfig) -> int: + return get_clip_num_patches(image_size=hf_config.image_size, + patch_size=hf_config.patch_size) + + +def get_max_clip_image_tokens(hf_config: CLIPVisionConfig) -> int: + return get_clip_image_feature_size(hf_config) + + +def dummy_seq_data_for_clip( + hf_config: CLIPVisionConfig, + seq_len: int, + *, + image_token_id: int, + image_feature_size_override: Optional[int] = None, +): + if image_feature_size_override is None: + image_feature_size = get_clip_image_feature_size(hf_config) + else: + image_feature_size = image_feature_size_override + + token_ids = [image_token_id] * image_feature_size + token_ids += [0] * (seq_len - image_feature_size) + return SequenceData(token_ids) + + +def dummy_image_for_clip( + hf_config: CLIPVisionConfig, + *, + image_width_override: Optional[int] = None, + image_height_override: Optional[int] = None, +): + width = height = hf_config.image_size + if image_width_override is not None: + width = image_width_override + if image_height_override is not None: + height = image_height_override + + image = Image.new("RGB", (width, height), color=0) + return {"image": image} + + +def input_processor_for_clip( + model_config: ModelConfig, + hf_config: CLIPVisionConfig, + llm_inputs: LLMInputs, + *, + image_token_id: int, + image_feature_size_override: Optional[int] = None, +): + multi_modal_data = llm_inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return llm_inputs + + tokenizer = cached_get_tokenizer(model_config.tokenizer) + + if image_feature_size_override is None: + image_feature_size = get_clip_image_feature_size(hf_config) + else: + image_feature_size = image_feature_size_override + + new_prompt, new_token_ids = repeat_and_pad_image_tokens( + tokenizer, + llm_inputs.get("prompt"), + llm_inputs["prompt_token_ids"], + image_token_id=image_token_id, + repeat_count=image_feature_size, + ) + + # NOTE: Create a defensive copy of the original inputs + return LLMInputs(prompt_token_ids=new_token_ids, + prompt=new_prompt, + 
multi_modal_data=multi_modal_data) + + +# Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa +class CLIPVisionEmbeddings(nn.Module): + + def __init__(self, config: CLIPVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + bias=False, + ) + + self.num_patches = get_clip_num_patches(image_size=self.image_size, + patch_size=self.patch_size) + self.num_positions = self.num_patches + 1 + self.position_embedding = nn.Embedding(self.num_positions, + self.embed_dim) + self.register_buffer("position_ids", + torch.arange(self.num_positions).expand((1, -1)), + persistent=False) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size = pixel_values.shape[0] + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to( + dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + embeddings = embeddings + self.position_embedding(self.position_ids) + + return embeddings + + +class CLIPMLP(nn.Module): + + def __init__(self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + self.config = config + self.activation_fn = get_act_fn(config.hidden_act) + self.fc1 = ColumnParallelLinear(config.hidden_size, + config.intermediate_size, + bias=True, + quant_config=quant_config) + self.fc2 = RowParallelLinear(config.intermediate_size, + config.hidden_size, + bias=True, + quant_config=quant_config) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states, _ = self.fc2(hidden_states) + + return hidden_states + + +class CLIPEncoderLayer(nn.Module): + + def __init__(self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + + self.self_attn = CLIPAttention(config) + self.layer_norm1 = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.mlp = CLIPMLP(config, quant_config=quant_config) + self.layer_norm2 = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, _ = self.self_attn(hidden_states=hidden_states) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class CLIPEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self + attention layers. Each layer is a [`CLIPEncoderLayer`]. 
+ + Args: + config: CLIPConfig + """ + + def __init__(self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + self.config = config + self.layers = nn.ModuleList([ + CLIPEncoderLayer(config=config, quant_config=quant_config) + for _ in range(config.num_hidden_layers) + ]) + + def forward(self, + inputs_embeds: torch.Tensor, + vision_feature_layer: int = -1): + + # Encoder forward pass only up to the required layer + num_layer = len(self.layers) + vision_feature_layer + 1 + hidden_states = inputs_embeds + for encoder_layer in self.layers[:num_layer]: + hidden_states = encoder_layer(hidden_states) + + return hidden_states + + +class CLIPVisionTransformer(nn.Module): + + def __init__(self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = CLIPVisionEmbeddings(config) + + # NOTE: This typo of "layrnorm" is not fixed on purpose to match + # the original transformers code and name of the model weights. + self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + self.encoder = CLIPEncoder(config=config, quant_config=quant_config) + + def forward( + self, + pixel_values: torch.Tensor, + vision_feature_layer: int = -1, + ) -> torch.Tensor: + + hidden_states = self.embeddings(pixel_values) + hidden_states = self.pre_layrnorm(hidden_states) + hidden_states = self.encoder(inputs_embeds=hidden_states, + vision_feature_layer=vision_feature_layer) + + return hidden_states + + +class CLIPVisionModel(nn.Module): + + config_class = CLIPVisionConfig + main_input_name = "pixel_values" + + def __init__(self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + self.vision_model = CLIPVisionTransformer(config=config, + quant_config=quant_config) + + def forward(self, + pixel_values: Optional[torch.Tensor] = None, + vision_feature_layer: int = -1): + + return self.vision_model(pixel_values=pixel_values, + vision_feature_layer=vision_feature_layer) + + @property + def device(self): + return next(self.parameters()).device diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py new file mode 100644 index 0000000000000..fb4097fd1e9b3 --- /dev/null +++ b/vllm/model_executor/models/deepseek_v2.py @@ -0,0 +1,537 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only DeepseekV2 model.""" +from typing import Any, Dict, Iterable, List, Optional, Tuple + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm.attention import Attention, AttentionMetadata +from vllm.config import CacheConfig +from vllm.distributed import (get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_reduce) +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe import fused_experts, grouped_topk +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + MergedColumnParallelLinear, + ReplicatedLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors, SamplerOutput + + +class DeepseekV2MLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: Optional[QuantizationConfig] = None, + reduce_results: bool = True, + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + quant_config=quant_config) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + reduce_results=reduce_results) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. 
" + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class DeepseekV2MoE(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.config = config + self.rank = get_tensor_model_parallel_rank() + self.tp_size = get_tensor_model_parallel_world_size() + self.n_routed_experts = config.n_routed_experts + self.top_k = config.num_experts_per_tok + self.routed_scaling_factor = config.routed_scaling_factor + if self.tp_size > self.n_routed_experts: + raise ValueError( + f"Tensor parallel size {self.tp_size} is greater than " + f"the number of experts {self.n_routed_experts}.") + + self.experts = nn.ModuleList([ + DeepseekV2MLP(hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + reduce_results=False) + for idx in range(self.n_routed_experts) + ]) + self.pack_params() + + self.gate = ReplicatedLinear(config.hidden_size, + self.n_routed_experts, + bias=False, + quant_config=None) + + if config.n_shared_experts is not None: + intermediate_size = (config.moe_intermediate_size * + config.n_shared_experts) + self.shared_experts = DeepseekV2MLP( + hidden_size=config.hidden_size, + intermediate_size=intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + reduce_results=False, + ) + + def pack_params(self): + w1 = [] + w2 = [] + for expert in self.experts: + w1.append(expert.gate_up_proj.weight) + w2.append(expert.down_proj.weight) + self.w1 = torch._utils._flatten_dense_tensors(w1) + w1s = torch._utils._unflatten_dense_tensors(self.w1, w1) + for data, param in zip(w1s, w1): + param.data = data + self.w1 = self.w1.view(len(w1), *w1s[0].shape) + + self.w2 = torch._utils._flatten_dense_tensors(w2) + w2s = torch._utils._unflatten_dense_tensors(self.w2, w2) + for data, param in zip(w2s, w2): + param.data = data + + self.w2 = self.w2.view(len(w2), *w2s[0].shape) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + num_tokens, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + if self.config.n_shared_experts is not None: + shared_output = self.shared_experts(hidden_states) + # router_logits: (num_tokens, n_experts) + router_logits, _ = self.gate(hidden_states) + topk_weights, topk_ids = grouped_topk( + hidden_states, + router_logits, + self.top_k, + renormalize=self.config.norm_topk_prob, + num_expert_group=self.config.n_group, + topk_group=self.config.topk_group) + final_hidden_states = fused_experts( + hidden_states, + self.w1, + self.w2, + topk_weights, + topk_ids, + inplace=True) * self.routed_scaling_factor + if self.config.n_shared_experts is not None: + final_hidden_states = final_hidden_states + shared_output + final_hidden_states = tensor_model_parallel_all_reduce( + final_hidden_states) + + return final_hidden_states.view(num_tokens, hidden_dim) + + +def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float: + import math + if scale <= 1: + return 1.0 + return 0.1 * mscale * math.log(scale) + 1.0 + + +class DeepseekV2Attention(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + hidden_size: int, + num_heads: int, + qk_nope_head_dim: int, + qk_rope_head_dim: int, + v_head_dim: int, + q_lora_rank: int, + kv_lora_rank: int, + rope_theta: float = 10000, + rope_scaling: 
Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 8192, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + layer_idx=None, + ) -> None: + super().__init__() + self.layer_idx = layer_idx + self.hidden_size = hidden_size + self.qk_nope_head_dim = qk_nope_head_dim + self.qk_rope_head_dim = qk_rope_head_dim + self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim + self.v_head_dim = v_head_dim + self.q_lora_rank = q_lora_rank + self.kv_lora_rank = kv_lora_rank + self.num_heads = num_heads + tp_size = get_tensor_model_parallel_world_size() + assert num_heads % tp_size == 0 + self.num_local_heads = num_heads // tp_size + self.scaling = self.qk_head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + if self.q_lora_rank is not None: + self.q_a_proj = ReplicatedLinear(self.hidden_size, + self.q_lora_rank, + bias=False, + quant_config=quant_config) + self.q_a_layernorm = RMSNorm(self.q_lora_rank, + eps=config.rms_norm_eps) + self.q_b_proj = ColumnParallelLinear(q_lora_rank, + self.num_heads * + self.qk_head_dim, + bias=False, + quant_config=quant_config) + else: + self.q_proj = ColumnParallelLinear(self.hidden_size, + self.num_heads * + self.qk_head_dim, + bias=False, + quant_config=quant_config) + + self.kv_a_proj_with_mqa = ReplicatedLinear(self.hidden_size, + self.kv_lora_rank + + self.qk_rope_head_dim, + bias=False, + quant_config=quant_config) + self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, + eps=config.rms_norm_eps) + self.kv_b_proj = ColumnParallelLinear( + self.kv_lora_rank, + self.num_heads * (self.qk_nope_head_dim + self.v_head_dim), + bias=False, + quant_config=quant_config) + # O projection. + self.o_proj = RowParallelLinear(self.num_heads * self.v_head_dim, + self.hidden_size, + bias=False, + quant_config=quant_config) + rope_scaling['type'] = 'deepseek_yarn' + self.rotary_emb = get_rope(qk_rope_head_dim, + rotary_dim=qk_rope_head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + is_neox_style=False) + + if rope_scaling: + mscale_all_dim = rope_scaling.get("mscale_all_dim", False) + scaling_factor = rope_scaling["factor"] + mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) + self.scaling = self.scaling * mscale * mscale + + # self.attn = Attention(self.num_heads, + # self.qk_head_dim, + # self.scaling, + # num_kv_heads=self.num_heads) + + # TODO, support head_size 192 + self.attn = Attention(self.num_local_heads, + 256, + self.scaling, + num_kv_heads=self.num_local_heads, + cache_config=cache_config, + quant_config=quant_config) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + if self.q_lora_rank is not None: + q = self.q_a_proj(hidden_states)[0] + q = self.q_a_layernorm(q) + q = self.q_b_proj(q)[0].view(-1, self.num_local_heads, + self.qk_head_dim) + else: + q = self.q_proj(hidden_states)[0].view(-1, self.num_local_heads, + self.qk_head_dim) + q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], + dim=-1) + latent_cache = self.kv_a_proj_with_mqa(hidden_states)[0] + kv_a, _ = latent_cache.split( + [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) + latent_cache = latent_cache.unsqueeze(1) + kv_a = self.kv_a_layernorm(kv_a.contiguous()) + kv = self.kv_b_proj(kv_a)[0] + kv = kv.view(-1, self.num_local_heads, + self.qk_nope_head_dim + self.v_head_dim) + k_nope, v = 
kv.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1) + k_pe = latent_cache[:, :, self.kv_lora_rank:] + q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe) + q[..., self.qk_nope_head_dim:] = q_pe + k = torch.empty_like(q) + k[..., :self.qk_nope_head_dim] = k_nope + k[..., self.qk_nope_head_dim:] = k_pe + q = torch.nn.functional.pad(q, [0, 256 - self.qk_head_dim], + value=0).view(-1, + self.num_local_heads * 256) + k = torch.nn.functional.pad(k, [0, 256 - self.qk_head_dim], + value=0).view(-1, + self.num_local_heads * 256) + v = torch.nn.functional.pad(v, [0, 256 - self.v_head_dim], + value=0).view(-1, + self.num_local_heads * 256) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) + attn_output = attn_output.view( + -1, self.num_local_heads, 256)[..., :self.v_head_dim].reshape( + -1, self.num_local_heads * self.v_head_dim) + output, _ = self.o_proj(attn_output) + return output + + +class DeepseekV2DecoderLayer(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + layer_idx: int, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + self.self_attn = DeepseekV2Attention( + config=config, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + qk_nope_head_dim=config.qk_nope_head_dim, + qk_rope_head_dim=config.qk_rope_head_dim, + v_head_dim=config.v_head_dim, + q_lora_rank=config.q_lora_rank + if hasattr(config, "q_lora_rank") else None, + kv_lora_rank=config.kv_lora_rank, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + cache_config=cache_config, + quant_config=quant_config, + layer_idx=layer_idx, + ) + if (config.n_routed_experts is not None + and layer_idx >= config.first_k_dense_replace + and layer_idx % config.moe_layer_freq == 0): + self.mlp = DeepseekV2MoE(config=config, quant_config=quant_config) + else: + self.mlp = DeepseekV2MLP( + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + ) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + residual: Optional[torch.Tensor], + ) -> torch.Tensor: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +class DeepseekV2Model(nn.Module): + + fall_back_to_pt_during_load = False + + def __init__( + self, + config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.padding_idx = config.pad_token_id + self.vocab_size 
= config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList([ + DeepseekV2DecoderLayer(config, + layer_idx, + cache_config=cache_config, + quant_config=quant_config) + for layer_idx in range(config.num_hidden_layers) + ]) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer(positions, hidden_states, + kv_caches[i], attn_metadata, + residual) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class DeepseekV2ForCausalLM(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.config = config + self.quant_config = quant_config + self.model = DeepseekV2Model(config, cache_config, quant_config) + self.lm_head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config) + self.logits_processor = LogitsProcessor(config.vocab_size) + self.sampler = Sampler() + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + attn_metadata) + return hidden_states + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: Optional[torch.Tensor], + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Skip experts that are not assigned to this worker. + if (("mlp.experts." in name or "mlp.shared_experts." in name) + and name not in params_dict): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Skip experts that are not assigned to this worker. + if (("mlp.experts." in name or "mlp.shared_experts." 
in name) + and name not in params_dict): + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py new file mode 100644 index 0000000000000..8386084c2b3f8 --- /dev/null +++ b/vllm/model_executor/models/gemma2.py @@ -0,0 +1,395 @@ +# coding=utf-8 +# Copyright 2024 The vLLM team. +# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Iterable, List, Optional, Set, Tuple + +import torch +from torch import nn +from transformers import Gemma2Config + +from vllm.attention import Attention, AttentionMetadata +from vllm.config import CacheConfig, LoRAConfig +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import GeluAndMul +from vllm.model_executor.layers.layernorm import GemmaRMSNorm +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import GemmaRotaryEmbedding +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors, SamplerOutput + +from .interfaces import SupportsLoRA + + +class Gemma2MLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + hidden_activation: str, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + quant_config=quant_config) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config) + if not (hidden_act == hidden_activation == "gelu_pytorch_tanh"): + raise ValueError( + "Gemma2 uses `gelu_pytorch_tanh` as the hidden activation " + "function. 
Please set `hidden_act` and `hidden_activation` to " + "`gelu_pytorch_tanh`.") + self.act_fn = GeluAndMul(approximate="tanh") + + def forward(self, x: torch.Tensor) -> torch.Tensor: + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class Gemma2Attention(nn.Module): + + def __init__(self, + layer_idx: int, + config: Gemma2Config, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + head_dim: int, + max_position_embeddings: int, + rope_theta: float, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None) -> None: + super().__init__() + self.layer_idx = layer_idx + self.config = config + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = head_dim + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = config.query_pre_attn_scalar**-0.5 + self.rope_theta = rope_theta + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=config.attention_bias, + quant_config=quant_config, + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=config.attention_bias, + quant_config=quant_config, + ) + # TODO(woosuk): Use the `get_rope` interface. + self.rotary_emb = GemmaRotaryEmbedding( + self.head_dim, + self.head_dim, + max_position_embeddings, + base=self.rope_theta, + is_neox_style=True, + dtype=torch.get_default_dtype(), + ) + + # FIXME(woosuk): While Gemma 2 uses sliding window attention for every + # odd layer, vLLM currently ignores it and uses global attention for + # all layers. + use_sliding_window = (layer_idx % 2 == 1 + and config.sliding_window is not None) + del use_sliding_window # Unused. 
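        # (If the sliding window were honored, the layers with odd layer_idx
        # per the check above would attend only to the last
        # `config.sliding_window` tokens; with this simplification every
        # layer attends globally.)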
+ self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class Gemma2DecoderLayer(nn.Module): + + def __init__( + self, + layer_idx: int, + config: Gemma2Config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = Gemma2Attention( + layer_idx=layer_idx, + config=config, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + head_dim=config.head_dim, + max_position_embeddings=config.max_position_embeddings, + rope_theta=config.rope_theta, + cache_config=cache_config, + quant_config=quant_config, + ) + self.hidden_size = config.hidden_size + self.mlp = Gemma2MLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + hidden_activation=config.hidden_activation, + quant_config=quant_config, + ) + self.input_layernorm = GemmaRMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = GemmaRMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.pre_feedforward_layernorm = GemmaRMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_feedforward_layernorm = GemmaRMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) + hidden_states = self.post_attention_layernorm(hidden_states) + + hidden_states, residual = self.pre_feedforward_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) + hidden_states = self.post_feedforward_layernorm(hidden_states) + return hidden_states, residual + + +class Gemma2Model(nn.Module): + + def __init__( + self, + config: Gemma2Config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.config = config + + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList([ + Gemma2DecoderLayer(layer_idx, config, cache_config, quant_config) + for layer_idx in range(config.num_hidden_layers) + ]) + self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + # Normalize the embedding by sqrt(hidden_size) + # The normalizer's data type should be downcasted to the model's + # data type such as bfloat16, not float32. 
+ # See https://github.com/huggingface/transformers/pull/29402 + normalizer = self.config.hidden_size**0.5 + self.register_buffer("normalizer", torch.tensor(normalizer)) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + hidden_states *= self.normalizer + + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i], + attn_metadata, + residual, + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class Gemma2ForCausalLM(nn.Module, SupportsLoRA): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "gate_up_proj", + "down_proj", + ] + # Gemma does not apply LoRA to the embedding layer. + embedding_modules = {} + embedding_padding_modules = [] + + def __init__( + self, + config: Gemma2Config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + ) -> None: + del lora_config # Unused. + super().__init__() + self.config = config + self.quant_config = quant_config + self.model = Gemma2Model(config, cache_config, quant_config) + self.logits_processor = LogitsProcessor( + config.vocab_size, soft_cap=config.final_logit_softcapping) + self.sampler = Sampler() + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + attn_metadata) + return hidden_states + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.model.embed_tokens, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() + for name, loaded_weight in weights: + for (param_name, shard_name, shard_id) in stacked_params_mapping: + if shard_name not in name: + continue + name = name.replace(shard_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # lm_head is not used in vllm as it is tied with embed_token. + # To prevent errors, skip loading lm_head.weight. + if "lm_head.weight" in name: + continue + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + + unloaded_params = params_dict.keys() - loaded_params + if unloaded_params: + raise RuntimeError( + "Some weights are not initialized from checkpoints: " + f"{unloaded_params}") diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py new file mode 100644 index 0000000000000..2697a6996f4ca --- /dev/null +++ b/vllm/model_executor/models/interfaces.py @@ -0,0 +1,144 @@ +from typing import (ClassVar, Dict, List, Literal, Optional, Protocol, Type, + Union, overload, runtime_checkable) + +from typing_extensions import TypeGuard + +from vllm.config import LoRAConfig, MultiModalConfig +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +@runtime_checkable +class SupportsVision(Protocol): + """The interface required for all vision language models (VLMs).""" + + supports_vision: ClassVar[Literal[True]] = True + """ + A flag that indicates this model supports vision inputs. + + Note: + There is no need to redefine this flag if this class is in the + MRO of your model class. + """ + + def __init__(self, *, multimodal_config: MultiModalConfig) -> None: + ... + + +# We can't use runtime_checkable with ClassVar for issubclass checks +# so we need to treat the class as an instance and use isinstance instead +@runtime_checkable +class _SupportsVisionType(Protocol): + supports_vision: Literal[True] + + def __call__(self, *, multimodal_config: MultiModalConfig) -> None: + ... + + +@overload +def supports_vision(model: Type[object]) -> TypeGuard[Type[SupportsVision]]: + ... + + +@overload +def supports_vision(model: object) -> TypeGuard[SupportsVision]: + ... + + +def supports_vision( + model: Union[Type[object], object], +) -> Union[TypeGuard[Type[SupportsVision]], TypeGuard[SupportsVision]]: + if isinstance(model, type): + return isinstance(model, _SupportsVisionType) + + return isinstance(model, SupportsVision) + + +@runtime_checkable +class SupportsLoRA(Protocol): + """The interface required for all models that support LoRA.""" + + supports_lora: ClassVar[Literal[True]] = True + """ + A flag that indicates this model supports LoRA. + + Note: + There is no need to redefine this flag if this class is in the + MRO of your model class. + """ + + packed_modules_mapping: ClassVar[Dict[str, List[str]]] + supported_lora_modules: ClassVar[List[str]] + embedding_modules: ClassVar[Dict[str, str]] + embedding_padding_modules: ClassVar[List[str]] + + # lora_config is None when LoRA is not enabled + def __init__(self, *, lora_config: Optional[LoRAConfig] = None) -> None: + ... + + +# We can't use runtime_checkable with ClassVar for issubclass checks +# so we need to treat the class as an instance and use isinstance instead +@runtime_checkable +class _SupportsLoRAType(Protocol): + supports_lora: Literal[True] + + packed_modules_mapping: Dict[str, List[str]] + supported_lora_modules: List[str] + embedding_modules: Dict[str, str] + embedding_padding_modules: List[str] + + def __call__(self, *, lora_config: Optional[LoRAConfig] = None) -> None: + ... + + +@overload +def supports_lora(model: Type[object]) -> TypeGuard[Type[SupportsLoRA]]: + ... + + +@overload +def supports_lora(model: object) -> TypeGuard[SupportsLoRA]: + ... 
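+# Illustrative usage (using `Gemma2ForCausalLM` from this patch as an example):
+#
+#     if supports_lora(model_cls):
+#         mapping = model_cls.packed_modules_mapping  # statically typed access
+#
+# At runtime this is an ordinary protocol isinstance() check; the TypeGuard
+# return type additionally narrows the class (or instance) for type checkers.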
+ + +def supports_lora( + model: Union[Type[object], object], +) -> Union[TypeGuard[Type[SupportsLoRA]], TypeGuard[SupportsLoRA]]: + result = _supports_lora(model) + + if not result: + lora_attrs = ( + "packed_modules_mapping", + "supported_lora_modules", + "embedding_modules", + "embedding_padding_modules", + ) + missing_attrs = tuple(attr for attr in lora_attrs + if not hasattr(model, attr)) + + if getattr(model, "supports_lora", False): + if missing_attrs: + logger.warning( + "The model (%s) sets `supports_lora=True`, " + "but is missing LoRA-specific attributes: %s", + model, + missing_attrs, + ) + else: + if not missing_attrs: + logger.warning( + "The model (%s) contains all LoRA-specific attributes, " + "but does not set `supports_lora=True`.", model) + + return result + + +def _supports_lora( + model: Union[Type[object], object], +) -> Union[TypeGuard[Type[SupportsLoRA]], TypeGuard[SupportsLoRA]]: + if isinstance(model, type): + return isinstance(model, _SupportsLoRAType) + + return isinstance(model, SupportsLoRA) diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py new file mode 100644 index 0000000000000..bf330c7770d12 --- /dev/null +++ b/vllm/model_executor/models/jamba.py @@ -0,0 +1,955 @@ +# coding=utf-8 +"""Inference-only Jurassic model.""" +from dataclasses import dataclass +from typing import Dict, Iterable, List, Optional, Tuple + +import torch +from causal_conv1d import causal_conv1d_fn, causal_conv1d_update +from mamba_ssm.ops.selective_scan_interface import selective_scan_fn +from mamba_ssm.ops.triton.selective_state_update import selective_state_update +from torch import nn +from torch.nn.parameter import Parameter +from transformers import JambaConfig + +from vllm.attention.backends.abstract import AttentionMetadata +from vllm.attention.layer import Attention +from vllm.config import CacheConfig, LoRAConfig +from vllm.distributed import (get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_reduce) +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + MergedColumnParallelLinear, + QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.utils import set_weight_attrs +from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.worker.model_runner import _BATCH_SIZES_TO_CAPTURE + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +@dataclass +class MambaCacheParams: + is_prompt: bool = False + conv_state: torch.Tensor = torch.Tensor() + ssm_state: torch.Tensor = torch.Tensor() + + +# Adapted from transformers.models.mamba.modeling_mamba.MambaMixer +class JambaMambaMixer(nn.Module): + """ + Compute ∆, A, B, C, and D the state space parameters and compute + the `contextualized_states`. 
A, D are input independent + (see Mamba paper [1] Section 3.5.2 "Interpretation of A" + for why A isn't selective) ∆, B, C are input-dependent + (this is a key difference between Mamba and the linear time + invariant S4, and is why Mamba is called + **selective** state spaces) + """ + + def __init__(self, config: JambaConfig, layer_idx): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.hidden_size = config.hidden_size + self.ssm_state_size = config.mamba_d_state + self.conv_kernel_size = config.mamba_d_conv + self.intermediate_size = config.mamba_expand * config.hidden_size + self.time_step_rank = config.mamba_dt_rank + self.use_conv_bias = config.mamba_conv_bias + self.use_bias = config.mamba_proj_bias + self.conv1d = ColumnParallelLinear( + input_size=self.conv_kernel_size, + output_size=self.intermediate_size, + bias=self.use_conv_bias, + ) + # unsqueeze to fit conv1d weights shape into the linear weights shape. + # Can't do this in `weight_loader` since it already exists in + # `ColumnParallelLinear` and `set_weight_attrs` + # doesn't allow to override it + self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1) + + self.in_proj = MergedColumnParallelLinear(self.hidden_size, + [self.intermediate_size] * 2, + bias=self.use_bias) + # selective projection used to make dt, B and C input dependent + self.x_proj = RowParallelLinear( + self.intermediate_size, + self.time_step_rank + self.ssm_state_size * 2, + bias=False, + ) + # time step projection (discretization) - + # In the forward we need to apply dt_proj without the bias, + # as the bias is added in the selective scan kernel. + self.dt_proj = ColumnParallelLinear(self.time_step_rank, + self.intermediate_size, + bias=True, + skip_bias_add=True) + + def weight_loader(param: Parameter, loaded_weight: torch.Tensor): + tp_rank = get_tensor_model_parallel_rank() + tp_size = get_tensor_model_parallel_world_size() + param.data.copy_( + loaded_weight.data.split(loaded_weight.shape[0] // tp_size, + dim=0)[tp_rank]) + + def A_weight_loader(param: Parameter, loaded_weight: torch.Tensor): + weight_loader(param, -torch.exp(loaded_weight.float())) + + tp_size = get_tensor_model_parallel_world_size() + self.A = nn.Parameter( + torch.empty( + self.intermediate_size // tp_size, + self.ssm_state_size, + dtype=torch.float32, + )) + self.D = nn.Parameter(torch.ones(self.intermediate_size // tp_size)) + + set_weight_attrs(self.D, {"weight_loader": weight_loader}) + set_weight_attrs(self.A, {"weight_loader": A_weight_loader}) + + self.out_proj = RowParallelLinear( + self.intermediate_size, + self.hidden_size, + bias=self.use_bias, + input_is_parallel=True, + ) + self.activation = config.hidden_act + + self.dt_layernorm = RMSNorm(self.time_step_rank, + eps=config.rms_norm_eps) + self.b_layernorm = RMSNorm(self.ssm_state_size, + eps=config.rms_norm_eps) + self.c_layernorm = RMSNorm(self.ssm_state_size, + eps=config.rms_norm_eps) + + def mamba_forward(self, + hidden_states: torch.Tensor, + cache_params: MambaCacheParams = None): + # 1. Gated MLP's linear projection + projected_states = self.in_proj(hidden_states)[0].transpose(1, 2) + hidden_states, gate = projected_states.chunk(2, dim=1) + + # 2. 
Convolution sequence transformation + conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), + self.conv1d.weight.size(2)) + if cache_params is not None and not cache_params.is_prompt: + hidden_states = causal_conv1d_update( + hidden_states.squeeze(-1), + cache_params.conv_state, + conv_weights, + self.conv1d.bias, + self.activation, + ) + hidden_states = hidden_states.unsqueeze(-1) + else: + if cache_params is not None: + conv_states = nn.functional.pad( + hidden_states, + (self.conv_kernel_size - hidden_states.shape[-1], 0)) + cache_params.conv_state.copy_(conv_states) + + hidden_states = causal_conv1d_fn( + hidden_states, + conv_weights, + self.conv1d.bias, + activation=self.activation, + ) + + # 3. State Space Model sequence transformation + # 3.a. input varying initialization of time_step, B and C + ssm_parameters = self.x_proj(hidden_states.transpose(1, 2))[0] + + time_step, B, C = torch.split( + ssm_parameters, + [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], + dim=-1, + ) + time_step = self.dt_layernorm(time_step.contiguous()) + B = self.b_layernorm(B.contiguous()) + C = self.c_layernorm(C.contiguous()) + + discrete_time_step = self.dt_proj(time_step)[0].transpose(1, 2) + # 3.c perform the recurrence y ← SSM(A, B, C)(x) + time_proj_bias = (self.dt_proj.bias.float() if hasattr( + self.dt_proj, "bias") else None) + if cache_params is not None and not cache_params.is_prompt: + scan_outputs = selective_state_update( + cache_params.ssm_state, + hidden_states[..., 0], + discrete_time_step[..., 0], + self.A, + B[:, 0], + C[:, 0], + self.D, + gate[..., 0], + time_proj_bias, + dt_softplus=True, + ).unsqueeze(-1) + else: + scan_outputs, ssm_state = selective_scan_fn( + hidden_states, + discrete_time_step, + self.A, + B.transpose(1, 2), + C.transpose(1, 2), + self.D.float(), + gate, + time_proj_bias, + delta_softplus=True, + return_last_state=True, + ) + if ssm_state is not None and cache_params is not None: + cache_params.ssm_state.copy_(ssm_state) + + # 4. 
Final linear projection + contextualized_states = self.out_proj(scan_outputs.transpose(1, 2))[0] + return contextualized_states + + def forward( + self, + hidden_states: torch.Tensor, + attn_metadata: AttentionMetadata, + conv_state: torch.Tensor, + ssm_state: torch.Tensor, + ): + if attn_metadata.prefill_metadata is not None: + offset = 0 + for i, prompt_len in enumerate( + attn_metadata.prefill_metadata.seq_lens): + cache = MambaCacheParams(True, + conv_state=conv_state[i].unsqueeze(0), + ssm_state=ssm_state[i].unsqueeze(0)) + hidden_states[offset:offset + prompt_len].copy_( + self.mamba_forward(hidden_states[offset:offset + + prompt_len].unsqueeze(0), + cache_params=cache)[0]) + offset += prompt_len + else: + cache = MambaCacheParams(False, + conv_state=conv_state, + ssm_state=ssm_state) + hidden_states = self.mamba_forward(hidden_states.unsqueeze(1), + cache_params=cache) + hidden_states = hidden_states.squeeze(1) + + return hidden_states + + +class JambaMLP(nn.Module): + + def __init__( + self, + config: JambaConfig, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + hidden_size = config.hidden_size + intermediate_size = config.intermediate_size + hidden_act = config.hidden_act + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + quant_config=quant_config) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. " + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class JambaMoE(nn.Module): + """A tensor-parallel MoE implementation for Mixtral that shards each expert + across all ranks. + + Each expert's weights are sharded across all ranks and a fused MoE + kernel is used for the forward pass, and finally we reduce the outputs + across ranks. 
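+    The expert weights are stored as two fused 3-D parameters: `ws` of shape
+    (num_experts, 2 * intermediate_size // tp_size, hidden_size) holding the
+    concatenated gate/up projections, and `w2s` of shape
+    (num_experts, hidden_size, intermediate_size // tp_size) for down_proj.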
+ """ + + def __init__( + self, + config: JambaConfig, + params_dtype: Optional[torch.dtype] = None, + tp_size: Optional[int] = None, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.tp_size = tp_size or get_tensor_model_parallel_world_size() + self.num_total_experts = config.num_experts + self.top_k = config.num_experts_per_tok + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size // self.tp_size + + if params_dtype is None: + params_dtype = torch.get_default_dtype() + self.params_dtype = params_dtype + + self.router = ReplicatedLinear(self.hidden_size, + self.num_total_experts, + bias=False, + params_dtype=self.params_dtype) + + self.ws = nn.Parameter( + torch.empty( + self.num_total_experts, + 2 * self.intermediate_size, + self.hidden_size, + device="cuda", + dtype=self.params_dtype, + )) + self.w2s = nn.Parameter( + torch.empty( + self.num_total_experts, + self.hidden_size, + self.intermediate_size, + device="cuda", + dtype=self.params_dtype, + )) + + set_weight_attrs( + self.ws, + { + "weight_loader": self.weight_loader, + }, + ) + set_weight_attrs( + self.w2s, + { + "weight_loader": self.weight_loader, + }, + ) + + def weight_loader( + self, + param: nn.Parameter, + loaded_weight: torch.Tensor, + weight_name: str, + expert_id: int, + ): + tp_rank = get_tensor_model_parallel_rank() + param_data = param.data + shard_size = self.intermediate_size + shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size) + if weight_name.endswith("gate_proj.weight"): + param_data[expert_id, 0:shard_size, :] = loaded_weight[shard, :] + if weight_name.endswith("up_proj.weight"): + param_data[expert_id, + shard_size:2 * shard_size, :] = loaded_weight[shard, :] + if weight_name.endswith("down_proj.weight"): + param_data[expert_id, :, :] = loaded_weight[:, shard] + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + num_tokens, hidden_size = hidden_states.shape + hidden_states = hidden_states.view(-1, self.hidden_size) + # router_logits: (batch * sequence_length, n_experts) + router_logits, _ = self.router(hidden_states) + + final_hidden_states = fused_moe( + hidden_states, + self.ws, + self.w2s, + router_logits, + self.top_k, + renormalize= + False, # Mixtral normalize the expert probs to 1. We don't! 
+ inplace=True, + ) + + if self.tp_size > 1: + final_hidden_states = tensor_model_parallel_all_reduce( + final_hidden_states) + + return final_hidden_states.view(num_tokens, hidden_size) + + +class JambaMambaDecoderLayer(nn.Module): + + def __init__(self, + config: JambaConfig, + layer_idx: int, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None) -> None: + super().__init__() + self.layer_idx = layer_idx + self.config = config + self.mamba = JambaMambaMixer(config, layer_idx) + + num_experts = config.layers_num_experts[layer_idx] + ffn_layer_class = JambaMoE if num_experts > 1 else JambaMLP + self.feed_forward = ffn_layer_class(config, quant_config=quant_config) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.pre_ff_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attn_metadata: AttentionMetadata, + residual: Optional[torch.Tensor], + conv_state: torch.Tensor, + ssm_state: torch.Tensor, + **kwargs, + ): + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + + hidden_states = self.mamba(hidden_states, attn_metadata, conv_state, + ssm_state) + # Fully Connected + hidden_states, residual = self.pre_ff_layernorm( + hidden_states, residual) + hidden_states = self.feed_forward(hidden_states) + return hidden_states, residual + + +class JambaAttentionDecoderLayer(nn.Module): + + def __init__( + self, + config: JambaConfig, + layer_idx: int, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = config.num_attention_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = config.num_key_value_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
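+            # (For example, 8 KV heads with tp_size=16 leaves one KV head per
+            # rank, each shared by two ranks; the numbers are illustrative.)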
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = config.hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + + self.qkv_proj = QKVParallelLinear( + config.hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + quant_config=quant_config, + ) + self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim, + config.hidden_size, + bias=False, + quant_config=quant_config) + + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + ) + + num_experts = config.layers_num_experts[layer_idx] + ffn_layer_class = JambaMoE if num_experts > 1 else JambaMLP + self.feed_forward = ffn_layer_class(config, quant_config=quant_config) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.pre_ff_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def self_attention( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + **kwargs, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) + output, _ = self.o_proj(attn_output) + return output + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + residual: Optional[torch.Tensor], + **kwargs, + ): + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + + hidden_states = self.self_attention( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) + # Fully Connected + hidden_states, residual = self.pre_ff_layernorm( + hidden_states, residual) + hidden_states = self.feed_forward(hidden_states) + return hidden_states, residual + + +ALL_DECODER_LAYER_TYPES = { + "attention": JambaAttentionDecoderLayer, + "mamba": JambaMambaDecoderLayer +} + + +class JambaModel(nn.Module): + + def __init__( + self, + config: JambaConfig, + quant_config: Optional[QuantizationConfig] = None, + cache_config: Optional[CacheConfig] = None, + lora_config: Optional[LoRAConfig] = None, + ) -> None: + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + lora_vocab = ((lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0) + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + ) + + decoder_layers = [] + for i in range(config.num_hidden_layers): + layer_class = ALL_DECODER_LAYER_TYPES[config.layers_block_type[i]] + decoder_layers.append( + layer_class(config, + layer_idx=i, + cache_config=cache_config, + quant_config=quant_config)) + self.layers = nn.ModuleList(decoder_layers) + self.final_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, 
+ conv_state: torch.Tensor, + ssm_state: torch.Tensor, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + residual = None + + for i in range(len(self.layers)): + layer = self.layers[i] + kv_cache = None + current_ssm_state = None + current_conv_state = None + if isinstance(layer, JambaAttentionDecoderLayer): + kv_cache = kv_caches[(i - self.config.attn_layer_offset) // + self.config.attn_layer_period] + if isinstance(layer, JambaMambaDecoderLayer): + current_state_layer = i - (1 + + (i - self.config.attn_layer_offset) + // self.config.attn_layer_period) + current_ssm_state = ssm_state[current_state_layer] + current_conv_state = conv_state[current_state_layer] + + hidden_states, residual = layer( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + residual=residual, + conv_state=current_conv_state, + ssm_state=current_ssm_state, + ) + hidden_states, _ = self.final_layernorm(hidden_states, residual) + return hidden_states + + +class JambaForCausalLM(nn.Module): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "embed_tokens", + "lm_head", + ] + embedding_modules = { + "embed_tokens": "input_embeddings", + "lm_head": "output_embeddings", + } + embedding_padding_modules = ["lm_head"] + + def __init__( + self, + config: JambaConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + ) -> None: + super().__init__() + self.config = config + self.model = JambaModel(config, + cache_config=cache_config, + quant_config=quant_config, + lora_config=lora_config) + self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE + # We need bigger padding if using lora for kernel + # compatibility + if not lora_config else lora_config.lora_vocab_padding_size, + ) + # Current step used indices + self.current_indices: List[int] = [] + # Used to track and store by the Mamba cache between steps. + self.mamba_cache: Tuple[torch.Tensor, torch.Tensor] = tuple() + # Used as an input_buffer for the CUDA graph runs. 
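+        # (Allocated in `_prepare_mamba_cache` with room for the largest
+        # batch size in _BATCH_SIZES_TO_CAPTURE plus some slack, so every
+        # captured graph can reuse it.)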
+ self.mamba_gc_cache_buffer: Tuple[torch.Tensor, torch.Tensor] = tuple() + # Maps between the request id and a dict that maps between the seq_id + # and its index inside the self.mamba_cache + self.mamba_cache_indices_mapping: Dict[str, Dict[int, int]] = {} + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size) + self.sampler = Sampler() + + def forward(self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs): + if not self.mamba_cache: + self._prepare_mamba_cache() + + if "seqlen_agnostic_capture_inputs" not in kwargs: + # We get here only on Prefill/Eager mode runs + assert all( + key in kwargs + for key in ["request_ids_to_seq_ids", "finished_requests_ids"]) + + request_ids_to_seq_ids = kwargs["request_ids_to_seq_ids"] + batch_size = input_ids.shape[0] + if attn_metadata.prefill_metadata: + batch_size = len(request_ids_to_seq_ids) + ( + current_seqlen_agnostic_cache, + indices, + ) = self._prepare_current_run_mamba_cache(request_ids_to_seq_ids, + batch_size) + finished_requests_ids = kwargs["finished_requests_ids"] + self._release_mamba_cache(finished_requests_ids) + else: + # CUDA graph capturing runs + current_seqlen_agnostic_cache, indices = ( + kwargs["seqlen_agnostic_capture_inputs"], + [], + ) + self.current_indices = indices + + hidden_states = self.model(input_ids, positions, kv_caches, + attn_metadata, + current_seqlen_agnostic_cache[0], + current_seqlen_agnostic_cache[1]) + + if "seqlen_agnostic_capture_inputs" not in kwargs: + self._copy_mamba_cache_by_indices(self.current_indices, + current_seqlen_agnostic_cache) + + return hidden_states + + def _copy_mamba_cache_by_indices( + self, indices: List[int], + current_seqlen_agnostic_cache: Tuple[torch.Tensor, torch.Tensor]): + for i, offset in enumerate(indices): + self._copy_mamba_cache(offset, i, current_seqlen_agnostic_cache) + + def _copy_mamba_cache(self, index_to: int, index_from: int, + from_buffer: Tuple[torch.Tensor, torch.Tensor]): + assert len(self.mamba_cache) > 0 + for (cache_t, from_buffer_t) in zip(self.mamba_cache, from_buffer): + cache_t[:, index_to].copy_(from_buffer_t[:, index_from], + non_blocking=True) + + def _assign_seq_id_to_mamba_cache(self, cur_rid: str, + seqs_id: List[int]) -> List[int]: + indices_for_current_run = [] + for seq_id in seqs_id: + if cur_rid not in self.mamba_cache_indices_mapping: + self.mamba_cache_indices_mapping[cur_rid] = {} + first_free_index = self._first_free_index_in_mamba_cache() + self.mamba_cache_indices_mapping[cur_rid][ + seq_id] = first_free_index + index_for_current_run = first_free_index + ## case of decoding n>1, copy prefill cache to decoding indices + elif seq_id not in (seq_ids2indices := + self.mamba_cache_indices_mapping[cur_rid]): + first_free_index = self._first_free_index_in_mamba_cache() + index_exist = list(seq_ids2indices.values())[0] + self._copy_mamba_cache(index_from=index_exist, + index_to=first_free_index, + from_buffer=self.mamba_cache) + self.mamba_cache_indices_mapping[cur_rid][ + seq_id] = first_free_index + index_for_current_run = first_free_index + else: + index_for_current_run = self.mamba_cache_indices_mapping[ + cur_rid][seq_id] + + indices_for_current_run.append(index_for_current_run) + return indices_for_current_run + + def _prepare_current_run_mamba_cache( + self, request_ids_to_seq_ids: Dict[str, list[int]], batch_size: int + ) -> Tuple[Tuple[torch.Tensor, 
torch.Tensor], List[int]]: + indices_for_current_run = [] + for request_id, seqs_id in request_ids_to_seq_ids.items(): + indices_for_current_run += self._assign_seq_id_to_mamba_cache( + request_id, seqs_id) + ## Pad the batch in case of running batch that was not captured via CG + padded_indices = indices_for_current_run.copy() + pad_index = self._first_free_index_in_mamba_cache() + + for _ in range(batch_size - len(indices_for_current_run)): + padded_indices.append(pad_index) + + conv_state = self.mamba_cache[0][:, padded_indices] + temporal_state = self.mamba_cache[1][:, padded_indices] + + return (conv_state, temporal_state), indices_for_current_run + + def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): + """ + Copy the relevant Mamba cache into the CUDA graph input buffer + that was provided during the capture runs + (JambaForCausalLM.mamba_gc_cache_buffer). + """ + assert all( + key in kwargs + for key in ["request_ids_to_seq_ids", "finished_requests_ids"]) + request_ids_to_seq_ids = kwargs["request_ids_to_seq_ids"] + batch_size = len(request_ids_to_seq_ids) + ( + current_mamba_cache, + indices, + ) = self._prepare_current_run_mamba_cache(request_ids_to_seq_ids, + batch_size) + self.current_indices = indices + finished_requests_ids = kwargs["finished_requests_ids"] + self._release_mamba_cache(finished_requests_ids) + + for input_buffer, current_cache_buffer in zip( + input_buffers["seqlen_agnostic_capture_inputs"], + current_mamba_cache): + input_buffer.copy_(current_cache_buffer, non_blocking=True) + + def copy_outputs_after_cuda_graphs(self, input_buffers, **kwargs): + """ + Copy the relevant Mamba cache from the CUDA graph input_buffers + back to the JambaForCausalLM.mamba_cache after CUDA + graph replay run is done. + """ + self._copy_mamba_cache_by_indices( + self.current_indices, + input_buffers["seqlen_agnostic_capture_inputs"]) + + def get_seqlen_agnostic_capture_inputs(self, batch_size: int): + """ + Provide the CUDA graph capture runs with a buffer in adjusted size. + The buffer is used to maintain the Mamba Cache during the CUDA graph + replay runs. 
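+        Only the first `batch_size` entries along the batch dimension are
+        returned, so captures of different sizes share the same storage.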
+ """ + return tuple(buffer[:, :batch_size] + for buffer in self.mamba_gc_cache_buffer) + + def _release_mamba_cache(self, finished_seq_groups_req_ids: List[str]): + for req_id in finished_seq_groups_req_ids: + if req_id in self.mamba_cache_indices_mapping: + self.mamba_cache_indices_mapping.pop(req_id) + + def _first_free_index_in_mamba_cache(self) -> int: + if self.mamba_cache: + max_possible_batch_size = self.mamba_cache[0].shape[1] + occupied = [ + id for seq_ids in self.mamba_cache_indices_mapping.values() + for id in seq_ids.values() + ] + first_free_index = [ + i not in occupied for i in range(max_possible_batch_size) + ].index(True) + return first_free_index + return 0 + + def _get_mamba_cache_shape( + self + ) -> Tuple[Optional[Tuple[int, int]], Optional[Tuple[int, int]]]: + world_size = get_tensor_model_parallel_world_size() + hidden_size = self.config.hidden_size + conv_state_shape = ( + self.config.mamba_expand * hidden_size // world_size, + self.config.mamba_d_conv, + ) + temporal_state_shape = ( + self.config.mamba_expand * self.config.hidden_size // world_size, + self.config.mamba_d_state, + ) + return conv_state_shape, temporal_state_shape + + def _prepare_mamba_cache(self): + dtype = self.lm_head.weight.dtype + layers_type = self.config.layers_block_type + mamba_layers = sum( + [layer_type == "mamba" for layer_type in layers_type]) + max_batch_size = _BATCH_SIZES_TO_CAPTURE[-1] + 10 + conv_state_shape, temporal_state_shape = self._get_mamba_cache_shape() + assert conv_state_shape is not None and temporal_state_shape is not None + for buffername in ["mamba_cache", "mamba_gc_cache_buffer"]: + buffer = (torch.empty(size=(mamba_layers, max_batch_size) + + conv_state_shape, + dtype=dtype, + device="cuda"), + torch.empty(size=(mamba_layers, max_batch_size) + + temporal_state_shape, + dtype=dtype, + device="cuda")) + setattr(self, buffername, buffer) + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: Optional[torch.Tensor], + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + expert_params_mapping = [ + # (param_name, weight_name, expert_id) + ( + "ws" if weight_name in ["gate_proj", "up_proj"] else "w2s", + f"experts.{expert_id}.{weight_name}.weight", + expert_id, + ) for expert_id in range(self.config.num_experts) + for weight_name in ["down_proj", "up_proj", "gate_proj"] + ] + + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + + if "A_log" in name: + name = name.replace("A_log", "A") + + if ".self_attn." in name: + name = name.replace(".self_attn", "") + + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + if 'experts' in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for param_name, weight_name, expert_id in expert_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, + loaded_weight, + weight_name, + expert_id=expert_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py new file mode 100644 index 0000000000000..97f7ec74292bb --- /dev/null +++ b/vllm/model_executor/models/mlp_speculator.py @@ -0,0 +1,188 @@ +import math +from typing import Iterable, List, Tuple + +import torch +import torch.nn as nn + +from vllm.model_executor import SamplingMetadata +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.sequence import SamplerOutput +from vllm.transformers_utils.configs import MLPSpeculatorConfig + +SQRT2 = 2**0.5 + + +class MLPSpeculatorLayerNorm(nn.Module): + """ + A L2 normalization implementation + ... + Args + ---- + normalized_shape : int + Dimensionality of input data (size of final tensor axis) + eps : float + Safety term to prevent division by zero. Make sure the chosen value + fits in the range of your encoding scheme + (i.e. fp16 requires eps >= 6e-8). + elementwise_scale_and_shift : bool + Include a learned scaling and shift term after normalization. + """ + + def __init__( + self, + normalized_shape, + eps=1e-06, + elementwise_scale_and_shift=True, + ): + super(MLPSpeculatorLayerNorm, self).__init__() + self.elementwise_scale_and_shift = elementwise_scale_and_shift + if self.elementwise_scale_and_shift: + self.weight = nn.Parameter(torch.empty(normalized_shape)) + self.bias = nn.Parameter(torch.empty(normalized_shape)) + self.eps = eps + + def forward(self, x): + xf = x + xf = xf * torch.rsqrt(xf.pow(2).mean(-1, keepdim=True) + self.eps) + x = xf.type_as(x) + if self.elementwise_scale_and_shift: + x = self.weight * x + x = x + self.bias + return x + + +class MLPSpeculator(nn.Module): + + def __init__(self, config: MLPSpeculatorConfig, **kwargs) -> None: + super().__init__() + self.n_predict = config.n_predict + self.vocab_size = config.vocab_size + self.emb_dim = config.emb_dim + self.inner_dim = config.inner_dim if config.inner_dim != 0 \ + else config.emb_dim + + self.max_speculative_tokens = config.num_lookahead_tokens + + self.tie_weights = config.tie_weights + self.scale_input = config.scale_input + + if self.tie_weights: + assert ( + self.n_predict > + 1), "You cannot tie weights between stages when only 1 exists" + embedding = VocabParallelEmbedding( + config.vocab_size, + self.inner_dim, + org_num_embeddings=config.vocab_size) + self.emb = nn.ModuleList([embedding] * self.max_speculative_tokens) + + # the initial projection from the base model may + # have a different size, so that stays separate. 
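+            # (For example, a base-model emb_dim of 4096 projected down to an
+            # inner_dim of 3072 for the first stage, then 3072 -> 3072 for the
+            # tied stages; these sizes are purely illustrative.)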
+ proj_first = nn.Linear(self.emb_dim, self.inner_dim, bias=False) + proj_tied = nn.Linear(self.inner_dim, self.inner_dim, bias=False) + self.proj = nn.ModuleList([proj_first] + [proj_tied] * + (self.max_speculative_tokens - 1)) + + head = ParallelLMHead(self.vocab_size, self.inner_dim, bias=False) + self.head = nn.ModuleList([head] * self.max_speculative_tokens) + + ln = MLPSpeculatorLayerNorm(self.inner_dim, + elementwise_scale_and_shift=True) + self.ln = nn.ModuleList([ln] * self.max_speculative_tokens) + + else: + self.emb = nn.ModuleList([ + VocabParallelEmbedding(config.vocab_size, + self.inner_dim, + org_num_embeddings=config.vocab_size) + for _ in range(self.max_speculative_tokens) + ]) + + self.proj = nn.ModuleList([ + nn.Linear((self.emb_dim if i == 0 else self.inner_dim), + self.inner_dim, + bias=False) + for i in range(self.max_speculative_tokens) + ]) + + self.head = nn.ModuleList([ + nn.Linear(self.inner_dim, self.vocab_size, bias=False) + for _ in range(self.max_speculative_tokens) + ]) + self.ln = nn.ModuleList([ + MLPSpeculatorLayerNorm(self.inner_dim, + elementwise_scale_and_shift=True) + for _ in range(self.max_speculative_tokens) + ]) + if self.scale_input: + self.ln0 = MLPSpeculatorLayerNorm( + self.emb_dim, elementwise_scale_and_shift=False) + + self.state_weight = 0.5**(0.5 / config.n_predict) + self.emb_weight = math.sqrt( + (1 - self.state_weight**2) * (self.inner_dim / 2)) + self.activation = nn.GELU() + self.config = config + self.logits_processor = LogitsProcessor(config.vocab_size, + config.vocab_size, 1.0) + self.sampler = Sampler() + + def generate_proposals( + self, + input_ids: torch.Tensor, + previous_hidden_states: torch.Tensor, + num_predict_tokens: int, + sampling_metadata: SamplingMetadata, + ) -> List[SamplerOutput]: + if num_predict_tokens > self.max_speculative_tokens: + raise ValueError(f"Max speculative tokens for model is " + f"{self.max_speculative_tokens}, but " + f"{num_predict_tokens} were requested") + + # b x 1 x d + previous_hidden_states = previous_hidden_states.unsqueeze(1) + + if self.scale_input: + previous_hidden_states = self.ln0(previous_hidden_states) / SQRT2 + + # b x 1 + last_tokens = input_ids.unsqueeze(1) + + next_tokens = [] + + for head_index in range(num_predict_tokens): + + # Project and predict + z = self.emb[head_index](last_tokens) # b k d + states = self.proj[head_index](previous_hidden_states) + + # Weighted add of state_weight*state and emb_weight*z + # Let subsequent LN take care of denominator + # state_weight is close to 1, so shouldn't be any precision issues + states.add_(z, alpha=self.emb_weight / self.state_weight) + + states = self.activation(self.ln[head_index](states)) # b k d + # TODO: not yet supporting top_k_tokens_per_head + previous_hidden_states = states + + logits = self.logits_processor(self.head[head_index], states, + sampling_metadata) + + output = self.sampler(logits.flatten(0, 1), sampling_metadata) + last_tokens = output.sampled_token_ids + next_tokens.append(output) + + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + param = params_dict.get(name.replace("speculator.", "")) + if param is not None: + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py new file mode 100644 index 0000000000000..2af2bedd8e48e --- 
/dev/null +++ b/vllm/model_executor/models/paligemma.py @@ -0,0 +1,344 @@ +from typing import Iterable, List, Literal, Optional, Tuple, TypedDict + +import torch +from PIL import Image +from torch import nn +from transformers import PaliGemmaConfig, SiglipVisionConfig, SiglipVisionModel + +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig, MultiModalConfig +from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import ColumnParallelLinear +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.gemma import GemmaModel +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.image import cached_get_tokenizer +from vllm.sequence import SamplerOutput, SequenceData + +from .interfaces import SupportsVision +from .utils import merge_vision_embeddings + +logger = init_logger(__name__) + +_KEYS_TO_MODIFY_MAPPING = { + "language_model.model": "language_model", +} + + +def get_max_paligemma_image_tokens(ctx: InputContext): + hf_config = ctx.get_hf_config(PaliGemmaConfig) + text_config = hf_config.text_config + + return text_config.num_image_tokens + + +def dummy_seq_data_for_paligemma( + hf_config: PaliGemmaConfig, + seq_len: int, + *, + image_token_id: int, + image_feature_size_override: Optional[int] = None, +): + if image_feature_size_override is None: + image_feature_size = hf_config.text_config.num_image_tokens + else: + image_feature_size = image_feature_size_override + + token_ids = [image_token_id] * image_feature_size + token_ids += [0] * (seq_len - image_feature_size) + return SequenceData(token_ids) + + +def dummy_image_for_paligemma( + hf_config: SiglipVisionConfig, + *, + image_width_override: Optional[int] = None, + image_height_override: Optional[int] = None, +): + width = height = hf_config.image_size + if image_width_override is not None: + width = image_width_override + if image_height_override is not None: + height = image_height_override + + image = Image.new("RGB", (width, height), color=0) + return {"image": image} + + +def dummy_data_for_paligemma(ctx: InputContext, seq_len: int): + hf_config = ctx.get_hf_config(PaliGemmaConfig) + vision_config = hf_config.vision_config + + seq_data = dummy_seq_data_for_paligemma( + hf_config, + seq_len, + image_token_id=hf_config.image_token_index, + ) + + mm_data = dummy_image_for_paligemma(vision_config) + return seq_data, mm_data + + +def input_processor_for_paligemma(ctx: InputContext, llm_inputs: LLMInputs): + + """ + The correct prompt format needs to be: + '' * image_feature_size + '' + prompt + '\n' + + See https://github.com/huggingface/transformers/blob/25245ec26dc29bcf6102e1b4ddd0dfd02e720cf5/src/transformers/models/paligemma/processing_paligemma.py#L55 + """ # noqa + + multi_modal_data = llm_inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return llm_inputs + + model_config = ctx.model_config + hf_config = ctx.get_hf_config(PaliGemmaConfig) + + tokenizer = cached_get_tokenizer(model_config.tokenizer) + image_feature_size = hf_config.text_config.num_image_tokens + image_token_str = 
tokenizer.decode(hf_config.image_token_index) + bos_token = tokenizer.decode(hf_config.bos_token_id) + image_token_str_pad = image_token_str * image_feature_size + image_token_ids_pad = [hf_config.image_token_index] * image_feature_size + + orig_prompt = llm_inputs.get("prompt") + orig_prompt_ids = llm_inputs.get("prompt_token_ids") + + if image_token_str in orig_prompt: + logger.warning( + "The image token '%s' was detected in the prompt and " + "will be removed. Please follow the proper prompt format" + " documented on HuggingFace.", image_token_str) + orig_prompt = orig_prompt.replace(image_token_str, "") + orig_prompt_ids.remove(hf_config.image_token_index) + + new_prompt = f"{image_token_str_pad}{bos_token}{orig_prompt}\n" + new_token_ids = image_token_ids_pad + orig_prompt_ids + [108] #newline + + # NOTE: Create a defensive copy of the original inputs + return LLMInputs(prompt_token_ids=new_token_ids, + prompt=new_prompt, + multi_modal_data=multi_modal_data) + + +class PaliGemmaMultiModalProjector(nn.Module): + + def __init__(self, vision_hidden_size: int, projection_dim: int): + super().__init__() + + self.linear = ColumnParallelLinear(vision_hidden_size, + projection_dim, + bias=True) + + def forward(self, image_features: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.linear(image_features) + return hidden_states + + +class PaliGemmaImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: torch.Tensor + """Shape: (batch_size, num_channels, height, width)""" + + +PaliGemmaImageInputs = PaliGemmaImagePixelInputs + + +@MULTIMODAL_REGISTRY.register_image_input_mapper() +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_paligemma_image_tokens) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_paligemma) +@INPUT_REGISTRY.register_input_processor(input_processor_for_paligemma) +class PaliGemmaForConditionalGeneration(nn.Module, SupportsVision): + + def __init__(self, + config: PaliGemmaConfig, + multimodal_config: MultiModalConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None) -> None: + super().__init__() + + self.config = config + self.multimodal_config = multimodal_config + + # TODO(ywang96): Port over SiglipVisionModel & TP + self.vision_tower = SiglipVisionModel(config.vision_config) + self.multi_modal_projector = PaliGemmaMultiModalProjector( + vision_hidden_size=config.vision_config.hidden_size, + projection_dim=config.vision_config.projection_dim) + + self.quant_config = quant_config + self.language_model = GemmaModel(config.text_config, cache_config, + quant_config) + self.unpadded_vocab_size = config.text_config.vocab_size + logit_scale = getattr(config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size, logit_scale) + self.sampler = Sampler() + + def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: + h = w = self.config.vision_config.image_size + expected_dims = (3, h, w) + actual_dims = tuple(data.shape[1:]) + + if actual_dims != expected_dims: + expected_expr = ("batch_size", *map(str, expected_dims)) + raise ValueError( + f"The expected shape of pixel values is {expected_expr}. 
" + f"You supplied {tuple(data.shape)}.") + + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[PaliGemmaImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + + if pixel_values is None: + return None + + if not isinstance(pixel_values, torch.Tensor): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") + + return PaliGemmaImagePixelInputs( + type="pixel_values", + data=self._validate_pixel_values(pixel_values), + ) + + def _image_pixels_to_features(self, vision_tower: SiglipVisionModel, + pixel_values: torch.Tensor) -> torch.Tensor: + + image_outputs = vision_tower(pixel_values, output_hidden_states=True) + + selected_image_features = image_outputs.last_hidden_state + + return selected_image_features + + def _process_image_pixels( + self, inputs: PaliGemmaImagePixelInputs) -> torch.Tensor: + assert self.vision_tower is not None + + pixel_values = inputs["data"] + + return self._image_pixels_to_features(self.vision_tower, pixel_values) + + def _process_image_input( + self, image_input: PaliGemmaImageInputs) -> torch.Tensor: + + assert self.vision_tower is not None + image_features = self._process_image_pixels(image_input) + + return self.multi_modal_projector(image_features) + + def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + **kwargs: object) -> SamplerOutput: + + parsed_image_input = self._parse_and_validate_image_input(**kwargs) + + if parsed_image_input is not None: + vision_embeddings = self._process_image_input(parsed_image_input) + # https://github.com/huggingface/transformers/blob/main/src/transformers/models/paligemma/modeling_paligemma.py#L294 # noqa + vision_embeddings = vision_embeddings * (self.config.hidden_size** + -0.5) + + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + + inputs_embeds = merge_vision_embeddings( + input_ids, inputs_embeds, vision_embeddings, + self.config.image_token_index) + + input_ids = None + else: + inputs_embeds = None + + hidden_states = self.language_model(input_ids, + positions, + kv_caches, + attn_metadata, + inputs_embeds=inputs_embeds) + + return hidden_states + + # Copied from vllm/model_executor/models/gemma.py + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.language_model.embed_tokens, + hidden_states, sampling_metadata) + return logits + + # Copied from vllm/model_executor/models/gemma.py + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + # Adapted from vllm/model_executor/models/gemma.py + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + loaded_params = set() + for name, loaded_weight in weights: + for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): + if key_to_modify in name: + name = name.replace(key_to_modify, new_key) + use_default_weight_loading = False + if "vision" in name: + if self.vision_tower is not None: + # We only do sharding for language model and + # not vision model for 
now. + use_default_weight_loading = True + else: + for (param_name, shard_name, + shard_id) in stacked_params_mapping: + if shard_name not in name: + continue + name = name.replace(shard_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # lm_head is not used in vllm as it is tied with + # embed_token. To prevent errors, skip loading + # lm_head.weight. + if "lm_head.weight" in name: + continue + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + use_default_weight_loading = True + + if use_default_weight_loading: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + + loaded_params.add(name) + + unloaded_params = params_dict.keys() - loaded_params + if unloaded_params: + raise RuntimeError( + "Some weights are not initialized from checkpoints: " + f"{unloaded_params}") diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py new file mode 100644 index 0000000000000..ef2562b073e6f --- /dev/null +++ b/vllm/model_executor/models/utils.py @@ -0,0 +1,41 @@ +import torch + +from vllm.multimodal import BatchedTensors + + +def merge_vision_embeddings(input_ids: torch.Tensor, + inputs_embeds: torch.Tensor, + vision_embeddings: BatchedTensors, + image_token_id: int) -> torch.Tensor: + """ + Merge `vision_embeddings` into `inputs_embeds` by overwriting the positions + in `inputs_embeds` corresponding to placeholder image tokens in `input_ids`. + + Note: + This updates `inputs_embeds` in place. 
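+    For example, a `vision_embeddings` tensor of shape
+    (num_images, tokens_per_image, embed_dim) must match exactly
+    num_images * tokens_per_image placeholder positions in `input_ids`;
+    otherwise a ValueError is raised.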
+ """ + mask = (input_ids == image_token_id) + num_expected_tokens = mask.sum() + + if isinstance(vision_embeddings, torch.Tensor): + batch_size, batch_tokens, *_, embed_dim = vision_embeddings.shape + total_tokens = batch_size * batch_tokens + if num_expected_tokens != total_tokens: + expr = f"{batch_size} x {batch_tokens}" + raise ValueError( + f"Attempted to assign {expr} = {total_tokens} " + f"image tokens to {num_expected_tokens} placeholders") + + inputs_embeds[mask] = vision_embeddings.view(total_tokens, embed_dim) + else: + size_per_batch = [t.shape[0] for t in vision_embeddings] + total_tokens = sum(size_per_batch) + if num_expected_tokens != total_tokens: + expr = ' + '.join(map(str, size_per_batch)) + raise ValueError( + f"Attempted to assign {expr} = {total_tokens} " + f"image tokens to {num_expected_tokens} placeholders") + + inputs_embeds[mask] = torch.cat(vision_embeddings) + + return inputs_embeds diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py new file mode 100644 index 0000000000000..7309f7bf795d6 --- /dev/null +++ b/vllm/platforms/__init__.py @@ -0,0 +1,18 @@ +from typing import Optional + +import torch + +from .interface import Platform, PlatformEnum + +current_platform: Optional[Platform] + +if torch.version.cuda is not None: + from .cuda import CudaPlatform + current_platform = CudaPlatform() +elif torch.version.hip is not None: + from .rocm import RocmPlatform + current_platform = RocmPlatform() +else: + current_platform = None + +__all__ = ['Platform', 'PlatformEnum', 'current_platform'] diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py new file mode 100644 index 0000000000000..b2ca758131e92 --- /dev/null +++ b/vllm/platforms/cuda.py @@ -0,0 +1,34 @@ +"""Code inside this file can safely assume cuda platform, e.g. importing +pynvml. However, it should not initialize cuda context. 
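+Device queries therefore go through pynvml via the `with_nvml_context`
+decorator below, which initializes NVML before each wrapped call and shuts it
+down afterwards.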
+""" + +from functools import lru_cache, wraps +from typing import Tuple + +import pynvml + +from .interface import Platform, PlatformEnum + + +def with_nvml_context(fn): + + @wraps(fn) + def wrapper(*args, **kwargs): + pynvml.nvmlInit() + try: + return fn(*args, **kwargs) + finally: + pynvml.nvmlShutdown() + + return wrapper + + +class CudaPlatform(Platform): + _enum = PlatformEnum.CUDA + + @staticmethod + @lru_cache(maxsize=8) + @with_nvml_context + def get_device_capability(device_id: int = 0) -> Tuple[int, int]: + handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) + return pynvml.nvmlDeviceGetCudaComputeCapability(handle) diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py new file mode 100644 index 0000000000000..2ac092c258d15 --- /dev/null +++ b/vllm/platforms/interface.py @@ -0,0 +1,21 @@ +import enum +from typing import Tuple + + +class PlatformEnum(enum.Enum): + CUDA = enum.auto() + ROCM = enum.auto() + + +class Platform: + _enum: PlatformEnum + + def is_cuda(self) -> bool: + return self._enum == PlatformEnum.CUDA + + def is_rocm(self) -> bool: + return self._enum == PlatformEnum.ROCM + + @staticmethod + def get_device_capability(device_id: int = 0) -> Tuple[int, int]: + raise NotImplementedError diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py new file mode 100644 index 0000000000000..36b3ba8f7d1bb --- /dev/null +++ b/vllm/platforms/rocm.py @@ -0,0 +1,15 @@ +from functools import lru_cache +from typing import Tuple + +import torch + +from .interface import Platform, PlatformEnum + + +class RocmPlatform(Platform): + _enum = PlatformEnum.ROCM + + @staticmethod + @lru_cache(maxsize=8) + def get_device_capability(device_id: int = 0) -> Tuple[int, int]: + return torch.cuda.get_device_capability(device_id) diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py new file mode 100644 index 0000000000000..6a2cfc819d8d2 --- /dev/null +++ b/vllm/spec_decode/draft_model_runner.py @@ -0,0 +1,179 @@ +from typing import List, Optional + +import torch + +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, + ModelConfig, MultiModalConfig, ParallelConfig, + SchedulerConfig) +from vllm.logger import init_logger +from vllm.sequence import (IntermediateTensors, SamplerOutput, + SequenceGroupMetadata) +from vllm.worker.model_runner import (ModelInputForGPUWithSamplingMetadata, + ModelRunner) + +logger = init_logger(__name__) + + +class TP1DraftModelRunner(ModelRunner): + """Specialized model runner for speculative decoding draft model. + Since the draft model always execute k forward passes consecutively to + generate k speculative tokens in a single speculative decoding step, + we could get rid of most CPU-GPU synchronization and data transfer + overheads by keeping model input and output tensors on GPU all the time. + + This runner is still under development so there's no performance gain + at this moment. Currently we adopt a temporary solution that caches the + seq_group_metadata_list for multi-step execution, so that we can + leverage existing prepare_model_input to be compatible with the current + execution flow, but we plan to remove this cache and avoid calling + prepare_model_input in execute_model at all. + + The detail development plan includes: + 1. Use "update_model_input" to update existing model_input without + creating a new one. + 2. Improve the performance of "update_model_input" with a GPU kernel. + 3. 
Support TP > 1 (this requires some designs because we do not expect + any broadcasting inside execute_model). + """ + + def __init__( + self, + model_config: ModelConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + cache_config: CacheConfig, + load_config: LoadConfig, + lora_config: Optional[LoRAConfig], + kv_cache_dtype: Optional[str] = "auto", + is_driver_worker: bool = False, + multimodal_config: Optional[MultiModalConfig] = None, + return_hidden_states: bool = False, + ): + if return_hidden_states: + raise ValueError( + "return_hidden_states is not supported for TP1DraftModelRunner." + ) + + super().__init__( + model_config=model_config, + parallel_config=parallel_config, + scheduler_config=scheduler_config, + device_config=device_config, + cache_config=cache_config, + load_config=load_config, + lora_config=lora_config, + kv_cache_dtype=kv_cache_dtype, + is_driver_worker=is_driver_worker, + multimodal_config=multimodal_config, + return_hidden_states=return_hidden_states, + ) + + # TODO: Remove this cache when we are able to update model_input + # directly in advance_step. + self.cached_seq_group_metadata_list: Optional[ + List[SequenceGroupMetadata]] = None + + def prepare_model_input( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + virtual_engine: int = 0, + finished_requests_ids: Optional[List[str]] = None + ) -> ModelInputForGPUWithSamplingMetadata: + """A temporary solution that caches the seq_group_metadata_list + for multi-step execution. + TODO: In-place update model_input and remove this function. + """ + self.cached_seq_group_metadata_list = seq_group_metadata_list + return super().prepare_model_input( + seq_group_metadata_list, + finished_requests_ids=finished_requests_ids) + + def update_model_input( + self, model_input: ModelInputForGPUWithSamplingMetadata, + last_output: SamplerOutput + ) -> ModelInputForGPUWithSamplingMetadata: + """Prepare the model inputs for the next step. + TODO: In-place update model_input instead of calling + prepare_model_input. + """ + + # Append the output token to the sequence data. + assert self.cached_seq_group_metadata_list is not None + for seq_group_metadata, sequence_group_outputs in zip( + self.cached_seq_group_metadata_list, last_output.outputs): + seq_group_metadata.is_prompt = False + + for seq_output in sequence_group_outputs.samples: + seq = seq_group_metadata.seq_data[seq_output.parent_seq_id] + + token_id = seq_output.output_token + token_logprob = seq_output.logprobs[token_id] + + seq.append_token_id(token_id, token_logprob.logprob) + seq.update_num_computed_tokens(1) + + return self.prepare_model_input(self.cached_seq_group_metadata_list) + + @torch.inference_mode() + def execute_model( + self, + model_input: ModelInputForGPUWithSamplingMetadata, + kv_caches: List[torch.Tensor], + intermediate_tensors: Optional[IntermediateTensors] = None, + num_steps: int = 1, + ) -> Optional[List[SamplerOutput]]: + # Since we do not broadcast data inside execute_model anymore, + # we need to figure out the best way to support TP > 1 in this + # case, because we will at least need to broadcast the sampled + # tokens to all workers. 
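        # A condensed sketch of the multi-step flow implemented below, using
        # only names defined in this class (the eventual goal, per the class
        # docstring, is to keep these tensors on the GPU across steps):
        #
        #   outputs = []
        #   for _ in range(num_steps):
        #       hidden = model(input_ids, positions, kv_caches, attn_metadata)
        #       logits = model.compute_logits(hidden, sampling_metadata)
        #       outputs.append(model.sample(logits, sampling_metadata))
        #       model_input = self.update_model_input(model_input, outputs[-1])
        #   return outputs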
+ if not self.is_driver_worker: + raise ValueError("TP1DraftModelRunner only supports TP=1.") + + if self.lora_config: + assert model_input.lora_requests is not None + assert model_input.lora_mapping is not None + self.set_active_loras(model_input.lora_requests, + model_input.lora_mapping) + + virtual_engine = model_input.virtual_engine + outputs: List[SamplerOutput] = [] + for step in range(num_steps): + # Currently cuda graph is only supported by the decode phase. + assert model_input.attn_metadata is not None + prefill_meta = model_input.attn_metadata.prefill_metadata + decode_meta = model_input.attn_metadata.decode_metadata + if prefill_meta is None and decode_meta.use_cuda_graph: + assert model_input.input_tokens is not None + graph_batch_size = model_input.input_tokens.shape[0] + model_executable = ( + self.graph_runners[virtual_engine][graph_batch_size]) + else: + model_executable = self.model + + multi_modal_kwargs = model_input.multi_modal_kwargs or {} + hidden_states = model_executable( + input_ids=model_input.input_tokens, + positions=model_input.input_positions, + kv_caches=kv_caches, + attn_metadata=model_input.attn_metadata, + intermediate_tensors=intermediate_tensors, + **multi_modal_kwargs, + ) + + # Compute the logits. + logits = self.model.compute_logits(hidden_states, + model_input.sampling_metadata) + + # Sample the next token. + outputs.append( + self.model.sample( + logits=logits, + sampling_metadata=model_input.sampling_metadata, + )) + + # Prepare the inputs for the next step. + if step != num_steps - 1: + model_input = self.update_model_input(model_input, outputs[-1]) + + return outputs diff --git a/vllm/spec_decode/mlp_speculator_worker.py b/vllm/spec_decode/mlp_speculator_worker.py new file mode 100644 index 0000000000000..6c1c8da57d188 --- /dev/null +++ b/vllm/spec_decode/mlp_speculator_worker.py @@ -0,0 +1,86 @@ +from typing import List, Optional, Tuple + +import torch + +from vllm.model_executor import SamplingMetadata +from vllm.sequence import (ExecuteModelRequest, SamplerOutput, + SequenceGroupMetadata) +from vllm.spec_decode.multi_step_worker import MultiStepWorker +from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase + + +class MLPSpeculatorWorker(NonLLMProposerWorkerBase, MultiStepWorker): + """Worker for MLPSpeculator models. + + Not currently compatible with LoRA or chunked prefill. + """ + + @torch.inference_mode() + def sampler_output( + self, + execute_model_req: ExecuteModelRequest, + sample_len: int, + ) -> Tuple[List[SamplerOutput], bool]: + """Run the model forward pass to generate sample_len future tokens. + Returns the list of sampler output, one per layer, along with indicator + of whether torch tensor in sampler output need to be transposed in + latter sampler_output_to_torch logic. + + For mlp spec worker, this indicator shall be True. + """ + self._raise_if_unsupported(execute_model_req) + + seq_group_metadata_list = execute_model_req.seq_group_metadata_list + + (input_tokens, seq_lens, + query_lens) = self._prepare_input_tensors(seq_group_metadata_list) + + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, seq_lens, query_lens, self.device, + self.model_runner.pin_memory) + + model_outputs = self.model_runner.model.generate_proposals( + input_ids=input_tokens, + previous_hidden_states=execute_model_req.previous_hidden_states. 
+ hidden_states, + num_predict_tokens=sample_len, + sampling_metadata=sampling_metadata) + + assert len(model_outputs) == sample_len + + return model_outputs, True + + def _prepare_input_tensors( + self, + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + ) -> Tuple[torch.Tensor, List[int], List[int]]: + if not seq_group_metadata_list: + return torch.empty(0, device=self.device), [], [] + + input_tokens: List[int] = [] + seq_lens: List[int] = [] + query_lens: List[int] = [] + + for seq_group_metadata in seq_group_metadata_list: + is_prompt = seq_group_metadata.is_prompt + + for seq_data in seq_group_metadata.seq_data.values(): + seq_data_len = seq_data.get_len() + if is_prompt: + context_len = seq_data.get_num_computed_tokens() + seq_len = min( + seq_data_len, + context_len + seq_group_metadata.token_chunk_size) + tokens = seq_data.get_token_ids()[context_len:seq_len] + seq_lens.append(seq_len) + input_tokens.extend(tokens) + query_lens.append(seq_len - context_len) + else: + seq_lens.append(seq_data_len) + input_tokens.append(seq_data.get_last_token_id()) + query_lens.append(1) + + input_tokens_tensor = torch.tensor(input_tokens, + dtype=torch.long, + device=self.device) + return input_tokens_tensor, seq_lens, query_lens diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py new file mode 100644 index 0000000000000..b78e4489513f7 --- /dev/null +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -0,0 +1,149 @@ +from typing import List, Optional, Tuple + +import torch + +from vllm.distributed.parallel_state import (get_tp_group, + init_model_parallel_group, + patch_tensor_parallel_group) +from vllm.logger import init_logger +from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.spec_decode.interfaces import SpeculativeProposals +from vllm.spec_decode.multi_step_worker import MultiStepWorker +from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase + +logger = init_logger(__name__) + + +class SmallerTpProposerWorker(ProposerWorkerBase): + """Class which allows a speculative draft model to run with smaller tensor + parallel degree than target model. + This reduces the communication overhead of small draft models. + + To implement this feature, this class differs behavior based on is_dummy + flag, where dummy means worker that does not participate draft generation. + Participating workers use a smaller tp group by patching vLLM's tensor + parallel group temporarily during forward passes of draft models. + """ + + @classmethod + def maybe_wrap_worker(cls, worker, draft_tensor_parallel_size: int, + target_tensor_parallel_size: int): + """Wrap the worker in a SmallerTpProposerWorker if necessary. + """ + if draft_tensor_parallel_size == target_tensor_parallel_size: + return worker + + # gpu ranks that will generate draft tokens together + draft_ranks = list(range(draft_tensor_parallel_size)) + + logger.info("Wrapping {%s} in {%s}", type(worker), cls) + return cls(worker, draft_ranks) + + def __init__(self, worker: MultiStepWorker, draft_ranks: List[int]): + """Create a SmallerTpProposerWorker. 
+ + Args: + worker (MultiStepWorker): an actual worker wrapped with this class + draft_ranks (List[int]): if this value is given, only the GPU ranks + written in this value participate in draft generation + """ + self._worker = worker + self._draft_ranks = draft_ranks + + # init during init_device + self._is_dummy = False + self._tp_group = None + + def _patch_tensor_parallel_group(self): + """Temporarily patch the global tp group state with its own tp group + state. + """ + return patch_tensor_parallel_group(self._tp_group) + + def init_device(self) -> None: + self._is_dummy = get_tp_group().rank not in self._draft_ranks + + # dummy workers do nothing + if self._is_dummy: + return + + # creates tp process group containing only a subset of gpu ranks + local_rank = get_tp_group().local_rank + tp_backend = torch.distributed.get_backend(get_tp_group().device_group) + self._tp_group = init_model_parallel_group([self._draft_ranks], + local_rank, tp_backend) + + with self._patch_tensor_parallel_group(): + self._worker.init_device() + + def set_include_gpu_probs_tensor(self) -> None: + if self._is_dummy: + return + + # Need include_gpu_probs_tensor for multi_step_worker + self._worker.set_include_gpu_probs_tensor() + + def load_model(self) -> None: + if self._is_dummy: + return + + with self._patch_tensor_parallel_group(): + self._worker.load_model() + + def determine_num_available_blocks(self) -> Tuple[int, int]: + if self._is_dummy: + # this case is not used now + return -1, -1 + + with self._patch_tensor_parallel_group(): + return self._worker.determine_num_available_blocks() + + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + if self._is_dummy: + return + + with self._patch_tensor_parallel_group(): + self._worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + + def sampler_output( + self, + execute_model_req: ExecuteModelRequest, + sample_len: int, + ) -> Tuple[List[SamplerOutput], bool]: + # Do not check _is_dummy, as it's always called by get_spec_proposals + return self._worker.sampler_output(execute_model_req, sample_len) + + def get_spec_proposals( + self, + execute_model_req: ExecuteModelRequest, + ) -> SpeculativeProposals: + """Produce speculations given an input batch of sequences. The number of + speculative tokens per sequence is determined by max_proposal_len. 
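
        For example (an illustrative sketch only; the sizes are chosen
        arbitrarily), with a target model at tensor_parallel_size=4 and a
        draft model at tensor_parallel_size=1:

            proposer = SmallerTpProposerWorker.maybe_wrap_worker(
                multi_step_worker,            # some existing MultiStepWorker
                draft_tensor_parallel_size=1,
                target_tensor_parallel_size=4)

        Ranks 1-3 then become dummy workers and only rank 0 runs the draft
        model; dummy ranks return the placeholder
        SpeculativeProposals(None, None, None) shown just below.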
+ """ + if self._is_dummy: + return SpeculativeProposals(None, None, None) + + with self._patch_tensor_parallel_group(): + return self._worker.get_spec_proposals(execute_model_req) + + def execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + if self._is_dummy: + return [] + + with self._patch_tensor_parallel_group(): + return self._worker.execute_model(execute_model_req) + + def get_cache_block_size_bytes(self) -> int: + if self._is_dummy: + # by returning zero, target worker can use the entire kv cache space + return 0 + + return self._worker.get_cache_block_size_bytes() + + @property + def vocab_size(self) -> int: + return self._worker.vocab_size diff --git a/vllm/transformers_utils/configs/mlp_speculator.py b/vllm/transformers_utils/configs/mlp_speculator.py new file mode 100644 index 0000000000000..946af4e919f7c --- /dev/null +++ b/vllm/transformers_utils/configs/mlp_speculator.py @@ -0,0 +1,65 @@ +from typing import List, Optional + +from transformers import PretrainedConfig + + +class MLPSpeculatorConfig(PretrainedConfig): + model_type = "mlp_speculator" + + attribute_map = { + "hidden_size": "emb_dim", + } + + def __init__(self, + vocab_size: int = 32000, + emb_dim: int = 4096, + inner_dim: int = 0, + n_predict: int = 3, + top_k_tokens_per_head: Optional[List[int]] = None, + n_candidates: int = 5, + tie_weights: bool = False, + scale_input: bool = False, + **kwargs): + """ + Initialize an MLPSpeculatorConfig + + Args: + vocab_size: int + the model vocab size + emb_dim: int + the model embedding dimension + inner_dim: int + the inner dimension of the model. If 0, will be the emb_dim. + n_predict: int + the number of lookaheads for the speculator + top_k_tokens_per_head: List[int] + Number of tokens to consider from each head when forming the + candidate tree. + For each candidate branch in the tree, head n produces topk[n] + additional sub-branches. + NOTE: This parameter is currently unused. + n_candidates: int + number of child candidates to create per sequence + tie_weights: bool + If true, use a single set of weights for every model + head/stage after the first. The initial projection + from the base model may have a different size, so that + stays separate. + scale_input: bool + if True, will scale the initial hidden states from + the base model. 
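
        Example (an illustrative sketch; the values are hypothetical and not
        taken from any released speculator checkpoint):

            config = MLPSpeculatorConfig(
                vocab_size=32000,
                emb_dim=4096,
                inner_dim=0,      # treated as emb_dim by the model
                n_predict=3,      # also exposed as num_lookahead_tokens
                n_candidates=5,
            )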
+ """ + if top_k_tokens_per_head is None: + top_k_tokens_per_head = [5, 4, 3] + assert len(top_k_tokens_per_head) == n_predict + self.vocab_size = vocab_size + self.emb_dim = emb_dim + self.inner_dim = inner_dim + self.n_predict = n_predict + self.top_k_tokens_per_head = top_k_tokens_per_head + self.n_candidates = n_candidates + self.num_lookahead_tokens = n_predict + self.tie_weights = tie_weights + self.scale_input = scale_input + + super().__init__(**kwargs) diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py new file mode 100644 index 0000000000000..bc0960fa16221 --- /dev/null +++ b/vllm/worker/model_runner_base.py @@ -0,0 +1,162 @@ +import dataclasses +from abc import ABC, abstractmethod +from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Type, + TypeVar) + +import torch + +from vllm.sequence import (IntermediateTensors, SamplerOutput, + SequenceGroupMetadata) + +if TYPE_CHECKING: + from vllm.attention import AttentionMetadata + from vllm.attention.backends.abstract import AttentionBackend + from vllm.model_executor import SamplingMetadata + +T = TypeVar('T', bound="ModelRunnerInputBase") + + +def _add_attn_metadata_broadcastable_dict( + tensor_dict: Dict[str, Any], + attn_metadata: Optional["AttentionMetadata"]) -> None: + """ + Helper method to update tensor_dict with broadcastable + AttentionMetadata fields. + """ + if attn_metadata is not None: + tensor_dict.update(attn_metadata.asdict_zerocopy()) + + +def _init_attn_metadata_from_tensor_dict( + attn_backend: "AttentionBackend", + tensor_dict: Dict[str, Any], +) -> Dict[str, Any]: + """ + Helper method to initialize AttentionMetadata based on an + AttentionBackend and broadcastable AttentionMetadata fields. + """ + # Extract the fields used to create AttentionMetadata. + valid_attn_kwargs = {} + for field in dataclasses.fields(attn_backend.get_metadata_cls()): + val = tensor_dict.pop(field.name, None) + if val is not None: + valid_attn_kwargs[field.name] = val + + attn_metadata = attn_backend.make_metadata(**valid_attn_kwargs) + tensor_dict["attn_metadata"] = attn_metadata + return tensor_dict + + +def _init_sampling_metadata_from_tensor_dict( # type: ignore + tensor_dict: Dict[str, Any]) -> Dict[str, Any]: + """ + Helper method to initialize SamplingMetadata based on broadcastable + SamplingMetadata fields. + """ + from vllm.model_executor import SamplingMetadata + + selected_token_indices = tensor_dict.pop("selected_token_indices", None) + # An empty SamplingMetadata to signal that the worker should skip + # sampling. + if selected_token_indices is not None: + tensor_dict["sampling_metadata"] = SamplingMetadata( + seq_groups=None, + selected_token_indices=selected_token_indices, + categorized_sample_indices=None, + num_prompts=0, + ) + return tensor_dict + + +def _add_sampling_metadata_broadcastable_dict( + tensor_dict: Dict[str, Any], + sampling_metadata: Optional["SamplingMetadata"]) -> None: + """ + Helper method to update tensor_dict with broadcastable + SamplingMetadata fields. + """ + if sampling_metadata is not None: + tensor_dict["selected_token_indices"] = ( + sampling_metadata.selected_token_indices) + + +@dataclasses.dataclass(frozen=True) +class ModelRunnerInputBase(ABC): + """Local inputs to each worker's model runner. May contain + device-specific data. Different worker backends may have different methods + of converting from the global ExecuteModelRequest produced by the LLM + engine to the worker-local ModelRunnerInputBase objects. 
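
    For instance, a GPU-backed runner might carry an input object along the
    lines of this minimal sketch (hypothetical fields only, not the real
    GPU model input class):

        @dataclasses.dataclass(frozen=True)
        class MyModelInput(ModelRunnerInputBase):
            input_tokens: Optional[torch.Tensor] = None
            input_positions: Optional[torch.Tensor] = None

            def as_broadcastable_tensor_dict(self):
                # Only the tensors every worker needs are broadcast.
                return {
                    "input_tokens": self.input_tokens,
                    "input_positions": self.input_positions,
                }

            @classmethod
            def from_broadcasted_tensor_dict(cls, tensor_dict,
                                             attn_backend=None):
                return cls(**tensor_dict)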
+ + Model runners that support multi-GPU execution should define a + ModelRunnerInputBase subclass, add their required fields, and specify how to + serialize/deserialize a ModelInput for broadcast between workers. + """ + + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + """ + Extract broadcastable fields. Override for fields that require some + custom deserialization. + """ + raise NotImplementedError + + @classmethod + @abstractmethod + def from_broadcasted_tensor_dict( + cls: Type[T], + tensor_dict: Dict[str, Any], + attn_backend: Optional["AttentionBackend"] = None, + ) -> T: + """ + Pop fields from the given tensor_dict and populate a new instance of + ModelRunnerInputBase. + """ + raise NotImplementedError + + +class ModelRunnerBase(ABC, Generic[T]): + """ + Model runner interface that abstracts a particular hardware and/or type of + model. Model execution may communicate data with model runners in other + processes, but it should not include control plane metadata communication. + + Each ModelRunnerBase subclass should define a corresponding + ModelRunnerInputBase subclass. + """ + + @abstractmethod + def make_model_input_from_broadcasted_tensor_dict( + self, + tensor_dict: Dict[str, Any], + ) -> T: + """ + Make an instance of a ModelRunnerInputBase from the broadcasted tensor + dict. + """ + raise NotImplementedError + + @abstractmethod + def prepare_model_input( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + virtual_engine: int = 0, + finished_requests_ids: Optional[List[str]] = None, + ) -> T: + """ + Prepare the inputs to ModelRunnerBase.execute_model from an execution + request. This method may move data to the worker's local device. It is + not allowed to communicate with other workers or devices. + """ + raise NotImplementedError + + @torch.inference_mode() + def execute_model( + self, + model_input: T, + kv_caches: Optional[List[torch.Tensor]], + intermediate_tensors: Optional[IntermediateTensors], + num_steps: int = 1, + ) -> Optional[List[SamplerOutput]]: + """ + Execute the model on the given input. 
+ """ + raise NotImplementedError diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py new file mode 100644 index 0000000000000..6281cec09825f --- /dev/null +++ b/vllm/worker/openvino_model_runner.py @@ -0,0 +1,344 @@ +from typing import List, Mapping, NamedTuple, Optional, Tuple + +import openvino as ov +import torch +from torch import nn + +from vllm.attention import get_attn_backend +from vllm.attention.backends.openvino import OpenVINOAttentionMetadata +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, + ModelConfig, MultiModalConfig, ParallelConfig, + SchedulerConfig) +from vllm.logger import init_logger +from vllm.model_executor import SamplingMetadata +from vllm.model_executor.model_loader.openvino import get_model +from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors, + MultiModalInputs) +from vllm.sequence import SamplerOutput, SequenceGroupMetadata + +logger = init_logger(__name__) + + +class ModelInput(NamedTuple): + input_tokens: torch.Tensor + input_positions: torch.Tensor + attn_metadata: Optional[OpenVINOAttentionMetadata] + seq_lens: List[int] + query_lens: List[int] + multi_modal_kwargs: Mapping[str, BatchedTensors] + + @classmethod + def empty(cls, device): + return ModelInput(input_tokens=torch.empty(0, device=device), + input_positions=torch.empty(0, device=device), + attn_metadata=None, + seq_lens=[], + query_lens=[], + multi_modal_kwargs={}) + + +class OpenVINOModelRunner: + + def __init__( + self, + model_config: ModelConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + cache_config: CacheConfig, + load_config: LoadConfig, + lora_config: Optional[LoRAConfig], + multimodal_config: Optional[MultiModalConfig], + kv_cache_dtype: Optional[str] = "auto", + is_driver_worker: bool = False, + *args, + **kwargs, + ): + self.model_config = model_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + self.cache_config = cache_config + self.lora_config = lora_config + self.multimodal_config = multimodal_config + self.load_config = load_config + self.is_driver_worker = is_driver_worker + + self.device = self.device_config.device + + self.kv_cache_dtype = kv_cache_dtype + self.sliding_window = model_config.get_sliding_window() + self.block_size = cache_config.block_size + + self.attn_backend = get_attn_backend( + self.model_config.get_num_attention_heads(self.parallel_config), + self.model_config.get_head_size(), + self.model_config.get_num_kv_heads(self.parallel_config), + self.model_config.get_sliding_window(), + self.model_config.dtype, + self.kv_cache_dtype, + self.block_size, + ) + + # Multi-modal data support + self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \ + .create_input_mapper(self.model_config) + + # Lazy initialization. + self.model: nn.Module # Set after init_Model + + def load_model(self) -> None: + self.model = get_model( + model_config=self.model_config, + device_config=self.device_config, + kv_cache_dtype=self.kv_cache_dtype, + ) + + def _prepare_model_input( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> ModelInput: + """Prepare the model input based on a given sequence group. + + The API assumes seq_group_metadata_list is sorted by prefill -> decode. + + The result tensors and data structure also batches input in prefill + -> decode order. For example, + + - input_tokens[:num_prefill_tokens] contains prefill tokens. 
+ - input_tokens[num_prefill_tokens:] contains decode tokens. + """ + input_tokens: List[int] = [] + input_positions: List[int] = [] + + seq_lens: List[int] = [] + past_lens: List[int] = [] + query_lens: List[int] = [] + multi_modal_inputs_list: List[MultiModalInputs] = [] + + subsequence_begins: List[int] = [] + block_indices: List[int] = [] + block_indices_begins: List[int] = [] + + # initialize beginning of prefix sums + subsequence_begins.append(0) + block_indices_begins.append(0) + + if len(seq_group_metadata_list) == 0: + return ModelInput.empty(self.device) + + for seq_group_metadata in seq_group_metadata_list: + seq_ids = list(seq_group_metadata.seq_data.keys()) + is_prompt = seq_group_metadata.is_prompt + + for seq_id in seq_ids: + computed_block_nums = seq_group_metadata.computed_block_nums + if (self.scheduler_config is not None + and self.scheduler_config.chunked_prefill_enabled + and not (computed_block_nums is None + or computed_block_nums == [])): + raise RuntimeError( + "chunked prefill cannot be used with prefix caching " + "now.") + + seq_data = seq_group_metadata.seq_data[seq_id] + if is_prompt: + computed_len = seq_data.get_num_computed_tokens() + else: + # get_num_computed_tokens is incorrect for spec decoding. + # So, we should have a special logic here. + # TODO(sang): Fix it. + computed_len = seq_data.get_len() - 1 + + seq_len = min( + seq_data.get_len(), + computed_len + seq_group_metadata.token_chunk_size, + ) + if is_prompt: + tokens = seq_data.get_token_ids()[computed_len:seq_len] + else: + # Optimization. get_token_ids requires the entire copy of + # tokens. + tokens = [seq_data.get_last_token_id()] + + # Prefix cache was hit. + # Prefix is not supported with sliding_window + prefix_cache_hit = (computed_block_nums is not None + and len(computed_block_nums) > 0 + and self.sliding_window is None + and is_prompt) + + mm_data = seq_group_metadata.multi_modal_data + if mm_data: + mm_kwargs = self.multi_modal_input_mapper(mm_data) + multi_modal_inputs_list.append(mm_kwargs) + + block_table = seq_group_metadata.block_tables[seq_id] + # TODO(sang): Combine chunked prefill and prefix caching by + # only allowing multiple of block_size chunk size. + # NOTE: This only works for oooooooxxx style attention. + if prefix_cache_hit: + assert computed_block_nums is not None + computed_len = len(computed_block_nums) * self.block_size + tokens = tokens[computed_len:] + elif (self.scheduler_config.chunked_prefill_enabled + or not is_prompt): + if seq_group_metadata.block_tables is not None: + # chunked prefill or decode + block_table = seq_group_metadata.block_tables[seq_id] + if self.sliding_window is not None: + # chunked prefill doesn't support sliding window. + assert not self.scheduler_config.chunked_prefill_enabled # noqa: E501 + sliding_window_blocks = (self.sliding_window // + self.block_size) + block_table = block_table[-sliding_window_blocks:] + else: + # Only happens when memory profiling runs. + block_table = [] + else: + # prompt phase w/o prefix_caching, chunked_prefill + pass + + block_indices.extend(block_table) + block_indices_begins.append(block_indices_begins[-1] + + len(block_table)) + + # TODO(sang): This is a hack to make sliding window work with + # paged attn. We can remove it if we make paged attn kernel + # to properly handle slinding window attn. 
+ if self.sliding_window is not None and not is_prompt: + seq_len = min(seq_len, self.sliding_window) + computed_len = seq_len - 1 + + seq_lens.append(seq_len) + + query_len = seq_len - computed_len + query_lens.append(query_len) + + input_tokens.extend(tokens) + input_positions.extend(list(range(computed_len, seq_len))) + + past_lens.append(computed_len) + subsequence_begins.append(subsequence_begins[-1] + query_len) + + if is_prompt: + assert len(seq_ids) == 1 + else: + assert ( + query_len == 1 + ), "seq_len: {}, computed_len: {}, query_len: {}".format( + seq_len, computed_len, query_len) + + max_query_len = max(query_lens) + assert max_query_len > 0, "query_lens: {}".format(query_lens) + + input_tokens = torch.tensor(input_tokens, + dtype=torch.long, + device=self.device) # type: ignore + input_positions = torch.tensor(input_positions, + dtype=torch.long, + device=self.device) # type: ignore + + past_lens_tensor = torch.tensor(past_lens, + dtype=torch.int32, + device=self.device) # type: ignore + subsequence_begins_tensor = torch.tensor( + subsequence_begins, dtype=torch.int32, + device=self.device) # type: ignore + block_indices_tensor = torch.tensor(block_indices, + dtype=torch.int32, + device=self.device) # type: ignore + block_indices_begins_tensor = torch.tensor( + block_indices_begins, dtype=torch.int32, + device=self.device) # type: ignore + + max_context_len = max(seq_lens) + max_context_len_tensor = torch.tensor( + max_context_len, dtype=torch.int32, + device=self.device) # type: ignore + + attn_metadata = self.attn_backend.make_openvino_metadata( + past_lens=past_lens_tensor, + subsequence_begins=subsequence_begins_tensor, + block_indices=block_indices_tensor, + block_indices_begins=block_indices_begins_tensor, + max_context_len=max_context_len_tensor, + ) + + multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list, + device=self.device) + + return ModelInput( + input_tokens, + input_positions, + attn_metadata, + seq_lens, + query_lens, + multi_modal_kwargs=multi_modal_kwargs, + ) + + def prepare_input_tensors( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, OpenVINOAttentionMetadata, + SamplingMetadata, Mapping[str, BatchedTensors]]: + # Prepare input tensors. + ( + input_tokens, + input_positions, + attn_metadata, + seq_lens, + query_lens, + multi_modal_kwargs, + ) = self._prepare_model_input(seq_group_metadata_list) + + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + seq_lens, + query_lens, + self.device, + pin_memory=False, + ) + + return ( + input_tokens, + input_positions, + attn_metadata, + sampling_metadata, + multi_modal_kwargs, + ) + + @torch.inference_mode() + def execute_model( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + kv_caches: List[Tuple["ov.Tensor", "ov.Tensor"]], + ) -> Optional[SamplerOutput]: + ( + input_tokens, + input_positions, + attn_metadata, + sampling_metadata, + multi_modal_kwargs, + ) = self.prepare_input_tensors(seq_group_metadata_list) + + model_executable = self.model + execute_model_kwargs = { + "input_ids": input_tokens, + "positions": input_positions, + "kv_caches": kv_caches, + "attn_metadata": attn_metadata, + **(multi_modal_kwargs or {}), + } + + hidden_states = model_executable(**execute_model_kwargs) + + # Compute the logits. + logits = self.model.compute_logits(hidden_states, sampling_metadata) + + # Sample the next token. 
+ output = self.model.sample( + logits=logits, + sampling_metadata=sampling_metadata, + ) + return output diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py new file mode 100644 index 0000000000000..c47f9acc4423d --- /dev/null +++ b/vllm/worker/openvino_worker.py @@ -0,0 +1,354 @@ +"""An OpenVINO worker class.""" +from typing import Any, Dict, List, Optional, Tuple + +import openvino as ov +import torch +import torch.distributed + +from vllm.attention import get_attn_backend +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, + ModelConfig, MultiModalConfig, ParallelConfig, + SchedulerConfig) +from vllm.distributed import (broadcast_tensor_dict, + ensure_model_parallel_initialized, + init_distributed_environment) +from vllm.logger import init_logger +from vllm.model_executor import set_random_seed +from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.worker.openvino_model_runner import OpenVINOModelRunner +from vllm.worker.worker_base import LoraNotSupportedWorkerBase + +logger = init_logger(__name__) + + +class OpenVINOCacheEngine: + """Manages the KV cache for OpenVINO backend. + + This class is responsible for initializing and managing CPU KV + caches. It also provides methods for performing KV cache operations, such + as copying. + """ + + def __init__( + self, + cache_config: CacheConfig, + model_config: ModelConfig, + parallel_config: ParallelConfig, + device_config: DeviceConfig, + ) -> None: + assert device_config.device_type == "openvino" + self.cache_config = cache_config + self.model_config = model_config + self.parallel_config = parallel_config + + self.head_size = model_config.get_head_size() + if device_config.device.type == "cpu" and \ + cache_config.cache_dtype == ov.Type.u8: + # Scale, zero point and quantized data will be stored together. + # The layout for per token per head: + # |scale(f32)|zeropoint(f32)|quantized data(u8,idx_1)|quantized data(u8,idx_2)|...|quantized data(u8,idx_head_size)| # noqa: E501 + # so, we have to extend head_size by 8, which is sizeof(float) + # for scale and sizeof(float) for zeropoint + self.head_size += 8 + self.num_layers = model_config.get_num_layers(parallel_config) + self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) + + self.block_size = cache_config.block_size + # Note: In CacheConfig, num_gpu_blocks actual is num_cpu_blocks + # for OpenVINO backend, because we want to reuse KV cache management + # in the scheduler. + self.num_cpu_blocks = cache_config.num_gpu_blocks + + # Get attention backend. + self.attn_backend = get_attn_backend( + self.model_config.get_num_attention_heads(self.parallel_config), + self.head_size, + self.model_config.get_num_kv_heads(self.parallel_config), + self.model_config.get_sliding_window(), + self.model_config.dtype, + self.cache_config.cache_dtype, + self.block_size, + ) + + # Initialize the cache. 
+ self.kv_cache: List[Tuple[ov.Tensor, + ov.Tensor]] = self._allocate_kv_cache( + self.num_cpu_blocks) + + def _allocate_kv_cache( + self, + num_blocks: int, + ) -> List[Tuple[ov.Tensor, ov.Tensor]]: + """Allocates KV cache.""" + k_block_shape = v_block_shape = self.attn_backend.get_kv_cache_shape( + num_blocks, self.block_size, self.num_kv_heads, self.head_size)[1:] + kv_cache: List[Tuple[ov.Tensor, ov.Tensor]] = [] + for _ in range(self.num_layers): + key_blocks = ov.Tensor(self.cache_config.cache_dtype, + k_block_shape) + value_blocks = ov.Tensor(self.cache_config.cache_dtype, + v_block_shape) + kv_cache.append((key_blocks, value_blocks)) + return kv_cache + + def swap_in(self, src_to_dst: Dict[int, int]) -> None: + raise NotImplementedError( + "Swap is not supported in OpenVINOCacheEngine.") + + def swap_out(self, src_to_dst: Dict[int, int]) -> None: + raise NotImplementedError( + "Swap is not supported in OpenVINOCacheEngine.") + + def copy(self, src_to_dsts: Dict[int, List[int]]) -> None: + self.attn_backend.copy_blocks(self.kv_cache, src_to_dsts) + + @staticmethod + def get_cache_block_size( + block_size: int, + cache_dtype: ov.Type, + model_config: ModelConfig, + parallel_config: ParallelConfig, + ) -> int: + head_size = model_config.get_head_size() + num_kv_heads = model_config.get_num_kv_heads(parallel_config) + num_layers = model_config.get_num_layers(parallel_config) + + if cache_dtype == ov.Type.u8: + # Scale, zero point and quantized data will be stored together. + # The layout for per token per head: + # |scale(f32)|zeropoint(f32)|quantized data(u8,idx_1)|quantized data(u8,idx_2)|...|quantized data(u8,idx_head_size)| # noqa: E501 + # so, we have to extend head_size by 8, which is sizeof(float) + # for scale and sizeof(float) for zeropoint + head_size += 8 + + key_cache_block = block_size * num_kv_heads * head_size + value_cache_block = key_cache_block + total = num_layers * (key_cache_block + value_cache_block) + dtype_size = cache_dtype.size + return dtype_size * total + + +class OpenVINOWorker(LoraNotSupportedWorkerBase): + """A worker class that executes the model on OpenVINO backend. + + Each worker is associated with a single OpenVINO device. The worker is + responsible for maintaining the KV cache and executing the model on the + OpenVINO backend. + """ + + def __init__( + self, + model_config: ModelConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + cache_config: CacheConfig, + load_config: LoadConfig, + local_rank: int, + rank: int, + distributed_init_method: str, + lora_config: Optional[LoRAConfig] = None, + multimodal_config: Optional[MultiModalConfig] = None, + kv_cache_dtype: Optional[ov.Type] = ov.Type.undefined, + is_driver_worker: bool = False, + ) -> None: + self.model_config = model_config + self.parallel_config = parallel_config + self.parallel_config.rank = rank + self.scheduler_config = scheduler_config + self.device_config = device_config + self.cache_config = cache_config + self.load_config = load_config + self.local_rank = local_rank + self.rank = rank + self.distributed_init_method = distributed_init_method + self.lora_config = lora_config + self.multimodal_config = multimodal_config + self.is_driver_worker = is_driver_worker + if self.is_driver_worker: + assert self.rank == 0, "The driver worker must have rank 0." 
+ + if self.model_config.trust_remote_code: + # note: lazy import to avoid importing torch before initializing + from vllm.utils import init_cached_hf_modules + + init_cached_hf_modules() + self.model_runner = OpenVINOModelRunner( + model_config, + parallel_config, + scheduler_config, + device_config, + cache_config, + load_config=self.load_config, + lora_config=self.lora_config, + multimodal_config=self.multimodal_config, + kv_cache_dtype=kv_cache_dtype, + is_driver_worker=is_driver_worker, + ) + # Uninitialized cache engine. Will be initialized by + # initialize_cache. + self.cache_engine: OpenVINOCacheEngine + self.kv_cache: List[Tuple[ov.Tensor, ov.Tensor]] + + def init_device(self) -> None: + self.init_distributed_environment() + # Set random seed. + set_random_seed(self.model_config.seed) + + def load_model(self): + self.model_runner.load_model() + + def determine_num_available_blocks(self) -> Tuple[int, int]: + """Determine the number of blocks available for the KV cache. + + This determines how many KV blocks can fit into the configured + KV cache space. + + Note that since vLLM assumes a block resides on GPU if it can be + modified, we return num_gpu_blocks=num_cpu_blocks and num_cpu_blocks=0. + This allows us to reuse the scheduler of vLLM without generalizing it + to different devices. + """ + # For OpenVINO backend, the block number will be calculated based on the + # openvino_kvcache_space_bytes. + cache_block_size = self.get_cache_block_size_bytes() + num_cpu_blocks = int(self.cache_config.openvino_kvcache_space_bytes // + cache_block_size) + num_cpu_blocks = max(num_cpu_blocks, 0) + + # Note: To reuse the cache management procedure, + # use cpu cache as 'gpu cache'. + num_gpu_blocks = num_cpu_blocks + num_cpu_blocks = 0 + return num_gpu_blocks, num_cpu_blocks + + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + """Initialize the KV cache. Currently, swappable CPU memory is not + supported. + + Since this worker does not support GPUs, we use the num_gpu_blocks to + determine how many non-swappable CPU blocks to allocate. + """ + assert (num_cpu_blocks == 0 + ), f"{type(self)} does not support swappable cache" + + # Note: To reuse the cache management procedure, + # use cpu cache as 'gpu cache'. + num_cpu_blocks = num_gpu_blocks + + self._validate_num_cpu_blocks(num_cpu_blocks) + self.cache_config.num_gpu_blocks = num_cpu_blocks + self.cache_config.num_cpu_blocks = 0 + + # Initialize the cache. + self._init_cache_engine() + + def _validate_num_cpu_blocks(self, num_cpu_blocks: int) -> None: + """Raise errors if the num_cpu_blocks is invalid.""" + if num_cpu_blocks <= 0: + raise ValueError( + "No available memory for the cache blocks. " + "Try increasing `VLLM_OPENVINO_KVCACHE_SPACE` when " + "initializing the engine.") + + max_seq_len = self.cache_config.block_size * num_cpu_blocks + if self.model_config.max_model_len > max_seq_len: + raise ValueError( + f"The model's max seq len ({self.model_config.max_model_len}) " + "is larger than the maximum number of tokens that can be " + f"stored in KV cache ({max_seq_len}). 
Try increasing " + "`VLLM_OPENVINO_KVCACHE_SPACE` or decreasing `max_model_len` " + "when initializing the engine.") + + def _init_cache_engine(self) -> None: + self.cache_engine = OpenVINOCacheEngine( + self.cache_config, + self.model_config, + self.parallel_config, + self.device_config, + ) + self.kv_cache = self.cache_engine.kv_cache + self.model_runner.block_size = self.cache_engine.block_size + + assert self.kv_cache is not None + + # Populate the cache to warmup the memory + for key_cache, value_cache in self.kv_cache: + key_cache.data[:] = 0 + value_cache.data[:] = 0 + + def cache_copy( + self, + blocks_to_copy: List[Tuple[int, int]], + ) -> None: + self.cache_engine.copy(blocks_to_copy) # type: ignore + + @torch.inference_mode() + def execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None, + ) -> List[SamplerOutput]: + if execute_model_req is None: + seq_group_metadata_list = None + else: + seq_group_metadata_list = execute_model_req.seq_group_metadata_list + + if self.is_driver_worker: + assert seq_group_metadata_list is not None + num_seq_groups: int = len(seq_group_metadata_list) + assert execute_model_req is not None + blocks_to_copy = execute_model_req.blocks_to_copy + assert len(execute_model_req.blocks_to_swap_in) == 0 + assert len(execute_model_req.blocks_to_swap_out) == 0 + data: Dict[str, Any] = { + "num_seq_groups": num_seq_groups, + "blocks_to_copy": execute_model_req.blocks_to_copy, + } + broadcast_tensor_dict(data, src=0) + else: + data = broadcast_tensor_dict(src=0) + num_seq_groups = data["num_seq_groups"] + blocks_to_copy = data["blocks_to_copy"] + + self.cache_copy(blocks_to_copy) + + # If there is no input, we don't need to execute the model. + if num_seq_groups == 0: + return [] + + output = self.model_runner.execute_model(seq_group_metadata_list, + self.kv_cache) + + # OpenVINO worker only supports single-step execution. + return [output] + + def init_distributed_environment(self) -> None: + """Initialize the distributed environment.""" + + parallel_config = self.parallel_config + rank = self.rank + distributed_init_method = self.distributed_init_method + init_distributed_environment( + world_size=parallel_config.world_size, + rank=rank, + distributed_init_method=distributed_init_method, + backend="gloo", + ) + + # A small all_reduce for warmup. 
+ torch.distributed.all_reduce(torch.zeros(1).cpu()) + + ensure_model_parallel_initialized( + parallel_config.tensor_parallel_size, + parallel_config.pipeline_parallel_size, + ) + + def get_cache_block_size_bytes(self) -> int: + """Return the size in bytes of a single KV cache block.""" + return OpenVINOCacheEngine.get_cache_block_size( + self.cache_config.block_size, + self.cache_config.cache_dtype, + self.model_config, + self.parallel_config, + ) From da4c965ec856f3d39b83d118b3f64541f4748211 Mon Sep 17 00:00:00 2001 From: hezhihui Date: Wed, 10 Jul 2024 17:08:45 +0800 Subject: [PATCH 23/52] update processing minicpmv --- .buildkite/download-images.sh | 4 - .../benchmark-pipeline.yaml | 1 + .buildkite/release-pipeline.yaml | 2 +- .buildkite/run-cpu-test.sh | 8 +- .buildkite/test-pipeline.yaml | 76 +- .buildkite/test-template-aws.j2 | 93 -- CMakeLists.txt | 24 +- Dockerfile | 26 +- Dockerfile.cpu | 10 +- Dockerfile.rocm | 215 ++- README.md | 2 +- benchmarks/backend_request_func.py | 20 +- benchmarks/benchmark_latency.py | 27 +- benchmarks/benchmark_prefix_caching.py | 4 +- benchmarks/benchmark_serving.py | 72 +- benchmarks/benchmark_throughput.py | 10 +- .../cutlass_benchmarks/w8a8_benchmarks.py | 3 +- benchmarks/kernels/benchmark_aqlm.py | 4 +- benchmarks/kernels/benchmark_marlin.py | 4 +- benchmarks/kernels/benchmark_moe.py | 3 +- .../kernels/benchmark_paged_attention.py | 6 +- benchmarks/kernels/benchmark_rope.py | 4 +- benchmarks/overheads/benchmark_hashing.py | 4 +- cmake/cpu_extension.cmake | 11 +- cmake/utils.cmake | 20 +- csrc/activation_kernels.cu | 12 + csrc/cpu/activation.cpp | 19 + csrc/cpu/cpu_types.hpp | 514 +------ csrc/cpu/torch_bindings.cpp | 4 + csrc/ops.h | 13 +- csrc/punica/bgmv/bgmv_config.h | 40 +- csrc/quantization/cutlass_w8a8/common.hpp | 15 + .../cutlass_w8a8/scaled_mm_c2x.cu | 385 ++++- .../cutlass_w8a8/scaled_mm_c3x.cu | 299 +++- .../cutlass_w8a8/scaled_mm_entry.cu | 48 +- csrc/torch_bindings.cpp | 16 +- docs/requirements-docs.txt | 8 +- docs/source/conf.py | 13 + docs/source/dev/dockerfile/dockerfile.rst | 22 +- .../dev/multimodal/multimodal_index.rst | 29 +- .../getting_started/amd-installation.rst | 6 +- docs/source/getting_started/debugging.rst | 13 +- docs/source/getting_started/installation.rst | 4 +- .../getting_started/tpu-installation.rst | 18 + docs/source/index.rst | 6 + docs/source/models/adding_model.rst | 42 +- docs/source/models/lora.rst | 3 + docs/source/models/supported_models.rst | 65 +- docs/source/models/vlm.rst | 83 +- docs/source/quantization/fp8.rst | 5 +- docs/source/serving/deploying_with_docker.rst | 7 +- docs/source/serving/distributed_serving.rst | 7 +- docs/source/serving/env_vars.rst | 5 + examples/api_client.py | 7 +- examples/aqlm_example.py | 5 +- examples/llava_example.py | 66 +- examples/llm_engine_example.py | 3 +- examples/minicpmv_example.py | 48 +- examples/offline_inference_neuron.py | 0 examples/phi3v_example.py | 33 +- examples/production_monitoring/grafana.json | 27 +- examples/save_sharded_state.py | 4 +- examples/tensorize_vllm_model.py | 3 +- pyproject.toml | 2 - requirements-common.txt | 3 +- requirements-cpu.txt | 5 +- requirements-cuda.txt | 2 + requirements-test.txt | 5 +- setup.py | 11 +- tests/async_engine/api_server_async_engine.py | 4 +- tests/async_engine/test_async_llm_engine.py | 52 +- .../test_basic_correctness.py | 21 +- .../basic_correctness/test_chunked_prefill.py | 15 +- tests/basic_correctness/test_preemption.py | 40 +- tests/conftest.py | 200 ++- tests/core/block/test_block_table.py | 5 +- 
.../block/test_cpu_gpu_block_allocator.py | 24 +- tests/core/block/test_naive_block.py | 6 +- tests/core/block/test_prefix_caching_block.py | 106 +- .../test_basic_distributed_correctness.py | 35 +- .../test_chunked_prefill_distributed.py | 29 +- tests/distributed/test_comm_ops.py | 98 +- tests/distributed/test_custom_all_reduce.py | 10 +- tests/distributed/test_pynccl.py | 16 +- tests/distributed/test_utils.py | 17 +- .../output_processor/test_multi_step.py | 8 +- tests/entrypoints/openai/test_serving_chat.py | 4 - tests/entrypoints/test_guided_processors.py | 113 -- tests/entrypoints/test_llm_encode.py | 144 -- tests/entrypoints/test_llm_generate.py | 144 -- .../test_llm_generate_multiple_loras.py | 69 - tests/entrypoints/test_openai_embedding.py | 113 -- tests/entrypoints/test_openai_run_batch.py | 53 - tests/entrypoints/test_openai_server.py | 1370 ----------------- tests/entrypoints/test_openai_vision.py | 285 ---- .../test_server_oot_registration.py | 69 - tests/kernels/test_attention_selector.py | 9 +- tests/kernels/test_cutlass.py | 136 +- tests/kernels/test_marlin_gemm.py | 88 +- tests/kernels/test_moe.py | 6 +- tests/lora/conftest.py | 4 +- tests/lora/test_baichuan.py | 14 +- tests/lora/test_layers.py | 17 +- tests/lora/test_lora_manager.py | 64 + tests/lora/test_mixtral.py | 4 +- tests/lora/test_punica.py | 18 + tests/metrics/test_metrics.py | 12 +- tests/models/test_big_models.py | 15 +- tests/models/test_llava.py | 213 +-- tests/models/test_llava_next.py | 191 ++- tests/models/test_minicpmv.py | 235 +-- tests/models/test_models.py | 16 +- tests/models/test_phi3v.py | 213 +-- tests/models/utils.py | 101 +- tests/multimodal/test_processor.py | 149 -- tests/multimodal/test_utils.py | 29 +- tests/quantization/test_compressed_tensors.py | 84 +- tests/quantization/test_fp8.py | 36 +- tests/quantization/utils.py | 3 +- tests/samplers/test_sampler.py | 2 +- .../test_typical_acceptance_sampler.py | 96 +- tests/spec_decode/e2e/conftest.py | 47 +- .../spec_decode/e2e/test_integration_dist.py | 65 - .../e2e/test_multistep_correctness.py | 54 +- tests/spec_decode/test_batch_expansion.py | 8 +- tests/spec_decode/test_dynamic_spec_decode.py | 12 +- tests/spec_decode/test_metrics.py | 94 +- tests/spec_decode/test_multi_step_worker.py | 3 + tests/spec_decode/test_spec_decode_worker.py | 162 +- tests/spec_decode/test_utils.py | 26 +- tests/spec_decode/utils.py | 11 +- tests/tensorizer_loader/test_tensorizer.py | 4 +- tests/test_logits_processor.py | 2 +- tests/test_utils.py | 61 +- tests/tokenization/test_image_processor.py | 20 - tests/tokenization/test_tokenizer_group.py | 99 ++ tests/utils.py | 96 +- tests/worker/test_model_runner.py | 57 +- tests/worker/test_swap.py | 4 +- vllm/_custom_ops.py | 27 +- vllm/_ipex_ops.py | 3 + vllm/attention/backends/abstract.py | 6 +- vllm/attention/backends/blocksparse_attn.py | 4 +- vllm/attention/backends/flash_attn.py | 4 +- vllm/attention/backends/flashinfer.py | 95 +- vllm/attention/backends/ipex_attn.py | 4 +- vllm/attention/backends/pallas.py | 21 +- vllm/attention/backends/rocm_flash_attn.py | 57 +- vllm/attention/backends/torch_sdpa.py | 4 +- vllm/attention/backends/xformers.py | 4 +- vllm/attention/layer.py | 23 +- .../ops/blocksparse_attention/interface.py | 5 +- .../ops/blocksparse_attention/utils.py | 19 +- vllm/attention/ops/prefix_prefill.py | 4 +- vllm/attention/selector.py | 17 +- vllm/block.py | 82 +- vllm/config.py | 370 +++-- vllm/core/block/block_table.py | 85 +- vllm/core/block/common.py | 198 ++- 
vllm/core/block/cpu_gpu_block_allocator.py | 84 +- vllm/core/block/interfaces.py | 56 +- vllm/core/block/naive_block.py | 216 ++- vllm/core/block/prefix_caching_block.py | 693 ++++++--- vllm/core/block_manager_v1.py | 22 +- vllm/core/block_manager_v2.py | 153 +- vllm/core/scheduler.py | 29 +- .../device_communicators/custom_all_reduce.py | 58 +- .../custom_all_reduce_utils.py | 39 +- .../device_communicators/pynccl.py | 14 +- vllm/distributed/parallel_state.py | 387 ++++- vllm/distributed/utils.py | 18 +- vllm/engine/arg_utils.py | 188 +-- vllm/engine/async_llm_engine.py | 107 +- vllm/engine/llm_engine.py | 181 ++- vllm/engine/metrics.py | 293 ++-- vllm/engine/output_processor/interfaces.py | 2 +- vllm/engine/output_processor/multi_step.py | 5 +- vllm/engine/output_processor/single_step.py | 20 +- vllm/entrypoints/api_server.py | 16 +- vllm/entrypoints/llm.py | 5 + vllm/entrypoints/openai/api_server.py | 48 +- vllm/entrypoints/openai/cli_args.py | 3 +- vllm/entrypoints/openai/protocol.py | 81 +- vllm/entrypoints/openai/run_batch.py | 9 +- vllm/entrypoints/openai/serving_chat.py | 152 +- vllm/entrypoints/openai/serving_completion.py | 64 +- vllm/entrypoints/openai/serving_embedding.py | 26 +- vllm/entrypoints/openai/serving_engine.py | 17 +- vllm/envs.py | 34 +- vllm/executor/cpu_executor.py | 5 +- vllm/executor/distributed_gpu_executor.py | 35 +- vllm/executor/executor_base.py | 35 +- vllm/executor/gpu_executor.py | 11 +- vllm/executor/multiproc_gpu_executor.py | 36 +- vllm/executor/neuron_executor.py | 12 +- vllm/executor/ray_gpu_executor.py | 90 +- vllm/executor/ray_utils.py | 23 +- vllm/executor/ray_xpu_executor.py | 10 +- vllm/executor/tpu_executor.py | 61 +- vllm/executor/xpu_executor.py | 8 +- vllm/inputs.py | 130 -- vllm/lora/fully_sharded_layers.py | 58 +- vllm/lora/layers.py | 44 +- vllm/lora/lora.py | 3 +- vllm/lora/models.py | 95 +- vllm/lora/punica.py | 3 +- vllm/lora/utils.py | 4 +- vllm/lora/worker_manager.py | 3 + vllm/model_executor/layers/activation.py | 19 + .../layers/fused_moe/__init__.py | 7 +- ...14336,device_name=AMD_Instinct_MI300X.json | 164 +- ...=1792,device_name=AMD_Instinct_MI300X.json | 182 ++- ...=3584,device_name=AMD_Instinct_MI300X.json | 172 ++- ...=7168,device_name=AMD_Instinct_MI300X.json | 176 ++- .../layers/fused_moe/fused_moe.py | 143 +- vllm/model_executor/layers/layernorm.py | 46 + vllm/model_executor/layers/linear.py | 136 +- .../model_executor/layers/logits_processor.py | 26 +- .../model_executor/layers/quantization/awq.py | 3 +- .../layers/quantization/base_config.py | 12 +- .../layers/quantization/bitsandbytes.py | 2 +- .../compressed_tensors/compressed_tensors.py | 106 +- .../compressed_tensors/schemes/__init__.py | 29 +- .../schemes/compressed_tensors_scheme.py | 8 + .../schemes/compressed_tensors_unquantized.py | 3 + .../schemes/compressed_tensors_w4a16.py | 171 -- .../schemes/compressed_tensors_w4a16_24.py | 4 + .../compressed_tensors_w8a8_dynamictoken.py | 102 -- .../compressed_tensors_w8a8_statictensor.py | 87 -- .../quantization/compressed_tensors/utils.py | 1 + .../model_executor/layers/quantization/fp8.py | 385 +++-- .../layers/quantization/gptq.py | 13 +- .../layers/quantization/gptq_marlin.py | 32 +- .../layers/quantization/marlin.py | 13 +- .../layers/quantization/squeezellm.py | 3 +- .../layers/quantization/utils/marlin_utils.py | 130 +- .../layers/rejection_sampler.py | 18 +- .../model_executor/layers/rotary_embedding.py | 136 ++ vllm/model_executor/layers/sampler.py | 4 +- .../layers/spec_decode_base_sampler.py | 15 +- 
.../layers/typical_acceptance_sampler.py | 22 +- .../layers/vocab_parallel_embedding.py | 74 +- vllm/model_executor/model_loader/__init__.py | 8 +- vllm/model_executor/model_loader/loader.py | 70 +- .../model_executor/model_loader/tensorizer.py | 4 +- vllm/model_executor/models/__init__.py | 6 + vllm/model_executor/models/arctic.py | 6 +- vllm/model_executor/models/baichuan.py | 18 +- vllm/model_executor/models/bloom.py | 7 +- vllm/model_executor/models/chatglm.py | 19 +- vllm/model_executor/models/commandr.py | 11 +- vllm/model_executor/models/dbrx.py | 6 +- vllm/model_executor/models/decilm.py | 4 +- vllm/model_executor/models/deepseek.py | 9 +- vllm/model_executor/models/falcon.py | 9 +- vllm/model_executor/models/gemma.py | 50 +- vllm/model_executor/models/gpt2.py | 92 +- vllm/model_executor/models/gpt_bigcode.py | 14 +- vllm/model_executor/models/gpt_j.py | 6 +- vllm/model_executor/models/gpt_neox.py | 6 +- vllm/model_executor/models/internlm2.py | 9 +- vllm/model_executor/models/jais.py | 7 +- vllm/model_executor/models/llama.py | 111 +- vllm/model_executor/models/llava.py | 244 ++- vllm/model_executor/models/llava_next.py | 436 ++++-- vllm/model_executor/models/minicpm.py | 20 +- vllm/model_executor/models/minicpmv.py | 114 +- vllm/model_executor/models/mixtral.py | 289 +--- vllm/model_executor/models/mixtral_quant.py | 9 +- vllm/model_executor/models/mpt.py | 7 +- vllm/model_executor/models/olmo.py | 9 +- vllm/model_executor/models/opt.py | 7 +- vllm/model_executor/models/orion.py | 9 +- vllm/model_executor/models/phi.py | 28 +- vllm/model_executor/models/phi3_small.py | 6 +- vllm/model_executor/models/phi3v.py | 346 ++++- vllm/model_executor/models/qwen.py | 19 +- vllm/model_executor/models/qwen2.py | 33 +- vllm/model_executor/models/qwen2_moe.py | 145 +- vllm/model_executor/models/stablelm.py | 9 +- vllm/model_executor/models/starcoder2.py | 9 +- vllm/model_executor/models/vlm_base.py | 12 - vllm/model_executor/models/xverse.py | 18 +- vllm/model_executor/sampling_metadata.py | 4 +- vllm/multimodal/__init__.py | 22 +- vllm/multimodal/base.py | 300 +++- vllm/multimodal/image.py | 252 ++- vllm/multimodal/registry.py | 196 ++- vllm/multimodal/utils.py | 103 +- vllm/outputs.py | 4 +- vllm/sampling_params.py | 29 +- vllm/sequence.py | 230 ++- vllm/spec_decode/batch_expansion.py | 6 +- vllm/spec_decode/interfaces.py | 4 + vllm/spec_decode/metrics.py | 24 +- vllm/spec_decode/multi_step_worker.py | 38 +- vllm/spec_decode/proposer_worker_base.py | 4 +- vllm/spec_decode/spec_decode_worker.py | 125 +- vllm/spec_decode/top1_proposer.py | 4 + vllm/spec_decode/util.py | 8 - vllm/transformers_utils/config.py | 42 +- vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/image_processor.py | 26 +- .../tokenizer_group/base_tokenizer_group.py | 4 + .../tokenizer_group/ray_tokenizer_group.py | 116 +- vllm/utils.py | 188 ++- vllm/version.py | 2 +- vllm/worker/cache_engine.py | 19 +- vllm/worker/cpu_model_runner.py | 248 +-- vllm/worker/cpu_worker.py | 135 +- vllm/worker/embedding_model_runner.py | 161 +- vllm/worker/model_runner.py | 1018 ++++++++---- vllm/worker/neuron_model_runner.py | 108 +- vllm/worker/neuron_worker.py | 39 +- vllm/worker/tpu_model_runner.py | 192 ++- vllm/worker/tpu_worker.py | 174 ++- vllm/worker/worker.py | 191 +-- vllm/worker/worker_base.py | 214 ++- vllm/worker/xpu_model_runner.py | 187 ++- vllm/worker/xpu_worker.py | 18 +- 320 files changed, 12447 insertions(+), 9601 deletions(-) delete mode 100644 .buildkite/test-template-aws.j2 mode change 100755 => 
100644 csrc/punica/bgmv/bgmv_config.h mode change 100755 => 100644 examples/offline_inference_neuron.py delete mode 100644 tests/entrypoints/test_guided_processors.py delete mode 100644 tests/entrypoints/test_llm_encode.py delete mode 100644 tests/entrypoints/test_llm_generate.py delete mode 100644 tests/entrypoints/test_llm_generate_multiple_loras.py delete mode 100644 tests/entrypoints/test_openai_embedding.py delete mode 100644 tests/entrypoints/test_openai_run_batch.py delete mode 100644 tests/entrypoints/test_openai_server.py delete mode 100644 tests/entrypoints/test_openai_vision.py delete mode 100644 tests/entrypoints/test_server_oot_registration.py delete mode 100644 tests/multimodal/test_processor.py delete mode 100644 tests/spec_decode/e2e/test_integration_dist.py delete mode 100644 tests/tokenization/test_image_processor.py delete mode 100644 vllm/inputs.py delete mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16.py delete mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py delete mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py mode change 100755 => 100644 vllm/model_executor/models/__init__.py delete mode 100644 vllm/model_executor/models/vlm_base.py diff --git a/.buildkite/download-images.sh b/.buildkite/download-images.sh index 389a12956c3c3..360a7584bccf1 100644 --- a/.buildkite/download-images.sh +++ b/.buildkite/download-images.sh @@ -8,10 +8,6 @@ set -o pipefail # aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/ mkdir -p images cd images -wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt -wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt -wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt -wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml index 8f12748b68f39..2b25c954b5c5c 100644 --- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml @@ -17,6 +17,7 @@ steps: plugins: - kubernetes: podSpec: + priorityClassName: perf-benchmark containers: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT command: diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 1959f9752069f..c394f3fd7a0c5 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -1,7 +1,7 @@ steps: - block: "Build wheels" - - label: "Build wheel - Python {{matrix.python_version}}, CUDA {{matrix.cuda_version}}" + - label: "Build wheel - Python {{matrix.python_version}}, CUDA {{matrix.cuda_version}}" agents: queue: cpu_queue commands: diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index f4fa24be1f20f..a7678aae54644 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -12,8 +12,10 @@ trap remove_docker_container EXIT remove_docker_container # Run the image -docker run -itd -v 
~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test -docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2 +docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \ + --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test +docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \ + --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2 # offline inference docker exec cpu-test bash -c "python3 examples/offline_inference.py" @@ -23,4 +25,4 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" docker exec cpu-test bash -c "cd tests; pip install pytest Pillow protobuf cd ../ - pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py" + pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py" # Mamba on CPU is not supported diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 5afe3730210e8..8013fbb642bb8 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1,7 +1,10 @@ # In this file, you can add more tests to run either by adding a new step or # adding a new command to an existing step. See different options here for examples. -# This script will be feed into Jinja template in `test-template-aws.j2` to generate -# the final pipeline yaml file. + +# This script will be feed into Jinja template in `test-template-aws.j2` at +# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 +# to generate the final pipeline yaml file. 
+ steps: - label: Regression Test @@ -24,29 +27,38 @@ steps: - label: Core Test mirror_hardwares: [amd] - command: pytest -v -s core + commands: + - pytest -v -s core + - pytest -v -s distributed/test_parallel_state.py - label: Distributed Comm Ops Test #mirror_hardwares: [amd] - command: pytest -v -s distributed/test_comm_ops.py working_dir: "/vllm-workspace/tests" num_gpus: 2 + commands: + - pytest -v -s distributed/test_comm_ops.py + - pytest -v -s distributed/test_shm_broadcast.py - label: Distributed Tests (2 GPUs) mirror_hardwares: [amd] working_dir: "/vllm-workspace/tests" num_gpus: 2 commands: + - bash ../.buildkite/download-images.sh - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py + - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py + - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py - - pytest -v -s spec_decode/e2e/test_integration_dist.py + - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py + - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py + - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py @@ -60,6 +72,18 @@ steps: # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context. 
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py + - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py + +- label: Pipeline Parallelism Test + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + commands: + - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py + - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py + - TP_SIZE=1 PP_SIZE=3 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py + - PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py + - PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py + - label: Engine Test mirror_hardwares: [amd] @@ -69,8 +93,8 @@ steps: mirror_hardwares: [amd] commands: - - pytest -v -s entrypoints -m llm - - pytest -v -s entrypoints -m openai + - pytest -v -s entrypoints/llm + - pytest -v -s entrypoints/openai - label: Examples Test working_dir: "/vllm-workspace/examples" @@ -94,12 +118,15 @@ steps: - label: Kernels Test %N #mirror_hardwares: [amd] - command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + commands: + - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl + - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT parallelism: 4 - label: Models Test #mirror_hardwares: [amd] commands: + - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl - pytest -v -s models -m \"not vlm\" - label: Vision Language Models Test @@ -145,6 +172,9 @@ steps: num_gpus: 4 # This test runs llama 13B, so it is required to run on 4 GPUs. 
commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s -x lora/test_long_context.py - label: Tensorizer Test @@ -175,9 +205,39 @@ steps: - pip install aiohttp - bash run-benchmarks.sh +- label: LM Eval Small Models + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + commands: + - pip install lm-eval + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - bash ./run-tests.sh -c configs/models-small.txt -t 1 + +- label: LM Eval Large Models + gpu: a100 + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + commands: + - pip install lm-eval + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - bash ./run-tests.sh -c configs/models-large.txt -t 4 + - label: Documentation Build working_dir: "/vllm-workspace/test_docs/docs" no_gpu: True commands: - pip install -r requirements-docs.txt - SPHINXOPTS=\"-W\" make html + +- label: Distributed Tests (A100) + gpu: a100 + num_gpus: 4 + commands: + # NOTE: don't test llama model here, it seems hf implementation is buggy + # see https://github.com/vllm-project/vllm/pull/5689 for details + - pytest -v -s distributed/test_custom_all_reduce.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py + - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl + - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - pytest -v -s -x lora/test_mixtral.py diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 deleted file mode 100644 index 01f7ff1e0e2b5..0000000000000 --- a/.buildkite/test-template-aws.j2 +++ /dev/null @@ -1,93 +0,0 @@ -{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %} -{% set default_working_dir = "/vllm-workspace/tests" %} - -steps: - - label: ":docker: build image" - agents: - queue: cpu_queue - commands: - - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --tag {{ docker_image }} --target test --progress plain ." 
- - "docker push {{ docker_image }}" - env: - DOCKER_BUILDKIT: "1" - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - - wait - - - group: "AMD Tests" - depends_on: ~ - steps: - {% for step in steps %} - {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %} - - label: "AMD: {{ step.label }}" - agents: - queue: amd - command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}" - env: - DOCKER_BUILDKIT: "1" - priority: 100 - soft_fail: true - {% endif %} - {% endfor %} - - - label: "Neuron Test" - depends_on: ~ - agents: - queue: neuron - command: bash .buildkite/run-neuron-test.sh - soft_fail: false - - - label: "Intel Test" - depends_on: ~ - agents: - queue: intel - command: bash .buildkite/run-cpu-test.sh - - {% for step in steps %} - - label: "{{ step.label }}" - agents: - {% if step.label == "Documentation Build" %} - queue: small_cpu_queue - {% elif step.no_gpu %} - queue: cpu_queue - {% elif step.num_gpus == 2 or step.num_gpus == 4 %} - queue: gpu_4_queue - {% else %} - queue: gpu_1_queue - {% endif %} - soft_fail: {{ step.soft_fail or false }} - {% if step.parallelism %} - parallelism: {{ step.parallelism }} - {% endif %} - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - plugins: - - docker#v5.2.0: - image: {{ docker_image }} - always-pull: true - propagate-environment: true - {% if not step.no_gpu %} - gpus: all - {% endif %} - {% if step.label == "Benchmarks" %} - mount-buildkite-agent: true - {% endif %} - command: ["bash", "-c", "cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}"] - environment: - - VLLM_USAGE_SOURCE=ci-test - - HF_TOKEN - {% if step.label == "Speculative decoding tests" %} - - VLLM_ATTENTION_BACKEND=XFORMERS - {% endif %} - volumes: - - /dev/shm:/dev/shm - {% endfor %} diff --git a/CMakeLists.txt b/CMakeLists.txt index aa15b632cdd3b..31f7a97386d91 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,8 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) -option(VLLM_TARGET_DEVICE "Target device backend for vLLM" "cuda") +# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py) +set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM") message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") message(STATUS "Target device: ${VLLM_TARGET_DEVICE}") @@ -32,8 +33,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11 # versions are derived from Dockerfile.rocm # set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0") -set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1") -set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1") +set(TORCH_SUPPORTED_VERSION_ROCM "2.4.0") # # Try to find python package with an executable that exactly matches @@ -98,18 +98,11 @@ elseif(HIP_FOUND) # .hip extension automatically, HIP must be enabled explicitly. 
enable_language(HIP) - # ROCm 5.x - if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND - NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X}) - message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} " - "expected for ROCMm 5.x build, saw ${Torch_VERSION} instead.") - endif() - - # ROCm 6.x - if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND - NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X}) - message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} " - "expected for ROCMm 6.x build, saw ${Torch_VERSION} instead.") + # ROCm 5.X and 6.X + if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND + NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM}) + message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM} " + "expected for ROCm build, saw ${Torch_VERSION} instead.") endif() else() message(FATAL_ERROR "Can't find CUDA or HIP installation.") @@ -178,6 +171,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" "csrc/quantization/gptq_marlin/gptq_marlin.cu" "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" + "csrc/quantization/fp8/fp8_marlin.cu" "csrc/custom_all_reduce.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu" diff --git a/Dockerfile b/Dockerfile index 5b3e682a80169..feb004513959b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -43,6 +43,10 @@ COPY requirements-cuda.txt requirements-cuda.txt RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-cuda.txt +COPY requirements-mamba.txt requirements-mamba.txt +RUN python3 -m pip install packaging +RUN python3 -m pip install -r requirements-mamba.txt + # cuda arch list used by torch # can be useful for both `dev` and `test` # explicitly set the list to avoid issues with torch 2.2 @@ -95,6 +99,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \ && export SCCACHE_BUCKET=vllm-build-sccache \ && export SCCACHE_REGION=us-west-2 \ + && export CMAKE_BUILD_TYPE=Release \ && sccache --show-stats \ && python3 setup.py bdist_wheel --dist-dir=dist \ && sccache --show-stats; \ @@ -123,6 +128,21 @@ RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-dev.txt #################### DEV IMAGE #################### +#################### MAMBA Build IMAGE #################### +FROM dev as mamba-builder +# max jobs used for build +ARG max_jobs=2 +ENV MAX_JOBS=${max_jobs} + +WORKDIR /usr/src/mamba + +COPY requirements-mamba.txt requirements-mamba.txt + +# Download the wheel or build it if a pre-compiled release doesn't exist +RUN pip --verbose wheel -r requirements-mamba.txt \ + --no-build-isolation --no-deps --no-cache-dir + +#################### MAMBA Build IMAGE #################### #################### vLLM installation IMAGE #################### # image with vLLM installed @@ -143,6 +163,10 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. 
-f1,2)/compat/ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install dist/*.whl --verbose + +RUN --mount=type=bind,from=mamba-builder,src=/usr/src/mamba,target=/usr/src/mamba \ + --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install /usr/src/mamba/*.whl --no-cache-dir #################### vLLM installation IMAGE #################### @@ -172,7 +196,7 @@ FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ - pip install accelerate hf_transfer modelscope + pip install accelerate hf_transfer 'modelscope!=1.15.0' ENV VLLM_USAGE_SOURCE production-docker-image diff --git a/Dockerfile.cpu b/Dockerfile.cpu index 6e55203decc56..f95d748f1e4be 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -6,7 +6,13 @@ RUN apt-get update -y \ && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 \ && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 -RUN echo 'export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD' >> ~/.bashrc +# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html +# intel-openmp provides additional performance improvement vs. openmp +# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects. +RUN pip install intel-openmp + +ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so:$LD_PRELOAD" + RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl @@ -31,4 +37,4 @@ WORKDIR /workspace/ RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks -CMD ["/bin/bash"] +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 6bda696859c8b..1b89b892bbf1c 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -1,34 +1,35 @@ -# default base image -ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - -FROM $BASE_IMAGE - -ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - -RUN echo "Base image is $BASE_IMAGE" - -ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" \ - ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - +# Default ROCm 6.1 base image +ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging" + +# Tested and supported base rocm/pytorch images +ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu20.04_py3.9_pytorch_2.0.1" \ + ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" \ + ROCM_6_1_BASE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging" + +# Default ROCm ARCHes to build vLLM for. +ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100" + +# Whether to build CK-based flash-attention +# If 0, will not build flash attention +# This is useful for gfx target where flash-attention is not supported +# (i.e. those that do not appear in `FA_GFX_ARCHS`) +# Triton FA is used by default on ROCm now so this is unnecessary. 
+ARG BUILD_FA="1" ARG FA_GFX_ARCHS="gfx90a;gfx942" -RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS" - ARG FA_BRANCH="ae7928c" -RUN echo "FA_BRANCH is $FA_BRANCH" -# whether to build flash-attention -# if 0, will not build flash attention -# this is useful for gfx target where flash-attention is not supported -# In that case, we need to use the python reference attention implementation in vllm -ARG BUILD_FA="1" - -# whether to build triton on rocm +# Whether to build triton on rocm ARG BUILD_TRITON="1" +ARG TRITON_BRANCH="0ef1848" -# Install some basic utilities -RUN apt-get update && apt-get install python3 python3-pip -y +### Base image build stage +FROM $BASE_IMAGE AS base + +# Import arg(s) defined before this build stage +ARG PYTORCH_ROCM_ARCH # Install some basic utilities +RUN apt-get update && apt-get install python3 python3-pip -y RUN apt-get update && apt-get install -y \ curl \ ca-certificates \ @@ -39,79 +40,165 @@ RUN apt-get update && apt-get install -y \ build-essential \ wget \ unzip \ - nvidia-cuda-toolkit \ tmux \ ccache \ && rm -rf /var/lib/apt/lists/* -### Mount Point ### -# When launching the container, mount the code directory to /app +# When launching the container, mount the code directory to /vllm-workspace ARG APP_MOUNT=/vllm-workspace -VOLUME [ ${APP_MOUNT} ] WORKDIR ${APP_MOUNT} -RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas +RUN pip install --upgrade pip +# Remove sccache so it doesn't interfere with ccache +# TODO: implement sccache support across components +RUN apt-get purge -y sccache; pip uninstall -y sccache; rm -f "$(which sccache)" +# Install torch == 2.4.0 on ROCm +RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ + *"rocm-5.7"*) \ + pip uninstall -y torch torchaudio torchvision \ + && pip install --no-cache-dir --pre \ + torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \ + torchvision==0.19.0.dev20240612 \ + --index-url https://download.pytorch.org/whl/nightly/rocm5.7;; \ + *"rocm-6.0"*) \ + pip uninstall -y torch torchaudio torchvision \ + && pip install --no-cache-dir --pre \ + torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \ + torchvision==0.19.0.dev20240612 \ + --index-url https://download.pytorch.org/whl/nightly/rocm6.0;; \ + *"rocm-6.1"*) \ + pip uninstall -y torch torchaudio torchvision \ + && pip install --no-cache-dir --pre \ + torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \ + torchvision==0.19.0.dev20240612 \ + --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \ + *) ;; esac ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin: ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib: ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/: -# Install ROCm flash-attention -RUN if [ "$BUILD_FA" = "1" ]; then \ - mkdir libs \ +ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} +ENV CCACHE_DIR=/root/.cache/ccache + + +### AMD-SMI build stage +FROM base AS build_amdsmi +# Build amdsmi wheel always +RUN cd /opt/rocm/share/amd_smi \ + && pip wheel . 
--wheel-dir=/install + + +### Flash-Attention wheel build stage +FROM base AS build_fa +ARG BUILD_FA +ARG FA_GFX_ARCHS +ARG FA_BRANCH +# Build ROCm flash-attention wheel if `BUILD_FA = 1` +RUN --mount=type=cache,target=${CCACHE_DIR} \ + if [ "$BUILD_FA" = "1" ]; then \ + mkdir -p libs \ && cd libs \ && git clone https://github.com/ROCm/flash-attention.git \ && cd flash-attention \ - && git checkout ${FA_BRANCH} \ + && git checkout "${FA_BRANCH}" \ && git submodule update --init \ - && export GPU_ARCHS=${FA_GFX_ARCHS} \ - && if [ "$BASE_IMAGE" = "$ROCm_5_7_BASE" ]; then \ - patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \ - && python3 setup.py install \ - && cd ..; \ + && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ + *"rocm-5.7"*) \ + export VLLM_TORCH_PATH="$(python3 -c 'import torch; print(torch.__path__[0])')" \ + && patch "${VLLM_TORCH_PATH}"/utils/hipify/hipify_python.py hipify_patch.patch;; \ + *) ;; esac \ + && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \ + # Create an empty directory otherwise as later build stages expect one + else mkdir -p /install; \ fi -# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. -# Manually removed it so that later steps of numpy upgrade can continue -RUN if [ "$BASE_IMAGE" = "$ROCm_6_0_BASE" ]; then \ - rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi -# build triton -RUN if [ "$BUILD_TRITON" = "1" ]; then \ +### Triton wheel build stage +FROM base AS build_triton +ARG BUILD_TRITON +ARG TRITON_BRANCH +# Build triton wheel if `BUILD_TRITON = 1` +RUN --mount=type=cache,target=${CCACHE_DIR} \ + if [ "$BUILD_TRITON" = "1" ]; then \ mkdir -p libs \ && cd libs \ - && pip uninstall -y triton \ - && git clone https://github.com/ROCm/triton.git \ - && cd triton/python \ - && pip3 install . \ - && cd ../..; \ + && git clone https://github.com/OpenAI/triton.git \ + && cd triton \ + && git checkout "${TRITON_BRANCH}" \ + && cd python \ + && python3 setup.py bdist_wheel --dist-dir=/install; \ + # Create an empty directory otherwise as later build stages expect one + else mkdir -p /install; \ fi -WORKDIR /vllm-workspace + +### Final vLLM build stage +FROM base AS final +# Import the vLLM development directory from the build context COPY . . -#RUN python3 -m pip install pynvml # to be removed eventually -RUN python3 -m pip install --upgrade pip numba +# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. 
+# Manually remove it so that later steps of numpy upgrade can continue +RUN case "$(which python3)" in \ + *"/opt/conda/envs/py_3.9"*) \ + rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \ + *) ;; esac + +# Package upgrades for useful functionality or to avoid dependency issues +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install --upgrade numba scipy huggingface-hub[cli] -# make sure punica kernels are built (for LoRA) +# Make sure punica kernels are built (for LoRA) ENV VLLM_INSTALL_PUNICA_KERNELS=1 # Workaround for ray >= 2.10.0 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 +# Silences the HF Tokenizers warning +ENV TOKENIZERS_PARALLELISM=false -ENV VLLM_NCCL_SO_PATH=/opt/rocm/lib/librccl.so - -ENV CCACHE_DIR=/root/.cache/ccache -RUN --mount=type=cache,target=/root/.cache/ccache \ +RUN --mount=type=cache,target=${CCACHE_DIR} \ --mount=type=cache,target=/root/.cache/pip \ pip install -U -r requirements-rocm.txt \ - && if [ "$BASE_IMAGE" = "$ROCm_6_0_BASE" ]; then \ - patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch; fi \ - && python3 setup.py install \ - && export VLLM_PYTHON_VERSION=$(python -c "import sys; print(str(sys.version_info.major) + str(sys.version_info.minor))") \ - && cp build/lib.linux-x86_64-cpython-${VLLM_PYTHON_VERSION}/vllm/*.so vllm/ \ - && cd .. + && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ + *"rocm-6.0"*) \ + patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h rocm_patch/rocm_bf16.patch;; \ + *"rocm-6.1"*) \ + # Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM + wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P rocm_patch \ + && cp rocm_patch/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6 \ + # Prevent interference if torch bundles its own HIP runtime + && rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \ + *) ;; esac \ + && python3 setup.py clean --all \ + && python3 setup.py develop + +# Copy amdsmi wheel into final image +RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \ + mkdir -p libs \ + && cp /install/*.whl libs \ + # Preemptively uninstall to avoid same-version no-installs + && pip uninstall -y amdsmi; +# Copy triton wheel(s) into final image if they were built +RUN --mount=type=bind,from=build_triton,src=/install,target=/install \ + mkdir -p libs \ + && if ls /install/*.whl; then \ + cp /install/*.whl libs \ + # Preemptively uninstall to avoid same-version no-installs + && pip uninstall -y triton; fi + +# Copy flash-attn wheel(s) into final image if they were built +RUN --mount=type=bind,from=build_fa,src=/install,target=/install \ + mkdir -p libs \ + && if ls /install/*.whl; then \ + cp /install/*.whl libs \ + # Preemptively uninstall to avoid same-version no-installs + && pip uninstall -y flash-attn; fi + +# Install wheels that were built to the final image +RUN --mount=type=cache,target=/root/.cache/pip \ + if ls libs/*.whl; then \ + pip install libs/*.whl; fi CMD ["/bin/bash"] diff --git a/README.md b/README.md index c24768bf78173..3e0da945d9be8 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ vLLM is flexible and easy to use with: - Tensor parallelism support for distributed inference - Streaming outputs - OpenAI-compatible API server -- Support NVIDIA GPUs, AMD GPUs, and Intel CPUs +- Support NVIDIA GPUs, AMD GPUs, Intel CPUs and GPUs - (Experimental) Prefix caching support - (Experimental) Multi-lora support diff --git 
a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 4350b96b04a6a..fe29c67086158 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -225,8 +225,8 @@ async def async_request_openai_completions( ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith( - "v1/completions" - ), "OpenAI Completions API URL must end with 'v1/completions'." + "completions" + ), "OpenAI Completions API URL must end with 'completions'." async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: assert not request_func_input.use_beam_search @@ -265,6 +265,9 @@ async def async_request_openai_completions( else: data = json.loads(chunk) + # NOTE: Some completion API might have a last + # usage summary response without a token so we + # want to check a token was generated if data["choices"][0]["text"]: timestamp = time.perf_counter() # First token @@ -273,12 +276,8 @@ async def async_request_openai_completions( output.ttft = ttft # Decoding phase - # NOTE: Some completion API might have a last - # usage summary response without a token so we - # do not want to include as inter-token-latency - elif data.get("usage", None) is None: - output.itl.append(timestamp - - most_recent_timestamp) + output.itl.append(timestamp - + most_recent_timestamp) most_recent_timestamp = timestamp generated_text += data["choices"][0]["text"] @@ -305,8 +304,8 @@ async def async_request_openai_chat_completions( ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith( - "v1/chat/completions" - ), "OpenAI Chat Completions API URL must end with 'v1/chat/completions'." + "chat/completions" + ), "OpenAI Chat Completions API URL must end with 'chat/completions'." async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: assert not request_func_input.use_beam_search @@ -423,4 +422,5 @@ def get_tokenizer( "openai": async_request_openai_completions, "openai-chat": async_request_openai_chat_completions, "tensorrt-llm": async_request_trt_llm, + "scalellm": async_request_openai_completions, } diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 98e0be2779922..8d0554b0f4f05 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -13,6 +13,7 @@ from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptStrictInputs from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.utils import FlexibleArgumentParser def main(args: argparse.Namespace): @@ -24,11 +25,14 @@ def main(args: argparse.Namespace): model=args.model, speculative_model=args.speculative_model, num_speculative_tokens=args.num_speculative_tokens, + speculative_draft_tensor_parallel_size=\ + args.speculative_draft_tensor_parallel_size, tokenizer=args.tokenizer, quantization=args.quantization, tensor_parallel_size=args.tensor_parallel_size, trust_remote_code=args.trust_remote_code, dtype=args.dtype, + max_model_len=args.max_model_len, enforce_eager=args.enforce_eager, kv_cache_dtype=args.kv_cache_dtype, quantization_param_path=args.quantization_param_path, @@ -42,6 +46,7 @@ def main(args: argparse.Namespace): load_format=args.load_format, distributed_executor_backend=args.distributed_executor_backend, otlp_traces_endpoint=args.otlp_traces_endpoint, + enable_prefix_caching=args.enable_prefix_caching, ) sampling_params = SamplingParams( @@ -119,12 +124,16 @@ def run_to_completion(profile_dir: Optional[str] = None): if __name__ == '__main__': - parser = 
argparse.ArgumentParser( + parser = FlexibleArgumentParser( description='Benchmark the latency of processing a single batch of ' 'requests till completion.') parser.add_argument('--model', type=str, default='facebook/opt-125m') parser.add_argument('--speculative-model', type=str, default=None) parser.add_argument('--num-speculative-tokens', type=int, default=None) + parser.add_argument('--speculative-draft-tensor-parallel-size', + '-spec-draft-tp', + type=int, + default=None) parser.add_argument('--tokenizer', type=str, default=None) parser.add_argument('--quantization', '-q', @@ -150,6 +159,12 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument('--trust-remote-code', action='store_true', help='trust remote code from huggingface') + parser.add_argument( + '--max-model-len', + type=int, + default=None, + help='Maximum length of a sequence (including prompt and output). ' + 'If None, will be derived from the model.') parser.add_argument( '--dtype', type=str, @@ -193,9 +208,10 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument( "--device", type=str, - default="cuda", - choices=["cuda", "cpu", "tpu", "xpu"], - help='device type for vLLM execution, supporting CUDA and CPU.') + default="auto", + choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"], + help='device type for vLLM execution, supporting CUDA, OpenVINO and ' + 'CPU.') parser.add_argument('--block-size', type=int, default=16, @@ -205,6 +221,9 @@ def run_to_completion(profile_dir: Optional[str] = None): action='store_true', help='If True, the prefill requests can be chunked based on the ' 'max_num_batched_tokens') + parser.add_argument("--enable-prefix-caching", + action='store_true', + help="Enable automatic prefix caching") parser.add_argument('--use-v2-block-manager', action='store_true') parser.add_argument( "--ray-workers-use-nsight", diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 089966986984f..395107a5ec747 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -1,7 +1,7 @@ -import argparse import time from vllm import LLM, SamplingParams +from vllm.utils import FlexibleArgumentParser PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. 
Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501 @@ -44,7 +44,7 @@ def main(args): if __name__ == "__main__": - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description='Benchmark the performance with or without automatic ' 'prefix caching.') parser.add_argument('--model', diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index eef03e7d81c39..dbcb9743b6773 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -17,7 +17,7 @@ --dataset-path \ --request-rate \ # By default is inf --num-prompts # By default is 1000 - + when using tgi backend, add --endpoint /generate_stream to the end of the command above. @@ -44,6 +44,11 @@ except ImportError: from backend_request_func import get_tokenizer +try: + from vllm.utils import FlexibleArgumentParser +except ImportError: + from argparse import ArgumentParser as FlexibleArgumentParser + @dataclass class BenchmarkMetrics: @@ -72,7 +77,6 @@ def sample_sharegpt_requests( ) -> List[Tuple[str, int, int]]: if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") - # Load the dataset. with open(dataset_path) as f: dataset = json.load(f) @@ -180,6 +184,31 @@ def sample_sonnet_requests( return sampled_requests +def sample_random_requests( + input_len: int, output_len: int, num_prompts: int, range_ratio: float, + tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]: + + input_lens = np.random.randint( + int(input_len * range_ratio), + input_len + 1, + size=num_prompts, + ) + output_lens = np.random.randint( + int(output_len * range_ratio), + output_len + 1, + size=num_prompts, + ) + offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts) + input_requests = [] + for i in range(args.num_prompts): + prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size + for j in range(input_lens[i])]) + input_requests.append( + (prompt, int(input_lens[i]), int(output_lens[i]))) + + return input_requests + + async def get_request( input_requests: List[Tuple[str, int, int]], request_rate: float, @@ -191,6 +220,7 @@ async def get_request( if request_rate == float("inf"): # If the request rate is infinity, then we don't need to wait. continue + # Sample the request interval from the exponential distribution. interval = np.random.exponential(1.0 / request_rate) # The next request will be sent after the interval. 
@@ -214,7 +244,7 @@ def calculate_metrics( # We use the tokenizer to count the number of output tokens for all # serving backends instead of looking at len(outputs[i].itl) since # multiple output tokens may be bundled together - # Note: this may inflate the output token count slightly + # Note : this may inflate the output token count slightly output_len = len( tokenizer(outputs[i].generated_text, add_special_tokens=False).input_ids) @@ -451,6 +481,15 @@ def main(args: argparse.Namespace): for prompt, prompt_formatted, prompt_len, output_len in input_requests] + elif args.dataset_name == "random": + input_requests = sample_random_requests( + input_len=args.input_len, + output_len=args.output_len, + num_prompts=args.num_prompts, + range_ratio=args.range_ratio, + tokenizer=tokenizer, + ) + else: raise ValueError(f"Unknown dataset: {args.dataset_name}") @@ -511,7 +550,7 @@ def main(args: argparse.Namespace): if __name__ == "__main__": - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="Benchmark the online serving throughput.") parser.add_argument( "--backend", @@ -544,7 +583,7 @@ def main(args: argparse.Namespace): "--dataset-name", type=str, default="sharegpt", - choices=["sharegpt", "sonnet"], + choices=["sharegpt", "sonnet", "random"], help="Name of the dataset to benchmark on.", ) parser.add_argument("--dataset-path", @@ -561,7 +600,7 @@ def main(args: argparse.Namespace): "--tokenizer", type=str, help= - "Name or path of the tokenizer, if not using the default tokenizer.", + "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 ) parser.add_argument( "--best-of", @@ -604,6 +643,27 @@ def main(args: argparse.Namespace): help= "Number of prefix tokens per request, used only for sonnet dataset.", ) + parser.add_argument( + "--random-input-len", + type=int, + default=1024, + help= + "Number of input tokens per request, used only for random sampling.", + ) + parser.add_argument( + "--random-output-len", + type=int, + default=128, + help= + "Number of output tokens per request, used only for random sampling.", + ) + parser.add_argument( + "--random-range-ratio", + type=float, + default=1.0, + help="Range of sampled ratio of input/output length, " + "used only for random sampling.", + ) parser.add_argument( "--request-rate", type=float, diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index ed65002bc7d3c..a52e67bbbe7e3 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -12,6 +12,7 @@ from vllm.engine.arg_utils import EngineArgs from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.utils import FlexibleArgumentParser def sample_requests( @@ -261,7 +262,7 @@ def main(args: argparse.Namespace): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Benchmark the throughput.") + parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser.add_argument("--backend", type=str, choices=["vllm", "hf", "mii"], @@ -348,9 +349,10 @@ def main(args: argparse.Namespace): parser.add_argument( "--device", type=str, - default="cuda", - choices=["cuda", "cpu", "tpu", "xpu"], - help='device type for vLLM execution, supporting CUDA and CPU.') + default="auto", + choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"], + help='device type for vLLM execution, supporting CUDA, OpenVINO and ' + 'CPU.') parser.add_argument( "--enable-prefix-caching", action='store_true', diff --git 
a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 5cc0fbbd49b8e..377f8683c021f 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -11,6 +11,7 @@ from weight_shapes import WEIGHT_SHAPES from vllm import _custom_ops as ops +from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:] DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] @@ -293,7 +294,7 @@ def to_torch_dtype(dt): return torch.float8_e4m3fn raise ValueError("unsupported dtype") - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description=""" Benchmark Cutlass GEMM. diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index ac6a9f297f95a..601c4ea439aea 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -1,4 +1,3 @@ -import argparse import os import sys from typing import Optional @@ -10,6 +9,7 @@ from vllm.model_executor.layers.quantization.aqlm import ( dequantize_weight, generic_dequantize_gemm, get_int_dtype, optimized_dequantize_gemm) +from vllm.utils import FlexibleArgumentParser os.environ['CUDA_VISIBLE_DEVICES'] = '0' @@ -137,7 +137,7 @@ def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None: def main(): - parser = argparse.ArgumentParser(description="Benchmark aqlm performance.") + parser = FlexibleArgumentParser(description="Benchmark aqlm performance.") # Add arguments parser.add_argument("--nbooks", diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index 96f01967b351e..261f5829631ee 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -1,4 +1,3 @@ -import argparse from typing import List import torch @@ -16,6 +15,7 @@ MarlinWorkspace, marlin_24_quantize, marlin_quantize) from vllm.model_executor.layers.quantization.utils.quant_utils import ( gptq_pack, quantize_weights, sort_weights) +from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"] DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] @@ -211,7 +211,7 @@ def main(args): # python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501 # if __name__ == "__main__": - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="Benchmark Marlin across specified models/shapes/batches") parser.add_argument( "--models", diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 62347aaf8ed6d..e00696d6d43cb 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -10,6 +10,7 @@ from transformers import AutoConfig from vllm.model_executor.layers.fused_moe.fused_moe import * +from vllm.utils import FlexibleArgumentParser class BenchmarkConfig(TypedDict): @@ -315,7 +316,7 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]: if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = FlexibleArgumentParser() parser.add_argument("--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1") diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 687e2369b758c..16de60477c305 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ 
b/benchmarks/kernels/benchmark_paged_attention.py @@ -1,4 +1,3 @@ -import argparse import random import time from typing import List, Optional @@ -6,7 +5,8 @@ import torch from vllm import _custom_ops as ops -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser, + create_kv_caches_with_random) NUM_BLOCKS = 1024 PARTITION_SIZE = 512 @@ -161,7 +161,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: if __name__ == '__main__': - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="Benchmark the paged attention kernel.") parser.add_argument("--version", type=str, diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index a53c6c77a5828..78736c7a7ba6f 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -1,4 +1,3 @@ -import argparse from itertools import accumulate from typing import List, Optional @@ -7,6 +6,7 @@ from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding, get_rope) +from vllm.utils import FlexibleArgumentParser def benchmark_rope_kernels_multi_lora( @@ -86,7 +86,7 @@ def benchmark_rope_kernels_multi_lora( if __name__ == '__main__': - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="Benchmark the rotary embedding kernels.") parser.add_argument("--is-neox-style", type=bool, default=True) parser.add_argument("--batch-size", type=int, default=16) diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py index c846e47de1fcf..203699e9a8d06 100644 --- a/benchmarks/overheads/benchmark_hashing.py +++ b/benchmarks/overheads/benchmark_hashing.py @@ -1,8 +1,8 @@ -import argparse import cProfile import pstats from vllm import LLM, SamplingParams +from vllm.utils import FlexibleArgumentParser # A very long prompt, total number of tokens is about 15k. LONG_PROMPT = ["You are an expert in large language models, aren't you?" 
@@ -47,7 +47,7 @@ def main(args): if __name__ == "__main__": - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description='Benchmark the performance of hashing function in' 'automatic prefix caching.') parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k') diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 511e443f78403..690559ee265e9 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -46,6 +46,8 @@ is_avx512_disabled(AVX512_DISABLED) find_isa(${CPUINFO} "avx2" AVX2_FOUND) find_isa(${CPUINFO} "avx512f" AVX512_FOUND) +find_isa(${CPUINFO} "POWER10" POWER10_FOUND) +find_isa(${CPUINFO} "POWER9" POWER9_FOUND) if (AVX512_FOUND AND NOT AVX512_DISABLED) list(APPEND CXX_COMPILE_FLAGS @@ -68,8 +70,15 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED) elseif (AVX2_FOUND) list(APPEND CXX_COMPILE_FLAGS "-mavx2") message(WARNING "vLLM CPU backend using AVX2 ISA") +elseif (POWER9_FOUND OR POWER10_FOUND) + message(STATUS "PowerPC detected") + # Check for PowerPC VSX support + list(APPEND CXX_COMPILE_FLAGS + "-mvsx" + "-mcpu=native" + "-mtune=native") else() - message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 ISA support.") + message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.") endif() message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 071e16336dfa2..4869cad541135 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -147,19 +147,23 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) if (${GPU_LANG} STREQUAL "HIP") # # `GPU_ARCHES` controls the `--offload-arch` flags. - # `CMAKE_HIP_ARCHITECTURES` is set up by torch and can be controlled - # via the `PYTORCH_ROCM_ARCH` env variable. # - + # If PYTORCH_ROCM_ARCH env variable exists, then we take it as a list, + # if not, then we use CMAKE_HIP_ARCHITECTURES which was generated by calling + # "rocm_agent_enumerator" in "enable_language(HIP)" + # (in file Modules/CMakeDetermineHIPCompiler.cmake) + # + if(DEFINED ENV{PYTORCH_ROCM_ARCH}) + set(HIP_ARCHITECTURES $ENV{PYTORCH_ROCM_ARCH}) + else() + set(HIP_ARCHITECTURES ${CMAKE_HIP_ARCHITECTURES}) + endif() # # Find the intersection of the supported + detected architectures to # set the module architecture flags. # - - set(VLLM_ROCM_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100") - set(${GPU_ARCHES}) - foreach (_ARCH ${VLLM_ROCM_SUPPORTED_ARCHS}) + foreach (_ARCH ${HIP_ARCHITECTURES}) if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST) list(APPEND ${GPU_ARCHES} ${_ARCH}) endif() @@ -167,7 +171,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) if(NOT ${GPU_ARCHES}) message(FATAL_ERROR - "None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is" + "None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is" " supported. 
Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.") endif() diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 86ac2e75e78ee..5ed1dc3b8f792 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -135,6 +135,12 @@ __device__ __forceinline__ T gelu_fast_kernel(const T& x) { return ((T)0.5) * x * (((T)1.0) + t); } +template +__device__ __forceinline__ T gelu_quick_kernel(const T& x) { + // x * sigmoid(1.702 * x) + return (T)(((float)x) / (1.0f + expf(-1.702f * (float)x))); +} + } // namespace vllm void gelu_new(torch::Tensor& out, // [..., d] @@ -148,3 +154,9 @@ void gelu_fast(torch::Tensor& out, // [..., d] { LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel); } + +void gelu_quick(torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., d] +{ + LAUNCH_ACTIVATION_KERNEL(vllm::gelu_quick_kernel); +} diff --git a/csrc/cpu/activation.cpp b/csrc/cpu/activation.cpp index becd2ac42f17a..039b8d5c30d46 100644 --- a/csrc/cpu/activation.cpp +++ b/csrc/cpu/activation.cpp @@ -59,6 +59,13 @@ FORCE_INLINE vec_op::FP32Vec8 gelu_fast_act(const vec_op::FP32Vec8& x) { return w3 * x * (ones + t); } +FORCE_INLINE vec_op::FP32Vec8 gelu_quick_act(const vec_op::FP32Vec8& x) { + const vec_op::FP32Vec8 zeros(0.0); + const vec_op::FP32Vec8 ones(1.0); + const vec_op::FP32Vec8 w1(1.702f); + return x / (ones + (zeros - w1 * x).exp()); +} + FORCE_INLINE vec_op::FP32Vec8 gelu_act(const vec_op::FP32Vec8& x) { const vec_op::FP32Vec8 ones(1.0); const vec_op::FP32Vec8 w1(M_SQRT1_2); @@ -142,3 +149,15 @@ void gelu_fast(torch::Tensor& out, torch::Tensor& input) { CPU_KERNEL_GUARD_OUT(gelu_fast_impl) }); } + +void gelu_quick(torch::Tensor& out, torch::Tensor& input) { + int num_tokens = input.numel() / input.size(-1); + int d = input.size(-1); + + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_quick_impl", [&] { + CPU_KERNEL_GUARD_IN(gelu_quick_impl) + activation_kernel( + num_tokens, d, input.data_ptr(), out.data_ptr()); + CPU_KERNEL_GUARD_OUT(gelu_quick_impl) + }); +} diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp index d7621aaae81c9..0213be09105ed 100644 --- a/csrc/cpu/cpu_types.hpp +++ b/csrc/cpu/cpu_types.hpp @@ -2,514 +2,14 @@ #ifndef CPU_TYPES_HPP #define CPU_TYPES_HPP -#include -#include - -#ifndef __AVX2__ -static_assert(false, "AVX2 must be supported for the current implementation."); -#endif - -namespace vec_op { - -// FIXME: FP16 is not fully supported in Torch-CPU -#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) - -#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ - AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) - -#ifndef CPU_OP_GUARD -#define CPU_KERNEL_GUARD_IN(NAME) -#define CPU_KERNEL_GUARD_OUT(NAME) -#else -#define CPU_KERNEL_GUARD_IN(NAME) \ - std::cout << #NAME << " invoked." << std::endl; -#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." 
<< std::endl; -#endif - -#define FORCE_INLINE __attribute__((always_inline)) inline - -namespace { -template -constexpr void unroll_loop_item(std::integer_sequence, F &&f) { - (f(std::integral_constant{}), ...); -} -}; // namespace - -template >> -constexpr void unroll_loop(F &&f) { - unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); -} - -template struct Vec { - constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } -}; - -struct FP32Vec8; -struct FP32Vec16; - -#ifdef __AVX512FP16__ -struct FP16Vec8 : public Vec { - constexpr static int VEC_ELEM_NUM = 8; - - __m128h reg; - - explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {} - - explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {} - - explicit FP16Vec8(__m128h data) : reg(data) {} - - FP16Vec8 operator*(const FP16Vec8 &b) const { - return FP16Vec8(_mm_mul_ph(reg, b.reg)); - } - - FP16Vec8 operator+(const FP16Vec8 &b) const { - return FP16Vec8(_mm_add_ph(reg, b.reg)); - } - - FP16Vec8 operator-(const FP16Vec8 &b) const { - return FP16Vec8(_mm_sub_ph(reg, b.reg)); - } - - FP16Vec8 operator/(const FP16Vec8 &b) const { - return FP16Vec8(_mm_div_ph(reg, b.reg)); - } - - void save(void *ptr) const { _mm_storeu_ph(ptr, reg); } -}; -#endif - -struct BF16Vec8 : public Vec { - constexpr static int VEC_ELEM_NUM = 8; - - __m128i reg; - - explicit BF16Vec8(const void *ptr) - : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} - - explicit BF16Vec8(const FP32Vec8 &); - - void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } -}; - -struct BF16Vec16 : public Vec { - constexpr static int VEC_ELEM_NUM = 16; - - __m256i reg; - - explicit BF16Vec16(const void *ptr) - : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} - - explicit BF16Vec16(const FP32Vec16 &); - - void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } -}; - -#ifdef __AVX512F__ -struct BF16Vec32 : public Vec { - constexpr static int VEC_ELEM_NUM = 32; - - __m512i reg; - - explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {} - - explicit BF16Vec32(__m512i data) : reg(data) {} - - explicit BF16Vec32(BF16Vec8 &vec8_data) - : reg((__m512i)_mm512_inserti32x4( - _mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512( - (__m128i)vec8_data.reg), - (__m128i)vec8_data.reg, 1), - (__m128i)vec8_data.reg, 2), - (__m128i)vec8_data.reg, 3)) {} - - void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; } -}; -#else -struct BF16Vec32 : public Vec { - constexpr static int VEC_ELEM_NUM = 32; - - __m256i reg_low; - __m256i reg_high; - - explicit BF16Vec32(const void *ptr) - : reg_low(_mm256_loadu_si256((__m256i const *)ptr)), - reg_high(_mm256_loadu_si256((__m256i const *)ptr + 1)) {} - - explicit BF16Vec32(__m256i low, __m256i high) : reg_low(low), - reg_high(high) {} - - explicit BF16Vec32(BF16Vec8 &vec8_data) - : reg_low((__m256i)_mm256_inserti32x4( - _mm256_castsi128_si256((__m128i)vec8_data.reg), - (__m128i)vec8_data.reg, 1)), - reg_high((__m256i)_mm256_inserti32x4( - _mm256_castsi128_si256((__m128i)vec8_data.reg), - (__m128i)vec8_data.reg, 1)) {} - - void save(void *ptr) const { - *reinterpret_cast<__m256i *>(ptr) = reg_low; - *reinterpret_cast<__m256i *>((__m256i *)ptr + 1) = reg_high; - } -}; -#endif - -struct FP32Vec4 : public Vec { - constexpr static int VEC_ELEM_NUM = 4; - union AliasReg { - __m128 reg; - float values[VEC_ELEM_NUM]; - }; - - __m128 reg; - - explicit FP32Vec4(float v) : reg(_mm_set1_ps(v)) {} - - explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {} - - explicit 
FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {} - - explicit FP32Vec4(__m128 data) : reg(data) {} - - explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} -}; - -struct FP32Vec8 : public Vec { - constexpr static int VEC_ELEM_NUM = 8; - union AliasReg { - __m256 reg; - float values[VEC_ELEM_NUM]; - }; - - __m256 reg; - - explicit FP32Vec8(float v) : reg(_mm256_set1_ps(v)) {} - - explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {} - - explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {} - - explicit FP32Vec8(__m256 data) : reg(data) {} - - explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {} - -#ifdef __AVX512FP16__ - explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {} -#endif - - explicit FP32Vec8(const BF16Vec8 &v) - : reg(_mm256_castsi256_ps( - _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {} - - float reduce_sum() const { - AliasReg ar; - ar.reg = reg; - float result = 0; - unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); - - return result; - } - - FP32Vec8 exp() const { - AliasReg ar; - ar.reg = reg; - return FP32Vec8(_mm256_set_ps(expf(ar.values[7]), expf(ar.values[6]), - expf(ar.values[5]), expf(ar.values[4]), - expf(ar.values[3]), expf(ar.values[2]), - expf(ar.values[1]), expf(ar.values[0]))); - } - - FP32Vec8 tanh() const { - AliasReg ar; - ar.reg = reg; - return FP32Vec8(_mm256_set_ps(tanhf(ar.values[7]), tanhf(ar.values[6]), - tanhf(ar.values[5]), tanhf(ar.values[4]), - tanhf(ar.values[3]), tanhf(ar.values[2]), - tanhf(ar.values[1]), tanhf(ar.values[0]))); - } - - FP32Vec8 er() const { - AliasReg ar; - ar.reg = reg; - return FP32Vec8(_mm256_set_ps(erf(ar.values[7]), erf(ar.values[6]), - erf(ar.values[5]), erf(ar.values[4]), - erf(ar.values[3]), erf(ar.values[2]), - erf(ar.values[1]), erf(ar.values[0]))); - } - - FP32Vec8 operator*(const FP32Vec8 &b) const { - return FP32Vec8(_mm256_mul_ps(reg, b.reg)); - } - - FP32Vec8 operator+(const FP32Vec8 &b) const { - return FP32Vec8(_mm256_add_ps(reg, b.reg)); - } - - FP32Vec8 operator-(const FP32Vec8 &b) const { - return FP32Vec8(_mm256_sub_ps(reg, b.reg)); - } - - FP32Vec8 operator/(const FP32Vec8 &b) const { - return FP32Vec8(_mm256_div_ps(reg, b.reg)); - } - - void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); } -}; - -#ifdef __AVX512F__ -struct FP32Vec16 : public Vec { - constexpr static int VEC_ELEM_NUM = 16; - union AliasReg { - __m512 reg; - float values[VEC_ELEM_NUM]; - }; - - __m512 reg; - - explicit FP32Vec16(float v) : reg(_mm512_set1_ps(v)) {} - - explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {} - - explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {} - - explicit FP32Vec16(__m512 data) : reg(data) {} - - explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {} - - explicit FP32Vec16(const FP32Vec4 &data) - : reg((__m512)_mm512_inserti32x4( - _mm512_inserti32x4( - _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg), - (__m128i)data.reg, 1), - (__m128i)data.reg, 2), - (__m128i)data.reg, 3)) {} - - explicit FP32Vec16(const FP32Vec8 &data) - : reg((__m512)_mm512_inserti32x8( - _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {} - - explicit FP32Vec16(const BF16Vec16 &v) - : reg(_mm512_castsi512_ps( - _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {} - - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} - - FP32Vec16 operator*(const FP32Vec16 &b) const { - return FP32Vec16(_mm512_mul_ps(reg, b.reg)); - } - - FP32Vec16 operator+(const FP32Vec16 &b) const { - return 
FP32Vec16(_mm512_add_ps(reg, b.reg)); - } - - FP32Vec16 operator-(const FP32Vec16 &b) const { - return FP32Vec16(_mm512_sub_ps(reg, b.reg)); - } - - FP32Vec16 operator/(const FP32Vec16 &b) const { - return FP32Vec16(_mm512_div_ps(reg, b.reg)); - } - - float reduce_sum() const { return _mm512_reduce_add_ps(reg); } - - template float reduce_sub_sum(int idx) { - static_assert(VEC_ELEM_NUM % group_size == 0); - constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); - __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size)); - return _mm512_mask_reduce_add_ps(mask, reg); - } - - void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); } -}; +#if defined(__x86_64__) + //x86 implementation + #include "cpu_types_x86.hpp" +#elif defined(__POWER9_VECTOR__) + //ppc implementation + #include "cpu_types_vsx.hpp" #else -struct FP32Vec16 : public Vec { - constexpr static int VEC_ELEM_NUM = 16; - - union AliasReg { - __m256 reg; - float values[8]; - }; - - __m256 reg_low; - __m256 reg_high; - - explicit FP32Vec16(float v) : reg_low(_mm256_set1_ps(v)), - reg_high(_mm256_set1_ps(v)) {} - - explicit FP32Vec16() : reg_low(_mm256_set1_ps(0.0)), - reg_high(_mm256_set1_ps(0.0)) {} - - explicit FP32Vec16(const float *ptr) : reg_low(_mm256_loadu_ps(ptr)), - reg_high(_mm256_loadu_ps(ptr + 8)) {} - - explicit FP32Vec16(__m256 low, __m256 high) : reg_low(low), reg_high(high) {} - - explicit FP32Vec16(const FP32Vec16 &data) : reg_low(data.reg_low), - reg_high(data.reg_high) {} - - explicit FP32Vec16(const FP32Vec4 &data) - : reg_low((__m256)_mm256_inserti128_si256( - _mm256_castsi128_si256((__m128i)data.reg), - (__m128i)data.reg, 1)), - reg_high((__m256)_mm256_inserti128_si256( - _mm256_castsi128_si256((__m128i)data.reg), - (__m128i)data.reg, 1)) {} - - explicit FP32Vec16(const FP32Vec8 &data) - : reg_low(data.reg), reg_high(data.reg) {} - - explicit FP32Vec16(const BF16Vec16 &v) { - __m128i low = _mm256_extractf128_si256(v.reg, 0); - __m128i high = _mm256_extractf128_si256(v.reg, 1); - - __m256i v_low_epi32 = _mm256_cvtepu16_epi32(low); - __m256i v_high_epi32 = _mm256_cvtepu16_epi32(high); - - __m256i v_low_shifted = _mm256_bslli_epi128(v_low_epi32, 2); - __m256i v_high_shifted = _mm256_bslli_epi128(v_high_epi32, 2); - - reg_low = _mm256_castsi256_ps(v_low_shifted); - reg_high = _mm256_castsi256_ps(v_high_shifted); - } - - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} - - FP32Vec16 operator*(const FP32Vec16 &b) const { - return FP32Vec16(_mm256_mul_ps(reg_low, b.reg_low), - _mm256_mul_ps(reg_high, b.reg_high)); - } - - FP32Vec16 operator+(const FP32Vec16 &b) const { - return FP32Vec16(_mm256_add_ps(reg_low, b.reg_low), - _mm256_add_ps(reg_high, b.reg_high)); - } - - FP32Vec16 operator-(const FP32Vec16 &b) const { - return FP32Vec16(_mm256_sub_ps(reg_low, b.reg_low), - _mm256_sub_ps(reg_high, b.reg_high)); - } - - FP32Vec16 operator/(const FP32Vec16 &b) const { - return FP32Vec16(_mm256_div_ps(reg_low, b.reg_low), - _mm256_div_ps(reg_high, b.reg_high)); - } - - float reduce_sum() const { - FP32Vec8 low = FP32Vec8(reg_low); - FP32Vec8 high = FP32Vec8(reg_high); - return low.reduce_sum() + high.reduce_sum(); - } - - template float reduce_sub_sum(int idx) { - float sum = 0.0; - static_assert(VEC_ELEM_NUM % group_size == 0); - constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); - uint32_t mask = base_mask << (idx * group_size); - - AliasReg ar; - - auto func = [&sum, &mask, &ar](int i) { - int flag = mask & 0x1; - mask = mask >> 1; - if (flag != 0) sum += ar.values[i]; - }; - 
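// Illustrative scalar model (a sketch, not part of the diff above) of what
// FP32Vec16::reduce_sub_sum<group_size>(idx) computes: the masked reduction
// sums the contiguous group of group_size lanes starting at lane
// idx * group_size. The helper name below is hypothetical.
#include <cstdint>

static float reduce_sub_sum_ref(const float (&v)[16], int group_size, int idx) {
  // Same mask construction as the vectorized code: group_size low bits set,
  // shifted up to the selected group.
  uint32_t mask = (0xFFFFu >> (16 - group_size)) << (idx * group_size);
  float sum = 0.0f;
  for (int i = 0; i < 16; ++i) {
    if (mask & (1u << i)) sum += v[i];
  }
  return sum;
}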
- ar.reg = reg_low; - unroll_loop(func); - - ar.reg = reg_high; - unroll_loop(func); - - return sum; - } - - void save(float *ptr) const { - _mm256_storeu_ps(ptr, reg_low); - _mm256_storeu_ps(ptr + 8, reg_high); - } -}; -#endif - -template struct VecType { using vec_type = void; }; - -template using vec_t = typename VecType::vec_type; - -template <> struct VecType { using vec_type = FP32Vec8; }; - -#ifdef __AVX512FP16__ -template <> struct VecType { using vec_type = FP16Vec16; }; + #warning "unsupported vLLM cpu implementation" #endif -template <> struct VecType { using vec_type = BF16Vec8; }; - -template void storeFP32(float v, T *ptr) { *ptr = v; } - -#ifdef __AVX512FP16__ -template <> inline void storeFP32(float v, c10::Half *ptr) { - *reinterpret_cast<_Float16 *>(ptr) = v; -} -#endif - -inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { - acc = acc + a * b; -} - -#ifdef __AVX512BF16__ -template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { - *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v); -} - -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) - : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {} - -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) - : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {} - -inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { - acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg); -} -#else -template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { - c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = - reinterpret_cast(&v); - *ptr = *(v_ptr + 1); -} - -#ifdef __AVX512F__ -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) - : reg(_mm256_cvtepi32_epi16( - _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {} - -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) - : reg(_mm512_cvtepi32_epi16( - _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {} -#else -namespace{ -__m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) { - __m256i ai = _mm256_castps_si256(a); - ai = _mm256_srli_epi32(ai, 16); - ai = _mm256_packus_epi32(ai, ai); - ai = _mm256_permute4x64_epi64(ai, 0b00111001); - return _mm256_extracti128_si256(ai, 0); -} -} - -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) - : reg(FP32Vec8_to_BF16Vec8_avx2(v.reg)) {} - -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { - BF16Vec8 low = BF16Vec8(FP32Vec8(v.reg_low)); - BF16Vec8 high = BF16Vec8(FP32Vec8(v.reg_high)); - reg = _mm256_insertf128_si256(_mm256_castsi128_si256(low.reg), high.reg, 1); -} -#endif // __AVX512F__ -#endif // __AVX512BF16__ - -inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); } - -}; // namespace vec_op - #endif diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index a2bf0d49adba5..39e8cf3ed3c10 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -58,6 +58,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("gelu_fast(Tensor! out, Tensor input) -> ()"); ops.impl("gelu_fast", torch::kCPU, &gelu_fast); + // Quick GELU implementation. + ops.def("gelu_quick(Tensor! out, Tensor input) -> ()"); + ops.impl("gelu_quick", torch::kCPU, &gelu_quick); + // Layernorm // Apply Root Mean Square (RMS) Normalization to the input tensor. 
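// Scalar reference (an illustrative sketch, not part of the diff above) for
// the gelu_quick op registered here and implemented in the CUDA and CPU
// kernels earlier in this patch:
//   quick_gelu(x) = x * sigmoid(1.702 * x) = x / (1 + exp(-1.702 * x)).
#include <cmath>
#include <cstdio>

static float quick_gelu_ref(float x) {
  return x / (1.0f + std::exp(-1.702f * x));
}

int main() {
  const float xs[] = {-2.0f, -0.5f, 0.0f, 0.5f, 2.0f};
  for (float x : xs) {
    std::printf("quick_gelu(% .2f) = % .6f\n", x, quick_gelu_ref(x));
  }
  return 0;
}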
ops.def( diff --git a/csrc/ops.h b/csrc/ops.h index 9e2e977fa3c2e..fb1099e4fe0c2 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -1,5 +1,6 @@ #pragma once +#include #include void paged_attention_v1( @@ -49,6 +50,8 @@ void gelu_new(torch::Tensor& out, torch::Tensor& input); void gelu_fast(torch::Tensor& out, torch::Tensor& input); +void gelu_quick(torch::Tensor& out, torch::Tensor& input); + #ifndef USE_ROCM torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, const torch::Tensor& codebooks, @@ -90,9 +93,17 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm, int64_t size_k, int64_t size_n, int64_t num_bits); +torch::Tensor fp8_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, + torch::Tensor& b_scales, torch::Tensor& workspace, + int64_t num_bits, int64_t size_m, int64_t size_n, + int64_t size_k); + +bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability); + void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, - torch::Tensor const& b_scales); + torch::Tensor const& b_scales, + c10::optional const& bias); #endif diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h old mode 100755 new mode 100644 index c38db2dcd2c4d..2c8d007d8719f --- a/csrc/punica/bgmv/bgmv_config.h +++ b/csrc/punica/bgmv/bgmv_config.h @@ -16,15 +16,20 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 512) \ f(in_T, out_T, W_T, narrow, 640) \ f(in_T, out_T, W_T, narrow, 768) \ + f(in_T, out_T, W_T, narrow, 896) \ f(in_T, out_T, W_T, narrow, 1024) \ f(in_T, out_T, W_T, narrow, 1152) \ + f(in_T, out_T, W_T, narrow, 1216) \ f(in_T, out_T, W_T, narrow, 1280) \ f(in_T, out_T, W_T, narrow, 1536) \ f(in_T, out_T, W_T, narrow, 1664) \ f(in_T, out_T, W_T, narrow, 1728) \ f(in_T, out_T, W_T, narrow, 1792) \ f(in_T, out_T, W_T, narrow, 2048) \ + f(in_T, out_T, W_T, narrow, 2240) \ f(in_T, out_T, W_T, narrow, 2304) \ + f(in_T, out_T, W_T, narrow, 2368) \ + f(in_T, out_T, W_T, narrow, 2432) \ f(in_T, out_T, W_T, narrow, 2560) \ f(in_T, out_T, W_T, narrow, 2752) \ f(in_T, out_T, W_T, narrow, 2816) \ @@ -32,8 +37,12 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 3328) \ f(in_T, out_T, W_T, narrow, 3456) \ f(in_T, out_T, W_T, narrow, 3584) \ + f(in_T, out_T, W_T, narrow, 3712) \ f(in_T, out_T, W_T, narrow, 4096) \ + f(in_T, out_T, W_T, narrow, 4480) \ f(in_T, out_T, W_T, narrow, 4608) \ + f(in_T, out_T, W_T, narrow, 4736) \ + f(in_T, out_T, W_T, narrow, 4864) \ f(in_T, out_T, W_T, narrow, 5120) \ f(in_T, out_T, W_T, narrow, 5504) \ f(in_T, out_T, W_T, narrow, 5632) \ @@ -43,8 +52,11 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 6848) \ f(in_T, out_T, W_T, narrow, 6912) \ f(in_T, out_T, W_T, narrow, 7168) \ + f(in_T, out_T, W_T, narrow, 7424) \ f(in_T, out_T, W_T, narrow, 8192) \ + f(in_T, out_T, W_T, narrow, 8960) \ f(in_T, out_T, W_T, narrow, 9216) \ + f(in_T, out_T, W_T, narrow, 9472) \ f(in_T, out_T, W_T, narrow, 10240) \ f(in_T, out_T, W_T, narrow, 11008) \ f(in_T, out_T, W_T, narrow, 11264) \ @@ -52,8 +64,11 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 13696) \ f(in_T, out_T, W_T, narrow, 13824) \ f(in_T, out_T, W_T, narrow, 14336) \ + f(in_T, out_T, W_T, narrow, 14784) \ + f(in_T, out_T, W_T, narrow, 14848) \ f(in_T, out_T, W_T, narrow, 15360) \ f(in_T, out_T, W_T, 
narrow, 16384) \ + f(in_T, out_T, W_T, narrow, 18944) \ f(in_T, out_T, W_T, narrow, 20480) \ f(in_T, out_T, W_T, narrow, 22016) \ f(in_T, out_T, W_T, narrow, 22528) \ @@ -61,6 +76,8 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 27392) \ f(in_T, out_T, W_T, narrow, 27648) \ f(in_T, out_T, W_T, narrow, 28672) \ + f(in_T, out_T, W_T, narrow, 29568) \ + f(in_T, out_T, W_T, narrow, 29696) \ f(in_T, out_T, W_T, narrow, 32000) \ f(in_T, out_T, W_T, narrow, 32256) \ f(in_T, out_T, W_T, narrow, 32512) \ @@ -69,6 +86,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 36864) \ f(in_T, out_T, W_T, narrow, 43264) \ f(in_T, out_T, W_T, narrow, 49152) \ + f(in_T, out_T, W_T, narrow, 49408) \ f(in_T, out_T, W_T, narrow, 60544) \ f(in_T, out_T, W_T, narrow, 60672) \ f(in_T, out_T, W_T, narrow, 64000) \ @@ -85,9 +103,9 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, // Keep above in sync with vllm/lora/layers::LogitsProcessorWithLoRA // and vllm/tests/lora/test_punica.py -// Used for defining kernels going from the variety of +// Used for defining kernels going from the variety of // dim in to the narrow dim out - // Using it for the fully sharded column + // Using it for the fully sharded column // parallel LoRA A which splits the rank dim #define FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, narrow) \ f(in_T, out_T, W_T, 128, narrow) \ @@ -95,15 +113,20 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 512, narrow) \ f(in_T, out_T, W_T, 640, narrow) \ f(in_T, out_T, W_T, 768, narrow) \ + f(in_T, out_T, W_T, 896, narrow) \ f(in_T, out_T, W_T, 1024, narrow) \ f(in_T, out_T, W_T, 1152, narrow) \ + f(in_T, out_T, W_T, 1216, narrow) \ f(in_T, out_T, W_T, 1280, narrow) \ f(in_T, out_T, W_T, 1536, narrow) \ f(in_T, out_T, W_T, 1664, narrow) \ f(in_T, out_T, W_T, 1728, narrow) \ f(in_T, out_T, W_T, 1792, narrow) \ f(in_T, out_T, W_T, 2048, narrow) \ + f(in_T, out_T, W_T, 2240, narrow) \ f(in_T, out_T, W_T, 2304, narrow) \ + f(in_T, out_T, W_T, 2368, narrow) \ + f(in_T, out_T, W_T, 2432, narrow) \ f(in_T, out_T, W_T, 2560, narrow) \ f(in_T, out_T, W_T, 2752, narrow) \ f(in_T, out_T, W_T, 2816, narrow) \ @@ -111,8 +134,12 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 3328, narrow) \ f(in_T, out_T, W_T, 3456, narrow) \ f(in_T, out_T, W_T, 3584, narrow) \ + f(in_T, out_T, W_T, 3712, narrow) \ f(in_T, out_T, W_T, 4096, narrow) \ + f(in_T, out_T, W_T, 4480, narrow) \ f(in_T, out_T, W_T, 4608, narrow) \ + f(in_T, out_T, W_T, 4736, narrow) \ + f(in_T, out_T, W_T, 4864, narrow) \ f(in_T, out_T, W_T, 5120, narrow) \ f(in_T, out_T, W_T, 5504, narrow) \ f(in_T, out_T, W_T, 5632, narrow) \ @@ -122,8 +149,11 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 6848, narrow) \ f(in_T, out_T, W_T, 6912, narrow) \ f(in_T, out_T, W_T, 7168, narrow) \ + f(in_T, out_T, W_T, 7424, narrow) \ f(in_T, out_T, W_T, 8192, narrow) \ + f(in_T, out_T, W_T, 8960, narrow) \ f(in_T, out_T, W_T, 9216, narrow) \ + f(in_T, out_T, W_T, 9472, narrow) \ f(in_T, out_T, W_T, 10240, narrow) \ f(in_T, out_T, W_T, 11008, narrow) \ f(in_T, out_T, W_T, 11264, narrow) \ @@ -131,8 +161,11 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 13696, narrow) \ f(in_T, out_T, W_T, 13824, narrow) \ f(in_T, out_T, W_T, 14336, narrow) \ + f(in_T, out_T, W_T, 14784, narrow) \ + f(in_T, 
out_T, W_T, 14848, narrow) \ f(in_T, out_T, W_T, 15360, narrow) \ f(in_T, out_T, W_T, 16384, narrow) \ + f(in_T, out_T, W_T, 18944, narrow) \ f(in_T, out_T, W_T, 20480, narrow) \ f(in_T, out_T, W_T, 22016, narrow) \ f(in_T, out_T, W_T, 22528, narrow) \ @@ -140,6 +173,8 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 27392, narrow) \ f(in_T, out_T, W_T, 27648, narrow) \ f(in_T, out_T, W_T, 28672, narrow) \ + f(in_T, out_T, W_T, 29568, narrow) \ + f(in_T, out_T, W_T, 29696, narrow) \ f(in_T, out_T, W_T, 32000, narrow) \ f(in_T, out_T, W_T, 32256, narrow) \ f(in_T, out_T, W_T, 32512, narrow) \ @@ -148,6 +183,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 36864, narrow) \ f(in_T, out_T, W_T, 43264, narrow) \ f(in_T, out_T, W_T, 49152, narrow) \ + f(in_T, out_T, W_T, 49408, narrow) \ f(in_T, out_T, W_T, 60544, narrow) \ f(in_T, out_T, W_T, 60672, narrow) \ f(in_T, out_T, W_T, 64000, narrow) \ diff --git a/csrc/quantization/cutlass_w8a8/common.hpp b/csrc/quantization/cutlass_w8a8/common.hpp index 999b7b251ab33..bf04bb400790f 100644 --- a/csrc/quantization/cutlass_w8a8/common.hpp +++ b/csrc/quantization/cutlass_w8a8/common.hpp @@ -1,6 +1,7 @@ #pragma once #include "cutlass/cutlass.h" +#include /** * Helper function for checking CUTLASS errors @@ -10,3 +11,17 @@ TORCH_CHECK(status == cutlass::Status::kSuccess, \ cutlassGetStatusString(status)) \ } + +inline uint32_t next_pow_2(uint32_t const num) { + if (num <= 1) return num; + return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); +} + +inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { + int max_shared_mem_per_block_opt_in = 0; + cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in, + cudaDevAttrMaxSharedMemoryPerBlockOptin, + device); + return max_shared_mem_per_block_opt_in; +} + diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu index 7651268dc5316..6ce25c5ac897b 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu @@ -77,24 +77,12 @@ struct enable_sm89_to_sm90 : Kernel { }; /* - This epilogue function defines a quantized GEMM operation similar to - torch._scaled_mm. - - A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or - per-row. B can be quantized per-tensor or per-column. - Any combination of per-tensor and per-row or column is supported. - A and B must have symmetric quantization (zero point == 0). - - So the GEMM operation is D = (a_scales * A) (b_scales * B), where the - scales are applied elementwise with numpy-style broadcasting. - - ScaleA and ScaleB define the epilogue functions that apply the scales for - the A and B operands respectively. These scales may be either per-tensor or - per row or column. -*/ + * This class provides the common ScaleA and ScaleB descriptors for the + * ScaledEpilogue and ScaledEpilogueBias classes. + */ template -struct ScaledEpilogue { - private: +struct ScaledEpilogueBase { + protected: using Accum = cutlass::epilogue::threadblock::VisitorAccFetch; using ScaleA = cutlass::epilogue::threadblock::VisitorColOrScalarBroadcast< @@ -102,6 +90,32 @@ struct ScaledEpilogue { using ScaleB = cutlass::epilogue::threadblock::VisitorRowOrScalarBroadcast< OutputTileThreadMap, float, Stride, Int<1>, Int<0>>>; +}; + +/* + This epilogue function defines a quantized GEMM operation similar to + torch._scaled_mm. 
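// Quick check (an illustrative sketch, not part of the diff) of the
// next_pow_2 helper added to csrc/quantization/cutlass_w8a8/common.hpp above:
// 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)) rounds num up to the
// next power of two, e.g. 17 -> 32 and 64 -> 64. The reference below uses a
// naive loop that yields the same result as the bit trick.
#include <cassert>
#include <cstdint>

static uint32_t next_pow_2_ref(uint32_t num) {
  if (num <= 1) return num;
  uint32_t p = 1;
  while (p < num) p <<= 1;
  return p;
}

int main() {
  assert(next_pow_2_ref(1) == 1);
  assert(next_pow_2_ref(17) == 32);
  assert(next_pow_2_ref(64) == 64);
  assert(next_pow_2_ref(4097) == 8192);
  return 0;
}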
+ + A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or + per-row. B can be quantized per-tensor or per-column. + Any combination of per-tensor and per-row or column is supported. + A and B must have symmetric quantization (zero point == 0). + + So the GEMM operation is D = (a_scales * A) (b_scales * B), where the + scales are applied elementwise with numpy-style broadcasting. + + ScaleA and ScaleB define the epilogue functions that apply the scales for + the A and B operands respectively. These scales may be either per-tensor or + per row or column. +*/ +template +struct ScaledEpilogue + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::ScaleA; + using ScaleB = typename SUPER::ScaleB; using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< cutlass::multiplies, float, float, @@ -134,6 +148,53 @@ struct ScaledEpilogue { } }; +template +struct ScaledEpilogueBias + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::ScaleA; + using ScaleB = typename SUPER::ScaleB; + + using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::threadblock::Sm80EVT; + + using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using Bias = cutlass::epilogue::threadblock::VisitorRowBroadcast< + OutputTileThreadMap, ElementD, Stride, Int<1>, Int<0>>>; + + public: + using EVTCompute = cutlass::epilogue::threadblock::Sm80EVT; + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& bias) { + using ScaleAArgs = typename ScaleA::Arguments; + using ScaleBArgs = typename ScaleB::Arguments; + using BiasArgs = typename Bias::Arguments; + + ScaleBArgs b_args{b_scales.data_ptr(), b_scales.numel() != 1, {}}; + ScaleAArgs a_args{a_scales.data_ptr(), a_scales.numel() != 1, {}}; + BiasArgs bias_args{static_cast(bias.data_ptr()), {}}; + + typename EVTCompute0::Arguments evt0_compute_args{b_args}; + + typename EVTCompute::Arguments evt_compute_args{a_args, evt0_compute_args, + bias_args}; + return evt_compute_args; + } +}; + template typename ArchGuard, typename ElementAB_, typename ElementD_, template typename Epilogue_, typename TileShape, @@ -168,13 +229,13 @@ struct cutlass_2x_gemm { // clang-format off using RowMajor = typename cutlass::layout::RowMajor; using ColumnMajor = typename cutlass::layout::ColumnMajor; - using KernelType = + using KernelType = ArchGuard +void fallback_cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... args) { + // In some cases, the GPU isn't able to accommodate the + // shared memory requirements of the Gemm. In such cases, use + // the FallbackGemm instead. 
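// Stepping back to the epilogues defined above before the fallback path below:
// a scalar model (an illustrative sketch, not the CUTLASS visitor tree itself)
// of what they compute per output element.
//   ScaledEpilogue:     D[m][n] = a_scale[m] * b_scale[n] * acc[m][n]
//   ScaledEpilogueBias: D[m][n] = a_scale[m] * b_scale[n] * acc[m][n] + bias[n]
// a_scale is per-row of A or a single per-tensor value, b_scale is per-column
// of B or a single per-tensor value, and bias is broadcast across rows; the
// two booleans mirror the `numel() != 1` checks in prepare_args.
void scaled_epilogue_bias_ref(int M, int N, const float* acc,
                              const float* a_scale, bool a_per_row,
                              const float* b_scale, bool b_per_col,
                              const float* bias, float* D) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float sa = a_per_row ? a_scale[m] : a_scale[0];
      float sb = b_per_col ? b_scale[n] : b_scale[0];
      D[m * N + n] = sa * sb * acc[m * N + n] + bias[n];
    }
  }
}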
+ static const int max_shared_mem_per_block_opt_in = + get_cuda_max_shared_memory_per_block_opt_in(0); + + size_t const gemm_shared_mem_size = + sizeof(typename Gemm::KernelType::SharedStorage); + size_t const fallback_gemm_shared_mem_size = + sizeof(typename FallbackGemm::KernelType::SharedStorage); + + if (gemm_shared_mem_size <= max_shared_mem_per_block_opt_in) { + return cutlass_gemm_caller(out, a, b, + std::forward(args)...); + } else { + TORCH_CHECK(fallback_gemm_shared_mem_size <= + max_shared_mem_per_block_opt_in); + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } +} + +template typename Epilogue> +struct sm80_config_default { + // This config is used in 2 cases, + // - M in (128, inf) + // - M in (64, 128] and N >= 8192 + // Shared Memory required by this Gemm - 81920 bytes + static_assert(std::is_same()); + using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>; + using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; + using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; + using Cutlass2xGemm = + cutlass_2x_gemm; +}; + +template typename Epilogue> +struct sm80_config_M64 { + // This config is used in 2 cases, + // - M in (32, 64] + // - M in (64, 128] and N < 8192 + // Shared Memory required by this Gemm - 122880 bytes + static_assert(std::is_same()); + using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>; + using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; + using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; + using Cutlass2xGemm = + cutlass_2x_gemm; +}; + +template typename Epilogue> +struct sm80_config_M32 { + // M in (16, 32] + // Shared Memory required by this Gemm - 61440 bytes + static_assert(std::is_same()); + using TileShape = typename cutlass::gemm::GemmShape<32, 64, 128>; + using WarpShape = typename cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; + using Cutlass2xGemm = + cutlass_2x_gemm; +}; + +template typename Epilogue> +struct sm80_config_M16 { + // M in [1, 16] + // Shared Memory required by this Gemm - 51200 bytes + static_assert(std::is_same()); + using TileShape = typename cutlass::gemm::GemmShape<16, 64, 128>; + using WarpShape = typename cutlass::gemm::GemmShape<16, 64, 64>; + using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; + using Cutlass2xGemm = + cutlass_2x_gemm; +}; + } // namespace -void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales) { +template typename Epilogue, + typename... EpilogueArgs> +void cutlass_gemm_sm80_dispatch(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... args) { + static_assert(std::is_same()); + TORCH_CHECK(a.dtype() == torch::kInt8); + TORCH_CHECK(b.dtype() == torch::kInt8); + + using Cutlass2xGemmDefault = + typename sm80_config_default::Cutlass2xGemm; + using Cutlass2xGemmM128BigN = + typename sm80_config_default::Cutlass2xGemm; + using Cutlass2xGemmM128SmallN = + typename sm80_config_M64::Cutlass2xGemm; + using Cutlass2xGemmM64 = + typename sm80_config_M64::Cutlass2xGemm; + using Cutlass2xGemmM32 = + typename sm80_config_M32::Cutlass2xGemm; + using Cutlass2xGemmM16 = + typename sm80_config_M16::Cutlass2xGemm; + + // Due to shared memory requirements, some Gemms may fail to run on some + // GPUs. As the name indicates, the Fallback Gemm is used as an alternative + // in such cases. 
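// Sketch (illustrative only, not part of the diff) of the shape-based config
// selection that cutlass_gemm_sm80_dispatch performs below. The real code
// rounds M up to the next power of two with next_pow_2 before comparing; the
// resulting buckets are equivalent to the direct comparisons used here.
#include <cstdint>

static const char* sm80_config_for(uint32_t m, uint32_t n) {
  if (m <= 16) return "sm80_config_M16";   // M in [1, 16]
  if (m <= 32) return "sm80_config_M32";   // M in (16, 32]
  if (m <= 64) return "sm80_config_M64";   // M in (32, 64]
  if (m <= 128) {                          // M in (64, 128]
    return (n < 8192) ? "sm80_config_M64"  // small-N case reuses the M64 tile
                      : "sm80_config_default";
  }
  return "sm80_config_default";            // M in (128, inf)
}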
+ // sm80_config_M16 has the least shared-memory requirement. However, + // based on some profiling, we select sm80_config_M32 as a better alternative + // performance wise. + using FallbackGemm = + typename sm80_config_M32::Cutlass2xGemm; + + uint32_t const m = a.size(0); + uint32_t const mp2 = + std::max(static_cast(16), next_pow_2(m)); // next power of 2 + if (mp2 <= 16) { + // M in [1, 16] + return fallback_cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else if (mp2 <= 32) { + // M in (16, 32] + return fallback_cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else if (mp2 <= 64) { + // M in (32, 64] + return fallback_cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else if (mp2 <= 128) { + // M in (64, 128] + uint32_t const n = out.size(1); + bool const small_n = n < 8192; + if (small_n) { + return fallback_cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else { + return fallback_cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } + } else { + // M in (128, inf) + return fallback_cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } +} + +template